diff options
Diffstat (limited to 'fs/ocfs2')
122 files changed, 11620 insertions, 9775 deletions
diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig index 77a8de5f7119..2514d36cbe01 100644 --- a/fs/ocfs2/Kconfig +++ b/fs/ocfs2/Kconfig @@ -1,11 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0-only config OCFS2_FS tristate "OCFS2 file system support" - depends on NET && SYSFS && CONFIGFS_FS + depends on INET && SYSFS && CONFIGFS_FS + select BUFFER_HEAD select JBD2 select CRC32 select QUOTA select QUOTA_TREE select FS_POSIX_ACL + select LEGACY_DIRECT_IO help OCFS2 is a general purpose extent based shared disk cluster file system with many similarities to ext3. It supports 64 bit inode @@ -15,12 +18,12 @@ config OCFS2_FS You'll want to install the ocfs2-tools package in order to at least get "mount.ocfs2". - Project web page: http://oss.oracle.com/projects/ocfs2 - Tools web page: http://oss.oracle.com/projects/ocfs2-tools - OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/ + Project web page: https://ocfs2.wiki.kernel.org/ + Tools web page: https://github.com/markfasheh/ocfs2-tools + OCFS2 mailing lists: https://subspace.kernel.org/lists.linux.dev.html For more information on OCFS2, see the file - <file:Documentation/filesystems/ocfs2.txt>. + <file:Documentation/filesystems/ocfs2.rst>. config OCFS2_FS_O2CB tristate "O2CB Kernelspace Clustering" diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index f17e58b32989..cc9b32b9db7c 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -1,6 +1,5 @@ -ccflags-y := -Ifs/ocfs2 - -ccflags-y += -DCATCH_BH_JBD_RACES +# SPDX-License-Identifier: GPL-2.0 +ccflags-y := -I$(src) obj-$(CONFIG_OCFS2_FS) += \ ocfs2.o \ @@ -38,11 +37,11 @@ ocfs2-objs := \ symlink.o \ sysfile.o \ uptodate.o \ - ver.o \ quota_local.o \ quota_global.o \ xattr.o \ - acl.o + acl.o \ + filecheck.o ocfs2_stackglue-objs := stackglue.o ocfs2_stack_o2cb-objs := stack_o2cb.o diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 8a404576fb26..af1e2cedb217 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -1,6 +1,5 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-only +/* * acl.c * * Copyright (C) 2004, 2008 Oracle. All rights reserved. @@ -8,21 +7,13 @@ * CREDITS: * Lots of code in this file is copy from linux/fs/ext3/acl.c. * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/string.h> +#include <linux/fs_struct.h> #include <cluster/masklog.h> @@ -51,10 +42,6 @@ static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size) return ERR_PTR(-EINVAL); count = size / sizeof(struct posix_acl_entry); - if (count < 0) - return ERR_PTR(-EINVAL); - if (count == 0) - return NULL; acl = posix_acl_alloc(count, GFP_NOFS); if (!acl) @@ -164,36 +151,6 @@ static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode, return acl; } - -/* - * Get posix acl. - */ -static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type) -{ - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct buffer_head *di_bh = NULL; - struct posix_acl *acl; - int ret; - - if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return NULL; - - ret = ocfs2_inode_lock(inode, &di_bh, 0); - if (ret < 0) { - mlog_errno(ret); - acl = ERR_PTR(ret); - return acl; - } - - acl = ocfs2_get_acl_nolock(inode, type, di_bh); - - ocfs2_inode_unlock(inode, 0); - - brelse(di_bh); - - return acl; -} - /* * Helper function to set i_mode in memory and disk. Some call paths * will not have di_bh or a journal handle to pass, in which case it @@ -235,10 +192,11 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh, } inode->i_mode = new_mode; - inode->i_ctime = CURRENT_TIME; + inode_set_ctime_current(inode); di->i_mode = cpu_to_le16(inode->i_mode); - di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); + di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, di_bh); @@ -273,22 +231,6 @@ static int ocfs2_set_acl(handle_t *handle, switch (type) { case ACL_TYPE_ACCESS: name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; - if (acl) { - umode_t mode = inode->i_mode; - ret = posix_acl_equiv_mode(acl, &mode); - if (ret < 0) - return ret; - else { - if (ret == 0) - acl = NULL; - - ret = ocfs2_acl_set_mode(inode, di_bh, - handle, mode); - if (ret) - return ret; - - } - } break; case ACL_TYPE_DEFAULT: name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT; @@ -313,33 +255,71 @@ static int ocfs2_set_acl(handle_t *handle, ret = ocfs2_xattr_set(inode, name_index, "", value, size, 0); kfree(value); + if (!ret) + set_cached_acl(inode, type, acl); return ret; } -struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) +int ocfs2_iop_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, + struct posix_acl *acl, int type) +{ + struct buffer_head *bh = NULL; + int status, had_lock; + struct ocfs2_lock_holder oh; + struct inode *inode = d_inode(dentry); + + had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh); + if (had_lock < 0) + return had_lock; + if (type == ACL_TYPE_ACCESS && acl) { + umode_t mode; + + status = posix_acl_update_mode(&nop_mnt_idmap, inode, &mode, + &acl); + if (status) + goto unlock; + + status = ocfs2_acl_set_mode(inode, bh, NULL, mode); + if (status) + goto unlock; + } + status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL); +unlock: + ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock); + brelse(bh); + return status; +} + +struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type, bool rcu) { struct ocfs2_super *osb; struct buffer_head *di_bh = NULL; struct posix_acl *acl; - int ret = -EAGAIN; + int had_lock; + struct ocfs2_lock_holder oh; + + if (rcu) + return ERR_PTR(-ECHILD); osb = OCFS2_SB(inode->i_sb); if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) return NULL; - ret = ocfs2_read_inode_block(inode, &di_bh); - if (ret < 0) - return ERR_PTR(ret); + had_lock = ocfs2_inode_lock_tracker(inode, &di_bh, 0, &oh); + if (had_lock < 0) + return ERR_PTR(had_lock); + down_read(&OCFS2_I(inode)->ip_xattr_sem); acl = ocfs2_get_acl_nolock(inode, type, di_bh); + up_read(&OCFS2_I(inode)->ip_xattr_sem); + ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock); brelse(di_bh); - return acl; } -int ocfs2_acl_chmod(struct inode *inode) +int ocfs2_acl_chmod(struct inode *inode, struct buffer_head *bh) { struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct posix_acl *acl; @@ -351,10 +331,12 @@ int ocfs2_acl_chmod(struct inode *inode) if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) return 0; - acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - ret = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + down_read(&OCFS2_I(inode)->ip_xattr_sem); + acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, bh); + up_read(&OCFS2_I(inode)->ip_xattr_sem); + if (IS_ERR_OR_NULL(acl)) + return PTR_ERR_OR_ZERO(acl); + ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); if (ret) return ret; ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS, @@ -382,8 +364,10 @@ int ocfs2_init_acl(handle_t *handle, if (!S_ISLNK(inode->i_mode)) { if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { + down_read(&OCFS2_I(dir)->ip_xattr_sem); acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT, dir_bh); + up_read(&OCFS2_I(dir)->ip_xattr_sem); if (IS_ERR(acl)) return PTR_ERR(acl); } @@ -405,7 +389,7 @@ int ocfs2_init_acl(handle_t *handle, goto cleanup; } mode = inode->i_mode; - ret = posix_acl_create(&acl, GFP_NOFS, &mode); + ret = __posix_acl_create(&acl, GFP_NOFS, &mode); if (ret < 0) return ret; @@ -425,113 +409,3 @@ cleanup: posix_acl_release(acl); return ret; } - -static size_t ocfs2_xattr_list_acl_access(struct dentry *dentry, - char *list, - size_t list_len, - const char *name, - size_t name_len, - int type) -{ - struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); - const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - - if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return 0; - - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_ACCESS, size); - return size; -} - -static size_t ocfs2_xattr_list_acl_default(struct dentry *dentry, - char *list, - size_t list_len, - const char *name, - size_t name_len, - int type) -{ - struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); - const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - - if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return 0; - - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); - return size; -} - -static int ocfs2_xattr_get_acl(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) -{ - struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); - struct posix_acl *acl; - int ret; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return -EOPNOTSUPP; - - acl = ocfs2_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - ret = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return ret; -} - -static int ocfs2_xattr_set_acl(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) -{ - struct inode *inode = dentry->d_inode; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct posix_acl *acl; - int ret = 0; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return -EOPNOTSUPP; - - if (!inode_owner_or_capable(inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - else if (acl) { - ret = posix_acl_valid(acl); - if (ret) - goto cleanup; - } - } else - acl = NULL; - - ret = ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL); - -cleanup: - posix_acl_release(acl); - return ret; -} - -const struct xattr_handler ocfs2_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = ocfs2_xattr_list_acl_access, - .get = ocfs2_xattr_get_acl, - .set = ocfs2_xattr_set_acl, -}; - -const struct xattr_handler ocfs2_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = ocfs2_xattr_list_acl_default, - .get = ocfs2_xattr_get_acl, - .set = ocfs2_xattr_set_acl, -}; diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 071fbd380f2f..667c6f03fa60 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -1,18 +1,8 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-only */ +/* * acl.h * * Copyright (C) 2004, 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #ifndef OCFS2_ACL_H @@ -26,8 +16,10 @@ struct ocfs2_acl_entry { __le32 e_id; }; -struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type); -extern int ocfs2_acl_chmod(struct inode *); +struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type, bool rcu); +int ocfs2_iop_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, + struct posix_acl *acl, int type); +extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *); extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, struct buffer_head *, struct buffer_head *, struct ocfs2_alloc_context *, diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 17e6bdde96c5..b267ec580da9 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -1,26 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * alloc.c * * Extent allocs and frees * * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/fs.h> @@ -30,6 +14,7 @@ #include <linux/swap.h> #include <linux/quotaops.h> #include <linux/blkdev.h> +#include <linux/sched/signal.h> #include <cluster/masklog.h> @@ -164,7 +149,14 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *rec); static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et); static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et); -static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { + +static int ocfs2_reuse_blk_from_dealloc(handle_t *handle, + struct ocfs2_extent_tree *et, + struct buffer_head **new_eb_bh, + int blk_wanted, int *blk_given); +static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et); + +static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk, .eo_update_clusters = ocfs2_dinode_update_clusters, @@ -286,7 +278,7 @@ static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et, le32_add_cpu(&vb->vb_xv->xr_clusters, clusters); } -static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = { +static const struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = { .eo_set_last_eb_blk = ocfs2_xattr_value_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_xattr_value_get_last_eb_blk, .eo_update_clusters = ocfs2_xattr_value_update_clusters, @@ -332,7 +324,7 @@ static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et, le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters); } -static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = { +static const struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = { .eo_set_last_eb_blk = ocfs2_xattr_tree_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_xattr_tree_get_last_eb_blk, .eo_update_clusters = ocfs2_xattr_tree_update_clusters, @@ -379,7 +371,7 @@ static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et) et->et_root_el = &dx_root->dr_list; } -static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = { +static const struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = { .eo_set_last_eb_blk = ocfs2_dx_root_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_dx_root_get_last_eb_blk, .eo_update_clusters = ocfs2_dx_root_update_clusters, @@ -425,7 +417,7 @@ ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et, return CONTIG_NONE; } -static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = { +static const struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = { .eo_set_last_eb_blk = ocfs2_refcount_tree_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_refcount_tree_get_last_eb_blk, .eo_update_clusters = ocfs2_refcount_tree_update_clusters, @@ -438,7 +430,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, struct buffer_head *bh, ocfs2_journal_access_func access, void *obj, - struct ocfs2_extent_tree_operations *ops) + const struct ocfs2_extent_tree_operations *ops) { et->et_ops = ops; et->et_root_bh = bh; @@ -447,6 +439,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, if (!obj) obj = (void *)bh->b_data; et->et_object = obj; + et->et_dealloc = NULL; et->et_ops->eo_fill_root_el(et); if (!et->et_ops->eo_fill_max_leaf_clusters) @@ -573,7 +566,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle, struct ocfs2_path *path, struct ocfs2_extent_rec *insert_rec); /* - * Reset the actual path elements so that we can re-use the structure + * Reset the actual path elements so that we can reuse the structure * to build another path. Generally, this involves freeing the buffer * heads. */ @@ -908,32 +901,28 @@ static int ocfs2_validate_extent_block(struct super_block *sb, */ if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - ocfs2_error(sb, - "Extent block #%llu has bad signature %.*s", - (unsigned long long)bh->b_blocknr, 7, - eb->h_signature); - return -EINVAL; + rc = ocfs2_error(sb, + "Extent block #%llu has bad signature %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + eb->h_signature); + goto bail; } if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) { - ocfs2_error(sb, - "Extent block #%llu has an invalid h_blkno " - "of %llu", - (unsigned long long)bh->b_blocknr, - (unsigned long long)le64_to_cpu(eb->h_blkno)); - return -EINVAL; - } - - if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) { - ocfs2_error(sb, - "Extent block #%llu has an invalid " - "h_fs_generation of #%u", - (unsigned long long)bh->b_blocknr, - le32_to_cpu(eb->h_fs_generation)); - return -EINVAL; + rc = ocfs2_error(sb, + "Extent block #%llu has an invalid h_blkno of %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(eb->h_blkno)); + goto bail; } - return 0; + if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) + rc = ocfs2_error(sb, + "Extent block #%llu has an invalid h_fs_generation of #%u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(eb->h_fs_generation)); +bail: + return rc; } int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno, @@ -956,8 +945,7 @@ int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno, /* * How many free extents have we got before we need more meta data? */ -int ocfs2_num_free_extents(struct ocfs2_super *osb, - struct ocfs2_extent_tree *et) +int ocfs2_num_free_extents(struct ocfs2_extent_tree *et) { int retval; struct ocfs2_extent_list *el = NULL; @@ -979,7 +967,14 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb, el = &eb->h_list; } - BUG_ON(el->l_tree_depth != 0); + if (el->l_tree_depth != 0) { + retval = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has leaf extent block %llu with an invalid l_tree_depth of %u\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + (unsigned long long)last_eb_blk, + le16_to_cpu(el->l_tree_depth)); + goto bail; + } retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec); bail: @@ -1025,7 +1020,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle, for(i = count; i < (num_got + count); i++) { bhs[i] = sb_getblk(osb->sb, first_blkno); if (bhs[i] == NULL) { - status = -EIO; + status = -ENOMEM; mlog_errno(status); goto bail; } @@ -1070,7 +1065,6 @@ bail: brelse(bhs[i]); bhs[i] = NULL; } - mlog_errno(status); } return status; } @@ -1160,7 +1154,7 @@ static int ocfs2_add_branch(handle_t *handle, struct buffer_head **last_eb_bh, struct ocfs2_alloc_context *meta_ac) { - int status, new_blocks, i; + int status, new_blocks, i, block_given = 0; u64 next_blkno, new_last_eb_blk; struct buffer_head *bh; struct buffer_head **new_eb_bhs = NULL; @@ -1188,7 +1182,7 @@ static int ocfs2_add_branch(handle_t *handle, /* * If there is a gap before the root end and the real end - * of the righmost leaf block, we need to remove the gap + * of the rightmost leaf block, we need to remove the gap * between new_cpos and root_end first so that the tree * is consistent after we add a new branch(it will start * from new_cpos). @@ -1215,16 +1209,36 @@ static int ocfs2_add_branch(handle_t *handle, goto bail; } - status = ocfs2_create_new_meta_bhs(handle, et, new_blocks, - meta_ac, new_eb_bhs); - if (status < 0) { - mlog_errno(status); - goto bail; + /* Firstyly, try to reuse dealloc since we have already estimated how + * many extent blocks we may use. + */ + if (!ocfs2_is_dealloc_empty(et)) { + status = ocfs2_reuse_blk_from_dealloc(handle, et, + new_eb_bhs, new_blocks, + &block_given); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + BUG_ON(block_given > new_blocks); + + if (block_given < new_blocks) { + BUG_ON(!meta_ac); + status = ocfs2_create_new_meta_bhs(handle, et, + new_blocks - block_given, + meta_ac, + &new_eb_bhs[block_given]); + if (status < 0) { + mlog_errno(status); + goto bail; + } } /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be * linked with the rest of the tree. - * conversly, new_eb_bhs[0] is the new bottommost leaf. + * conversely, new_eb_bhs[0] is the new bottommost leaf. * * when we leave the loop, new_last_eb_blk will point to the * newest leaf, and next_blkno will point to the topmost extent @@ -1342,15 +1356,25 @@ static int ocfs2_shift_tree_depth(handle_t *handle, struct ocfs2_alloc_context *meta_ac, struct buffer_head **ret_new_eb_bh) { - int status, i; + int status, i, block_given = 0; u32 new_clusters; struct buffer_head *new_eb_bh = NULL; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *root_el; struct ocfs2_extent_list *eb_el; - status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac, - &new_eb_bh); + if (!ocfs2_is_dealloc_empty(et)) { + status = ocfs2_reuse_blk_from_dealloc(handle, et, + &new_eb_bh, 1, + &block_given); + } else if (meta_ac) { + status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac, + &new_eb_bh); + + } else { + BUG(); + } + if (status < 0) { mlog_errno(status); goto bail; @@ -1445,22 +1469,17 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et, while(le16_to_cpu(el->l_tree_depth) > 1) { if (le16_to_cpu(el->l_next_free_rec) == 0) { - ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has empty " - "extent list (next_free_rec == 0)", - (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); - status = -EIO; + status = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has empty extent list (next_free_rec == 0)\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); goto bail; } i = le16_to_cpu(el->l_next_free_rec) - 1; blkno = le64_to_cpu(el->l_recs[i].e_blkno); if (!blkno) { - ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has extent " - "list where extent # %d has no physical " - "block start", - (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i); - status = -EIO; + status = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has extent list where extent # %d has no physical block start\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i); goto bail; } @@ -1516,7 +1535,7 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et, int depth = le16_to_cpu(el->l_tree_depth); struct buffer_head *bh = NULL; - BUG_ON(meta_ac == NULL); + BUG_ON(meta_ac == NULL && ocfs2_is_dealloc_empty(et)); shift = ocfs2_find_branch_target(et, &bh); if (shift < 0) { @@ -1565,10 +1584,8 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et, * the new data. */ ret = ocfs2_add_branch(handle, et, bh, last_eb_bh, meta_ac); - if (ret < 0) { + if (ret < 0) mlog_errno(ret); - goto out; - } out: if (final_depth) @@ -1786,10 +1803,17 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci, el = root_el; while (el->l_tree_depth) { + if (unlikely(le16_to_cpu(el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH)) { + ocfs2_error(ocfs2_metadata_cache_get_super(ci), + "Owner %llu has invalid tree depth %u in extent list\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + le16_to_cpu(el->l_tree_depth)); + ret = -EROFS; + goto out; + } if (le16_to_cpu(el->l_next_free_rec) == 0) { ocfs2_error(ocfs2_metadata_cache_get_super(ci), - "Owner %llu has empty extent list at " - "depth %u\n", + "Owner %llu has empty extent list at depth %u\n", (unsigned long long)ocfs2_metadata_cache_owner(ci), le16_to_cpu(el->l_tree_depth)); ret = -EROFS; @@ -1814,8 +1838,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci, blkno = le64_to_cpu(el->l_recs[i].e_blkno); if (blkno == 0) { ocfs2_error(ocfs2_metadata_cache_get_super(ci), - "Owner %llu has bad blkno in extent list " - "at depth %u (index %d)\n", + "Owner %llu has bad blkno in extent list at depth %u (index %d)\n", (unsigned long long)ocfs2_metadata_cache_owner(ci), le16_to_cpu(el->l_tree_depth), i); ret = -EROFS; @@ -1836,8 +1859,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci, if (le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)) { ocfs2_error(ocfs2_metadata_cache_get_super(ci), - "Owner %llu has bad count in extent list " - "at block %llu (next free=%u, count=%u)\n", + "Owner %llu has bad count in extent list at block %llu (next free=%u, count=%u)\n", (unsigned long long)ocfs2_metadata_cache_owner(ci), (unsigned long long)bh->b_blocknr, le16_to_cpu(el->l_next_free_rec), @@ -1940,14 +1962,12 @@ out: * the new changes. * * left_rec: the record on the left. - * left_child_el: is the child list pointed to by left_rec * right_rec: the record to the right of left_rec * right_child_el: is the child list pointed to by right_rec * * By definition, this only works on interior nodes. */ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec, - struct ocfs2_extent_list *left_child_el, struct ocfs2_extent_rec *right_rec, struct ocfs2_extent_list *right_child_el) { @@ -2010,7 +2030,7 @@ static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el, */ BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1)); - ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el, + ocfs2_adjust_adjacent_records(&root_el->l_recs[i], &root_el->l_recs[i + 1], right_el); } @@ -2035,7 +2055,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle, int i, idx; struct ocfs2_extent_list *el, *left_el, *right_el; struct ocfs2_extent_rec *left_rec, *right_rec; - struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; + struct buffer_head *root_bh; /* * Update the counts and position values within all the @@ -2067,8 +2087,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle, el = right_path->p_node[i].el; right_rec = &el->l_recs[0]; - ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec, - right_el); + ocfs2_adjust_adjacent_records(left_rec, right_rec, right_el); ocfs2_journal_dirty(handle, left_path->p_node[i].bh); ocfs2_journal_dirty(handle, right_path->p_node[i].bh); @@ -2116,8 +2135,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle, if (left_el->l_next_free_rec != left_el->l_count) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Inode %llu has non-full interior leaf node %llu" - "(next free = %u)", + "Inode %llu has non-full interior leaf node %llu (next free = %u)\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), (unsigned long long)left_leaf_bh->b_blocknr, le16_to_cpu(left_el->l_next_free_rec)); @@ -2256,8 +2274,7 @@ int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, * If we got here, we never found a valid node where * the tree indicated one should be. */ - ocfs2_error(sb, - "Invalid extent tree at extent block %llu\n", + ocfs2_error(sb, "Invalid extent tree at extent block %llu\n", (unsigned long long)blkno); ret = -EROFS; goto out; @@ -2283,9 +2300,9 @@ static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, int ret = 0; int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits; - if (handle->h_buffer_credits < credits) + if (jbd2_handle_buffer_credits(handle) < credits) ret = ocfs2_extend_trans(handle, - credits - handle->h_buffer_credits); + credits - jbd2_handle_buffer_credits(handle)); return ret; } @@ -2362,7 +2379,7 @@ static int ocfs2_rotate_tree_right(handle_t *handle, struct ocfs2_path *right_path, struct ocfs2_path **ret_left_path) { - int ret, start, orig_credits = handle->h_buffer_credits; + int ret, start, orig_credits = jbd2_handle_buffer_credits(handle); u32 cpos; struct ocfs2_path *left_path = NULL; struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); @@ -2518,7 +2535,7 @@ out_ret_path: static int ocfs2_update_edge_lengths(handle_t *handle, struct ocfs2_extent_tree *et, - int subtree_index, struct ocfs2_path *path) + struct ocfs2_path *path) { int i, idx, ret; struct ocfs2_extent_rec *rec; @@ -2526,21 +2543,6 @@ static int ocfs2_update_edge_lengths(handle_t *handle, struct ocfs2_extent_block *eb; u32 range; - /* - * In normal tree rotation process, we will never touch the - * tree branch above subtree_index and ocfs2_extend_rotate_transaction - * doesn't reserve the credits for them either. - * - * But we do have a special case here which will update the rightmost - * records for all the bh in the path. - * So we have to allocate extra credits and access them. - */ - ret = ocfs2_extend_trans(handle, subtree_index); - if (ret) { - mlog_errno(ret); - goto out; - } - ret = ocfs2_journal_access_path(et->et_ci, handle, path); if (ret) { mlog_errno(ret); @@ -2626,11 +2628,8 @@ static void ocfs2_unlink_subtree(handle_t *handle, int i; struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el; - struct ocfs2_extent_list *el; struct ocfs2_extent_block *eb; - el = path_leaf_el(left_path); - eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data; for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++) @@ -2779,8 +2778,7 @@ static int ocfs2_rotate_subtree_left(handle_t *handle, if (del_right_subtree) { ocfs2_unlink_subtree(handle, et, left_path, right_path, subtree_index, dealloc); - ret = ocfs2_update_edge_lengths(handle, et, subtree_index, - left_path); + ret = ocfs2_update_edge_lengths(handle, et, left_path); if (ret) { mlog_errno(ret); goto out; @@ -2872,8 +2870,7 @@ int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, * If we got here, we never found a valid node where * the tree indicated one should be. */ - ocfs2_error(sb, - "Invalid extent tree at extent block %llu\n", + ocfs2_error(sb, "Invalid extent tree at extent block %llu\n", (unsigned long long)blkno); ret = -EROFS; goto out; @@ -2925,7 +2922,8 @@ static int __ocfs2_rotate_tree_left(handle_t *handle, struct ocfs2_path *right_path = NULL; struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); - BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0]))); + if (!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0]))) + return 0; *empty_extent_path = NULL; @@ -2966,7 +2964,7 @@ static int __ocfs2_rotate_tree_left(handle_t *handle, right_path->p_node[subtree_root].bh->b_blocknr, right_path->p_tree_depth); - ret = ocfs2_extend_rotate_transaction(handle, subtree_root, + ret = ocfs2_extend_rotate_transaction(handle, 0, orig_credits, left_path); if (ret) { mlog_errno(ret); @@ -3039,21 +3037,9 @@ static int ocfs2_remove_rightmost_path(handle_t *handle, struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; - ret = ocfs2_et_sanity_check(et); if (ret) goto out; - /* - * There's two ways we handle this depending on - * whether path is the only existing one. - */ - ret = ocfs2_extend_rotate_transaction(handle, 0, - handle->h_buffer_credits, - path); - if (ret) { - mlog_errno(ret); - goto out; - } ret = ocfs2_journal_access_path(et->et_ci, handle, path); if (ret) { @@ -3096,8 +3082,7 @@ static int ocfs2_remove_rightmost_path(handle_t *handle, ocfs2_unlink_subtree(handle, et, left_path, path, subtree_index, dealloc); - ret = ocfs2_update_edge_lengths(handle, et, subtree_index, - left_path); + ret = ocfs2_update_edge_lengths(handle, et, left_path); if (ret) { mlog_errno(ret); goto out; @@ -3130,6 +3115,30 @@ out: return ret; } +static int ocfs2_remove_rightmost_empty_extent(struct ocfs2_super *osb, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + handle_t *handle; + int ret; + int credits = path->p_tree_depth * 2 + 1; + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + return ret; + } + + ret = ocfs2_remove_rightmost_path(handle, et, path, dealloc); + if (ret) + mlog_errno(ret); + + ocfs2_commit_trans(osb, handle); + return ret; +} + /* * Left rotation of btree records. * @@ -3151,7 +3160,7 @@ static int ocfs2_rotate_tree_left(handle_t *handle, struct ocfs2_path *path, struct ocfs2_cached_dealloc_ctxt *dealloc) { - int ret, orig_credits = handle->h_buffer_credits; + int ret, orig_credits = jbd2_handle_buffer_credits(handle); struct ocfs2_path *tmp_path = NULL, *restart_path = NULL; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; @@ -3197,11 +3206,10 @@ rightmost_no_delete: goto rightmost_no_delete; if (le16_to_cpu(el->l_next_free_rec) == 0) { - ret = -EIO; - ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has empty extent block at %llu", - (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), - (unsigned long long)le64_to_cpu(eb->h_blkno)); + ret = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has empty extent block at %llu\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + (unsigned long long)le64_to_cpu(eb->h_blkno)); goto out; } @@ -3370,7 +3378,7 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path, ret = ocfs2_get_right_path(et, left_path, &right_path); if (ret) { mlog_errno(ret); - goto out; + return ret; } right_el = path_leaf_el(right_path); @@ -3390,8 +3398,8 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path, right_path); ret = ocfs2_extend_rotate_transaction(handle, subtree_index, - handle->h_buffer_credits, - right_path); + jbd2_handle_buffer_credits(handle), + right_path); if (ret) { mlog_errno(ret); goto out; @@ -3453,8 +3461,7 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path, subtree_index); } out: - if (right_path) - ocfs2_free_path(right_path); + ocfs2_free_path(right_path); return ret; } @@ -3536,7 +3543,7 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, ret = ocfs2_get_left_path(et, right_path, &left_path); if (ret) { mlog_errno(ret); - goto out; + return ret; } left_el = path_leaf_el(left_path); @@ -3553,8 +3560,8 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, right_path); ret = ocfs2_extend_rotate_transaction(handle, subtree_index, - handle->h_buffer_credits, - left_path); + jbd2_handle_buffer_credits(handle), + left_path); if (ret) { mlog_errno(ret); goto out; @@ -3604,8 +3611,6 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, * The easy case - we can just plop the record right in. */ *left_rec = *split_rec; - - has_empty_extent = 0; } else le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters); @@ -3628,6 +3633,14 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, */ if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 && le16_to_cpu(el->l_next_free_rec) == 1) { + /* extend credit for ocfs2_remove_rightmost_path */ + ret = ocfs2_extend_rotate_transaction(handle, 0, + jbd2_handle_buffer_credits(handle), + right_path); + if (ret) { + mlog_errno(ret); + goto out; + } ret = ocfs2_remove_rightmost_path(handle, et, right_path, @@ -3647,8 +3660,7 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, right_path, subtree_index); } out: - if (left_path) - ocfs2_free_path(left_path); + ocfs2_free_path(left_path); return ret; } @@ -3667,6 +3679,14 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, BUG_ON(ctxt->c_contig_type == CONTIG_NONE); if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) { + /* extend credit for ocfs2_remove_rightmost_path */ + ret = ocfs2_extend_rotate_transaction(handle, 0, + jbd2_handle_buffer_credits(handle), + path); + if (ret) { + mlog_errno(ret); + goto out; + } /* * The merge code will need to create an empty * extent to take the place of the newly @@ -3700,7 +3720,7 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, * update split_index here. * * When the split_index is zero, we need to merge it to the - * prevoius extent block. It is more efficient and easier + * previous extent block. It is more efficient and easier * if we do merge_right first and merge_left later. */ ret = ocfs2_merge_rec_right(path, handle, et, split_rec, @@ -3715,6 +3735,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, */ BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); + /* extend credit for ocfs2_remove_rightmost_path */ + ret = ocfs2_extend_rotate_transaction(handle, 0, + jbd2_handle_buffer_credits(handle), + path); + if (ret) { + mlog_errno(ret); + goto out; + } + /* The merge left us with an empty extent, remove it. */ ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); if (ret) { @@ -3736,6 +3765,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, goto out; } + /* extend credit for ocfs2_remove_rightmost_path */ + ret = ocfs2_extend_rotate_transaction(handle, 0, + jbd2_handle_buffer_credits(handle), + path); + if (ret) { + mlog_errno(ret); + goto out; + } + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); /* * Error from this last rotate is not critical, so @@ -3771,6 +3809,16 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, } if (ctxt->c_split_covers_rec) { + /* extend credit for ocfs2_remove_rightmost_path */ + ret = ocfs2_extend_rotate_transaction(handle, 0, + jbd2_handle_buffer_credits(handle), + path); + if (ret) { + mlog_errno(ret); + ret = 0; + goto out; + } + /* * The merge may have left an empty extent in * our leaf. Try to rotate it away. @@ -3906,7 +3954,7 @@ rotate: * above. * * This leaf needs to have space, either by the empty 1st - * extent record, or by virtue of an l_next_rec < l_count. + * extent record, or by virtue of an l_next_free_rec < l_count. */ ocfs2_rotate_leaf(el, insert_rec); } @@ -3916,7 +3964,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle, struct ocfs2_path *path, struct ocfs2_extent_rec *insert_rec) { - int ret, i, next_free; + int i, next_free; struct buffer_head *bh; struct ocfs2_extent_list *el; struct ocfs2_extent_rec *rec; @@ -3931,9 +3979,8 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle, next_free = le16_to_cpu(el->l_next_free_rec); if (next_free == 0) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has a bad extent list", + "Owner %llu has a bad extent list\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); - ret = -EIO; return; } @@ -4313,13 +4360,13 @@ out: return ret; } -static enum ocfs2_contig_type -ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, +static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, struct ocfs2_path *path, struct ocfs2_extent_list *el, int index, - struct ocfs2_extent_rec *split_rec) + struct ocfs2_extent_rec *split_rec, + struct ocfs2_merge_ctxt *ctxt) { - int status; + int status = 0; enum ocfs2_contig_type ret = CONTIG_NONE; u32 left_cpos, right_cpos; struct ocfs2_extent_rec *rec = NULL; @@ -4334,17 +4381,20 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, } else if (path->p_tree_depth > 0) { status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos); if (status) - goto out; + goto exit; if (left_cpos != 0) { left_path = ocfs2_new_path_from_path(path); - if (!left_path) - goto out; + if (!left_path) { + status = -ENOMEM; + mlog_errno(status); + goto exit; + } status = ocfs2_find_path(et->et_ci, left_path, left_cpos); if (status) - goto out; + goto free_left_path; new_el = path_leaf_el(left_path); @@ -4352,16 +4402,12 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, le16_to_cpu(new_el->l_count)) { bh = path_leaf_bh(left_path); eb = (struct ocfs2_extent_block *)bh->b_data; - ocfs2_error(sb, - "Extent block #%llu has an " - "invalid l_next_free_rec of " - "%d. It should have " - "matched the l_count of %d", - (unsigned long long)le64_to_cpu(eb->h_blkno), - le16_to_cpu(new_el->l_next_free_rec), - le16_to_cpu(new_el->l_count)); - status = -EINVAL; - goto out; + status = ocfs2_error(sb, + "Extent block #%llu has an invalid l_next_free_rec of %d. It should have matched the l_count of %d\n", + (unsigned long long)le64_to_cpu(eb->h_blkno), + le16_to_cpu(new_el->l_next_free_rec), + le16_to_cpu(new_el->l_count)); + goto free_left_path; } rec = &new_el->l_recs[ le16_to_cpu(new_el->l_next_free_rec) - 1]; @@ -4388,18 +4434,21 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, path->p_tree_depth > 0) { status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos); if (status) - goto out; + goto free_left_path; if (right_cpos == 0) - goto out; + goto free_left_path; right_path = ocfs2_new_path_from_path(path); - if (!right_path) - goto out; + if (!right_path) { + status = -ENOMEM; + mlog_errno(status); + goto free_left_path; + } status = ocfs2_find_path(et->et_ci, right_path, right_cpos); if (status) - goto out; + goto free_right_path; new_el = path_leaf_el(right_path); rec = &new_el->l_recs[0]; @@ -4407,13 +4456,11 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, if (le16_to_cpu(new_el->l_next_free_rec) <= 1) { bh = path_leaf_bh(right_path); eb = (struct ocfs2_extent_block *)bh->b_data; - ocfs2_error(sb, - "Extent block #%llu has an " - "invalid l_next_free_rec of %d", - (unsigned long long)le64_to_cpu(eb->h_blkno), - le16_to_cpu(new_el->l_next_free_rec)); - status = -EINVAL; - goto out; + status = ocfs2_error(sb, + "Extent block #%llu has an invalid l_next_free_rec of %d\n", + (unsigned long long)le64_to_cpu(eb->h_blkno), + le16_to_cpu(new_el->l_next_free_rec)); + goto free_right_path; } rec = &new_el->l_recs[1]; } @@ -4430,13 +4477,15 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, ret = contig_type; } -out: - if (left_path) - ocfs2_free_path(left_path); - if (right_path) - ocfs2_free_path(right_path); +free_right_path: + ocfs2_free_path(right_path); +free_left_path: + ocfs2_free_path(left_path); +exit: + if (status == 0) + ctxt->c_contig_type = ret; - return ret; + return status; } static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et, @@ -4476,7 +4525,7 @@ static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et, } /* - * This should only be called against the righmost leaf extent list. + * This should only be called against the rightmost leaf extent list. * * ocfs2_figure_appending_type() will figure out whether we'll have to * insert at the tail of the rightmost leaf. @@ -4671,7 +4720,7 @@ int ocfs2_insert_extent(handle_t *handle, struct ocfs2_alloc_context *meta_ac) { int status; - int uninitialized_var(free_records); + int free_records; struct buffer_head *last_eb_bh = NULL; struct ocfs2_insert_type insert = {0, }; struct ocfs2_extent_rec rec; @@ -4726,7 +4775,7 @@ bail: } /* - * Allcate and add clusters into the extent b-tree. + * Allocate and add clusters into the extent b-tree. * The new clusters(clusters_to_add) will be inserted at logical_offset. * The extent b-tree's root is specified by et, and * it is not limited to the file storage. Any extent tree can use this @@ -4742,6 +4791,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, enum ocfs2_alloc_restarted *reason_ret) { int status = 0, err = 0; + int need_free = 0; int free_extents; enum ocfs2_alloc_restarted reason = RESTART_NONE; u32 bit_off, num_bits; @@ -4755,7 +4805,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, if (mark_unwritten) flags = OCFS2_EXT_UNWRITTEN; - free_extents = ocfs2_num_free_extents(osb, et); + free_extents = ocfs2_num_free_extents(et); if (free_extents < 0) { status = free_extents; mlog_errno(status); @@ -4796,7 +4846,8 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); - goto leave; + need_free = 1; + goto bail; } block = ocfs2_clusters_to_blocks(osb->sb, bit_off); @@ -4807,7 +4858,8 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, num_bits, flags, meta_ac); if (status < 0) { mlog_errno(status); - goto leave; + need_free = 1; + goto bail; } ocfs2_journal_dirty(handle, et->et_root_bh); @@ -4821,6 +4873,19 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, reason = RESTART_TRANS; } +bail: + if (need_free) { + if (data_ac->ac_which == OCFS2_AC_USE_LOCAL) + ocfs2_free_local_alloc_bits(osb, handle, data_ac, + bit_off, num_bits); + else + ocfs2_free_clusters(handle, + data_ac->ac_inode, + data_ac->ac_bh, + ocfs2_clusters_to_blocks(osb->sb, bit_off), + num_bits); + } + leave: if (reason_ret) *reason_ret = reason; @@ -4945,6 +5010,14 @@ leftright: el = path_leaf_el(path); split_index = ocfs2_search_extent_list(el, cpos); + if (split_index == -1) { + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has an extent at cpos %u which can no longer be found\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + cpos); + ret = -EROFS; + goto out; + } goto leftright; } out: @@ -5007,7 +5080,6 @@ int ocfs2_split_extent(handle_t *handle, struct buffer_head *last_eb_bh = NULL; struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; struct ocfs2_merge_ctxt ctxt; - struct ocfs2_extent_list *rightmost_el; if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) || ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) < @@ -5017,9 +5089,14 @@ int ocfs2_split_extent(handle_t *handle, goto out; } - ctxt.c_contig_type = ocfs2_figure_merge_contig_type(et, path, el, - split_index, - split_rec); + ret = ocfs2_figure_merge_contig_type(et, path, el, + split_index, + split_rec, + &ctxt); + if (ret) { + mlog_errno(ret); + goto out; + } /* * The core merge / split code wants to know how much room is @@ -5027,8 +5104,6 @@ int ocfs2_split_extent(handle_t *handle, * rightmost extent list. */ if (path->p_tree_depth) { - struct ocfs2_extent_block *eb; - ret = ocfs2_read_extent_block(et->et_ci, ocfs2_et_get_last_eb_blk(et), &last_eb_bh); @@ -5036,11 +5111,7 @@ int ocfs2_split_extent(handle_t *handle, mlog_errno(ret); goto out; } - - eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; - rightmost_el = &eb->h_list; - } else - rightmost_el = path_root_el(path); + } if (rec->e_cpos == split_rec->e_cpos && rec->e_leaf_clusters == split_rec->e_leaf_clusters) @@ -5119,12 +5190,11 @@ int ocfs2_change_extent_flag(handle_t *handle, el = path_leaf_el(left_path); index = ocfs2_search_extent_list(el, cpos); - if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + if (index == -1) { ocfs2_error(sb, - "Owner %llu has an extent at cpos %u which can no " - "longer be found.\n", - (unsigned long long) - ocfs2_metadata_cache_owner(et->et_ci), cpos); + "Owner %llu has an extent at cpos %u which can no longer be found\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + cpos); ret = -EROFS; goto out; } @@ -5133,7 +5203,7 @@ int ocfs2_change_extent_flag(handle_t *handle, rec = &el->l_recs[index]; if (new_flags && (rec->e_flags & new_flags)) { mlog(ML_ERROR, "Owner %llu tried to set %d flags on an " - "extent that already had them", + "extent that already had them\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), new_flags); goto out; @@ -5141,7 +5211,7 @@ int ocfs2_change_extent_flag(handle_t *handle, if (clear_flags && !(rec->e_flags & clear_flags)) { mlog(ML_ERROR, "Owner %llu tried to clear %d flags on an " - "extent that didn't have them", + "extent that didn't have them\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), clear_flags); goto out; @@ -5191,9 +5261,7 @@ int ocfs2_mark_extent_written(struct inode *inode, cpos, len, phys); if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) { - ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents " - "that are being written to, but the feature bit " - "is not set in the super block.", + ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents that are being written to, but the feature bit is not set in the super block\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); ret = -EROFS; goto out; @@ -5292,7 +5360,7 @@ static int ocfs2_truncate_rec(handle_t *handle, { int ret; u32 left_cpos, rec_range, trunc_range; - int wants_rotate = 0, is_rightmost_tree_rec = 0; + int is_rightmost_tree_rec = 0; struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); struct ocfs2_path *left_path = NULL; struct ocfs2_extent_list *el = path_leaf_el(path); @@ -5300,6 +5368,15 @@ static int ocfs2_truncate_rec(handle_t *handle, struct ocfs2_extent_block *eb; if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) { + /* extend credit for ocfs2_remove_rightmost_path */ + ret = ocfs2_extend_rotate_transaction(handle, 0, + jbd2_handle_buffer_credits(handle), + path); + if (ret) { + mlog_errno(ret); + goto out; + } + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); if (ret) { mlog_errno(ret); @@ -5362,8 +5439,8 @@ static int ocfs2_truncate_rec(handle_t *handle, } ret = ocfs2_extend_rotate_transaction(handle, 0, - handle->h_buffer_credits, - path); + jbd2_handle_buffer_credits(handle), + path); if (ret) { mlog_errno(ret); goto out; @@ -5389,7 +5466,6 @@ static int ocfs2_truncate_rec(handle_t *handle, memset(rec, 0, sizeof(*rec)); ocfs2_cleanup_merge(el, index); - wants_rotate = 1; next_free = le16_to_cpu(el->l_next_free_rec); if (is_rightmost_tree_rec && next_free > 1) { @@ -5432,10 +5508,8 @@ static int ocfs2_truncate_rec(handle_t *handle, ocfs2_journal_dirty(handle, path_leaf_bh(path)); ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); - if (ret) { + if (ret) mlog_errno(ret); - goto out; - } out: ocfs2_free_path(left_path); @@ -5475,10 +5549,9 @@ int ocfs2_remove_extent(handle_t *handle, el = path_leaf_el(path); index = ocfs2_search_extent_list(el, cpos); - if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + if (index == -1) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has an extent at cpos %u which can no " - "longer be found.\n", + "Owner %llu has an extent at cpos %u which can no longer be found\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), cpos); ret = -EROFS; @@ -5541,9 +5614,9 @@ int ocfs2_remove_extent(handle_t *handle, el = path_leaf_el(path); index = ocfs2_search_extent_list(el, cpos); - if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + if (index == -1) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu: split at cpos %u lost record.", + "Owner %llu: split at cpos %u lost record\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), cpos); ret = -EROFS; @@ -5559,8 +5632,7 @@ int ocfs2_remove_extent(handle_t *handle, ocfs2_rec_clusters(el, rec); if (rec_range != trunc_range) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu: error after split at cpos %u" - "trunc len %u, existing record is (%u,%u)", + "Owner %llu: error after split at cpos %u trunc len %u, existing record is (%u,%u)\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), cpos, len, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); @@ -5570,10 +5642,8 @@ int ocfs2_remove_extent(handle_t *handle, ret = ocfs2_truncate_rec(handle, et, path, index, dealloc, cpos, len); - if (ret) { + if (ret) mlog_errno(ret); - goto out; - } } out: @@ -5602,7 +5672,7 @@ static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode, *ac = NULL; - num_free_extents = ocfs2_num_free_extents(osb, et); + num_free_extents = ocfs2_num_free_extents(et); if (num_free_extents < 0) { ret = num_free_extents; mlog_errno(ret); @@ -5618,7 +5688,6 @@ static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode, if (ret < 0) { if (ret != -ENOSPC) mlog_errno(ret); - goto out; } } @@ -5637,7 +5706,7 @@ int ocfs2_remove_btree_range(struct inode *inode, struct ocfs2_extent_tree *et, u32 cpos, u32 phys_cpos, u32 len, int flags, struct ocfs2_cached_dealloc_ctxt *dealloc, - u64 refcount_loc) + u64 refcount_loc, bool refcount_tree_locked) { int ret, credits = 0, extra_blocks = 0; u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); @@ -5648,14 +5717,15 @@ int ocfs2_remove_btree_range(struct inode *inode, struct ocfs2_refcount_tree *ref_tree = NULL; if ((flags & OCFS2_EXT_REFCOUNTED) && len) { - BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & - OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!ocfs2_is_refcount_inode(inode)); - ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, - &ref_tree, NULL); - if (ret) { - mlog_errno(ret); - goto bail; + if (!refcount_tree_locked) { + ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, + &ref_tree, NULL); + if (ret) { + mlog_errno(ret); + goto bail; + } } ret = ocfs2_prepare_refcount_change_for_del(inode, @@ -5677,7 +5747,7 @@ int ocfs2_remove_btree_range(struct inode *inode, goto bail; } - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); if (ocfs2_truncate_log_needs_flush(osb)) { ret = __ocfs2_flush_truncate_log(osb); @@ -5712,6 +5782,7 @@ int ocfs2_remove_btree_range(struct inode *inode, } ocfs2_et_update_clusters(et, -len); + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, et->et_root_bh); @@ -5733,7 +5804,7 @@ int ocfs2_remove_btree_range(struct inode *inode, out_commit: ocfs2_commit_trans(osb, handle); out: - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); bail: if (meta_ac) ocfs2_free_alloc_context(meta_ac); @@ -5789,7 +5860,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb, struct ocfs2_dinode *di; struct ocfs2_truncate_log *tl; - BUG_ON(mutex_trylock(&tl_inode->i_mutex)); + BUG_ON(inode_trylock(tl_inode)); start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); @@ -5854,7 +5925,6 @@ bail: } static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, - handle_t *handle, struct inode *data_alloc_inode, struct buffer_head *data_alloc_bh) { @@ -5867,16 +5937,25 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, struct ocfs2_truncate_log *tl; struct inode *tl_inode = osb->osb_tl_inode; struct buffer_head *tl_bh = osb->osb_tl_bh; + handle_t *handle; di = (struct ocfs2_dinode *) tl_bh->b_data; tl = &di->id2.i_dealloc; i = le16_to_cpu(tl->tl_used) - 1; while (i >= 0) { + handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail; + } + /* Caller has given us at least enough credits to * update the truncate log dinode */ status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { + ocfs2_commit_trans(osb, handle); mlog_errno(status); goto bail; } @@ -5885,16 +5964,6 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, ocfs2_journal_dirty(handle, tl_bh); - /* TODO: Perhaps we can calculate the bulk of the - * credits up front rather than extending like - * this. */ - status = ocfs2_extend_trans(handle, - OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); - if (status < 0) { - mlog_errno(status); - goto bail; - } - rec = tl->tl_recs[i]; start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb, le32_to_cpu(rec.t_start)); @@ -5911,10 +5980,13 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, data_alloc_bh, start_blk, num_clusters); if (status < 0) { + ocfs2_commit_trans(osb, handle); mlog_errno(status); goto bail; } } + + ocfs2_commit_trans(osb, handle); i--; } @@ -5924,20 +5996,20 @@ bail: return status; } -/* Expects you to already be holding tl_inode->i_mutex */ +/* Expects you to already be holding tl_inode->i_rwsem */ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) { int status; unsigned int num_to_flush; - handle_t *handle; struct inode *tl_inode = osb->osb_tl_inode; struct inode *data_alloc_inode = NULL; struct buffer_head *tl_bh = osb->osb_tl_bh; struct buffer_head *data_alloc_bh = NULL; struct ocfs2_dinode *di; struct ocfs2_truncate_log *tl; + struct ocfs2_journal *journal = osb->journal; - BUG_ON(mutex_trylock(&tl_inode->i_mutex)); + BUG_ON(inode_trylock(tl_inode)); di = (struct ocfs2_dinode *) tl_bh->b_data; @@ -5956,6 +6028,20 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) goto out; } + /* Appending truncate log(TA) and flushing truncate log(TF) are + * two separated transactions. They can be both committed but not + * checkpointed. If crash occurs then, both two transaction will be + * replayed with several already released to global bitmap clusters. + * Then truncate log will be replayed resulting in cluster double free. + */ + jbd2_journal_lock_updates(journal->j_journal); + status = jbd2_journal_flush(journal->j_journal, 0); + jbd2_journal_unlock_updates(journal->j_journal); + if (status < 0) { + mlog_errno(status); + goto out; + } + data_alloc_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, OCFS2_INVALID_SLOT); @@ -5965,7 +6051,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) goto out; } - mutex_lock(&data_alloc_inode->i_mutex); + inode_lock(data_alloc_inode); status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1); if (status < 0) { @@ -5973,26 +6059,16 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) goto out_mutex; } - handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); - if (IS_ERR(handle)) { - status = PTR_ERR(handle); - mlog_errno(status); - goto out_unlock; - } - - status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode, + status = ocfs2_replay_truncate_records(osb, data_alloc_inode, data_alloc_bh); if (status < 0) mlog_errno(status); - ocfs2_commit_trans(osb, handle); - -out_unlock: brelse(data_alloc_bh); ocfs2_inode_unlock(data_alloc_inode, 1); out_mutex: - mutex_unlock(&data_alloc_inode->i_mutex); + inode_unlock(data_alloc_inode); iput(data_alloc_inode); out: @@ -6004,9 +6080,9 @@ int ocfs2_flush_truncate_log(struct ocfs2_super *osb) int status; struct inode *tl_inode = osb->osb_tl_inode; - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); status = __ocfs2_flush_truncate_log(osb); - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); return status; } @@ -6029,17 +6105,55 @@ static void ocfs2_truncate_log_worker(struct work_struct *work) void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, int cancel) { - if (osb->osb_tl_inode) { + if (osb->osb_tl_inode && + atomic_read(&osb->osb_tl_disable) == 0) { /* We want to push off log flushes while truncates are * still running. */ if (cancel) cancel_delayed_work(&osb->osb_truncate_log_wq); - queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq, + queue_delayed_work(osb->ocfs2_wq, &osb->osb_truncate_log_wq, OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL); } } +/* + * Try to flush truncate logs if we can free enough clusters from it. + * As for return value, "< 0" means error, "0" no space and "1" means + * we have freed enough spaces and let the caller try to allocate again. + */ +int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb, + unsigned int needed) +{ + tid_t target; + int ret = 0; + unsigned int truncated_clusters; + + inode_lock(osb->osb_tl_inode); + truncated_clusters = osb->truncated_clusters; + inode_unlock(osb->osb_tl_inode); + + /* + * Check whether we can succeed in allocating if we free + * the truncate log. + */ + if (truncated_clusters < needed) + goto out; + + ret = ocfs2_flush_truncate_log(osb); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) { + jbd2_log_wait_commit(osb->journal->j_journal, target); + ret = 1; + } +out: + return ret; +} + static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, int slot_num, struct inode **tl_inode, @@ -6048,6 +6162,9 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, int status; struct inode *inode = NULL; struct buffer_head *bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + unsigned int tl_count; inode = ocfs2_get_system_file_inode(osb, TRUNCATE_LOG_SYSTEM_INODE, @@ -6065,6 +6182,18 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, goto bail; } + di = (struct ocfs2_dinode *)bh->b_data; + tl = &di->id2.i_dealloc; + tl_count = le16_to_cpu(tl->tl_count); + if (unlikely(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) || + tl_count == 0)) { + status = -EFSCORRUPTED; + iput(inode); + brelse(bh); + mlog_errno(status); + goto bail; + } + *tl_inode = inode; *tl_bh = bh; bail: @@ -6106,17 +6235,17 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, if (le16_to_cpu(tl->tl_used)) { trace_ocfs2_truncate_log_recovery_num(le16_to_cpu(tl->tl_used)); - *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL); + /* + * Assuming the write-out below goes well, this copy will be + * passed back to recovery for processing. + */ + *tl_copy = kmemdup(tl_bh->b_data, tl_bh->b_size, GFP_KERNEL); if (!(*tl_copy)) { status = -ENOMEM; mlog_errno(status); goto bail; } - /* Assuming the write-out below goes well, this copy - * will be passed back to recovery for processing. */ - memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size); - /* All we need to do to clear the truncate log is set * tl_used. */ tl->tl_used = 0; @@ -6130,11 +6259,10 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, } bail: - if (tl_inode) - iput(tl_inode); + iput(tl_inode); brelse(tl_bh); - if (status < 0 && (*tl_copy)) { + if (status < 0) { kfree(*tl_copy); *tl_copy = NULL; mlog_errno(status); @@ -6165,7 +6293,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, (unsigned long long)le64_to_cpu(tl_copy->i_blkno), num_recs); - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); for(i = 0; i < num_recs; i++) { if (ocfs2_truncate_log_needs_flush(osb)) { status = __ocfs2_flush_truncate_log(osb); @@ -6196,7 +6324,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, } bail_up: - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); return status; } @@ -6206,9 +6334,11 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb) int status; struct inode *tl_inode = osb->osb_tl_inode; + atomic_set(&osb->osb_tl_disable, 1); + if (tl_inode) { cancel_delayed_work(&osb->osb_truncate_log_wq); - flush_workqueue(ocfs2_wq); + flush_workqueue(osb->ocfs2_wq); status = ocfs2_flush_truncate_log(osb); if (status < 0) @@ -6237,6 +6367,7 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb) * until we're sure all is well. */ INIT_DELAYED_WORK(&osb->osb_truncate_log_wq, ocfs2_truncate_log_worker); + atomic_set(&osb->osb_tl_disable, 0); osb->osb_tl_bh = tl_bh; osb->osb_tl_inode = tl_inode; @@ -6300,7 +6431,7 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb, goto out; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = ocfs2_inode_lock(inode, &di_bh, 1); if (ret) { @@ -6308,48 +6439,39 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb, goto out_mutex; } - handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - mlog_errno(ret); - goto out_unlock; - } - while (head) { if (head->free_bg) bg_blkno = head->free_bg; else bg_blkno = ocfs2_which_suballoc_group(head->free_blk, head->free_bit); + handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock; + } + trace_ocfs2_free_cached_blocks( (unsigned long long)head->free_blk, head->free_bit); ret = ocfs2_free_suballoc_bits(handle, inode, di_bh, head->free_bit, bg_blkno, 1); - if (ret) { + if (ret) mlog_errno(ret); - goto out_journal; - } - ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE); - if (ret) { - mlog_errno(ret); - goto out_journal; - } + ocfs2_commit_trans(osb, handle); tmp = head; head = head->free_next; kfree(tmp); } -out_journal: - ocfs2_commit_trans(osb, handle); - out_unlock: ocfs2_inode_unlock(inode, 1); brelse(di_bh); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); iput(inode); out: while(head) { @@ -6393,7 +6515,7 @@ static int ocfs2_free_cached_clusters(struct ocfs2_super *osb, handle_t *handle; int ret = 0; - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); while (head) { if (ocfs2_truncate_log_needs_flush(osb)) { @@ -6425,7 +6547,7 @@ static int ocfs2_free_cached_clusters(struct ocfs2_super *osb, } } - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); while (head) { /* Premature exit may have left some dangling items. */ @@ -6506,6 +6628,154 @@ ocfs2_find_per_slot_free_list(int type, return fl; } +static struct ocfs2_per_slot_free_list * +ocfs2_find_preferred_free_list(int type, + int preferred_slot, + int *real_slot, + struct ocfs2_cached_dealloc_ctxt *ctxt) +{ + struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator; + + while (fl) { + if (fl->f_inode_type == type && fl->f_slot == preferred_slot) { + *real_slot = fl->f_slot; + return fl; + } + + fl = fl->f_next_suballocator; + } + + /* If we can't find any free list matching preferred slot, just use + * the first one. + */ + fl = ctxt->c_first_suballocator; + *real_slot = fl->f_slot; + + return fl; +} + +/* Return Value 1 indicates empty */ +static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et) +{ + struct ocfs2_per_slot_free_list *fl = NULL; + + if (!et->et_dealloc) + return 1; + + fl = et->et_dealloc->c_first_suballocator; + if (!fl) + return 1; + + if (!fl->f_first) + return 1; + + return 0; +} + +/* If extent was deleted from tree due to extent rotation and merging, and + * no metadata is reserved ahead of time. Try to reuse some extents + * just deleted. This is only used to reuse extent blocks. + * It is supposed to find enough extent blocks in dealloc if our estimation + * on metadata is accurate. + */ +static int ocfs2_reuse_blk_from_dealloc(handle_t *handle, + struct ocfs2_extent_tree *et, + struct buffer_head **new_eb_bh, + int blk_wanted, int *blk_given) +{ + int i, status = 0, real_slot; + struct ocfs2_cached_dealloc_ctxt *dealloc; + struct ocfs2_per_slot_free_list *fl; + struct ocfs2_cached_block_free *bf; + struct ocfs2_extent_block *eb; + struct ocfs2_super *osb = + OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); + + *blk_given = 0; + + /* If extent tree doesn't have a dealloc, this is not faulty. Just + * tell upper caller dealloc can't provide any block and it should + * ask for alloc to claim more space. + */ + dealloc = et->et_dealloc; + if (!dealloc) + goto bail; + + for (i = 0; i < blk_wanted; i++) { + /* Prefer to use local slot */ + fl = ocfs2_find_preferred_free_list(EXTENT_ALLOC_SYSTEM_INODE, + osb->slot_num, &real_slot, + dealloc); + /* If no more block can be reused, we should claim more + * from alloc. Just return here normally. + */ + if (!fl) { + status = 0; + break; + } + + bf = fl->f_first; + fl->f_first = bf->free_next; + + new_eb_bh[i] = sb_getblk(osb->sb, bf->free_blk); + if (new_eb_bh[i] == NULL) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + mlog(0, "Reusing block(%llu) from " + "dealloc(local slot:%d, real slot:%d)\n", + bf->free_blk, osb->slot_num, real_slot); + + ocfs2_set_new_buffer_uptodate(et->et_ci, new_eb_bh[i]); + + status = ocfs2_journal_access_eb(handle, et->et_ci, + new_eb_bh[i], + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + memset(new_eb_bh[i]->b_data, 0, osb->sb->s_blocksize); + eb = (struct ocfs2_extent_block *) new_eb_bh[i]->b_data; + + /* We can't guarantee that buffer head is still cached, so + * polutlate the extent block again. + */ + strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); + eb->h_blkno = cpu_to_le64(bf->free_blk); + eb->h_fs_generation = cpu_to_le32(osb->fs_generation); + eb->h_suballoc_slot = cpu_to_le16(real_slot); + eb->h_suballoc_loc = cpu_to_le64(bf->free_bg); + eb->h_suballoc_bit = cpu_to_le16(bf->free_bit); + eb->h_list.l_count = + cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); + + /* We'll also be dirtied by the caller, so + * this isn't absolutely necessary. + */ + ocfs2_journal_dirty(handle, new_eb_bh[i]); + + if (!fl->f_first) { + dealloc->c_first_suballocator = fl->f_next_suballocator; + kfree(fl); + } + kfree(bf); + } + + *blk_given = i; + +bail: + if (unlikely(status < 0)) { + for (i = 0; i < blk_wanted; i++) + brelse(new_eb_bh[i]); + } + + return status; +} + int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, int type, int slot, u64 suballoc, u64 blkno, unsigned int bit) @@ -6561,138 +6831,136 @@ static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh) return 0; } -void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, - unsigned int from, unsigned int to, - struct page *page, int zero, u64 *phys) +void ocfs2_map_and_dirty_folio(struct inode *inode, handle_t *handle, + size_t from, size_t to, struct folio *folio, int zero, + u64 *phys) { int ret, partial = 0; + loff_t start_byte = folio_pos(folio) + from; + loff_t length = to - from; - ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0); + ret = ocfs2_map_folio_blocks(folio, phys, inode, from, to, 0); if (ret) mlog_errno(ret); if (zero) - zero_user_segment(page, from, to); + folio_zero_segment(folio, from, to); /* * Need to set the buffers we zero'd into uptodate * here if they aren't - ocfs2_map_page_blocks() * might've skipped some */ - ret = walk_page_buffers(handle, page_buffers(page), + ret = walk_page_buffers(handle, folio_buffers(folio), from, to, &partial, ocfs2_zero_func); if (ret < 0) mlog_errno(ret); else if (ocfs2_should_order_data(inode)) { - ret = ocfs2_jbd2_file_inode(handle, inode); + ret = ocfs2_jbd2_inode_add_write(handle, inode, + start_byte, length); if (ret < 0) mlog_errno(ret); } if (!partial) - SetPageUptodate(page); + folio_mark_uptodate(folio); - flush_dcache_page(page); + flush_dcache_folio(folio); } -static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start, - loff_t end, struct page **pages, - int numpages, u64 phys, handle_t *handle) +static void ocfs2_zero_cluster_folios(struct inode *inode, loff_t start, + loff_t end, struct folio **folios, int numfolios, + u64 phys, handle_t *handle) { int i; - struct page *page; - unsigned int from, to = PAGE_CACHE_SIZE; struct super_block *sb = inode->i_sb; BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); - if (numpages == 0) + if (numfolios == 0) goto out; - to = PAGE_CACHE_SIZE; - for(i = 0; i < numpages; i++) { - page = pages[i]; - - from = start & (PAGE_CACHE_SIZE - 1); - if ((end >> PAGE_CACHE_SHIFT) == page->index) - to = end & (PAGE_CACHE_SIZE - 1); + for (i = 0; i < numfolios; i++) { + struct folio *folio = folios[i]; + size_t to = folio_size(folio); + size_t from = offset_in_folio(folio, start); - BUG_ON(from > PAGE_CACHE_SIZE); - BUG_ON(to > PAGE_CACHE_SIZE); + if (to > end - folio_pos(folio)) + to = end - folio_pos(folio); - ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1, - &phys); + ocfs2_map_and_dirty_folio(inode, handle, from, to, folio, 1, + &phys); - start = (page->index + 1) << PAGE_CACHE_SHIFT; + start = folio_next_pos(folio); } out: - if (pages) - ocfs2_unlock_and_free_pages(pages, numpages); + if (folios) + ocfs2_unlock_and_free_folios(folios, numfolios); } -int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end, - struct page **pages, int *num) +static int ocfs2_grab_folios(struct inode *inode, loff_t start, loff_t end, + struct folio **folios, int *num) { - int numpages, ret = 0; + int numfolios, ret = 0; struct address_space *mapping = inode->i_mapping; unsigned long index; loff_t last_page_bytes; BUG_ON(start > end); - numpages = 0; + numfolios = 0; last_page_bytes = PAGE_ALIGN(end); - index = start >> PAGE_CACHE_SHIFT; + index = start >> PAGE_SHIFT; do { - pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS); - if (!pages[numpages]) { - ret = -ENOMEM; + folios[numfolios] = __filemap_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_NOFS); + if (IS_ERR(folios[numfolios])) { + ret = PTR_ERR(folios[numfolios]); mlog_errno(ret); + folios[numfolios] = NULL; goto out; } - numpages++; - index++; - } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT)); + index = folio_next_index(folios[numfolios]); + numfolios++; + } while (index < (last_page_bytes >> PAGE_SHIFT)); out: if (ret != 0) { - if (pages) - ocfs2_unlock_and_free_pages(pages, numpages); - numpages = 0; + ocfs2_unlock_and_free_folios(folios, numfolios); + numfolios = 0; } - *num = numpages; + *num = numfolios; return ret; } -static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end, - struct page **pages, int *num) +static int ocfs2_grab_eof_folios(struct inode *inode, loff_t start, loff_t end, + struct folio **folios, int *num) { struct super_block *sb = inode->i_sb; BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits != (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits); - return ocfs2_grab_pages(inode, start, end, pages, num); + return ocfs2_grab_folios(inode, start, end, folios, num); } /* - * Zero the area past i_size but still within an allocated - * cluster. This avoids exposing nonzero data on subsequent file - * extends. + * Zero partial cluster for a hole punch or truncate. This avoids exposing + * nonzero data on subsequent file extends. * * We need to call this before i_size is updated on the inode because - * otherwise block_write_full_page() will skip writeout of pages past - * i_size. The new_i_size parameter is passed for this reason. + * otherwise block_write_full_folio() will skip writeout of pages past + * i_size. */ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, u64 range_start, u64 range_end) { - int ret = 0, numpages; - struct page **pages = NULL; + int ret = 0, numfolios; + struct folio **folios = NULL; u64 phys; unsigned int ext_flags; struct super_block *sb = inode->i_sb; @@ -6704,17 +6972,23 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, if (!ocfs2_sparse_alloc(OCFS2_SB(sb))) return 0; - pages = kcalloc(ocfs2_pages_per_cluster(sb), - sizeof(struct page *), GFP_NOFS); - if (pages == NULL) { + /* + * Avoid zeroing folios fully beyond current i_size. It is pointless as + * underlying blocks of those folios should be already zeroed out and + * page writeback will skip them anyway. + */ + range_end = min_t(u64, range_end, i_size_read(inode)); + if (range_start >= range_end) + return 0; + + folios = kcalloc(ocfs2_pages_per_cluster(sb), + sizeof(struct folio *), GFP_NOFS); + if (folios == NULL) { ret = -ENOMEM; mlog_errno(ret); goto out; } - if (range_start == range_end) - goto out; - ret = ocfs2_extent_map_get_blocks(inode, range_start >> sb->s_blocksize_bits, &phys, NULL, &ext_flags); @@ -6730,18 +7004,18 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN) goto out; - ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages, - &numpages); + ret = ocfs2_grab_eof_folios(inode, range_start, range_end, folios, + &numfolios); if (ret) { mlog_errno(ret); goto out; } - ocfs2_zero_cluster_pages(inode, range_start, range_end, pages, - numpages, phys, handle); + ocfs2_zero_cluster_folios(inode, range_start, range_end, folios, + numfolios, phys, handle); /* - * Initiate writeout of the pages we zero'd here. We don't + * Initiate writeout of the folios we zero'd here. We don't * wait on them - the truncate_inode_pages() call later will * do that for us. */ @@ -6751,7 +7025,7 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, mlog_errno(ret); out: - kfree(pages); + kfree(folios); return ret; } @@ -6804,29 +7078,22 @@ void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di) int ocfs2_convert_inline_data_to_extents(struct inode *inode, struct buffer_head *di_bh) { - int ret, i, has_data, num_pages = 0; + int ret, has_data, num_folios = 0; + int need_free = 0; + u32 bit_off, num; handle_t *handle; - u64 uninitialized_var(block); + u64 block; struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct ocfs2_alloc_context *data_ac = NULL; - struct page **pages = NULL; - loff_t end = osb->s_clustersize; + struct folio *folio = NULL; struct ocfs2_extent_tree et; int did_quota = 0; has_data = i_size_read(inode) ? 1 : 0; if (has_data) { - pages = kcalloc(ocfs2_pages_per_cluster(osb->sb), - sizeof(struct page *), GFP_NOFS); - if (pages == NULL) { - ret = -ENOMEM; - mlog_errno(ret); - goto out; - } - ret = ocfs2_reserve_clusters(osb, 1, &data_ac); if (ret) { mlog_errno(ret); @@ -6839,7 +7106,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, if (IS_ERR(handle)) { ret = PTR_ERR(handle); mlog_errno(ret); - goto out_unlock; + goto out; } ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, @@ -6850,8 +7117,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, } if (has_data) { - u32 bit_off, num; - unsigned int page_end; + unsigned int page_end = min_t(unsigned, PAGE_SIZE, + osb->s_clustersize); u64 phys; ret = dquot_alloc_space_nodirty(inode, @@ -6860,7 +7127,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, goto out_commit; did_quota = 1; - data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; + data_ac->ac_resv = &oi->ip_la_data_resv; ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &num); @@ -6871,21 +7138,15 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, /* * Save two copies, one for insert, and one that can - * be changed by ocfs2_map_and_dirty_page() below. + * be changed by ocfs2_map_and_dirty_folio() below. */ block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off); - /* - * Non sparse file systems zero on extend, so no need - * to do that now. - */ - if (!ocfs2_sparse_alloc(osb) && - PAGE_CACHE_SIZE < osb->s_clustersize) - end = PAGE_CACHE_SIZE; - - ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages); + ret = ocfs2_grab_eof_folios(inode, 0, page_end, &folio, + &num_folios); if (ret) { mlog_errno(ret); + need_free = 1; goto out_commit; } @@ -6893,19 +7154,15 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, * This should populate the 1st page for us and mark * it up to date. */ - ret = ocfs2_read_inline_data(inode, pages[0], di_bh); + ret = ocfs2_read_inline_data(inode, folio, di_bh); if (ret) { mlog_errno(ret); - goto out_commit; + need_free = 1; + goto out_unlock; } - page_end = PAGE_CACHE_SIZE; - if (PAGE_CACHE_SIZE > osb->s_clustersize) - page_end = osb->s_clustersize; - - for (i = 0; i < num_pages; i++) - ocfs2_map_and_dirty_page(inode, handle, 0, page_end, - pages[i], i > 0, &phys); + ocfs2_map_and_dirty_folio(inode, handle, 0, page_end, folio, 0, + &phys); } spin_lock(&oi->ip_lock); @@ -6913,6 +7170,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); spin_unlock(&oi->ip_lock); + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_dinode_new_extent_list(inode, di); ocfs2_journal_dirty(handle, di_bh); @@ -6927,29 +7185,39 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL); if (ret) { mlog_errno(ret); - goto out_commit; + need_free = 1; + goto out_unlock; } inode->i_blocks = ocfs2_inode_sector_count(inode); } +out_unlock: + if (folio) + ocfs2_unlock_and_free_folios(&folio, num_folios); + out_commit: if (ret < 0 && did_quota) dquot_free_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, 1)); + if (need_free) { + if (data_ac->ac_which == OCFS2_AC_USE_LOCAL) + ocfs2_free_local_alloc_bits(osb, handle, data_ac, + bit_off, num); + else + ocfs2_free_clusters(handle, + data_ac->ac_inode, + data_ac->ac_bh, + ocfs2_clusters_to_blocks(osb->sb, bit_off), + num); + } + ocfs2_commit_trans(osb, handle); -out_unlock: +out: if (data_ac) ocfs2_free_alloc_context(data_ac); - -out: - if (pages) { - ocfs2_unlock_and_free_pages(pages, num_pages); - kfree(pages); - } - return ret; } @@ -6974,6 +7242,7 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb, u64 refcount_loc = le64_to_cpu(di->i_refcount_loc); struct ocfs2_extent_tree et; struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_refcount_tree *ref_tree = NULL; ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); ocfs2_init_dealloc_ctxt(&dealloc); @@ -7047,15 +7316,23 @@ start: * to check it up here before changing the tree. */ if (root_el->l_tree_depth && rec->e_int_clusters == 0) { - ocfs2_error(inode->i_sb, "Inode %lu has an empty " + mlog(ML_ERROR, "Inode %lu has an empty " "extent record, depth %u\n", inode->i_ino, le16_to_cpu(root_el->l_tree_depth)); - status = -EROFS; - goto bail; + status = ocfs2_remove_rightmost_empty_extent(osb, + &et, path, &dealloc); + if (status) { + mlog_errno(status); + goto bail; + } + + ocfs2_reinit_path(path, 1); + goto start; + } else { + trunc_cpos = le32_to_cpu(rec->e_cpos); + trunc_len = 0; + blkno = 0; } - trunc_cpos = le32_to_cpu(rec->e_cpos); - trunc_len = 0; - blkno = 0; } else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) { /* * Truncate entire record. @@ -7083,9 +7360,18 @@ start: phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno); + if ((flags & OCFS2_EXT_REFCOUNTED) && trunc_len && !ref_tree) { + status = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, + &ref_tree, NULL); + if (status) { + mlog_errno(status); + goto bail; + } + } + status = ocfs2_remove_btree_range(inode, &et, trunc_cpos, phys_cpos, trunc_len, flags, &dealloc, - refcount_loc); + refcount_loc, true); if (status < 0) { mlog_errno(status); goto bail; @@ -7100,6 +7386,8 @@ start: goto start; bail: + if (ref_tree) + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); ocfs2_schedule_truncate_log_flush(osb, 1); @@ -7123,17 +7411,20 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct ocfs2_inline_data *idata = &di->id2.i_data; + /* No need to punch hole beyond i_size. */ + if (start >= i_size_read(inode)) + return 0; + if (end > i_size_read(inode)) end = i_size_read(inode); - BUG_ON(start >= end); + BUG_ON(start > end); if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) || !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) || !ocfs2_supports_inline_data(osb)) { ocfs2_error(inode->i_sb, - "Inline data flags for inode %llu don't agree! " - "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n", + "Inline data flags for inode %llu don't agree! Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, le16_to_cpu(di->i_dyn_features), OCFS2_I(inode)->ip_dyn_features, @@ -7162,7 +7453,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, /* * No need to worry about the data page here - it's been * truncated already and inline data doesn't need it for - * pushing zero's to disk, so we'll let readpage pick it up + * pushing zero's to disk, so we'll let read_folio pick it up * later. */ if (trunc) { @@ -7171,11 +7462,12 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, } inode->i_blocks = ocfs2_inode_sector_count(inode); - inode->i_ctime = inode->i_mtime = CURRENT_TIME; + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); - di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); - di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(inode)); + di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, di_bh); out_commit: @@ -7187,13 +7479,24 @@ out: static int ocfs2_trim_extent(struct super_block *sb, struct ocfs2_group_desc *gd, - u32 start, u32 count) + u64 group, u32 start, u32 count) { u64 discard, bcount; + struct ocfs2_super *osb = OCFS2_SB(sb); bcount = ocfs2_clusters_to_blocks(sb, count); - discard = le64_to_cpu(gd->bg_blkno) + - ocfs2_clusters_to_blocks(sb, start); + discard = ocfs2_clusters_to_blocks(sb, start); + + /* + * For the first cluster group, the gd->bg_blkno is not at the start + * of the group, but at an offset from the start. If we add it while + * calculating discard for first group, we will wrongly start fstrim a + * few blocks after the desried start block and the range can cross + * over into the next cluster group. So, add it only if this is not + * the first cluster group. + */ + if (group != osb->first_cluster_group_blkno) + discard += le64_to_cpu(gd->bg_blkno); trace_ocfs2_trim_extent(sb, (unsigned long long)discard, bcount); @@ -7201,7 +7504,7 @@ static int ocfs2_trim_extent(struct super_block *sb, } static int ocfs2_trim_group(struct super_block *sb, - struct ocfs2_group_desc *gd, + struct ocfs2_group_desc *gd, u64 group, u32 start, u32 max, u32 minbits) { int ret = 0, count = 0, next; @@ -7220,7 +7523,7 @@ static int ocfs2_trim_group(struct super_block *sb, next = ocfs2_find_next_bit(bitmap, max, start); if ((next - start) >= minbits) { - ret = ocfs2_trim_extent(sb, gd, + ret = ocfs2_trim_extent(sb, gd, group, start, next - start); if (ret < 0) { mlog_errno(ret); @@ -7245,10 +7548,11 @@ static int ocfs2_trim_group(struct super_block *sb, return count; } -int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) +static +int ocfs2_trim_mainbm(struct super_block *sb, struct fstrim_range *range) { struct ocfs2_super *osb = OCFS2_SB(sb); - u64 start, len, trimmed, first_group, last_group, group; + u64 start, len, trimmed = 0, first_group, last_group = 0, group = 0; int ret, cnt; u32 first_bit, last_bit, minlen; struct buffer_head *main_bm_bh = NULL; @@ -7260,16 +7564,13 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) start = range->start >> osb->s_clustersize_bits; len = range->len >> osb->s_clustersize_bits; minlen = range->minlen >> osb->s_clustersize_bits; - trimmed = 0; - if (!len) { - range->len = 0; - return 0; - } - - if (minlen >= osb->bitmap_cpg) + if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize) return -EINVAL; + trace_ocfs2_trim_mainbm(start, len, minlen); + +next_group: main_bm_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, OCFS2_INVALID_SLOT); @@ -7279,7 +7580,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0); if (ret < 0) { @@ -7288,26 +7589,34 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) } main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data; - if (start >= le32_to_cpu(main_bm->i_clusters)) { - ret = -EINVAL; - goto out_unlock; - } - - if (start + len > le32_to_cpu(main_bm->i_clusters)) - len = le32_to_cpu(main_bm->i_clusters) - start; + /* + * Do some check before trim the first group. + */ + if (!group) { + if (start >= le32_to_cpu(main_bm->i_clusters)) { + ret = -EINVAL; + goto out_unlock; + } - trace_ocfs2_trim_fs(start, len, minlen); + if (start + len > le32_to_cpu(main_bm->i_clusters)) + len = le32_to_cpu(main_bm->i_clusters) - start; - /* Determine first and last group to examine based on start and len */ - first_group = ocfs2_which_cluster_group(main_bm_inode, start); - if (first_group == osb->first_cluster_group_blkno) - first_bit = start; - else - first_bit = start - ocfs2_blocks_to_clusters(sb, first_group); - last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1); - last_bit = osb->bitmap_cpg; + /* + * Determine first and last group to examine based on + * start and len + */ + first_group = ocfs2_which_cluster_group(main_bm_inode, start); + if (first_group == osb->first_cluster_group_blkno) + first_bit = start; + else + first_bit = start - ocfs2_blocks_to_clusters(sb, + first_group); + last_group = ocfs2_which_cluster_group(main_bm_inode, + start + len - 1); + group = first_group; + } - for (group = first_group; group <= last_group;) { + do { if (first_bit + len >= osb->bitmap_cpg) last_bit = osb->bitmap_cpg; else @@ -7322,7 +7631,8 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) } gd = (struct ocfs2_group_desc *)gd_bh->b_data; - cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen); + cnt = ocfs2_trim_group(sb, gd, group, + first_bit, last_bit, minlen); brelse(gd_bh); gd_bh = NULL; if (cnt < 0) { @@ -7338,14 +7648,83 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); else group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); - } - range->len = trimmed * sb->s_blocksize; + } while (0); + out_unlock: ocfs2_inode_unlock(main_bm_inode, 0); brelse(main_bm_bh); + main_bm_bh = NULL; out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); iput(main_bm_inode); + + /* + * If all the groups trim are not done or failed, but we should release + * main_bm related locks for avoiding the current IO starve, then go to + * trim the next group + */ + if (ret >= 0 && group <= last_group) { + cond_resched(); + goto next_group; + } +out: + range->len = trimmed * osb->s_clustersize; + return ret; +} + +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(sb); + struct ocfs2_trim_fs_info info, *pinfo = NULL; + + ocfs2_trim_fs_lock_res_init(osb); + + trace_ocfs2_trim_fs(range->start, range->len, range->minlen); + + ret = ocfs2_trim_fs_lock(osb, NULL, 1); + if (ret < 0) { + if (ret != -EAGAIN) { + mlog_errno(ret); + ocfs2_trim_fs_lock_res_uninit(osb); + return ret; + } + + mlog(ML_NOTICE, "Wait for trim on device (%s) to " + "finish, which is running from another node.\n", + osb->dev_str); + ret = ocfs2_trim_fs_lock(osb, &info, 0); + if (ret < 0) { + mlog_errno(ret); + ocfs2_trim_fs_lock_res_uninit(osb); + return ret; + } + + if (info.tf_valid && info.tf_success && + info.tf_start == range->start && + info.tf_len == range->len && + info.tf_minlen == range->minlen) { + /* Avoid sending duplicated trim to a shared device */ + mlog(ML_NOTICE, "The same trim on device (%s) was " + "just done from node (%u), return.\n", + osb->dev_str, info.tf_nodenum); + range->len = info.tf_trimlen; + goto out; + } + } + + info.tf_nodenum = osb->node_num; + info.tf_start = range->start; + info.tf_len = range->len; + info.tf_minlen = range->minlen; + + ret = ocfs2_trim_mainbm(sb, range); + + info.tf_trimlen = range->len; + info.tf_success = (ret < 0 ? 0 : 1); + pinfo = &info; out: + ocfs2_trim_fs_unlock(osb, pinfo); + ocfs2_trim_fs_lock_res_uninit(osb); return ret; } diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index ca381c584127..1c0c83362904 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -1,26 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * alloc.h * * Function prototypes * * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #ifndef OCFS2_ALLOC_H @@ -54,13 +38,14 @@ */ struct ocfs2_extent_tree_operations; struct ocfs2_extent_tree { - struct ocfs2_extent_tree_operations *et_ops; + const struct ocfs2_extent_tree_operations *et_ops; struct buffer_head *et_root_bh; struct ocfs2_extent_list *et_root_el; struct ocfs2_caching_info *et_ci; ocfs2_journal_access_func et_root_journal_access; void *et_object; unsigned int et_max_leaf_clusters; + struct ocfs2_cached_dealloc_ctxt *et_dealloc; }; /* @@ -142,10 +127,9 @@ int ocfs2_remove_btree_range(struct inode *inode, struct ocfs2_extent_tree *et, u32 cpos, u32 phys_cpos, u32 len, int flags, struct ocfs2_cached_dealloc_ctxt *dealloc, - u64 refcount_loc); + u64 refcount_loc, bool refcount_tree_locked); -int ocfs2_num_free_extents(struct ocfs2_super *osb, - struct ocfs2_extent_tree *et); +int ocfs2_num_free_extents(struct ocfs2_extent_tree *et); /* * how many new metadata chunks would an allocation need at maximum? @@ -188,6 +172,8 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb, u64 start_blk, unsigned int num_clusters); int __ocfs2_flush_truncate_log(struct ocfs2_super *osb); +int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb, + unsigned int needed); /* * Process local structure which describes the block unlinks done @@ -268,11 +254,9 @@ static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec) return !rec->e_leaf_clusters; } -int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end, - struct page **pages, int *num); -void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, - unsigned int from, unsigned int to, - struct page *page, int zero, u64 *phys); +void ocfs2_map_and_dirty_folio(struct inode *inode, handle_t *handle, + size_t from, size_t to, struct folio *folio, int zero, + u64 *phys); /* * Structures which describe a path through a btree, and functions to * manipulate them. diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 79736a28d84f..76c86f1c2b1c 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1,22 +1,6 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/fs.h> @@ -25,9 +9,11 @@ #include <linux/pagemap.h> #include <asm/byteorder.h> #include <linux/swap.h> -#include <linux/pipe_fs_i.h> #include <linux/mpage.h> #include <linux/quotaops.h> +#include <linux/blkdev.h> +#include <linux/uio.h> +#include <linux/mm.h> #include <cluster/masklog.h> @@ -47,6 +33,9 @@ #include "ocfs2_trace.h" #include "buffer_head_io.h" +#include "dir.h" +#include "namei.h" +#include "sysfile.h" static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) @@ -57,7 +46,6 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh = NULL; struct buffer_head *buffer_cache_bh = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - void *kaddr; trace_ocfs2_symlink_get_block( (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -80,6 +68,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb, le32_to_cpu(fe->i_clusters))) { + err = -ENOMEM; mlog(ML_ERROR, "block offset is outside the allocated size: " "%llu\n", (unsigned long long)iblock); goto bail; @@ -92,6 +81,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, iblock; buffer_cache_bh = sb_getblk(osb->sb, blkno); if (!buffer_cache_bh) { + err = -ENOMEM; mlog(ML_ERROR, "couldn't getblock for symlink!\n"); goto bail; } @@ -100,17 +90,11 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, * could've happened. Since we've got a reference on * the bh, even if it commits while we're doing the * copy, the data is still good. */ - if (buffer_jbd(buffer_cache_bh) - && ocfs2_inode_is_new(inode)) { - kaddr = kmap_atomic(bh_result->b_page); - if (!kaddr) { - mlog(ML_ERROR, "couldn't kmap!\n"); - goto bail; - } - memcpy(kaddr + (bh_result->b_size * iblock), - buffer_cache_bh->b_data, - bh_result->b_size); - kunmap_atomic(kaddr); + if (buffer_jbd(buffer_cache_bh) && ocfs2_inode_is_new(inode)) { + memcpy_to_folio(bh_result->b_folio, + bh_result->b_size * iblock, + buffer_cache_bh->b_data, + bh_result->b_size); set_buffer_uptodate(bh_result); } brelse(buffer_cache_bh); @@ -127,6 +111,19 @@ bail: return err; } +static int ocfs2_lock_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + down_read(&oi->ip_alloc_sem); + ret = ocfs2_get_block(inode, iblock, bh_result, create); + up_read(&oi->ip_alloc_sem); + + return ret; +} + int ocfs2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { @@ -152,9 +149,8 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock, err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &count, &ext_flags); if (err) { - mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " - "%llu, NULL)\n", err, inode, (unsigned long long)iblock, - (unsigned long long)p_blkno); + mlog(ML_ERROR, "get_blocks() failed, inode: 0x%p, " + "block: %llu\n", inode, (unsigned long long)iblock); goto bail; } @@ -212,49 +208,41 @@ bail: return err; } -int ocfs2_read_inline_data(struct inode *inode, struct page *page, +int ocfs2_read_inline_data(struct inode *inode, struct folio *folio, struct buffer_head *di_bh) { - void *kaddr; loff_t size; struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) { - ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag", + ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); return -EROFS; } size = i_size_read(inode); - if (size > PAGE_CACHE_SIZE || + if (size > folio_size(folio) || size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) { ocfs2_error(inode->i_sb, - "Inode %llu has with inline data has bad size: %Lu", + "Inode %llu has with inline data has bad size: %Lu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)size); return -EROFS; } - kaddr = kmap_atomic(page); - if (size) - memcpy(kaddr, di->id2.i_data.id_data, size); - /* Clear the remaining part of the page */ - memset(kaddr + size, 0, PAGE_CACHE_SIZE - size); - flush_dcache_page(page); - kunmap_atomic(kaddr); - - SetPageUptodate(page); + folio_fill_tail(folio, 0, di->id2.i_data.id_data, size); + folio_mark_uptodate(folio); return 0; } -static int ocfs2_readpage_inline(struct inode *inode, struct page *page) +static int ocfs2_readpage_inline(struct inode *inode, struct folio *folio) { int ret; struct buffer_head *di_bh = NULL; - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)); ret = ocfs2_read_inode_block(inode, &di_bh); @@ -263,25 +251,24 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page) goto out; } - ret = ocfs2_read_inline_data(inode, page, di_bh); + ret = ocfs2_read_inline_data(inode, folio, di_bh); out: - unlock_page(page); + folio_unlock(folio); brelse(di_bh); return ret; } -static int ocfs2_readpage(struct file *file, struct page *page) +static int ocfs2_read_folio(struct file *file, struct folio *folio) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct ocfs2_inode_info *oi = OCFS2_I(inode); - loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT; + loff_t start = folio_pos(folio); int ret, unlock = 1; - trace_ocfs2_readpage((unsigned long long)oi->ip_blkno, - (page ? page->index : 0)); + trace_ocfs2_readpage((unsigned long long)oi->ip_blkno, folio->index); - ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page); + ret = ocfs2_inode_lock_with_folio(inode, NULL, 0, folio); if (ret != 0) { if (ret == AOP_TRUNCATED_PAGE) unlock = 0; @@ -291,11 +278,11 @@ static int ocfs2_readpage(struct file *file, struct page *page) if (down_read_trylock(&oi->ip_alloc_sem) == 0) { /* - * Unlock the page and cycle ip_alloc_sem so that we don't + * Unlock the folio and cycle ip_alloc_sem so that we don't * busyloop waiting for ip_alloc_sem to unlock */ ret = AOP_TRUNCATED_PAGE; - unlock_page(page); + folio_unlock(folio); unlock = 0; down_read(&oi->ip_alloc_sem); up_read(&oi->ip_alloc_sem); @@ -303,35 +290,35 @@ static int ocfs2_readpage(struct file *file, struct page *page) } /* - * i_size might have just been updated as we grabed the meta lock. We + * i_size might have just been updated as we grabbed the meta lock. We * might now be discovering a truncate that hit on another node. - * block_read_full_page->get_block freaks out if it is asked to read + * block_read_full_folio->get_block freaks out if it is asked to read * beyond the end of a file, so we check here. Callers * (generic_file_read, vm_ops->fault) are clever enough to check i_size - * and notice that the page they just read isn't needed. + * and notice that the folio they just read isn't needed. * * XXX sys_readahead() seems to get that wrong? */ if (start >= i_size_read(inode)) { - zero_user(page, 0, PAGE_SIZE); - SetPageUptodate(page); + folio_zero_segment(folio, 0, folio_size(folio)); + folio_mark_uptodate(folio); ret = 0; goto out_alloc; } if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) - ret = ocfs2_readpage_inline(inode, page); + ret = ocfs2_readpage_inline(inode, folio); else - ret = block_read_full_page(page, ocfs2_get_block); + ret = block_read_full_folio(folio, ocfs2_get_block); unlock = 0; out_alloc: - up_read(&OCFS2_I(inode)->ip_alloc_sem); + up_read(&oi->ip_alloc_sem); out_inode_unlock: ocfs2_inode_unlock(inode, 0); out: if (unlock) - unlock_page(page); + folio_unlock(folio); return ret; } @@ -344,14 +331,11 @@ out: * grow out to a tree. If need be, detecting boundary extents could * trivially be added in a future version of ocfs2_get_block(). */ -static int ocfs2_readpages(struct file *filp, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void ocfs2_readahead(struct readahead_control *rac) { - int ret, err = -EIO; - struct inode *inode = mapping->host; + int ret; + struct inode *inode = rac->mapping->host; struct ocfs2_inode_info *oi = OCFS2_I(inode); - loff_t start; - struct page *last; /* * Use the nonblocking flag for the dlm code to avoid page @@ -359,56 +343,48 @@ static int ocfs2_readpages(struct file *filp, struct address_space *mapping, */ ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK); if (ret) - return err; + return; - if (down_read_trylock(&oi->ip_alloc_sem) == 0) { - ocfs2_inode_unlock(inode, 0); - return err; - } + if (down_read_trylock(&oi->ip_alloc_sem) == 0) + goto out_unlock; /* * Don't bother with inline-data. There isn't anything * to read-ahead in that case anyway... */ if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) - goto out_unlock; + goto out_up; /* * Check whether a remote node truncated this file - we just * drop out in that case as it's not worth handling here. */ - last = list_entry(pages->prev, struct page, lru); - start = (loff_t)last->index << PAGE_CACHE_SHIFT; - if (start >= i_size_read(inode)) - goto out_unlock; + if (readahead_pos(rac) >= i_size_read(inode)) + goto out_up; - err = mpage_readpages(mapping, pages, nr_pages, ocfs2_get_block); + mpage_readahead(rac, ocfs2_get_block); -out_unlock: +out_up: up_read(&oi->ip_alloc_sem); +out_unlock: ocfs2_inode_unlock(inode, 0); - - return err; } /* Note: Because we don't support holes, our allocation has * already happened (allocation writes zeros to the file data) * so we don't have to worry about ordered writes in - * ocfs2_writepage. + * ocfs2_writepages. * - * ->writepage is called during the process of invalidating the page cache + * ->writepages is called during the process of invalidating the page cache * during blocked lock processing. It can't block on any cluster locks * to during block mapping. It's relying on the fact that the block * mapping can't have disappeared under the dirty pages that it is * being asked to write back. */ -static int ocfs2_writepage(struct page *page, struct writeback_control *wbc) +static int ocfs2_writepages(struct address_space *mapping, + struct writeback_control *wbc) { - trace_ocfs2_writepage( - (unsigned long long)OCFS2_I(page->mapping->host)->ip_blkno, - page->index); - - return block_write_full_page(page, ocfs2_get_block, wbc); + return mpage_writepages(mapping, wbc, ocfs2_get_block); } /* Taken from ext3. We don't necessarily need the full blown @@ -457,6 +433,15 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) trace_ocfs2_bmap((unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)block); + /* + * The swap code (ab-)uses ->bmap to get a block mapping and then + * bypasseÑ• the file system for actual I/O. We really can't allow + * that on refcounted inodes, so we have to skip out here. And yes, + * 0 is the magic code for a bmap error.. + */ + if (ocfs2_is_refcount_inode(inode)) + return 0; + /* We don't need to lock journal system files, since they aren't * accessed concurrently from multiple nodes. */ @@ -492,158 +477,11 @@ bail: return status; } -/* - * TODO: Make this into a generic get_blocks function. - * - * From do_direct_io in direct-io.c: - * "So what we do is to permit the ->get_blocks function to populate - * bh.b_size with the size of IO which is permitted at this offset and - * this i_blkbits." - * - * This function is called directly from get_more_blocks in direct-io.c. - * - * called like this: dio->get_blocks(dio->inode, fs_startblk, - * fs_count, map_bh, dio->rw == WRITE); - * - * Note that we never bother to allocate blocks here, and thus ignore the - * create argument. - */ -static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +static bool ocfs2_release_folio(struct folio *folio, gfp_t wait) { - int ret; - u64 p_blkno, inode_blocks, contig_blocks; - unsigned int ext_flags; - unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; - unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; - - /* This function won't even be called if the request isn't all - * nicely aligned and of the right size, so there's no need - * for us to check any of that. */ - - inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); - - /* This figures out the size of the next contiguous block, and - * our logical offset */ - ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, - &contig_blocks, &ext_flags); - if (ret) { - mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", - (unsigned long long)iblock); - ret = -EIO; - goto bail; - } - - /* We should already CoW the refcounted extent in case of create. */ - BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED)); - - /* - * get_more_blocks() expects us to describe a hole by clearing - * the mapped bit on bh_result(). - * - * Consider an unwritten extent as a hole. - */ - if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) - map_bh(bh_result, inode->i_sb, p_blkno); - else - clear_buffer_mapped(bh_result); - - /* make sure we don't map more than max_blocks blocks here as - that's all the kernel will handle at this point. */ - if (max_blocks < contig_blocks) - contig_blocks = max_blocks; - bh_result->b_size = contig_blocks << blocksize_bits; -bail: - return ret; -} - -/* - * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're - * particularly interested in the aio/dio case. We use the rw_lock DLM lock - * to protect io on one node from truncation on another. - */ -static void ocfs2_dio_end_io(struct kiocb *iocb, - loff_t offset, - ssize_t bytes, - void *private, - int ret, - bool is_async) -{ - struct inode *inode = file_inode(iocb->ki_filp); - int level; - wait_queue_head_t *wq = ocfs2_ioend_wq(inode); - - /* this io's submitter should not have unlocked this before we could */ - BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); - - if (ocfs2_iocb_is_sem_locked(iocb)) - ocfs2_iocb_clear_sem_locked(iocb); - - if (ocfs2_iocb_is_unaligned_aio(iocb)) { - ocfs2_iocb_clear_unaligned_aio(iocb); - - if (atomic_dec_and_test(&OCFS2_I(inode)->ip_unaligned_aio) && - waitqueue_active(wq)) { - wake_up_all(wq); - } - } - - ocfs2_iocb_clear_rw_locked(iocb); - - level = ocfs2_iocb_rw_locked_level(iocb); - ocfs2_rw_unlock(inode, level); - - inode_dio_done(inode); - if (is_async) - aio_complete(iocb, ret, 0); -} - -/* - * ocfs2_invalidatepage() and ocfs2_releasepage() are shamelessly stolen - * from ext3. PageChecked() bits have been removed as OCFS2 does not - * do journalled data. - */ -static void ocfs2_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) -{ - journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; - - jbd2_journal_invalidatepage(journal, page, offset, length); -} - -static int ocfs2_releasepage(struct page *page, gfp_t wait) -{ - journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; - - if (!page_has_buffers(page)) - return 0; - return jbd2_journal_try_to_free_buffers(journal, page, wait); -} - -static ssize_t ocfs2_direct_IO(int rw, - struct kiocb *iocb, - const struct iovec *iov, - loff_t offset, - unsigned long nr_segs) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file_inode(file)->i_mapping->host; - - /* - * Fallback to buffered I/O if we see an inode without - * extents. - */ - if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) - return 0; - - /* Fallback to buffered I/O if we are appending. */ - if (i_size_read(inode) <= offset) - return 0; - - return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, - iov, offset, nr_segs, - ocfs2_direct_IO_get_blocks, - ocfs2_dio_end_io, NULL, 0); + if (!folio_buffers(folio)) + return false; + return try_to_free_buffers(folio); } static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, @@ -651,12 +489,12 @@ static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, unsigned int *start, unsigned int *end) { - unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE; + unsigned int cluster_start = 0, cluster_end = PAGE_SIZE; - if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) { + if (unlikely(PAGE_SHIFT > osb->s_clustersize_bits)) { unsigned int cpp; - cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits); + cpp = 1 << (PAGE_SHIFT - osb->s_clustersize_bits); cluster_start = cpos % cpp; cluster_start = cluster_start << osb->s_clustersize_bits; @@ -681,7 +519,7 @@ static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, * * from == to == 0 is code for "zero the entire cluster region" */ -static void ocfs2_clear_page_regions(struct page *page, +static void ocfs2_clear_folio_regions(struct folio *folio, struct ocfs2_super *osb, u32 cpos, unsigned from, unsigned to) { @@ -690,7 +528,7 @@ static void ocfs2_clear_page_regions(struct page *page, ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end); - kaddr = kmap_atomic(page); + kaddr = kmap_local_folio(folio, 0); if (from || to) { if (from > cluster_start) @@ -701,20 +539,20 @@ static void ocfs2_clear_page_regions(struct page *page, memset(kaddr + cluster_start, 0, cluster_end - cluster_start); } - kunmap_atomic(kaddr); + kunmap_local(kaddr); } /* * Nonsparse file systems fully allocate before we get to the write * code. This prevents ocfs2_write() from tagging the write as an - * allocating one, which means ocfs2_map_page_blocks() might try to + * allocating one, which means ocfs2_map_folio_blocks() might try to * read-in the blocks at the tail of our file. Avoid reading them by * testing i_size against each block offset. */ -static int ocfs2_should_read_blk(struct inode *inode, struct page *page, +static int ocfs2_should_read_blk(struct inode *inode, struct folio *folio, unsigned int block_start) { - u64 offset = page_offset(page) + block_start; + u64 offset = folio_pos(folio) + block_start; if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) return 1; @@ -732,19 +570,19 @@ static int ocfs2_should_read_blk(struct inode *inode, struct page *page, * * This will also skip zeroing, which is handled externally. */ -int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, +int ocfs2_map_folio_blocks(struct folio *folio, u64 *p_blkno, struct inode *inode, unsigned int from, unsigned int to, int new) { int ret = 0; struct buffer_head *head, *bh, *wait[2], **wait_bh = wait; unsigned int block_end, block_start; - unsigned int bsize = 1 << inode->i_blkbits; + unsigned int bsize = i_blocksize(inode); - if (!page_has_buffers(page)) - create_empty_buffers(page, bsize, 0); + head = folio_buffers(folio); + if (!head) + head = create_empty_buffers(folio, bsize, 0); - head = page_buffers(page); for (bh = head, block_start = 0; bh != head || !block_start; bh = bh->b_this_page, block_start += bsize) { block_end = block_start + bsize; @@ -756,7 +594,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, * they may belong to unallocated clusters. */ if (block_start >= to || block_end <= from) { - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); continue; } @@ -770,17 +608,16 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, if (!buffer_mapped(bh)) { map_bh(bh, inode->i_sb, *p_blkno); - unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + clean_bdev_bh_alias(bh); } - if (PageUptodate(page)) { - if (!buffer_uptodate(bh)) - set_buffer_uptodate(bh); + if (folio_test_uptodate(folio)) { + set_buffer_uptodate(bh); } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_new(bh) && - ocfs2_should_read_blk(inode, page, block_start) && + ocfs2_should_read_blk(inode, folio, block_start) && (block_start < from || block_end > to)) { - ll_rw_block(READ, 1, &bh); + bh_read_nowait(bh, 0); *wait_bh++=bh; } @@ -812,7 +649,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, if (block_start >= to) break; - zero_user(page, block_start, bh->b_size); + folio_zero_range(folio, block_start, bh->b_size); set_buffer_uptodate(bh); mark_buffer_dirty(bh); @@ -824,13 +661,20 @@ next_bh: return ret; } -#if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE) +#if (PAGE_SIZE >= OCFS2_MAX_CLUSTERSIZE) #define OCFS2_MAX_CTXT_PAGES 1 #else -#define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE) +#define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_SIZE) #endif -#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE) +#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_SIZE / OCFS2_MIN_CLUSTERSIZE) + +struct ocfs2_unwritten_extent { + struct list_head ue_node; + struct list_head ue_ip_node; + u32 ue_cpos; + u32 ue_phys; +}; /* * Describe the state of a single cluster to be written to. @@ -843,7 +687,7 @@ struct ocfs2_write_cluster_desc { * filled. */ unsigned c_new; - unsigned c_unwritten; + unsigned c_clear_unwritten; unsigned c_needs_zero; }; @@ -855,6 +699,9 @@ struct ocfs2_write_ctxt { /* First cluster allocated in a nonsparse extend */ u32 w_first_new_cpos; + /* Type of caller. Must be one of buffer, mmap, direct. */ + ocfs2_write_type_t w_type; + struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE]; /* @@ -866,24 +713,24 @@ struct ocfs2_write_ctxt { unsigned int w_large_pages; /* - * Pages involved in this write. + * Folios involved in this write. * - * w_target_page is the page being written to by the user. + * w_target_folio is the folio being written to by the user. * - * w_pages is an array of pages which always contains - * w_target_page, and in the case of an allocating write with + * w_folios is an array of folios which always contains + * w_target_folio, and in the case of an allocating write with * page_size < cluster size, it will contain zero'd and mapped - * pages adjacent to w_target_page which need to be written + * pages adjacent to w_target_folio which need to be written * out in so that future reads from that region will get * zero's. */ - unsigned int w_num_pages; - struct page *w_pages[OCFS2_MAX_CTXT_PAGES]; - struct page *w_target_page; + unsigned int w_num_folios; + struct folio *w_folios[OCFS2_MAX_CTXT_PAGES]; + struct folio *w_target_folio; /* * w_target_locked is used for page_mkwrite path indicating no unlocking - * against w_target_page in ocfs2_write_end_nolock. + * against w_target_folio in ocfs2_write_end_nolock. */ unsigned int w_target_locked:1; @@ -903,50 +750,75 @@ struct ocfs2_write_ctxt { struct buffer_head *w_di_bh; struct ocfs2_cached_dealloc_ctxt w_dealloc; + + struct list_head w_unwritten_list; + unsigned int w_unwritten_count; }; -void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) +void ocfs2_unlock_and_free_folios(struct folio **folios, int num_folios) { int i; - for(i = 0; i < num_pages; i++) { - if (pages[i]) { - unlock_page(pages[i]); - mark_page_accessed(pages[i]); - page_cache_release(pages[i]); - } + for(i = 0; i < num_folios; i++) { + if (!folios[i]) + continue; + folio_unlock(folios[i]); + folio_mark_accessed(folios[i]); + folio_put(folios[i]); } } -static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) +static void ocfs2_unlock_folios(struct ocfs2_write_ctxt *wc) { int i; /* * w_target_locked is only set to true in the page_mkwrite() case. * The intent is to allow us to lock the target page from write_begin() - * to write_end(). The caller must hold a ref on w_target_page. + * to write_end(). The caller must hold a ref on w_target_folio. */ if (wc->w_target_locked) { - BUG_ON(!wc->w_target_page); - for (i = 0; i < wc->w_num_pages; i++) { - if (wc->w_target_page == wc->w_pages[i]) { - wc->w_pages[i] = NULL; + BUG_ON(!wc->w_target_folio); + for (i = 0; i < wc->w_num_folios; i++) { + if (wc->w_target_folio == wc->w_folios[i]) { + wc->w_folios[i] = NULL; break; } } - mark_page_accessed(wc->w_target_page); - page_cache_release(wc->w_target_page); + folio_mark_accessed(wc->w_target_folio); + folio_put(wc->w_target_folio); } - ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); + ocfs2_unlock_and_free_folios(wc->w_folios, wc->w_num_folios); +} + +static void ocfs2_free_unwritten_list(struct inode *inode, + struct list_head *head) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_unwritten_extent *ue = NULL, *tmp = NULL; + list_for_each_entry_safe(ue, tmp, head, ue_node) { + list_del(&ue->ue_node); + spin_lock(&oi->ip_lock); + list_del(&ue->ue_ip_node); + spin_unlock(&oi->ip_lock); + kfree(ue); + } +} + +static void ocfs2_free_write_ctxt(struct inode *inode, + struct ocfs2_write_ctxt *wc) +{ + ocfs2_free_unwritten_list(inode, &wc->w_unwritten_list); + ocfs2_unlock_folios(wc); brelse(wc->w_di_bh); kfree(wc); } static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, struct ocfs2_super *osb, loff_t pos, - unsigned len, struct buffer_head *di_bh) + unsigned len, ocfs2_write_type_t type, + struct buffer_head *di_bh) { u32 cend; struct ocfs2_write_ctxt *wc; @@ -961,13 +833,15 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, wc->w_clen = cend - wc->w_cpos + 1; get_bh(di_bh); wc->w_di_bh = di_bh; + wc->w_type = type; - if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) + if (unlikely(PAGE_SHIFT > osb->s_clustersize_bits)) wc->w_large_pages = 1; else wc->w_large_pages = 0; ocfs2_init_dealloc_ctxt(&wc->w_dealloc); + INIT_LIST_HEAD(&wc->w_unwritten_list); *wcp = wc; @@ -979,29 +853,30 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, * and dirty so they'll be written out (in order to prevent uninitialised * block data from leaking). And clear the new bit. */ -static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to) +static void ocfs2_zero_new_buffers(struct folio *folio, size_t from, size_t to) { unsigned int block_start, block_end; struct buffer_head *head, *bh; - BUG_ON(!PageLocked(page)); - if (!page_has_buffers(page)) + BUG_ON(!folio_test_locked(folio)); + head = folio_buffers(folio); + if (!head) return; - bh = head = page_buffers(page); + bh = head; block_start = 0; do { block_end = block_start + bh->b_size; if (buffer_new(bh)) { if (block_end > from && block_start < to) { - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { unsigned start, end; start = max(from, block_start); end = min(to, block_end); - zero_user_segment(page, start, end); + folio_zero_segment(folio, start, end); set_buffer_uptodate(bh); } @@ -1024,29 +899,28 @@ static void ocfs2_write_failure(struct inode *inode, loff_t user_pos, unsigned user_len) { int i; - unsigned from = user_pos & (PAGE_CACHE_SIZE - 1), + unsigned from = user_pos & (PAGE_SIZE - 1), to = user_pos + user_len; - struct page *tmppage; - ocfs2_zero_new_buffers(wc->w_target_page, from, to); + if (wc->w_target_folio) + ocfs2_zero_new_buffers(wc->w_target_folio, from, to); - for(i = 0; i < wc->w_num_pages; i++) { - tmppage = wc->w_pages[i]; + for (i = 0; i < wc->w_num_folios; i++) { + struct folio *folio = wc->w_folios[i]; - if (page_has_buffers(tmppage)) { + if (folio && folio_buffers(folio)) { if (ocfs2_should_order_data(inode)) - ocfs2_jbd2_file_inode(wc->w_handle, inode); + ocfs2_jbd2_inode_add_write(wc->w_handle, inode, + user_pos, user_len); - block_commit_write(tmppage, from, to); + block_commit_write(folio, from, to); } } } -static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, - struct ocfs2_write_ctxt *wc, - struct page *page, u32 cpos, - loff_t user_pos, unsigned user_len, - int new) +static int ocfs2_prepare_folio_for_write(struct inode *inode, u64 *p_blkno, + struct ocfs2_write_ctxt *wc, struct folio *folio, u32 cpos, + loff_t user_pos, unsigned user_len, int new) { int ret; unsigned int map_from = 0, map_to = 0; @@ -1059,20 +933,19 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, /* treat the write as new if the a hole/lseek spanned across * the page boundary. */ - new = new | ((i_size_read(inode) <= page_offset(page)) && - (page_offset(page) <= user_pos)); + new = new | ((i_size_read(inode) <= folio_pos(folio)) && + (folio_pos(folio) <= user_pos)); - if (page == wc->w_target_page) { - map_from = user_pos & (PAGE_CACHE_SIZE - 1); + if (folio == wc->w_target_folio) { + map_from = user_pos & (PAGE_SIZE - 1); map_to = map_from + user_len; if (new) - ret = ocfs2_map_page_blocks(page, p_blkno, inode, - cluster_start, cluster_end, - new); + ret = ocfs2_map_folio_blocks(folio, p_blkno, inode, + cluster_start, cluster_end, new); else - ret = ocfs2_map_page_blocks(page, p_blkno, inode, - map_from, map_to, new); + ret = ocfs2_map_folio_blocks(folio, p_blkno, inode, + map_from, map_to, new); if (ret) { mlog_errno(ret); goto out; @@ -1086,7 +959,7 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, } } else { /* - * If we haven't allocated the new page yet, we + * If we haven't allocated the new folio yet, we * shouldn't be writing it out without copying user * data. This is likely a math error from the caller. */ @@ -1095,8 +968,8 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, map_from = cluster_start; map_to = cluster_end; - ret = ocfs2_map_page_blocks(page, p_blkno, inode, - cluster_start, cluster_end, new); + ret = ocfs2_map_folio_blocks(folio, p_blkno, inode, + cluster_start, cluster_end, new); if (ret) { mlog_errno(ret); goto out; @@ -1104,20 +977,20 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, } /* - * Parts of newly allocated pages need to be zero'd. + * Parts of newly allocated folios need to be zero'd. * * Above, we have also rewritten 'to' and 'from' - as far as * the rest of the function is concerned, the entire cluster - * range inside of a page needs to be written. + * range inside of a folio needs to be written. * - * We can skip this if the page is up to date - it's already + * We can skip this if the folio is uptodate - it's already * been zero'd from being read in as a hole. */ - if (new && !PageUptodate(page)) - ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), + if (new && !folio_test_uptodate(folio)) + ocfs2_clear_folio_regions(folio, OCFS2_SB(inode->i_sb), cpos, user_data_from, user_data_to); - flush_dcache_page(page); + flush_dcache_folio(folio); out: return ret; @@ -1126,18 +999,16 @@ out: /* * This function will only grab one clusters worth of pages. */ -static int ocfs2_grab_pages_for_write(struct address_space *mapping, - struct ocfs2_write_ctxt *wc, - u32 cpos, loff_t user_pos, - unsigned user_len, int new, - struct page *mmap_page) +static int ocfs2_grab_folios_for_write(struct address_space *mapping, + struct ocfs2_write_ctxt *wc, u32 cpos, loff_t user_pos, + unsigned user_len, int new, struct folio *mmap_folio) { int ret = 0, i; unsigned long start, target_index, end_index, index; struct inode *inode = mapping->host; loff_t last_byte; - target_index = user_pos >> PAGE_CACHE_SHIFT; + target_index = user_pos >> PAGE_SHIFT; /* * Figure out how many pages we'll be manipulating here. For @@ -1147,7 +1018,7 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, * last page of the write. */ if (new) { - wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb); + wc->w_num_folios = ocfs2_pages_per_cluster(inode->i_sb); start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos); /* * We need the index *past* the last page we could possibly @@ -1156,49 +1027,58 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, */ last_byte = max(user_pos + user_len, i_size_read(inode)); BUG_ON(last_byte < 1); - end_index = ((last_byte - 1) >> PAGE_CACHE_SHIFT) + 1; - if ((start + wc->w_num_pages) > end_index) - wc->w_num_pages = end_index - start; + end_index = ((last_byte - 1) >> PAGE_SHIFT) + 1; + if ((start + wc->w_num_folios) > end_index) + wc->w_num_folios = end_index - start; } else { - wc->w_num_pages = 1; + wc->w_num_folios = 1; start = target_index; } + end_index = (user_pos + user_len - 1) >> PAGE_SHIFT; - for(i = 0; i < wc->w_num_pages; i++) { + for(i = 0; i < wc->w_num_folios; i++) { index = start + i; - if (index == target_index && mmap_page) { + if (index >= target_index && index <= end_index && + wc->w_type == OCFS2_WRITE_MMAP) { /* * ocfs2_pagemkwrite() is a little different * and wants us to directly use the page * passed in. */ - lock_page(mmap_page); + folio_lock(mmap_folio); /* Exit and let the caller retry */ - if (mmap_page->mapping != mapping) { - WARN_ON(mmap_page->mapping); - unlock_page(mmap_page); + if (mmap_folio->mapping != mapping) { + WARN_ON(mmap_folio->mapping); + folio_unlock(mmap_folio); ret = -EAGAIN; goto out; } - page_cache_get(mmap_page); - wc->w_pages[i] = mmap_page; + folio_get(mmap_folio); + wc->w_folios[i] = mmap_folio; wc->w_target_locked = true; + } else if (index >= target_index && index <= end_index && + wc->w_type == OCFS2_WRITE_DIRECT) { + /* Direct write has no mapping page. */ + wc->w_folios[i] = NULL; + continue; } else { - wc->w_pages[i] = find_or_create_page(mapping, index, - GFP_NOFS); - if (!wc->w_pages[i]) { - ret = -ENOMEM; + wc->w_folios[i] = __filemap_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + GFP_NOFS); + if (IS_ERR(wc->w_folios[i])) { + ret = PTR_ERR(wc->w_folios[i]); mlog_errno(ret); + wc->w_folios[i] = NULL; goto out; } } - wait_for_stable_page(wc->w_pages[i]); + folio_wait_stable(wc->w_folios[i]); if (index == target_index) - wc->w_target_page = wc->w_pages[i]; + wc->w_target_folio = wc->w_folios[i]; } out: if (ret) @@ -1210,19 +1090,20 @@ out: * Prepare a single cluster for write one cluster into the file. */ static int ocfs2_write_cluster(struct address_space *mapping, - u32 phys, unsigned int unwritten, + u32 *phys, unsigned int new, + unsigned int clear_unwritten, unsigned int should_zero, struct ocfs2_alloc_context *data_ac, struct ocfs2_alloc_context *meta_ac, struct ocfs2_write_ctxt *wc, u32 cpos, loff_t user_pos, unsigned user_len) { - int ret, i, new; - u64 v_blkno, p_blkno; + int ret, i; + u64 p_blkno; struct inode *inode = mapping->host; struct ocfs2_extent_tree et; + int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); - new = phys == 0 ? 1 : 0; if (new) { u32 tmp_pos; @@ -1232,9 +1113,9 @@ static int ocfs2_write_cluster(struct address_space *mapping, */ tmp_pos = cpos; ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode, - &tmp_pos, 1, 0, wc->w_di_bh, - wc->w_handle, data_ac, - meta_ac, NULL); + &tmp_pos, 1, !clear_unwritten, + wc->w_di_bh, wc->w_handle, + data_ac, meta_ac, NULL); /* * This shouldn't happen because we must have already * calculated the correct meta data allocation required. The @@ -1251,11 +1132,11 @@ static int ocfs2_write_cluster(struct address_space *mapping, mlog_errno(ret); goto out; } - } else if (unwritten) { + } else if (clear_unwritten) { ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), wc->w_di_bh); ret = ocfs2_mark_extent_written(inode, &et, - wc->w_handle, cpos, 1, phys, + wc->w_handle, cpos, 1, *phys, meta_ac, &wc->w_dealloc); if (ret < 0) { mlog_errno(ret); @@ -1263,34 +1144,36 @@ static int ocfs2_write_cluster(struct address_space *mapping, } } - if (should_zero) - v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos); - else - v_blkno = user_pos >> inode->i_sb->s_blocksize_bits; - /* * The only reason this should fail is due to an inability to * find the extent added. */ - ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, - NULL); + ret = ocfs2_get_clusters(inode, cpos, phys, NULL, NULL); if (ret < 0) { - ocfs2_error(inode->i_sb, "Corrupting extend for inode %llu, " - "at logical block %llu", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - (unsigned long long)v_blkno); + mlog(ML_ERROR, "Get physical blkno failed for inode %llu, " + "at logical cluster %u", + (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos); goto out; } - BUG_ON(p_blkno == 0); + BUG_ON(*phys == 0); + + p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *phys); + if (!should_zero) + p_blkno += (user_pos >> inode->i_sb->s_blocksize_bits) & (u64)(bpc - 1); - for(i = 0; i < wc->w_num_pages; i++) { + for (i = 0; i < wc->w_num_folios; i++) { int tmpret; - tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc, - wc->w_pages[i], cpos, - user_pos, user_len, - should_zero); + /* This is the direct io target page. */ + if (wc->w_folios[i] == NULL) { + p_blkno += (1 << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits)); + continue; + } + + tmpret = ocfs2_prepare_folio_for_write(inode, &p_blkno, wc, + wc->w_folios[i], cpos, user_pos, user_len, + should_zero); if (tmpret) { mlog_errno(tmpret); if (ret == 0) @@ -1333,8 +1216,9 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping, if ((cluster_off + local_len) > osb->s_clustersize) local_len = osb->s_clustersize - cluster_off; - ret = ocfs2_write_cluster(mapping, desc->c_phys, - desc->c_unwritten, + ret = ocfs2_write_cluster(mapping, &desc->c_phys, + desc->c_new, + desc->c_clear_unwritten, desc->c_needs_zero, data_ac, meta_ac, wc, desc->c_cpos, pos, local_len); @@ -1363,7 +1247,7 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, { struct ocfs2_write_cluster_desc *desc; - wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1); + wc->w_target_from = pos & (PAGE_SIZE - 1); wc->w_target_to = wc->w_target_from + len; if (alloc == 0) @@ -1400,11 +1284,71 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, &wc->w_target_to); } else { wc->w_target_from = 0; - wc->w_target_to = PAGE_CACHE_SIZE; + wc->w_target_to = PAGE_SIZE; } } /* + * Check if this extent is marked UNWRITTEN by direct io. If so, we need not to + * do the zero work. And should not to clear UNWRITTEN since it will be cleared + * by the direct io procedure. + * If this is a new extent that allocated by direct io, we should mark it in + * the ip_unwritten_list. + */ +static int ocfs2_unwritten_check(struct inode *inode, + struct ocfs2_write_ctxt *wc, + struct ocfs2_write_cluster_desc *desc) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_unwritten_extent *ue = NULL, *new = NULL; + int ret = 0; + + if (!desc->c_needs_zero) + return 0; + +retry: + spin_lock(&oi->ip_lock); + /* Needs not to zero no metter buffer or direct. The one who is zero + * the cluster is doing zero. And he will clear unwritten after all + * cluster io finished. */ + list_for_each_entry(ue, &oi->ip_unwritten_list, ue_ip_node) { + if (desc->c_cpos == ue->ue_cpos) { + BUG_ON(desc->c_new); + desc->c_needs_zero = 0; + desc->c_clear_unwritten = 0; + goto unlock; + } + } + + if (wc->w_type != OCFS2_WRITE_DIRECT) + goto unlock; + + if (new == NULL) { + spin_unlock(&oi->ip_lock); + new = kmalloc(sizeof(struct ocfs2_unwritten_extent), + GFP_NOFS); + if (new == NULL) { + ret = -ENOMEM; + goto out; + } + goto retry; + } + /* This direct write will doing zero. */ + new->ue_cpos = desc->c_cpos; + new->ue_phys = desc->c_phys; + desc->c_clear_unwritten = 0; + list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list); + list_add_tail(&new->ue_node, &wc->w_unwritten_list); + wc->w_unwritten_count++; + new = NULL; +unlock: + spin_unlock(&oi->ip_lock); +out: + kfree(new); + return ret; +} + +/* * Populate each single-cluster write descriptor in the write context * with information about the i/o to be done. * @@ -1479,14 +1423,21 @@ static int ocfs2_populate_write_desc(struct inode *inode, if (phys == 0) { desc->c_new = 1; desc->c_needs_zero = 1; + desc->c_clear_unwritten = 1; *clusters_to_alloc = *clusters_to_alloc + 1; } if (ext_flags & OCFS2_EXT_UNWRITTEN) { - desc->c_unwritten = 1; + desc->c_clear_unwritten = 1; desc->c_needs_zero = 1; } + ret = ocfs2_unwritten_check(inode, wc, desc); + if (ret) { + mlog_errno(ret); + goto out; + } + num_clusters--; } @@ -1501,29 +1452,32 @@ static int ocfs2_write_begin_inline(struct address_space *mapping, { int ret; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct page *page; + struct folio *folio; handle_t *handle; struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; - page = find_or_create_page(mapping, 0, GFP_NOFS); - if (!page) { - ret = -ENOMEM; + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); mlog_errno(ret); goto out; } - /* - * If we don't set w_num_pages then this page won't get unlocked - * and freed on cleanup of the write context. - */ - wc->w_pages[0] = wc->w_target_page = page; - wc->w_num_pages = 1; - handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); + folio = __filemap_get_folio(mapping, 0, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_NOFS); + if (IS_ERR(folio)) { + ocfs2_commit_trans(osb, handle); + ret = PTR_ERR(folio); mlog_errno(ret); goto out; } + /* + * If we don't set w_num_folios then this folio won't get unlocked + * and freed on cleanup of the write context. + */ + wc->w_target_folio = folio; + wc->w_folios[0] = folio; + wc->w_num_folios = 1; ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE); @@ -1537,8 +1491,8 @@ static int ocfs2_write_begin_inline(struct address_space *mapping, if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) ocfs2_set_inode_data_inline(inode, di); - if (!PageUptodate(page)) { - ret = ocfs2_read_inline_data(inode, page, wc->w_di_bh); + if (!folio_test_uptodate(folio)) { + ret = ocfs2_read_inline_data(inode, folio, wc->w_di_bh); if (ret) { ocfs2_commit_trans(osb, handle); @@ -1561,9 +1515,8 @@ int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size) } static int ocfs2_try_to_write_inline_data(struct address_space *mapping, - struct inode *inode, loff_t pos, - unsigned len, struct page *mmap_page, - struct ocfs2_write_ctxt *wc) + struct inode *inode, loff_t pos, size_t len, + struct folio *mmap_folio, struct ocfs2_write_ctxt *wc) { int ret, written = 0; loff_t end = pos + len; @@ -1578,7 +1531,7 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping, * Handle inodes which already have inline data 1st. */ if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { - if (mmap_page == NULL && + if (mmap_folio == NULL && ocfs2_size_fits_inline_data(wc->w_di_bh, end)) goto do_inline_write; @@ -1602,7 +1555,7 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping, * Check whether the write can fit. */ di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; - if (mmap_page || + if (mmap_folio || end > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) return 0; @@ -1648,8 +1601,10 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, if (ret) mlog_errno(ret); - wc->w_first_new_cpos = - ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)); + /* There is no wc if this is call from direct. */ + if (wc) + wc->w_first_new_cpos = + ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)); return ret; } @@ -1666,48 +1621,10 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh, return ret; } -/* - * Try to flush truncate logs if we can free enough clusters from it. - * As for return value, "< 0" means error, "0" no space and "1" means - * we have freed enough spaces and let the caller try to allocate again. - */ -static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb, - unsigned int needed) -{ - tid_t target; - int ret = 0; - unsigned int truncated_clusters; - - mutex_lock(&osb->osb_tl_inode->i_mutex); - truncated_clusters = osb->truncated_clusters; - mutex_unlock(&osb->osb_tl_inode->i_mutex); - - /* - * Check whether we can succeed in allocating if we free - * the truncate log. - */ - if (truncated_clusters < needed) - goto out; - - ret = ocfs2_flush_truncate_log(osb); - if (ret) { - mlog_errno(ret); - goto out; - } - - if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) { - jbd2_log_wait_commit(osb->journal->j_journal, target); - ret = 1; - } -out: - return ret; -} - -int ocfs2_write_begin_nolock(struct file *filp, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata, - struct buffer_head *di_bh, struct page *mmap_page) +int ocfs2_write_begin_nolock(struct address_space *mapping, + loff_t pos, unsigned len, ocfs2_write_type_t type, + struct folio **foliop, void **fsdata, + struct buffer_head *di_bh, struct folio *mmap_folio) { int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS; unsigned int clusters_to_alloc, extents_to_split, clusters_need = 0; @@ -1722,7 +1639,7 @@ int ocfs2_write_begin_nolock(struct file *filp, int try_free = 1, ret1; try_again: - ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); + ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, type, di_bh); if (ret) { mlog_errno(ret); return ret; @@ -1730,7 +1647,7 @@ try_again: if (ocfs2_supports_inline_data(osb)) { ret = ocfs2_try_to_write_inline_data(mapping, inode, pos, len, - mmap_page, wc); + mmap_folio, wc); if (ret == 1) { ret = 0; goto success; @@ -1741,14 +1658,17 @@ try_again: } } - if (ocfs2_sparse_alloc(osb)) - ret = ocfs2_zero_tail(inode, di_bh, pos); - else - ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len, - wc); - if (ret) { - mlog_errno(ret); - goto out; + /* Direct io change i_size late, should not zero tail here. */ + if (type != OCFS2_WRITE_DIRECT) { + if (ocfs2_sparse_alloc(osb)) + ret = ocfs2_zero_tail(inode, di_bh, pos); + else + ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, + len, wc); + if (ret) { + mlog_errno(ret); + goto out; + } } ret = ocfs2_check_range_for_refcount(inode, pos, len); @@ -1757,7 +1677,7 @@ try_again: goto out; } else if (ret == 1) { clusters_need = wc->w_clen; - ret = ocfs2_refcount_cow(inode, filp, di_bh, + ret = ocfs2_refcount_cow(inode, di_bh, wc->w_cpos, wc->w_clen, UINT_MAX); if (ret) { mlog_errno(ret); @@ -1779,7 +1699,7 @@ try_again: (unsigned long long)OCFS2_I(inode)->ip_blkno, (long long)i_size_read(inode), le32_to_cpu(di->i_clusters), - pos, len, flags, mmap_page, + pos, len, type, mmap_folio, clusters_to_alloc, extents_to_split); /* @@ -1808,19 +1728,18 @@ try_again: data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; credits = ocfs2_calc_extend_credits(inode->i_sb, - &di->id2.i_list, - clusters_to_alloc); - - } + &di->id2.i_list); + } else if (type == OCFS2_WRITE_DIRECT) + /* direct write needs not to start trans if no extents alloc. */ + goto success; /* * We have to zero sparse allocated clusters, unwritten extent clusters, * and non-sparse clusters we just extended. For non-sparse writes, * we know zeros will only be needed in the first and/or last cluster. */ - if (clusters_to_alloc || extents_to_split || - (wc->w_clen && (wc->w_desc[0].c_needs_zero || - wc->w_desc[wc->w_clen - 1].c_needs_zero))) + if (wc->w_clen && (wc->w_desc[0].c_needs_zero || + wc->w_desc[wc->w_clen - 1].c_needs_zero)) cluster_of_pages = 1; else cluster_of_pages = 0; @@ -1842,10 +1761,7 @@ try_again: if (ret) goto out_commit; } - /* - * We don't want this to fail in ocfs2_write_end(), so do it - * here. - */ + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { @@ -1854,26 +1770,26 @@ try_again: } /* - * Fill our page array first. That way we've grabbed enough so + * Fill our folio array first. That way we've grabbed enough so * that we can zero and flush if we error after adding the * extent. */ - ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len, - cluster_of_pages, mmap_page); - if (ret && ret != -EAGAIN) { - mlog_errno(ret); - goto out_quota; - } + ret = ocfs2_grab_folios_for_write(mapping, wc, wc->w_cpos, pos, len, + cluster_of_pages, mmap_folio); + if (ret) { + /* + * ocfs2_grab_folios_for_write() returns -EAGAIN if it + * could not lock the target folio. In this case, we exit + * with no error and no target folio. This will trigger + * the caller, page_mkwrite(), to re-try the operation. + */ + if (type == OCFS2_WRITE_MMAP && ret == -EAGAIN) { + BUG_ON(wc->w_target_folio); + ret = 0; + goto out_quota; + } - /* - * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock - * the target page. In this case, we exit with no error and no target - * page. This will trigger the caller, page_mkwrite(), to re-try - * the operation. - */ - if (ret == -EAGAIN) { - BUG_ON(wc->w_target_page); - ret = 0; + mlog_errno(ret); goto out_quota; } @@ -1890,7 +1806,8 @@ try_again: ocfs2_free_alloc_context(meta_ac); success: - *pagep = wc->w_target_page; + if (foliop) + *foliop = wc->w_target_folio; *fsdata = wc; return 0; out_quota: @@ -1901,12 +1818,26 @@ out_commit: ocfs2_commit_trans(osb, handle); out: - ocfs2_free_write_ctxt(wc); + /* + * The mmapped page won't be unlocked in ocfs2_free_write_ctxt(), + * even in case of error here like ENOSPC and ENOMEM. So, we need + * to unlock the target page manually to prevent deadlocks when + * retrying again on ENOSPC, or when returning non-VM_FAULT_LOCKED + * to VM code. + */ + if (wc->w_target_locked) + folio_unlock(mmap_folio); - if (data_ac) + ocfs2_free_write_ctxt(inode, wc); + + if (data_ac) { ocfs2_free_alloc_context(data_ac); - if (meta_ac) + data_ac = NULL; + } + if (meta_ac) { ocfs2_free_alloc_context(meta_ac); + meta_ac = NULL; + } if (ret == -ENOSPC && try_free) { /* @@ -1926,9 +1857,10 @@ out: return ret; } -static int ocfs2_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) +static int ocfs2_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { int ret; struct buffer_head *di_bh = NULL; @@ -1943,14 +1875,14 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping, /* * Take alloc sem here to prevent concurrent lookups. That way * the mapping, zeroing and tree manipulation within - * ocfs2_write() will be safe against ->readpage(). This + * ocfs2_write() will be safe against ->read_folio(). This * should also serve to lock out allocation from a shared * writeable region. */ down_write(&OCFS2_I(inode)->ip_alloc_sem); - ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep, - fsdata, di_bh, NULL); + ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_BUFFER, + foliop, fsdata, di_bh, NULL); if (ret) { mlog_errno(ret); goto out_fail; @@ -1974,18 +1906,15 @@ static void ocfs2_write_end_inline(struct inode *inode, loff_t pos, struct ocfs2_dinode *di, struct ocfs2_write_ctxt *wc) { - void *kaddr; - if (unlikely(*copied < len)) { - if (!PageUptodate(wc->w_target_page)) { + if (!folio_test_uptodate(wc->w_target_folio)) { *copied = 0; return; } } - kaddr = kmap_atomic(wc->w_target_page); - memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied); - kunmap_atomic(kaddr); + memcpy_from_folio(di->id2.i_data.id_data + pos, wc->w_target_folio, + pos, *copied); trace_ocfs2_write_end_inline( (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -1994,42 +1923,72 @@ static void ocfs2_write_end_inline(struct inode *inode, loff_t pos, le16_to_cpu(di->i_dyn_features)); } -int ocfs2_write_end_nolock(struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) +int ocfs2_write_end_nolock(struct address_space *mapping, loff_t pos, + unsigned len, unsigned copied, void *fsdata) { - int i; - unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1); + int i, ret; + size_t from, to, start = pos & (PAGE_SIZE - 1); struct inode *inode = mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_write_ctxt *wc = fsdata; struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; handle_t *handle = wc->w_handle; - struct page *tmppage; + + BUG_ON(!list_empty(&wc->w_unwritten_list)); + + if (handle) { + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), + wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + copied = ret; + mlog_errno(ret); + goto out; + } + } if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { ocfs2_write_end_inline(inode, pos, len, &copied, di, wc); goto out_write_size; } - if (unlikely(copied < len)) { - if (!PageUptodate(wc->w_target_page)) + if (unlikely(copied < len) && wc->w_target_folio) { + loff_t new_isize; + + if (!folio_test_uptodate(wc->w_target_folio)) copied = 0; - ocfs2_zero_new_buffers(wc->w_target_page, start+copied, - start+len); + new_isize = max_t(loff_t, i_size_read(inode), pos + copied); + if (new_isize > folio_pos(wc->w_target_folio)) + ocfs2_zero_new_buffers(wc->w_target_folio, start+copied, + start+len); + else { + /* + * When folio is fully beyond new isize (data copy + * failed), do not bother zeroing the folio. Invalidate + * it instead so that writeback does not get confused + * put page & buffer dirty bits into inconsistent + * state. + */ + block_invalidate_folio(wc->w_target_folio, 0, + folio_size(wc->w_target_folio)); + } } - flush_dcache_page(wc->w_target_page); + if (wc->w_target_folio) + flush_dcache_folio(wc->w_target_folio); - for(i = 0; i < wc->w_num_pages; i++) { - tmppage = wc->w_pages[i]; + for (i = 0; i < wc->w_num_folios; i++) { + struct folio *folio = wc->w_folios[i]; - if (tmppage == wc->w_target_page) { + /* This is the direct io target folio */ + if (folio == NULL) + continue; + + if (folio == wc->w_target_folio) { from = wc->w_target_from; to = wc->w_target_to; - BUG_ON(from > PAGE_CACHE_SIZE || - to > PAGE_CACHE_SIZE || + BUG_ON(from > folio_size(folio) || + to > folio_size(folio) || to < from); } else { /* @@ -2038,46 +1997,67 @@ int ocfs2_write_end_nolock(struct address_space *mapping, * to flush their entire range. */ from = 0; - to = PAGE_CACHE_SIZE; + to = folio_size(folio); } - if (page_has_buffers(tmppage)) { - if (ocfs2_should_order_data(inode)) - ocfs2_jbd2_file_inode(wc->w_handle, inode); - block_commit_write(tmppage, from, to); + if (folio_buffers(folio)) { + if (handle && ocfs2_should_order_data(inode)) { + loff_t start_byte = folio_pos(folio) + from; + loff_t length = to - from; + ocfs2_jbd2_inode_add_write(handle, inode, + start_byte, length); + } + block_commit_write(folio, from, to); } } out_write_size: - pos += copied; - if (pos > inode->i_size) { - i_size_write(inode, pos); - mark_inode_dirty(inode); - } - inode->i_blocks = ocfs2_inode_sector_count(inode); - di->i_size = cpu_to_le64((u64)i_size_read(inode)); - inode->i_mtime = inode->i_ctime = CURRENT_TIME; - di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); - di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); - ocfs2_journal_dirty(handle, wc->w_di_bh); + /* Direct io do not update i_size here. */ + if (wc->w_type != OCFS2_WRITE_DIRECT) { + pos += copied; + if (pos > i_size_read(inode)) { + i_size_write(inode, pos); + mark_inode_dirty(inode); + } + inode->i_blocks = ocfs2_inode_sector_count(inode); + di->i_size = cpu_to_le64((u64)i_size_read(inode)); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + di->i_mtime = di->i_ctime = cpu_to_le64(inode_get_mtime_sec(inode)); + di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode)); + if (handle) + ocfs2_update_inode_fsync_trans(handle, inode, 1); + } + if (handle) + ocfs2_journal_dirty(handle, wc->w_di_bh); - ocfs2_commit_trans(osb, handle); +out: + /* unlock pages before dealloc since it needs acquiring j_trans_barrier + * lock, or it will cause a deadlock since journal commit threads holds + * this lock and will ask for the page lock when flushing the data. + * put it here to preserve the unlock order. + */ + ocfs2_unlock_folios(wc); + + if (handle) + ocfs2_commit_trans(osb, handle); ocfs2_run_deallocs(osb, &wc->w_dealloc); - ocfs2_free_write_ctxt(wc); + brelse(wc->w_di_bh); + kfree(wc); return copied; } -static int ocfs2_write_end(struct file *file, struct address_space *mapping, +static int ocfs2_write_end(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { int ret; struct inode *inode = mapping->host; - ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata); + ret = ocfs2_write_end_nolock(mapping, pos, len, copied, fsdata); up_write(&OCFS2_I(inode)->ip_alloc_sem); ocfs2_inode_unlock(inode, 1); @@ -2085,17 +2065,396 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping, return ret; } +struct ocfs2_dio_write_ctxt { + struct list_head dw_zero_list; + unsigned dw_zero_count; + int dw_orphaned; + pid_t dw_writer_pid; +}; + +static struct ocfs2_dio_write_ctxt * +ocfs2_dio_alloc_write_ctx(struct buffer_head *bh, int *alloc) +{ + struct ocfs2_dio_write_ctxt *dwc = NULL; + + if (bh->b_private) + return bh->b_private; + + dwc = kmalloc(sizeof(struct ocfs2_dio_write_ctxt), GFP_NOFS); + if (dwc == NULL) + return NULL; + INIT_LIST_HEAD(&dwc->dw_zero_list); + dwc->dw_zero_count = 0; + dwc->dw_orphaned = 0; + dwc->dw_writer_pid = task_pid_nr(current); + bh->b_private = dwc; + *alloc = 1; + + return dwc; +} + +static void ocfs2_dio_free_write_ctx(struct inode *inode, + struct ocfs2_dio_write_ctxt *dwc) +{ + ocfs2_free_unwritten_list(inode, &dwc->dw_zero_list); + kfree(dwc); +} + +/* + * TODO: Make this into a generic get_blocks function. + * + * From do_direct_io in direct-io.c: + * "So what we do is to permit the ->get_blocks function to populate + * bh.b_size with the size of IO which is permitted at this offset and + * this i_blkbits." + * + * This function is called directly from get_more_blocks in direct-io.c. + * + * called like this: dio->get_blocks(dio->inode, fs_startblk, + * fs_count, map_bh, dio->rw == WRITE); + */ +static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_write_ctxt *wc; + struct ocfs2_write_cluster_desc *desc = NULL; + struct ocfs2_dio_write_ctxt *dwc = NULL; + struct buffer_head *di_bh = NULL; + u64 p_blkno; + unsigned int i_blkbits = inode->i_sb->s_blocksize_bits; + loff_t pos = iblock << i_blkbits; + sector_t endblk = (i_size_read(inode) - 1) >> i_blkbits; + unsigned len, total_len = bh_result->b_size; + int ret = 0, first_get_block = 0; + + len = osb->s_clustersize - (pos & (osb->s_clustersize - 1)); + len = min(total_len, len); + + /* + * bh_result->b_size is count in get_more_blocks according to write + * "pos" and "end", we need map twice to return different buffer state: + * 1. area in file size, not set NEW; + * 2. area out file size, set NEW. + * + * iblock endblk + * |--------|---------|---------|--------- + * |<-------area in file------->| + */ + + if ((iblock <= endblk) && + ((iblock + ((len - 1) >> i_blkbits)) > endblk)) + len = (endblk - iblock + 1) << i_blkbits; + + mlog(0, "get block of %lu at %llu:%u req %u\n", + inode->i_ino, pos, len, total_len); + + /* + * Because we need to change file size in ocfs2_dio_end_io_write(), or + * we may need to add it to orphan dir. So can not fall to fast path + * while file size will be changed. + */ + if (pos + total_len <= i_size_read(inode)) { + + /* This is the fast path for re-write. */ + ret = ocfs2_lock_get_block(inode, iblock, bh_result, create); + if (buffer_mapped(bh_result) && + !buffer_new(bh_result) && + ret == 0) + goto out; + + /* Clear state set by ocfs2_get_block. */ + bh_result->b_state = 0; + } + + dwc = ocfs2_dio_alloc_write_ctx(bh_result, &first_get_block); + if (unlikely(dwc == NULL)) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + if (ocfs2_clusters_for_bytes(inode->i_sb, pos + total_len) > + ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)) && + !dwc->dw_orphaned) { + /* + * when we are going to alloc extents beyond file size, add the + * inode to orphan dir, so we can recall those spaces when + * system crashed during write. + */ + ret = ocfs2_add_inode_to_orphan(osb, inode); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + dwc->dw_orphaned = 1; + } + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + down_write(&oi->ip_alloc_sem); + + if (first_get_block) { + if (ocfs2_sparse_alloc(osb)) + ret = ocfs2_zero_tail(inode, di_bh, pos); + else + ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, + total_len, NULL); + if (ret < 0) { + mlog_errno(ret); + goto unlock; + } + } + + ret = ocfs2_write_begin_nolock(inode->i_mapping, pos, len, + OCFS2_WRITE_DIRECT, NULL, + (void **)&wc, di_bh, NULL); + if (ret) { + mlog_errno(ret); + goto unlock; + } + + desc = &wc->w_desc[0]; + + p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, desc->c_phys); + BUG_ON(p_blkno == 0); + p_blkno += iblock & (u64)(ocfs2_clusters_to_blocks(inode->i_sb, 1) - 1); + + map_bh(bh_result, inode->i_sb, p_blkno); + bh_result->b_size = len; + if (desc->c_needs_zero) + set_buffer_new(bh_result); + + if (iblock > endblk) + set_buffer_new(bh_result); + + /* May sleep in end_io. It should not happen in a irq context. So defer + * it to dio work queue. */ + set_buffer_defer_completion(bh_result); + + if (!list_empty(&wc->w_unwritten_list)) { + struct ocfs2_unwritten_extent *ue = NULL; + + ue = list_first_entry(&wc->w_unwritten_list, + struct ocfs2_unwritten_extent, + ue_node); + BUG_ON(ue->ue_cpos != desc->c_cpos); + /* The physical address may be 0, fill it. */ + ue->ue_phys = desc->c_phys; + + list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list); + dwc->dw_zero_count += wc->w_unwritten_count; + } + + ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, wc); + BUG_ON(ret != len); + ret = 0; +unlock: + up_write(&oi->ip_alloc_sem); + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); +out: + return ret; +} + +static int ocfs2_dio_end_io_write(struct inode *inode, + struct ocfs2_dio_write_ctxt *dwc, + loff_t offset, + ssize_t bytes) +{ + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_extent_tree et; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_unwritten_extent *ue = NULL; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + handle_t *handle = NULL; + loff_t end = offset + bytes; + int ret = 0, credits = 0; + + ocfs2_init_dealloc_ctxt(&dealloc); + + /* We do clear unwritten, delete orphan, change i_size here. If neither + * of these happen, we can skip all this. */ + if (list_empty(&dwc->dw_zero_list) && + end <= i_size_read(inode) && + !dwc->dw_orphaned) + goto out; + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + down_write(&oi->ip_alloc_sem); + + /* Delete orphan before acquire i_rwsem. */ + if (dwc->dw_orphaned) { + BUG_ON(dwc->dw_writer_pid != task_pid_nr(current)); + + end = end > i_size_read(inode) ? end : 0; + + ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, + !!end, end); + if (ret < 0) + mlog_errno(ret); + } + + di = (struct ocfs2_dinode *)di_bh->b_data; + + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); + + /* Attach dealloc with extent tree in case that we may reuse extents + * which are already unlinked from current extent tree due to extent + * rotation and merging. + */ + et.et_dealloc = &dealloc; + + ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2, + &data_ac, &meta_ac); + if (ret) { + mlog_errno(ret); + goto unlock; + } + + credits = ocfs2_calc_extend_credits(inode->i_sb, &di->id2.i_list); + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto unlock; + } + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto commit; + } + + list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) { + ret = ocfs2_assure_trans_credits(handle, credits); + if (ret < 0) { + mlog_errno(ret); + break; + } + ret = ocfs2_mark_extent_written(inode, &et, handle, + ue->ue_cpos, 1, + ue->ue_phys, + meta_ac, &dealloc); + if (ret < 0) { + mlog_errno(ret); + break; + } + } + + if (end > i_size_read(inode)) { + ret = ocfs2_set_inode_size(handle, inode, di_bh, end); + if (ret < 0) + mlog_errno(ret); + } +commit: + ocfs2_commit_trans(osb, handle); +unlock: + up_write(&oi->ip_alloc_sem); + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); +out: + if (data_ac) + ocfs2_free_alloc_context(data_ac); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + ocfs2_run_deallocs(osb, &dealloc); + ocfs2_dio_free_write_ctx(inode, dwc); + + return ret; +} + +/* + * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're + * particularly interested in the aio/dio case. We use the rw_lock DLM lock + * to protect io on one node from truncation on another. + */ +static int ocfs2_dio_end_io(struct kiocb *iocb, + loff_t offset, + ssize_t bytes, + void *private) +{ + struct inode *inode = file_inode(iocb->ki_filp); + int level; + int ret = 0; + + /* this io's submitter should not have unlocked this before we could */ + BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); + + if (bytes <= 0) + mlog_ratelimited(ML_ERROR, "Direct IO failed, bytes = %lld", + (long long)bytes); + if (private) { + if (bytes > 0) + ret = ocfs2_dio_end_io_write(inode, private, offset, + bytes); + else + ocfs2_dio_free_write_ctx(inode, private); + } + + ocfs2_iocb_clear_rw_locked(iocb); + + level = ocfs2_iocb_rw_locked_level(iocb); + ocfs2_rw_unlock(inode, level); + return ret; +} + +static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + get_block_t *get_block; + + /* + * Fallback to buffered I/O if we see an inode without + * extents. + */ + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + return 0; + + /* Fallback to buffered I/O if we do not support append dio. */ + if (iocb->ki_pos + iter->count > i_size_read(inode) && + !ocfs2_supports_append_dio(osb)) + return 0; + + if (iov_iter_rw(iter) == READ) + get_block = ocfs2_lock_get_block; + else + get_block = ocfs2_dio_wr_get_block; + + return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, + iter, get_block, + ocfs2_dio_end_io, 0); +} + const struct address_space_operations ocfs2_aops = { - .readpage = ocfs2_readpage, - .readpages = ocfs2_readpages, - .writepage = ocfs2_writepage, + .dirty_folio = block_dirty_folio, + .read_folio = ocfs2_read_folio, + .readahead = ocfs2_readahead, + .writepages = ocfs2_writepages, .write_begin = ocfs2_write_begin, .write_end = ocfs2_write_end, .bmap = ocfs2_bmap, .direct_IO = ocfs2_direct_IO, - .invalidatepage = ocfs2_invalidatepage, - .releasepage = ocfs2_releasepage, - .migratepage = buffer_migrate_page, + .invalidate_folio = block_invalidate_folio, + .release_folio = ocfs2_release_folio, + .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, }; diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index f671e49beb34..114efc9111e4 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -1,39 +1,18 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #ifndef OCFS2_AOPS_H #define OCFS2_AOPS_H -#include <linux/aio.h> - -handle_t *ocfs2_start_walk_page_trans(struct inode *inode, - struct page *page, - unsigned from, - unsigned to); +#include <linux/fs.h> -int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, +int ocfs2_map_folio_blocks(struct folio *folio, u64 *p_blkno, struct inode *inode, unsigned int from, unsigned int to, int new); -void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages); +void ocfs2_unlock_and_free_folios(struct folio **folios, int num_folios); int walk_page_buffers( handle_t *handle, struct buffer_head *head, @@ -44,16 +23,20 @@ int walk_page_buffers( handle_t *handle, struct buffer_head *bh)); int ocfs2_write_end_nolock(struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata); + loff_t pos, unsigned len, unsigned copied, void *fsdata); + +typedef enum { + OCFS2_WRITE_BUFFER = 0, + OCFS2_WRITE_DIRECT, + OCFS2_WRITE_MMAP, +} ocfs2_write_type_t; -int ocfs2_write_begin_nolock(struct file *filp, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata, - struct buffer_head *di_bh, struct page *mmap_page); +int ocfs2_write_begin_nolock(struct address_space *mapping, + loff_t pos, unsigned len, ocfs2_write_type_t type, + struct folio **foliop, void **fsdata, + struct buffer_head *di_bh, struct folio *mmap_folio); -int ocfs2_read_inline_data(struct inode *inode, struct page *page, +int ocfs2_read_inline_data(struct inode *inode, struct folio *folio, struct buffer_head *di_bh); int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size); @@ -74,37 +57,19 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level) /* * Using a named enum representing lock types in terms of #N bit stored in * iocb->private, which is going to be used for communication between - * ocfs2_dio_end_io() and ocfs2_file_aio_write/read(). + * ocfs2_dio_end_io() and ocfs2_file_write/read_iter(). */ enum ocfs2_iocb_lock_bits { OCFS2_IOCB_RW_LOCK = 0, OCFS2_IOCB_RW_LOCK_LEVEL, - OCFS2_IOCB_SEM, - OCFS2_IOCB_UNALIGNED_IO, OCFS2_IOCB_NUM_LOCKS }; +#define ocfs2_iocb_init_rw_locked(iocb) \ + (iocb->private = NULL) #define ocfs2_iocb_clear_rw_locked(iocb) \ clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private) #define ocfs2_iocb_rw_locked_level(iocb) \ test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private) -#define ocfs2_iocb_set_sem_locked(iocb) \ - set_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) -#define ocfs2_iocb_clear_sem_locked(iocb) \ - clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) -#define ocfs2_iocb_is_sem_locked(iocb) \ - test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) - -#define ocfs2_iocb_set_unaligned_aio(iocb) \ - set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) -#define ocfs2_iocb_clear_unaligned_aio(iocb) \ - clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) -#define ocfs2_iocb_is_unaligned_aio(iocb) \ - test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) - -#define OCFS2_IOEND_WQ_HASH_SZ 37 -#define ocfs2_ioend_wq(v) (&ocfs2__ioend_wq[((unsigned long)(v)) %\ - OCFS2_IOEND_WQ_HASH_SZ]) -extern wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ]; #endif /* OCFS2_FILE_H */ diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c index 0725e6054650..863a5316030b 100644 --- a/fs/ocfs2/blockcheck.c +++ b/fs/ocfs2/blockcheck.c @@ -1,20 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-only +/* * blockcheck.c * * Checksum and ECC codes for the OCFS2 userspace library. * * Copyright (C) 2006, 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License, version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include <linux/kernel.h> @@ -132,7 +122,7 @@ u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, unsigned int nr * parity bits that are part of the bit number * representation. Huh? * - * <wikipedia href="http://en.wikipedia.org/wiki/Hamming_code"> + * <wikipedia href="https://en.wikipedia.org/wiki/Hamming_code"> * In other words, the parity bit at position 2^k * checks bits in positions having bit k set in * their binary representation. Conversely, for @@ -237,70 +227,38 @@ static int blockcheck_u64_get(void *data, u64 *val) *val = *(u64 *)data; return 0; } -DEFINE_SIMPLE_ATTRIBUTE(blockcheck_fops, blockcheck_u64_get, NULL, "%llu\n"); - -static struct dentry *blockcheck_debugfs_create(const char *name, - struct dentry *parent, - u64 *value) -{ - return debugfs_create_file(name, S_IFREG | S_IRUSR, parent, value, - &blockcheck_fops); -} +DEFINE_DEBUGFS_ATTRIBUTE(blockcheck_fops, blockcheck_u64_get, NULL, "%llu\n"); static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats) { if (stats) { - debugfs_remove(stats->b_debug_check); - stats->b_debug_check = NULL; - debugfs_remove(stats->b_debug_failure); - stats->b_debug_failure = NULL; - debugfs_remove(stats->b_debug_recover); - stats->b_debug_recover = NULL; - debugfs_remove(stats->b_debug_dir); + debugfs_remove_recursive(stats->b_debug_dir); stats->b_debug_dir = NULL; } } -static int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, - struct dentry *parent) +static void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, + struct dentry *parent) { - int rc = -EINVAL; + struct dentry *dir; - if (!stats) - goto out; - - stats->b_debug_dir = debugfs_create_dir("blockcheck", parent); - if (!stats->b_debug_dir) - goto out; + dir = debugfs_create_dir("blockcheck", parent); + stats->b_debug_dir = dir; - stats->b_debug_check = - blockcheck_debugfs_create("blocks_checked", - stats->b_debug_dir, - &stats->b_check_count); + debugfs_create_file("blocks_checked", S_IFREG | S_IRUSR, dir, + &stats->b_check_count, &blockcheck_fops); - stats->b_debug_failure = - blockcheck_debugfs_create("checksums_failed", - stats->b_debug_dir, - &stats->b_failure_count); + debugfs_create_file("checksums_failed", S_IFREG | S_IRUSR, dir, + &stats->b_failure_count, &blockcheck_fops); - stats->b_debug_recover = - blockcheck_debugfs_create("ecc_recoveries", - stats->b_debug_dir, - &stats->b_recover_count); - if (stats->b_debug_check && stats->b_debug_failure && - stats->b_debug_recover) - rc = 0; + debugfs_create_file("ecc_recoveries", S_IFREG | S_IRUSR, dir, + &stats->b_recover_count, &blockcheck_fops); -out: - if (rc) - ocfs2_blockcheck_debug_remove(stats); - return rc; } #else -static inline int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, - struct dentry *parent) +static inline void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, + struct dentry *parent) { - return 0; } static inline void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats) @@ -309,10 +267,10 @@ static inline void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats * #endif /* CONFIG_DEBUG_FS */ /* Always-called wrappers for starting and stopping the debugfs files */ -int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats, - struct dentry *parent) +void ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats, + struct dentry *parent) { - return ocfs2_blockcheck_debug_install(stats, parent); + ocfs2_blockcheck_debug_install(stats, parent); } void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats) diff --git a/fs/ocfs2/blockcheck.h b/fs/ocfs2/blockcheck.h index d4b69febf70a..d0578e98ee8d 100644 --- a/fs/ocfs2/blockcheck.h +++ b/fs/ocfs2/blockcheck.h @@ -1,20 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-only */ +/* * blockcheck.h * * Checksum and ECC codes for the OCFS2 userspace library. * * Copyright (C) 2004, 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License, version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #ifndef OCFS2_BLOCKCHECK_H @@ -33,9 +23,6 @@ struct ocfs2_blockcheck_stats { * ocfs2_blockcheck_stats_debugfs_install() */ struct dentry *b_debug_dir; /* Parent of the debugfs files */ - struct dentry *b_debug_check; /* Exposes b_check_count */ - struct dentry *b_debug_failure; /* Exposes b_failure_count */ - struct dentry *b_debug_recover; /* Exposes b_recover_count */ }; @@ -64,8 +51,8 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr, struct ocfs2_blockcheck_stats *stats); /* Debug Initialization */ -int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats, - struct dentry *parent); +void ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats, + struct dentry *parent); void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats); /* diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index 5d18ad10c27f..8f714406528d 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -1,31 +1,16 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * io.c * * Buffer cache handling * * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/fs.h> #include <linux/types.h> #include <linux/highmem.h> +#include <linux/bio.h> #include <cluster/masklog.h> @@ -79,7 +64,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, get_bh(bh); /* for end_buffer_write_sync() */ bh->b_end_io = end_buffer_write_sync; - submit_bh(WRITE, bh); + submit_bh(REQ_OP_WRITE, bh); wait_on_buffer(bh); @@ -90,7 +75,6 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, * information for this bh as it's not marked locally * uptodate. */ ret = -EIO; - put_bh(bh); mlog_errno(ret); } @@ -99,25 +83,34 @@ out: return ret; } +/* Caller must provide a bhs[] with all NULL or non-NULL entries, so it + * will be easier to handle read failure. + */ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, unsigned int nr, struct buffer_head *bhs[]) { int status = 0; unsigned int i; struct buffer_head *bh; + int new_bh = 0; trace_ocfs2_read_blocks_sync((unsigned long long)block, nr); if (!nr) goto bail; + /* Don't put buffer head and re-assign it to NULL if it is allocated + * outside since the caller can't be aware of this alternation! + */ + new_bh = (bhs[0] == NULL); + for (i = 0 ; i < nr ; i++) { if (bhs[i] == NULL) { bhs[i] = sb_getblk(osb->sb, block++); if (bhs[i] == NULL) { - status = -EIO; + status = -ENOMEM; mlog_errno(status); - goto bail; + break; } } bh = bhs[i]; @@ -140,22 +133,43 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, lock_buffer(bh); if (buffer_jbd(bh)) { +#ifdef CATCH_BH_JBD_RACES mlog(ML_ERROR, "block %llu had the JBD bit set " "while I was in lock_buffer!", (unsigned long long)bh->b_blocknr); BUG(); +#else + unlock_buffer(bh); + continue; +#endif } - clear_buffer_uptodate(bh); get_bh(bh); /* for end_buffer_read_sync() */ bh->b_end_io = end_buffer_read_sync; - submit_bh(READ, bh); + submit_bh(REQ_OP_READ, bh); } +read_failure: for (i = nr; i > 0; i--) { bh = bhs[i - 1]; + if (unlikely(status)) { + if (new_bh && bh) { + /* If middle bh fails, let previous bh + * finish its read and then put it to + * avoid bh leak + */ + if (!buffer_jbd(bh)) + wait_on_buffer(bh); + put_bh(bh); + bhs[i - 1] = NULL; + } else if (bh && buffer_uptodate(bh)) { + clear_buffer_uptodate(bh); + } + continue; + } + /* No need to wait on the buffer if it's managed by JBD. */ if (!buffer_jbd(bh)) wait_on_buffer(bh); @@ -165,8 +179,7 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, * so we can safely record this and loop back * to cleanup the other buffers. */ status = -EIO; - put_bh(bh); - bhs[i - 1] = NULL; + goto read_failure; } } @@ -174,6 +187,9 @@ bail: return status; } +/* Caller must provide a bhs[] with all NULL or non-NULL entries, so it + * will be easier to handle read failure. + */ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, struct buffer_head *bhs[], int flags, int (*validate)(struct super_block *sb, @@ -183,6 +199,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, int i, ignore_cache = 0; struct buffer_head *bh; struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + int new_bh = 0; trace_ocfs2_read_blocks_begin(ci, (unsigned long long)block, nr, flags); @@ -208,15 +225,20 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, goto bail; } + /* Don't put buffer head and re-assign it to NULL if it is allocated + * outside since the caller can't be aware of this alternation! + */ + new_bh = (bhs[0] == NULL); + ocfs2_metadata_cache_io_lock(ci); for (i = 0 ; i < nr ; i++) { if (bhs[i] == NULL) { bhs[i] = sb_getblk(sb, block++); if (bhs[i] == NULL) { - ocfs2_metadata_cache_io_unlock(ci); - status = -EIO; + status = -ENOMEM; mlog_errno(status); - goto bail; + /* Don't forget to put previous bh! */ + break; } } bh = bhs[i]; @@ -301,22 +323,38 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, continue; } - clear_buffer_uptodate(bh); get_bh(bh); /* for end_buffer_read_sync() */ if (validate) set_buffer_needs_validate(bh); bh->b_end_io = end_buffer_read_sync; - submit_bh(READ, bh); + submit_bh(REQ_OP_READ, bh); continue; } } - status = 0; - +read_failure: for (i = (nr - 1); i >= 0; i--) { bh = bhs[i]; if (!(flags & OCFS2_BH_READAHEAD)) { + if (unlikely(status)) { + /* Clear the buffers on error including those + * ever succeeded in reading + */ + if (new_bh && bh) { + /* If middle bh fails, let previous bh + * finish its read and then put it to + * avoid bh leak + */ + if (!buffer_jbd(bh)) + wait_on_buffer(bh); + put_bh(bh); + bhs[i] = NULL; + } else if (bh && buffer_uptodate(bh)) { + clear_buffer_uptodate(bh); + } + continue; + } /* We know this can't have changed as we hold the * owner sem. Avoid doing any work on the bh if the * journal has it. */ @@ -331,9 +369,8 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, * for this bh as it's not marked locally * uptodate. */ status = -EIO; - put_bh(bh); - bhs[i] = NULL; - continue; + clear_buffer_needs_validate(bh); + goto read_failure; } if (buffer_needs_validate(bh)) { @@ -343,18 +380,16 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, BUG_ON(buffer_jbd(bh)); clear_buffer_needs_validate(bh); status = validate(sb, bh); - if (status) { - put_bh(bh); - bhs[i] = NULL; - continue; - } + if (status) + goto read_failure; } } /* Always set the buffer in the cache, even if it was * a forced read, or read-ahead which hasn't yet * completed. */ - ocfs2_set_buffer_uptodate(ci, bh); + if (bh) + ocfs2_set_buffer_uptodate(ci, bh); } ocfs2_metadata_cache_io_unlock(ci); @@ -414,13 +449,12 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb, get_bh(bh); /* for end_buffer_write_sync() */ bh->b_end_io = end_buffer_write_sync; ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &di->i_check); - submit_bh(WRITE, bh); + submit_bh(REQ_OP_WRITE, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { ret = -EIO; - put_bh(bh); mlog_errno(ret); } diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h index b97bcc6dde7c..2d51649fc090 100644 --- a/fs/ocfs2/buffer_head_io.h +++ b/fs/ocfs2/buffer_head_io.h @@ -1,26 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * ocfs2_buffer_head.h * * Buffer cache handling functions defined * * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #ifndef OCFS2_BUFFER_HEAD_IO_H @@ -28,9 +12,6 @@ #include <linux/buffer_head.h> -void ocfs2_end_buffer_io_sync(struct buffer_head *bh, - int uptodate); - int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, struct ocfs2_caching_info *ci); diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile index bc8c5e7d8608..0e5b73215819 100644 --- a/fs/ocfs2/cluster/Makefile +++ b/fs/ocfs2/cluster/Makefile @@ -1,4 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \ - quorum.o tcp.o netdebug.o ver.o + quorum.o tcp.o netdebug.o diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 5c1c864e81cc..724350925aff 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1,24 +1,9 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * Copyright (C) 2004, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ +#include "linux/kstrtox.h" #include <linux/kernel.h> #include <linux/sched.h> #include <linux/jiffies.h> @@ -35,7 +20,8 @@ #include <linux/time.h> #include <linux/debugfs.h> #include <linux/slab.h> - +#include <linux/bitmap.h> +#include <linux/ktime.h> #include "heartbeat.h" #include "tcp.h" #include "nodemanager.h" @@ -105,10 +91,6 @@ static struct o2hb_debug_buf *o2hb_db_failedregions; #define O2HB_DEBUG_REGION_PINNED "pinned" static struct dentry *o2hb_debug_dir; -static struct dentry *o2hb_debug_livenodes; -static struct dentry *o2hb_debug_liveregions; -static struct dentry *o2hb_debug_quorumregions; -static struct dentry *o2hb_debug_failedregions; static LIST_HEAD(o2hb_all_regions); @@ -118,21 +100,19 @@ static struct o2hb_callback { static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type); -#define O2HB_DEFAULT_BLOCK_BITS 9 - enum o2hb_heartbeat_modes { O2HB_HEARTBEAT_LOCAL = 0, O2HB_HEARTBEAT_GLOBAL, O2HB_HEARTBEAT_NUM_MODES, }; -char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = { - "local", /* O2HB_HEARTBEAT_LOCAL */ - "global", /* O2HB_HEARTBEAT_GLOBAL */ +static const char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = { + "local", /* O2HB_HEARTBEAT_LOCAL */ + "global", /* O2HB_HEARTBEAT_GLOBAL */ }; unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD; -unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL; +static unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL; /* * o2hb_dependent_users tracks the number of registered callbacks that depend @@ -140,7 +120,7 @@ unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL; * However only o2dlm depends on the heartbeat. It does not want the heartbeat * to stop while a dlm domain is still active. */ -unsigned int o2hb_dependent_users; +static unsigned int o2hb_dependent_users; /* * In global heartbeat mode, all regions are pinned if there are one or more @@ -218,7 +198,8 @@ struct o2hb_region { unsigned hr_unclean_stop:1, hr_aborted_start:1, hr_item_pinned:1, - hr_item_dropped:1; + hr_item_dropped:1, + hr_node_deleted:1; /* protected by the hr_callback_sem */ struct task_struct *hr_task; @@ -233,7 +214,7 @@ struct o2hb_region { unsigned int hr_num_pages; struct page **hr_slot_data; - struct block_device *hr_bdev; + struct file *hr_bdev_file; struct o2hb_disk_slot *hr_slots; /* live node map of this region */ @@ -241,10 +222,6 @@ struct o2hb_region { unsigned int hr_region_num; struct dentry *hr_debug_dir; - struct dentry *hr_debug_livenodes; - struct dentry *hr_debug_regnum; - struct dentry *hr_debug_elapsed_time; - struct dentry *hr_debug_pinned; struct o2hb_debug_buf *hr_db_livenodes; struct o2hb_debug_buf *hr_db_regnum; struct o2hb_debug_buf *hr_db_elapsed_time; @@ -259,8 +236,6 @@ struct o2hb_region { * (hr_steady_iterations == 0) within hr_unsteady_iterations */ atomic_t hr_unsteady_iterations; - char hr_dev_name[BDEVNAME_SIZE]; - unsigned int hr_timeout_ms; /* randomized as the region goes up and down so that a node @@ -270,48 +245,65 @@ struct o2hb_region { struct delayed_work hr_write_timeout_work; unsigned long hr_last_timeout_start; + /* negotiate timer, used to negotiate extending hb timeout. */ + struct delayed_work hr_nego_timeout_work; + unsigned long hr_nego_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; + /* Used during o2hb_check_slot to hold a copy of the block * being checked because we temporarily have to zero out the * crc field. */ struct o2hb_disk_heartbeat_block *hr_tmp_block; + + /* Message key for negotiate timeout message. */ + unsigned int hr_key; + struct list_head hr_handler_list; + + /* last hb status, 0 for success, other value for error. */ + int hr_last_hb_status; }; +static inline struct block_device *reg_bdev(struct o2hb_region *reg) +{ + return reg->hr_bdev_file ? file_bdev(reg->hr_bdev_file) : NULL; +} + struct o2hb_bio_wait_ctxt { atomic_t wc_num_reqs; struct completion wc_io_complete; int wc_error; }; -static int o2hb_pop_count(void *map, int count) -{ - int i = -1, pop = 0; +#define O2HB_NEGO_TIMEOUT_MS (O2HB_MAX_WRITE_TIMEOUT_MS/2) - while ((i = find_next_bit(map, count, i + 1)) < count) - pop++; - return pop; -} +enum { + O2HB_NEGO_TIMEOUT_MSG = 1, + O2HB_NEGO_APPROVE_MSG = 2, +}; + +struct o2hb_nego_msg { + u8 node_num; +}; static void o2hb_write_timeout(struct work_struct *work) { int failed, quorum; - unsigned long flags; struct o2hb_region *reg = container_of(work, struct o2hb_region, hr_write_timeout_work.work); - mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u " - "milliseconds\n", reg->hr_dev_name, + mlog(ML_ERROR, "Heartbeat write timeout to device %pg after %u " + "milliseconds\n", reg_bdev(reg), jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); if (o2hb_global_heartbeat_active()) { - spin_lock_irqsave(&o2hb_live_lock, flags); + spin_lock(&o2hb_live_lock); if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) set_bit(reg->hr_region_num, o2hb_failed_region_bitmap); - failed = o2hb_pop_count(&o2hb_failed_region_bitmap, + failed = bitmap_weight(o2hb_failed_region_bitmap, O2NM_MAX_REGIONS); - quorum = o2hb_pop_count(&o2hb_quorum_region_bitmap, + quorum = bitmap_weight(o2hb_quorum_region_bitmap, O2NM_MAX_REGIONS); - spin_unlock_irqrestore(&o2hb_live_lock, flags); + spin_unlock(&o2hb_live_lock); mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n", quorum, failed); @@ -327,7 +319,7 @@ static void o2hb_write_timeout(struct work_struct *work) o2quo_disk_timeout(); } -static void o2hb_arm_write_timeout(struct o2hb_region *reg) +static void o2hb_arm_timeout(struct o2hb_region *reg) { /* Arm writeout only after thread reaches steady state */ if (atomic_read(®->hr_steady_iterations) != 0) @@ -342,14 +334,134 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg) spin_unlock(&o2hb_live_lock); } cancel_delayed_work(®->hr_write_timeout_work); - reg->hr_last_timeout_start = jiffies; schedule_delayed_work(®->hr_write_timeout_work, msecs_to_jiffies(O2HB_MAX_WRITE_TIMEOUT_MS)); + + cancel_delayed_work(®->hr_nego_timeout_work); + /* negotiate timeout must be less than write timeout. */ + schedule_delayed_work(®->hr_nego_timeout_work, + msecs_to_jiffies(O2HB_NEGO_TIMEOUT_MS)); + bitmap_zero(reg->hr_nego_node_bitmap, O2NM_MAX_NODES); } -static void o2hb_disarm_write_timeout(struct o2hb_region *reg) +static void o2hb_disarm_timeout(struct o2hb_region *reg) { cancel_delayed_work_sync(®->hr_write_timeout_work); + cancel_delayed_work_sync(®->hr_nego_timeout_work); +} + +static int o2hb_send_nego_msg(int key, int type, u8 target) +{ + struct o2hb_nego_msg msg; + int status, ret; + + msg.node_num = o2nm_this_node(); +again: + ret = o2net_send_message(type, key, &msg, sizeof(msg), + target, &status); + + if (ret == -EAGAIN || ret == -ENOMEM) { + msleep(100); + goto again; + } + + return ret; +} + +static void o2hb_nego_timeout(struct work_struct *work) +{ + unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; + int master_node, i, ret; + struct o2hb_region *reg; + + reg = container_of(work, struct o2hb_region, hr_nego_timeout_work.work); + /* don't negotiate timeout if last hb failed since it is very + * possible io failed. Should let write timeout fence self. + */ + if (reg->hr_last_hb_status) + return; + + o2hb_fill_node_map(live_node_bitmap, O2NM_MAX_NODES); + /* lowest node as master node to make negotiate decision. */ + master_node = find_first_bit(live_node_bitmap, O2NM_MAX_NODES); + + if (master_node == o2nm_this_node()) { + if (!test_bit(master_node, reg->hr_nego_node_bitmap)) { + printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%pg).\n", + o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, + config_item_name(®->hr_item), reg_bdev(reg)); + set_bit(master_node, reg->hr_nego_node_bitmap); + } + if (!bitmap_equal(reg->hr_nego_node_bitmap, live_node_bitmap, + O2NM_MAX_NODES)) { + /* check negotiate bitmap every second to do timeout + * approve decision. + */ + schedule_delayed_work(®->hr_nego_timeout_work, + msecs_to_jiffies(1000)); + + return; + } + + printk(KERN_NOTICE "o2hb: all nodes hb write hung, maybe region %s (%pg) is down.\n", + config_item_name(®->hr_item), + reg_bdev(reg)); + /* approve negotiate timeout request. */ + o2hb_arm_timeout(reg); + + i = -1; + while ((i = find_next_bit(live_node_bitmap, + O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { + if (i == master_node) + continue; + + mlog(ML_HEARTBEAT, "send NEGO_APPROVE msg to node %d\n", i); + ret = o2hb_send_nego_msg(reg->hr_key, + O2HB_NEGO_APPROVE_MSG, i); + if (ret) + mlog(ML_ERROR, "send NEGO_APPROVE msg to node %d fail %d\n", + i, ret); + } + } else { + /* negotiate timeout with master node. */ + printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%pg), negotiate timeout with node %d.\n", + o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, config_item_name(®->hr_item), + reg_bdev(reg), master_node); + ret = o2hb_send_nego_msg(reg->hr_key, O2HB_NEGO_TIMEOUT_MSG, + master_node); + if (ret) + mlog(ML_ERROR, "send NEGO_TIMEOUT msg to node %d fail %d\n", + master_node, ret); + } +} + +static int o2hb_nego_timeout_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct o2hb_region *reg = data; + struct o2hb_nego_msg *nego_msg; + + nego_msg = (struct o2hb_nego_msg *)msg->buf; + printk(KERN_NOTICE "o2hb: receive negotiate timeout message from node %d on region %s (%pg).\n", + nego_msg->node_num, config_item_name(®->hr_item), + reg_bdev(reg)); + if (nego_msg->node_num < O2NM_MAX_NODES) + set_bit(nego_msg->node_num, reg->hr_nego_node_bitmap); + else + mlog(ML_ERROR, "got nego timeout message from bad node.\n"); + + return 0; +} + +static int o2hb_nego_approve_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct o2hb_region *reg = data; + + printk(KERN_NOTICE "o2hb: negotiate timeout approved by master node on region %s (%pg).\n", + config_item_name(®->hr_item), reg_bdev(reg)); + o2hb_arm_timeout(reg); + return 0; } static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc) @@ -373,21 +485,19 @@ static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc, } } -static void o2hb_wait_on_io(struct o2hb_region *reg, - struct o2hb_bio_wait_ctxt *wc) +static void o2hb_wait_on_io(struct o2hb_bio_wait_ctxt *wc) { o2hb_bio_wait_dec(wc, 1); wait_for_completion(&wc->wc_io_complete); } -static void o2hb_bio_end_io(struct bio *bio, - int error) +static void o2hb_bio_end_io(struct bio *bio) { struct o2hb_bio_wait_ctxt *wc = bio->bi_private; - if (error) { - mlog(ML_ERROR, "IO Error %d\n", error); - wc->wc_error = error; + if (bio->bi_status) { + mlog(ML_ERROR, "IO Error %d\n", bio->bi_status); + wc->wc_error = blk_status_to_errno(bio->bi_status); } o2hb_bio_wait_dec(wc, 1); @@ -399,7 +509,7 @@ static void o2hb_bio_end_io(struct bio *bio, static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, struct o2hb_bio_wait_ctxt *wc, unsigned int *current_slot, - unsigned int max_slots) + unsigned int max_slots, blk_opf_t opf) { int len, current_page; unsigned int vec_len, vec_start; @@ -413,7 +523,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, * GFP_KERNEL that the local node can get fenced. It would be * nicest if we could pre-allocate these bios and avoid this * all together. */ - bio = bio_alloc(GFP_ATOMIC, 16); + bio = bio_alloc(reg_bdev(reg), 16, opf, GFP_ATOMIC); if (!bio) { mlog(ML_ERROR, "Could not alloc slots BIO!\n"); bio = ERR_PTR(-ENOMEM); @@ -421,18 +531,17 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, } /* Must put everything in 512 byte sectors for the bio... */ - bio->bi_sector = (reg->hr_start_block + cs) << (bits - 9); - bio->bi_bdev = reg->hr_bdev; + bio->bi_iter.bi_sector = (reg->hr_start_block + cs) << (bits - 9); bio->bi_private = wc; bio->bi_end_io = o2hb_bio_end_io; - vec_start = (cs << bits) % PAGE_CACHE_SIZE; + vec_start = (cs << bits) % PAGE_SIZE; while(cs < max_slots) { current_page = cs / spp; page = reg->hr_slot_data[current_page]; - vec_len = min(PAGE_CACHE_SIZE - vec_start, - (max_slots-cs) * (PAGE_CACHE_SIZE/spp) ); + vec_len = min(PAGE_SIZE - vec_start, + (max_slots-cs) * (PAGE_SIZE/spp) ); mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n", current_page, vec_len, vec_start); @@ -440,7 +549,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, len = bio_add_page(bio, page, vec_len, vec_start); if (len != vec_len) break; - cs += vec_len / (PAGE_CACHE_SIZE/spp); + cs += vec_len / (PAGE_SIZE/spp); vec_start = 0; } @@ -450,9 +559,10 @@ bail: } static int o2hb_read_slots(struct o2hb_region *reg, + unsigned int begin_slot, unsigned int max_slots) { - unsigned int current_slot=0; + unsigned int current_slot = begin_slot; int status; struct o2hb_bio_wait_ctxt wc; struct bio *bio; @@ -460,7 +570,8 @@ static int o2hb_read_slots(struct o2hb_region *reg, o2hb_bio_wait_init(&wc); while(current_slot < max_slots) { - bio = o2hb_setup_one_bio(reg, &wc, ¤t_slot, max_slots); + bio = o2hb_setup_one_bio(reg, &wc, ¤t_slot, max_slots, + REQ_OP_READ); if (IS_ERR(bio)) { status = PTR_ERR(bio); mlog_errno(status); @@ -468,13 +579,13 @@ static int o2hb_read_slots(struct o2hb_region *reg, } atomic_inc(&wc.wc_num_reqs); - submit_bio(READ, bio); + submit_bio(bio); } status = 0; bail_and_wait: - o2hb_wait_on_io(reg, &wc); + o2hb_wait_on_io(&wc); if (wc.wc_error && !status) status = wc.wc_error; @@ -492,7 +603,8 @@ static int o2hb_issue_node_write(struct o2hb_region *reg, slot = o2nm_this_node(); - bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1); + bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1, + REQ_OP_WRITE | REQ_SYNC); if (IS_ERR(bio)) { status = PTR_ERR(bio); mlog_errno(status); @@ -500,7 +612,7 @@ static int o2hb_issue_node_write(struct o2hb_region *reg, } atomic_inc(&write_wc->wc_num_reqs); - submit_bio(WRITE_SYNC, bio); + submit_bio(bio); status = 0; bail: @@ -582,8 +694,8 @@ static int o2hb_check_own_slot(struct o2hb_region *reg) else errstr = ERRSTR3; - mlog(ML_ERROR, "%s (%s): expected(%u:0x%llx, 0x%llx), " - "ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg->hr_dev_name, + mlog(ML_ERROR, "%s (%pg): expected(%u:0x%llx, 0x%llx), " + "ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg_bdev(reg), slot->ds_node_num, (unsigned long long)slot->ds_last_generation, (unsigned long long)slot->ds_last_time, hb_block->hb_node, (unsigned long long)le64_to_cpu(hb_block->hb_generation), @@ -606,7 +718,7 @@ static inline void o2hb_prepare_block(struct o2hb_region *reg, hb_block = (struct o2hb_disk_heartbeat_block *)slot->ds_raw_block; memset(hb_block, 0, reg->hr_block_bytes); /* TODO: time stuff */ - cputime = CURRENT_TIME.tv_sec; + cputime = ktime_get_real_seconds(); if (!cputime) cputime = 1; @@ -628,11 +740,9 @@ static void o2hb_fire_callbacks(struct o2hb_callback *hbcall, struct o2nm_node *node, int idx) { - struct list_head *iter; struct o2hb_callback_func *f; - list_for_each(iter, &hbcall->list) { - f = list_entry(iter, struct o2hb_callback_func, hc_item); + list_for_each_entry(f, &hbcall->list, hc_item) { mlog(ML_HEARTBEAT, "calling funcs %p\n", f); (f->hc_func)(node, idx, f->hc_data); } @@ -641,16 +751,9 @@ static void o2hb_fire_callbacks(struct o2hb_callback *hbcall, /* Will run the list in order until we process the passed event */ static void o2hb_run_event_list(struct o2hb_node_event *queued_event) { - int empty; struct o2hb_callback *hbcall; struct o2hb_node_event *event; - spin_lock(&o2hb_live_lock); - empty = list_empty(&queued_event->hn_item); - spin_unlock(&o2hb_live_lock); - if (empty) - return; - /* Holding callback sem assures we don't alter the callback * lists when doing this, and serializes ourselves with other * processes wanting callbacks. */ @@ -709,6 +812,7 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot) struct o2hb_node_event event = { .hn_item = LIST_HEAD_INIT(event.hn_item), }; struct o2nm_node *node; + int queued = 0; node = o2nm_get_node_by_num(slot->ds_node_num); if (!node) @@ -726,11 +830,13 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot) o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node, slot->ds_node_num); + queued = 1; } } spin_unlock(&o2hb_live_lock); - o2hb_run_event_list(&event); + if (queued) + o2hb_run_event_list(&event); o2nm_node_put(node); } @@ -758,12 +864,12 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg) * live nodes heartbeat on it. In other words, the region has been * added to all nodes. */ - if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap, - sizeof(o2hb_live_node_bitmap))) + if (!bitmap_equal(reg->hr_live_node_bitmap, o2hb_live_node_bitmap, + O2NM_MAX_NODES)) goto unlock; - printk(KERN_NOTICE "o2hb: Region %s (%s) is now a quorum device\n", - config_item_name(®->hr_item), reg->hr_dev_name); + printk(KERN_NOTICE "o2hb: Region %s (%pg) is now a quorum device\n", + config_item_name(®->hr_item), reg_bdev(reg)); set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); @@ -771,7 +877,7 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg) * If global heartbeat active, unpin all regions if the * region count > CUT_OFF */ - if (o2hb_pop_count(&o2hb_quorum_region_bitmap, + if (bitmap_weight(o2hb_quorum_region_bitmap, O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF) o2hb_region_unpin(NULL); unlock: @@ -790,6 +896,7 @@ static int o2hb_check_slot(struct o2hb_region *reg, unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS; unsigned int slot_dead_ms; int tmp; + int queued = 0; memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes); @@ -820,8 +927,8 @@ static int o2hb_check_slot(struct o2hb_region *reg, /* The node is live but pushed out a bad crc. We * consider it a transient miss but don't populate any * other values as they may be junk. */ - mlog(ML_ERROR, "Node %d has written a bad crc to %s\n", - slot->ds_node_num, reg->hr_dev_name); + mlog(ML_ERROR, "Node %d has written a bad crc to %pg\n", + slot->ds_node_num, reg_bdev(reg)); o2hb_dump_slot(hb_block); slot->ds_equal_samples++; @@ -883,6 +990,7 @@ fire_callbacks: slot->ds_node_num); changed = 1; + queued = 1; } list_add_tail(&slot->ds_live_item, @@ -899,12 +1007,12 @@ fire_callbacks: slot_dead_ms = le32_to_cpu(hb_block->hb_dead_ms); if (slot_dead_ms && slot_dead_ms != dead_ms) { /* TODO: Perhaps we can fail the region here. */ - mlog(ML_ERROR, "Node %d on device %s has a dead count " + mlog(ML_ERROR, "Node %d on device %pg has a dead count " "of %u ms, but our count is %u ms.\n" "Please double check your configuration values " "for 'O2CB_HEARTBEAT_THRESHOLD'\n", - slot->ds_node_num, reg->hr_dev_name, slot_dead_ms, - dead_ms); + slot->ds_node_num, reg_bdev(reg), + slot_dead_ms, dead_ms); } goto out; } @@ -913,7 +1021,7 @@ fire_callbacks: if (list_empty(&slot->ds_live_item)) goto out; - /* live nodes only go dead after enough consequtive missed + /* live nodes only go dead after enough consecutive missed * samples.. reset the missed counter whenever we see * activity */ if (slot->ds_equal_samples >= o2hb_dead_threshold || gen_changed) { @@ -934,6 +1042,7 @@ fire_callbacks: node, slot->ds_node_num); changed = 1; + queued = 1; } /* We don't clear this because the node is still @@ -949,35 +1058,27 @@ fire_callbacks: out: spin_unlock(&o2hb_live_lock); - o2hb_run_event_list(&event); + if (queued) + o2hb_run_event_list(&event); if (node) o2nm_node_put(node); return changed; } -/* This could be faster if we just implmented a find_last_bit, but I - * don't think the circumstances warrant it. */ -static int o2hb_highest_node(unsigned long *nodes, - int numbits) +static int o2hb_highest_node(unsigned long *nodes, int numbits) { - int highest, node; - - highest = numbits; - node = -1; - while ((node = find_next_bit(nodes, numbits, node + 1)) != -1) { - if (node >= numbits) - break; - - highest = node; - } + return find_last_bit(nodes, numbits); +} - return highest; +static int o2hb_lowest_node(unsigned long *nodes, int numbits) +{ + return find_first_bit(nodes, numbits); } static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) { - int i, ret, highest_node; + int i, ret, highest_node, lowest_node; int membership_change = 0, own_slot_ok = 0; unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; @@ -994,7 +1095,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) * If a node is not configured but is in the livemap, we still need * to read the slot so as to be able to remove it from the livemap. */ - o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap)); + o2hb_fill_node_map(live_node_bitmap, O2NM_MAX_NODES); i = -1; while ((i = find_next_bit(live_node_bitmap, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { @@ -1002,7 +1103,8 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) } highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); - if (highest_node >= O2NM_MAX_NODES) { + lowest_node = o2hb_lowest_node(configured_nodes, O2NM_MAX_NODES); + if (highest_node >= O2NM_MAX_NODES || lowest_node >= O2NM_MAX_NODES) { mlog(ML_NOTICE, "o2hb: No configured nodes found!\n"); ret = -EINVAL; goto bail; @@ -1012,7 +1114,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) * yet. Of course, if the node definitions have holes in them * then we're reading an empty slot anyway... Consider this * best-effort. */ - ret = o2hb_read_slots(reg, highest_node + 1); + ret = o2hb_read_slots(reg, lowest_node, highest_node + 1); if (ret < 0) { mlog_errno(ret); goto bail; @@ -1043,13 +1145,13 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) * before we can go to steady state. This ensures that * people we find in our steady state have seen us. */ - o2hb_wait_on_io(reg, &write_wc); + o2hb_wait_on_io(&write_wc); if (write_wc.wc_error) { /* Do not re-arm the write timeout on I/O error - we * can't be sure that the new block ever made it to * disk */ - mlog(ML_ERROR, "Write error %d on device \"%s\"\n", - write_wc.wc_error, reg->hr_dev_name); + mlog(ML_ERROR, "Write error %d on device \"%pg\"\n", + write_wc.wc_error, reg_bdev(reg)); ret = write_wc.wc_error; goto bail; } @@ -1057,7 +1159,8 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) /* Skip disarming the timeout if own slot has stale/bad data */ if (own_slot_ok) { o2hb_set_quorum_device(reg); - o2hb_arm_write_timeout(reg); + o2hb_arm_timeout(reg); + reg->hr_last_timeout_start = jiffies; } bail: @@ -1072,9 +1175,9 @@ bail: if (atomic_read(®->hr_steady_iterations) != 0) { if (atomic_dec_and_test(®->hr_unsteady_iterations)) { printk(KERN_NOTICE "o2hb: Unable to stabilize " - "heartbeart on region %s (%s)\n", + "heartbeat on region %s (%pg)\n", config_item_name(®->hr_item), - reg->hr_dev_name); + reg_bdev(reg)); atomic_set(®->hr_steady_iterations, 0); reg->hr_aborted_start = 1; wake_up(&o2hb_steady_queue); @@ -1085,37 +1188,6 @@ bail: return ret; } -/* Subtract b from a, storing the result in a. a *must* have a larger - * value than b. */ -static void o2hb_tv_subtract(struct timeval *a, - struct timeval *b) -{ - /* just return 0 when a is after b */ - if (a->tv_sec < b->tv_sec || - (a->tv_sec == b->tv_sec && a->tv_usec < b->tv_usec)) { - a->tv_sec = 0; - a->tv_usec = 0; - return; - } - - a->tv_sec -= b->tv_sec; - a->tv_usec -= b->tv_usec; - while ( a->tv_usec < 0 ) { - a->tv_sec--; - a->tv_usec += 1000000; - } -} - -static unsigned int o2hb_elapsed_msecs(struct timeval *start, - struct timeval *end) -{ - struct timeval res = *end; - - o2hb_tv_subtract(&res, start); - - return res.tv_sec * 1000 + res.tv_usec / 1000; -} - /* * we ride the region ref that the region dir holds. before the region * dir is removed and drops it ref it will wait to tear down this @@ -1126,15 +1198,21 @@ static int o2hb_thread(void *data) int i, ret; struct o2hb_region *reg = data; struct o2hb_bio_wait_ctxt write_wc; - struct timeval before_hb, after_hb; + ktime_t before_hb, after_hb; unsigned int elapsed_msec; mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n"); - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); /* Pin node */ - o2nm_depend_this_node(); + ret = o2nm_depend_this_node(); + if (ret) { + mlog(ML_ERROR, "Node has been deleted, ret = %d\n", ret); + reg->hr_node_deleted = 1; + wake_up(&o2hb_steady_queue); + return 0; + } while (!kthread_should_stop() && !reg->hr_unclean_stop && !reg->hr_aborted_start) { @@ -1143,18 +1221,19 @@ static int o2hb_thread(void *data) * hr_timeout_ms between disk writes. On busy systems * this should result in a heartbeat which is less * likely to time itself out. */ - do_gettimeofday(&before_hb); + before_hb = ktime_get_real(); ret = o2hb_do_disk_heartbeat(reg); + reg->hr_last_hb_status = ret; + + after_hb = ktime_get_real(); - do_gettimeofday(&after_hb); - elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); + elapsed_msec = (unsigned int) + ktime_ms_delta(after_hb, before_hb); mlog(ML_HEARTBEAT, - "start = %lu.%lu, end = %lu.%lu, msec = %u\n", - before_hb.tv_sec, (unsigned long) before_hb.tv_usec, - after_hb.tv_sec, (unsigned long) after_hb.tv_usec, - elapsed_msec); + "start = %lld, end = %lld, msec = %u, ret = %d\n", + before_hb, after_hb, elapsed_msec, ret); if (!kthread_should_stop() && elapsed_msec < reg->hr_timeout_ms) { @@ -1164,7 +1243,7 @@ static int o2hb_thread(void *data) } } - o2hb_disarm_write_timeout(reg); + o2hb_disarm_timeout(reg); /* unclean stop is only used in very bad situation */ for(i = 0; !reg->hr_unclean_stop && i < reg->hr_blocks; i++) @@ -1179,7 +1258,7 @@ static int o2hb_thread(void *data) o2hb_prepare_block(reg, 0); ret = o2hb_issue_node_write(reg, &write_wc); if (ret == 0) - o2hb_wait_on_io(reg, &write_wc); + o2hb_wait_on_io(&write_wc); else mlog_errno(ret); } @@ -1229,7 +1308,7 @@ static int o2hb_debug_open(struct inode *inode, struct file *file) case O2HB_DB_TYPE_REGION_NUMBER: reg = (struct o2hb_region *)db->db_data; - out += snprintf(buf + out, PAGE_SIZE - out, "%d\n", + out += scnprintf(buf + out, PAGE_SIZE - out, "%d\n", reg->hr_region_num); goto done; @@ -1239,12 +1318,12 @@ static int o2hb_debug_open(struct inode *inode, struct file *file) /* If 0, it has never been set before */ if (lts) lts = jiffies_to_msecs(jiffies - lts); - out += snprintf(buf + out, PAGE_SIZE - out, "%lu\n", lts); + out += scnprintf(buf + out, PAGE_SIZE - out, "%lu\n", lts); goto done; case O2HB_DB_TYPE_REGION_PINNED: reg = (struct o2hb_region *)db->db_data; - out += snprintf(buf + out, PAGE_SIZE - out, "%u\n", + out += scnprintf(buf + out, PAGE_SIZE - out, "%u\n", !!reg->hr_item_pinned); goto done; @@ -1253,8 +1332,8 @@ static int o2hb_debug_open(struct inode *inode, struct file *file) } while ((i = find_next_bit(map, db->db_len, i + 1)) < db->db_len) - out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i); - out += snprintf(buf + out, PAGE_SIZE - out, "\n"); + out += scnprintf(buf + out, PAGE_SIZE - out, "%d ", i); + out += scnprintf(buf + out, PAGE_SIZE - out, "\n"); done: i_size_write(inode, out); @@ -1303,107 +1382,60 @@ static const struct file_operations o2hb_debug_fops = { void o2hb_exit(void) { + debugfs_remove_recursive(o2hb_debug_dir); kfree(o2hb_db_livenodes); kfree(o2hb_db_liveregions); kfree(o2hb_db_quorumregions); kfree(o2hb_db_failedregions); - debugfs_remove(o2hb_debug_failedregions); - debugfs_remove(o2hb_debug_quorumregions); - debugfs_remove(o2hb_debug_liveregions); - debugfs_remove(o2hb_debug_livenodes); - debugfs_remove(o2hb_debug_dir); } -static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir, - struct o2hb_debug_buf **db, int db_len, - int type, int size, int len, void *data) +static void o2hb_debug_create(const char *name, struct dentry *dir, + struct o2hb_debug_buf **db, int db_len, int type, + int size, int len, void *data) { *db = kmalloc(db_len, GFP_KERNEL); if (!*db) - return NULL; + return; (*db)->db_type = type; (*db)->db_size = size; (*db)->db_len = len; (*db)->db_data = data; - return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db, - &o2hb_debug_fops); + debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db, &o2hb_debug_fops); } -static int o2hb_debug_init(void) +static void o2hb_debug_init(void) { - int ret = -ENOMEM; - o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL); - if (!o2hb_debug_dir) { - mlog_errno(ret); - goto bail; - } - - o2hb_debug_livenodes = o2hb_debug_create(O2HB_DEBUG_LIVENODES, - o2hb_debug_dir, - &o2hb_db_livenodes, - sizeof(*o2hb_db_livenodes), - O2HB_DB_TYPE_LIVENODES, - sizeof(o2hb_live_node_bitmap), - O2NM_MAX_NODES, - o2hb_live_node_bitmap); - if (!o2hb_debug_livenodes) { - mlog_errno(ret); - goto bail; - } - o2hb_debug_liveregions = o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS, - o2hb_debug_dir, - &o2hb_db_liveregions, - sizeof(*o2hb_db_liveregions), - O2HB_DB_TYPE_LIVEREGIONS, - sizeof(o2hb_live_region_bitmap), - O2NM_MAX_REGIONS, - o2hb_live_region_bitmap); - if (!o2hb_debug_liveregions) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_LIVENODES, o2hb_debug_dir, + &o2hb_db_livenodes, sizeof(*o2hb_db_livenodes), + O2HB_DB_TYPE_LIVENODES, sizeof(o2hb_live_node_bitmap), + O2NM_MAX_NODES, o2hb_live_node_bitmap); - o2hb_debug_quorumregions = - o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS, - o2hb_debug_dir, - &o2hb_db_quorumregions, - sizeof(*o2hb_db_quorumregions), - O2HB_DB_TYPE_QUORUMREGIONS, - sizeof(o2hb_quorum_region_bitmap), - O2NM_MAX_REGIONS, - o2hb_quorum_region_bitmap); - if (!o2hb_debug_quorumregions) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS, o2hb_debug_dir, + &o2hb_db_liveregions, sizeof(*o2hb_db_liveregions), + O2HB_DB_TYPE_LIVEREGIONS, + sizeof(o2hb_live_region_bitmap), O2NM_MAX_REGIONS, + o2hb_live_region_bitmap); - o2hb_debug_failedregions = - o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS, - o2hb_debug_dir, - &o2hb_db_failedregions, - sizeof(*o2hb_db_failedregions), - O2HB_DB_TYPE_FAILEDREGIONS, - sizeof(o2hb_failed_region_bitmap), - O2NM_MAX_REGIONS, - o2hb_failed_region_bitmap); - if (!o2hb_debug_failedregions) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS, o2hb_debug_dir, + &o2hb_db_quorumregions, + sizeof(*o2hb_db_quorumregions), + O2HB_DB_TYPE_QUORUMREGIONS, + sizeof(o2hb_quorum_region_bitmap), O2NM_MAX_REGIONS, + o2hb_quorum_region_bitmap); - ret = 0; -bail: - if (ret) - o2hb_exit(); - - return ret; + o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS, o2hb_debug_dir, + &o2hb_db_failedregions, + sizeof(*o2hb_db_failedregions), + O2HB_DB_TYPE_FAILEDREGIONS, + sizeof(o2hb_failed_region_bitmap), O2NM_MAX_REGIONS, + o2hb_failed_region_bitmap); } -int o2hb_init(void) +void o2hb_init(void) { int i; @@ -1413,38 +1445,34 @@ int o2hb_init(void) for (i = 0; i < ARRAY_SIZE(o2hb_live_slots); i++) INIT_LIST_HEAD(&o2hb_live_slots[i]); - INIT_LIST_HEAD(&o2hb_node_events); - - memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); - memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap)); - memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap)); - memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap)); - memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap)); + bitmap_zero(o2hb_live_node_bitmap, O2NM_MAX_NODES); + bitmap_zero(o2hb_region_bitmap, O2NM_MAX_REGIONS); + bitmap_zero(o2hb_live_region_bitmap, O2NM_MAX_REGIONS); + bitmap_zero(o2hb_quorum_region_bitmap, O2NM_MAX_REGIONS); + bitmap_zero(o2hb_failed_region_bitmap, O2NM_MAX_REGIONS); o2hb_dependent_users = 0; - return o2hb_debug_init(); + o2hb_debug_init(); } /* if we're already in a callback then we're already serialized by the sem */ static void o2hb_fill_node_map_from_callback(unsigned long *map, - unsigned bytes) + unsigned int bits) { - BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long))); - - memcpy(map, &o2hb_live_node_bitmap, bytes); + bitmap_copy(map, o2hb_live_node_bitmap, bits); } /* * get a map of all nodes that are heartbeating in any regions */ -void o2hb_fill_node_map(unsigned long *map, unsigned bytes) +void o2hb_fill_node_map(unsigned long *map, unsigned int bits) { /* callers want to serialize this map and callbacks so that they * can trust that they don't miss nodes coming to the party */ down_read(&o2hb_callback_sem); spin_lock(&o2hb_live_lock); - o2hb_fill_node_map_from_callback(map, bytes); + o2hb_fill_node_map_from_callback(map, bits); spin_unlock(&o2hb_live_lock); up_read(&o2hb_callback_sem); } @@ -1469,7 +1497,7 @@ static void o2hb_region_release(struct config_item *item) struct page *page; struct o2hb_region *reg = to_o2hb_region(item); - mlog(ML_HEARTBEAT, "hb region release (%s)\n", reg->hr_dev_name); + mlog(ML_HEARTBEAT, "hb region release (%pg)\n", reg_bdev(reg)); kfree(reg->hr_tmp_block); @@ -1482,38 +1510,37 @@ static void o2hb_region_release(struct config_item *item) kfree(reg->hr_slot_data); } - if (reg->hr_bdev) - blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE); + if (reg->hr_bdev_file) + fput(reg->hr_bdev_file); kfree(reg->hr_slots); - kfree(reg->hr_db_regnum); + debugfs_remove_recursive(reg->hr_debug_dir); kfree(reg->hr_db_livenodes); - debugfs_remove(reg->hr_debug_livenodes); - debugfs_remove(reg->hr_debug_regnum); - debugfs_remove(reg->hr_debug_elapsed_time); - debugfs_remove(reg->hr_debug_pinned); - debugfs_remove(reg->hr_debug_dir); + kfree(reg->hr_db_regnum); + kfree(reg->hr_db_elapsed_time); + kfree(reg->hr_db_pinned); spin_lock(&o2hb_live_lock); list_del(®->hr_all_item); spin_unlock(&o2hb_live_lock); + o2net_unregister_handler_list(®->hr_handler_list); kfree(reg); } static int o2hb_read_block_input(struct o2hb_region *reg, const char *page, - size_t count, unsigned long *ret_bytes, unsigned int *ret_bits) { unsigned long bytes; char *p = (char *)page; + int ret; - bytes = simple_strtoul(p, &p, 0); - if (!p || (*p && (*p != '\n'))) - return -EINVAL; + ret = kstrtoul(p, 0, &bytes); + if (ret) + return ret; /* Heartbeat and fs min / max block sizes are the same. */ if (bytes > 4096 || bytes < 512) @@ -1529,25 +1556,26 @@ static int o2hb_read_block_input(struct o2hb_region *reg, return 0; } -static ssize_t o2hb_region_block_bytes_read(struct o2hb_region *reg, +static ssize_t o2hb_region_block_bytes_show(struct config_item *item, char *page) { - return sprintf(page, "%u\n", reg->hr_block_bytes); + return sprintf(page, "%u\n", to_o2hb_region(item)->hr_block_bytes); } -static ssize_t o2hb_region_block_bytes_write(struct o2hb_region *reg, +static ssize_t o2hb_region_block_bytes_store(struct config_item *item, const char *page, size_t count) { + struct o2hb_region *reg = to_o2hb_region(item); int status; unsigned long block_bytes; unsigned int block_bits; - if (reg->hr_bdev) + if (reg->hr_bdev_file) return -EINVAL; - status = o2hb_read_block_input(reg, page, count, - &block_bytes, &block_bits); + status = o2hb_read_block_input(reg, page, &block_bytes, + &block_bits); if (status) return status; @@ -1557,24 +1585,26 @@ static ssize_t o2hb_region_block_bytes_write(struct o2hb_region *reg, return count; } -static ssize_t o2hb_region_start_block_read(struct o2hb_region *reg, +static ssize_t o2hb_region_start_block_show(struct config_item *item, char *page) { - return sprintf(page, "%llu\n", reg->hr_start_block); + return sprintf(page, "%llu\n", to_o2hb_region(item)->hr_start_block); } -static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg, +static ssize_t o2hb_region_start_block_store(struct config_item *item, const char *page, size_t count) { + struct o2hb_region *reg = to_o2hb_region(item); unsigned long long tmp; char *p = (char *)page; + ssize_t ret; - if (reg->hr_bdev) + if (reg->hr_bdev_file) return -EINVAL; - tmp = simple_strtoull(p, &p, 0); - if (!p || (*p && (*p != '\n'))) + ret = kstrtoull(p, 0, &tmp); + if (ret) return -EINVAL; reg->hr_start_block = tmp; @@ -1582,25 +1612,26 @@ static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg, return count; } -static ssize_t o2hb_region_blocks_read(struct o2hb_region *reg, - char *page) +static ssize_t o2hb_region_blocks_show(struct config_item *item, char *page) { - return sprintf(page, "%d\n", reg->hr_blocks); + return sprintf(page, "%d\n", to_o2hb_region(item)->hr_blocks); } -static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg, +static ssize_t o2hb_region_blocks_store(struct config_item *item, const char *page, size_t count) { + struct o2hb_region *reg = to_o2hb_region(item); unsigned long tmp; char *p = (char *)page; + int ret; - if (reg->hr_bdev) + if (reg->hr_bdev_file) return -EINVAL; - tmp = simple_strtoul(p, &p, 0); - if (!p || (*p && (*p != '\n'))) - return -EINVAL; + ret = kstrtoul(p, 0, &tmp); + if (ret) + return ret; if (tmp > O2NM_MAX_NODES || tmp == 0) return -ERANGE; @@ -1610,20 +1641,19 @@ static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg, return count; } -static ssize_t o2hb_region_dev_read(struct o2hb_region *reg, - char *page) +static ssize_t o2hb_region_dev_show(struct config_item *item, char *page) { unsigned int ret = 0; - if (reg->hr_bdev) - ret = sprintf(page, "%s\n", reg->hr_dev_name); + if (to_o2hb_region(item)->hr_bdev_file) + ret = sprintf(page, "%pg\n", reg_bdev(to_o2hb_region(item))); return ret; } static void o2hb_init_region_params(struct o2hb_region *reg) { - reg->hr_slots_per_page = PAGE_CACHE_SIZE >> reg->hr_block_bits; + reg->hr_slots_per_page = PAGE_SIZE >> reg->hr_block_bits; reg->hr_timeout_ms = O2HB_REGION_TIMEOUT_MS; mlog(ML_HEARTBEAT, "hr_start_block = %llu, hr_blocks = %u\n", @@ -1644,17 +1674,13 @@ static int o2hb_map_slot_data(struct o2hb_region *reg) struct o2hb_disk_slot *slot; reg->hr_tmp_block = kmalloc(reg->hr_block_bytes, GFP_KERNEL); - if (reg->hr_tmp_block == NULL) { - mlog_errno(-ENOMEM); + if (reg->hr_tmp_block == NULL) return -ENOMEM; - } reg->hr_slots = kcalloc(reg->hr_blocks, sizeof(struct o2hb_disk_slot), GFP_KERNEL); - if (reg->hr_slots == NULL) { - mlog_errno(-ENOMEM); + if (reg->hr_slots == NULL) return -ENOMEM; - } for(i = 0; i < reg->hr_blocks; i++) { slot = ®->hr_slots[i]; @@ -1670,17 +1696,13 @@ static int o2hb_map_slot_data(struct o2hb_region *reg) reg->hr_slot_data = kcalloc(reg->hr_num_pages, sizeof(struct page *), GFP_KERNEL); - if (!reg->hr_slot_data) { - mlog_errno(-ENOMEM); + if (!reg->hr_slot_data) return -ENOMEM; - } for(i = 0; i < reg->hr_num_pages; i++) { page = alloc_page(GFP_KERNEL); - if (!page) { - mlog_errno(-ENOMEM); + if (!page) return -ENOMEM; - } reg->hr_slot_data[i] = page; @@ -1711,11 +1733,9 @@ static int o2hb_populate_slot_data(struct o2hb_region *reg) struct o2hb_disk_slot *slot; struct o2hb_disk_heartbeat_block *hb_block; - ret = o2hb_read_slots(reg, reg->hr_blocks); - if (ret) { - mlog_errno(ret); + ret = o2hb_read_slots(reg, 0, reg->hr_blocks); + if (ret) goto out; - } /* We only want to get an idea of the values initially in each * slot, so we do no verification - o2hb_check_slot will @@ -1735,61 +1755,57 @@ out: return ret; } -/* this is acting as commit; we set up all of hr_bdev and hr_task or nothing */ -static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, +/* + * this is acting as commit; we set up all of hr_bdev_file and hr_task or + * nothing + */ +static ssize_t o2hb_region_dev_store(struct config_item *item, const char *page, size_t count) { + struct o2hb_region *reg = to_o2hb_region(item); struct task_struct *hb_task; long fd; int sectsize; char *p = (char *)page; - struct fd f; - struct inode *inode; ssize_t ret = -EINVAL; int live_threshold; - if (reg->hr_bdev) - goto out; + if (reg->hr_bdev_file) + return -EINVAL; /* We can't heartbeat without having had our node number * configured yet. */ if (o2nm_this_node() == O2NM_MAX_NODES) - goto out; + return -EINVAL; - fd = simple_strtol(p, &p, 0); - if (!p || (*p && (*p != '\n'))) - goto out; + ret = kstrtol(p, 0, &fd); + if (ret < 0) + return -EINVAL; if (fd < 0 || fd >= INT_MAX) - goto out; + return -EINVAL; - f = fdget(fd); - if (f.file == NULL) - goto out; + CLASS(fd, f)(fd); + if (fd_empty(f)) + return -EINVAL; if (reg->hr_blocks == 0 || reg->hr_start_block == 0 || reg->hr_block_bytes == 0) - goto out2; - - inode = igrab(f.file->f_mapping->host); - if (inode == NULL) - goto out2; + return -EINVAL; - if (!S_ISBLK(inode->i_mode)) - goto out3; + if (!S_ISBLK(fd_file(f)->f_mapping->host->i_mode)) + return -EINVAL; - reg->hr_bdev = I_BDEV(f.file->f_mapping->host); - ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL); - if (ret) { - reg->hr_bdev = NULL; - goto out3; + reg->hr_bdev_file = bdev_file_open_by_dev(fd_file(f)->f_mapping->host->i_rdev, + BLK_OPEN_WRITE | BLK_OPEN_READ, NULL, NULL); + if (IS_ERR(reg->hr_bdev_file)) { + ret = PTR_ERR(reg->hr_bdev_file); + reg->hr_bdev_file = NULL; + return ret; } - inode = NULL; - - bdevname(reg->hr_bdev, reg->hr_dev_name); - sectsize = bdev_logical_block_size(reg->hr_bdev); + sectsize = bdev_logical_block_size(reg_bdev(reg)); if (sectsize != reg->hr_block_bytes) { mlog(ML_ERROR, "blocksize %u incorrect for device, expected %d", @@ -1819,6 +1835,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, } INIT_DELAYED_WORK(®->hr_write_timeout_work, o2hb_write_timeout); + INIT_DELAYED_WORK(®->hr_nego_timeout_work, o2hb_nego_timeout); /* * A node is considered live after it has beat LIVE_THRESHOLD @@ -1831,14 +1848,14 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, live_threshold = O2HB_LIVE_THRESHOLD; if (o2hb_global_heartbeat_active()) { spin_lock(&o2hb_live_lock); - if (o2hb_pop_count(&o2hb_region_bitmap, O2NM_MAX_REGIONS) == 1) + if (bitmap_weight(o2hb_region_bitmap, O2NM_MAX_REGIONS) == 1) live_threshold <<= 1; spin_unlock(&o2hb_live_lock); } ++live_threshold; atomic_set(®->hr_steady_iterations, live_threshold); - /* unsteady_iterations is double the steady_iterations */ - atomic_set(®->hr_unsteady_iterations, (live_threshold << 1)); + /* unsteady_iterations is triple the steady_iterations */ + atomic_set(®->hr_unsteady_iterations, (live_threshold * 3)); hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s", reg->hr_item.ci_name); @@ -1853,7 +1870,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, spin_unlock(&o2hb_live_lock); ret = wait_event_interruptible(o2hb_steady_queue, - atomic_read(®->hr_steady_iterations) == 0); + atomic_read(®->hr_steady_iterations) == 0 || + reg->hr_node_deleted); if (ret) { atomic_set(®->hr_steady_iterations, 0); reg->hr_aborted_start = 1; @@ -1864,6 +1882,11 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, goto out3; } + if (reg->hr_node_deleted) { + ret = -EINVAL; + goto out3; + } + /* Ok, we were woken. Make sure it wasn't by drop_item() */ spin_lock(&o2hb_live_lock); hb_task = reg->hr_task; @@ -1877,26 +1900,20 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, ret = -EIO; if (hb_task && o2hb_global_heartbeat_active()) - printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%s)\n", - config_item_name(®->hr_item), reg->hr_dev_name); + printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%pg)\n", + config_item_name(®->hr_item), reg_bdev(reg)); out3: - iput(inode); -out2: - fdput(f); -out: if (ret < 0) { - if (reg->hr_bdev) { - blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE); - reg->hr_bdev = NULL; - } + fput(reg->hr_bdev_file); + reg->hr_bdev_file = NULL; } return ret; } -static ssize_t o2hb_region_pid_read(struct o2hb_region *reg, - char *page) +static ssize_t o2hb_region_pid_show(struct config_item *item, char *page) { + struct o2hb_region *reg = to_o2hb_region(item); pid_t pid = 0; spin_lock(&o2hb_live_lock); @@ -1910,95 +1927,26 @@ static ssize_t o2hb_region_pid_read(struct o2hb_region *reg, return sprintf(page, "%u\n", pid); } -struct o2hb_region_attribute { - struct configfs_attribute attr; - ssize_t (*show)(struct o2hb_region *, char *); - ssize_t (*store)(struct o2hb_region *, const char *, size_t); -}; - -static struct o2hb_region_attribute o2hb_region_attr_block_bytes = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "block_bytes", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2hb_region_block_bytes_read, - .store = o2hb_region_block_bytes_write, -}; - -static struct o2hb_region_attribute o2hb_region_attr_start_block = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "start_block", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2hb_region_start_block_read, - .store = o2hb_region_start_block_write, -}; - -static struct o2hb_region_attribute o2hb_region_attr_blocks = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "blocks", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2hb_region_blocks_read, - .store = o2hb_region_blocks_write, -}; - -static struct o2hb_region_attribute o2hb_region_attr_dev = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "dev", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2hb_region_dev_read, - .store = o2hb_region_dev_write, -}; - -static struct o2hb_region_attribute o2hb_region_attr_pid = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "pid", - .ca_mode = S_IRUGO | S_IRUSR }, - .show = o2hb_region_pid_read, -}; +CONFIGFS_ATTR(o2hb_region_, block_bytes); +CONFIGFS_ATTR(o2hb_region_, start_block); +CONFIGFS_ATTR(o2hb_region_, blocks); +CONFIGFS_ATTR(o2hb_region_, dev); +CONFIGFS_ATTR_RO(o2hb_region_, pid); static struct configfs_attribute *o2hb_region_attrs[] = { - &o2hb_region_attr_block_bytes.attr, - &o2hb_region_attr_start_block.attr, - &o2hb_region_attr_blocks.attr, - &o2hb_region_attr_dev.attr, - &o2hb_region_attr_pid.attr, + &o2hb_region_attr_block_bytes, + &o2hb_region_attr_start_block, + &o2hb_region_attr_blocks, + &o2hb_region_attr_dev, + &o2hb_region_attr_pid, NULL, }; -static ssize_t o2hb_region_show(struct config_item *item, - struct configfs_attribute *attr, - char *page) -{ - struct o2hb_region *reg = to_o2hb_region(item); - struct o2hb_region_attribute *o2hb_region_attr = - container_of(attr, struct o2hb_region_attribute, attr); - ssize_t ret = 0; - - if (o2hb_region_attr->show) - ret = o2hb_region_attr->show(reg, page); - return ret; -} - -static ssize_t o2hb_region_store(struct config_item *item, - struct configfs_attribute *attr, - const char *page, size_t count) -{ - struct o2hb_region *reg = to_o2hb_region(item); - struct o2hb_region_attribute *o2hb_region_attr = - container_of(attr, struct o2hb_region_attribute, attr); - ssize_t ret = -EINVAL; - - if (o2hb_region_attr->store) - ret = o2hb_region_attr->store(reg, page, count); - return ret; -} - static struct configfs_item_operations o2hb_region_item_ops = { .release = o2hb_region_release, - .show_attribute = o2hb_region_show, - .store_attribute = o2hb_region_store, }; -static struct config_item_type o2hb_region_type = { +static const struct config_item_type o2hb_region_type = { .ct_item_ops = &o2hb_region_item_ops, .ct_attrs = o2hb_region_attrs, .ct_owner = THIS_MODULE, @@ -2018,69 +1966,33 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group : NULL; } -static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) +static void o2hb_debug_region_init(struct o2hb_region *reg, + struct dentry *parent) { - int ret = -ENOMEM; + struct dentry *dir; - reg->hr_debug_dir = - debugfs_create_dir(config_item_name(®->hr_item), dir); - if (!reg->hr_debug_dir) { - mlog_errno(ret); - goto bail; - } + dir = debugfs_create_dir(config_item_name(®->hr_item), parent); + reg->hr_debug_dir = dir; - reg->hr_debug_livenodes = - o2hb_debug_create(O2HB_DEBUG_LIVENODES, - reg->hr_debug_dir, - &(reg->hr_db_livenodes), - sizeof(*(reg->hr_db_livenodes)), - O2HB_DB_TYPE_REGION_LIVENODES, - sizeof(reg->hr_live_node_bitmap), - O2NM_MAX_NODES, reg); - if (!reg->hr_debug_livenodes) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_LIVENODES, dir, &(reg->hr_db_livenodes), + sizeof(*(reg->hr_db_livenodes)), + O2HB_DB_TYPE_REGION_LIVENODES, + sizeof(reg->hr_live_node_bitmap), O2NM_MAX_NODES, + reg); - reg->hr_debug_regnum = - o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER, - reg->hr_debug_dir, - &(reg->hr_db_regnum), - sizeof(*(reg->hr_db_regnum)), - O2HB_DB_TYPE_REGION_NUMBER, - 0, O2NM_MAX_NODES, reg); - if (!reg->hr_debug_regnum) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER, dir, &(reg->hr_db_regnum), + sizeof(*(reg->hr_db_regnum)), + O2HB_DB_TYPE_REGION_NUMBER, 0, O2NM_MAX_NODES, reg); - reg->hr_debug_elapsed_time = - o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME, - reg->hr_debug_dir, - &(reg->hr_db_elapsed_time), - sizeof(*(reg->hr_db_elapsed_time)), - O2HB_DB_TYPE_REGION_ELAPSED_TIME, - 0, 0, reg); - if (!reg->hr_debug_elapsed_time) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME, dir, + &(reg->hr_db_elapsed_time), + sizeof(*(reg->hr_db_elapsed_time)), + O2HB_DB_TYPE_REGION_ELAPSED_TIME, 0, 0, reg); - reg->hr_debug_pinned = - o2hb_debug_create(O2HB_DEBUG_REGION_PINNED, - reg->hr_debug_dir, - &(reg->hr_db_pinned), - sizeof(*(reg->hr_db_pinned)), - O2HB_DB_TYPE_REGION_PINNED, - 0, 0, reg); - if (!reg->hr_debug_pinned) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_REGION_PINNED, dir, &(reg->hr_db_pinned), + sizeof(*(reg->hr_db_pinned)), + O2HB_DB_TYPE_REGION_PINNED, 0, 0, reg); - ret = 0; -bail: - return ret; } static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group, @@ -2115,13 +2027,39 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g config_item_init_type_name(®->hr_item, name, &o2hb_region_type); - ret = o2hb_debug_region_init(reg, o2hb_debug_dir); - if (ret) { - config_item_put(®->hr_item); - goto free; - } + /* this is the same way to generate msg key as dlm, for local heartbeat, + * name is also the same, so make initial crc value different to avoid + * message key conflict. + */ + reg->hr_key = crc32_le(reg->hr_region_num + O2NM_MAX_REGIONS, + name, strlen(name)); + INIT_LIST_HEAD(®->hr_handler_list); + ret = o2net_register_handler(O2HB_NEGO_TIMEOUT_MSG, reg->hr_key, + sizeof(struct o2hb_nego_msg), + o2hb_nego_timeout_handler, + reg, NULL, ®->hr_handler_list); + if (ret) + goto remove_item; + + ret = o2net_register_handler(O2HB_NEGO_APPROVE_MSG, reg->hr_key, + sizeof(struct o2hb_nego_msg), + o2hb_nego_approve_handler, + reg, NULL, ®->hr_handler_list); + if (ret) + goto unregister_handler; + + o2hb_debug_region_init(reg, o2hb_debug_dir); return ®->hr_item; + +unregister_handler: + o2net_unregister_handler_list(®->hr_handler_list); +remove_item: + spin_lock(&o2hb_live_lock); + list_del(®->hr_all_item); + if (o2hb_global_heartbeat_active()) + clear_bit(reg->hr_region_num, o2hb_region_bitmap); + spin_unlock(&o2hb_live_lock); free: kfree(reg); return ERR_PTR(ret); @@ -2152,10 +2090,10 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, quorum_region = 1; clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); spin_unlock(&o2hb_live_lock); - printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%s)\n", + printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%pg)\n", ((atomic_read(®->hr_steady_iterations) == 0) ? "stopped" : "start aborted"), config_item_name(item), - reg->hr_dev_name); + reg_bdev(reg)); } /* @@ -2182,7 +2120,7 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, if (!o2hb_dependent_users) goto unlock; - if (o2hb_pop_count(&o2hb_quorum_region_bitmap, + if (bitmap_weight(o2hb_quorum_region_bitmap, O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF) o2hb_region_pin(NULL); @@ -2190,56 +2128,22 @@ unlock: spin_unlock(&o2hb_live_lock); } -struct o2hb_heartbeat_group_attribute { - struct configfs_attribute attr; - ssize_t (*show)(struct o2hb_heartbeat_group *, char *); - ssize_t (*store)(struct o2hb_heartbeat_group *, const char *, size_t); -}; - -static ssize_t o2hb_heartbeat_group_show(struct config_item *item, - struct configfs_attribute *attr, - char *page) -{ - struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item)); - struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr = - container_of(attr, struct o2hb_heartbeat_group_attribute, attr); - ssize_t ret = 0; - - if (o2hb_heartbeat_group_attr->show) - ret = o2hb_heartbeat_group_attr->show(reg, page); - return ret; -} - -static ssize_t o2hb_heartbeat_group_store(struct config_item *item, - struct configfs_attribute *attr, - const char *page, size_t count) -{ - struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item)); - struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr = - container_of(attr, struct o2hb_heartbeat_group_attribute, attr); - ssize_t ret = -EINVAL; - - if (o2hb_heartbeat_group_attr->store) - ret = o2hb_heartbeat_group_attr->store(reg, page, count); - return ret; -} - -static ssize_t o2hb_heartbeat_group_threshold_show(struct o2hb_heartbeat_group *group, - char *page) +static ssize_t o2hb_heartbeat_group_dead_threshold_show(struct config_item *item, + char *page) { return sprintf(page, "%u\n", o2hb_dead_threshold); } -static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group *group, - const char *page, - size_t count) +static ssize_t o2hb_heartbeat_group_dead_threshold_store(struct config_item *item, + const char *page, size_t count) { unsigned long tmp; char *p = (char *)page; + int ret; - tmp = simple_strtoul(p, &p, 10); - if (!p || (*p && (*p != '\n'))) - return -EINVAL; + ret = kstrtoul(p, 10, &tmp); + if (ret) + return ret; /* this will validate ranges for us. */ o2hb_dead_threshold_set((unsigned int) tmp); @@ -2247,17 +2151,15 @@ static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group return count; } -static -ssize_t o2hb_heartbeat_group_mode_show(struct o2hb_heartbeat_group *group, - char *page) +static ssize_t o2hb_heartbeat_group_mode_show(struct config_item *item, + char *page) { return sprintf(page, "%s\n", o2hb_heartbeat_mode_desc[o2hb_heartbeat_mode]); } -static -ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group, - const char *page, size_t count) +static ssize_t o2hb_heartbeat_group_mode_store(struct config_item *item, + const char *page, size_t count) { unsigned int i; int ret; @@ -2268,7 +2170,7 @@ ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group, return -EINVAL; for (i = 0; i < O2HB_HEARTBEAT_NUM_MODES; ++i) { - if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len)) + if (strncasecmp(page, o2hb_heartbeat_mode_desc[i], len)) continue; ret = o2hb_global_heartbeat_mode_set(i); @@ -2282,41 +2184,22 @@ ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group, } -static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "dead_threshold", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2hb_heartbeat_group_threshold_show, - .store = o2hb_heartbeat_group_threshold_store, -}; - -static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_mode = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "mode", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2hb_heartbeat_group_mode_show, - .store = o2hb_heartbeat_group_mode_store, -}; +CONFIGFS_ATTR(o2hb_heartbeat_group_, dead_threshold); +CONFIGFS_ATTR(o2hb_heartbeat_group_, mode); static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = { - &o2hb_heartbeat_group_attr_threshold.attr, - &o2hb_heartbeat_group_attr_mode.attr, + &o2hb_heartbeat_group_attr_dead_threshold, + &o2hb_heartbeat_group_attr_mode, NULL, }; -static struct configfs_item_operations o2hb_heartbeat_group_item_ops = { - .show_attribute = o2hb_heartbeat_group_show, - .store_attribute = o2hb_heartbeat_group_store, -}; - static struct configfs_group_operations o2hb_heartbeat_group_group_ops = { .make_item = o2hb_heartbeat_group_make_item, .drop_item = o2hb_heartbeat_group_drop_item, }; -static struct config_item_type o2hb_heartbeat_group_type = { +static const struct config_item_type o2hb_heartbeat_group_type = { .ct_group_ops = &o2hb_heartbeat_group_group_ops, - .ct_item_ops = &o2hb_heartbeat_group_item_ops, .ct_attrs = o2hb_heartbeat_group_attrs, .ct_owner = THIS_MODULE, }; @@ -2482,7 +2365,7 @@ static int o2hb_region_inc_user(const char *region_uuid) if (o2hb_dependent_users > 1) goto unlock; - if (o2hb_pop_count(&o2hb_quorum_region_bitmap, + if (bitmap_weight(o2hb_quorum_region_bitmap, O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF) ret = o2hb_region_pin(NULL); @@ -2491,7 +2374,7 @@ unlock: return ret; } -void o2hb_region_dec_user(const char *region_uuid) +static void o2hb_region_dec_user(const char *region_uuid) { spin_lock(&o2hb_live_lock); @@ -2516,8 +2399,7 @@ unlock: int o2hb_register_callback(const char *region_uuid, struct o2hb_callback_func *hc) { - struct o2hb_callback_func *tmp; - struct list_head *iter; + struct o2hb_callback_func *f; struct o2hb_callback *hbcall; int ret; @@ -2540,10 +2422,9 @@ int o2hb_register_callback(const char *region_uuid, down_write(&o2hb_callback_sem); - list_for_each(iter, &hbcall->list) { - tmp = list_entry(iter, struct o2hb_callback_func, hc_item); - if (hc->hc_priority < tmp->hc_priority) { - list_add_tail(&hc->hc_item, iter); + list_for_each_entry(f, &hbcall->list, hc_item) { + if (hc->hc_priority < f->hc_priority) { + list_add_tail(&hc->hc_item, &f->hc_item); break; } } @@ -2582,11 +2463,13 @@ void o2hb_unregister_callback(const char *region_uuid, } EXPORT_SYMBOL_GPL(o2hb_unregister_callback); -int o2hb_check_node_heartbeating(u8 node_num) +int o2hb_check_node_heartbeating_no_sem(u8 node_num) { unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; - o2hb_fill_node_map(testing_map, sizeof(testing_map)); + spin_lock(&o2hb_live_lock); + o2hb_fill_node_map_from_callback(testing_map, O2NM_MAX_NODES); + spin_unlock(&o2hb_live_lock); if (!test_bit(node_num, testing_map)) { mlog(ML_HEARTBEAT, "node (%u) does not have heartbeating enabled.\n", @@ -2596,13 +2479,13 @@ int o2hb_check_node_heartbeating(u8 node_num) return 1; } -EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating); +EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_no_sem); int o2hb_check_node_heartbeating_from_callback(u8 node_num) { unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; - o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map)); + o2hb_fill_node_map_from_callback(testing_map, O2NM_MAX_NODES); if (!test_bit(node_num, testing_map)) { mlog(ML_HEARTBEAT, "node (%u) does not have heartbeating enabled.\n", @@ -2614,23 +2497,6 @@ int o2hb_check_node_heartbeating_from_callback(u8 node_num) } EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_from_callback); -/* Makes sure our local node is configured with a node number, and is - * heartbeating. */ -int o2hb_check_local_node_heartbeating(void) -{ - u8 node_num; - - /* if this node was set then we have networking */ - node_num = o2nm_this_node(); - if (node_num == O2NM_MAX_NODES) { - mlog(ML_HEARTBEAT, "this node has not been configured.\n"); - return 0; - } - - return o2hb_check_node_heartbeating(node_num); -} -EXPORT_SYMBOL_GPL(o2hb_check_local_node_heartbeating); - /* * this is just a hack until we get the plumbing which flips file systems * read only and drops the hb ref instead of killing the node dead. diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h index 00ad8e8fea51..8ef8c1b9eeb7 100644 --- a/fs/ocfs2/cluster/heartbeat.h +++ b/fs/ocfs2/cluster/heartbeat.h @@ -1,27 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * heartbeat.h * * Function prototypes * * Copyright (C) 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * */ #ifndef O2CLUSTER_HEARTBEAT_H @@ -76,12 +59,11 @@ int o2hb_register_callback(const char *region_uuid, void o2hb_unregister_callback(const char *region_uuid, struct o2hb_callback_func *hc); void o2hb_fill_node_map(unsigned long *map, - unsigned bytes); + unsigned int bits); void o2hb_exit(void); -int o2hb_init(void); -int o2hb_check_node_heartbeating(u8 node_num); +void o2hb_init(void); +int o2hb_check_node_heartbeating_no_sem(u8 node_num); int o2hb_check_node_heartbeating_from_callback(u8 node_num); -int o2hb_check_local_node_heartbeating(void); void o2hb_stop_all_regions(void); int o2hb_get_all_regions(char *region_uuids, u8 numregions); int o2hb_global_heartbeat_active(void); diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index 07ac24fd9252..563881ddbf00 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -1,22 +1,6 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * Copyright (C) 2004, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/module.h> @@ -24,7 +8,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/string.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "masklog.h" @@ -49,13 +33,13 @@ static ssize_t mlog_mask_show(u64 mask, char *buf) static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count) { - if (!strnicmp(buf, "allow", 5)) { + if (!strncasecmp(buf, "allow", 5)) { __mlog_set_u64(mask, mlog_and_bits); __mlog_clear_u64(mask, mlog_not_bits); - } else if (!strnicmp(buf, "deny", 4)) { + } else if (!strncasecmp(buf, "deny", 4)) { __mlog_set_u64(mask, mlog_not_bits); __mlog_clear_u64(mask, mlog_and_bits); - } else if (!strnicmp(buf, "off", 3)) { + } else if (!strncasecmp(buf, "off", 3)) { __mlog_clear_u64(mask, mlog_not_bits); __mlog_clear_u64(mask, mlog_and_bits); } else @@ -64,6 +48,40 @@ static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count) return count; } +void __mlog_printk(const u64 *mask, const char *func, int line, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + const char *level; + const char *prefix = ""; + + if (!__mlog_test_u64(*mask, mlog_and_bits) || + __mlog_test_u64(*mask, mlog_not_bits)) + return; + + if (*mask & ML_ERROR) { + level = KERN_ERR; + prefix = "ERROR: "; + } else if (*mask & ML_NOTICE) { + level = KERN_NOTICE; + } else { + level = KERN_INFO; + } + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk("%s(%s,%u,%u):%s:%d %s%pV", + level, current->comm, task_pid_nr(current), + raw_smp_processor_id(), func, line, prefix, &vaf); + + va_end(args); +} +EXPORT_SYMBOL_GPL(__mlog_printk); + struct mlog_attribute { struct attribute attr; u64 mask; @@ -102,7 +120,8 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = { define_mask(KTHREAD), }; -static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, }; +static struct attribute *mlog_default_attrs[MLOG_MAX_BITS] = {NULL, }; +ATTRIBUTE_GROUPS(mlog_default); static ssize_t mlog_show(struct kobject *obj, struct attribute *attr, char *buf) @@ -126,8 +145,8 @@ static const struct sysfs_ops mlog_attr_ops = { }; static struct kobj_type mlog_ktype = { - .default_attrs = mlog_attr_ptrs, - .sysfs_ops = &mlog_attr_ops, + .default_groups = mlog_default_groups, + .sysfs_ops = &mlog_attr_ops, }; static struct kset mlog_kset = { @@ -139,10 +158,10 @@ int mlog_sys_init(struct kset *o2cb_kset) int i = 0; while (mlog_attrs[i].attr.mode) { - mlog_attr_ptrs[i] = &mlog_attrs[i].attr; + mlog_default_attrs[i] = &mlog_attrs[i].attr; i++; } - mlog_attr_ptrs[i] = NULL; + mlog_default_attrs[i] = NULL; kobject_set_name(&mlog_kset.kobj, "logmask"); mlog_kset.kobj.kset = o2cb_kset; diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index baa2b9ef7eef..630bd5a3dd0d 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -1,22 +1,6 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * Copyright (C) 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #ifndef O2CLUSTER_MASKLOG_H @@ -45,7 +29,7 @@ * just calling printk() so that this can eventually make its way through * relayfs along with the debugging messages. Everything else gets KERN_DEBUG. * The inline tests and macro dance give GCC the opportunity to quite cleverly - * only emit the appropriage printk() when the caller passes in a constant + * only emit the appropriate printk() when the caller passes in a constant * mask, as is almost always the case. * * All this bitmask nonsense is managed from the files under @@ -162,46 +146,39 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits; #endif +__printf(4, 5) +void __mlog_printk(const u64 *m, const char *func, int line, + const char *fmt, ...); + /* - * smp_processor_id() "helpfully" screams when called outside preemptible - * regions in current kernels. sles doesn't have the variants that don't - * scream. just do this instead of trying to guess which we're building - * against.. *sigh*. + * Testing before the __mlog_printk call lets the compiler eliminate the + * call completely when (m & ML_ALLOWED_BITS) is 0. */ -#define __mlog_cpu_guess ({ \ - unsigned long _cpu = get_cpu(); \ - put_cpu(); \ - _cpu; \ -}) +#define mlog(mask, fmt, ...) \ +do { \ + u64 _m = MLOG_MASK_PREFIX | (mask); \ + if (_m & ML_ALLOWED_BITS) \ + __mlog_printk(&_m, __func__, __LINE__, fmt, \ + ##__VA_ARGS__); \ +} while (0) -/* In the following two macros, the whitespace after the ',' just - * before ##args is intentional. Otherwise, gcc 2.95 will eat the - * previous token if args expands to nothing. - */ -#define __mlog_printk(level, fmt, args...) \ - printk(level "(%s,%u,%lu):%s:%d " fmt, current->comm, \ - task_pid_nr(current), __mlog_cpu_guess, \ - __PRETTY_FUNCTION__, __LINE__ , ##args) - -#define mlog(mask, fmt, args...) do { \ - u64 __m = MLOG_MASK_PREFIX | (mask); \ - if ((__m & ML_ALLOWED_BITS) && \ - __mlog_test_u64(__m, mlog_and_bits) && \ - !__mlog_test_u64(__m, mlog_not_bits)) { \ - if (__m & ML_ERROR) \ - __mlog_printk(KERN_ERR, "ERROR: "fmt , ##args); \ - else if (__m & ML_NOTICE) \ - __mlog_printk(KERN_NOTICE, fmt , ##args); \ - else __mlog_printk(KERN_INFO, fmt , ##args); \ - } \ +#define mlog_ratelimited(mask, fmt, ...) \ +do { \ + static DEFINE_RATELIMIT_STATE(_rs, \ + DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + if (__ratelimit(&_rs)) \ + mlog(mask, fmt, ##__VA_ARGS__); \ } while (0) -#define mlog_errno(st) do { \ +#define mlog_errno(st) ({ \ int _st = (st); \ if (_st != -ERESTARTSYS && _st != -EINTR && \ - _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC) \ + _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC && \ + _st != -EDQUOT) \ mlog(ML_ERROR, "status = %lld\n", (long long)_st); \ -} while (0) + _st; \ +}) #define mlog_bug_on_msg(cond, fmt, args...) do { \ if (cond) { \ diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index 73ba81928bce..bc27301eab6d 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -1,27 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * netdebug.c * * debug functionality for o2net * * Copyright (C) 2005, 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * */ #ifdef CONFIG_DEBUG_FS @@ -53,10 +36,6 @@ #define SHOW_SOCK_STATS 1 static struct dentry *o2net_dentry; -static struct dentry *sc_dentry; -static struct dentry *nst_dentry; -static struct dentry *stats_dentry; -static struct dentry *nodes_dentry; static DEFINE_SPINLOCK(o2net_debug_lock); @@ -65,17 +44,17 @@ static LIST_HEAD(send_tracking); void o2net_debug_add_nst(struct o2net_send_tracking *nst) { - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); list_add(&nst->st_net_debug_item, &send_tracking); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); } void o2net_debug_del_nst(struct o2net_send_tracking *nst) { - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); if (!list_empty(&nst->st_net_debug_item)) list_del_init(&nst->st_net_debug_item); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); } static struct o2net_send_tracking @@ -105,9 +84,9 @@ static void *nst_seq_start(struct seq_file *seq, loff_t *pos) { struct o2net_send_tracking *nst, *dummy_nst = seq->private; - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); nst = next_nst(dummy_nst); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); return nst; } @@ -116,13 +95,13 @@ static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct o2net_send_tracking *nst, *dummy_nst = seq->private; - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); nst = next_nst(dummy_nst); list_del_init(&dummy_nst->st_net_debug_item); if (nst) list_add(&dummy_nst->st_net_debug_item, &nst->st_net_debug_item); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); return nst; /* unused, just needs to be null when done */ } @@ -133,7 +112,7 @@ static int nst_seq_show(struct seq_file *seq, void *v) ktime_t now; s64 sock, send, status; - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); nst = next_nst(dummy_nst); if (!nst) goto out; @@ -166,7 +145,7 @@ static int nst_seq_show(struct seq_file *seq, void *v) (long long)status); out: - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); return 0; } @@ -185,29 +164,13 @@ static const struct seq_operations nst_seq_ops = { static int nst_fop_open(struct inode *inode, struct file *file) { struct o2net_send_tracking *dummy_nst; - struct seq_file *seq; - int ret; - - dummy_nst = kmalloc(sizeof(struct o2net_send_tracking), GFP_KERNEL); - if (dummy_nst == NULL) { - ret = -ENOMEM; - goto out; - } - dummy_nst->st_task = NULL; - - ret = seq_open(file, &nst_seq_ops); - if (ret) - goto out; - seq = file->private_data; - seq->private = dummy_nst; + dummy_nst = __seq_open_private(file, &nst_seq_ops, sizeof(*dummy_nst)); + if (!dummy_nst) + return -ENOMEM; o2net_debug_add_nst(dummy_nst); - dummy_nst = NULL; - -out: - kfree(dummy_nst); - return ret; + return 0; } static int nst_fop_release(struct inode *inode, struct file *file) @@ -228,16 +191,16 @@ static const struct file_operations nst_seq_fops = { void o2net_debug_add_sc(struct o2net_sock_container *sc) { - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); list_add(&sc->sc_net_debug_item, &sock_containers); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); } void o2net_debug_del_sc(struct o2net_sock_container *sc) { - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); list_del_init(&sc->sc_net_debug_item); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); } struct o2net_sock_debug { @@ -273,9 +236,9 @@ static void *sc_seq_start(struct seq_file *seq, loff_t *pos) struct o2net_sock_debug *sd = seq->private; struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock; - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); sc = next_sc(dummy_sc); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); return sc; } @@ -285,12 +248,12 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos) struct o2net_sock_debug *sd = seq->private; struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock; - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); sc = next_sc(dummy_sc); list_del_init(&dummy_sc->sc_net_debug_item); if (sc) list_add(&dummy_sc->sc_net_debug_item, &sc->sc_net_debug_item); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); return sc; /* unused, just needs to be null when done */ } @@ -365,7 +328,7 @@ static void sc_show_sock_container(struct seq_file *seq, " func key: 0x%08x\n" " func type: %u\n", sc, - atomic_read(&sc->sc_kref.refcount), + kref_read(&sc->sc_kref), &saddr, inet ? ntohs(sport) : 0, &daddr, inet ? ntohs(dport) : 0, sc->sc_node->nd_name, @@ -386,7 +349,7 @@ static int sc_seq_show(struct seq_file *seq, void *v) struct o2net_sock_debug *sd = seq->private; struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock; - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); sc = next_sc(dummy_sc); if (sc) { @@ -396,7 +359,7 @@ static int sc_seq_show(struct seq_file *seq, void *v) sc_show_sock_stats(seq, sc); } - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); return 0; } @@ -412,33 +375,27 @@ static const struct seq_operations sc_seq_ops = { .show = sc_seq_show, }; -static int sc_common_open(struct file *file, struct o2net_sock_debug *sd) +static int sc_common_open(struct file *file, int ctxt) { + struct o2net_sock_debug *sd; struct o2net_sock_container *dummy_sc; - struct seq_file *seq; - int ret; - dummy_sc = kmalloc(sizeof(struct o2net_sock_container), GFP_KERNEL); - if (dummy_sc == NULL) { - ret = -ENOMEM; - goto out; - } - dummy_sc->sc_page = NULL; + dummy_sc = kzalloc(sizeof(*dummy_sc), GFP_KERNEL); + if (!dummy_sc) + return -ENOMEM; - ret = seq_open(file, &sc_seq_ops); - if (ret) - goto out; + sd = __seq_open_private(file, &sc_seq_ops, sizeof(*sd)); + if (!sd) { + kfree(dummy_sc); + return -ENOMEM; + } - seq = file->private_data; - seq->private = sd; + sd->dbg_ctxt = ctxt; sd->dbg_sock = dummy_sc; - o2net_debug_add_sc(dummy_sc); - dummy_sc = NULL; + o2net_debug_add_sc(dummy_sc); -out: - kfree(dummy_sc); - return ret; + return 0; } static int sc_fop_release(struct inode *inode, struct file *file) @@ -448,21 +405,13 @@ static int sc_fop_release(struct inode *inode, struct file *file) struct o2net_sock_container *dummy_sc = sd->dbg_sock; o2net_debug_del_sc(dummy_sc); + kfree(dummy_sc); return seq_release_private(inode, file); } static int stats_fop_open(struct inode *inode, struct file *file) { - struct o2net_sock_debug *sd; - - sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL); - if (sd == NULL) - return -ENOMEM; - - sd->dbg_ctxt = SHOW_SOCK_STATS; - sd->dbg_sock = NULL; - - return sc_common_open(file, sd); + return sc_common_open(file, SHOW_SOCK_STATS); } static const struct file_operations stats_seq_fops = { @@ -474,16 +423,7 @@ static const struct file_operations stats_seq_fops = { static int sc_fop_open(struct inode *inode, struct file *file) { - struct o2net_sock_debug *sd; - - sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL); - if (sd == NULL) - return -ENOMEM; - - sd->dbg_ctxt = SHOW_SOCK_CONTAINERS; - sd->dbg_sock = NULL; - - return sc_common_open(file, sd); + return sc_common_open(file, SHOW_SOCK_CONTAINERS); } static const struct file_operations sc_seq_fops = { @@ -498,11 +438,11 @@ static int o2net_fill_bitmap(char *buf, int len) unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; int i = -1, out = 0; - o2net_fill_node_map(map, sizeof(map)); + o2net_fill_node_map(map, O2NM_MAX_NODES); while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) - out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i); - out += snprintf(buf + out, PAGE_SIZE - out, "\n"); + out += scnprintf(buf + out, PAGE_SIZE - out, "%d ", i); + out += scnprintf(buf + out, PAGE_SIZE - out, "\n"); return out; } @@ -544,36 +484,23 @@ static const struct file_operations nodes_fops = { void o2net_debugfs_exit(void) { - debugfs_remove(nodes_dentry); - debugfs_remove(stats_dentry); - debugfs_remove(sc_dentry); - debugfs_remove(nst_dentry); - debugfs_remove(o2net_dentry); + debugfs_remove_recursive(o2net_dentry); } -int o2net_debugfs_init(void) +void o2net_debugfs_init(void) { umode_t mode = S_IFREG|S_IRUSR; o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL); - if (o2net_dentry) - nst_dentry = debugfs_create_file(NST_DEBUG_NAME, mode, - o2net_dentry, NULL, &nst_seq_fops); - if (nst_dentry) - sc_dentry = debugfs_create_file(SC_DEBUG_NAME, mode, - o2net_dentry, NULL, &sc_seq_fops); - if (sc_dentry) - stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, mode, - o2net_dentry, NULL, &stats_seq_fops); - if (stats_dentry) - nodes_dentry = debugfs_create_file(NODES_DEBUG_NAME, mode, - o2net_dentry, NULL, &nodes_fops); - if (nodes_dentry) - return 0; - - o2net_debugfs_exit(); - mlog_errno(-ENOMEM); - return -ENOMEM; + + debugfs_create_file(NST_DEBUG_NAME, mode, o2net_dentry, NULL, + &nst_seq_fops); + debugfs_create_file(SC_DEBUG_NAME, mode, o2net_dentry, NULL, + &sc_seq_fops); + debugfs_create_file(STATS_DEBUG_NAME, mode, o2net_dentry, NULL, + &stats_seq_fops); + debugfs_create_file(NODES_DEBUG_NAME, mode, o2net_dentry, NULL, + &nodes_fops); } #endif /* CONFIG_DEBUG_FS */ diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index bb240647ca5f..2f61d39e4e50 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -1,22 +1,6 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * Copyright (C) 2004, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/slab.h> @@ -29,18 +13,20 @@ #include "heartbeat.h" #include "masklog.h" #include "sys.h" -#include "ver.h" /* for now we operate under the assertion that there can be only one * cluster active at a time. Changing this will require trickling * cluster references throughout where nodes are looked up */ struct o2nm_cluster *o2nm_single_cluster = NULL; -char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = { - "reset", /* O2NM_FENCE_RESET */ - "panic", /* O2NM_FENCE_PANIC */ +static const char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = { + "reset", /* O2NM_FENCE_RESET */ + "panic", /* O2NM_FENCE_PANIC */ }; +static inline void o2nm_lock_subsystem(void); +static inline void o2nm_unlock_subsystem(void); + struct o2nm_node *o2nm_get_node_by_num(u8 node_num) { struct o2nm_node *node = NULL; @@ -68,7 +54,7 @@ int o2nm_configured_node_map(unsigned long *map, unsigned bytes) return -EINVAL; read_lock(&cluster->cl_nodes_lock); - memcpy(map, cluster->cl_nodes_bitmap, sizeof(cluster->cl_nodes_bitmap)); + bitmap_copy(map, cluster->cl_nodes_bitmap, O2NM_MAX_NODES); read_unlock(&cluster->cl_nodes_lock); return 0; @@ -173,31 +159,35 @@ static void o2nm_node_release(struct config_item *item) kfree(node); } -static ssize_t o2nm_node_num_read(struct o2nm_node *node, char *page) +static ssize_t o2nm_node_num_show(struct config_item *item, char *page) { - return sprintf(page, "%d\n", node->nd_num); + return sprintf(page, "%d\n", to_o2nm_node(item)->nd_num); } static struct o2nm_cluster *to_o2nm_cluster_from_node(struct o2nm_node *node) { /* through the first node_set .parent * mycluster/nodes/mynode == o2nm_cluster->o2nm_node_group->o2nm_node */ - return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent); + if (node->nd_item.ci_parent) + return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent); + else + return NULL; } enum { O2NM_NODE_ATTR_NUM = 0, O2NM_NODE_ATTR_PORT, O2NM_NODE_ATTR_ADDRESS, - O2NM_NODE_ATTR_LOCAL, }; -static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page, +static ssize_t o2nm_node_num_store(struct config_item *item, const char *page, size_t count) { - struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + struct o2nm_node *node = to_o2nm_node(item); + struct o2nm_cluster *cluster; unsigned long tmp; char *p = (char *)page; + int ret = 0; tmp = simple_strtoul(p, &p, 0); if (!p || (*p && (*p != '\n'))) @@ -214,28 +204,41 @@ static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page, !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) return -EINVAL; /* XXX */ + o2nm_lock_subsystem(); + cluster = to_o2nm_cluster_from_node(node); + if (!cluster) { + o2nm_unlock_subsystem(); + return -EINVAL; + } + write_lock(&cluster->cl_nodes_lock); if (cluster->cl_nodes[tmp]) - p = NULL; + ret = -EEXIST; + else if (test_and_set_bit(O2NM_NODE_ATTR_NUM, + &node->nd_set_attributes)) + ret = -EBUSY; else { cluster->cl_nodes[tmp] = node; node->nd_num = tmp; set_bit(tmp, cluster->cl_nodes_bitmap); } write_unlock(&cluster->cl_nodes_lock); - if (p == NULL) - return -EEXIST; + o2nm_unlock_subsystem(); + + if (ret) + return ret; return count; } -static ssize_t o2nm_node_ipv4_port_read(struct o2nm_node *node, char *page) +static ssize_t o2nm_node_ipv4_port_show(struct config_item *item, char *page) { - return sprintf(page, "%u\n", ntohs(node->nd_ipv4_port)); + return sprintf(page, "%u\n", ntohs(to_o2nm_node(item)->nd_ipv4_port)); } -static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node, +static ssize_t o2nm_node_ipv4_port_store(struct config_item *item, const char *page, size_t count) { + struct o2nm_node *node = to_o2nm_node(item); unsigned long tmp; char *p = (char *)page; @@ -248,21 +251,24 @@ static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node, if (tmp >= (u16)-1) return -ERANGE; + if (test_and_set_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) + return -EBUSY; node->nd_ipv4_port = htons(tmp); return count; } -static ssize_t o2nm_node_ipv4_address_read(struct o2nm_node *node, char *page) +static ssize_t o2nm_node_ipv4_address_show(struct config_item *item, char *page) { - return sprintf(page, "%pI4\n", &node->nd_ipv4_address); + return sprintf(page, "%pI4\n", &to_o2nm_node(item)->nd_ipv4_address); } -static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node, +static ssize_t o2nm_node_ipv4_address_store(struct config_item *item, const char *page, size_t count) { - struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + struct o2nm_node *node = to_o2nm_node(item); + struct o2nm_cluster *cluster; int ret, i; struct rb_node **p, *parent; unsigned int octets[4]; @@ -279,15 +285,27 @@ static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node, be32_add_cpu(&ipv4_addr, octets[i] << (i * 8)); } + o2nm_lock_subsystem(); + cluster = to_o2nm_cluster_from_node(node); + if (!cluster) { + o2nm_unlock_subsystem(); + return -EINVAL; + } + ret = 0; write_lock(&cluster->cl_nodes_lock); if (o2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent)) ret = -EEXIST; + else if (test_and_set_bit(O2NM_NODE_ATTR_ADDRESS, + &node->nd_set_attributes)) + ret = -EBUSY; else { rb_link_node(&node->nd_ip_node, parent, p); rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree); } write_unlock(&cluster->cl_nodes_lock); + o2nm_unlock_subsystem(); + if (ret) return ret; @@ -296,15 +314,16 @@ static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node, return count; } -static ssize_t o2nm_node_local_read(struct o2nm_node *node, char *page) +static ssize_t o2nm_node_local_show(struct config_item *item, char *page) { - return sprintf(page, "%d\n", node->nd_local); + return sprintf(page, "%d\n", to_o2nm_node(item)->nd_local); } -static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page, +static ssize_t o2nm_node_local_store(struct config_item *item, const char *page, size_t count) { - struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + struct o2nm_node *node = to_o2nm_node(item); + struct o2nm_cluster *cluster; unsigned long tmp; char *p = (char *)page; ssize_t ret; @@ -322,17 +341,26 @@ static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page, !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) return -EINVAL; /* XXX */ + o2nm_lock_subsystem(); + cluster = to_o2nm_cluster_from_node(node); + if (!cluster) { + ret = -EINVAL; + goto out; + } + /* the only failure case is trying to set a new local node * when a different one is already set */ if (tmp && tmp == cluster->cl_has_local && - cluster->cl_local_node != node->nd_num) - return -EBUSY; + cluster->cl_local_node != node->nd_num) { + ret = -EBUSY; + goto out; + } /* bring up the rx thread if we're setting the new local node. */ if (tmp && !cluster->cl_has_local) { ret = o2net_start_listening(node); if (ret) - return ret; + goto out; } if (!tmp && cluster->cl_has_local && @@ -347,114 +375,31 @@ static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page, cluster->cl_local_node = node->nd_num; } - return count; -} - -struct o2nm_node_attribute { - struct configfs_attribute attr; - ssize_t (*show)(struct o2nm_node *, char *); - ssize_t (*store)(struct o2nm_node *, const char *, size_t); -}; + ret = count; -static struct o2nm_node_attribute o2nm_node_attr_num = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "num", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2nm_node_num_read, - .store = o2nm_node_num_write, -}; - -static struct o2nm_node_attribute o2nm_node_attr_ipv4_port = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "ipv4_port", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2nm_node_ipv4_port_read, - .store = o2nm_node_ipv4_port_write, -}; - -static struct o2nm_node_attribute o2nm_node_attr_ipv4_address = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "ipv4_address", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2nm_node_ipv4_address_read, - .store = o2nm_node_ipv4_address_write, -}; +out: + o2nm_unlock_subsystem(); + return ret; +} -static struct o2nm_node_attribute o2nm_node_attr_local = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "local", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2nm_node_local_read, - .store = o2nm_node_local_write, -}; +CONFIGFS_ATTR(o2nm_node_, num); +CONFIGFS_ATTR(o2nm_node_, ipv4_port); +CONFIGFS_ATTR(o2nm_node_, ipv4_address); +CONFIGFS_ATTR(o2nm_node_, local); static struct configfs_attribute *o2nm_node_attrs[] = { - [O2NM_NODE_ATTR_NUM] = &o2nm_node_attr_num.attr, - [O2NM_NODE_ATTR_PORT] = &o2nm_node_attr_ipv4_port.attr, - [O2NM_NODE_ATTR_ADDRESS] = &o2nm_node_attr_ipv4_address.attr, - [O2NM_NODE_ATTR_LOCAL] = &o2nm_node_attr_local.attr, + &o2nm_node_attr_num, + &o2nm_node_attr_ipv4_port, + &o2nm_node_attr_ipv4_address, + &o2nm_node_attr_local, NULL, }; -static int o2nm_attr_index(struct configfs_attribute *attr) -{ - int i; - for (i = 0; i < ARRAY_SIZE(o2nm_node_attrs); i++) { - if (attr == o2nm_node_attrs[i]) - return i; - } - BUG(); - return 0; -} - -static ssize_t o2nm_node_show(struct config_item *item, - struct configfs_attribute *attr, - char *page) -{ - struct o2nm_node *node = to_o2nm_node(item); - struct o2nm_node_attribute *o2nm_node_attr = - container_of(attr, struct o2nm_node_attribute, attr); - ssize_t ret = 0; - - if (o2nm_node_attr->show) - ret = o2nm_node_attr->show(node, page); - return ret; -} - -static ssize_t o2nm_node_store(struct config_item *item, - struct configfs_attribute *attr, - const char *page, size_t count) -{ - struct o2nm_node *node = to_o2nm_node(item); - struct o2nm_node_attribute *o2nm_node_attr = - container_of(attr, struct o2nm_node_attribute, attr); - ssize_t ret; - int attr_index = o2nm_attr_index(attr); - - if (o2nm_node_attr->store == NULL) { - ret = -EINVAL; - goto out; - } - - if (test_bit(attr_index, &node->nd_set_attributes)) - return -EBUSY; - - ret = o2nm_node_attr->store(node, page, count); - if (ret < count) - goto out; - - set_bit(attr_index, &node->nd_set_attributes); -out: - return ret; -} - static struct configfs_item_operations o2nm_node_item_ops = { .release = o2nm_node_release, - .show_attribute = o2nm_node_show, - .store_attribute = o2nm_node_store, }; -static struct config_item_type o2nm_node_type = { +static const struct config_item_type o2nm_node_type = { .ct_item_ops = &o2nm_node_item_ops, .ct_attrs = o2nm_node_attrs, .ct_owner = THIS_MODULE, @@ -476,12 +421,6 @@ static struct o2nm_node_group *to_o2nm_node_group(struct config_group *group) } #endif -struct o2nm_cluster_attribute { - struct configfs_attribute attr; - ssize_t (*show)(struct o2nm_cluster *, char *); - ssize_t (*store)(struct o2nm_cluster *, const char *, size_t); -}; - static ssize_t o2nm_cluster_attr_write(const char *page, ssize_t count, unsigned int *val) { @@ -502,15 +441,16 @@ static ssize_t o2nm_cluster_attr_write(const char *page, ssize_t count, return count; } -static ssize_t o2nm_cluster_attr_idle_timeout_ms_read( - struct o2nm_cluster *cluster, char *page) +static ssize_t o2nm_cluster_idle_timeout_ms_show(struct config_item *item, + char *page) { - return sprintf(page, "%u\n", cluster->cl_idle_timeout_ms); + return sprintf(page, "%u\n", to_o2nm_cluster(item)->cl_idle_timeout_ms); } -static ssize_t o2nm_cluster_attr_idle_timeout_ms_write( - struct o2nm_cluster *cluster, const char *page, size_t count) +static ssize_t o2nm_cluster_idle_timeout_ms_store(struct config_item *item, + const char *page, size_t count) { + struct o2nm_cluster *cluster = to_o2nm_cluster(item); ssize_t ret; unsigned int val; @@ -537,15 +477,17 @@ static ssize_t o2nm_cluster_attr_idle_timeout_ms_write( return ret; } -static ssize_t o2nm_cluster_attr_keepalive_delay_ms_read( - struct o2nm_cluster *cluster, char *page) +static ssize_t o2nm_cluster_keepalive_delay_ms_show( + struct config_item *item, char *page) { - return sprintf(page, "%u\n", cluster->cl_keepalive_delay_ms); + return sprintf(page, "%u\n", + to_o2nm_cluster(item)->cl_keepalive_delay_ms); } -static ssize_t o2nm_cluster_attr_keepalive_delay_ms_write( - struct o2nm_cluster *cluster, const char *page, size_t count) +static ssize_t o2nm_cluster_keepalive_delay_ms_store( + struct config_item *item, const char *page, size_t count) { + struct o2nm_cluster *cluster = to_o2nm_cluster(item); ssize_t ret; unsigned int val; @@ -572,22 +514,24 @@ static ssize_t o2nm_cluster_attr_keepalive_delay_ms_write( return ret; } -static ssize_t o2nm_cluster_attr_reconnect_delay_ms_read( - struct o2nm_cluster *cluster, char *page) +static ssize_t o2nm_cluster_reconnect_delay_ms_show( + struct config_item *item, char *page) { - return sprintf(page, "%u\n", cluster->cl_reconnect_delay_ms); + return sprintf(page, "%u\n", + to_o2nm_cluster(item)->cl_reconnect_delay_ms); } -static ssize_t o2nm_cluster_attr_reconnect_delay_ms_write( - struct o2nm_cluster *cluster, const char *page, size_t count) +static ssize_t o2nm_cluster_reconnect_delay_ms_store( + struct config_item *item, const char *page, size_t count) { return o2nm_cluster_attr_write(page, count, - &cluster->cl_reconnect_delay_ms); + &to_o2nm_cluster(item)->cl_reconnect_delay_ms); } -static ssize_t o2nm_cluster_attr_fence_method_read( - struct o2nm_cluster *cluster, char *page) +static ssize_t o2nm_cluster_fence_method_show( + struct config_item *item, char *page) { + struct o2nm_cluster *cluster = to_o2nm_cluster(item); ssize_t ret = 0; if (cluster) @@ -596,8 +540,8 @@ static ssize_t o2nm_cluster_attr_fence_method_read( return ret; } -static ssize_t o2nm_cluster_attr_fence_method_write( - struct o2nm_cluster *cluster, const char *page, size_t count) +static ssize_t o2nm_cluster_fence_method_store( + struct config_item *item, const char *page, size_t count) { unsigned int i; @@ -609,10 +553,10 @@ static ssize_t o2nm_cluster_attr_fence_method_write( continue; if (strncasecmp(page, o2nm_fence_method_desc[i], count - 1)) continue; - if (cluster->cl_fence_method != i) { + if (to_o2nm_cluster(item)->cl_fence_method != i) { printk(KERN_INFO "ocfs2: Changing fence method to %s\n", o2nm_fence_method_desc[i]); - cluster->cl_fence_method = i; + to_o2nm_cluster(item)->cl_fence_method = i; } return count; } @@ -621,79 +565,18 @@ bail: return -EINVAL; } -static struct o2nm_cluster_attribute o2nm_cluster_attr_idle_timeout_ms = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "idle_timeout_ms", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2nm_cluster_attr_idle_timeout_ms_read, - .store = o2nm_cluster_attr_idle_timeout_ms_write, -}; - -static struct o2nm_cluster_attribute o2nm_cluster_attr_keepalive_delay_ms = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "keepalive_delay_ms", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2nm_cluster_attr_keepalive_delay_ms_read, - .store = o2nm_cluster_attr_keepalive_delay_ms_write, -}; - -static struct o2nm_cluster_attribute o2nm_cluster_attr_reconnect_delay_ms = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "reconnect_delay_ms", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2nm_cluster_attr_reconnect_delay_ms_read, - .store = o2nm_cluster_attr_reconnect_delay_ms_write, -}; - -static struct o2nm_cluster_attribute o2nm_cluster_attr_fence_method = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "fence_method", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2nm_cluster_attr_fence_method_read, - .store = o2nm_cluster_attr_fence_method_write, -}; +CONFIGFS_ATTR(o2nm_cluster_, idle_timeout_ms); +CONFIGFS_ATTR(o2nm_cluster_, keepalive_delay_ms); +CONFIGFS_ATTR(o2nm_cluster_, reconnect_delay_ms); +CONFIGFS_ATTR(o2nm_cluster_, fence_method); static struct configfs_attribute *o2nm_cluster_attrs[] = { - &o2nm_cluster_attr_idle_timeout_ms.attr, - &o2nm_cluster_attr_keepalive_delay_ms.attr, - &o2nm_cluster_attr_reconnect_delay_ms.attr, - &o2nm_cluster_attr_fence_method.attr, + &o2nm_cluster_attr_idle_timeout_ms, + &o2nm_cluster_attr_keepalive_delay_ms, + &o2nm_cluster_attr_reconnect_delay_ms, + &o2nm_cluster_attr_fence_method, NULL, }; -static ssize_t o2nm_cluster_show(struct config_item *item, - struct configfs_attribute *attr, - char *page) -{ - struct o2nm_cluster *cluster = to_o2nm_cluster(item); - struct o2nm_cluster_attribute *o2nm_cluster_attr = - container_of(attr, struct o2nm_cluster_attribute, attr); - ssize_t ret = 0; - - if (o2nm_cluster_attr->show) - ret = o2nm_cluster_attr->show(cluster, page); - return ret; -} - -static ssize_t o2nm_cluster_store(struct config_item *item, - struct configfs_attribute *attr, - const char *page, size_t count) -{ - struct o2nm_cluster *cluster = to_o2nm_cluster(item); - struct o2nm_cluster_attribute *o2nm_cluster_attr = - container_of(attr, struct o2nm_cluster_attribute, attr); - ssize_t ret; - - if (o2nm_cluster_attr->store == NULL) { - ret = -EINVAL; - goto out; - } - - ret = o2nm_cluster_attr->store(cluster, page, count); - if (ret < count) - goto out; -out: - return ret; -} static struct config_item *o2nm_node_group_make_item(struct config_group *group, const char *name) @@ -722,13 +605,15 @@ static void o2nm_node_group_drop_item(struct config_group *group, struct o2nm_node *node = to_o2nm_node(item); struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent); - o2net_disconnect_node(node); + if (cluster->cl_nodes[node->nd_num] == node) { + o2net_disconnect_node(node); - if (cluster->cl_has_local && - (cluster->cl_local_node == node->nd_num)) { - cluster->cl_has_local = 0; - cluster->cl_local_node = O2NM_INVALID_NODE_NUM; - o2net_stop_listening(node); + if (cluster->cl_has_local && + (cluster->cl_local_node == node->nd_num)) { + cluster->cl_has_local = 0; + cluster->cl_local_node = O2NM_INVALID_NODE_NUM; + o2net_stop_listening(node); + } } /* XXX call into net to stop this node from trading messages */ @@ -757,7 +642,7 @@ static struct configfs_group_operations o2nm_node_group_group_ops = { .drop_item = o2nm_node_group_drop_item, }; -static struct config_item_type o2nm_node_group_type = { +static const struct config_item_type o2nm_node_group_type = { .ct_group_ops = &o2nm_node_group_group_ops, .ct_owner = THIS_MODULE, }; @@ -768,17 +653,14 @@ static void o2nm_cluster_release(struct config_item *item) { struct o2nm_cluster *cluster = to_o2nm_cluster(item); - kfree(cluster->cl_group.default_groups); kfree(cluster); } static struct configfs_item_operations o2nm_cluster_item_ops = { .release = o2nm_cluster_release, - .show_attribute = o2nm_cluster_show, - .store_attribute = o2nm_cluster_store, }; -static struct config_item_type o2nm_cluster_type = { +static const struct config_item_type o2nm_cluster_type = { .ct_item_ops = &o2nm_cluster_item_ops, .ct_attrs = o2nm_cluster_attrs, .ct_owner = THIS_MODULE, @@ -806,29 +688,26 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g struct o2nm_cluster *cluster = NULL; struct o2nm_node_group *ns = NULL; struct config_group *o2hb_group = NULL, *ret = NULL; - void *defs = NULL; - /* this runs under the parent dir's i_mutex; there can be only + /* this runs under the parent dir's i_rwsem; there can be only * one caller in here at a time */ if (o2nm_single_cluster) return ERR_PTR(-ENOSPC); cluster = kzalloc(sizeof(struct o2nm_cluster), GFP_KERNEL); ns = kzalloc(sizeof(struct o2nm_node_group), GFP_KERNEL); - defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL); o2hb_group = o2hb_alloc_hb_set(); - if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL) + if (cluster == NULL || ns == NULL || o2hb_group == NULL) goto out; config_group_init_type_name(&cluster->cl_group, name, &o2nm_cluster_type); + configfs_add_default_group(&ns->ns_group, &cluster->cl_group); + config_group_init_type_name(&ns->ns_group, "node", &o2nm_node_group_type); + configfs_add_default_group(o2hb_group, &cluster->cl_group); - cluster->cl_group.default_groups = defs; - cluster->cl_group.default_groups[0] = &ns->ns_group; - cluster->cl_group.default_groups[1] = o2hb_group; - cluster->cl_group.default_groups[2] = NULL; rwlock_init(&cluster->cl_nodes_lock); cluster->cl_node_ip_tree = RB_ROOT; cluster->cl_reconnect_delay_ms = O2NET_RECONNECT_DELAY_MS_DEFAULT; @@ -844,7 +723,6 @@ out: kfree(cluster); kfree(ns); o2hb_free_hb_set(o2hb_group); - kfree(defs); ret = ERR_PTR(-ENOMEM); } @@ -854,18 +732,11 @@ out: static void o2nm_cluster_group_drop_item(struct config_group *group, struct config_item *item) { struct o2nm_cluster *cluster = to_o2nm_cluster(item); - int i; - struct config_item *killme; BUG_ON(o2nm_single_cluster != cluster); o2nm_single_cluster = NULL; - for (i = 0; cluster->cl_group.default_groups[i]; i++) { - killme = &cluster->cl_group.default_groups[i]->cg_item; - cluster->cl_group.default_groups[i] = NULL; - config_item_put(killme); - } - + configfs_remove_default_groups(&cluster->cl_group); config_item_put(item); } @@ -874,7 +745,7 @@ static struct configfs_group_operations o2nm_cluster_group_group_ops = { .drop_item = o2nm_cluster_group_drop_item, }; -static struct config_item_type o2nm_cluster_group_type = { +static const struct config_item_type o2nm_cluster_group_type = { .ct_group_ops = &o2nm_cluster_group_group_ops, .ct_owner = THIS_MODULE, }; @@ -890,6 +761,16 @@ static struct o2nm_cluster_group o2nm_cluster_group = { }, }; +static inline void o2nm_lock_subsystem(void) +{ + mutex_lock(&o2nm_cluster_group.cs_subsys.su_mutex); +} + +static inline void o2nm_unlock_subsystem(void) +{ + mutex_unlock(&o2nm_cluster_group.cs_subsys.su_mutex); +} + int o2nm_depend_item(struct config_item *item) { return configfs_depend_item(&o2nm_cluster_group.cs_subsys, item); @@ -897,7 +778,7 @@ int o2nm_depend_item(struct config_item *item) void o2nm_undepend_item(struct config_item *item) { - configfs_undepend_item(&o2nm_cluster_group.cs_subsys, item); + configfs_undepend_item(item); } int o2nm_depend_this_node(void) @@ -943,13 +824,9 @@ static void __exit exit_o2nm(void) static int __init init_o2nm(void) { - int ret = -1; + int ret; - cluster_print_version(); - - ret = o2hb_init(); - if (ret) - goto out; + o2hb_init(); ret = o2net_init(); if (ret) @@ -984,6 +861,7 @@ out: MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("OCFS2 cluster management"); module_init(init_o2nm) module_exit(exit_o2nm) diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h index 09ea2d388bbb..3490e77a952d 100644 --- a/fs/ocfs2/cluster/nodemanager.h +++ b/fs/ocfs2/cluster/nodemanager.h @@ -1,27 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * nodemanager.h * * Function prototypes * * Copyright (C) 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * */ #ifndef O2CLUSTER_NODEMANAGER_H diff --git a/fs/ocfs2/cluster/ocfs2_heartbeat.h b/fs/ocfs2/cluster/ocfs2_heartbeat.h index 3f4151da9709..6088c9f974dd 100644 --- a/fs/ocfs2/cluster/ocfs2_heartbeat.h +++ b/fs/ocfs2/cluster/ocfs2_heartbeat.h @@ -1,26 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * ocfs2_heartbeat.h * * On-disk structures for ocfs2_heartbeat * * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #ifndef _OCFS2_HEARTBEAT_H diff --git a/fs/ocfs2/cluster/ocfs2_nodemanager.h b/fs/ocfs2/cluster/ocfs2_nodemanager.h index 49b594325bec..c9a0b77443e7 100644 --- a/fs/ocfs2/cluster/ocfs2_nodemanager.h +++ b/fs/ocfs2/cluster/ocfs2_nodemanager.h @@ -1,28 +1,11 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * ocfs2_nodemanager.h * * Header describing the interface between userspace and the kernel * for the ocfs2_nodemanager module. * * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * */ #ifndef _OCFS2_NODEMANAGER_H diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index 1ec141e758d7..bfb8b456876c 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -1,23 +1,7 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * - * vim: noexpandtab sw=8 ts=8 sts=0: +// SPDX-License-Identifier: GPL-2.0-or-later +/* * * Copyright (C) 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ /* This quorum hack is only here until we transition to some more rational @@ -39,7 +23,7 @@ * race between when we see a node start heartbeating and when we connect * to it. * - * So nodes that are in this transtion put a hold on the quorum decision + * So nodes that are in this transition put a hold on the quorum decision * with a counter. As they fall out of this transition they drop the count * and if they're the last, they fire off the decision. */ @@ -76,20 +60,21 @@ static void o2quo_fence_self(void) switch (o2nm_single_cluster->cl_fence_method) { case O2NM_FENCE_PANIC: panic("*** ocfs2 is very sorry to be fencing this system by " - "panicing ***\n"); + "panicking ***\n"); break; default: WARN_ON(o2nm_single_cluster->cl_fence_method >= O2NM_FENCE_METHODS); + fallthrough; case O2NM_FENCE_RESET: printk(KERN_ERR "*** ocfs2 is very sorry to be fencing this " "system by restarting ***\n"); emergency_restart(); break; - }; + } } -/* Indicate that a timeout occurred on a hearbeat region write. The +/* Indicate that a timeout occurred on a heartbeat region write. The * other nodes in the cluster may consider us dead at that time so we * want to "fence" ourselves so that we don't scribble on the disk * after they think they've recovered us. This can't solve all @@ -108,7 +93,7 @@ static void o2quo_make_decision(struct work_struct *work) int lowest_hb, lowest_reachable = 0, fence = 0; struct o2quo_state *qs = &o2quo_state; - spin_lock(&qs->qs_lock); + spin_lock_bh(&qs->qs_lock); lowest_hb = find_first_bit(qs->qs_hb_bm, O2NM_MAX_NODES); if (lowest_hb != O2NM_MAX_NODES) @@ -160,9 +145,18 @@ static void o2quo_make_decision(struct work_struct *work) } out: - spin_unlock(&qs->qs_lock); - if (fence) + if (fence) { + spin_unlock_bh(&qs->qs_lock); o2quo_fence_self(); + } else { + mlog(ML_NOTICE, "not fencing this node, heartbeating: %d, " + "connected: %d, lowest: %d (%sreachable)\n", + qs->qs_heartbeating, qs->qs_connected, lowest_hb, + lowest_reachable ? "" : "un"); + spin_unlock_bh(&qs->qs_lock); + + } + } static void o2quo_set_hold(struct o2quo_state *qs, u8 node) @@ -195,14 +189,14 @@ static void o2quo_clear_hold(struct o2quo_state *qs, u8 node) } /* as a node comes up we delay the quorum decision until we know the fate of - * the connection. the hold will be droped in conn_up or hb_down. it might be + * the connection. the hold will be dropped in conn_up or hb_down. it might be * perpetuated by con_err until hb_down. if we already have a conn, we might * be dropping a hold that conn_up got. */ void o2quo_hb_up(u8 node) { struct o2quo_state *qs = &o2quo_state; - spin_lock(&qs->qs_lock); + spin_lock_bh(&qs->qs_lock); qs->qs_heartbeating++; mlog_bug_on_msg(qs->qs_heartbeating == O2NM_MAX_NODES, @@ -217,7 +211,7 @@ void o2quo_hb_up(u8 node) else o2quo_clear_hold(qs, node); - spin_unlock(&qs->qs_lock); + spin_unlock_bh(&qs->qs_lock); } /* hb going down releases any holds we might have had due to this node from @@ -226,7 +220,7 @@ void o2quo_hb_down(u8 node) { struct o2quo_state *qs = &o2quo_state; - spin_lock(&qs->qs_lock); + spin_lock_bh(&qs->qs_lock); qs->qs_heartbeating--; mlog_bug_on_msg(qs->qs_heartbeating < 0, @@ -239,7 +233,7 @@ void o2quo_hb_down(u8 node) o2quo_clear_hold(qs, node); - spin_unlock(&qs->qs_lock); + spin_unlock_bh(&qs->qs_lock); } /* this tells us that we've decided that the node is still heartbeating @@ -251,18 +245,18 @@ void o2quo_hb_still_up(u8 node) { struct o2quo_state *qs = &o2quo_state; - spin_lock(&qs->qs_lock); + spin_lock_bh(&qs->qs_lock); mlog(0, "node %u\n", node); qs->qs_pending = 1; o2quo_clear_hold(qs, node); - spin_unlock(&qs->qs_lock); + spin_unlock_bh(&qs->qs_lock); } /* This is analogous to hb_up. as a node's connection comes up we delay the - * quorum decision until we see it heartbeating. the hold will be droped in + * quorum decision until we see it heartbeating. the hold will be dropped in * hb_up or hb_down. it might be perpetuated by con_err until hb_down. if * it's already heartbeating we might be dropping a hold that conn_up got. * */ @@ -270,7 +264,7 @@ void o2quo_conn_up(u8 node) { struct o2quo_state *qs = &o2quo_state; - spin_lock(&qs->qs_lock); + spin_lock_bh(&qs->qs_lock); qs->qs_connected++; mlog_bug_on_msg(qs->qs_connected == O2NM_MAX_NODES, @@ -285,7 +279,7 @@ void o2quo_conn_up(u8 node) else o2quo_clear_hold(qs, node); - spin_unlock(&qs->qs_lock); + spin_unlock_bh(&qs->qs_lock); } /* we've decided that we won't ever be connecting to the node again. if it's @@ -296,7 +290,7 @@ void o2quo_conn_err(u8 node) { struct o2quo_state *qs = &o2quo_state; - spin_lock(&qs->qs_lock); + spin_lock_bh(&qs->qs_lock); if (test_bit(node, qs->qs_conn_bm)) { qs->qs_connected--; @@ -305,14 +299,15 @@ void o2quo_conn_err(u8 node) node, qs->qs_connected); clear_bit(node, qs->qs_conn_bm); + + if (test_bit(node, qs->qs_hb_bm)) + o2quo_set_hold(qs, node); } mlog(0, "node %u, %d total\n", node, qs->qs_connected); - if (test_bit(node, qs->qs_hb_bm)) - o2quo_set_hold(qs, node); - spin_unlock(&qs->qs_lock); + spin_unlock_bh(&qs->qs_lock); } void o2quo_init(void) diff --git a/fs/ocfs2/cluster/quorum.h b/fs/ocfs2/cluster/quorum.h index 6649cc6f67c9..d64bf4482a4a 100644 --- a/fs/ocfs2/cluster/quorum.h +++ b/fs/ocfs2/cluster/quorum.h @@ -1,23 +1,6 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * Copyright (C) 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * */ #ifndef O2CLUSTER_QUORUM_H diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c index a4b07730b2e1..022f716c74ff 100644 --- a/fs/ocfs2/cluster/sys.c +++ b/fs/ocfs2/cluster/sys.c @@ -1,27 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-only +/* * sys.c * * OCFS2 cluster sysfs interface * * Copyright (C) 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation, - * version 2 of the License. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * */ #include <linux/kernel.h> @@ -41,7 +24,7 @@ static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr, return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION); } static struct kobj_attribute attr_version = - __ATTR(interface_revision, S_IFREG | S_IRUGO, version_show, NULL); + __ATTR(interface_revision, S_IRUGO, version_show, NULL); static struct attribute *o2cb_attrs[] = { &attr_version.attr, diff --git a/fs/ocfs2/cluster/sys.h b/fs/ocfs2/cluster/sys.h index d66b8ab0045e..70aaba65317e 100644 --- a/fs/ocfs2/cluster/sys.h +++ b/fs/ocfs2/cluster/sys.h @@ -1,27 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-only */ +/* * sys.h * * Function prototypes for o2cb sysfs interface * * Copyright (C) 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation, - * version 2 of the License. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * */ #ifndef O2CLUSTER_SYS_H diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index d644dc611425..79b281e32f4c 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1,33 +1,17 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * - * vim: noexpandtab sw=8 ts=8 sts=0: +// SPDX-License-Identifier: GPL-2.0-or-later +/* * * Copyright (C) 2004 Oracle. All rights reserved. * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * * ---- * - * Callers for this were originally written against a very simple synchronus + * Callers for this were originally written against a very simple synchronous * API. This implementation reflects those simple callers. Some day I'm sure * we'll need to move to a more robust posting/callback mechanism. * * Transmit calls pass in kernel virtual addresses and block copying this into * the socket's tx buffers via a usual blocking sendmsg. They'll block waiting - * for a failed socket to timeout. TX callers can also pass in a poniter to an + * for a failed socket to timeout. TX callers can also pass in a pointer to an * 'int' which gets filled with an errno off the wire in response to the * message they send. * @@ -54,6 +38,7 @@ */ #include <linux/kernel.h> +#include <linux/sched/mm.h> #include <linux/jiffies.h> #include <linux/slab.h> #include <linux/idr.h> @@ -61,8 +46,9 @@ #include <linux/net.h> #include <linux/export.h> #include <net/tcp.h> +#include <trace/events/sock.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "heartbeat.h" #include "tcp.h" @@ -97,7 +83,7 @@ typeof(sc) __sc = (sc); \ mlog(ML_SOCKET, "[sc %p refs %d sock %p node %u page %p " \ "pg_off %zu] " fmt, __sc, \ - atomic_read(&__sc->sc_kref.refcount), __sc->sc_sock, \ + kref_read(&__sc->sc_kref), __sc->sc_sock, \ __sc->sc_node->nd_num, __sc->sc_page, __sc->sc_page_off , \ ##args); \ } while (0) @@ -108,14 +94,14 @@ static struct rb_root o2net_handler_tree = RB_ROOT; static struct o2net_node o2net_nodes[O2NM_MAX_NODES]; /* XXX someday we'll need better accounting */ -static struct socket *o2net_listen_sock = NULL; +static struct socket *o2net_listen_sock; /* * listen work is only queued by the listening socket callbacks on the * o2net_wq. teardown detaches the callbacks before destroying the workqueue. * quorum work is queued as sock containers are shutdown.. stop_listening * tears down all the node's sock containers, preventing future shutdowns - * and queued quroum work, before canceling delayed quorum work and + * and queued quorum work, before canceling delayed quorum work and * destroying the work queue. */ static struct workqueue_struct *o2net_wq; @@ -137,9 +123,9 @@ static int o2net_sys_err_translations[O2NET_ERR_MAX] = static void o2net_sc_connect_completed(struct work_struct *work); static void o2net_rx_until_empty(struct work_struct *work); static void o2net_shutdown_sc(struct work_struct *work); -static void o2net_listen_data_ready(struct sock *sk, int bytes); +static void o2net_listen_data_ready(struct sock *sk); static void o2net_sc_send_keep_req(struct work_struct *work); -static void o2net_idle_timer(unsigned long data); +static void o2net_idle_timer(struct timer_list *t); static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc); @@ -262,17 +248,17 @@ static void o2net_update_recv_stats(struct o2net_sock_container *sc) #endif /* CONFIG_OCFS2_FS_STATS */ -static inline int o2net_reconnect_delay(void) +static inline unsigned int o2net_reconnect_delay(void) { return o2nm_single_cluster->cl_reconnect_delay_ms; } -static inline int o2net_keepalive_delay(void) +static inline unsigned int o2net_keepalive_delay(void) { return o2nm_single_cluster->cl_keepalive_delay_ms; } -static inline int o2net_idle_timeout(void) +static inline unsigned int o2net_idle_timeout(void) { return o2nm_single_cluster->cl_idle_timeout_ms; } @@ -449,9 +435,7 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node) INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc); INIT_DELAYED_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req); - init_timer(&sc->sc_idle_timeout); - sc->sc_idle_timeout.function = o2net_idle_timer; - sc->sc_idle_timeout.data = (unsigned long)sc; + timer_setup(&sc->sc_idle_timeout, o2net_idle_timer, 0); sclog(sc, "alloced\n"); @@ -536,15 +520,16 @@ static void o2net_set_nn_state(struct o2net_node *nn, if (nn->nn_persistent_error || nn->nn_sc_valid) wake_up(&nn->nn_sc_wq); - if (!was_err && nn->nn_persistent_error) { + if (was_valid && !was_err && nn->nn_persistent_error) { o2quo_conn_err(o2net_num_from_nn(nn)); queue_delayed_work(o2net_wq, &nn->nn_still_up, msecs_to_jiffies(O2NET_QUORUM_DELAY_MS)); } if (was_valid && !valid) { - printk(KERN_NOTICE "o2net: No longer connected to " - SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc)); + if (old_sc) + printk(KERN_NOTICE "o2net: No longer connected to " + SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc)); o2net_complete_nodes_nsw(nn); } @@ -596,13 +581,16 @@ static void o2net_set_nn_state(struct o2net_node *nn, } /* see o2net_register_callbacks() */ -static void o2net_data_ready(struct sock *sk, int bytes) +static void o2net_data_ready(struct sock *sk) { - void (*ready)(struct sock *sk, int bytes); + void (*ready)(struct sock *sk); + struct o2net_sock_container *sc; - read_lock(&sk->sk_callback_lock); - if (sk->sk_user_data) { - struct o2net_sock_container *sc = sk->sk_user_data; + trace_sk_data_ready(sk); + + read_lock_bh(&sk->sk_callback_lock); + sc = sk->sk_user_data; + if (sc) { sclog(sc, "data_ready hit\n"); o2net_set_data_ready_time(sc); o2net_sc_queue_work(sc, &sc->sc_rx_work); @@ -610,9 +598,9 @@ static void o2net_data_ready(struct sock *sk, int bytes) } else { ready = sk->sk_data_ready; } - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); - ready(sk, bytes); + ready(sk); } /* see o2net_register_callbacks() */ @@ -621,7 +609,7 @@ static void o2net_state_change(struct sock *sk) void (*state_change)(struct sock *sk); struct o2net_sock_container *sc; - read_lock(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); sc = sk->sk_user_data; if (sc == NULL) { state_change = sk->sk_state_change; @@ -648,7 +636,7 @@ static void o2net_state_change(struct sock *sk) break; } out: - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); state_change(sk); } @@ -736,7 +724,7 @@ static void o2net_shutdown_sc(struct work_struct *work) if (o2net_unregister_callbacks(sc->sc_sock->sk, sc)) { /* we shouldn't flush as we're in the thread, the * races with pending sc work structs are harmless */ - del_timer_sync(&sc->sc_idle_timeout); + timer_delete_sync(&sc->sc_idle_timeout); o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); sc_put(sc); kernel_sock_shutdown(sc->sc_sock, SHUT_RDWR); @@ -765,32 +753,32 @@ static struct o2net_msg_handler * o2net_handler_tree_lookup(u32 msg_type, u32 key, struct rb_node ***ret_p, struct rb_node **ret_parent) { - struct rb_node **p = &o2net_handler_tree.rb_node; - struct rb_node *parent = NULL; + struct rb_node **p = &o2net_handler_tree.rb_node; + struct rb_node *parent = NULL; struct o2net_msg_handler *nmh, *ret = NULL; int cmp; - while (*p) { - parent = *p; - nmh = rb_entry(parent, struct o2net_msg_handler, nh_node); + while (*p) { + parent = *p; + nmh = rb_entry(parent, struct o2net_msg_handler, nh_node); cmp = o2net_handler_cmp(nmh, msg_type, key); - if (cmp < 0) - p = &(*p)->rb_left; - else if (cmp > 0) - p = &(*p)->rb_right; - else { + if (cmp < 0) + p = &(*p)->rb_left; + else if (cmp > 0) + p = &(*p)->rb_right; + else { ret = nmh; - break; + break; } - } + } - if (ret_p != NULL) - *ret_p = p; - if (ret_parent != NULL) - *ret_parent = parent; + if (ret_p != NULL) + *ret_p = p; + if (ret_parent != NULL) + *ret_parent = parent; - return ret; + return ret; } static void o2net_handler_kref_release(struct kref *kref) @@ -871,8 +859,6 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, "for type %u key %08x\n", msg_type, key); } write_unlock(&o2net_handler_lock); - if (ret) - goto out; out: if (ret) @@ -915,74 +901,51 @@ static struct o2net_msg_handler *o2net_handler_get(u32 msg_type, u32 key) static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len) { - int ret; - mm_segment_t oldfs; - struct kvec vec = { - .iov_len = len, - .iov_base = data, - }; - struct msghdr msg = { - .msg_iovlen = 1, - .msg_iov = (struct iovec *)&vec, - .msg_flags = MSG_DONTWAIT, - }; - - oldfs = get_fs(); - set_fs(get_ds()); - ret = sock_recvmsg(sock, &msg, len, msg.msg_flags); - set_fs(oldfs); - - return ret; + struct kvec vec = { .iov_len = len, .iov_base = data, }; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT, }; + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &vec, 1, len); + return sock_recvmsg(sock, &msg, MSG_DONTWAIT); } static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec, size_t veclen, size_t total) { int ret; - mm_segment_t oldfs; - struct msghdr msg = { - .msg_iov = (struct iovec *)vec, - .msg_iovlen = veclen, - }; + struct msghdr msg = {.msg_flags = 0,}; if (sock == NULL) { ret = -EINVAL; goto out; } - oldfs = get_fs(); - set_fs(get_ds()); - ret = sock_sendmsg(sock, &msg, total); - set_fs(oldfs); - if (ret != total) { - mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, - total); - if (ret >= 0) - ret = -EPIPE; /* should be smarter, I bet */ - goto out; - } - - ret = 0; + ret = kernel_sendmsg(sock, &msg, vec, veclen, total); + if (likely(ret == total)) + return 0; + mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, total); + if (ret >= 0) + ret = -EPIPE; /* should be smarter, I bet */ out: - if (ret < 0) - mlog(0, "returning error: %d\n", ret); + mlog(0, "returning error: %d\n", ret); return ret; } static void o2net_sendpage(struct o2net_sock_container *sc, - void *kmalloced_virt, - size_t size) + void *virt, size_t size) { struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + struct msghdr msg = {}; + struct bio_vec bv; ssize_t ret; + bvec_set_virt(&bv, virt, size); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bv, 1, size); + while (1) { + msg.msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES; mutex_lock(&sc->sc_send_lock); - ret = sc->sc_sock->ops->sendpage(sc->sc_sock, - virt_to_page(kmalloced_virt), - (long)kmalloced_virt & ~PAGE_MASK, - size, MSG_DONTWAIT); + ret = sock_sendmsg(sc->sc_sock, &msg); mutex_unlock(&sc->sc_send_lock); + if (ret == size) break; if (ret == (ssize_t)-EAGAIN) { @@ -1033,16 +996,15 @@ static int o2net_tx_can_proceed(struct o2net_node *nn, } /* Get a map of all nodes to which this node is currently connected to */ -void o2net_fill_node_map(unsigned long *map, unsigned bytes) +void o2net_fill_node_map(unsigned long *map, unsigned int bits) { struct o2net_sock_container *sc; int node, ret; - BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long))); - - memset(map, 0, bytes); + bitmap_zero(map, bits); for (node = 0; node < O2NM_MAX_NODES; ++node) { - o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret); + if (!o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret)) + continue; if (!ret) { set_bit(node, map); sc_put(sc); @@ -1102,7 +1064,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, o2net_set_nst_sock_container(&nst, sc); veclen = caller_veclen + 1; - vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC); + vec = kmalloc_array(veclen, sizeof(struct kvec), GFP_ATOMIC); if (vec == NULL) { mlog(0, "failed to %zu element kvec!\n", veclen); ret = -ENOMEM; @@ -1238,7 +1200,6 @@ static int o2net_process_message(struct o2net_sock_container *sc, msglog(hdr, "bad magic\n"); ret = -EINVAL; goto out; - break; } /* find a handler for it */ @@ -1458,7 +1419,7 @@ out: return ret; } -/* this work func is triggerd by data ready. it reads until it can read no +/* this work func is triggered by data ready. it reads until it can read no * more. it interprets 0, eof, as fatal. if data_ready hits while we're doing * our work the work struct will be marked and we'll be called again. */ static void o2net_rx_until_empty(struct work_struct *work) @@ -1481,31 +1442,6 @@ static void o2net_rx_until_empty(struct work_struct *work) sc_put(sc); } -static int o2net_set_nodelay(struct socket *sock) -{ - int ret, val = 1; - mm_segment_t oldfs; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - - /* - * Dear unsuspecting programmer, - * - * Don't use sock_setsockopt() for SOL_TCP. It doesn't check its level - * argument and assumes SOL_SOCKET so, say, your TCP_NODELAY will - * silently turn into SO_DEBUG. - * - * Yours, - * Keeper of hilariously fragile interfaces. - */ - ret = sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY, - (char __user *)&val, sizeof(val)); - - set_fs(oldfs); - return ret; -} - static void o2net_initialize_handshake(void) { o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32( @@ -1547,12 +1483,13 @@ static void o2net_sc_send_keep_req(struct work_struct *work) sc_put(sc); } -/* socket shutdown does a del_timer_sync against this as it tears down. +/* socket shutdown does a timer_delete_sync against this as it tears down. * we can't start this timer until we've got to the point in sc buildup * where shutdown is going to be involved */ -static void o2net_idle_timer(unsigned long data) +static void o2net_idle_timer(struct timer_list *t) { - struct o2net_sock_container *sc = (struct o2net_sock_container *)data; + struct o2net_sock_container *sc = timer_container_of(sc, t, + sc_idle_timeout); struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); #ifdef CONFIG_DEBUG_FS unsigned long msecs = ktime_to_ms(ktime_get()) - @@ -1562,16 +1499,20 @@ static void o2net_idle_timer(unsigned long data) #endif printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been " - "idle for %lu.%lu secs, shutting it down.\n", SC_NODEF_ARGS(sc), - msecs / 1000, msecs % 1000); + "idle for %lu.%lu secs.\n", + SC_NODEF_ARGS(sc), msecs / 1000, msecs % 1000); - /* - * Initialize the nn_timeout so that the next connection attempt - * will continue in o2net_start_connect. + /* idle timerout happen, don't shutdown the connection, but + * make fence decision. Maybe the connection can recover before + * the decision is made. */ atomic_set(&nn->nn_timeout, 1); + o2quo_conn_err(o2net_num_from_nn(nn)); + queue_delayed_work(o2net_wq, &nn->nn_still_up, + msecs_to_jiffies(O2NET_QUORUM_DELAY_MS)); + + o2net_sc_reset_idle_timer(sc); - o2net_sc_queue_work(sc, &sc->sc_shutdown_work); } static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc) @@ -1586,6 +1527,15 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc) static void o2net_sc_postpone_idle(struct o2net_sock_container *sc) { + struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + + /* clear fence decision since the connection recover from timeout*/ + if (atomic_read(&nn->nn_timeout)) { + o2quo_conn_up(o2net_num_from_nn(nn)); + cancel_delayed_work(&nn->nn_still_up); + atomic_set(&nn->nn_timeout, 0); + } + /* Only push out an existing timer */ if (timer_pending(&sc->sc_idle_timeout)) o2net_sc_reset_idle_timer(sc); @@ -1606,23 +1556,25 @@ static void o2net_start_connect(struct work_struct *work) struct sockaddr_in myaddr = {0, }, remoteaddr = {0, }; int ret = 0, stop; unsigned int timeout; + unsigned int nofs_flag; + /* + * sock_create allocates the sock with GFP_KERNEL. We must + * prevent the filesystem from being reentered by memory reclaim. + */ + nofs_flag = memalloc_nofs_save(); /* if we're greater we initiate tx, otherwise we accept */ if (o2nm_this_node() <= o2net_num_from_nn(nn)) goto out; /* watch for racing with tearing a node down */ node = o2nm_get_node_by_num(o2net_num_from_nn(nn)); - if (node == NULL) { - ret = 0; + if (node == NULL) goto out; - } mynode = o2nm_get_node_by_num(o2nm_this_node()); - if (mynode == NULL) { - ret = 0; + if (mynode == NULL) goto out; - } spin_lock(&nn->nn_lock); /* @@ -1657,12 +1609,13 @@ static void o2net_start_connect(struct work_struct *work) sc->sc_sock = sock; /* freed by sc_kref_release */ sock->sk->sk_allocation = GFP_ATOMIC; + sock->sk->sk_use_task_frag = false; myaddr.sin_family = AF_INET; myaddr.sin_addr.s_addr = mynode->nd_ipv4_address; myaddr.sin_port = htons(0); /* any port */ - ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr, + ret = sock->ops->bind(sock, (struct sockaddr_unsized *)&myaddr, sizeof(myaddr)); if (ret) { mlog(ML_ERROR, "bind failed with %d at address %pI4\n", @@ -1670,11 +1623,8 @@ static void o2net_start_connect(struct work_struct *work) goto out; } - ret = o2net_set_nodelay(sc->sc_sock); - if (ret) { - mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret); - goto out; - } + tcp_sock_set_nodelay(sc->sc_sock->sk); + tcp_sock_set_user_timeout(sock->sk, O2NET_TCP_USER_TIMEOUT); o2net_register_callbacks(sc->sc_sock->sk, sc); @@ -1688,20 +1638,19 @@ static void o2net_start_connect(struct work_struct *work) remoteaddr.sin_port = node->nd_ipv4_port; ret = sc->sc_sock->ops->connect(sc->sc_sock, - (struct sockaddr *)&remoteaddr, + (struct sockaddr_unsized *)&remoteaddr, sizeof(remoteaddr), O_NONBLOCK); if (ret == -EINPROGRESS) ret = 0; out: - if (ret) { + if (ret && sc) { printk(KERN_NOTICE "o2net: Connect attempt to " SC_NODEF_FMT " failed with errno %d\n", SC_NODEF_ARGS(sc), ret); /* 0 err so that another will be queued and attempted * from set_nn_state */ - if (sc) - o2net_ensure_shutdown(nn, sc, 0); + o2net_ensure_shutdown(nn, sc, 0); } if (sc) sc_put(sc); @@ -1710,6 +1659,7 @@ out: if (mynode) o2nm_node_put(mynode); + memalloc_nofs_restore(nofs_flag); return; } @@ -1721,12 +1671,13 @@ static void o2net_connect_expired(struct work_struct *work) spin_lock(&nn->nn_lock); if (!nn->nn_sc_valid) { printk(KERN_NOTICE "o2net: No connection established with " - "node %u after %u.%u seconds, giving up.\n", + "node %u after %u.%u seconds, check network and" + " cluster configuration.\n", o2net_num_from_nn(nn), o2net_idle_timeout() / 1000, o2net_idle_timeout() % 1000); - o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); + o2net_set_nn_state(nn, NULL, 0, 0); } spin_unlock(&nn->nn_lock); } @@ -1787,7 +1738,7 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num, (msecs_to_jiffies(o2net_reconnect_delay()) + 1); if (node_num != o2nm_this_node()) { - /* believe it or not, accept and node hearbeating testing + /* believe it or not, accept and node heartbeating testing * can succeed for this node before we got here.. so * only use set_nn_state to clear the persistent error * if that hasn't already happened */ @@ -1826,17 +1777,28 @@ int o2net_register_hb_callbacks(void) /* ------------------------------------------------------------ */ -static int o2net_accept_one(struct socket *sock) +static int o2net_accept_one(struct socket *sock, int *more) { - int ret, slen; + int ret; struct sockaddr_in sin; struct socket *new_sock = NULL; struct o2nm_node *node = NULL; struct o2nm_node *local_node = NULL; struct o2net_sock_container *sc = NULL; + struct proto_accept_arg arg = { + .flags = O_NONBLOCK, + }; struct o2net_node *nn; + unsigned int nofs_flag; + + /* + * sock_create_lite allocates the sock with GFP_KERNEL. We must + * prevent the filesystem from being reentered by memory reclaim. + */ + nofs_flag = memalloc_nofs_save(); BUG_ON(sock == NULL); + *more = 0; ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type, sock->sk->sk_protocol, &new_sock); if (ret) @@ -1844,21 +1806,17 @@ static int o2net_accept_one(struct socket *sock) new_sock->type = sock->type; new_sock->ops = sock->ops; - ret = sock->ops->accept(sock, new_sock, O_NONBLOCK); + ret = sock->ops->accept(sock, new_sock, &arg); if (ret < 0) goto out; + *more = 1; new_sock->sk->sk_allocation = GFP_ATOMIC; - ret = o2net_set_nodelay(new_sock); - if (ret) { - mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret); - goto out; - } + tcp_sock_set_nodelay(new_sock->sk); + tcp_sock_set_user_timeout(new_sock->sk, O2NET_TCP_USER_TIMEOUT); - slen = sizeof(sin); - ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin, - &slen, 1); + ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin, 1); if (ret < 0) goto out; @@ -1873,12 +1831,16 @@ static int o2net_accept_one(struct socket *sock) if (o2nm_this_node() >= node->nd_num) { local_node = o2nm_get_node_by_num(o2nm_this_node()); - printk(KERN_NOTICE "o2net: Unexpected connect attempt seen " - "at node '%s' (%u, %pI4:%d) from node '%s' (%u, " - "%pI4:%d)\n", local_node->nd_name, local_node->nd_num, - &(local_node->nd_ipv4_address), - ntohs(local_node->nd_ipv4_port), node->nd_name, - node->nd_num, &sin.sin_addr.s_addr, ntohs(sin.sin_port)); + if (local_node) + printk(KERN_NOTICE "o2net: Unexpected connect attempt " + "seen at node '%s' (%u, %pI4:%d) from " + "node '%s' (%u, %pI4:%d)\n", + local_node->nd_name, local_node->nd_num, + &(local_node->nd_ipv4_address), + ntohs(local_node->nd_ipv4_port), + node->nd_name, + node->nd_num, &sin.sin_addr.s_addr, + ntohs(sin.sin_port)); ret = -EINVAL; goto out; } @@ -1939,39 +1901,78 @@ out: o2nm_node_put(local_node); if (sc) sc_put(sc); + + memalloc_nofs_restore(nofs_flag); return ret; } +/* + * This function is invoked in response to one or more + * pending accepts at softIRQ level. We must drain the + * entire que before returning. + */ + static void o2net_accept_many(struct work_struct *work) { struct socket *sock = o2net_listen_sock; - while (o2net_accept_one(sock) == 0) + int more; + + /* + * It is critical to note that due to interrupt moderation + * at the network driver level, we can't assume to get a + * softIRQ for every single conn since tcp SYN packets + * can arrive back-to-back, and therefore many pending + * accepts may result in just 1 softIRQ. If we terminate + * the o2net_accept_one() loop upon seeing an err, what happens + * to the rest of the conns in the queue? If no new SYN + * arrives for hours, no softIRQ will be delivered, + * and the connections will just sit in the queue. + */ + + for (;;) { + o2net_accept_one(sock, &more); + if (!more) + break; cond_resched(); + } } -static void o2net_listen_data_ready(struct sock *sk, int bytes) +static void o2net_listen_data_ready(struct sock *sk) { - void (*ready)(struct sock *sk, int bytes); + void (*ready)(struct sock *sk); + + trace_sk_data_ready(sk); - read_lock(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); ready = sk->sk_user_data; if (ready == NULL) { /* check for teardown race */ ready = sk->sk_data_ready; goto out; } - /* ->sk_data_ready is also called for a newly established child socket - * before it has been accepted and the acceptor has set up their - * data_ready.. we only want to queue listen work for our listening - * socket */ + /* This callback may called twice when a new connection + * is being established as a child socket inherits everything + * from a parent LISTEN socket, including the data_ready cb of + * the parent. This leads to a hazard. In o2net_accept_one() + * we are still initializing the child socket but have not + * changed the inherited data_ready callback yet when + * data starts arriving. + * We avoid this hazard by checking the state. + * For the listening socket, the state will be TCP_LISTEN; for the new + * socket, will be TCP_ESTABLISHED. Also, in this case, + * sk->sk_user_data is not a valid function pointer. + */ + if (sk->sk_state == TCP_LISTEN) { - mlog(ML_TCP, "bytes: %d\n", bytes); queue_work(o2net_wq, &o2net_listen_work); + } else { + ready = NULL; } out: - read_unlock(&sk->sk_callback_lock); - ready(sk, bytes); + read_unlock_bh(&sk->sk_callback_lock); + if (ready != NULL) + ready(sk); } static int o2net_open_listening_sock(__be32 addr, __be16 port) @@ -2001,7 +2002,7 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port) INIT_WORK(&o2net_listen_work, o2net_accept_many); sock->sk->sk_reuse = SK_CAN_REUSE; - ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); + ret = sock->ops->bind(sock, (struct sockaddr_unsized *)&sin, sizeof(sin)); if (ret < 0) { printk(KERN_ERR "o2net: Error %d while binding socket at " "%pI4:%u\n", ret, &addr, ntohs(port)); @@ -2037,7 +2038,7 @@ int o2net_start_listening(struct o2nm_node *node) BUG_ON(o2net_listen_sock != NULL); mlog(ML_KTHREAD, "starting o2net thread...\n"); - o2net_wq = create_singlethread_workqueue("o2net"); + o2net_wq = alloc_ordered_workqueue("o2net", WQ_MEM_RECLAIM); if (o2net_wq == NULL) { mlog(ML_ERROR, "unable to launch o2net thread\n"); return -ENOMEM; /* ? */ @@ -2093,22 +2094,23 @@ void o2net_stop_listening(struct o2nm_node *node) int o2net_init(void) { + struct folio *folio; + void *p; unsigned long i; o2quo_init(); + o2net_debugfs_init(); - if (o2net_debugfs_init()) - return -ENOMEM; + folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, 0); + if (!folio) + goto out; - o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL); - o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); - o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); - if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp) { - kfree(o2net_hand); - kfree(o2net_keep_req); - kfree(o2net_keep_resp); - return -ENOMEM; - } + p = folio_address(folio); + o2net_hand = p; + p += sizeof(struct o2net_handshake); + o2net_keep_req = p; + p += sizeof(struct o2net_msg); + o2net_keep_resp = p; o2net_hand->protocol_version = cpu_to_be64(O2NET_PROTOCOL_VERSION); o2net_hand->connector_id = cpu_to_be64(1); @@ -2133,13 +2135,16 @@ int o2net_init(void) } return 0; + +out: + o2net_debugfs_exit(); + o2quo_exit(); + return -ENOMEM; } void o2net_exit(void) { o2quo_exit(); - kfree(o2net_hand); - kfree(o2net_keep_req); - kfree(o2net_keep_resp); o2net_debugfs_exit(); + folio_put(virt_to_folio(o2net_hand)); } diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h index 5bada2a69b50..a75b551d31c7 100644 --- a/fs/ocfs2/cluster/tcp.h +++ b/fs/ocfs2/cluster/tcp.h @@ -1,27 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * tcp.h * * Function prototypes * * Copyright (C) 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * */ #ifndef O2CLUSTER_TCP_H @@ -47,7 +30,7 @@ struct o2net_msg __be32 status; __be32 key; __be32 msg_num; - __u8 buf[0]; + __u8 buf[]; }; typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data, @@ -63,6 +46,7 @@ typedef void (o2net_post_msg_handler_func)(int status, void *data, #define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 2000 #define O2NET_IDLE_TIMEOUT_MS_DEFAULT 30000 +#define O2NET_TCP_USER_TIMEOUT 0x7fffffff /* TODO: figure this out.... */ static inline int o2net_link_down(int err, struct socket *sock) @@ -123,16 +107,15 @@ struct o2net_send_tracking; struct o2net_sock_container; #ifdef CONFIG_DEBUG_FS -int o2net_debugfs_init(void); +void o2net_debugfs_init(void); void o2net_debugfs_exit(void); void o2net_debug_add_nst(struct o2net_send_tracking *nst); void o2net_debug_del_nst(struct o2net_send_tracking *nst); void o2net_debug_add_sc(struct o2net_sock_container *sc); void o2net_debug_del_sc(struct o2net_sock_container *sc); #else -static inline int o2net_debugfs_init(void) +static inline void o2net_debugfs_init(void) { - return 0; } static inline void o2net_debugfs_exit(void) { diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index 4cbcb65784a3..601c99bd2611 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -1,22 +1,6 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * Copyright (C) 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #ifndef O2CLUSTER_TCP_INTERNAL_H @@ -107,12 +91,12 @@ struct o2net_node { struct list_head nn_status_list; /* connects are attempted from when heartbeat comes up until either hb - * goes down, the node is unconfigured, no connect attempts succeed - * before O2NET_CONN_IDLE_DELAY, or a connect succeeds. connect_work - * is queued from set_nn_state both from hb up and from itself if a - * connect attempt fails and so can be self-arming. shutdown is - * careful to first mark the nn such that no connects will be attempted - * before canceling delayed connect work and flushing the queue. */ + * goes down, the node is unconfigured, or a connect succeeds. + * connect_work is queued from set_nn_state both from hb up and from + * itself if a connect attempt fails and so can be self-arming. + * shutdown is careful to first mark the nn such that no connects will + * be attempted before canceling delayed connect work and flushing the + * queue. */ struct delayed_work nn_connect_work; unsigned long nn_last_connect_attempt; @@ -165,7 +149,7 @@ struct o2net_sock_container { /* original handlers for the sockets */ void (*sc_state_change)(struct sock *sk); - void (*sc_data_ready)(struct sock *sk, int bytes); + void (*sc_data_ready)(struct sock *sk); u32 sc_msg_key; u16 sc_msg_type; @@ -196,7 +180,7 @@ struct o2net_msg_handler { u32 nh_msg_type; u32 nh_key; o2net_msg_handler_func *nh_func; - o2net_msg_handler_func *nh_func_data; + void *nh_func_data; o2net_post_msg_handler_func *nh_post_func; struct kref nh_kref; diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c deleted file mode 100644 index a56eee6abad3..000000000000 --- a/fs/ocfs2/cluster/ver.c +++ /dev/null @@ -1,42 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * ver.c - * - * version string - * - * Copyright (C) 2002, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/module.h> -#include <linux/kernel.h> - -#include "ver.h" - -#define CLUSTER_BUILD_VERSION "1.5.0" - -#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION - -void cluster_print_version(void) -{ - printk(KERN_INFO "%s\n", VERSION_STR); -} - -MODULE_DESCRIPTION(VERSION_STR); - -MODULE_VERSION(CLUSTER_BUILD_VERSION); diff --git a/fs/ocfs2/cluster/ver.h b/fs/ocfs2/cluster/ver.h deleted file mode 100644 index 32554c3382c2..000000000000 --- a/fs/ocfs2/cluster/ver.h +++ /dev/null @@ -1,31 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * ver.h - * - * Function prototypes - * - * Copyright (C) 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef O2CLUSTER_VER_H -#define O2CLUSTER_VER_H - -void cluster_print_version(void); - -#endif /* O2CLUSTER_VER_H */ diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index ef999729e274..1873bbbb7e5b 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -1,26 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * dcache.c * * dentry cache handling code * * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/fs.h> @@ -37,19 +21,19 @@ #include "dlmglue.h" #include "file.h" #include "inode.h" -#include "super.h" #include "ocfs2_trace.h" void ocfs2_dentry_attach_gen(struct dentry *dentry) { unsigned long gen = - OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen; - BUG_ON(dentry->d_inode); + OCFS2_I(d_inode(dentry->d_parent))->ip_dir_lock_gen; + BUG_ON(d_inode(dentry)); dentry->d_fsdata = (void *)gen; } -static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags) +static int ocfs2_dentry_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct inode *inode; int ret = 0; /* if all else fails, just return false */ @@ -58,11 +42,10 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - inode = dentry->d_inode; + inode = d_inode(dentry); osb = OCFS2_SB(dentry->d_sb); - trace_ocfs2_dentry_revalidate(dentry, dentry->d_name.len, - dentry->d_name.name); + trace_ocfs2_dentry_revalidate(dentry, name->len, name->name); /* For a negative dentry - * check the generation number of the parent and compare with the @@ -70,11 +53,8 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags) */ if (inode == NULL) { unsigned long gen = (unsigned long) dentry->d_fsdata; - unsigned long pgen = - OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen; - - trace_ocfs2_dentry_revalidate_negative(dentry->d_name.len, - dentry->d_name.name, + unsigned long pgen = OCFS2_I(dir)->ip_dir_lock_gen; + trace_ocfs2_dentry_revalidate_negative(name->len, name->name, pgen, gen); if (gen != pgen) goto bail; @@ -140,17 +120,10 @@ static int ocfs2_match_dentry(struct dentry *dentry, if (!dentry->d_fsdata) return 0; - if (!dentry->d_parent) - return 0; - if (skip_unhashed && d_unhashed(dentry)) return 0; - parent = dentry->d_parent->d_inode; - /* Negative parent dentry? */ - if (!parent) - return 0; - + parent = d_inode(dentry->d_parent); /* Name is in a different directory. */ if (OCFS2_I(parent)->ip_blkno != parent_blkno) return 0; @@ -172,7 +145,7 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode, struct dentry *dentry; spin_lock(&inode->i_lock); - hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) { + hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { spin_lock(&dentry->d_lock); if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) { trace_ocfs2_find_local_alias(dentry->d_name.len, @@ -243,7 +216,7 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry, if (!inode) return 0; - if (!dentry->d_inode && dentry->d_fsdata) { + if (d_really_is_negative(dentry) && dentry->d_fsdata) { /* Converting a negative dentry to positive Clear dentry->d_fsdata */ dentry->d_fsdata = dl = NULL; @@ -251,8 +224,8 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry, if (dl) { mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno, - " \"%.*s\": old parent: %llu, new: %llu\n", - dentry->d_name.len, dentry->d_name.name, + " \"%pd\": old parent: %llu, new: %llu\n", + dentry, (unsigned long long)parent_blkno, (unsigned long long)dl->dl_parent_blkno); return 0; @@ -277,8 +250,8 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno); mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno, - " \"%.*s\": old parent: %llu, new: %llu\n", - dentry->d_name.len, dentry->d_name.name, + " \"%pd\": old parent: %llu, new: %llu\n", + dentry, (unsigned long long)parent_blkno, (unsigned long long)dl->dl_parent_blkno); @@ -310,6 +283,18 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry, out_attach: spin_lock(&dentry_attach_lock); + if (unlikely(dentry->d_fsdata && !alias)) { + /* d_fsdata is set by a racing thread which is doing + * the same thing as this thread is doing. Leave the racing + * thread going ahead and we return here. + */ + spin_unlock(&dentry_attach_lock); + iput(dl->dl_inode); + ocfs2_lock_res_free(&dl->dl_lockres); + kfree(dl); + return 0; + } + dentry->d_fsdata = dl; dl->dl_count++; spin_unlock(&dentry_attach_lock); @@ -345,52 +330,6 @@ out_attach: return ret; } -DEFINE_SPINLOCK(dentry_list_lock); - -/* We limit the number of dentry locks to drop in one go. We have - * this limit so that we don't starve other users of ocfs2_wq. */ -#define DL_INODE_DROP_COUNT 64 - -/* Drop inode references from dentry locks */ -static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count) -{ - struct ocfs2_dentry_lock *dl; - - spin_lock(&dentry_list_lock); - while (osb->dentry_lock_list && (drop_count < 0 || drop_count--)) { - dl = osb->dentry_lock_list; - osb->dentry_lock_list = dl->dl_next; - spin_unlock(&dentry_list_lock); - iput(dl->dl_inode); - kfree(dl); - spin_lock(&dentry_list_lock); - } - spin_unlock(&dentry_list_lock); -} - -void ocfs2_drop_dl_inodes(struct work_struct *work) -{ - struct ocfs2_super *osb = container_of(work, struct ocfs2_super, - dentry_lock_work); - - __ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT); - /* - * Don't queue dropping if umount is in progress. We flush the - * list in ocfs2_dismount_volume - */ - spin_lock(&dentry_list_lock); - if (osb->dentry_lock_list && - !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) - queue_work(ocfs2_wq, &osb->dentry_lock_work); - spin_unlock(&dentry_list_lock); -} - -/* Flush the whole work queue */ -void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb) -{ - __ocfs2_drop_dl_inodes(osb, -1); -} - /* * ocfs2_dentry_iput() and friends. * @@ -415,24 +354,16 @@ void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb) static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb, struct ocfs2_dentry_lock *dl) { + iput(dl->dl_inode); ocfs2_simple_drop_lockres(osb, &dl->dl_lockres); ocfs2_lock_res_free(&dl->dl_lockres); - - /* We leave dropping of inode reference to ocfs2_wq as that can - * possibly lead to inode deletion which gets tricky */ - spin_lock(&dentry_list_lock); - if (!osb->dentry_lock_list && - !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) - queue_work(ocfs2_wq, &osb->dentry_lock_work); - dl->dl_next = osb->dentry_lock_list; - osb->dentry_lock_list = dl; - spin_unlock(&dentry_list_lock); + kfree(dl); } void ocfs2_dentry_lock_put(struct ocfs2_super *osb, struct ocfs2_dentry_lock *dl) { - int unlock; + int unlock = 0; BUG_ON(dl->dl_count == 0); @@ -460,17 +391,15 @@ static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode) if (inode) ino = (unsigned long long)OCFS2_I(inode)->ip_blkno; mlog(ML_ERROR, "Dentry is missing cluster lock. " - "inode: %llu, d_flags: 0x%x, d_name: %.*s\n", - ino, dentry->d_flags, dentry->d_name.len, - dentry->d_name.name); + "inode: %llu, d_flags: 0x%x, d_name: %pd\n", + ino, dentry->d_flags, dentry); } goto out; } - mlog_bug_on_msg(dl->dl_count == 0, "dentry: %.*s, count: %u\n", - dentry->d_name.len, dentry->d_name.name, - dl->dl_count); + mlog_bug_on_msg(dl->dl_count == 0, "dentry: %pd, count: %u\n", + dentry, dl->dl_count); ocfs2_dentry_lock_put(OCFS2_SB(dentry->d_sb), dl); @@ -502,7 +431,7 @@ void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target, { int ret; struct ocfs2_super *osb = OCFS2_SB(old_dir->i_sb); - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); /* * Move within the same directory, so the actual lock info won't diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h index b79eff709958..7f246c5692d8 100644 --- a/fs/ocfs2/dcache.h +++ b/fs/ocfs2/dcache.h @@ -1,26 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * dcache.h * * Function prototypes * * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #ifndef OCFS2_DCACHE_H @@ -29,13 +13,8 @@ extern const struct dentry_operations ocfs2_dentry_ops; struct ocfs2_dentry_lock { - /* Use count of dentry lock */ unsigned int dl_count; - union { - /* Linked list of dentry locks to release */ - struct ocfs2_dentry_lock *dl_next; - u64 dl_parent_blkno; - }; + u64 dl_parent_blkno; /* * The ocfs2_dentry_lock keeps an inode reference until @@ -49,14 +28,9 @@ struct ocfs2_dentry_lock { int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode, u64 parent_blkno); -extern spinlock_t dentry_list_lock; - void ocfs2_dentry_lock_put(struct ocfs2_super *osb, struct ocfs2_dentry_lock *dl); -void ocfs2_drop_dl_inodes(struct work_struct *work); -void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb); - struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno, int skip_unhashed); diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index eb760d8acd50..2785ff245e79 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1,6 +1,5 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * dir.c * * Creates, reads, walks and deletes directory-nodes @@ -18,22 +17,7 @@ * * linux/fs/minix/dir.c * - * Copyright (C) 1991, 1992 Linux Torvalds - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. + * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/fs.h> @@ -42,6 +26,7 @@ #include <linux/highmem.h> #include <linux/quotaops.h> #include <linux/sort.h> +#include <linux/iversion.h> #include <cluster/masklog.h> @@ -68,10 +53,6 @@ #define NAMEI_RA_BLOCKS 4 #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) -static unsigned char ocfs2_filetype_table[] = { - DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK -}; - static int ocfs2_do_extend_dir(struct super_block *sb, handle_t *handle, struct inode *dir, @@ -313,13 +294,29 @@ out: * bh passed here can be an inode block or a dir data block, depending * on the inode inline data flag. */ -static int ocfs2_check_dir_entry(struct inode * dir, - struct ocfs2_dir_entry * de, - struct buffer_head * bh, +static int ocfs2_check_dir_entry(struct inode *dir, + struct ocfs2_dir_entry *de, + struct buffer_head *bh, + char *buf, + unsigned int size, unsigned long offset) { const char *error_msg = NULL; - const int rlen = le16_to_cpu(de->rec_len); + unsigned long next_offset; + int rlen; + + if (offset > size - OCFS2_DIR_REC_LEN(1)) { + /* Dirent is (maybe partially) beyond the buffer + * boundaries so touching 'de' members is unsafe. + */ + mlog(ML_ERROR, "directory entry (#%llu: offset=%lu) " + "too close to end or out-of-bounds", + (unsigned long long)OCFS2_I(dir)->ip_blkno, offset); + return 0; + } + + rlen = le16_to_cpu(de->rec_len); + next_offset = ((char *) de - buf) + rlen; if (unlikely(rlen < OCFS2_DIR_REC_LEN(1))) error_msg = "rec_len is smaller than minimal"; @@ -327,9 +324,11 @@ static int ocfs2_check_dir_entry(struct inode * dir, error_msg = "rec_len % 4 != 0"; else if (unlikely(rlen < OCFS2_DIR_REC_LEN(de->name_len))) error_msg = "rec_len is too small for name_len"; - else if (unlikely( - ((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)) - error_msg = "directory entry across blocks"; + else if (unlikely(next_offset > size)) + error_msg = "directory entry overrun"; + else if (unlikely(next_offset > size - OCFS2_DIR_REC_LEN(1)) && + next_offset != size) + error_msg = "directory entry too close to end"; if (unlikely(error_msg != NULL)) mlog(ML_ERROR, "bad entry in directory #%llu: %s - " @@ -371,16 +370,17 @@ static inline int ocfs2_search_dirblock(struct buffer_head *bh, de_buf = first_de; dlimit = de_buf + bytes; - while (de_buf < dlimit) { + while (de_buf < dlimit - OCFS2_DIR_MEMBER_LEN) { /* this code is executed quadratically often */ /* do minimal checking `by hand' */ de = (struct ocfs2_dir_entry *) de_buf; - if (de_buf + namelen <= dlimit && + if (de->name + namelen <= dlimit && ocfs2_match(namelen, name, de)) { /* found a match - just to be sure, do a full check */ - if (!ocfs2_check_dir_entry(dir, de, bh, offset)) { + if (!ocfs2_check_dir_entry(dir, de, bh, first_de, + bytes, offset)) { ret = -1; goto bail; } @@ -480,33 +480,26 @@ static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh) trailer = ocfs2_trailer_from_bh(bh, dir->i_sb); if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) { - rc = -EINVAL; - ocfs2_error(dir->i_sb, - "Invalid dirblock #%llu: " - "signature = %.*s\n", - (unsigned long long)bh->b_blocknr, 7, - trailer->db_signature); + rc = ocfs2_error(dir->i_sb, + "Invalid dirblock #%llu: signature = %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + trailer->db_signature); goto out; } if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) { - rc = -EINVAL; - ocfs2_error(dir->i_sb, - "Directory block #%llu has an invalid " - "db_blkno of %llu", - (unsigned long long)bh->b_blocknr, - (unsigned long long)le64_to_cpu(trailer->db_blkno)); + rc = ocfs2_error(dir->i_sb, + "Directory block #%llu has an invalid db_blkno of %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(trailer->db_blkno)); goto out; } if (le64_to_cpu(trailer->db_parent_dinode) != OCFS2_I(dir)->ip_blkno) { - rc = -EINVAL; - ocfs2_error(dir->i_sb, - "Directory block #%llu on dinode " - "#%llu has an invalid parent_dinode " - "of %llu", - (unsigned long long)bh->b_blocknr, - (unsigned long long)OCFS2_I(dir)->ip_blkno, - (unsigned long long)le64_to_cpu(trailer->db_blkno)); + rc = ocfs2_error(dir->i_sb, + "Directory block #%llu on dinode #%llu has an invalid parent_dinode of %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)le64_to_cpu(trailer->db_blkno)); goto out; } out: @@ -604,14 +597,13 @@ static int ocfs2_validate_dx_root(struct super_block *sb, } if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) { - ocfs2_error(sb, - "Dir Index Root # %llu has bad signature %.*s", - (unsigned long long)le64_to_cpu(dx_root->dr_blkno), - 7, dx_root->dr_signature); - return -EINVAL; + ret = ocfs2_error(sb, + "Dir Index Root # %llu has bad signature %.*s\n", + (unsigned long long)le64_to_cpu(dx_root->dr_blkno), + 7, dx_root->dr_signature); } - return 0; + return ret; } static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di, @@ -648,12 +640,11 @@ static int ocfs2_validate_dx_leaf(struct super_block *sb, } if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) { - ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s", - 7, dx_leaf->dl_signature); - return -EROFS; + ret = ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s\n", + 7, dx_leaf->dl_signature); } - return 0; + return ret; } static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno, @@ -702,7 +693,7 @@ static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen, int ra_ptr = 0; /* Current index into readahead buffer */ int num = 0; - int nblocks, i, err; + int nblocks, i; sb = dir->i_sb; @@ -734,7 +725,7 @@ restart: num++; bh = NULL; - err = ocfs2_read_dir_block(dir, b++, &bh, + ocfs2_read_dir_block(dir, b++, &bh, OCFS2_BH_READAHEAD); bh_use[ra_max] = bh; } @@ -744,7 +735,7 @@ restart: if (ocfs2_read_dir_block(dir, block, &bh, 0)) { /* read error, skip block & hope for the best. * ocfs2_read_dir_block() has released the bh. */ - ocfs2_error(dir->i_sb, "reading directory %llu, " + mlog(ML_ERROR, "reading directory %llu, " "offset %lu\n", (unsigned long long)OCFS2_I(dir)->ip_blkno, block); @@ -800,6 +791,14 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode, struct ocfs2_extent_block *eb; struct ocfs2_extent_rec *rec = NULL; + if (le16_to_cpu(el->l_count) != + ocfs2_extent_recs_per_dx_root(inode->i_sb)) { + ret = ocfs2_error(inode->i_sb, + "Inode %lu has invalid extent list length %u\n", + inode->i_ino, le16_to_cpu(el->l_count)); + goto out; + } + if (el->l_tree_depth) { ret = ocfs2_find_leaf(INODE_CACHE(inode), el, major_hash, &eb_bh); @@ -812,15 +811,22 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode, el = &eb->h_list; if (el->l_tree_depth) { - ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "btree tree block %llu\n", inode->i_ino, - (unsigned long long)eb_bh->b_blocknr); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in btree tree block %llu\n", + inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); goto out; } } + if (le16_to_cpu(el->l_next_free_rec) == 0) { + ret = ocfs2_error(inode->i_sb, + "Inode %lu has empty extent list at depth %u\n", + inode->i_ino, + le16_to_cpu(el->l_tree_depth)); + goto out; + } + found = 0; for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { rec = &el->l_recs[i]; @@ -832,11 +838,11 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode, } if (!found) { - ocfs2_error(inode->i_sb, "Inode %lu has bad extent " - "record (%u, %u, 0) in btree", inode->i_ino, - le32_to_cpu(rec->e_cpos), - ocfs2_rec_clusters(el, rec)); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, + "Inode %lu has bad extent record (%u, %u, 0) in btree\n", + inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); goto out; } @@ -875,9 +881,9 @@ static int ocfs2_dx_dir_lookup(struct inode *inode, u64 *ret_phys_blkno) { int ret = 0; - unsigned int cend, uninitialized_var(clen); - u32 uninitialized_var(cpos); - u64 uninitialized_var(blkno); + unsigned int cend, clen; + u32 cpos; + u64 blkno; u32 name_hash = hinfo->major_hash; ret = ocfs2_dx_dir_lookup_rec(inode, el, name_hash, &cpos, &blkno, @@ -921,7 +927,7 @@ static int ocfs2_dx_dir_search(const char *name, int namelen, struct ocfs2_dir_lookup_result *res) { int ret, i, found; - u64 uninitialized_var(phys); + u64 phys; struct buffer_head *dx_leaf_bh = NULL; struct ocfs2_dx_leaf *dx_leaf; struct ocfs2_dx_entry *dx_entry = NULL; @@ -1088,26 +1094,39 @@ int ocfs2_find_entry(const char *name, int namelen, { struct buffer_head *bh; struct ocfs2_dir_entry *res_dir = NULL; + int ret = 0; if (ocfs2_dir_indexed(dir)) return ocfs2_find_entry_dx(name, namelen, dir, lookup); + if (unlikely(i_size_read(dir) <= 0)) { + ret = -EFSCORRUPTED; + mlog_errno(ret); + goto out; + } /* * The unindexed dir code only uses part of the lookup * structure, so there's no reason to push it down further * than this. */ - if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + if (unlikely(i_size_read(dir) > dir->i_sb->s_blocksize)) { + ret = -EFSCORRUPTED; + mlog_errno(ret); + goto out; + } bh = ocfs2_find_entry_id(name, namelen, dir, &res_dir); - else + } else { bh = ocfs2_find_entry_el(name, namelen, dir, &res_dir); + } if (bh == NULL) return -ENOENT; lookup->dl_leaf_bh = bh; lookup->dl_entry = res_dir; - return 0; +out: + return ret; } /* @@ -1167,7 +1186,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir, pde = NULL; de = (struct ocfs2_dir_entry *) first_de; while (i < bytes) { - if (!ocfs2_check_dir_entry(dir, de, bh, i)) { + if (!ocfs2_check_dir_entry(dir, de, bh, first_de, bytes, i)) { status = -EIO; mlog_errno(status); goto bail; @@ -1184,7 +1203,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir, le16_add_cpu(&pde->rec_len, le16_to_cpu(de->rec_len)); de->inode = 0; - dir->i_version++; + inode_inc_iversion(dir); ocfs2_journal_dirty(handle, bh); goto bail; } @@ -1617,14 +1636,11 @@ int __ocfs2_add_entry(handle_t *handle, struct ocfs2_dir_entry *de, *de1; struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_fe_bh->b_data; struct super_block *sb = dir->i_sb; - int retval, status; + int retval; unsigned int size = sb->s_blocksize; struct buffer_head *insert_bh = lookup->dl_leaf_bh; char *data_start = insert_bh->b_data; - if (!namelen) - return -EINVAL; - if (ocfs2_dir_indexed(dir)) { struct buffer_head *bh; @@ -1667,7 +1683,8 @@ int __ocfs2_add_entry(handle_t *handle, /* These checks should've already been passed by the * prepare function, but I guess we can leave them * here anyway. */ - if (!ocfs2_check_dir_entry(dir, de, insert_bh, offset)) { + if (!ocfs2_check_dir_entry(dir, de, insert_bh, data_start, + size, offset)) { retval = -ENOENT; goto bail; } @@ -1687,7 +1704,8 @@ int __ocfs2_add_entry(handle_t *handle, offset, ocfs2_dir_trailer_blk_off(dir->i_sb)); if (ocfs2_dirent_would_fit(de, rec_len)) { - dir->i_mtime = dir->i_ctime = CURRENT_TIME; + inode_set_mtime_to_ts(dir, + inode_set_ctime_current(dir)); retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); if (retval < 0) { mlog_errno(retval); @@ -1695,25 +1713,25 @@ int __ocfs2_add_entry(handle_t *handle, } if (insert_bh == parent_fe_bh) - status = ocfs2_journal_access_di(handle, + retval = ocfs2_journal_access_di(handle, INODE_CACHE(dir), insert_bh, OCFS2_JOURNAL_ACCESS_WRITE); else { - status = ocfs2_journal_access_db(handle, + retval = ocfs2_journal_access_db(handle, INODE_CACHE(dir), insert_bh, OCFS2_JOURNAL_ACCESS_WRITE); - if (ocfs2_dir_indexed(dir)) { - status = ocfs2_dx_dir_insert(dir, + if (!retval && ocfs2_dir_indexed(dir)) + retval = ocfs2_dx_dir_insert(dir, handle, lookup); - if (status) { - mlog_errno(status); - goto bail; - } - } + } + + if (retval) { + mlog_errno(retval); + goto bail; } /* By now the buffer is marked for journaling */ @@ -1727,7 +1745,7 @@ int __ocfs2_add_entry(handle_t *handle, de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len)); de = de1; } - de->file_type = OCFS2_FT_UNKNOWN; + de->file_type = FT_UNKNOWN; if (blkno) { de->inode = cpu_to_le64(blkno); ocfs2_set_de_type(de, inode->i_mode); @@ -1739,7 +1757,7 @@ int __ocfs2_add_entry(handle_t *handle, if (ocfs2_dir_indexed(dir)) ocfs2_recalc_free_list(dir, handle, lookup); - dir->i_version++; + inode_inc_iversion(dir); ocfs2_journal_dirty(handle, insert_bh); retval = 0; goto bail; @@ -1785,7 +1803,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode, * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the block * to make sure. */ - if (*f_version != inode->i_version) { + if (!inode_eq_iversion(inode, *f_version)) { for (i = 0; i < i_size_read(inode) && i < offset; ) { de = (struct ocfs2_dir_entry *) (data->id_data + i); @@ -1801,24 +1819,21 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode, i += le16_to_cpu(de->rec_len); } ctx->pos = offset = i; - *f_version = inode->i_version; + *f_version = inode_query_iversion(inode); } de = (struct ocfs2_dir_entry *) (data->id_data + ctx->pos); - if (!ocfs2_check_dir_entry(inode, de, di_bh, ctx->pos)) { + if (!ocfs2_check_dir_entry(inode, de, di_bh, (char *)data->id_data, + i_size_read(inode), ctx->pos)) { /* On error, skip the f_pos to the end. */ ctx->pos = i_size_read(inode); break; } offset += le16_to_cpu(de->rec_len); if (le64_to_cpu(de->inode)) { - unsigned char d_type = DT_UNKNOWN; - - if (de->file_type < OCFS2_FT_MAX) - d_type = ocfs2_filetype_table[de->file_type]; - if (!dir_emit(ctx, de->name, de->name_len, - le64_to_cpu(de->inode), d_type)) + le64_to_cpu(de->inode), + fs_ftype_to_dtype(de->file_type))) goto out; } ctx->pos += le16_to_cpu(de->rec_len); @@ -1879,7 +1894,7 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode, * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the block * to make sure. */ - if (*f_version != inode->i_version) { + if (!inode_eq_iversion(inode, *f_version)) { for (i = 0; i < sb->s_blocksize && i < offset; ) { de = (struct ocfs2_dir_entry *) (bh->b_data + i); /* It's too expensive to do a full @@ -1896,28 +1911,24 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode, offset = i; ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) | offset; - *f_version = inode->i_version; + *f_version = inode_query_iversion(inode); } while (ctx->pos < i_size_read(inode) && offset < sb->s_blocksize) { de = (struct ocfs2_dir_entry *) (bh->b_data + offset); - if (!ocfs2_check_dir_entry(inode, de, bh, offset)) { + if (!ocfs2_check_dir_entry(inode, de, bh, bh->b_data, + sb->s_blocksize, offset)) { /* On error, skip the f_pos to the next block. */ ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1; - brelse(bh); - continue; + break; } if (le64_to_cpu(de->inode)) { - unsigned char d_type = DT_UNKNOWN; - - if (de->file_type < OCFS2_FT_MAX) - d_type = ocfs2_filetype_table[de->file_type]; if (!dir_emit(ctx, de->name, de->name_len, le64_to_cpu(de->inode), - d_type)) { + fs_ftype_to_dtype(de->file_type))) { brelse(bh); return 0; } @@ -1950,7 +1961,7 @@ static int ocfs2_dir_foreach_blk(struct inode *inode, u64 *f_version, */ int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx) { - u64 version = inode->i_version; + u64 version = inode_query_iversion(inode); ocfs2_dir_foreach_blk(inode, &version, ctx, true); return 0; } @@ -1963,11 +1974,12 @@ int ocfs2_readdir(struct file *file, struct dir_context *ctx) { int error = 0; struct inode *inode = file_inode(file); + struct ocfs2_file_private *fp = file->private_data; int lock_level = 0; trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno); - error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level); + error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level, 1); if (lock_level && error >= 0) { /* We release EX lock which used to update atime * and get PR lock again to reduce contention @@ -1983,7 +1995,7 @@ int ocfs2_readdir(struct file *file, struct dir_context *ctx) goto bail_nolock; } - error = ocfs2_dir_foreach_blk(inode, &file->f_version, ctx, false); + error = ocfs2_dir_foreach_blk(inode, &fp->cookie, ctx, false); ocfs2_inode_unlock(inode, lock_level); if (error) @@ -1995,7 +2007,7 @@ bail_nolock: } /* - * NOTE: this should always be called with parent dir i_mutex taken. + * NOTE: this should always be called with parent dir i_rwsem taken. */ int ocfs2_find_files_on_disk(const char *name, int namelen, @@ -2040,29 +2052,30 @@ int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name, * * Return 0 if the name does not exist * Return -EEXIST if the directory contains the name + * Return -EFSCORRUPTED if found corruption * - * Callers should have i_mutex + a cluster lock on dir + * Callers should have i_rwsem + a cluster lock on dir */ int ocfs2_check_dir_for_entry(struct inode *dir, const char *name, int namelen) { - int ret; + int ret = 0; struct ocfs2_dir_lookup_result lookup = { NULL, }; trace_ocfs2_check_dir_for_entry( (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name); - ret = -EEXIST; - if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0) - goto bail; + ret = ocfs2_find_entry(name, namelen, dir, &lookup); + if (ret == 0) { + ret = -EEXIST; + mlog_errno(ret); + } else if (ret == -ENOENT) { + ret = 0; + } - ret = 0; -bail: ocfs2_free_dir_lookup_result(&lookup); - if (ret) - mlog_errno(ret); return ret; } @@ -2073,10 +2086,12 @@ struct ocfs2_empty_dir_priv { unsigned seen_other; unsigned dx_dir; }; -static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len, - loff_t pos, u64 ino, unsigned type) +static bool ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name, + int name_len, loff_t pos, u64 ino, + unsigned type) { - struct ocfs2_empty_dir_priv *p = priv; + struct ocfs2_empty_dir_priv *p = + container_of(ctx, struct ocfs2_empty_dir_priv, ctx); /* * Check the positions of "." and ".." records to be sure @@ -2091,7 +2106,7 @@ static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len, */ if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) { p->seen_dot = 1; - return 0; + return true; } if (name_len == 2 && !strncmp("..", name, 2) && @@ -2099,13 +2114,13 @@ static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len, p->seen_dot_dot = 1; if (p->dx_dir && p->seen_dot) - return 1; + return false; - return 0; + return true; } p->seen_other = 1; - return 1; + return false; } static int ocfs2_empty_dir_dx(struct inode *inode, @@ -2153,11 +2168,9 @@ int ocfs2_empty_dir(struct inode *inode) { int ret; struct ocfs2_empty_dir_priv priv = { - .ctx.actor = ocfs2_empty_dir_filldir + .ctx.actor = ocfs2_empty_dir_filldir, }; - memset(&priv, 0, sizeof(priv)); - if (ocfs2_dir_indexed(inode)) { ret = ocfs2_empty_dir_dx(inode, &priv); if (ret) @@ -2351,7 +2364,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb, dx_root_bh = sb_getblk(osb->sb, dr_blkno); if (dx_root_bh == NULL) { - ret = -EIO; + ret = -ENOMEM; goto out; } ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dx_root_bh); @@ -2424,7 +2437,7 @@ static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb, for (i = 0; i < num_dx_leaves; i++) { bh = sb_getblk(osb->sb, start_blk + i); if (bh == NULL) { - ret = -EIO; + ret = -ENOMEM; goto out; } dx_leaves[i] = bh; @@ -2931,7 +2944,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off); dirdata_bh = sb_getblk(sb, blkno); if (!dirdata_bh) { - ret = -EIO; + ret = -ENOMEM; mlog_errno(ret); goto out_commit; } @@ -2959,6 +2972,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, ocfs2_init_dir_trailer(dir, dirdata_bh, i); } + ocfs2_update_inode_fsync_trans(handle, dir, 1); ocfs2_journal_dirty(handle, dirdata_bh); if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { @@ -3002,11 +3016,12 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, ocfs2_dinode_new_extent_list(dir, di); i_size_write(dir, sb->s_blocksize); - dir->i_mtime = dir->i_ctime = CURRENT_TIME; + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); di->i_size = cpu_to_le64(sb->s_blocksize); - di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec); - di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec); + di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(dir)); + di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(dir)); + ocfs2_update_inode_fsync_trans(handle, dir, 1); /* * This should never fail as our extent list is empty and all @@ -3082,7 +3097,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, * We need to return the correct block within the * cluster which should hold our entry. */ - off = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), + off = ocfs2_dx_dir_hash_idx(osb, &lookup->dl_hinfo); get_bh(dx_leaves[off]); lookup->dl_dx_leaf_bh = dx_leaves[off]; @@ -3161,7 +3176,7 @@ static int ocfs2_do_extend_dir(struct super_block *sb, *new_bh = sb_getblk(sb, p_blkno); if (!*new_bh) { - status = -EIO; + status = -ENOMEM; mlog_errno(status); goto bail; } @@ -3260,7 +3275,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, spin_unlock(&OCFS2_I(dir)->ip_lock); ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir), parent_fe_bh); - num_free_extents = ocfs2_num_free_extents(osb, &et); + num_free_extents = ocfs2_num_free_extents(&et); if (num_free_extents < 0) { status = num_free_extents; mlog_errno(status); @@ -3286,7 +3301,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, if (ocfs2_dir_resv_allowed(osb)) data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv; - credits = ocfs2_calc_extend_credits(sb, el, 1); + credits = ocfs2_calc_extend_credits(sb, el); } else { spin_unlock(&OCFS2_I(dir)->ip_lock); credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS; @@ -3340,6 +3355,7 @@ do_extend: } else { de->rec_len = cpu_to_le16(sb->s_blocksize); } + ocfs2_update_inode_fsync_trans(handle, dir, 1); ocfs2_journal_dirty(handle, new_bh); dir_i_size += dir->i_sb->s_blocksize; @@ -3379,9 +3395,9 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh, struct super_block *sb = dir->i_sb; struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct ocfs2_dir_entry *de, *last_de = NULL; - char *de_buf, *limit; + char *first_de, *de_buf, *limit; unsigned long offset = 0; - unsigned int rec_len, new_rec_len, free_space = dir->i_sb->s_blocksize; + unsigned int rec_len, new_rec_len, free_space; /* * This calculates how many free bytes we'd have in block zero, should @@ -3392,14 +3408,16 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh, else free_space = dir->i_sb->s_blocksize - i_size_read(dir); - de_buf = di->id2.i_data.id_data; + first_de = di->id2.i_data.id_data; + de_buf = first_de; limit = de_buf + i_size_read(dir); rec_len = OCFS2_DIR_REC_LEN(namelen); while (de_buf < limit) { de = (struct ocfs2_dir_entry *)de_buf; - if (!ocfs2_check_dir_entry(dir, de, di_bh, offset)) { + if (!ocfs2_check_dir_entry(dir, de, di_bh, first_de, + i_size_read(dir), offset)) { ret = -ENOENT; goto out; } @@ -3426,6 +3444,14 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh, offset += le16_to_cpu(de->rec_len); } + if (!last_de) { + ret = ocfs2_error(sb, "Directory entry (#%llu: size=%lld) " + "is unexpectedly short", + (unsigned long long)OCFS2_I(dir)->ip_blkno, + i_size_read(dir)); + goto out; + } + /* * We're going to require expansion of the directory - figure * out how many blocks we'll need so that a place for the @@ -3453,10 +3479,8 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name, int blocksize = dir->i_sb->s_blocksize; status = ocfs2_read_dir_block(dir, 0, &bh, 0); - if (status) { - mlog_errno(status); + if (status) goto bail; - } rec_len = OCFS2_DIR_REC_LEN(namelen); offset = 0; @@ -3477,14 +3501,14 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name, status = ocfs2_read_dir_block(dir, offset >> sb->s_blocksize_bits, &bh, 0); - if (status) { - mlog_errno(status); + if (status) goto bail; - } + /* move to next block */ de = (struct ocfs2_dir_entry *) bh->b_data; } - if (!ocfs2_check_dir_entry(dir, de, bh, offset)) { + if (!ocfs2_check_dir_entry(dir, de, bh, bh->b_data, blocksize, + offset)) { status = -ENOENT; goto bail; } @@ -3510,7 +3534,6 @@ next: de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len)); } - status = 0; bail: brelse(bh); if (status) @@ -3543,19 +3566,6 @@ static int dx_leaf_sort_cmp(const void *a, const void *b) return 0; } -static void dx_leaf_sort_swap(void *a, void *b, int size) -{ - struct ocfs2_dx_entry *entry1 = a; - struct ocfs2_dx_entry *entry2 = b; - struct ocfs2_dx_entry tmp; - - BUG_ON(size != sizeof(*entry1)); - - tmp = *entry1; - *entry1 = *entry2; - *entry2 = tmp; -} - static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf) { struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list; @@ -3679,7 +3689,7 @@ static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash, int i, j, num_used; u32 major_hash; struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf; - struct ocfs2_dx_entry_list *orig_list, *new_list, *tmp_list; + struct ocfs2_dx_entry_list *orig_list, *tmp_list; struct ocfs2_dx_entry *dx_entry; tmp_list = &tmp_dx_leaf->dl_list; @@ -3688,7 +3698,6 @@ static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash, orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data; orig_list = &orig_dx_leaf->dl_list; new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data; - new_list = &new_dx_leaf->dl_list; num_used = le16_to_cpu(orig_list->de_num_used); @@ -3716,9 +3725,9 @@ static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash, static int ocfs2_dx_dir_rebalance_credits(struct ocfs2_super *osb, struct ocfs2_dx_root_block *dx_root) { - int credits = ocfs2_clusters_to_blocks(osb->sb, 2); + int credits = ocfs2_clusters_to_blocks(osb->sb, 3); - credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list, 1); + credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list); credits += ocfs2_quota_trans_credits(osb->sb); return credits; } @@ -3817,7 +3826,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir, */ sort(dx_leaf->dl_list.de_entries, num_used, sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp, - dx_leaf_sort_swap); + NULL); ocfs2_journal_dirty(handle, dx_leaf_bh); @@ -3898,6 +3907,7 @@ out_commit: dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(dir->i_sb, 1)); + ocfs2_update_inode_fsync_trans(handle, dir, 1); ocfs2_commit_trans(osb, handle); out: @@ -4123,10 +4133,15 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir, } dx_root->dr_flags &= ~OCFS2_DX_FLAG_INLINE; - memset(&dx_root->dr_list, 0, osb->sb->s_blocksize - - offsetof(struct ocfs2_dx_root_block, dr_list)); + + dx_root->dr_list.l_tree_depth = 0; dx_root->dr_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb)); + dx_root->dr_list.l_next_free_rec = 0; + memset(&dx_root->dr_list.l_recs, 0, + osb->sb->s_blocksize - + (offsetof(struct ocfs2_dx_root_block, dr_list) + + offsetof(struct ocfs2_extent_list, l_recs))); /* This should never fail considering we start with an empty * dx_root. */ @@ -4136,6 +4151,7 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir, mlog_errno(ret); did_quota = 0; + ocfs2_update_inode_fsync_trans(handle, dir, 1); ocfs2_journal_dirty(handle, dx_root_bh); out_commit: @@ -4288,12 +4304,6 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, trace_ocfs2_prepare_dir_for_insert( (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen); - if (!namelen) { - ret = -EINVAL; - mlog_errno(ret); - goto out; - } - /* * Do this up front to reduce confusion. * @@ -4376,7 +4386,7 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir, mlog_errno(ret); goto out; } - mutex_lock(&dx_alloc_inode->i_mutex); + inode_lock(dx_alloc_inode); ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1); if (ret) { @@ -4403,6 +4413,7 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir, di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); spin_unlock(&OCFS2_I(dir)->ip_lock); di->i_dx_root = cpu_to_le64(0ULL); + ocfs2_update_inode_fsync_trans(handle, dir, 1); ocfs2_journal_dirty(handle, di_bh); @@ -4424,7 +4435,7 @@ out_unlock: ocfs2_inode_unlock(dx_alloc_inode, 1); out_mutex: - mutex_unlock(&dx_alloc_inode->i_mutex); + inode_unlock(dx_alloc_inode); brelse(dx_alloc_bh); out: iput(dx_alloc_inode); @@ -4434,9 +4445,9 @@ out: int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh) { int ret; - unsigned int uninitialized_var(clen); - u32 major_hash = UINT_MAX, p_cpos, uninitialized_var(cpos); - u64 uninitialized_var(blkno); + unsigned int clen; + u32 major_hash = UINT_MAX, p_cpos, cpos; + u64 blkno; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); struct buffer_head *dx_root_bh = NULL; struct ocfs2_dx_root_block *dx_root; @@ -4473,7 +4484,7 @@ int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh) p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno); ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 0, - &dealloc, 0); + &dealloc, 0, false); if (ret) { mlog_errno(ret); goto out; diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h index f0344b75b14d..4b9f5a12c7d2 100644 --- a/fs/ocfs2/dir.h +++ b/fs/ocfs2/dir.h @@ -1,26 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * dir.h * * Function prototypes * * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #ifndef OCFS2_DIR_H @@ -72,7 +56,7 @@ static inline int ocfs2_add_entry(handle_t *handle, struct buffer_head *parent_fe_bh, struct ocfs2_dir_lookup_result *lookup) { - return __ocfs2_add_entry(handle, dentry->d_parent->d_inode, + return __ocfs2_add_entry(handle, d_inode(dentry->d_parent), dentry->d_name.name, dentry->d_name.len, inode, blkno, parent_fe_bh, lookup); } diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile index c8a044efbb15..5e700b45d32d 100644 --- a/fs/ocfs2/dlm/Makefile +++ b/fs/ocfs2/dlm/Makefile @@ -1,7 +1,5 @@ -ccflags-y := -Ifs/ocfs2 - +# SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ - dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o - + dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o diff --git a/fs/ocfs2/dlm/dlmapi.h b/fs/ocfs2/dlm/dlmapi.h index 3cfa114aa391..1969db8ffa9c 100644 --- a/fs/ocfs2/dlm/dlmapi.h +++ b/fs/ocfs2/dlm/dlmapi.h @@ -1,27 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * dlmapi.h * * externally exported dlm interfaces * * Copyright (C) 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * */ #ifndef DLMAPI_H @@ -79,8 +62,6 @@ enum dlm_status { DLM_MAXSTATS, /* 41: upper limit for return code validation */ }; -/* for pretty-printing dlm_status error messages */ -const char *dlm_errmsg(enum dlm_status err); /* for pretty-printing dlm_status error names */ const char *dlm_errname(enum dlm_status err); @@ -137,7 +118,7 @@ struct dlm_lockstatus { #define LKM_VALBLK 0x00000100 /* lock value block request */ #define LKM_NOQUEUE 0x00000200 /* non blocking request */ #define LKM_CONVERT 0x00000400 /* conversion request */ -#define LKM_NODLCKWT 0x00000800 /* this lock wont deadlock (U) */ +#define LKM_NODLCKWT 0x00000800 /* this lock won't deadlock (U) */ #define LKM_UNLOCK 0x00001000 /* deallocate this lock */ #define LKM_CANCEL 0x00002000 /* cancel conversion request */ #define LKM_DEQALL 0x00004000 /* remove all locks held by proc (U) */ diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index fbec0be62326..c681ba957932 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -1,27 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * dlmast.c * * AST and BAST functionality for local and remote nodes * * Copyright (C) 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * */ @@ -38,15 +21,15 @@ #include <linux/spinlock.h> -#include "cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" #define MLOG_MASK_PREFIX ML_DLM -#include "cluster/masklog.h" +#include "../cluster/masklog.h" static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, struct dlm_lock *lock); @@ -180,16 +163,6 @@ void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) spin_unlock(&lock->spinlock); } -void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) -{ - BUG_ON(!dlm); - BUG_ON(!lock); - - spin_lock(&dlm->ast_lock); - __dlm_queue_bast(dlm, lock); - spin_unlock(&dlm->ast_lock); -} - static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, struct dlm_lock *lock) { @@ -224,14 +197,12 @@ void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, struct dlm_lock *lock) { dlm_astlockfunc_t *fn; - struct dlm_lockstatus *lksb; mlog(0, "%s: res %.*s, lock %u:%llu, Local AST\n", dlm->name, res->lockname.len, res->lockname.name, dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie))); - lksb = lock->lksb; fn = lock->ast; BUG_ON(lock->ml.node != dlm->node_num); @@ -292,7 +263,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, struct dlm_lock *lock = NULL; struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf; char *name; - struct list_head *iter, *head=NULL; + struct list_head *head = NULL; __be64 cookie; u32 flags; u8 node; @@ -373,8 +344,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, /* try convert queue for both ast/bast */ head = &res->converting; lock = NULL; - list_for_each(iter, head) { - lock = list_entry (iter, struct dlm_lock, list); + list_for_each_entry(lock, head, list) { if (lock->ml.cookie == cookie) goto do_ast; } @@ -385,10 +355,13 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, else head = &res->granted; - list_for_each(iter, head) { - lock = list_entry (iter, struct dlm_lock, list); - if (lock->ml.cookie == cookie) + list_for_each_entry(lock, head, list) { + /* if lock is found but unlock is pending ignore the bast */ + if (lock->ml.cookie == cookie) { + if (lock->unlock_pending) + break; goto do_ast; + } } mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, " diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index de854cca12a2..20f790a47484 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -1,25 +1,8 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * dlmcommon.h * * Copyright (C) 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * */ #ifndef DLMCOMMON_H @@ -32,10 +15,7 @@ #define DLM_LOCKID_NAME_MAX 32 -#define DLM_DOMAIN_NAME_MAX_LEN 255 #define DLM_LOCK_RES_OWNER_UNKNOWN O2NM_MAX_NODES -#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes -#define DLM_THREAD_MS 200 // flush at least every 200 ms #define DLM_HASH_SIZE_DEFAULT (1 << 17) #if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE @@ -47,7 +27,7 @@ #define DLM_HASH_BUCKETS (DLM_HASH_PAGES * DLM_BUCKETS_PER_PAGE) /* Intended to make it easier for us to switch out hash functions */ -#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l) +#define dlm_lockid_hash(_n, _l) full_name_hash(NULL, _n, _l) enum dlm_mle_type { DLM_MLE_BLOCK = 0, @@ -108,7 +88,6 @@ static inline int dlm_is_recovery_lock(const char *lock_name, int name_len) struct dlm_recovery_ctxt { struct list_head resources; - struct list_head received; struct list_head node_data; u8 new_master; u8 dead_node; @@ -141,6 +120,7 @@ struct dlm_ctxt u8 node_num; u32 key; u8 joining_node; + u8 migrate_done; /* set to 1 means node has migrated all lock resources */ wait_queue_head_t dlm_join_events; unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; @@ -157,7 +137,6 @@ struct dlm_ctxt atomic_t res_tot_count; atomic_t res_cur_count; - struct dlm_debug_ctxt *dlm_debug_ctxt; struct dentry *dlm_debugfs_subroot; /* NOTE: Next three are protected by dlm_domain_lock */ @@ -283,6 +262,7 @@ static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm, #define DLM_LOCK_RES_DROPPING_REF 0x00000040 #define DLM_LOCK_RES_BLOCK_DIRTY 0x00001000 #define DLM_LOCK_RES_SETREF_INPROG 0x00002000 +#define DLM_LOCK_RES_RECOVERY_WAITING 0x00004000 /* max milliseconds to wait to sync up a network failure with a node death */ #define DLM_NODE_DEATH_WAIT_MAX (5 * 1000) @@ -332,6 +312,7 @@ struct dlm_lock_resource u16 state; char lvb[DLM_LVB_LEN]; unsigned int inflight_locks; + unsigned int inflight_assert_workers; unsigned long refmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; }; @@ -376,17 +357,6 @@ struct dlm_lock lksb_kernel_allocated:1; }; - -#define DLM_LKSB_UNUSED1 0x01 -#define DLM_LKSB_PUT_LVB 0x02 -#define DLM_LKSB_GET_LVB 0x04 -#define DLM_LKSB_UNUSED2 0x08 -#define DLM_LKSB_UNUSED3 0x10 -#define DLM_LKSB_UNUSED4 0x20 -#define DLM_LKSB_UNUSED5 0x40 -#define DLM_LKSB_UNUSED6 0x80 - - enum dlm_lockres_list { DLM_GRANTED_LIST = 0, DLM_CONVERTING_LIST = 1, @@ -462,6 +432,7 @@ enum { DLM_QUERY_REGION = 519, DLM_QUERY_NODEINFO = 520, DLM_BEGIN_EXIT_DOMAIN_MSG = 521, + DLM_DEREF_LOCKRES_DONE = 522, }; struct dlm_reco_node_data @@ -556,7 +527,7 @@ struct dlm_master_requery * }; * * from ../cluster/tcp.h - * NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg)) + * O2NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg)) * (roughly 4080 bytes) * and sizeof(dlm_migratable_lockres) = 112 bytes * and sizeof(dlm_migratable_lock) = 16 bytes @@ -588,7 +559,7 @@ struct dlm_migratable_lockres // 48 bytes u8 lvb[DLM_LVB_LEN]; // 112 bytes - struct dlm_migratable_lock ml[0]; // 16 bytes each, begins at byte 112 + struct dlm_migratable_lock ml[]; // 16 bytes each, begins at byte 112 }; #define DLM_MIG_LOCKRES_MAX_LEN \ (sizeof(struct dlm_migratable_lockres) + \ @@ -597,7 +568,7 @@ struct dlm_migratable_lockres /* from above, 128 bytes * for some undetermined future use */ -#define DLM_MIG_LOCKRES_RESERVED (NET_MAX_PAYLOAD_BYTES - \ +#define DLM_MIG_LOCKRES_RESERVED (O2NET_MAX_PAYLOAD_BYTES - \ DLM_MIG_LOCKRES_MAX_LEN) struct dlm_create_lock @@ -625,7 +596,7 @@ struct dlm_convert_lock u8 name[O2NM_MAX_NAME_LEN]; - s8 lvb[0]; + s8 lvb[]; }; #define DLM_CONVERT_LOCK_MAX_LEN (sizeof(struct dlm_convert_lock)+DLM_LVB_LEN) @@ -640,7 +611,7 @@ struct dlm_unlock_lock u8 name[O2NM_MAX_NAME_LEN]; - s8 lvb[0]; + s8 lvb[]; }; #define DLM_UNLOCK_LOCK_MAX_LEN (sizeof(struct dlm_unlock_lock)+DLM_LVB_LEN) @@ -656,7 +627,7 @@ struct dlm_proxy_ast u8 name[O2NM_MAX_NAME_LEN]; - s8 lvb[0]; + s8 lvb[]; }; #define DLM_PROXY_AST_MAX_LEN (sizeof(struct dlm_proxy_ast)+DLM_LVB_LEN) @@ -712,10 +683,6 @@ struct dlm_begin_reco __be32 pad2; }; - -#define BITS_PER_BYTE 8 -#define BITS_TO_BYTES(bits) (((bits)+BITS_PER_BYTE-1)/BITS_PER_BYTE) - struct dlm_query_join_request { u8 node_idx; @@ -793,6 +760,20 @@ struct dlm_deref_lockres u8 name[O2NM_MAX_NAME_LEN]; }; +enum { + DLM_DEREF_RESPONSE_DONE = 0, + DLM_DEREF_RESPONSE_INPROG = 1, +}; + +struct dlm_deref_lockres_done { + u32 pad1; + u16 pad2; + u8 node_idx; + u8 namelen; + + u8 name[O2NM_MAX_NAME_LEN]; +}; + static inline enum dlm_status __dlm_lockres_state_to_status(struct dlm_lock_resource *res) { @@ -800,7 +781,8 @@ __dlm_lockres_state_to_status(struct dlm_lock_resource *res) assert_spin_locked(&res->spinlock); - if (res->state & DLM_LOCK_RES_RECOVERING) + if (res->state & (DLM_LOCK_RES_RECOVERING| + DLM_LOCK_RES_RECOVERY_WAITING)) status = DLM_RECOVERING; else if (res->state & DLM_LOCK_RES_MIGRATING) status = DLM_MIGRATING; @@ -911,8 +893,10 @@ void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); +void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); + void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); -void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void dlm_do_local_ast(struct dlm_ctxt *dlm, @@ -951,13 +935,10 @@ static inline int dlm_send_proxy_ast(struct dlm_ctxt *dlm, void dlm_print_one_lock_resource(struct dlm_lock_resource *res); void __dlm_print_one_lock_resource(struct dlm_lock_resource *res); -u8 dlm_nm_this_node(struct dlm_ctxt *dlm); void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); -int dlm_nm_init(struct dlm_ctxt *dlm); -int dlm_heartbeat_init(struct dlm_ctxt *dlm); void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data); void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data); @@ -976,6 +957,8 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, void dlm_assert_master_post_handler(int status, void *data, void *ret_data); int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, void **ret_data); +int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, void **ret_data); int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, @@ -993,6 +976,8 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data, int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, u8 nodenum, u8 *real_master); +void __dlm_do_purge_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, @@ -1011,13 +996,13 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, /* will exit holding res->spinlock, but may drop in function */ void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags); -void __dlm_wait_on_lockres_flags_set(struct dlm_lock_resource *res, int flags); /* will exit holding res->spinlock, but may drop in function */ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res) { __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS| DLM_LOCK_RES_RECOVERING| + DLM_LOCK_RES_RECOVERY_WAITING| DLM_LOCK_RES_MIGRATING)); } @@ -1079,11 +1064,9 @@ static inline int dlm_lock_compatible(int existing, int request) static inline int dlm_lock_on_list(struct list_head *head, struct dlm_lock *lock) { - struct list_head *iter; struct dlm_lock *tmplock; - list_for_each(iter, head) { - tmplock = list_entry(iter, struct dlm_lock, list); + list_for_each_entry(tmplock, head, list) { if (tmplock == lock) return 1; } @@ -1111,7 +1094,7 @@ static inline enum dlm_status dlm_err_to_dlm_status(int err) static inline void dlm_node_iter_init(unsigned long *map, struct dlm_node_iter *iter) { - memcpy(iter->node_map, map, sizeof(iter->node_map)); + bitmap_copy(iter->node_map, map, O2NM_MAX_NODES); iter->curnode = -1; } diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index 29a886d1e82c..450d46eefab3 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -1,27 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * dlmconvert.c * * underlying calls for lock conversion * * Copyright (C) 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * */ @@ -38,9 +21,9 @@ #include <linux/spinlock.h> -#include "cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" @@ -48,7 +31,7 @@ #include "dlmconvert.h" #define MLOG_MASK_PREFIX ML_DLM -#include "cluster/masklog.h" +#include "../cluster/masklog.h" /* NOTE: __dlmconvert_master is the only function in here that * needs a spinlock held on entry (res->spinlock) and it is the @@ -123,7 +106,6 @@ static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm, int *kick_thread) { enum dlm_status status = DLM_NORMAL; - struct list_head *iter; struct dlm_lock *tmplock=NULL; assert_spin_locked(&res->spinlock); @@ -185,16 +167,14 @@ static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm, /* upconvert from here on */ status = DLM_NORMAL; - list_for_each(iter, &res->granted) { - tmplock = list_entry(iter, struct dlm_lock, list); + list_for_each_entry(tmplock, &res->granted, list) { if (tmplock == lock) continue; if (!dlm_lock_compatible(tmplock->ml.type, type)) goto switch_queues; } - list_for_each(iter, &res->converting) { - tmplock = list_entry(iter, struct dlm_lock, list); + list_for_each_entry(tmplock, &res->converting, list) { if (!dlm_lock_compatible(tmplock->ml.type, type)) goto switch_queues; /* existing conversion requests take precedence */ @@ -215,6 +195,12 @@ grant: if (lock->lksb->flags & DLM_LKSB_PUT_LVB) memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN); + /* + * Move the lock to the tail because it may be the only lock which has + * an invalid lvb. + */ + list_move_tail(&lock->list, &res->granted); + status = DLM_NORMAL; *call_ast = 1; goto unlock_exit; @@ -290,6 +276,19 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, status = DLM_DENIED; goto bail; } + + if (lock->ml.type == type && lock->ml.convert_type == LKM_IVMODE) { + mlog(0, "last convert request returned DLM_RECOVERING, but " + "owner has already queued and sent ast to me. res %.*s, " + "(cookie=%u:%llu, type=%d, conv=%d)\n", + res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + lock->ml.type, lock->ml.convert_type); + status = DLM_NORMAL; + goto bail; + } + res->state |= DLM_LOCK_RES_IN_PROGRESS; /* move lock to local convert queue */ /* do not alter lock refcount. switching lists. */ @@ -318,13 +317,22 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, spin_lock(&res->spinlock); res->state &= ~DLM_LOCK_RES_IN_PROGRESS; - lock->convert_pending = 0; - /* if it failed, move it back to granted queue */ + /* if it failed, move it back to granted queue. + * if master returns DLM_NORMAL and then down before sending ast, + * it may have already been moved to granted queue, reset to + * DLM_RECOVERING and retry convert */ if (status != DLM_NORMAL) { if (status != DLM_NOTQUEUED) dlm_error(status); dlm_revert_pending_convert(res, lock); + } else if (!lock->convert_pending) { + mlog(0, "%s: res %.*s, owner died and lock has been moved back " + "to granted list, retry convert.\n", + dlm->name, res->lockname.len, res->lockname.name); + status = DLM_RECOVERING; } + + lock->convert_pending = 0; bail: spin_unlock(&res->spinlock); @@ -424,8 +432,8 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data, struct dlm_ctxt *dlm = data; struct dlm_convert_lock *cnv = (struct dlm_convert_lock *)msg->buf; struct dlm_lock_resource *res = NULL; - struct list_head *iter; struct dlm_lock *lock = NULL; + struct dlm_lock *tmp_lock; struct dlm_lockstatus *lksb; enum dlm_status status = DLM_NORMAL; u32 flags; @@ -471,14 +479,13 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data, dlm_error(status); goto leave; } - list_for_each(iter, &res->granted) { - lock = list_entry(iter, struct dlm_lock, list); - if (lock->ml.cookie == cnv->cookie && - lock->ml.node == cnv->node_idx) { + list_for_each_entry(tmp_lock, &res->granted, list) { + if (tmp_lock->ml.cookie == cnv->cookie && + tmp_lock->ml.node == cnv->node_idx) { + lock = tmp_lock; dlm_lock_get(lock); break; } - lock = NULL; } spin_unlock(&res->spinlock); if (!lock) { diff --git a/fs/ocfs2/dlm/dlmconvert.h b/fs/ocfs2/dlm/dlmconvert.h index b2e3677df878..1f371716513b 100644 --- a/fs/ocfs2/dlm/dlmconvert.h +++ b/fs/ocfs2/dlm/dlmconvert.h @@ -1,25 +1,8 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * dlmconvert.h * * Copyright (C) 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * */ #ifndef DLMCONVERT_H diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 0e28e242226d..fe4fdd09bae3 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -1,27 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * dlmdebug.c * * debug functionality for the dlm * * Copyright (C) 2004, 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * */ #include <linux/types.h> @@ -31,10 +14,11 @@ #include <linux/spinlock.h> #include <linux/debugfs.h> #include <linux/export.h> +#include <linux/string_choices.h> -#include "cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" @@ -42,7 +26,7 @@ #include "dlmdebug.h" #define MLOG_MASK_PREFIX ML_DLM -#include "cluster/masklog.h" +#include "../cluster/masklog.h" static int stringify_lockname(const char *lockname, int locklen, char *buf, int len); @@ -81,7 +65,7 @@ static void __dlm_print_lock(struct dlm_lock *lock) lock->ml.type, lock->ml.convert_type, lock->ml.node, dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), - atomic_read(&lock->lock_refs.refcount), + kref_read(&lock->lock_refs), (list_empty(&lock->ast_list) ? 'y' : 'n'), (lock->ast_pending ? 'y' : 'n'), (list_empty(&lock->bast_list) ? 'y' : 'n'), @@ -96,7 +80,6 @@ static void __dlm_print_lock(struct dlm_lock *lock) void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) { - struct list_head *iter2; struct dlm_lock *lock; char buf[DLM_LOCKID_NAME_MAX]; @@ -107,29 +90,26 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) printk("lockres: %s, owner=%u, state=%u\n", buf, res->owner, res->state); printk(" last used: %lu, refcnt: %u, on purge list: %s\n", - res->last_used, atomic_read(&res->refs.refcount), - list_empty(&res->purge) ? "no" : "yes"); + res->last_used, kref_read(&res->refs), + str_no_yes(list_empty(&res->purge))); printk(" on dirty list: %s, on reco list: %s, " "migrating pending: %s\n", - list_empty(&res->dirty) ? "no" : "yes", - list_empty(&res->recovering) ? "no" : "yes", - res->migration_pending ? "yes" : "no"); + str_no_yes(list_empty(&res->dirty)), + str_no_yes(list_empty(&res->recovering)), + str_yes_no(res->migration_pending)); printk(" inflight locks: %d, asts reserved: %d\n", res->inflight_locks, atomic_read(&res->asts_reserved)); dlm_print_lockres_refmap(res); printk(" granted queue:\n"); - list_for_each(iter2, &res->granted) { - lock = list_entry(iter2, struct dlm_lock, list); + list_for_each_entry(lock, &res->granted, list) { __dlm_print_lock(lock); } printk(" converting queue:\n"); - list_for_each(iter2, &res->converting) { - lock = list_entry(iter2, struct dlm_lock, list); + list_for_each_entry(lock, &res->converting, list) { __dlm_print_lock(lock); } printk(" blocked queue:\n"); - list_for_each(iter2, &res->blocked) { - lock = list_entry(iter2, struct dlm_lock, list); + list_for_each_entry(lock, &res->blocked, list) { __dlm_print_lock(lock); } } @@ -185,59 +165,6 @@ static const char *dlm_errnames[] = { [DLM_MAXSTATS] = "DLM_MAXSTATS", }; -static const char *dlm_errmsgs[] = { - [DLM_NORMAL] = "request in progress", - [DLM_GRANTED] = "request granted", - [DLM_DENIED] = "request denied", - [DLM_DENIED_NOLOCKS] = "request denied, out of system resources", - [DLM_WORKING] = "async request in progress", - [DLM_BLOCKED] = "lock request blocked", - [DLM_BLOCKED_ORPHAN] = "lock request blocked by a orphan lock", - [DLM_DENIED_GRACE_PERIOD] = "topological change in progress", - [DLM_SYSERR] = "system error", - [DLM_NOSUPPORT] = "unsupported", - [DLM_CANCELGRANT] = "can't cancel convert: already granted", - [DLM_IVLOCKID] = "bad lockid", - [DLM_SYNC] = "synchronous request granted", - [DLM_BADTYPE] = "bad resource type", - [DLM_BADRESOURCE] = "bad resource handle", - [DLM_MAXHANDLES] = "no more resource handles", - [DLM_NOCLINFO] = "can't contact cluster manager", - [DLM_NOLOCKMGR] = "can't contact lock manager", - [DLM_NOPURGED] = "can't contact purge daemon", - [DLM_BADARGS] = "bad api args", - [DLM_VOID] = "no status", - [DLM_NOTQUEUED] = "NOQUEUE was specified and request failed", - [DLM_IVBUFLEN] = "invalid resource name length", - [DLM_CVTUNGRANT] = "attempted to convert ungranted lock", - [DLM_BADPARAM] = "invalid lock mode specified", - [DLM_VALNOTVALID] = "value block has been invalidated", - [DLM_REJECTED] = "request rejected, unrecognized client", - [DLM_ABORT] = "blocked lock request cancelled", - [DLM_CANCEL] = "conversion request cancelled", - [DLM_IVRESHANDLE] = "invalid resource handle", - [DLM_DEADLOCK] = "deadlock recovery refused this request", - [DLM_DENIED_NOASTS] = "failed to allocate AST", - [DLM_FORWARD] = "request must wait for primary's response", - [DLM_TIMEOUT] = "timeout value for lock has expired", - [DLM_IVGROUPID] = "invalid group specification", - [DLM_VERS_CONFLICT] = "version conflicts prevent request handling", - [DLM_BAD_DEVICE_PATH] = "Locks device does not exist or path wrong", - [DLM_NO_DEVICE_PERMISSION] = "Client has insufficient perms for device", - [DLM_NO_CONTROL_DEVICE] = "Cannot set options on opened device ", - [DLM_RECOVERING] = "lock resource being recovered", - [DLM_MIGRATING] = "lock resource being migrated", - [DLM_MAXSTATS] = "invalid error number", -}; - -const char *dlm_errmsg(enum dlm_status err) -{ - if (err >= DLM_MAXSTATS || err < 0) - return dlm_errmsgs[DLM_MAXSTATS]; - return dlm_errmsgs[err]; -} -EXPORT_SYMBOL_GPL(dlm_errmsg); - const char *dlm_errname(enum dlm_status err) { if (err >= DLM_MAXSTATS || err < 0) @@ -263,11 +190,11 @@ static int stringify_lockname(const char *lockname, int locklen, char *buf, memcpy((__be64 *)&inode_blkno_be, (char *)&lockname[OCFS2_DENTRY_LOCK_INO_START], sizeof(__be64)); - out += snprintf(buf + out, len - out, "%.*s%08x", + out += scnprintf(buf + out, len - out, "%.*s%08x", OCFS2_DENTRY_LOCK_INO_START - 1, lockname, (unsigned int)be64_to_cpu(inode_blkno_be)); } else - out += snprintf(buf + out, len - out, "%.*s", + out += scnprintf(buf + out, len - out, "%.*s", locklen, lockname); return out; } @@ -279,7 +206,7 @@ static int stringify_nodemap(unsigned long *nodemap, int maxnodes, int i = -1; while ((i = find_next_bit(nodemap, maxnodes, i + 1)) < maxnodes) - out += snprintf(buf + out, len - out, "%d ", i); + out += scnprintf(buf + out, len - out, "%d ", i); return out; } @@ -297,34 +224,34 @@ static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len) mle_type = "MIG"; out += stringify_lockname(mle->mname, mle->mnamelen, buf + out, len - out); - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n", mle_type, mle->master, mle->new_master, !list_empty(&mle->hb_events), !!mle->inuse, - atomic_read(&mle->mle_refs.refcount)); + kref_read(&mle->mle_refs)); - out += snprintf(buf + out, len - out, "Maybe="); + out += scnprintf(buf + out, len - out, "Maybe="); out += stringify_nodemap(mle->maybe_map, O2NM_MAX_NODES, buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); - out += snprintf(buf + out, len - out, "Vote="); + out += scnprintf(buf + out, len - out, "Vote="); out += stringify_nodemap(mle->vote_map, O2NM_MAX_NODES, buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); - out += snprintf(buf + out, len - out, "Response="); + out += scnprintf(buf + out, len - out, "Response="); out += stringify_nodemap(mle->response_map, O2NM_MAX_NODES, buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); - out += snprintf(buf + out, len - out, "Node="); + out += scnprintf(buf + out, len - out, "Node="); out += stringify_nodemap(mle->node_map, O2NM_MAX_NODES, buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); return out; } @@ -333,7 +260,7 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle) { char *buf; - buf = (char *) get_zeroed_page(GFP_NOFS); + buf = (char *) get_zeroed_page(GFP_ATOMIC); if (buf) { dump_mle(mle, buf, PAGE_SIZE - 1); free_page((unsigned long)buf); @@ -342,7 +269,7 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle) #ifdef CONFIG_DEBUG_FS -static struct dentry *dlm_debugfs_root = NULL; +static struct dentry *dlm_debugfs_root; #define DLM_DEBUGFS_DIR "o2dlm" #define DLM_DEBUGFS_DLM_STATE "dlm_state" @@ -351,26 +278,6 @@ static struct dentry *dlm_debugfs_root = NULL; #define DLM_DEBUGFS_PURGE_LIST "purge_list" /* begin - utils funcs */ -static void dlm_debug_free(struct kref *kref) -{ - struct dlm_debug_ctxt *dc; - - dc = container_of(kref, struct dlm_debug_ctxt, debug_refcnt); - - kfree(dc); -} - -static void dlm_debug_put(struct dlm_debug_ctxt *dc) -{ - if (dc) - kref_put(&dc->debug_refcnt, dlm_debug_free); -} - -static void dlm_debug_get(struct dlm_debug_ctxt *dc) -{ - kref_get(&dc->debug_refcnt); -} - static int debug_release(struct inode *inode, struct file *file) { free_page((unsigned long)file->private_data); @@ -392,7 +299,7 @@ static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len) int out = 0; unsigned long total = 0; - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "Dumping Purgelist for Domain: %s\n", dlm->name); spin_lock(&dlm->spinlock); @@ -404,13 +311,13 @@ static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len) out += stringify_lockname(res->lockname.name, res->lockname.len, buf + out, len - out); - out += snprintf(buf + out, len - out, "\t%ld\n", + out += scnprintf(buf + out, len - out, "\t%ld\n", (jiffies - res->last_used)/HZ); spin_unlock(&res->spinlock); } spin_unlock(&dlm->spinlock); - out += snprintf(buf + out, len - out, "Total on list: %ld\n", total); + out += scnprintf(buf + out, len - out, "Total on list: %lu\n", total); return out; } @@ -446,19 +353,16 @@ static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len) { struct dlm_master_list_entry *mle; struct hlist_head *bucket; - struct hlist_node *list; int i, out = 0; unsigned long total = 0, longest = 0, bucket_count = 0; - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "Dumping MLEs for Domain: %s\n", dlm->name); spin_lock(&dlm->master_lock); for (i = 0; i < DLM_HASH_BUCKETS; i++) { bucket = dlm_master_hash(dlm, i); - hlist_for_each(list, bucket) { - mle = hlist_entry(list, struct dlm_master_list_entry, - master_hash_node); + hlist_for_each_entry(mle, bucket, master_hash_node) { ++total; ++bucket_count; if (len - out < 200) @@ -470,8 +374,8 @@ static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len) } spin_unlock(&dlm->master_lock); - out += snprintf(buf + out, len - out, - "Total: %ld, Longest: %ld\n", total, longest); + out += scnprintf(buf + out, len - out, + "Total: %lu, Longest: %lu\n", total, longest); return out; } @@ -509,7 +413,7 @@ static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len) #define DEBUG_LOCK_VERSION 1 spin_lock(&lock->spinlock); - out = snprintf(buf, len, "LOCK:%d,%d,%d,%d,%d,%d:%lld,%d,%d,%d,%d,%d," + out = scnprintf(buf, len, "LOCK:%d,%d,%d,%d,%d,%d:%lld,%d,%d,%d,%d,%d," "%d,%d,%d,%d\n", DEBUG_LOCK_VERSION, list_type, lock->ml.type, lock->ml.convert_type, @@ -521,7 +425,7 @@ static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len) lock->ast_pending, lock->bast_pending, lock->convert_pending, lock->lock_pending, lock->cancel_pending, lock->unlock_pending, - atomic_read(&lock->lock_refs.refcount)); + kref_read(&lock->lock_refs)); spin_unlock(&lock->spinlock); return out; @@ -533,13 +437,13 @@ static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len) int i; int out = 0; - out += snprintf(buf + out, len - out, "NAME:"); + out += scnprintf(buf + out, len - out, "NAME:"); out += stringify_lockname(res->lockname.name, res->lockname.len, buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); #define DEBUG_LRES_VERSION 1 - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "LRES:%d,%d,%d,%ld,%d,%d,%d,%d,%d,%d,%d\n", DEBUG_LRES_VERSION, res->owner, res->state, res->last_used, @@ -548,20 +452,20 @@ static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len) !list_empty(&res->recovering), res->inflight_locks, res->migration_pending, atomic_read(&res->asts_reserved), - atomic_read(&res->refs.refcount)); + kref_read(&res->refs)); /* refmap */ - out += snprintf(buf + out, len - out, "RMAP:"); + out += scnprintf(buf + out, len - out, "RMAP:"); out += stringify_nodemap(res->refmap, O2NM_MAX_NODES, buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); /* lvb */ - out += snprintf(buf + out, len - out, "LVBX:"); + out += scnprintf(buf + out, len - out, "LVBX:"); for (i = 0; i < DLM_LVB_LEN; i++) - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%02x", (unsigned char)res->lvb[i]); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); /* granted */ list_for_each_entry(lock, &res->granted, list) @@ -575,7 +479,7 @@ static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len) list_for_each_entry(lock, &res->blocked, list) out += dump_lock(lock, 2, buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); return out; } @@ -585,7 +489,7 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos) struct debug_lockres *dl = m->private; struct dlm_ctxt *dlm = dl->dl_ctxt; struct dlm_lock_resource *oldres = dl->dl_res; - struct dlm_lock_resource *res = NULL; + struct dlm_lock_resource *res = NULL, *iter; struct list_head *track_list; spin_lock(&dlm->track_lock); @@ -600,11 +504,11 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos) } } - list_for_each_entry(res, track_list, tracking) { - if (&res->tracking == &dlm->tracking_list) - res = NULL; - else - dlm_lockres_get(res); + list_for_each_entry(iter, track_list, tracking) { + if (&iter->tracking != &dlm->tracking_list) { + dlm_lockres_get(iter); + res = iter; + } break; } spin_unlock(&dlm->track_lock); @@ -654,41 +558,30 @@ static const struct seq_operations debug_lockres_ops = { static int debug_lockres_open(struct inode *inode, struct file *file) { struct dlm_ctxt *dlm = inode->i_private; - int ret = -ENOMEM; - struct seq_file *seq; - struct debug_lockres *dl = NULL; + struct debug_lockres *dl; + void *buf; - dl = kzalloc(sizeof(struct debug_lockres), GFP_KERNEL); - if (!dl) { - mlog_errno(ret); - goto bail; - } - - dl->dl_len = PAGE_SIZE; - dl->dl_buf = kmalloc(dl->dl_len, GFP_KERNEL); - if (!dl->dl_buf) { - mlog_errno(ret); + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) goto bail; - } - ret = seq_open(file, &debug_lockres_ops); - if (ret) { - mlog_errno(ret); - goto bail; - } + dl = __seq_open_private(file, &debug_lockres_ops, sizeof(*dl)); + if (!dl) + goto bailfree; - seq = file->private_data; - seq->private = dl; + dl->dl_len = PAGE_SIZE; + dl->dl_buf = buf; dlm_grab(dlm); dl->dl_ctxt = dlm; return 0; + +bailfree: + kfree(buf); bail: - if (dl) - kfree(dl->dl_buf); - kfree(dl); - return ret; + mlog_errno(-ENOMEM); + return -ENOMEM; } static int debug_lockres_release(struct inode *inode, struct file *file) @@ -736,41 +629,41 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len) } /* Domain: xxxxxxxxxx Key: 0xdfbac769 */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "Domain: %s Key: 0x%08x Protocol: %d.%d\n", dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major, dlm->dlm_locking_proto.pv_minor); /* Thread Pid: xxx Node: xxx State: xxxxx */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "Thread Pid: %d Node: %d State: %s\n", task_pid_nr(dlm->dlm_thread_task), dlm->node_num, state); /* Number of Joins: xxx Joining Node: xxx */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "Number of Joins: %d Joining Node: %d\n", dlm->num_joins, dlm->joining_node); /* Domain Map: xx xx xx */ - out += snprintf(buf + out, len - out, "Domain Map: "); + out += scnprintf(buf + out, len - out, "Domain Map: "); out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES, buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); /* Exit Domain Map: xx xx xx */ - out += snprintf(buf + out, len - out, "Exit Domain Map: "); + out += scnprintf(buf + out, len - out, "Exit Domain Map: "); out += stringify_nodemap(dlm->exit_domain_map, O2NM_MAX_NODES, buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); /* Live Map: xx xx xx */ - out += snprintf(buf + out, len - out, "Live Map: "); + out += scnprintf(buf + out, len - out, "Live Map: "); out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES, buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); /* Lock Resources: xxx (xxx) */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "Lock Resources: %d (%d)\n", atomic_read(&dlm->res_cur_count), atomic_read(&dlm->res_tot_count)); @@ -782,29 +675,29 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len) cur_mles += atomic_read(&dlm->mle_cur_count[i]); /* MLEs: xxx (xxx) */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "MLEs: %d (%d)\n", cur_mles, tot_mles); /* Blocking: xxx (xxx) */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, " Blocking: %d (%d)\n", atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]), atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK])); /* Mastery: xxx (xxx) */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, " Mastery: %d (%d)\n", atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]), atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER])); /* Migration: xxx (xxx) */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, " Migration: %d (%d)\n", atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]), atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION])); /* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "Lists: Dirty=%s Purge=%s PendingASTs=%s " "PendingBASTs=%s\n", (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"), @@ -813,12 +706,12 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len) (list_empty(&dlm->pending_basts) ? "Empty" : "InUse")); /* Purge Count: xxx Refs: xxx */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "Purge Count: %d Refs: %d\n", dlm->purge_count, - atomic_read(&dlm->dlm_refs.refcount)); + kref_read(&dlm->dlm_refs)); /* Dead Node: xxx */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "Dead Node: %d\n", dlm->reco.dead_node); /* What about DLM_RECO_STATE_FINALIZE? */ @@ -828,19 +721,19 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len) state = "INACTIVE"; /* Recovery Pid: xxxx Master: xxx State: xxxx */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "Recovery Pid: %d Master: %d State: %s\n", task_pid_nr(dlm->dlm_reco_thread_task), dlm->reco.new_master, state); /* Recovery Map: xx xx */ - out += snprintf(buf + out, len - out, "Recovery Map: "); + out += scnprintf(buf + out, len - out, "Recovery Map: "); out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES, buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); /* Recovery Node State: */ - out += snprintf(buf + out, len - out, "Recovery Node State:\n"); + out += scnprintf(buf + out, len - out, "Recovery Node State:\n"); list_for_each_entry(node, &dlm->reco.node_data, list) { switch (node->state) { case DLM_RECO_NODE_DATA_INIT: @@ -868,7 +761,7 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len) state = "BAD"; break; } - out += snprintf(buf + out, len - out, "\t%u - %s\n", + out += scnprintf(buf + out, len - out, "\t%u - %s\n", node->node_num, state); } @@ -904,111 +797,42 @@ static const struct file_operations debug_state_fops = { /* end - debug state funcs */ /* files in subroot */ -int dlm_debug_init(struct dlm_ctxt *dlm) +void dlm_debug_init(struct dlm_ctxt *dlm) { - struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; - /* for dumping dlm_ctxt */ - dc->debug_state_dentry = debugfs_create_file(DLM_DEBUGFS_DLM_STATE, - S_IFREG|S_IRUSR, - dlm->dlm_debugfs_subroot, - dlm, &debug_state_fops); - if (!dc->debug_state_dentry) { - mlog_errno(-ENOMEM); - goto bail; - } + debugfs_create_file(DLM_DEBUGFS_DLM_STATE, S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, dlm, &debug_state_fops); /* for dumping lockres */ - dc->debug_lockres_dentry = - debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE, - S_IFREG|S_IRUSR, - dlm->dlm_debugfs_subroot, - dlm, &debug_lockres_fops); - if (!dc->debug_lockres_dentry) { - mlog_errno(-ENOMEM); - goto bail; - } + debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE, S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, dlm, &debug_lockres_fops); /* for dumping mles */ - dc->debug_mle_dentry = debugfs_create_file(DLM_DEBUGFS_MLE_STATE, - S_IFREG|S_IRUSR, - dlm->dlm_debugfs_subroot, - dlm, &debug_mle_fops); - if (!dc->debug_mle_dentry) { - mlog_errno(-ENOMEM); - goto bail; - } + debugfs_create_file(DLM_DEBUGFS_MLE_STATE, S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, dlm, &debug_mle_fops); /* for dumping lockres on the purge list */ - dc->debug_purgelist_dentry = - debugfs_create_file(DLM_DEBUGFS_PURGE_LIST, - S_IFREG|S_IRUSR, - dlm->dlm_debugfs_subroot, - dlm, &debug_purgelist_fops); - if (!dc->debug_purgelist_dentry) { - mlog_errno(-ENOMEM); - goto bail; - } - - dlm_debug_get(dc); - return 0; - -bail: - dlm_debug_shutdown(dlm); - return -ENOMEM; -} - -void dlm_debug_shutdown(struct dlm_ctxt *dlm) -{ - struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; - - if (dc) { - debugfs_remove(dc->debug_purgelist_dentry); - debugfs_remove(dc->debug_mle_dentry); - debugfs_remove(dc->debug_lockres_dentry); - debugfs_remove(dc->debug_state_dentry); - dlm_debug_put(dc); - } + debugfs_create_file(DLM_DEBUGFS_PURGE_LIST, S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, dlm, + &debug_purgelist_fops); } /* subroot - domain dir */ -int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) +void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) { dlm->dlm_debugfs_subroot = debugfs_create_dir(dlm->name, dlm_debugfs_root); - if (!dlm->dlm_debugfs_subroot) { - mlog_errno(-ENOMEM); - goto bail; - } - - dlm->dlm_debug_ctxt = kzalloc(sizeof(struct dlm_debug_ctxt), - GFP_KERNEL); - if (!dlm->dlm_debug_ctxt) { - mlog_errno(-ENOMEM); - goto bail; - } - kref_init(&dlm->dlm_debug_ctxt->debug_refcnt); - - return 0; -bail: - dlm_destroy_debugfs_subroot(dlm); - return -ENOMEM; } void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) { - debugfs_remove(dlm->dlm_debugfs_subroot); + debugfs_remove_recursive(dlm->dlm_debugfs_subroot); } /* debugfs root */ -int dlm_create_debugfs_root(void) +void dlm_create_debugfs_root(void) { dlm_debugfs_root = debugfs_create_dir(DLM_DEBUGFS_DIR, NULL); - if (!dlm_debugfs_root) { - mlog_errno(-ENOMEM); - return -ENOMEM; - } - return 0; } void dlm_destroy_debugfs_root(void) diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h index 1f27c4812d1a..e08f7357e7ec 100644 --- a/fs/ocfs2/dlm/dlmdebug.h +++ b/fs/ocfs2/dlm/dlmdebug.h @@ -1,25 +1,8 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * dlmdebug.h * * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * */ #ifndef DLMDEBUG_H @@ -29,14 +12,6 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle); #ifdef CONFIG_DEBUG_FS -struct dlm_debug_ctxt { - struct kref debug_refcnt; - struct dentry *debug_state_dentry; - struct dentry *debug_lockres_dentry; - struct dentry *debug_mle_dentry; - struct dentry *debug_purgelist_dentry; -}; - struct debug_lockres { int dl_len; char *dl_buf; @@ -44,34 +19,27 @@ struct debug_lockres { struct dlm_lock_resource *dl_res; }; -int dlm_debug_init(struct dlm_ctxt *dlm); -void dlm_debug_shutdown(struct dlm_ctxt *dlm); +void dlm_debug_init(struct dlm_ctxt *dlm); -int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm); +void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm); void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm); -int dlm_create_debugfs_root(void); +void dlm_create_debugfs_root(void); void dlm_destroy_debugfs_root(void); #else -static inline int dlm_debug_init(struct dlm_ctxt *dlm) -{ - return 0; -} -static inline void dlm_debug_shutdown(struct dlm_ctxt *dlm) +static inline void dlm_debug_init(struct dlm_ctxt *dlm) { } -static inline int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) +static inline void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) { - return 0; } static inline void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) { } -static inline int dlm_create_debugfs_root(void) +static inline void dlm_create_debugfs_root(void) { - return 0; } static inline void dlm_destroy_debugfs_root(void) { diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index dbb17c07656a..2347a50f079b 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1,27 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * dlmdomain.c * * defines domain join / leave apis * * Copyright (C) 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * */ #include <linux/module.h> @@ -33,20 +16,19 @@ #include <linux/delay.h> #include <linux/err.h> #include <linux/debugfs.h> +#include <linux/sched/signal.h> -#include "cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" #include "dlmdomain.h" #include "dlmdebug.h" -#include "dlmver.h" - #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) -#include "cluster/masklog.h" +#include "../cluster/masklog.h" /* * ocfs2 node maps are array of long int, which limits to send them freely @@ -87,7 +69,7 @@ static void dlm_free_pagevec(void **vec, int pages) static void **dlm_alloc_pagevec(int pages) { - void **vec = kmalloc(pages * sizeof(void *), GFP_KERNEL); + void **vec = kmalloc_array(pages, sizeof(void *), GFP_KERNEL); int i; if (!vec) @@ -134,10 +116,13 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events); * - Message DLM_QUERY_NODEINFO added to allow online node removes * New in version 1.2: * - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain + * New in version 1.3: + * - Message DLM_DEREF_LOCKRES_DONE added to inform non-master that the + * refmap is cleared */ static const struct dlm_protocol_version dlm_protocol = { .pv_major = 1, - .pv_minor = 2, + .pv_minor = 3, }; #define DLM_DOMAIN_BACKOFF_MS 200 @@ -171,12 +156,10 @@ void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { struct hlist_head *bucket; - struct qstr *q; assert_spin_locked(&dlm->spinlock); - q = &res->lockname; - bucket = dlm_lockres_hash(dlm, q->hash); + bucket = dlm_lockres_hash(dlm, res->lockname.hash); /* get a reference for our hashtable */ dlm_lockres_get(res); @@ -193,7 +176,7 @@ struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, unsigned int hash) { struct hlist_head *bucket; - struct hlist_node *list; + struct dlm_lock_resource *res; mlog(0, "%.*s\n", len, name); @@ -201,9 +184,7 @@ struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, bucket = dlm_lockres_hash(dlm, hash); - hlist_for_each(list, bucket) { - struct dlm_lock_resource *res = hlist_entry(list, - struct dlm_lock_resource, hash_node); + hlist_for_each_entry(res, bucket, hash_node) { if (res->lockname.name[0] != name[0]) continue; if (unlikely(res->lockname.len != len)) @@ -262,22 +243,19 @@ struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, static struct dlm_ctxt * __dlm_lookup_domain_full(const char *domain, int len) { - struct dlm_ctxt *tmp = NULL; - struct list_head *iter; + struct dlm_ctxt *tmp; assert_spin_locked(&dlm_domain_lock); /* tmp->name here is always NULL terminated, * but domain may not be! */ - list_for_each(iter, &dlm_domains) { - tmp = list_entry (iter, struct dlm_ctxt, list); + list_for_each_entry(tmp, &dlm_domains, list) { if (strlen(tmp->name) == len && memcmp(tmp->name, domain, len)==0) - break; - tmp = NULL; + return tmp; } - return tmp; + return NULL; } /* For null terminated domain strings ONLY */ @@ -366,25 +344,22 @@ static void __dlm_get(struct dlm_ctxt *dlm) * you shouldn't trust your pointer. */ struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm) { - struct list_head *iter; - struct dlm_ctxt *target = NULL; + struct dlm_ctxt *target; + struct dlm_ctxt *ret = NULL; spin_lock(&dlm_domain_lock); - list_for_each(iter, &dlm_domains) { - target = list_entry (iter, struct dlm_ctxt, list); - + list_for_each_entry(target, &dlm_domains, list) { if (target == dlm) { __dlm_get(target); + ret = target; break; } - - target = NULL; } spin_unlock(&dlm_domain_lock); - return target; + return ret; } int dlm_domain_fully_joined(struct dlm_ctxt *dlm) @@ -402,7 +377,6 @@ int dlm_domain_fully_joined(struct dlm_ctxt *dlm) static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm) { if (dlm->dlm_worker) { - flush_workqueue(dlm->dlm_worker); destroy_workqueue(dlm->dlm_worker); dlm->dlm_worker = NULL; } @@ -411,7 +385,6 @@ static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm) static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm) { dlm_unregister_domain_handlers(dlm); - dlm_debug_shutdown(dlm); dlm_complete_thread(dlm); dlm_complete_recovery_thread(dlm); dlm_destroy_dlm_worker(dlm); @@ -470,6 +443,19 @@ redo_bucket: cond_resched_lock(&dlm->spinlock); num += n; } + + if (!num) { + if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) { + mlog(0, "%s: perhaps there are more lock resources " + "need to be migrated after dlm recovery\n", dlm->name); + ret = -EAGAIN; + } else { + mlog(0, "%s: we won't do dlm recovery after migrating " + "all lock resources\n", dlm->name); + dlm->migrate_done = 1; + } + } + spin_unlock(&dlm->spinlock); wake_up(&dlm->dlm_thread_wq); @@ -684,34 +670,6 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm) spin_unlock(&dlm->spinlock); } -int dlm_joined(struct dlm_ctxt *dlm) -{ - int ret = 0; - - spin_lock(&dlm_domain_lock); - - if (dlm->dlm_state == DLM_CTXT_JOINED) - ret = 1; - - spin_unlock(&dlm_domain_lock); - - return ret; -} - -int dlm_shutting_down(struct dlm_ctxt *dlm) -{ - int ret = 0; - - spin_lock(&dlm_domain_lock); - - if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN) - ret = 1; - - spin_unlock(&dlm_domain_lock); - - return ret; -} - void dlm_unregister_domain(struct dlm_ctxt *dlm) { int leave = 0; @@ -849,7 +807,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data, * to back off and try again. This gives heartbeat a chance * to catch up. */ - if (!o2hb_check_node_heartbeating(query->node_idx)) { + if (!o2hb_check_node_heartbeating_no_sem(query->node_idx)) { mlog(0, "node %u is not in our live map yet\n", query->node_idx); @@ -887,7 +845,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data, * to be put in someone's domain map. * Also, explicitly disallow joining at certain troublesome * times (ie. during recovery). */ - if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) { + if (dlm->dlm_state != DLM_CTXT_LEAVING) { int bit = query->node_idx; spin_lock(&dlm->spinlock); @@ -969,6 +927,14 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, * domain. Set him in the map and clean up our * leftover join state. */ BUG_ON(dlm->joining_node != assert->node_idx); + + if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) { + mlog(0, "dlm recovery is ongoing, disallow join\n"); + spin_unlock(&dlm->spinlock); + spin_unlock(&dlm_domain_lock); + return -EAGAIN; + } + set_bit(assert->node_idx, dlm->domain_map); clear_bit(assert->node_idx, dlm->exit_domain_map); __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); @@ -1079,7 +1045,7 @@ static int dlm_send_regions(struct dlm_ctxt *dlm, unsigned long *node_map) int status, ret = 0, i; char *p; - if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES) + if (find_first_bit(node_map, O2NM_MAX_NODES) >= O2NM_MAX_NODES) goto bail; qr = kzalloc(sizeof(struct dlm_query_region), GFP_KERNEL); @@ -1133,7 +1099,6 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, struct dlm_ctxt *dlm = NULL; char *local = NULL; int status = 0; - int locked = 0; qr = (struct dlm_query_region *) msg->buf; @@ -1142,10 +1107,8 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, /* buffer used in dlm_mast_regions() */ local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL); - if (!local) { - status = -ENOMEM; - goto bail; - } + if (!local) + return -ENOMEM; status = -EINVAL; @@ -1154,16 +1117,15 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, if (!dlm) { mlog(ML_ERROR, "Node %d queried hb regions on domain %s " "before join domain\n", qr->qr_node, qr->qr_domain); - goto bail; + goto out_domain_lock; } spin_lock(&dlm->spinlock); - locked = 1; if (dlm->joining_node != qr->qr_node) { mlog(ML_ERROR, "Node %d queried hb regions on domain %s " "but joining node is %d\n", qr->qr_node, qr->qr_domain, dlm->joining_node); - goto bail; + goto out_dlm_lock; } /* Support for global heartbeat was added in 1.1 */ @@ -1173,14 +1135,15 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, "but active dlm protocol is %d.%d\n", qr->qr_node, qr->qr_domain, dlm->dlm_locking_proto.pv_major, dlm->dlm_locking_proto.pv_minor); - goto bail; + goto out_dlm_lock; } status = dlm_match_regions(dlm, qr, local, sizeof(qr->qr_regions)); -bail: - if (locked) - spin_unlock(&dlm->spinlock); +out_dlm_lock: + spin_unlock(&dlm->spinlock); + +out_domain_lock: spin_unlock(&dlm_domain_lock); kfree(local); @@ -1254,7 +1217,7 @@ static int dlm_send_nodeinfo(struct dlm_ctxt *dlm, unsigned long *node_map) struct o2nm_node *node; int ret = 0, status, count, i; - if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES) + if (find_first_bit(node_map, O2NM_MAX_NODES) >= O2NM_MAX_NODES) goto bail; qn = kzalloc(sizeof(struct dlm_query_nodeinfo), GFP_KERNEL); @@ -1311,7 +1274,7 @@ static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len, { struct dlm_query_nodeinfo *qn; struct dlm_ctxt *dlm = NULL; - int locked = 0, status = -EINVAL; + int status = -EINVAL; qn = (struct dlm_query_nodeinfo *) msg->buf; @@ -1327,12 +1290,11 @@ static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len, } spin_lock(&dlm->spinlock); - locked = 1; if (dlm->joining_node != qn->qn_nodenum) { mlog(ML_ERROR, "Node %d queried nodes on domain %s but " "joining node is %d\n", qn->qn_nodenum, qn->qn_domain, dlm->joining_node); - goto bail; + goto unlock; } /* Support for node query was added in 1.1 */ @@ -1342,14 +1304,14 @@ static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len, "but active dlm protocol is %d.%d\n", qn->qn_nodenum, qn->qn_domain, dlm->dlm_locking_proto.pv_major, dlm->dlm_locking_proto.pv_minor); - goto bail; + goto unlock; } status = dlm_match_nodes(dlm, qn); +unlock: + spin_unlock(&dlm->spinlock); bail: - if (locked) - spin_unlock(&dlm->spinlock); spin_unlock(&dlm_domain_lock); return status; @@ -1415,7 +1377,7 @@ static int dlm_send_join_cancels(struct dlm_ctxt *dlm, unsigned int map_size) { int status, tmpstat; - unsigned int node; + int node; if (map_size != (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long))) { @@ -1484,39 +1446,46 @@ static int dlm_request_join(struct dlm_ctxt *dlm, if (status == -ENOPROTOOPT) { status = 0; *response = JOIN_OK_NO_MAP; - } else if (packet.code == JOIN_DISALLOW || - packet.code == JOIN_OK_NO_MAP) { - *response = packet.code; - } else if (packet.code == JOIN_PROTOCOL_MISMATCH) { - mlog(ML_NOTICE, - "This node requested DLM locking protocol %u.%u and " - "filesystem locking protocol %u.%u. At least one of " - "the protocol versions on node %d is not compatible, " - "disconnecting\n", - dlm->dlm_locking_proto.pv_major, - dlm->dlm_locking_proto.pv_minor, - dlm->fs_locking_proto.pv_major, - dlm->fs_locking_proto.pv_minor, - node); - status = -EPROTO; - *response = packet.code; - } else if (packet.code == JOIN_OK) { - *response = packet.code; - /* Use the same locking protocol as the remote node */ - dlm->dlm_locking_proto.pv_minor = packet.dlm_minor; - dlm->fs_locking_proto.pv_minor = packet.fs_minor; - mlog(0, - "Node %d responds JOIN_OK with DLM locking protocol " - "%u.%u and fs locking protocol %u.%u\n", - node, - dlm->dlm_locking_proto.pv_major, - dlm->dlm_locking_proto.pv_minor, - dlm->fs_locking_proto.pv_major, - dlm->fs_locking_proto.pv_minor); } else { - status = -EINVAL; - mlog(ML_ERROR, "invalid response %d from node %u\n", - packet.code, node); + *response = packet.code; + switch (packet.code) { + case JOIN_DISALLOW: + case JOIN_OK_NO_MAP: + break; + case JOIN_PROTOCOL_MISMATCH: + mlog(ML_NOTICE, + "This node requested DLM locking protocol %u.%u and " + "filesystem locking protocol %u.%u. At least one of " + "the protocol versions on node %d is not compatible, " + "disconnecting\n", + dlm->dlm_locking_proto.pv_major, + dlm->dlm_locking_proto.pv_minor, + dlm->fs_locking_proto.pv_major, + dlm->fs_locking_proto.pv_minor, + node); + status = -EPROTO; + break; + case JOIN_OK: + /* Use the same locking protocol as the remote node */ + dlm->dlm_locking_proto.pv_minor = packet.dlm_minor; + dlm->fs_locking_proto.pv_minor = packet.fs_minor; + mlog(0, + "Node %d responds JOIN_OK with DLM locking protocol " + "%u.%u and fs locking protocol %u.%u\n", + node, + dlm->dlm_locking_proto.pv_major, + dlm->dlm_locking_proto.pv_minor, + dlm->fs_locking_proto.pv_major, + dlm->fs_locking_proto.pv_minor); + break; + default: + status = -EINVAL; + mlog(ML_ERROR, "invalid response %d from node %u\n", + packet.code, node); + /* Reset response to JOIN_DISALLOW */ + *response = JOIN_DISALLOW; + break; + } } mlog(0, "status %d, node %d response is %d\n", status, node, @@ -1530,6 +1499,7 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm, unsigned int node) { int status; + int ret; struct dlm_assert_joined assert_msg; mlog(0, "Sending join assert to node %u\n", node); @@ -1541,11 +1511,13 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm, status = o2net_send_message(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, &assert_msg, sizeof(assert_msg), node, - NULL); + &ret); if (status < 0) mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " "node %u\n", status, DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, node); + else + status = ret; return status; } @@ -1555,7 +1527,6 @@ static void dlm_send_join_asserts(struct dlm_ctxt *dlm, { int status, node, live; - status = 0; node = -1; while ((node = find_next_bit(node_map, O2NM_MAX_NODES, node + 1)) < O2NM_MAX_NODES) { @@ -1603,8 +1574,8 @@ static int dlm_should_restart_join(struct dlm_ctxt *dlm, spin_lock(&dlm->spinlock); /* For now, we restart the process if the node maps have * changed at all */ - ret = memcmp(ctxt->live_map, dlm->live_nodes_map, - sizeof(dlm->live_nodes_map)); + ret = !bitmap_equal(ctxt->live_map, dlm->live_nodes_map, + O2NM_MAX_NODES); spin_unlock(&dlm->spinlock); if (ret) @@ -1631,13 +1602,11 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) /* group sem locking should work for us here -- we're already * registered for heartbeat events so filling this should be * atomic wrt getting those handlers called. */ - o2hb_fill_node_map(dlm->live_nodes_map, sizeof(dlm->live_nodes_map)); + o2hb_fill_node_map(dlm->live_nodes_map, O2NM_MAX_NODES); spin_lock(&dlm->spinlock); - memcpy(ctxt->live_map, dlm->live_nodes_map, sizeof(ctxt->live_map)); - + bitmap_copy(ctxt->live_map, dlm->live_nodes_map, O2NM_MAX_NODES); __dlm_set_joining_node(dlm, dlm->node_num); - spin_unlock(&dlm->spinlock); node = -1; @@ -1670,8 +1639,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) * yes_resp_map. Copy that into our domain map and send a join * assert message to clean up everyone elses state. */ spin_lock(&dlm->spinlock); - memcpy(dlm->domain_map, ctxt->yes_resp_map, - sizeof(ctxt->yes_resp_map)); + bitmap_copy(dlm->domain_map, ctxt->yes_resp_map, O2NM_MAX_NODES); set_bit(dlm->node_num, dlm->domain_map); spin_unlock(&dlm->spinlock); @@ -1741,12 +1709,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm) o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB, dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI); + o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, + dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); + status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down); if (status) goto bail; - o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, - dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_up); if (status) goto bail; @@ -1864,6 +1833,10 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm) if (status) goto bail; + status = o2net_register_handler(DLM_DEREF_LOCKRES_DONE, dlm->key, + sizeof(struct dlm_deref_lockres_done), + dlm_deref_lockres_done_handler, + dlm, NULL, &dlm->dlm_domain_handlers); bail: if (status) dlm_unregister_domain_handlers(dlm); @@ -1876,6 +1849,7 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) int status; unsigned int backoff; unsigned int total_backoff = 0; + char wq_name[O2NM_MAX_NAME_LEN]; BUG_ON(!dlm); @@ -1887,12 +1861,6 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) goto bail; } - status = dlm_debug_init(dlm); - if (status < 0) { - mlog_errno(status); - goto bail; - } - status = dlm_launch_thread(dlm); if (status < 0) { mlog_errno(status); @@ -1905,7 +1873,11 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) goto bail; } - dlm->dlm_worker = create_singlethread_workqueue("dlm_wq"); + dlm_debug_init(dlm); + + snprintf(wq_name, O2NM_MAX_NAME_LEN, "dlm_wq-%s", dlm->name); + dlm->dlm_worker = alloc_workqueue(wq_name, WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!dlm->dlm_worker) { status = -ENOMEM; mlog_errno(status); @@ -1925,12 +1897,11 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) goto bail; } - if (total_backoff > - msecs_to_jiffies(DLM_JOIN_TIMEOUT_MSECS)) { + if (total_backoff > DLM_JOIN_TIMEOUT_MSECS) { status = -ERESTARTSYS; mlog(ML_NOTICE, "Timed out joining dlm domain " "%s after %u msecs\n", dlm->name, - jiffies_to_msecs(total_backoff)); + total_backoff); goto bail; } @@ -1960,7 +1931,6 @@ bail: if (status) { dlm_unregister_domain_handlers(dlm); - dlm_debug_shutdown(dlm); dlm_complete_thread(dlm); dlm_complete_recovery_thread(dlm); dlm_destroy_dlm_worker(dlm); @@ -1978,24 +1948,22 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, dlm = kzalloc(sizeof(*dlm), GFP_KERNEL); if (!dlm) { - mlog_errno(-ENOMEM); + ret = -ENOMEM; + mlog_errno(ret); goto leave; } dlm->name = kstrdup(domain, GFP_KERNEL); if (dlm->name == NULL) { - mlog_errno(-ENOMEM); - kfree(dlm); - dlm = NULL; + ret = -ENOMEM; + mlog_errno(ret); goto leave; } dlm->lockres_hash = (struct hlist_head **)dlm_alloc_pagevec(DLM_HASH_PAGES); if (!dlm->lockres_hash) { - mlog_errno(-ENOMEM); - kfree(dlm->name); - kfree(dlm); - dlm = NULL; + ret = -ENOMEM; + mlog_errno(ret); goto leave; } @@ -2005,11 +1973,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, dlm->master_hash = (struct hlist_head **) dlm_alloc_pagevec(DLM_HASH_PAGES); if (!dlm->master_hash) { - mlog_errno(-ENOMEM); - dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); - kfree(dlm->name); - kfree(dlm); - dlm = NULL; + ret = -ENOMEM; + mlog_errno(ret); goto leave; } @@ -2019,15 +1984,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, dlm->key = key; dlm->node_num = o2nm_this_node(); - ret = dlm_create_debugfs_subroot(dlm); - if (ret < 0) { - dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES); - dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); - kfree(dlm->name); - kfree(dlm); - dlm = NULL; - goto leave; - } + dlm_create_debugfs_subroot(dlm); spin_lock_init(&dlm->spinlock); spin_lock_init(&dlm->master_lock); @@ -2036,7 +1993,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, INIT_LIST_HEAD(&dlm->list); INIT_LIST_HEAD(&dlm->dirty_list); INIT_LIST_HEAD(&dlm->reco.resources); - INIT_LIST_HEAD(&dlm->reco.received); INIT_LIST_HEAD(&dlm->reco.node_data); INIT_LIST_HEAD(&dlm->purge_list); INIT_LIST_HEAD(&dlm->dlm_domain_handlers); @@ -2049,9 +2005,9 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, mlog(0, "dlm->recovery_map=%p, &(dlm->recovery_map[0])=%p\n", dlm->recovery_map, &(dlm->recovery_map[0])); - memset(dlm->recovery_map, 0, sizeof(dlm->recovery_map)); - memset(dlm->live_nodes_map, 0, sizeof(dlm->live_nodes_map)); - memset(dlm->domain_map, 0, sizeof(dlm->domain_map)); + bitmap_zero(dlm->recovery_map, O2NM_MAX_NODES); + bitmap_zero(dlm->live_nodes_map, O2NM_MAX_NODES); + bitmap_zero(dlm->domain_map, O2NM_MAX_NODES); dlm->dlm_thread_task = NULL; dlm->dlm_reco_thread_task = NULL; @@ -2066,6 +2022,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN; init_waitqueue_head(&dlm->dlm_join_events); + dlm->migrate_done = 0; + dlm->reco.new_master = O2NM_INVALID_NODE_NUM; dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; @@ -2086,9 +2044,23 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, INIT_LIST_HEAD(&dlm->dlm_eviction_callbacks); mlog(0, "context init: refcount %u\n", - atomic_read(&dlm->dlm_refs.refcount)); + kref_read(&dlm->dlm_refs)); + ret = 0; leave: + if (ret < 0 && dlm) { + if (dlm->master_hash) + dlm_free_pagevec((void **)dlm->master_hash, + DLM_HASH_PAGES); + + if (dlm->lockres_hash) + dlm_free_pagevec((void **)dlm->lockres_hash, + DLM_HASH_PAGES); + + kfree(dlm->name); + kfree(dlm); + dlm = NULL; + } return dlm; } @@ -2296,13 +2268,10 @@ static DECLARE_RWSEM(dlm_callback_sem); void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm, int node_num) { - struct list_head *iter; struct dlm_eviction_cb *cb; down_read(&dlm_callback_sem); - list_for_each(iter, &dlm->dlm_eviction_callbacks) { - cb = list_entry(iter, struct dlm_eviction_cb, ec_item); - + list_for_each_entry(cb, &dlm->dlm_eviction_callbacks, ec_item) { cb->ec_func(node_num, cb->ec_data); } up_read(&dlm_callback_sem); @@ -2339,8 +2308,6 @@ static int __init dlm_init(void) { int status; - dlm_print_version(); - status = dlm_init_mle_cache(); if (status) { mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n"); @@ -2366,9 +2333,7 @@ static int __init dlm_init(void) goto error; } - status = dlm_create_debugfs_root(); - if (status) - goto error; + dlm_create_debugfs_root(); return 0; error: @@ -2390,6 +2355,7 @@ static void __exit dlm_exit (void) MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("OCFS2 Distributed Lock Management"); module_init(dlm_init); module_exit(dlm_exit); diff --git a/fs/ocfs2/dlm/dlmdomain.h b/fs/ocfs2/dlm/dlmdomain.h index 2f7f60bfeb3b..815abe30ad09 100644 --- a/fs/ocfs2/dlm/dlmdomain.h +++ b/fs/ocfs2/dlm/dlmdomain.h @@ -1,25 +1,8 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * dlmdomain.h * * Copyright (C) 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * */ #ifndef DLMDOMAIN_H @@ -28,8 +11,30 @@ extern spinlock_t dlm_domain_lock; extern struct list_head dlm_domains; -int dlm_joined(struct dlm_ctxt *dlm); -int dlm_shutting_down(struct dlm_ctxt *dlm); +static inline int dlm_joined(struct dlm_ctxt *dlm) +{ + int ret = 0; + + spin_lock(&dlm_domain_lock); + if (dlm->dlm_state == DLM_CTXT_JOINED) + ret = 1; + spin_unlock(&dlm_domain_lock); + + return ret; +} + +static inline int dlm_shutting_down(struct dlm_ctxt *dlm) +{ + int ret = 0; + + spin_lock(&dlm_domain_lock); + if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN) + ret = 1; + spin_unlock(&dlm_domain_lock); + + return ret; +} + void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm, int node_num); diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 47e67c2d228f..041fd1791ae7 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -1,27 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * dlmlock.c * * underlying calls for lock creation * * Copyright (C) 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * */ @@ -40,9 +23,9 @@ #include <linux/delay.h> -#include "cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" @@ -50,9 +33,9 @@ #include "dlmconvert.h" #define MLOG_MASK_PREFIX ML_DLM -#include "cluster/masklog.h" +#include "../cluster/masklog.h" -static struct kmem_cache *dlm_lock_cache = NULL; +static struct kmem_cache *dlm_lock_cache; static DEFINE_SPINLOCK(dlm_cookie_lock); static u64 dlm_next_cookie = 1; @@ -77,8 +60,7 @@ int dlm_init_lock_cache(void) void dlm_destroy_lock_cache(void) { - if (dlm_lock_cache) - kmem_cache_destroy(dlm_lock_cache); + kmem_cache_destroy(dlm_lock_cache); } /* Tell us whether we can grant a new lock request. @@ -91,19 +73,14 @@ void dlm_destroy_lock_cache(void) static int dlm_can_grant_new_lock(struct dlm_lock_resource *res, struct dlm_lock *lock) { - struct list_head *iter; struct dlm_lock *tmplock; - list_for_each(iter, &res->granted) { - tmplock = list_entry(iter, struct dlm_lock, list); - + list_for_each_entry(tmplock, &res->granted, list) { if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type)) return 0; } - list_for_each(iter, &res->converting) { - tmplock = list_entry(iter, struct dlm_lock, list); - + list_for_each_entry(tmplock, &res->converting, list) { if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type)) return 0; if (!dlm_lock_compatible(tmplock->ml.convert_type, diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 33ecbe0e6734..4145e06d2c08 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -1,27 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * dlmmod.c * * standalone DLM module * * Copyright (C) 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * */ @@ -38,11 +21,11 @@ #include <linux/inet.h> #include <linux/spinlock.h> #include <linux/delay.h> +#include <linux/string_choices.h> - -#include "cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" @@ -50,7 +33,7 @@ #include "dlmdebug.h" #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER) -#include "cluster/masklog.h" +#include "../cluster/masklog.h" static void dlm_mle_node_down(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle, @@ -82,9 +65,9 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm, return 1; } -static struct kmem_cache *dlm_lockres_cache = NULL; -static struct kmem_cache *dlm_lockname_cache = NULL; -static struct kmem_cache *dlm_mle_cache = NULL; +static struct kmem_cache *dlm_lockres_cache; +static struct kmem_cache *dlm_lockname_cache; +static struct kmem_cache *dlm_mle_cache; static void dlm_mle_release(struct kref *kref); static void dlm_init_mle(struct dlm_master_list_entry *mle, @@ -233,7 +216,7 @@ static void __dlm_put_mle(struct dlm_master_list_entry *mle) assert_spin_locked(&dlm->spinlock); assert_spin_locked(&dlm->master_lock); - if (!atomic_read(&mle->mle_refs.refcount)) { + if (!kref_read(&mle->mle_refs)) { /* this may or may not crash, but who cares. * it's a BUG. */ mlog(ML_ERROR, "bad mle: %p\n", mle); @@ -275,12 +258,12 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle, mle->type = type; INIT_HLIST_NODE(&mle->master_hash_node); INIT_LIST_HEAD(&mle->hb_events); - memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); + bitmap_zero(mle->maybe_map, O2NM_MAX_NODES); spin_lock_init(&mle->spinlock); init_waitqueue_head(&mle->wq); atomic_set(&mle->woken, 0); kref_init(&mle->mle_refs); - memset(mle->response_map, 0, sizeof(mle->response_map)); + bitmap_zero(mle->response_map, O2NM_MAX_NODES); mle->master = O2NM_MAX_NODES; mle->new_master = O2NM_MAX_NODES; mle->inuse = 0; @@ -307,8 +290,8 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle, atomic_inc(&dlm->mle_cur_count[mle->type]); /* copy off the node_map and register hb callbacks on our copy */ - memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map)); - memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map)); + bitmap_copy(mle->node_map, dlm->domain_map, O2NM_MAX_NODES); + bitmap_copy(mle->vote_map, dlm->domain_map, O2NM_MAX_NODES); clear_bit(dlm->node_num, mle->vote_map); clear_bit(dlm->node_num, mle->node_map); @@ -342,16 +325,13 @@ static int dlm_find_mle(struct dlm_ctxt *dlm, { struct dlm_master_list_entry *tmpmle; struct hlist_head *bucket; - struct hlist_node *list; unsigned int hash; assert_spin_locked(&dlm->master_lock); hash = dlm_lockid_hash(name, namelen); bucket = dlm_master_hash(dlm, hash); - hlist_for_each(list, bucket) { - tmpmle = hlist_entry(list, struct dlm_master_list_entry, - master_hash_node); + hlist_for_each_entry(tmpmle, bucket, master_hash_node) { if (!dlm_mle_equal(dlm, tmpmle, name, namelen)) continue; dlm_get_mle(tmpmle); @@ -417,8 +397,7 @@ int dlm_init_mle_cache(void) void dlm_destroy_mle_cache(void) { - if (dlm_mle_cache) - kmem_cache_destroy(dlm_mle_cache); + kmem_cache_destroy(dlm_mle_cache); } static void dlm_mle_release(struct kref *kref) @@ -475,11 +454,11 @@ bail: void dlm_destroy_master_caches(void) { - if (dlm_lockname_cache) - kmem_cache_destroy(dlm_lockname_cache); + kmem_cache_destroy(dlm_lockname_cache); + dlm_lockname_cache = NULL; - if (dlm_lockres_cache) - kmem_cache_destroy(dlm_lockres_cache); + kmem_cache_destroy(dlm_lockres_cache); + dlm_lockres_cache = NULL; } static void dlm_lockres_release(struct kref *kref) @@ -497,16 +476,6 @@ static void dlm_lockres_release(struct kref *kref) mlog(0, "destroying lockres %.*s\n", res->lockname.len, res->lockname.name); - spin_lock(&dlm->track_lock); - if (!list_empty(&res->tracking)) - list_del_init(&res->tracking); - else { - mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n", - res->lockname.len, res->lockname.name); - dlm_print_one_lock_resource(res); - } - spin_unlock(&dlm->track_lock); - atomic_dec(&dlm->res_cur_count); if (!hlist_unhashed(&res->hash_node) || @@ -580,6 +549,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, atomic_set(&res->asts_reserved, 0); res->migration_pending = 0; res->inflight_locks = 0; + res->inflight_assert_workers = 0; res->dlm = dlm; @@ -597,12 +567,12 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, res->last_used = 0; - spin_lock(&dlm->spinlock); + spin_lock(&dlm->track_lock); list_add_tail(&res->tracking, &dlm->tracking_list); - spin_unlock(&dlm->spinlock); + spin_unlock(&dlm->track_lock); memset(res->lvb, 0, DLM_LVB_LEN); - memset(res->refmap, 0, sizeof(res->refmap)); + bitmap_zero(res->refmap, O2NM_MAX_NODES); } struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, @@ -623,9 +593,6 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, return res; error: - if (res && res->lockname.name) - kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name); - if (res) kmem_cache_free(dlm_lockres_cache, res); return NULL; @@ -653,12 +620,9 @@ void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm, clear_bit(bit, res->refmap); } - -void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, +static void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { - assert_spin_locked(&res->spinlock); - res->inflight_locks++; mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name, @@ -666,6 +630,13 @@ void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, __builtin_return_address(0)); } +void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + assert_spin_locked(&res->spinlock); + __dlm_lockres_grab_inflight_ref(dlm, res); +} + void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { @@ -682,6 +653,35 @@ void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, wake_up(&res->wq); } +void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + assert_spin_locked(&res->spinlock); + res->inflight_assert_workers++; + mlog(0, "%s:%.*s: inflight assert worker++: now %u\n", + dlm->name, res->lockname.len, res->lockname.name, + res->inflight_assert_workers); +} + +static void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + assert_spin_locked(&res->spinlock); + BUG_ON(res->inflight_assert_workers == 0); + res->inflight_assert_workers--; + mlog(0, "%s:%.*s: inflight assert worker--: now %u\n", + dlm->name, res->lockname.len, res->lockname.name, + res->inflight_assert_workers); +} + +static void dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + spin_lock(&res->spinlock); + __dlm_lockres_drop_inflight_worker(dlm, res); + spin_unlock(&res->spinlock); +} + /* * lookup a lock resource by name. * may already exist in the hashtable. @@ -725,6 +725,19 @@ lookup: if (tmpres) { spin_unlock(&dlm->spinlock); spin_lock(&tmpres->spinlock); + + /* + * Right after dlm spinlock was released, dlm_thread could have + * purged the lockres. Check if lockres got unhashed. If so + * start over. + */ + if (hlist_unhashed(&tmpres->hash_node)) { + spin_unlock(&tmpres->spinlock); + dlm_lockres_put(tmpres); + tmpres = NULL; + goto lookup; + } + /* Wait on the thread that is mastering the resource */ if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { __dlm_wait_on_lockres(tmpres); @@ -750,8 +763,18 @@ lookup: dlm_lockres_grab_inflight_ref(dlm, tmpres); spin_unlock(&tmpres->spinlock); - if (res) + if (res) { + spin_lock(&dlm->track_lock); + if (!list_empty(&res->tracking)) + list_del_init(&res->tracking); + else + mlog(ML_ERROR, "Resource %.*s not " + "on the Tracking list\n", + res->lockname.len, + res->lockname.name); + spin_unlock(&dlm->track_lock); dlm_lockres_put(res); + } res = tmpres; goto leave; } @@ -838,7 +861,7 @@ lookup: * to see if there are any nodes that still need to be * considered. these will not appear in the mle nodemap * but they might own this lockres. wait on them. */ - bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); + bit = find_first_bit(dlm->recovery_map, O2NM_MAX_NODES); if (bit < O2NM_MAX_NODES) { mlog(0, "%s: res %.*s, At least one node (%d) " "to recover before lock mastery can begin\n", @@ -855,10 +878,8 @@ lookup: /* finally add the lockres to its hash bucket */ __dlm_insert_lockres(dlm, res); - /* Grab inflight ref to pin the resource */ - spin_lock(&res->spinlock); - dlm_lockres_grab_inflight_ref(dlm, res); - spin_unlock(&res->spinlock); + /* since this lockres is new it doesn't not require the spinlock */ + __dlm_lockres_grab_inflight_ref(dlm, res); /* get an extra ref on the mle in case this is a BLOCK * if so, the creator of the BLOCK may try to put the last @@ -891,7 +912,7 @@ redo_request: dlm_wait_for_recovery(dlm); spin_lock(&dlm->spinlock); - bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); + bit = find_first_bit(dlm->recovery_map, O2NM_MAX_NODES); if (bit < O2NM_MAX_NODES) { mlog(0, "%s: res %.*s, At least one node (%d) " "to recover before lock mastery can begin\n", @@ -1015,10 +1036,10 @@ recheck: spin_lock(&mle->spinlock); m = mle->master; - map_changed = (memcmp(mle->vote_map, mle->node_map, - sizeof(mle->vote_map)) != 0); - voting_done = (memcmp(mle->vote_map, mle->response_map, - sizeof(mle->vote_map)) == 0); + map_changed = !bitmap_equal(mle->vote_map, mle->node_map, + O2NM_MAX_NODES); + voting_done = bitmap_equal(mle->vote_map, mle->response_map, + O2NM_MAX_NODES); /* restart if we hit any errors */ if (map_changed) { @@ -1058,7 +1079,7 @@ recheck: sleep = 1; /* have all nodes responded? */ if (voting_done && !*blocked) { - bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); + bit = find_first_bit(mle->maybe_map, O2NM_MAX_NODES); if (dlm->node_num <= bit) { /* my node number is lowest. * now tell other nodes that I am @@ -1079,13 +1100,6 @@ recheck: /* sleep if we haven't finished voting yet */ if (sleep) { unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS); - - /* - if (atomic_read(&mle->mle_refs.refcount) < 2) - mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle, - atomic_read(&mle->mle_refs.refcount), - res->lockname.len, res->lockname.name); - */ atomic_set(&mle->woken, 0); (void)wait_event_timeout(mle->wq, (atomic_read(&mle->woken) == 1), @@ -1220,8 +1234,8 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, } else { mlog(ML_ERROR, "node down! %d\n", node); if (blocked) { - int lowest = find_next_bit(mle->maybe_map, - O2NM_MAX_NODES, 0); + int lowest = find_first_bit(mle->maybe_map, + O2NM_MAX_NODES); /* act like it was never there */ clear_bit(node, mle->maybe_map); @@ -1263,11 +1277,11 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, /* now blank out everything, as if we had never * contacted anyone */ - memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); - memset(mle->response_map, 0, sizeof(mle->response_map)); + bitmap_zero(mle->maybe_map, O2NM_MAX_NODES); + bitmap_zero(mle->response_map, O2NM_MAX_NODES); /* reset the vote_map to the current node_map */ - memcpy(mle->vote_map, mle->node_map, - sizeof(mle->node_map)); + bitmap_copy(mle->vote_map, mle->node_map, + O2NM_MAX_NODES); /* put myself into the maybe map */ if (mle->type != DLM_MLE_BLOCK) set_bit(dlm->node_num, mle->maybe_map); @@ -1396,6 +1410,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data, int found, ret; int set_maybe; int dispatch_assert = 0; + int dispatched = 0; if (!dlm_grab(dlm)) return DLM_MASTER_RESP_NO; @@ -1422,6 +1437,18 @@ way_up_top: /* take care of the easy cases up front */ spin_lock(&res->spinlock); + + /* + * Right after dlm spinlock was released, dlm_thread could have + * purged the lockres. Check if lockres got unhashed. If so + * start over. + */ + if (hlist_unhashed(&res->hash_node)) { + spin_unlock(&res->spinlock); + dlm_lockres_put(res); + goto way_up_top; + } + if (res->state & (DLM_LOCK_RES_RECOVERING| DLM_LOCK_RES_MIGRATING)) { spin_unlock(&res->spinlock); @@ -1450,7 +1477,6 @@ way_up_top: goto send_response; } else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { spin_unlock(&res->spinlock); - // mlog(0, "node %u is the master\n", res->owner); response = DLM_MASTER_RESP_NO; if (mle) kmem_cache_free(dlm_mle_cache, mle); @@ -1466,7 +1492,6 @@ way_up_top: BUG(); } - // mlog(0, "lockres is in progress...\n"); spin_lock(&dlm->master_lock); found = dlm_find_mle(dlm, &tmpmle, name, namelen); if (!found) { @@ -1476,8 +1501,6 @@ way_up_top: set_maybe = 1; spin_lock(&tmpmle->spinlock); if (tmpmle->type == DLM_MLE_BLOCK) { - // mlog(0, "this node is waiting for " - // "lockres to be mastered\n"); response = DLM_MASTER_RESP_NO; } else if (tmpmle->type == DLM_MLE_MIGRATION) { mlog(0, "node %u is master, but trying to migrate to " @@ -1504,8 +1527,6 @@ way_up_top: } else response = DLM_MASTER_RESP_NO; } else { - // mlog(0, "this node is attempting to " - // "master lockres\n"); response = DLM_MASTER_RESP_MAYBE; } if (set_maybe) @@ -1532,7 +1553,6 @@ way_up_top: found = dlm_find_mle(dlm, &tmpmle, name, namelen); if (!found) { /* this lockid has never been seen on this node yet */ - // mlog(0, "no mle found\n"); if (!mle) { spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); @@ -1546,15 +1566,11 @@ way_up_top: goto way_up_top; } - // mlog(0, "this is second time thru, already allocated, " - // "add the block.\n"); dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen); set_bit(request->node_idx, mle->maybe_map); __dlm_insert_mle(dlm, mle); response = DLM_MASTER_RESP_NO; } else { - // mlog(0, "mle was found\n"); - set_maybe = 1; spin_lock(&tmpmle->spinlock); if (tmpmle->master == dlm->node_num) { mlog(ML_ERROR, "no lockres, but an mle with this node as master!\n"); @@ -1569,8 +1585,7 @@ way_up_top: response = DLM_MASTER_RESP_NO; } else response = DLM_MASTER_RESP_MAYBE; - if (set_maybe) - set_bit(request->node_idx, tmpmle->maybe_map); + set_bit(request->node_idx, tmpmle->maybe_map); spin_unlock(&tmpmle->spinlock); } spin_unlock(&dlm->master_lock); @@ -1588,27 +1603,28 @@ send_response: * dlm_assert_master_worker() isn't called, we drop it here. */ if (dispatch_assert) { - if (response != DLM_MASTER_RESP_YES) - mlog(ML_ERROR, "invalid response %d\n", response); - if (!res) { - mlog(ML_ERROR, "bad lockres while trying to assert!\n"); - BUG(); - } mlog(0, "%u is the owner of %.*s, cleaning everyone else\n", dlm->node_num, res->lockname.len, res->lockname.name); + spin_lock(&res->spinlock); ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx, DLM_ASSERT_MASTER_MLE_CLEANUP); if (ret < 0) { mlog(ML_ERROR, "failed to dispatch assert master work\n"); response = DLM_MASTER_RESP_ERROR; + spin_unlock(&res->spinlock); dlm_lockres_put(res); + } else { + dispatched = 1; + __dlm_lockres_grab_inflight_worker(dlm, res); + spin_unlock(&res->spinlock); } } else { if (res) dlm_lockres_put(res); } - dlm_put(dlm); + if (!dispatched) + dlm_put(dlm); return response; } @@ -1770,7 +1786,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, "MLE for it! (%.*s)\n", assert->node_idx, namelen, name); } else { - int bit = find_next_bit (mle->maybe_map, O2NM_MAX_NODES, 0); + int bit = find_first_bit(mle->maybe_map, O2NM_MAX_NODES); if (bit >= O2NM_MAX_NODES) { /* not necessarily an error, though less likely. * could be master just re-asserting. */ @@ -1872,8 +1888,6 @@ ok: spin_unlock(&res->spinlock); } - // mlog(0, "woo! got an assert_master from node %u!\n", - // assert->node_idx); if (mle) { int extra_ref = 0; int nn = -1; @@ -1888,8 +1902,10 @@ ok: * up nodes that this node contacted */ while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES, nn+1)) < O2NM_MAX_NODES) { - if (nn != dlm->node_num && nn != assert->node_idx) + if (nn != dlm->node_num && nn != assert->node_idx) { master_request = 1; + break; + } } } mle->master = assert->node_idx; @@ -1923,7 +1939,7 @@ ok: * on this mle. */ spin_lock(&dlm->master_lock); - rr = atomic_read(&mle->mle_refs.refcount); + rr = kref_read(&mle->mle_refs); if (mle->inuse > 0) { if (extra_ref && rr < 3) err = 1; @@ -1995,6 +2011,10 @@ kill: "and killing the other node now! This node is OK and can continue.\n"); __dlm_print_one_lock_resource(res); spin_unlock(&res->spinlock); + spin_lock(&dlm->master_lock); + if (mle) + __dlm_put_mle(mle); + spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); *ret_data = (void *)res; dlm_put(dlm); @@ -2026,7 +2046,6 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, /* queue up work for dlm_assert_master_worker */ - dlm_grab(dlm); /* get an extra ref for the work item */ dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL); item->u.am.lockres = res; /* already have a ref */ /* can optionally ignore node numbers higher than this node */ @@ -2064,7 +2083,7 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) flags = item->u.am.flags; spin_lock(&dlm->spinlock); - memcpy(nodemap, dlm->domain_map, sizeof(nodemap)); + bitmap_copy(nodemap, dlm->domain_map, O2NM_MAX_NODES); spin_unlock(&dlm->spinlock); clear_bit(dlm->node_num, nodemap); @@ -2115,6 +2134,8 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) dlm_lockres_release_ast(dlm, res); put: + dlm_lockres_drop_inflight_worker(dlm, res); + dlm_lockres_put(res); mlog(0, "finished with dlm_assert_master_worker\n"); @@ -2127,7 +2148,7 @@ put: * think that $RECOVERY is currently mastered by a dead node. If so, * we wait a short time to allow that node to get notified by its own * heartbeat stack, then check again. All $RECOVERY lock resources - * mastered by dead nodes are purged when the hearbeat callback is + * mastered by dead nodes are purged when the heartbeat callback is * fired, so we can know for sure that it is safe to continue once * the node returns a live node or no node. */ static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, @@ -2206,8 +2227,11 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n", dlm->name, namelen, lockname, res->owner, r); dlm_print_one_lock_resource(res); - BUG(); - } + if (r == -ENOMEM) + BUG(); + } else + ret = r; + return ret; } @@ -2275,7 +2299,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, res->lockname.len, res->lockname.name, node); dlm_print_one_lock_resource(res); } - ret = 0; + ret = DLM_DEREF_RESPONSE_DONE; goto done; } @@ -2295,7 +2319,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, spin_unlock(&dlm->work_lock); queue_work(dlm->dlm_worker, &dlm->dispatched_work); - return 0; + return DLM_DEREF_RESPONSE_INPROG; done: if (res) @@ -2305,6 +2329,102 @@ done: return ret; } +int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_deref_lockres_done *deref + = (struct dlm_deref_lockres_done *)msg->buf; + struct dlm_lock_resource *res = NULL; + char *name; + unsigned int namelen; + int ret = -EINVAL; + u8 node; + unsigned int hash; + + if (!dlm_grab(dlm)) + return 0; + + name = deref->name; + namelen = deref->namelen; + node = deref->node_idx; + + if (namelen > DLM_LOCKID_NAME_MAX) { + mlog(ML_ERROR, "Invalid name length!"); + goto done; + } + if (deref->node_idx >= O2NM_MAX_NODES) { + mlog(ML_ERROR, "Invalid node number: %u\n", node); + goto done; + } + + hash = dlm_lockid_hash(name, namelen); + + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres_full(dlm, name, namelen, hash); + if (!res) { + spin_unlock(&dlm->spinlock); + mlog(ML_ERROR, "%s:%.*s: bad lockres name\n", + dlm->name, namelen, name); + goto done; + } + + spin_lock(&res->spinlock); + if (!(res->state & DLM_LOCK_RES_DROPPING_REF)) { + spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); + mlog(ML_NOTICE, "%s:%.*s: node %u sends deref done " + "but it is already derefed!\n", dlm->name, + res->lockname.len, res->lockname.name, node); + ret = 0; + goto done; + } + + __dlm_do_purge_lockres(dlm, res); + spin_unlock(&res->spinlock); + wake_up(&res->wq); + + spin_unlock(&dlm->spinlock); + + ret = 0; +done: + if (res) + dlm_lockres_put(res); + dlm_put(dlm); + return ret; +} + +static void dlm_drop_lockres_ref_done(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, u8 node) +{ + struct dlm_deref_lockres_done deref; + int ret = 0, r; + const char *lockname; + unsigned int namelen; + + lockname = res->lockname.name; + namelen = res->lockname.len; + BUG_ON(namelen > O2NM_MAX_NAME_LEN); + + memset(&deref, 0, sizeof(deref)); + deref.node_idx = dlm->node_num; + deref.namelen = namelen; + memcpy(deref.name, lockname, namelen); + + ret = o2net_send_message(DLM_DEREF_LOCKRES_DONE, dlm->key, + &deref, sizeof(deref), node, &r); + if (ret < 0) { + mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF DONE " + " to node %u\n", dlm->name, namelen, + lockname, ret, node); + } else if (r < 0) { + /* ignore the error */ + mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n", + dlm->name, namelen, lockname, node, r); + dlm_print_one_lock_resource(res); + } +} + static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) { struct dlm_ctxt *dlm; @@ -2318,13 +2438,15 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) spin_lock(&res->spinlock); BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); + __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); if (test_bit(node, res->refmap)) { - __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); dlm_lockres_clear_refmap_bit(dlm, res, node); cleared = 1; } spin_unlock(&res->spinlock); + dlm_drop_lockres_ref_done(dlm, res, node); + if (cleared) { mlog(0, "%s:%.*s node %u ref dropped in dispatch\n", dlm->name, res->lockname.len, res->lockname.name, node); @@ -2340,13 +2462,13 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) } /* - * A migrateable resource is one that is : + * A migratable resource is one that is : * 1. locally mastered, and, * 2. zero local locks, and, * 3. one or more non-local locks, or, one or more references * Returns 1 if yes, 0 if not. */ -static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, +static int dlm_is_lockres_migratable(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { enum dlm_lockres_list idx; @@ -2357,6 +2479,15 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, assert_spin_locked(&res->spinlock); + /* delay migration when the lockres is in MIGRATING state */ + if (res->state & DLM_LOCK_RES_MIGRATING) + return 0; + + /* delay migration when the lockres is in RECOCERING state */ + if (res->state & (DLM_LOCK_RES_RECOVERING| + DLM_LOCK_RES_RECOVERY_WAITING)) + return 0; + if (res->owner != dlm->node_num) return 0; @@ -2368,7 +2499,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, continue; } cookie = be64_to_cpu(lock->ml.cookie); - mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on " + mlog(0, "%s: Not migratable res %.*s, lock %u:%llu on " "%s list\n", dlm->name, res->lockname.len, res->lockname.name, dlm_get_lock_cookie_node(cookie), @@ -2379,12 +2510,12 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, } if (!nonlocal) { - node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); + node_ref = find_first_bit(res->refmap, O2NM_MAX_NODES); if (node_ref >= O2NM_MAX_NODES) return 0; } - mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len, + mlog(0, "%s: res %.*s, Migratable\n", dlm->name, res->lockname.len, res->lockname.name); return 1; @@ -2410,8 +2541,6 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, if (!dlm_grab(dlm)) return -EINVAL; - BUG_ON(target == O2NM_MAX_NODES); - name = res->lockname.name; namelen = res->lockname.len; @@ -2441,6 +2570,13 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, spin_lock(&dlm->master_lock); ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, namelen, target, dlm->node_num); + /* get an extra reference on the mle. + * otherwise the assert_master from the new + * master will destroy this. + */ + if (ret != -EEXIST) + dlm_get_mle_inuse(mle); + spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); @@ -2466,7 +2602,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, } fail: - if (oldmle) { + if (ret != -EEXIST && oldmle) { /* master is known, detach if not already detached */ dlm_mle_detach_hb_events(dlm, oldmle); dlm_put_mle(oldmle); @@ -2476,6 +2612,7 @@ fail: if (mle_added) { dlm_mle_detach_hb_events(dlm, mle); dlm_put_mle(mle); + dlm_put_mle_inuse(mle); } else if (mle) { kmem_cache_free(dlm_mle_cache, mle); mle = NULL; @@ -2493,17 +2630,6 @@ fail: * ensure that all assert_master work is flushed. */ flush_workqueue(dlm->dlm_worker); - /* get an extra reference on the mle. - * otherwise the assert_master from the new - * master will destroy this. - * also, make sure that all callers of dlm_get_mle - * take both dlm->spinlock and dlm->master_lock */ - spin_lock(&dlm->spinlock); - spin_lock(&dlm->master_lock); - dlm_get_mle_inuse(mle); - spin_unlock(&dlm->master_lock); - spin_unlock(&dlm->spinlock); - /* notify new node and send all lock state */ /* call send_one_lockres with migration flag. * this serves as notice to the target node that a @@ -2610,8 +2736,6 @@ leave: return ret; } -#define DLM_MIGRATION_RETRY_MS 100 - /* * Should be called only after beginning the domain leave process. * There should not be any remaining locks on nonlocal lock resources, @@ -2623,6 +2747,7 @@ leave: * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped */ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) + __must_hold(&dlm->spinlock) { int ret; int lock_dropped = 0; @@ -2631,7 +2756,7 @@ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) assert_spin_locked(&dlm->spinlock); spin_lock(&res->spinlock); - if (dlm_is_lockres_migrateable(dlm, res)) + if (dlm_is_lockres_migratable(dlm, res)) target = dlm_pick_migration_target(dlm, res); spin_unlock(&res->spinlock); @@ -2723,7 +2848,7 @@ static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm, dlm_lockres_release_ast(dlm, res); mlog(0, "about to wait on migration_wq, dirty=%s\n", - res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no"); + str_yes_no(res->state & DLM_LOCK_RES_DIRTY)); /* if the extra ref we just put was the final one, this * will pass thru immediately. otherwise, we need to wait * for the last ast to finish. */ @@ -2733,12 +2858,12 @@ again: msecs_to_jiffies(1000)); if (ret < 0) { mlog(0, "woken again: migrating? %s, dead? %s\n", - res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no", - test_bit(target, dlm->domain_map) ? "no":"yes"); + str_yes_no(res->state & DLM_LOCK_RES_MIGRATING), + str_no_yes(test_bit(target, dlm->domain_map))); } else { mlog(0, "all is well: migrating? %s, dead? %s\n", - res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no", - test_bit(target, dlm->domain_map) ? "no":"yes"); + str_yes_no(res->state & DLM_LOCK_RES_MIGRATING), + str_no_yes(test_bit(target, dlm->domain_map))); } if (!dlm_migration_can_proceed(dlm, res, target)) { mlog(0, "trying again...\n"); @@ -2758,13 +2883,15 @@ again: /* * if target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for * another try; otherwise, we are sure the MIGRATING state is there, - * drop the unneded state which blocked threads trying to DIRTY + * drop the unneeded state which blocked threads trying to DIRTY */ spin_lock(&res->spinlock); BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY)); res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY; if (!ret) BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING)); + else + res->migration_pending = 0; spin_unlock(&res->spinlock); /* @@ -2839,7 +2966,7 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { enum dlm_lockres_list idx; - struct list_head *queue = &res->granted; + struct list_head *queue; struct dlm_lock *lock; int noderef; u8 nodenum = O2NM_MAX_NODES; @@ -2970,7 +3097,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, int ret = 0; if (!dlm_grab(dlm)) - return -EINVAL; + return 0; name = migrate->name; namelen = migrate->namelen; @@ -3011,6 +3138,9 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, migrate->new_master, migrate->master); + if (ret < 0) + kmem_cache_free(dlm_mle_cache, mle); + spin_unlock(&dlm->master_lock); unlock: spin_unlock(&dlm->spinlock); @@ -3061,7 +3191,8 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm, mlog(0, "tried to migrate %.*s, but some " "process beat me to it\n", namelen, name); - ret = -EEXIST; + spin_unlock(&tmp->spinlock); + return -EEXIST; } else { /* bad. 2 NODES are trying to migrate! */ mlog(ML_ERROR, "migration error mle: " @@ -3081,11 +3212,15 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm, /* remove it so that only one mle will be found */ __dlm_unlink_mle(dlm, tmp); __dlm_mle_detach_hb_events(dlm, tmp); - ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; - mlog(0, "%s:%.*s: master=%u, newmaster=%u, " - "telling master to get ref for cleared out mle " - "during migration\n", dlm->name, namelen, name, - master, new_master); + if (tmp->type == DLM_MLE_MASTER) { + ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; + mlog(0, "%s:%.*s: master=%u, newmaster=%u, " + "telling master to get ref " + "for cleared out mle during " + "migration\n", dlm->name, + namelen, name, master, + new_master); + } } spin_unlock(&tmp->spinlock); } @@ -3157,7 +3292,7 @@ static void dlm_clean_block_mle(struct dlm_ctxt *dlm, BUG_ON(mle->type != DLM_MLE_BLOCK); spin_lock(&mle->spinlock); - bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); + bit = find_first_bit(mle->maybe_map, O2NM_MAX_NODES); if (bit != dead_node) { mlog(0, "mle found, but dead node %u would not have been " "master\n", dead_node); @@ -3183,7 +3318,7 @@ void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) struct dlm_master_list_entry *mle; struct dlm_lock_resource *res; struct hlist_head *bucket; - struct hlist_node *list; + struct hlist_node *tmp; unsigned int i; mlog(0, "dlm=%s, dead node=%u\n", dlm->name, dead_node); @@ -3194,10 +3329,7 @@ top: spin_lock(&dlm->master_lock); for (i = 0; i < DLM_HASH_BUCKETS; i++) { bucket = dlm_master_hash(dlm, i); - hlist_for_each(list, bucket) { - mle = hlist_entry(list, struct dlm_master_list_entry, - master_hash_node); - + hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) { BUG_ON(mle->type != DLM_MLE_BLOCK && mle->type != DLM_MLE_MASTER && mle->type != DLM_MLE_MIGRATION); @@ -3231,6 +3363,15 @@ top: mle->new_master != dead_node) continue; + if (mle->new_master == dead_node && mle->inuse) { + mlog(ML_NOTICE, "%s: target %u died during " + "migration from %u, the MLE is " + "still keep used, ignore it!\n", + dlm->name, dead_node, + mle->master); + continue; + } + /* If we have reached this point, this mle needs to be * removed from the list and freed. */ dlm_clean_migration_mle(dlm, mle); @@ -3295,7 +3436,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, ret = 0; } - memset(iter.node_map, 0, sizeof(iter.node_map)); + bitmap_zero(iter.node_map, O2NM_MAX_NODES); set_bit(old_master, iter.node_map); mlog(0, "doing assert master of %.*s back to %u\n", res->lockname.len, res->lockname.name, old_master); @@ -3378,7 +3519,7 @@ void dlm_force_free_mles(struct dlm_ctxt *dlm) int i; struct hlist_head *bucket; struct dlm_master_list_entry *mle; - struct hlist_node *tmp, *list; + struct hlist_node *tmp; /* * We notified all other nodes that we are exiting the domain and @@ -3390,13 +3531,11 @@ void dlm_force_free_mles(struct dlm_ctxt *dlm) spin_lock(&dlm->master_lock); BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING); - BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES)); + BUG_ON((find_first_bit(dlm->domain_map, O2NM_MAX_NODES) < O2NM_MAX_NODES)); for (i = 0; i < DLM_HASH_BUCKETS; i++) { bucket = dlm_master_hash(dlm, i); - hlist_for_each_safe(list, tmp, bucket) { - mle = hlist_entry(list, struct dlm_master_list_entry, - master_hash_node); + hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) { if (mle->type != DLM_MLE_BLOCK) { mlog(ML_ERROR, "bad mle: %p\n", mle); dlm_print_one_mle(mle); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 773bd32bfd8c..843ee02bd85f 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1,27 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * dlmrecovery.c * * recovery stuff * * Copyright (C) 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * */ @@ -39,18 +22,18 @@ #include <linux/timer.h> #include <linux/kthread.h> #include <linux/delay.h> +#include <linux/string_choices.h> - -#include "cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" #include "dlmdomain.h" #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_RECOVERY) -#include "cluster/masklog.h" +#include "../cluster/masklog.h" static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node); @@ -62,7 +45,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node); static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node); static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from, u8 dead_node); -static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node); +static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm); static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res); static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres, @@ -141,13 +124,6 @@ static inline void __dlm_reset_recovery(struct dlm_ctxt *dlm) dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM); } -static inline void dlm_reset_recovery(struct dlm_ctxt *dlm) -{ - spin_lock(&dlm->spinlock); - __dlm_reset_recovery(dlm); - spin_unlock(&dlm->spinlock); -} - /* Worker function used during recovery. */ void dlm_dispatch_work(struct work_struct *work) { @@ -205,7 +181,7 @@ int dlm_launch_recovery_thread(struct dlm_ctxt *dlm) mlog(0, "starting dlm recovery thread...\n"); dlm->dlm_reco_thread_task = kthread_run(dlm_recovery_thread, dlm, - "dlm_reco_thread"); + "dlm_reco-%s", dlm->name); if (IS_ERR(dlm->dlm_reco_thread_task)) { mlog_errno(PTR_ERR(dlm->dlm_reco_thread_task)); dlm->dlm_reco_thread_task = NULL; @@ -231,7 +207,7 @@ void dlm_complete_recovery_thread(struct dlm_ctxt *dlm) * 1) all recovery threads cluster wide will work on recovering * ONE node at a time * 2) negotiate who will take over all the locks for the dead node. - * thats right... ALL the locks. + * that's right... ALL the locks. * 3) once a new master is chosen, everyone scans all locks * and moves aside those mastered by the dead guy * 4) each of these locks should be locked until recovery is done @@ -423,12 +399,11 @@ void dlm_wait_for_recovery(struct dlm_ctxt *dlm) static void dlm_begin_recovery(struct dlm_ctxt *dlm) { - spin_lock(&dlm->spinlock); + assert_spin_locked(&dlm->spinlock); BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE); printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n", dlm->name, dlm->reco.dead_node); dlm->reco.state |= DLM_RECO_STATE_ACTIVE; - spin_unlock(&dlm->spinlock); } static void dlm_end_recovery(struct dlm_ctxt *dlm) @@ -456,6 +431,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) spin_lock(&dlm->spinlock); + if (dlm->migrate_done) { + mlog(0, "%s: no need do recovery after migrating all " + "lock resources\n", dlm->name); + spin_unlock(&dlm->spinlock); + return 0; + } + /* check to see if the new master has died */ if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM && test_bit(dlm->reco.new_master, dlm->recovery_map)) { @@ -469,7 +451,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { int bit; - bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES, 0); + bit = find_first_bit(dlm->recovery_map, O2NM_MAX_NODES); if (bit >= O2NM_MAX_NODES || bit < 0) dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); else @@ -482,7 +464,6 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) } if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { - // mlog(0, "nothing to recover! sleeping now!\n"); spin_unlock(&dlm->spinlock); /* return to main thread loop and sleep. */ return 0; @@ -490,12 +471,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n", dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), dlm->reco.dead_node); - spin_unlock(&dlm->spinlock); /* take write barrier */ /* (stops the list reshuffling thread, proxy ast handling) */ dlm_begin_recovery(dlm); + spin_unlock(&dlm->spinlock); + if (dlm->reco.new_master == dlm->node_num) goto master_here; @@ -537,7 +519,10 @@ master_here: /* success! see if any other nodes need recovery */ mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n", dlm->name, dlm->reco.dead_node, dlm->node_num); - dlm_reset_recovery(dlm); + spin_lock(&dlm->spinlock); + __dlm_reset_recovery(dlm); + dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; + spin_unlock(&dlm->spinlock); } dlm_end_recovery(dlm); @@ -595,8 +580,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) msecs_to_jiffies(1000)); mlog(0, "waited 1 sec for %u, " "dead? %s\n", ndata->node_num, - dlm_is_node_dead(dlm, ndata->node_num) ? - "yes" : "no"); + str_yes_no(dlm_is_node_dead(dlm, ndata->node_num))); } else { /* -ENOMEM on the other node */ mlog(0, "%s: node %u returned " @@ -691,10 +675,18 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) spin_unlock(&dlm_reco_state_lock); mlog(0, "pass #%d, all_nodes_done?: %s\n", ++pass, - all_nodes_done?"yes":"no"); + str_yes_no(all_nodes_done)); if (all_nodes_done) { int ret; + /* Set this flag on recovery master to avoid + * a new recovery for another dead node start + * before the recovery is not done. That may + * cause recovery hung.*/ + spin_lock(&dlm->spinlock); + dlm->reco.state |= DLM_RECO_STATE_FINALIZE; + spin_unlock(&dlm->spinlock); + /* all nodes are now in DLM_RECO_NODE_DATA_DONE state * just send a finalize message to everyone and * clean up */ @@ -728,7 +720,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) } if (destroy) - dlm_destroy_recovery_area(dlm, dead_node); + dlm_destroy_recovery_area(dlm); return status; } @@ -739,7 +731,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) struct dlm_reco_node_data *ndata; spin_lock(&dlm->spinlock); - memcpy(dlm->reco.node_map, dlm->domain_map, sizeof(dlm->domain_map)); + bitmap_copy(dlm->reco.node_map, dlm->domain_map, O2NM_MAX_NODES); /* nodes can only be removed (by dying) after dropping * this lock, and death will be trapped later, so this should do */ spin_unlock(&dlm->spinlock); @@ -753,7 +745,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) ndata = kzalloc(sizeof(*ndata), GFP_NOFS); if (!ndata) { - dlm_destroy_recovery_area(dlm, dead_node); + dlm_destroy_recovery_area(dlm); return -ENOMEM; } ndata->node_num = num; @@ -767,7 +759,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) return 0; } -static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) +static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm) { struct dlm_reco_node_data *ndata, *next; LIST_HEAD(tmplist); @@ -787,6 +779,7 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from, { struct dlm_lock_request lr; int ret; + int status; mlog(0, "\n"); @@ -800,13 +793,15 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from, // send message ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key, - &lr, sizeof(lr), request_from, NULL); + &lr, sizeof(lr), request_from, &status); /* negative status is handled by caller */ if (ret < 0) mlog(ML_ERROR, "%s: Error %d send LOCK_REQUEST to node %u " "to recover dead node %u\n", dlm->name, ret, request_from, dead_node); + else + ret = status; // return from here, then // sleep until all received or error return ret; @@ -1056,6 +1051,9 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm, dead_node, dlm->name); list_del_init(&lock->list); dlm_lock_put(lock); + /* Can't schedule DLM_UNLOCK_FREE_LOCK + * - do manually */ + dlm_lock_put(lock); break; } } @@ -1100,7 +1098,7 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, { u64 mig_cookie = be64_to_cpu(mres->mig_cookie); int mres_total_locks = be32_to_cpu(mres->total_locks); - int sz, ret = 0, status = 0; + int ret = 0, status = 0; u8 orig_flags = mres->flags, orig_master = mres->master; @@ -1108,9 +1106,6 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, if (!mres->num_locks) return 0; - sz = sizeof(struct dlm_migratable_lockres) + - (mres->num_locks * sizeof(struct dlm_migratable_lock)); - /* add an all-done flag if we reached the last lock */ orig_flags = mres->flags; BUG_ON(total_locks > mres_total_locks); @@ -1124,7 +1119,8 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, /* send it */ ret = o2net_send_message(DLM_MIG_LOCKRES_MSG, dlm->key, mres, - sz, send_to, &status); + struct_size(mres, ml, mres->num_locks), + send_to, &status); if (ret < 0) { /* XXX: negative status is not handled. * this will end up killing this node. */ @@ -1356,10 +1352,20 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, char *buf = NULL; struct dlm_work_item *item = NULL; struct dlm_lock_resource *res = NULL; + unsigned int hash; if (!dlm_grab(dlm)) return -EINVAL; + if (!dlm_joined(dlm)) { + mlog(ML_ERROR, "Domain %s not joined! " + "lockres %.*s, master %u\n", + dlm->name, mres->lockname_len, + mres->lockname, mres->master); + dlm_put(dlm); + return -EINVAL; + } + BUG_ON(!(mres->flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION))); real_master = mres->master; @@ -1383,11 +1389,26 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, /* lookup the lock to see if we have a secondary queue for this * already... just add the locks in and this will have its owner * and RECOVERY flag changed when it completes. */ - res = dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len); + hash = dlm_lockid_hash(mres->lockname, mres->lockname_len); + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres_full(dlm, mres->lockname, mres->lockname_len, + hash); if (res) { /* this will get a ref on res */ /* mark it as recovering/migrating and hash it */ spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_DROPPING_REF) { + mlog(0, "%s: node is attempting to migrate " + "lockres %.*s, but marked as dropping " + " ref!\n", dlm->name, + mres->lockname_len, mres->lockname); + ret = -EINVAL; + spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); + dlm_lockres_put(res); + goto leave; + } + if (mres->flags & DLM_MRES_RECOVERY) { res->state |= DLM_LOCK_RES_RECOVERING; } else { @@ -1404,13 +1425,16 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, mres->lockname_len, mres->lockname); ret = -EFAULT; spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); dlm_lockres_put(res); goto leave; } res->state |= DLM_LOCK_RES_MIGRATING; } spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); } else { + spin_unlock(&dlm->spinlock); /* need to allocate, just like if it was * mastered here normally */ res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len); @@ -1443,7 +1467,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, * The first one is handled at the end of this function. The * other two are handled in the worker thread after locks have * been attached. Yes, we don't wait for purge time to match - * kref_init. The lockres will still have atleast one ref + * kref_init. The lockres will still have at least one ref * added because it is in the hash __dlm_insert_lockres() */ extra_refs++; @@ -1633,7 +1657,7 @@ static int dlm_lockres_master_requery(struct dlm_ctxt *dlm, int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, u8 nodenum, u8 *real_master) { - int ret = -EINVAL; + int ret; struct dlm_master_requery req; int status = DLM_LOCK_RES_OWNER_UNKNOWN; @@ -1642,14 +1666,18 @@ int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, req.namelen = res->lockname.len; memcpy(req.name, res->lockname.name, res->lockname.len); +resend: ret = o2net_send_message(DLM_MASTER_REQUERY_MSG, dlm->key, &req, sizeof(req), nodenum, &status); - /* XXX: negative status not handled properly here. */ if (ret < 0) mlog(ML_ERROR, "Error %d when sending message %u (key " "0x%x) to node %u\n", ret, DLM_MASTER_REQUERY_MSG, dlm->key, nodenum); - else { + else if (status == -ENOMEM) { + mlog_errno(status); + msleep(50); + goto resend; + } else { BUG_ON(status < 0); BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN); *real_master = (u8) (status & 0xff); @@ -1673,6 +1701,7 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, unsigned int hash; int master = DLM_LOCK_RES_OWNER_UNKNOWN; u32 flags = DLM_ASSERT_MASTER_REQUERY; + int dispatched = 0; if (!dlm_grab(dlm)) { /* since the domain has gone away on this @@ -1691,17 +1720,28 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, int ret = dlm_dispatch_assert_master(dlm, res, 0, 0, flags); if (ret < 0) { - mlog_errno(-ENOMEM); - /* retry!? */ - BUG(); + mlog_errno(ret); + spin_unlock(&res->spinlock); + dlm_lockres_put(res); + spin_unlock(&dlm->spinlock); + dlm_put(dlm); + /* sender will take care of this and retry */ + return ret; + } else { + dispatched = 1; + __dlm_lockres_grab_inflight_worker(dlm, res); + spin_unlock(&res->spinlock); } - } else /* put.. incase we are not the master */ + } else { + /* put.. in case we are not the master */ + spin_unlock(&res->spinlock); dlm_lockres_put(res); - spin_unlock(&res->spinlock); + } } spin_unlock(&dlm->spinlock); - dlm_put(dlm); + if (!dispatched) + dlm_put(dlm); return master; } @@ -1747,15 +1787,14 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, struct dlm_migratable_lockres *mres) { struct dlm_migratable_lock *ml; - struct list_head *queue; + struct list_head *queue, *iter; struct list_head *tmpq = NULL; struct dlm_lock *newlock = NULL; struct dlm_lockstatus *lksb = NULL; int ret = 0; int i, j, bad; - struct dlm_lock *lock = NULL; + struct dlm_lock *lock; u8 from = O2NM_MAX_NODES; - unsigned int added = 0; __be64 c; mlog(0, "running %d locks for this lockres\n", mres->num_locks); @@ -1771,7 +1810,6 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, spin_lock(&res->spinlock); dlm_lockres_set_refmap_bit(dlm, res, from); spin_unlock(&res->spinlock); - added++; break; } BUG_ON(ml->highest_blocked != LKM_IVMODE); @@ -1788,14 +1826,16 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, /* MIGRATION ONLY! */ BUG_ON(!(mres->flags & DLM_MRES_MIGRATION)); + lock = NULL; spin_lock(&res->spinlock); for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) { tmpq = dlm_list_idx_to_ptr(res, j); - list_for_each_entry(lock, tmpq, list) { - if (lock->ml.cookie != ml->cookie) - lock = NULL; - else + list_for_each(iter, tmpq) { + lock = list_entry(iter, + struct dlm_lock, list); + if (lock->ml.cookie == ml->cookie) break; + lock = NULL; } if (lock) break; @@ -1857,7 +1897,6 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, /* do not alter lock refcount. switching lists. */ list_move_tail(&lock->list, queue); spin_unlock(&res->spinlock); - added++; mlog(0, "just reordered a local lock!\n"); continue; @@ -1883,6 +1922,13 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, if (ml->type == LKM_NLMODE) goto skip_lvb; + /* + * If the lock is in the blocked list it can't have a valid lvb, + * so skip it + */ + if (ml->list == DLM_BLOCKED_LIST) + goto skip_lvb; + if (!dlm_lvb_is_empty(mres->lvb)) { if (lksb->flags & DLM_LKSB_PUT_LVB) { /* other node was trying to update @@ -1963,12 +2009,19 @@ skip_lvb: } if (!bad) { dlm_lock_get(newlock); - list_add_tail(&newlock->list, queue); + if (mres->flags & DLM_MRES_RECOVERY && + ml->list == DLM_CONVERTING_LIST && + newlock->ml.type > + newlock->ml.convert_type) { + /* newlock is doing downconvert, add it to the + * head of converting list */ + list_add(&newlock->list, queue); + } else + list_add_tail(&newlock->list, queue); mlog(0, "%s:%.*s: added lock for node %u, " "setting refmap bit\n", dlm->name, res->lockname.len, res->lockname.name, ml->node); dlm_lockres_set_refmap_bit(dlm, res, ml->node); - added++; } spin_unlock(&res->spinlock); } @@ -1980,11 +2033,8 @@ leave: dlm_lockres_drop_inflight_ref(dlm, res); spin_unlock(&res->spinlock); - if (ret < 0) { + if (ret < 0) mlog_errno(ret); - if (newlock) - dlm_lock_put(newlock); - } return ret; } @@ -2017,7 +2067,6 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, dlm_lock_get(lock); if (lock->convert_pending) { /* move converting lock back to granted */ - BUG_ON(i != DLM_CONVERTING_LIST); mlog(0, "node died with convert pending " "on %.*s. move back to granted list.\n", res->lockname.len, res->lockname.name); @@ -2109,6 +2158,13 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, for (i = 0; i < DLM_HASH_BUCKETS; i++) { bucket = dlm_lockres_hash(dlm, i); hlist_for_each_entry(res, bucket, hash_node) { + if (res->state & DLM_LOCK_RES_RECOVERY_WAITING) { + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_RECOVERY_WAITING; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + } + if (!(res->state & DLM_LOCK_RES_RECOVERING)) continue; @@ -2246,6 +2302,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, res->lockname.len, res->lockname.name, freed, dead_node); __dlm_print_one_lock_resource(res); } + res->state |= DLM_LOCK_RES_RECOVERY_WAITING; dlm_lockres_clear_refmap_bit(dlm, res, dead_node); } else if (test_bit(dead_node, res->refmap)) { mlog(0, "%s:%.*s: dead node %u had a ref, but had " @@ -2258,18 +2315,12 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, __dlm_dirty_lockres(dlm, res); } -/* if this node is the recovery master, and there are no - * locks for a given lockres owned by this node that are in - * either PR or EX mode, zero out the lvb before requesting. - * - */ - - static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) { struct dlm_lock_resource *res; int i; struct hlist_head *bucket; + struct hlist_node *tmp; struct dlm_lock *lock; @@ -2292,7 +2343,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) */ for (i = 0; i < DLM_HASH_BUCKETS; i++) { bucket = dlm_lockres_hash(dlm, i); - hlist_for_each_entry(res, bucket, hash_node) { + hlist_for_each_entry_safe(res, tmp, bucket, hash_node) { /* always prune any $RECOVERY entries for dead nodes, * otherwise hangs can occur during later recovery */ if (dlm_is_recovery_lock(res->lockname.name, @@ -2306,9 +2357,24 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) dead_node, dlm->name); list_del_init(&lock->list); dlm_lock_put(lock); + /* Can't schedule + * DLM_UNLOCK_FREE_LOCK + * - do manually */ + dlm_lock_put(lock); break; } } + + if ((res->owner == dead_node) && + (res->state & DLM_LOCK_RES_DROPPING_REF)) { + dlm_lockres_get(res); + __dlm_do_purge_lockres(dlm, res); + spin_unlock(&res->spinlock); + wake_up(&res->wq); + dlm_lockres_put(res); + continue; + } else if (res->owner == dlm->node_num) + dlm_lockres_clear_refmap_bit(dlm, res, dead_node); spin_unlock(&res->spinlock); continue; } @@ -2317,17 +2383,31 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) dlm_revalidate_lvb(dlm, res, dead_node); if (res->owner == dead_node) { if (res->state & DLM_LOCK_RES_DROPPING_REF) { - mlog(ML_NOTICE, "%s: res %.*s, Skip " - "recovery as it is being freed\n", - dlm->name, res->lockname.len, - res->lockname.name); - } else - dlm_move_lockres_to_recovery_list(dlm, - res); - + mlog(0, "%s:%.*s: owned by " + "dead node %u, this node was " + "dropping its ref when master died. " + "continue, purging the lockres.\n", + dlm->name, res->lockname.len, + res->lockname.name, dead_node); + dlm_lockres_get(res); + __dlm_do_purge_lockres(dlm, res); + spin_unlock(&res->spinlock); + wake_up(&res->wq); + dlm_lockres_put(res); + continue; + } + dlm_move_lockres_to_recovery_list(dlm, res); } else if (res->owner == dlm->node_num) { dlm_free_dead_locks(dlm, res, dead_node); __dlm_lockres_calc_usage(dlm, res); + } else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { + if (test_bit(dead_node, res->refmap)) { + mlog(0, "%s:%.*s: dead node %u had a ref, but had " + "no locks and had not purged before dying\n", + dlm->name, res->lockname.len, + res->lockname.name, dead_node); + dlm_lockres_clear_refmap_bit(dlm, res, dead_node); + } } spin_unlock(&res->spinlock); } @@ -2391,11 +2471,7 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx) * perhaps later we can genericize this for other waiters. */ wake_up(&dlm->migration_wq); - if (test_bit(idx, dlm->recovery_map)) - mlog(0, "domain %s, node %u already added " - "to recovery map!\n", dlm->name, idx); - else - set_bit(idx, dlm->recovery_map); + set_bit(idx, dlm->recovery_map); } void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data) @@ -2555,7 +2631,7 @@ again: dlm_reco_master_ready(dlm), msecs_to_jiffies(1000)); if (!dlm_reco_master_ready(dlm)) { - mlog(0, "%s: reco master taking awhile\n", + mlog(0, "%s: reco master taking a while\n", dlm->name); goto again; } @@ -2620,7 +2696,6 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node) continue; } retry: - ret = -EINVAL; mlog(0, "attempting to send begin reco msg to %d\n", nodenum); ret = o2net_send_message(DLM_BEGIN_RECO_MSG, dlm->key, @@ -2864,12 +2939,10 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data, BUG(); } dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; + __dlm_reset_recovery(dlm); spin_unlock(&dlm->spinlock); - dlm_reset_recovery(dlm); dlm_kick_recovery_thread(dlm); break; - default: - BUG(); } mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n", diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index e73c833fc2a1..eedf07ca23ca 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -1,27 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * dlmthread.c * * standalone DLM module * * Copyright (C) 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * */ @@ -40,22 +23,20 @@ #include <linux/delay.h> -#include "cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" #include "dlmdomain.h" #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_THREAD) -#include "cluster/masklog.h" +#include "../cluster/masklog.h" static int dlm_thread(void *data); static void dlm_flush_asts(struct dlm_ctxt *dlm); -#define dlm_lock_is_remote(dlm, lock) ((lock)->ml.node != (dlm)->node_num) - /* will exit holding res->spinlock, but may drop in function */ /* waits until flags are cleared on res->state */ void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags) @@ -106,11 +87,12 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res) if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY) return 0; - if (res->state & DLM_LOCK_RES_RECOVERING) + if (res->state & (DLM_LOCK_RES_RECOVERING| + DLM_LOCK_RES_RECOVERY_WAITING)) return 0; /* Another node has this resource with this node as the master */ - bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); + bit = find_first_bit(res->refmap, O2NM_MAX_NODES); if (bit < O2NM_MAX_NODES) return 0; @@ -159,6 +141,52 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, spin_unlock(&dlm->spinlock); } +/* + * Do the real purge work: + * unhash the lockres, and + * clear flag DLM_LOCK_RES_DROPPING_REF. + * It requires dlm and lockres spinlock to be taken. + */ +void __dlm_do_purge_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); + + if (!list_empty(&res->purge)) { + mlog(0, "%s: Removing res %.*s from purgelist\n", + dlm->name, res->lockname.len, res->lockname.name); + list_del_init(&res->purge); + dlm_lockres_put(res); + dlm->purge_count--; + } + + if (!__dlm_lockres_unused(res)) { + mlog(ML_ERROR, "%s: res %.*s in use after deref\n", + dlm->name, res->lockname.len, res->lockname.name); + __dlm_print_one_lock_resource(res); + BUG(); + } + + __dlm_unhash_lockres(dlm, res); + + spin_lock(&dlm->track_lock); + if (!list_empty(&res->tracking)) + list_del_init(&res->tracking); + else { + mlog(ML_ERROR, "%s: Resource %.*s not on the Tracking list\n", + dlm->name, res->lockname.len, res->lockname.name); + __dlm_print_one_lock_resource(res); + } + spin_unlock(&dlm->track_lock); + + /* + * lockres is not in the hash now. drop the flag and wake up + * any processes waiting in dlm_get_lock_resource. + */ + res->state &= ~DLM_LOCK_RES_DROPPING_REF; +} + static void dlm_purge_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { @@ -174,6 +202,13 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm, res->lockname.len, res->lockname.name, master); if (!master) { + if (res->state & DLM_LOCK_RES_DROPPING_REF) { + mlog(ML_NOTICE, "%s: res %.*s already in DLM_LOCK_RES_DROPPING_REF state\n", + dlm->name, res->lockname.len, res->lockname.name); + spin_unlock(&res->spinlock); + return; + } + res->state |= DLM_LOCK_RES_DROPPING_REF; /* drop spinlock... retake below */ spin_unlock(&res->spinlock); @@ -202,6 +237,13 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm, dlm->purge_count--; } + if (!master && ret == DLM_DEREF_RESPONSE_INPROG) { + mlog(0, "%s: deref %.*s in progress\n", + dlm->name, res->lockname.len, res->lockname.name); + spin_unlock(&res->spinlock); + return; + } + if (!__dlm_lockres_unused(res)) { mlog(ML_ERROR, "%s: res %.*s in use after deref\n", dlm->name, res->lockname.len, res->lockname.name); @@ -211,6 +253,16 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm, __dlm_unhash_lockres(dlm, res); + spin_lock(&dlm->track_lock); + if (!list_empty(&res->tracking)) + list_del_init(&res->tracking); + else { + mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n", + res->lockname.len, res->lockname.name); + __dlm_print_one_lock_resource(res); + } + spin_unlock(&dlm->track_lock); + /* lockres is not in the hash now. drop the flag and wake up * any processes waiting in dlm_get_lock_resource. */ if (!master) { @@ -259,12 +311,15 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm, * refs on it. */ unused = __dlm_lockres_unused(lockres); if (!unused || - (lockres->state & DLM_LOCK_RES_MIGRATING)) { + (lockres->state & DLM_LOCK_RES_MIGRATING) || + (lockres->inflight_assert_workers != 0)) { mlog(0, "%s: res %.*s is in use or being remastered, " - "used %d, state %d\n", dlm->name, - lockres->lockname.len, lockres->lockname.name, - !unused, lockres->state); - list_move_tail(&dlm->purge_list, &lockres->purge); + "used %d, state %d, assert master workers %u\n", + dlm->name, lockres->lockname.len, + lockres->lockname.name, + !unused, lockres->state, + lockres->inflight_assert_workers); + list_move_tail(&lockres->purge, &dlm->purge_list); spin_unlock(&lockres->spinlock); continue; } @@ -286,8 +341,6 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { struct dlm_lock *lock, *target; - struct list_head *iter; - struct list_head *head; int can_grant = 1; /* @@ -314,9 +367,7 @@ converting: dlm->name, res->lockname.len, res->lockname.name); BUG(); } - head = &res->granted; - list_for_each(iter, head) { - lock = list_entry(iter, struct dlm_lock, list); + list_for_each_entry(lock, &res->granted, list) { if (lock==target) continue; if (!dlm_lock_compatible(lock->ml.type, @@ -333,9 +384,8 @@ converting: target->ml.convert_type; } } - head = &res->converting; - list_for_each(iter, head) { - lock = list_entry(iter, struct dlm_lock, list); + + list_for_each_entry(lock, &res->converting, list) { if (lock==target) continue; if (!dlm_lock_compatible(lock->ml.type, @@ -384,9 +434,7 @@ blocked: goto leave; target = list_entry(res->blocked.next, struct dlm_lock, list); - head = &res->granted; - list_for_each(iter, head) { - lock = list_entry(iter, struct dlm_lock, list); + list_for_each_entry(lock, &res->granted, list) { if (lock==target) continue; if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) { @@ -400,9 +448,7 @@ blocked: } } - head = &res->converting; - list_for_each(iter, head) { - lock = list_entry(iter, struct dlm_lock, list); + list_for_each_entry(lock, &res->converting, list) { if (lock==target) continue; if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) { @@ -466,7 +512,7 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) assert_spin_locked(&res->spinlock); /* don't shuffle secondary queues */ - if ((res->owner == dlm->node_num)) { + if (res->owner == dlm->node_num) { if (res->state & (DLM_LOCK_RES_MIGRATING | DLM_LOCK_RES_BLOCK_DIRTY)) return; @@ -489,7 +535,8 @@ int dlm_launch_thread(struct dlm_ctxt *dlm) { mlog(0, "Starting dlm_thread...\n"); - dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread"); + dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm-%s", + dlm->name); if (IS_ERR(dlm->dlm_thread_task)) { mlog_errno(PTR_ERR(dlm->dlm_thread_task)); dlm->dlm_thread_task = NULL; @@ -629,7 +676,6 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm) #define DLM_THREAD_TIMEOUT_MS (4 * 1000) #define DLM_THREAD_MAX_DIRTY 100 -#define DLM_THREAD_MAX_ASTS 10 static int dlm_thread(void *data) { @@ -695,7 +741,8 @@ static int dlm_thread(void *data) * dirty for a short while. */ BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); if (res->state & (DLM_LOCK_RES_IN_PROGRESS | - DLM_LOCK_RES_RECOVERING)) { + DLM_LOCK_RES_RECOVERING | + DLM_LOCK_RES_RECOVERY_WAITING)) { /* move it to the tail and keep going */ res->state &= ~DLM_LOCK_RES_DIRTY; spin_unlock(&res->spinlock); diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 850aa7e87537..7318e4794ef9 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -1,27 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * dlmunlock.c * * underlying calls for unlocking locks * * Copyright (C) 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * */ @@ -38,15 +21,15 @@ #include <linux/spinlock.h> #include <linux/delay.h> -#include "cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" #define MLOG_MASK_PREFIX ML_DLM -#include "cluster/masklog.h" +#include "../cluster/masklog.h" #define DLM_UNLOCK_FREE_LOCK 0x00000001 #define DLM_UNLOCK_CALL_AST 0x00000002 @@ -105,7 +88,8 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, enum dlm_status status; int actions = 0; int in_use; - u8 owner; + u8 owner; + int recovery_wait = 0; mlog(0, "master_node = %d, valblk = %d\n", master_node, flags & LKM_VALBLK); @@ -191,7 +175,9 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, DLM_UNLOCK_CLEAR_CONVERT_TYPE); } else if (status == DLM_RECOVERING || status == DLM_MIGRATING || - status == DLM_FORWARD) { + status == DLM_FORWARD || + status == DLM_NOLOCKMGR + ) { /* must clear the actions because this unlock * is about to be retried. cannot free or do * any list manipulation. */ @@ -200,14 +186,18 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, res->lockname.name, status==DLM_RECOVERING?"recovering": (status==DLM_MIGRATING?"migrating": - "forward")); + (status == DLM_FORWARD ? "forward" : + "nolockmanager"))); actions = 0; } if (flags & LKM_CANCEL) lock->cancel_pending = 0; - else - lock->unlock_pending = 0; - + else { + if (!lock->unlock_pending) + recovery_wait = 1; + else + lock->unlock_pending = 0; + } } /* get an extra ref on lock. if we are just switching @@ -241,6 +231,17 @@ leave: spin_unlock(&res->spinlock); wake_up(&res->wq); + if (recovery_wait) { + spin_lock(&res->spinlock); + /* Unlock request will directly succeed after owner dies, + * and the lock is already removed from grant list. We have to + * wait for RECOVERING done or we miss the chance to purge it + * since the removement is much faster than RECOVERING proc. + */ + __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_RECOVERING); + spin_unlock(&res->spinlock); + } + /* let the caller's final dlm_lock_put handle the actual kfree */ if (actions & DLM_UNLOCK_FREE_LOCK) { /* this should always be coupled with list removal */ @@ -248,7 +249,7 @@ leave: mlog(0, "lock %u:%llu should be gone now! refs=%d\n", dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), - atomic_read(&lock->lock_refs.refcount)-1); + kref_read(&lock->lock_refs)-1); dlm_lock_put(lock); } if (actions & DLM_UNLOCK_CALL_AST) @@ -364,7 +365,10 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm, * updated state to the recovery master. this thread * just needs to finish out the operation and call * the unlockast. */ - ret = DLM_NORMAL; + if (dlm_is_node_dead(dlm, owner)) + ret = DLM_NORMAL; + else + ret = DLM_NOLOCKMGR; } else { /* something bad. this will BUG in ocfs2 */ ret = dlm_err_to_dlm_status(tmpret); @@ -388,10 +392,9 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, struct dlm_ctxt *dlm = data; struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf; struct dlm_lock_resource *res = NULL; - struct list_head *iter; - struct dlm_lock *lock = NULL; + struct dlm_lock *lock = NULL, *iter; enum dlm_status status = DLM_NORMAL; - int found = 0, i; + int i; struct dlm_lockstatus *lksb = NULL; int ignore; u32 flags; @@ -416,7 +419,7 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, } if (!dlm_grab(dlm)) - return DLM_REJECTED; + return DLM_FORWARD; mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), "Domain %s not fully joined!\n", dlm->name); @@ -434,7 +437,6 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, } queue=&res->granted; - found = 0; spin_lock(&res->spinlock); if (res->state & DLM_LOCK_RES_RECOVERING) { spin_unlock(&res->spinlock); @@ -458,22 +460,21 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, } for (i=0; i<3; i++) { - list_for_each(iter, queue) { - lock = list_entry(iter, struct dlm_lock, list); - if (lock->ml.cookie == unlock->cookie && - lock->ml.node == unlock->node_idx) { - dlm_lock_get(lock); - found = 1; + list_for_each_entry(iter, queue, list) { + if (iter->ml.cookie == unlock->cookie && + iter->ml.node == unlock->node_idx) { + dlm_lock_get(iter); + lock = iter; break; } } - if (found) + if (lock) break; /* scan granted -> converting -> blocked queues */ queue++; } spin_unlock(&res->spinlock); - if (!found) { + if (!lock) { status = DLM_IVLOCKID; goto not_found; } @@ -503,7 +504,7 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, dlm_kick_thread(dlm, res); not_found: - if (!found) + if (!lock) mlog(ML_ERROR, "failed to find lock to unlock! " "cookie=%u:%llu\n", dlm_get_lock_cookie_node(be64_to_cpu(unlock->cookie)), @@ -640,7 +641,9 @@ retry: if (status == DLM_RECOVERING || status == DLM_MIGRATING || - status == DLM_FORWARD) { + status == DLM_FORWARD || + status == DLM_NOLOCKMGR) { + /* We want to go away for a tiny bit to allow recovery * / migration to complete on this resource. I don't * know of any wait queue we could sleep on as this @@ -652,7 +655,7 @@ retry: msleep(50); mlog(0, "retrying unlock due to pending recovery/" - "migration/in-progress\n"); + "migration/in-progress/reconnect\n"); goto retry; } diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c deleted file mode 100644 index dfc0da4d158d..000000000000 --- a/fs/ocfs2/dlm/dlmver.c +++ /dev/null @@ -1,42 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmver.c - * - * version string - * - * Copyright (C) 2002, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/module.h> -#include <linux/kernel.h> - -#include "dlmver.h" - -#define DLM_BUILD_VERSION "1.5.0" - -#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION - -void dlm_print_version(void) -{ - printk(KERN_INFO "%s\n", VERSION_STR); -} - -MODULE_DESCRIPTION(VERSION_STR); - -MODULE_VERSION(DLM_BUILD_VERSION); diff --git a/fs/ocfs2/dlm/dlmver.h b/fs/ocfs2/dlm/dlmver.h deleted file mode 100644 index f674aee77a16..000000000000 --- a/fs/ocfs2/dlm/dlmver.h +++ /dev/null @@ -1,31 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmfsver.h - * - * Function prototypes - * - * Copyright (C) 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef DLM_VER_H -#define DLM_VER_H - -void dlm_print_version(void); - -#endif /* DLM_VER_H */ diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile index f14be89a6701..c7895f65be0e 100644 --- a/fs/ocfs2/dlmfs/Makefile +++ b/fs/ocfs2/dlmfs/Makefile @@ -1,5 +1,4 @@ -ccflags-y := -Ifs/ocfs2 - +# SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o -ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o +ocfs2_dlmfs-objs := userdlm.o dlmfs.o diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 12bafb7265ce..339f0b11cdc8 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -1,6 +1,5 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * dlmfs.c * * Code which implements the kernel side of a minimal userspace @@ -9,21 +8,6 @@ * which was a template for the fs side of this module. * * Copyright (C) 2003, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ /* Simple VFS hooks based on: */ @@ -36,6 +20,7 @@ #include <linux/module.h> #include <linux/fs.h> +#include <linux/fs_context.h> #include <linux/pagemap.h> #include <linux/types.h> #include <linux/slab.h> @@ -45,14 +30,13 @@ #include <linux/backing-dev.h> #include <linux/poll.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> -#include "stackglue.h" +#include "../stackglue.h" #include "userdlm.h" -#include "dlmfsver.h" #define MLOG_MASK_PREFIX ML_DLMFS -#include "cluster/masklog.h" +#include "../cluster/masklog.h" static const struct super_operations dlmfs_ops; @@ -72,7 +56,7 @@ struct workqueue_struct *user_dlm_worker; * Over time, dlmfs has added some features that were not part of the * initial ABI. Unfortunately, some of these features are not detectable * via standard usage. For example, Linux's default poll always returns - * POLLIN, so there is no way for a caller of poll(2) to know when dlmfs + * EPOLLIN, so there is no way for a caller of poll(2) to know when dlmfs * added poll support. Instead, we provide this list of new capabilities. * * Capabilities is a read-only attribute. We do it as a module parameter @@ -84,21 +68,20 @@ struct workqueue_struct *user_dlm_worker; * interaction. * * Capabilities: - * - bast : POLLIN against the file descriptor of a held lock + * - bast : EPOLLIN against the file descriptor of a held lock * signifies a bast fired on the lock. */ #define DLMFS_CAPABILITIES "bast stackglue" static int param_set_dlmfs_capabilities(const char *val, - struct kernel_param *kp) + const struct kernel_param *kp) { printk(KERN_ERR "%s: readonly parameter\n", kp->name); return -EINVAL; } static int param_get_dlmfs_capabilities(char *buffer, - struct kernel_param *kp) + const struct kernel_param *kp) { - return strlcpy(buffer, DLMFS_CAPABILITIES, - strlen(DLMFS_CAPABILITIES) + 1); + return sysfs_emit(buffer, DLMFS_CAPABILITIES); } module_param_call(capabilities, param_set_dlmfs_capabilities, param_get_dlmfs_capabilities, NULL, 0444); @@ -180,7 +163,7 @@ bail: static int dlmfs_file_release(struct inode *inode, struct file *file) { - int level, status; + int level; struct dlmfs_inode_private *ip = DLMFS_I(inode); struct dlmfs_filp_private *fp = file->private_data; @@ -189,7 +172,6 @@ static int dlmfs_file_release(struct inode *inode, mlog(0, "close called on inode %lu\n", inode->i_ino); - status = 0; if (fp) { level = fp->fp_lock_level; if (level != DLM_LOCK_IV) @@ -206,24 +188,25 @@ static int dlmfs_file_release(struct inode *inode, * We do ->setattr() just to override size changes. Our size is the size * of the LVB and nothing else. */ -static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr) +static int dlmfs_file_setattr(struct mnt_idmap *idmap, + struct dentry *dentry, struct iattr *attr) { int error; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); attr->ia_valid &= ~ATTR_SIZE; - error = inode_change_ok(inode, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; - setattr_copy(inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } -static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait) +static __poll_t dlmfs_file_poll(struct file *file, poll_table *wait) { - int event = 0; + __poll_t event = 0; struct inode *inode = file_inode(file); struct dlmfs_inode_private *ip = DLMFS_I(inode); @@ -231,58 +214,23 @@ static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait) spin_lock(&ip->ip_lockres.l_lock); if (ip->ip_lockres.l_flags & USER_LOCK_BLOCKED) - event = POLLIN | POLLRDNORM; + event = EPOLLIN | EPOLLRDNORM; spin_unlock(&ip->ip_lockres.l_lock); return event; } -static ssize_t dlmfs_file_read(struct file *filp, +static ssize_t dlmfs_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - int bytes_left; - ssize_t readlen, got; - char *lvb_buf; - struct inode *inode = file_inode(filp); - - mlog(0, "inode %lu, count = %zu, *ppos = %llu\n", - inode->i_ino, count, *ppos); + char lvb[DLM_LVB_LEN]; - if (*ppos >= i_size_read(inode)) + if (!user_dlm_read_lvb(file_inode(file), lvb)) return 0; - if (!count) - return 0; - - if (!access_ok(VERIFY_WRITE, buf, count)) - return -EFAULT; - - /* don't read past the lvb */ - if ((count + *ppos) > i_size_read(inode)) - readlen = i_size_read(inode) - *ppos; - else - readlen = count; - - lvb_buf = kmalloc(readlen, GFP_NOFS); - if (!lvb_buf) - return -ENOMEM; - - got = user_dlm_read_lvb(inode, lvb_buf, readlen); - if (got) { - BUG_ON(got != readlen); - bytes_left = __copy_to_user(buf, lvb_buf, readlen); - readlen -= bytes_left; - } else - readlen = 0; - - kfree(lvb_buf); - - *ppos = *ppos + readlen; - - mlog(0, "read %zd bytes\n", readlen); - return readlen; + return simple_read_from_buffer(buf, count, ppos, lvb, sizeof(lvb)); } static ssize_t dlmfs_file_write(struct file *filp, @@ -290,43 +238,31 @@ static ssize_t dlmfs_file_write(struct file *filp, size_t count, loff_t *ppos) { + char lvb_buf[DLM_LVB_LEN]; int bytes_left; - ssize_t writelen; - char *lvb_buf; struct inode *inode = file_inode(filp); mlog(0, "inode %lu, count = %zu, *ppos = %llu\n", inode->i_ino, count, *ppos); - if (*ppos >= i_size_read(inode)) + if (*ppos >= DLM_LVB_LEN) return -ENOSPC; - if (!count) - return 0; - - if (!access_ok(VERIFY_READ, buf, count)) - return -EFAULT; - /* don't write past the lvb */ - if ((count + *ppos) > i_size_read(inode)) - writelen = i_size_read(inode) - *ppos; - else - writelen = count - *ppos; + if (count > DLM_LVB_LEN - *ppos) + count = DLM_LVB_LEN - *ppos; - lvb_buf = kmalloc(writelen, GFP_NOFS); - if (!lvb_buf) - return -ENOMEM; - - bytes_left = copy_from_user(lvb_buf, buf, writelen); - writelen -= bytes_left; - if (writelen) - user_dlm_write_lvb(inode, lvb_buf, writelen); + if (!count) + return 0; - kfree(lvb_buf); + bytes_left = copy_from_user(lvb_buf, buf, count); + count -= bytes_left; + if (count) + user_dlm_write_lvb(inode, lvb_buf, count); - *ppos = *ppos + writelen; - mlog(0, "wrote %zd bytes\n", writelen); - return writelen; + *ppos = *ppos + count; + mlog(0, "wrote %zu bytes\n", count); + return count; } static void dlmfs_init_once(void *foo) @@ -344,39 +280,41 @@ static struct inode *dlmfs_alloc_inode(struct super_block *sb) { struct dlmfs_inode_private *ip; - ip = kmem_cache_alloc(dlmfs_inode_cache, GFP_NOFS); + ip = alloc_inode_sb(sb, dlmfs_inode_cache, GFP_NOFS); if (!ip) return NULL; return &ip->ip_vfs_inode; } -static void dlmfs_i_callback(struct rcu_head *head) +static void dlmfs_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode)); } -static void dlmfs_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, dlmfs_i_callback); -} - static void dlmfs_evict_inode(struct inode *inode) { int status; struct dlmfs_inode_private *ip; + struct user_lock_res *lockres; + int teardown; clear_inode(inode); mlog(0, "inode %lu\n", inode->i_ino); ip = DLMFS_I(inode); + lockres = &ip->ip_lockres; if (S_ISREG(inode->i_mode)) { - status = user_dlm_destroy_lock(&ip->ip_lockres); - if (status < 0) - mlog_errno(status); + spin_lock(&lockres->l_lock); + teardown = !!(lockres->l_flags & USER_LOCK_IN_TEARDOWN); + spin_unlock(&lockres->l_lock); + if (!teardown) { + status = user_dlm_destroy_lock(lockres); + if (status < 0) + mlog_errno(status); + } iput(ip->ip_parent); goto clear_fields; } @@ -391,25 +329,15 @@ clear_fields: ip->ip_conn = NULL; } -static struct backing_dev_info dlmfs_backing_dev_info = { - .name = "ocfs2-dlmfs", - .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, -}; - static struct inode *dlmfs_get_root_inode(struct super_block *sb) { struct inode *inode = new_inode(sb); umode_t mode = S_IFDIR | 0755; - struct dlmfs_inode_private *ip; if (inode) { - ip = DLMFS_I(inode); - inode->i_ino = get_next_ino(); - inode_init_owner(inode, NULL, mode); - inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode_init_owner(&nop_mnt_idmap, inode, NULL, mode); + simple_inode_init_ts(inode); inc_nlink(inode); inode->i_fop = &simple_dir_operations; @@ -431,9 +359,8 @@ static struct inode *dlmfs_get_inode(struct inode *parent, return NULL; inode->i_ino = get_next_ino(); - inode_init_owner(inode, parent, mode); - inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode_init_owner(&nop_mnt_idmap, inode, parent, mode); + simple_inode_init_ts(inode); ip = DLMFS_I(inode); ip->ip_conn = DLMFS_I(parent)->ip_conn; @@ -475,13 +402,14 @@ static struct inode *dlmfs_get_inode(struct inode *parent, * File creation. Allocate an inode, and we're done.. */ /* SMP-safe */ -static int dlmfs_mkdir(struct inode * dir, - struct dentry * dentry, - umode_t mode) +static struct dentry *dlmfs_mkdir(struct mnt_idmap * idmap, + struct inode * dir, + struct dentry * dentry, + umode_t mode) { int status; struct inode *inode = NULL; - struct qstr *domain = &dentry->d_name; + const struct qstr *domain = &dentry->d_name; struct dlmfs_inode_private *ip; struct ocfs2_cluster_connection *conn; @@ -513,24 +441,24 @@ static int dlmfs_mkdir(struct inode * dir, ip->ip_conn = conn; inc_nlink(dir); - d_instantiate(dentry, inode); - dget(dentry); /* Extra count - pin the dentry in core */ + d_make_persistent(dentry, inode); status = 0; bail: if (status < 0) iput(inode); - return status; + return ERR_PTR(status); } -static int dlmfs_create(struct inode *dir, +static int dlmfs_create(struct mnt_idmap *idmap, + struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { int status = 0; struct inode *inode; - struct qstr *name = &dentry->d_name; + const struct qstr *name = &dentry->d_name; mlog(0, "create %.*s\n", name->len, name->name); @@ -551,8 +479,7 @@ static int dlmfs_create(struct inode *dir, goto bail; } - d_instantiate(dentry, inode); - dget(dentry); /* Extra count - pin the dentry in core */ + d_make_persistent(dentry, inode); bail: return status; } @@ -561,7 +488,7 @@ static int dlmfs_unlink(struct inode *dir, struct dentry *dentry) { int status; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); mlog(0, "unlink inode %lu\n", inode->i_ino); @@ -569,8 +496,8 @@ static int dlmfs_unlink(struct inode *dir, * to acquire a lock, this basically destroys our lockres. */ status = user_dlm_destroy_lock(&DLMFS_I(inode)->ip_lockres); if (status < 0) { - mlog(ML_ERROR, "unlink %.*s, error %d from destroy\n", - dentry->d_name.len, dentry->d_name.name, status); + mlog(ML_ERROR, "unlink %pd, error %d from destroy\n", + dentry, status); goto bail; } status = simple_unlink(dir, dentry); @@ -578,13 +505,11 @@ bail: return status; } -static int dlmfs_fill_super(struct super_block * sb, - void * data, - int silent) +static int dlmfs_fill_super(struct super_block *sb, struct fs_context *fc) { sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = DLMFS_MAGIC; sb->s_op = &dlmfs_ops; sb->s_root = d_make_root(dlmfs_get_root_inode(sb)); @@ -618,9 +543,9 @@ static const struct inode_operations dlmfs_root_inode_operations = { static const struct super_operations dlmfs_ops = { .statfs = simple_statfs, .alloc_inode = dlmfs_alloc_inode, - .destroy_inode = dlmfs_destroy_inode, + .free_inode = dlmfs_free_inode, .evict_inode = dlmfs_evict_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, }; static const struct inode_operations dlmfs_file_inode_operations = { @@ -628,17 +553,27 @@ static const struct inode_operations dlmfs_file_inode_operations = { .setattr = dlmfs_file_setattr, }; -static struct dentry *dlmfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int dlmfs_get_tree(struct fs_context *fc) +{ + return get_tree_nodev(fc, dlmfs_fill_super); +} + +static const struct fs_context_operations dlmfs_context_ops = { + .get_tree = dlmfs_get_tree, +}; + +static int dlmfs_init_fs_context(struct fs_context *fc) { - return mount_nodev(fs_type, flags, data, dlmfs_fill_super); + fc->ops = &dlmfs_context_ops; + + return 0; } static struct file_system_type dlmfs_fs_type = { .owner = THIS_MODULE, .name = "ocfs2_dlmfs", - .mount = dlmfs_mount, - .kill_sb = kill_litter_super, + .kill_sb = kill_anon_super, + .init_fs_context = dlmfs_init_fs_context, }; MODULE_ALIAS_FS("ocfs2_dlmfs"); @@ -647,16 +582,10 @@ static int __init init_dlmfs_fs(void) int status; int cleanup_inode = 0, cleanup_worker = 0; - dlmfs_print_version(); - - status = bdi_init(&dlmfs_backing_dev_info); - if (status) - return status; - dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache", sizeof(struct dlmfs_inode_private), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_ACCOUNT), dlmfs_init_once); if (!dlmfs_inode_cache) { status = -ENOMEM; @@ -664,7 +593,8 @@ static int __init init_dlmfs_fs(void) } cleanup_inode = 1; - user_dlm_worker = create_singlethread_workqueue("user_dlm"); + user_dlm_worker = alloc_workqueue("user_dlm", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!user_dlm_worker) { status = -ENOMEM; goto bail; @@ -679,7 +609,6 @@ bail: kmem_cache_destroy(dlmfs_inode_cache); if (cleanup_worker) destroy_workqueue(user_dlm_worker); - bdi_destroy(&dlmfs_backing_dev_info); } else printk("OCFS2 User DLM kernel interface loaded\n"); return status; @@ -689,7 +618,6 @@ static void __exit exit_dlmfs_fs(void) { unregister_filesystem(&dlmfs_fs_type); - flush_workqueue(user_dlm_worker); destroy_workqueue(user_dlm_worker); /* @@ -699,11 +627,11 @@ static void __exit exit_dlmfs_fs(void) rcu_barrier(); kmem_cache_destroy(dlmfs_inode_cache); - bdi_destroy(&dlmfs_backing_dev_info); } MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("OCFS2 DLM-Filesystem"); module_init(init_dlmfs_fs) module_exit(exit_dlmfs_fs) diff --git a/fs/ocfs2/dlmfs/dlmfsver.c b/fs/ocfs2/dlmfs/dlmfsver.c deleted file mode 100644 index a733b3321f83..000000000000 --- a/fs/ocfs2/dlmfs/dlmfsver.c +++ /dev/null @@ -1,42 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmfsver.c - * - * version string - * - * Copyright (C) 2002, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/module.h> -#include <linux/kernel.h> - -#include "dlmfsver.h" - -#define DLM_BUILD_VERSION "1.5.0" - -#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION - -void dlmfs_print_version(void) -{ - printk(KERN_INFO "%s\n", VERSION_STR); -} - -MODULE_DESCRIPTION(VERSION_STR); - -MODULE_VERSION(DLM_BUILD_VERSION); diff --git a/fs/ocfs2/dlmfs/dlmfsver.h b/fs/ocfs2/dlmfs/dlmfsver.h deleted file mode 100644 index f35eadbed25c..000000000000 --- a/fs/ocfs2/dlmfs/dlmfsver.h +++ /dev/null @@ -1,31 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmver.h - * - * Function prototypes - * - * Copyright (C) 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef DLMFS_VER_H -#define DLMFS_VER_H - -void dlmfs_print_version(void); - -#endif /* DLMFS_VER_H */ diff --git a/fs/ocfs2/dlmfs/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c index 0499e3fb7bdb..617c92e7b925 100644 --- a/fs/ocfs2/dlmfs/userdlm.c +++ b/fs/ocfs2/dlmfs/userdlm.c @@ -1,6 +1,5 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * userdlm.c * * Code which implements the kernel side of a minimal userspace @@ -10,36 +9,22 @@ * functions. * * Copyright (C) 2003, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/signal.h> +#include <linux/sched/signal.h> #include <linux/module.h> #include <linux/fs.h> #include <linux/types.h> #include <linux/crc32.h> -#include "ocfs2_lockingver.h" -#include "stackglue.h" +#include "../ocfs2_lockingver.h" +#include "../stackglue.h" #include "userdlm.h" #define MLOG_MASK_PREFIX ML_DLMFS -#include "cluster/masklog.h" +#include "../cluster/masklog.h" static inline struct user_lock_res *user_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb) @@ -448,6 +433,11 @@ again: } spin_lock(&lockres->l_lock); + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { + spin_unlock(&lockres->l_lock); + status = -EAGAIN; + goto bail; + } /* We only compare against the currently granted level * here. If the lock is blocked waiting on a downconvert, @@ -560,24 +550,20 @@ void user_dlm_write_lvb(struct inode *inode, spin_unlock(&lockres->l_lock); } -ssize_t user_dlm_read_lvb(struct inode *inode, - char *val, - unsigned int len) +bool user_dlm_read_lvb(struct inode *inode, char *val) { struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; char *lvb; - ssize_t ret = len; - - BUG_ON(len > DLM_LVB_LEN); + bool ret = true; spin_lock(&lockres->l_lock); BUG_ON(lockres->l_level < DLM_LOCK_PR); if (ocfs2_dlm_lvb_valid(&lockres->l_lksb)) { lvb = ocfs2_dlm_lvb(&lockres->l_lksb); - memcpy(val, lvb, len); + memcpy(val, lvb, DLM_LVB_LEN); } else - ret = 0; + ret = false; spin_unlock(&lockres->l_lock); return ret; @@ -614,7 +600,7 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres) spin_lock(&lockres->l_lock); if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { spin_unlock(&lockres->l_lock); - return 0; + goto bail; } lockres->l_flags |= USER_LOCK_IN_TEARDOWN; @@ -628,22 +614,30 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres) } if (lockres->l_ro_holders || lockres->l_ex_holders) { + lockres->l_flags &= ~USER_LOCK_IN_TEARDOWN; spin_unlock(&lockres->l_lock); goto bail; } status = 0; if (!(lockres->l_flags & USER_LOCK_ATTACHED)) { + /* + * lock is never requested, leave USER_LOCK_IN_TEARDOWN set + * to avoid new lock request coming in. + */ spin_unlock(&lockres->l_lock); goto bail; } - lockres->l_flags &= ~USER_LOCK_ATTACHED; lockres->l_flags |= USER_LOCK_BUSY; spin_unlock(&lockres->l_lock); status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_VALBLK); if (status) { + spin_lock(&lockres->l_lock); + lockres->l_flags &= ~USER_LOCK_IN_TEARDOWN; + lockres->l_flags &= ~USER_LOCK_BUSY; + spin_unlock(&lockres->l_lock); user_log_dlm_error("ocfs2_dlm_unlock", status, lockres); goto bail; } @@ -667,7 +661,7 @@ void user_dlm_set_locking_protocol(void) ocfs2_stack_glue_set_max_proto_version(&user_dlm_lproto.lp_max_version); } -struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name) +struct ocfs2_cluster_connection *user_dlm_register(const struct qstr *name) { int rc; struct ocfs2_cluster_connection *conn; diff --git a/fs/ocfs2/dlmfs/userdlm.h b/fs/ocfs2/dlmfs/userdlm.h index 3b42d79531d7..47ba18eac423 100644 --- a/fs/ocfs2/dlmfs/userdlm.h +++ b/fs/ocfs2/dlmfs/userdlm.h @@ -1,26 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * userdlm.h * * Userspace dlm defines * * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ @@ -80,10 +64,8 @@ void user_dlm_cluster_unlock(struct user_lock_res *lockres, void user_dlm_write_lvb(struct inode *inode, const char *val, unsigned int len); -ssize_t user_dlm_read_lvb(struct inode *inode, - char *val, - unsigned int len); -struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name); +bool user_dlm_read_lvb(struct inode *inode, char *val); +struct ocfs2_cluster_connection *user_dlm_register(const struct qstr *name); void user_dlm_unregister(struct ocfs2_cluster_connection *conn); void user_dlm_set_locking_protocol(void); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 3a44a648dae7..619ff03b15d6 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1,26 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * dlmglue.c * * Code which implements an OCFS2 specific interface to our DLM. * * Copyright (C) 2003, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/types.h> @@ -32,7 +16,10 @@ #include <linux/debugfs.h> #include <linux/seq_file.h> #include <linux/time.h> +#include <linux/delay.h> #include <linux/quotaops.h> +#include <linux/sched/signal.h> +#include <linux/string_choices.h> #define MLOG_MASK_PREFIX ML_DLM_GLUE #include <cluster/masklog.h> @@ -54,6 +41,7 @@ #include "uptodate.h" #include "quota.h" #include "refcounttree.h" +#include "acl.h" #include "buffer_head_io.h" @@ -94,7 +82,9 @@ struct ocfs2_unblock_ctl { }; /* Lockdep class keys */ -struct lock_class_key lockdep_keys[OCFS2_NUM_LOCK_TYPES]; +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key lockdep_keys[OCFS2_NUM_LOCK_TYPES]; +#endif static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres, int new_level); @@ -232,12 +222,12 @@ struct ocfs2_lock_res_ops { */ #define LOCK_TYPE_USES_LVB 0x2 -static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = { +static const struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = { .get_osb = ocfs2_get_inode_osb, .flags = 0, }; -static struct ocfs2_lock_res_ops ocfs2_inode_inode_lops = { +static const struct ocfs2_lock_res_ops ocfs2_inode_inode_lops = { .get_osb = ocfs2_get_inode_osb, .check_downconvert = ocfs2_check_meta_downconvert, .set_lvb = ocfs2_set_meta_lvb, @@ -245,46 +235,50 @@ static struct ocfs2_lock_res_ops ocfs2_inode_inode_lops = { .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB, }; -static struct ocfs2_lock_res_ops ocfs2_super_lops = { +static const struct ocfs2_lock_res_ops ocfs2_super_lops = { .flags = LOCK_TYPE_REQUIRES_REFRESH, }; -static struct ocfs2_lock_res_ops ocfs2_rename_lops = { +static const struct ocfs2_lock_res_ops ocfs2_rename_lops = { .flags = 0, }; -static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = { +static const struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = { .flags = 0, }; -static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = { +static const struct ocfs2_lock_res_ops ocfs2_trim_fs_lops = { + .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB, +}; + +static const struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = { .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB, }; -static struct ocfs2_lock_res_ops ocfs2_dentry_lops = { +static const struct ocfs2_lock_res_ops ocfs2_dentry_lops = { .get_osb = ocfs2_get_dentry_osb, .post_unlock = ocfs2_dentry_post_unlock, .downconvert_worker = ocfs2_dentry_convert_worker, .flags = 0, }; -static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = { +static const struct ocfs2_lock_res_ops ocfs2_inode_open_lops = { .get_osb = ocfs2_get_inode_osb, .flags = 0, }; -static struct ocfs2_lock_res_ops ocfs2_flock_lops = { +static const struct ocfs2_lock_res_ops ocfs2_flock_lops = { .get_osb = ocfs2_get_file_osb, .flags = 0, }; -static struct ocfs2_lock_res_ops ocfs2_qinfo_lops = { +static const struct ocfs2_lock_res_ops ocfs2_qinfo_lops = { .set_lvb = ocfs2_set_qinfo_lvb, .get_osb = ocfs2_get_qinfo_osb, .flags = LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB, }; -static struct ocfs2_lock_res_ops ocfs2_refcount_block_lops = { +static const struct ocfs2_lock_res_ops ocfs2_refcount_block_lops = { .check_downconvert = ocfs2_check_refcount_downconvert, .downconvert_worker = ocfs2_refcount_convert_worker, .flags = 0, @@ -432,6 +426,7 @@ static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res) static void ocfs2_init_lock_stats(struct ocfs2_lock_res *res) { res->l_lock_refresh = 0; + res->l_lock_wait = 0; memset(&res->l_lock_prmode, 0, sizeof(struct ocfs2_lock_stats)); memset(&res->l_lock_exmode, 0, sizeof(struct ocfs2_lock_stats)); } @@ -466,6 +461,8 @@ static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int level, if (ret) stats->ls_fail++; + + stats->ls_last = ktime_to_us(ktime_get_real()); } static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres) @@ -473,6 +470,21 @@ static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres) lockres->l_lock_refresh++; } +static inline void ocfs2_track_lock_wait(struct ocfs2_lock_res *lockres) +{ + struct ocfs2_mask_waiter *mw; + + if (list_empty(&lockres->l_mask_waiters)) { + lockres->l_lock_wait = 0; + return; + } + + mw = list_first_entry(&lockres->l_mask_waiters, + struct ocfs2_mask_waiter, mw_item); + lockres->l_lock_wait = + ktime_to_us(ktime_mono_to_real(mw->mw_lock_start)); +} + static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw) { mw->mw_lock_start = ktime_get(); @@ -488,6 +500,9 @@ static inline void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres) { } +static inline void ocfs2_track_lock_wait(struct ocfs2_lock_res *lockres) +{ +} static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw) { } @@ -496,7 +511,7 @@ static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw) static void ocfs2_lock_res_init_common(struct ocfs2_super *osb, struct ocfs2_lock_res *res, enum ocfs2_lock_type type, - struct ocfs2_lock_res_ops *ops, + const struct ocfs2_lock_res_ops *ops, void *priv) { res->l_type = type; @@ -531,6 +546,7 @@ void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res) init_waitqueue_head(&res->l_event); INIT_LIST_HEAD(&res->l_blocked_list); INIT_LIST_HEAD(&res->l_mask_waiters); + INIT_LIST_HEAD(&res->l_holders); } void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, @@ -538,7 +554,7 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, unsigned int generation, struct inode *inode) { - struct ocfs2_lock_res_ops *ops; + const struct ocfs2_lock_res_ops *ops; switch(type) { case OCFS2_LOCK_TYPE_RW: @@ -554,7 +570,7 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, mlog_bug_on_msg(1, "type: %d\n", type); ops = NULL; /* thanks, gcc */ break; - }; + } ocfs2_build_lock_name(type, OCFS2_I(inode)->ip_blkno, generation, res->l_name); @@ -673,6 +689,35 @@ static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res, &ocfs2_nfs_sync_lops, osb); } +static void ocfs2_nfs_sync_lock_init(struct ocfs2_super *osb) +{ + ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb); + init_rwsem(&osb->nfs_sync_rwlock); +} + +void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb) +{ + struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres; + + /* Only one trimfs thread are allowed to work at the same time. */ + mutex_lock(&osb->obs_trim_fs_mutex); + + ocfs2_lock_res_init_once(lockres); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name); + ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS, + &ocfs2_trim_fs_lops, osb); +} + +void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb) +{ + struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres; + + ocfs2_simple_drop_lockres(osb, lockres); + ocfs2_lock_res_free(lockres); + + mutex_unlock(&osb->obs_trim_fs_mutex); +} + static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res, struct ocfs2_super *osb) { @@ -748,6 +793,49 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res) res->l_flags = 0UL; } +/* + * Keep a list of processes who have interest in a lockres. + * Note: this is now only used for check recursive cluster locking. + */ +static inline void ocfs2_add_holder(struct ocfs2_lock_res *lockres, + struct ocfs2_lock_holder *oh) +{ + INIT_LIST_HEAD(&oh->oh_list); + oh->oh_owner_pid = get_pid(task_pid(current)); + + spin_lock(&lockres->l_lock); + list_add_tail(&oh->oh_list, &lockres->l_holders); + spin_unlock(&lockres->l_lock); +} + +static struct ocfs2_lock_holder * +ocfs2_pid_holder(struct ocfs2_lock_res *lockres, + struct pid *pid) +{ + struct ocfs2_lock_holder *oh; + + spin_lock(&lockres->l_lock); + list_for_each_entry(oh, &lockres->l_holders, oh_list) { + if (oh->oh_owner_pid == pid) { + spin_unlock(&lockres->l_lock); + return oh; + } + } + spin_unlock(&lockres->l_lock); + return NULL; +} + +static inline void ocfs2_remove_holder(struct ocfs2_lock_res *lockres, + struct ocfs2_lock_holder *oh) +{ + spin_lock(&lockres->l_lock); + list_del(&oh->oh_list); + spin_unlock(&lockres->l_lock); + + put_pid(oh->oh_owner_pid); +} + + static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres, int level) { @@ -814,6 +902,7 @@ static void lockres_set_flags(struct ocfs2_lock_res *lockres, list_del_init(&mw->mw_item); mw->mw_status = 0; complete(&mw->mw_complete); + ocfs2_track_lock_wait(lockres); } } static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or) @@ -861,8 +950,13 @@ static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lo * We set the OCFS2_LOCK_UPCONVERT_FINISHING flag before clearing * the OCFS2_LOCK_BUSY flag to prevent the dc thread from * downconverting the lock before the upconvert has fully completed. + * Do not prevent the dc thread from downconverting if NONBLOCK lock + * had already returned. */ - lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); + if (!(lockres->l_flags & OCFS2_LOCK_NONBLOCK_FINISHED)) + lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); + else + lockres_clear_flags(lockres, OCFS2_LOCK_NONBLOCK_FINISHED); lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); } @@ -1304,7 +1398,7 @@ static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw) { wait_for_completion(&mw->mw_complete); /* Re-arm the completion in case we want to wait on it again */ - INIT_COMPLETION(mw->mw_complete); + reinit_completion(&mw->mw_complete); return mw->mw_status; } @@ -1320,24 +1414,37 @@ static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres, list_add_tail(&mw->mw_item, &lockres->l_mask_waiters); mw->mw_mask = mask; mw->mw_goal = goal; + ocfs2_track_lock_wait(lockres); } /* returns 0 if the mw that was removed was already satisfied, -EBUSY * if the mask still hadn't reached its goal */ -static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres, +static int __lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres, struct ocfs2_mask_waiter *mw) { - unsigned long flags; int ret = 0; - spin_lock_irqsave(&lockres->l_lock, flags); + assert_spin_locked(&lockres->l_lock); if (!list_empty(&mw->mw_item)) { if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) ret = -EBUSY; list_del_init(&mw->mw_item); init_completion(&mw->mw_complete); + ocfs2_track_lock_wait(lockres); } + + return ret; +} + +static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres, + struct ocfs2_mask_waiter *mw) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&lockres->l_lock, flags); + ret = __lockres_remove_mask_waiter(lockres, mw); spin_unlock_irqrestore(&lockres->l_lock, flags); return ret; @@ -1355,7 +1462,7 @@ static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw, else ret = mw->mw_status; /* Re-arm the completion in case we want to wait on it again */ - INIT_COMPLETION(mw->mw_complete); + reinit_completion(&mw->mw_complete); return ret; } @@ -1373,6 +1480,13 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb, unsigned long flags; unsigned int gen; int noqueue_attempted = 0; + int dlm_locked = 0; + int kick_dc = 0; + + if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) { + mlog_errno(-EINVAL); + return -EINVAL; + } ocfs2_init_mask_waiter(&mw); @@ -1481,6 +1595,7 @@ again: ocfs2_recover_from_dlm_error(lockres, 1); goto out; } + dlm_locked = 1; mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n", lockres->l_name); @@ -1501,7 +1616,12 @@ update_holders: unlock: lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); + /* ocfs2_unblock_lock request on seeing OCFS2_LOCK_UPCONVERT_FINISHING */ + kick_dc = (lockres->l_flags & OCFS2_LOCK_BLOCKED); + spin_unlock_irqrestore(&lockres->l_lock, flags); + if (kick_dc) + ocfs2_wake_downconvert_thread(osb); out: /* * This is helping work around a lock inversion between the page lock @@ -1514,10 +1634,17 @@ out: if (wait && arg_flags & OCFS2_LOCK_NONBLOCK && mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) { wait = 0; - if (lockres_remove_mask_waiter(lockres, &mw)) + spin_lock_irqsave(&lockres->l_lock, flags); + if (__lockres_remove_mask_waiter(lockres, &mw)) { + if (dlm_locked) + lockres_or_flags(lockres, + OCFS2_LOCK_NONBLOCK_FINISHED); + spin_unlock_irqrestore(&lockres->l_lock, flags); ret = -EAGAIN; - else + } else { + spin_unlock_irqrestore(&lockres->l_lock, flags); goto again; + } } if (wait) { ret = ocfs2_wait_for_mask(&mw); @@ -1566,7 +1693,7 @@ static void __ocfs2_cluster_unlock(struct ocfs2_super *osb, spin_unlock_irqrestore(&lockres->l_lock, flags); #ifdef CONFIG_DEBUG_LOCK_ALLOC if (lockres->l_lockdep_map.key != NULL) - rwsem_release(&lockres->l_lockdep_map, 1, caller_ip); + rwsem_release(&lockres->l_lockdep_map, caller_ip); #endif } @@ -1598,7 +1725,6 @@ int ocfs2_create_new_inode_locks(struct inode *inode) int ret; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - BUG_ON(!inode); BUG_ON(!ocfs2_inode_is_new(inode)); mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); @@ -1628,10 +1754,8 @@ int ocfs2_create_new_inode_locks(struct inode *inode) } ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0); - if (ret) { + if (ret) mlog_errno(ret); - goto bail; - } bail: return ret; @@ -1643,8 +1767,6 @@ int ocfs2_rw_lock(struct inode *inode, int write) struct ocfs2_lock_res *lockres; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - BUG_ON(!inode); - mlog(0, "inode %llu take %s RW lock\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, write ? "EXMODE" : "PRMODE"); @@ -1656,14 +1778,34 @@ int ocfs2_rw_lock(struct inode *inode, int write) level = write ? DLM_LOCK_EX : DLM_LOCK_PR; - status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0, - 0); + status = ocfs2_cluster_lock(osb, lockres, level, 0, 0); if (status < 0) mlog_errno(status); return status; } +int ocfs2_try_rw_lock(struct inode *inode, int write) +{ + int status, level; + struct ocfs2_lock_res *lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + mlog(0, "inode %llu try to take %s RW lock\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + write ? "EXMODE" : "PRMODE"); + + if (ocfs2_mount_local(osb)) + return 0; + + lockres = &OCFS2_I(inode)->ip_rw_lockres; + + level = write ? DLM_LOCK_EX : DLM_LOCK_PR; + + status = ocfs2_cluster_lock(osb, lockres, level, DLM_LKF_NOQUEUE, 0); + return status; +} + void ocfs2_rw_unlock(struct inode *inode, int write) { int level = write ? DLM_LOCK_EX : DLM_LOCK_PR; @@ -1675,7 +1817,7 @@ void ocfs2_rw_unlock(struct inode *inode, int write) write ? "EXMODE" : "PRMODE"); if (!ocfs2_mount_local(osb)) - ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); + ocfs2_cluster_unlock(osb, lockres, level); } /* @@ -1687,8 +1829,6 @@ int ocfs2_open_lock(struct inode *inode) struct ocfs2_lock_res *lockres; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - BUG_ON(!inode); - mlog(0, "inode %llu take PRMODE open lock\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); @@ -1697,8 +1837,7 @@ int ocfs2_open_lock(struct inode *inode) lockres = &OCFS2_I(inode)->ip_open_lockres; - status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, - DLM_LOCK_PR, 0, 0); + status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_PR, 0, 0); if (status < 0) mlog_errno(status); @@ -1712,8 +1851,6 @@ int ocfs2_try_open_lock(struct inode *inode, int write) struct ocfs2_lock_res *lockres; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - BUG_ON(!inode); - mlog(0, "inode %llu try to take %s open lock\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, write ? "EXMODE" : "PRMODE"); @@ -1737,8 +1874,7 @@ int ocfs2_try_open_lock(struct inode *inode, int write) * other nodes and the -EAGAIN will indicate to the caller that * this inode is still in use. */ - status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, - level, DLM_LKF_NOQUEUE, 0); + status = ocfs2_cluster_lock(osb, lockres, level, DLM_LKF_NOQUEUE, 0); out: return status; @@ -1759,11 +1895,9 @@ void ocfs2_open_unlock(struct inode *inode) goto out; if(lockres->l_ro_holders) - ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, - DLM_LOCK_PR); + ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_PR); if(lockres->l_ex_holders) - ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, - DLM_LOCK_EX); + ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX); out: return; @@ -2005,15 +2139,15 @@ static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb, } #define OCFS2_SEC_BITS 34 -#define OCFS2_SEC_SHIFT (64 - 34) +#define OCFS2_SEC_SHIFT (64 - OCFS2_SEC_BITS) #define OCFS2_NSEC_MASK ((1ULL << OCFS2_SEC_SHIFT) - 1) /* LVB only has room for 64 bits of time here so we pack it for * now. */ -static u64 ocfs2_pack_timespec(struct timespec *spec) +static u64 ocfs2_pack_timespec(struct timespec64 *spec) { u64 res; - u64 sec = spec->tv_sec; + u64 sec = clamp_t(time64_t, spec->tv_sec, 0, 0x3ffffffffull); u32 nsec = spec->tv_nsec; res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK); @@ -2029,6 +2163,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode) struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; struct ocfs2_meta_lvb *lvb; + struct timespec64 ts; lvb = ocfs2_dlm_lvb(&lockres->l_lksb); @@ -2049,12 +2184,12 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode) lvb->lvb_igid = cpu_to_be32(i_gid_read(inode)); lvb->lvb_imode = cpu_to_be16(inode->i_mode); lvb->lvb_inlink = cpu_to_be16(inode->i_nlink); - lvb->lvb_iatime_packed = - cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime)); - lvb->lvb_ictime_packed = - cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime)); - lvb->lvb_imtime_packed = - cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime)); + ts = inode_get_atime(inode); + lvb->lvb_iatime_packed = cpu_to_be64(ocfs2_pack_timespec(&ts)); + ts = inode_get_ctime(inode); + lvb->lvb_ictime_packed = cpu_to_be64(ocfs2_pack_timespec(&ts)); + ts = inode_get_mtime(inode); + lvb->lvb_imtime_packed = cpu_to_be64(ocfs2_pack_timespec(&ts)); lvb->lvb_iattr = cpu_to_be32(oi->ip_attr); lvb->lvb_idynfeatures = cpu_to_be16(oi->ip_dyn_features); lvb->lvb_igeneration = cpu_to_be32(inode->i_generation); @@ -2063,22 +2198,25 @@ out: mlog_meta_lvb(0, lockres); } -static void ocfs2_unpack_timespec(struct timespec *spec, +static void ocfs2_unpack_timespec(struct timespec64 *spec, u64 packed_time) { spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT; spec->tv_nsec = packed_time & OCFS2_NSEC_MASK; } -static void ocfs2_refresh_inode_from_lvb(struct inode *inode) +static int ocfs2_refresh_inode_from_lvb(struct inode *inode) { struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; struct ocfs2_meta_lvb *lvb; + struct timespec64 ts; mlog_meta_lvb(0, lockres); lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + if (inode_wrong_type(inode, be16_to_cpu(lvb->lvb_imode))) + return -ESTALE; /* We're safe here without the lockres lock... */ spin_lock(&oi->ip_lock); @@ -2099,13 +2237,14 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode) i_gid_write(inode, be32_to_cpu(lvb->lvb_igid)); inode->i_mode = be16_to_cpu(lvb->lvb_imode); set_nlink(inode, be16_to_cpu(lvb->lvb_inlink)); - ocfs2_unpack_timespec(&inode->i_atime, - be64_to_cpu(lvb->lvb_iatime_packed)); - ocfs2_unpack_timespec(&inode->i_mtime, - be64_to_cpu(lvb->lvb_imtime_packed)); - ocfs2_unpack_timespec(&inode->i_ctime, - be64_to_cpu(lvb->lvb_ictime_packed)); + ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_iatime_packed)); + inode_set_atime_to_ts(inode, ts); + ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_imtime_packed)); + inode_set_mtime_to_ts(inode, ts); + ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_ictime_packed)); + inode_set_ctime_to_ts(inode, ts); spin_unlock(&oi->ip_lock); + return 0; } static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode, @@ -2208,7 +2347,8 @@ static int ocfs2_inode_lock_update(struct inode *inode, if (ocfs2_meta_lvb_is_trustable(inode, lockres)) { mlog(0, "Trusting LVB on inode %llu\n", (unsigned long long)oi->ip_blkno); - ocfs2_refresh_inode_from_lvb(inode); + status = ocfs2_refresh_inode_from_lvb(inode); + goto bail_refresh; } else { /* Boo, we have to go to disk. */ /* read bh, cast, ocfs2_refresh_inode */ @@ -2218,6 +2358,10 @@ static int ocfs2_inode_lock_update(struct inode *inode, goto bail_refresh; } fe = (struct ocfs2_dinode *) (*bh)->b_data; + if (inode_wrong_type(inode, le16_to_cpu(fe->i_mode))) { + status = -ESTALE; + goto bail_refresh; + } /* This is a good chance to make sure we're not * locking an invalid object. ocfs2_read_inode_block() @@ -2291,8 +2435,6 @@ int ocfs2_inode_lock_full_nested(struct inode *inode, struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct buffer_head *local_bh = NULL; - BUG_ON(!inode); - mlog(0, "inode %llu, take %s META lock\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, ex ? "EXMODE" : "PRMODE"); @@ -2307,8 +2449,9 @@ int ocfs2_inode_lock_full_nested(struct inode *inode, goto getbh; } - if (ocfs2_mount_local(osb)) - goto local; + if ((arg_flags & OCFS2_META_LOCK_GETBH) || + ocfs2_mount_local(osb)) + goto update; if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) ocfs2_wait_for_recovery(osb); @@ -2337,14 +2480,14 @@ int ocfs2_inode_lock_full_nested(struct inode *inode, if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) ocfs2_wait_for_recovery(osb); -local: +update: /* * We only see this flag if we're being called from * ocfs2_read_locked_inode(). It means we're locking an inode * which hasn't been populated yet, so clear the refresh flag * and let the caller handle it. */ - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { status = 0; if (lockres) ocfs2_complete_lock_res_refresh(lockres, 0); @@ -2381,44 +2524,41 @@ bail: ocfs2_inode_unlock(inode, ex); } - if (local_bh) - brelse(local_bh); - + brelse(local_bh); return status; } /* * This is working around a lock inversion between tasks acquiring DLM - * locks while holding a page lock and the downconvert thread which - * blocks dlm lock acquiry while acquiring page locks. + * locks while holding a folio lock and the downconvert thread which + * blocks dlm lock acquiry while acquiring folio locks. * - * ** These _with_page variantes are only intended to be called from aop - * methods that hold page locks and return a very specific *positive* error + * ** These _with_folio variants are only intended to be called from aop + * methods that hold folio locks and return a very specific *positive* error * code that aop methods pass up to the VFS -- test for errors with != 0. ** * * The DLM is called such that it returns -EAGAIN if it would have * blocked waiting for the downconvert thread. In that case we unlock - * our page so the downconvert thread can make progress. Once we've + * our folio so the downconvert thread can make progress. Once we've * done this we have to return AOP_TRUNCATED_PAGE so the aop method * that called us can bubble that back up into the VFS who will then * immediately retry the aop call. - * - * We do a blocking lock and immediate unlock before returning, though, so that - * the lock has a great chance of being cached on this node by the time the VFS - * calls back to retry the aop. This has a potential to livelock as nodes - * ping locks back and forth, but that's a risk we're willing to take to avoid - * the lock inversion simply. */ -int ocfs2_inode_lock_with_page(struct inode *inode, - struct buffer_head **ret_bh, - int ex, - struct page *page) +int ocfs2_inode_lock_with_folio(struct inode *inode, + struct buffer_head **ret_bh, int ex, struct folio *folio) { int ret; ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK); if (ret == -EAGAIN) { - unlock_page(page); + folio_unlock(folio); + /* + * If we can't get inode lock immediately, we should not return + * directly here, since this will lead to a softlockup problem. + * The method is to get a blocking lock and immediately unlock + * before returning, this can avoid CPU resource waste due to + * lots of retries, and benefits fairness in getting lock. + */ if (ocfs2_inode_lock(inode, ret_bh, ex) == 0) ocfs2_inode_unlock(inode, ex); ret = AOP_TRUNCATED_PAGE; @@ -2429,13 +2569,18 @@ int ocfs2_inode_lock_with_page(struct inode *inode, int ocfs2_inode_lock_atime(struct inode *inode, struct vfsmount *vfsmnt, - int *level) + int *level, int wait) { int ret; - ret = ocfs2_inode_lock(inode, NULL, 0); + if (wait) + ret = ocfs2_inode_lock(inode, NULL, 0); + else + ret = ocfs2_try_inode_lock(inode, NULL, 0); + if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); return ret; } @@ -2447,16 +2592,20 @@ int ocfs2_inode_lock_atime(struct inode *inode, struct buffer_head *bh = NULL; ocfs2_inode_unlock(inode, 0); - ret = ocfs2_inode_lock(inode, &bh, 1); + if (wait) + ret = ocfs2_inode_lock(inode, &bh, 1); + else + ret = ocfs2_try_inode_lock(inode, &bh, 1); + if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); return ret; } *level = 1; if (ocfs2_should_update_atime(inode, vfsmnt)) ocfs2_update_inode_atime(inode, bh); - if (bh) - brelse(bh); + brelse(bh); } else *level = 0; @@ -2474,9 +2623,126 @@ void ocfs2_inode_unlock(struct inode *inode, (unsigned long long)OCFS2_I(inode)->ip_blkno, ex ? "EXMODE" : "PRMODE"); - if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) && + if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb)) - ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); + ocfs2_cluster_unlock(osb, lockres, level); +} + +/* + * This _tracker variants are introduced to deal with the recursive cluster + * locking issue. The idea is to keep track of a lock holder on the stack of + * the current process. If there's a lock holder on the stack, we know the + * task context is already protected by cluster locking. Currently, they're + * used in some VFS entry routines. + * + * return < 0 on error, return == 0 if there's no lock holder on the stack + * before this call, return == 1 if this call would be a recursive locking. + * return == -1 if this lock attempt will cause an upgrade which is forbidden. + * + * When taking lock levels into account,we face some different situations. + * + * 1. no lock is held + * In this case, just lock the inode as requested and return 0 + * + * 2. We are holding a lock + * For this situation, things diverges into several cases + * + * wanted holding what to do + * ex ex see 2.1 below + * ex pr see 2.2 below + * pr ex see 2.1 below + * pr pr see 2.1 below + * + * 2.1 lock level that is been held is compatible + * with the wanted level, so no lock action will be tacken. + * + * 2.2 Otherwise, an upgrade is needed, but it is forbidden. + * + * Reason why upgrade within a process is forbidden is that + * lock upgrade may cause dead lock. The following illustrates + * how it happens. + * + * thread on node1 thread on node2 + * ocfs2_inode_lock_tracker(ex=0) + * + * <====== ocfs2_inode_lock_tracker(ex=1) + * + * ocfs2_inode_lock_tracker(ex=1) + */ +int ocfs2_inode_lock_tracker(struct inode *inode, + struct buffer_head **ret_bh, + int ex, + struct ocfs2_lock_holder *oh) +{ + int status = 0; + struct ocfs2_lock_res *lockres; + struct ocfs2_lock_holder *tmp_oh; + struct pid *pid = task_pid(current); + + + lockres = &OCFS2_I(inode)->ip_inode_lockres; + tmp_oh = ocfs2_pid_holder(lockres, pid); + + if (!tmp_oh) { + /* + * This corresponds to the case 1. + * We haven't got any lock before. + */ + status = ocfs2_inode_lock_full(inode, ret_bh, ex, 0); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + return status; + } + + oh->oh_ex = ex; + ocfs2_add_holder(lockres, oh); + return 0; + } + + if (unlikely(ex && !tmp_oh->oh_ex)) { + /* + * case 2.2 upgrade may cause dead lock, forbid it. + */ + mlog(ML_ERROR, "Recursive locking is not permitted to " + "upgrade to EX level from PR level.\n"); + dump_stack(); + return -EINVAL; + } + + /* + * case 2.1 OCFS2_META_LOCK_GETBH flag make ocfs2_inode_lock_full. + * ignore the lock level and just update it. + */ + if (ret_bh) { + status = ocfs2_inode_lock_full(inode, ret_bh, ex, + OCFS2_META_LOCK_GETBH); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + return status; + } + } + return 1; +} + +void ocfs2_inode_unlock_tracker(struct inode *inode, + int ex, + struct ocfs2_lock_holder *oh, + int had_lock) +{ + struct ocfs2_lock_res *lockres; + + lockres = &OCFS2_I(inode)->ip_inode_lockres; + /* had_lock means that the current process already takes the cluster + * lock previously. + * If had_lock is 1, we have nothing to do here. + * If had_lock is 0, we will release the lock. + */ + if (!had_lock) { + ocfs2_inode_unlock(inode, oh->oh_ex); + ocfs2_remove_holder(lockres, oh); + } } int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno) @@ -2544,11 +2810,6 @@ int ocfs2_super_lock(struct ocfs2_super *osb, * refreshed, so we do it here. Of course, making sense of * everything is up to the caller :) */ status = ocfs2_should_refresh_lock_res(lockres); - if (status < 0) { - ocfs2_cluster_unlock(osb, lockres, level); - mlog_errno(status); - goto bail; - } if (status) { status = ocfs2_refresh_slot_info(osb); @@ -2608,14 +2869,25 @@ int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex) if (ocfs2_is_hard_readonly(osb)) return -EROFS; + if (ex) + down_write(&osb->nfs_sync_rwlock); + else + down_read(&osb->nfs_sync_rwlock); + if (ocfs2_mount_local(osb)) return 0; status = ocfs2_cluster_lock(osb, lockres, ex ? LKM_EXMODE : LKM_PRMODE, 0, 0); - if (status < 0) + if (status < 0) { mlog(ML_ERROR, "lock on nfs sync lock failed %d\n", status); + if (ex) + up_write(&osb->nfs_sync_rwlock); + else + up_read(&osb->nfs_sync_rwlock); + } + return status; } @@ -2626,6 +2898,74 @@ void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex) if (!ocfs2_mount_local(osb)) ocfs2_cluster_unlock(osb, lockres, ex ? LKM_EXMODE : LKM_PRMODE); + if (ex) + up_write(&osb->nfs_sync_rwlock); + else + up_read(&osb->nfs_sync_rwlock); +} + +int ocfs2_trim_fs_lock(struct ocfs2_super *osb, + struct ocfs2_trim_fs_info *info, int trylock) +{ + int status; + struct ocfs2_trim_fs_lvb *lvb; + struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres; + + if (info) + info->tf_valid = 0; + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + if (ocfs2_mount_local(osb)) + return 0; + + status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX, + trylock ? DLM_LKF_NOQUEUE : 0, 0); + if (status < 0) { + if (status != -EAGAIN) + mlog_errno(status); + return status; + } + + if (info) { + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) && + lvb->lvb_version == OCFS2_TRIMFS_LVB_VERSION) { + info->tf_valid = 1; + info->tf_success = lvb->lvb_success; + info->tf_nodenum = be32_to_cpu(lvb->lvb_nodenum); + info->tf_start = be64_to_cpu(lvb->lvb_start); + info->tf_len = be64_to_cpu(lvb->lvb_len); + info->tf_minlen = be64_to_cpu(lvb->lvb_minlen); + info->tf_trimlen = be64_to_cpu(lvb->lvb_trimlen); + } + } + + return status; +} + +void ocfs2_trim_fs_unlock(struct ocfs2_super *osb, + struct ocfs2_trim_fs_info *info) +{ + struct ocfs2_trim_fs_lvb *lvb; + struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres; + + if (ocfs2_mount_local(osb)) + return; + + if (info) { + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + lvb->lvb_version = OCFS2_TRIMFS_LVB_VERSION; + lvb->lvb_success = info->tf_success; + lvb->lvb_nodenum = cpu_to_be32(info->tf_nodenum); + lvb->lvb_start = cpu_to_be64(info->tf_start); + lvb->lvb_len = cpu_to_be64(info->tf_len); + lvb->lvb_minlen = cpu_to_be64(info->tf_minlen); + lvb->lvb_trimlen = cpu_to_be64(info->tf_trimlen); + } + + ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX); } int ocfs2_dentry_lock(struct dentry *dentry, int ex) @@ -2698,7 +3038,7 @@ struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void) kref_init(&dlm_debug->d_refcnt); INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking); - dlm_debug->d_locking_state = NULL; + dlm_debug->d_filter_secs = 0; out: return dlm_debug; } @@ -2769,6 +3109,7 @@ static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos) struct ocfs2_lock_res *iter = v; struct ocfs2_lock_res *dummy = &priv->p_iter_res; + (*pos)++; spin_lock(&ocfs2_dlm_tracking_lock); iter = ocfs2_dlm_next_res(iter, priv); list_del_init(&dummy->l_debug_list); @@ -2789,17 +3130,40 @@ static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos) * - Lock stats printed * New in version 3 * - Max time in lock stats is in usecs (instead of nsecs) + * New in version 4 + * - Add last pr/ex unlock times and first lock wait time in usecs */ -#define OCFS2_DLM_DEBUG_STR_VERSION 3 +#define OCFS2_DLM_DEBUG_STR_VERSION 4 static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) { int i; char *lvb; struct ocfs2_lock_res *lockres = v; +#ifdef CONFIG_OCFS2_FS_STATS + u64 now, last; + struct ocfs2_dlm_debug *dlm_debug = + ((struct ocfs2_dlm_seq_priv *)m->private)->p_dlm_debug; +#endif if (!lockres) return -EINVAL; +#ifdef CONFIG_OCFS2_FS_STATS + if (!lockres->l_lock_wait && dlm_debug->d_filter_secs) { + now = ktime_to_us(ktime_get_real()); + last = max(lockres->l_lock_prmode.ls_last, + lockres->l_lock_exmode.ls_last); + /* + * Use d_filter_secs field to filter lock resources dump, + * the default d_filter_secs(0) value filters nothing, + * otherwise, only dump the last N seconds active lock + * resources. + */ + if (div_u64(now - last, 1000000) > dlm_debug->d_filter_secs) + return 0; + } +#endif + seq_printf(m, "0x%x\t", OCFS2_DLM_DEBUG_STR_VERSION); if (lockres->l_type == OCFS2_LOCK_TYPE_DENTRY) @@ -2841,6 +3205,9 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) # define lock_max_prmode(_l) ((_l)->l_lock_prmode.ls_max) # define lock_max_exmode(_l) ((_l)->l_lock_exmode.ls_max) # define lock_refresh(_l) ((_l)->l_lock_refresh) +# define lock_last_prmode(_l) ((_l)->l_lock_prmode.ls_last) +# define lock_last_exmode(_l) ((_l)->l_lock_exmode.ls_last) +# define lock_wait(_l) ((_l)->l_lock_wait) #else # define lock_num_prmode(_l) (0) # define lock_num_exmode(_l) (0) @@ -2851,6 +3218,9 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) # define lock_max_prmode(_l) (0) # define lock_max_exmode(_l) (0) # define lock_refresh(_l) (0) +# define lock_last_prmode(_l) (0ULL) +# define lock_last_exmode(_l) (0ULL) +# define lock_wait(_l) (0ULL) #endif /* The following seq_print was added in version 2 of this output */ seq_printf(m, "%u\t" @@ -2861,7 +3231,10 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) "%llu\t" "%u\t" "%u\t" - "%u\t", + "%u\t" + "%llu\t" + "%llu\t" + "%llu\t", lock_num_prmode(lockres), lock_num_exmode(lockres), lock_num_prmode_failed(lockres), @@ -2870,7 +3243,10 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) lock_total_exmode(lockres), lock_max_prmode(lockres), lock_max_exmode(lockres), - lock_refresh(lockres)); + lock_refresh(lockres), + lock_last_prmode(lockres), + lock_last_exmode(lockres), + lock_wait(lockres)); /* End the line */ seq_printf(m, "\n"); @@ -2897,37 +3273,24 @@ static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file) static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file) { - int ret; struct ocfs2_dlm_seq_priv *priv; - struct seq_file *seq; struct ocfs2_super *osb; - priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL); + priv = __seq_open_private(file, &ocfs2_dlm_seq_ops, sizeof(*priv)); if (!priv) { - ret = -ENOMEM; - mlog_errno(ret); - goto out; + mlog_errno(-ENOMEM); + return -ENOMEM; } + osb = inode->i_private; ocfs2_get_dlm_debug(osb->osb_dlm_debug); priv->p_dlm_debug = osb->osb_dlm_debug; INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list); - ret = seq_open(file, &ocfs2_dlm_seq_ops); - if (ret) { - kfree(priv); - mlog_errno(ret); - goto out; - } - - seq = file->private_data; - seq->private = priv; - ocfs2_add_lockres_tracking(&priv->p_iter_res, priv->p_dlm_debug); -out: - return ret; + return 0; } static const struct file_operations ocfs2_dlm_debug_fops = { @@ -2937,36 +3300,24 @@ static const struct file_operations ocfs2_dlm_debug_fops = { .llseek = seq_lseek, }; -static int ocfs2_dlm_init_debug(struct ocfs2_super *osb) +static void ocfs2_dlm_init_debug(struct ocfs2_super *osb) { - int ret = 0; struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; - dlm_debug->d_locking_state = debugfs_create_file("locking_state", - S_IFREG|S_IRUSR, - osb->osb_debug_root, - osb, - &ocfs2_dlm_debug_fops); - if (!dlm_debug->d_locking_state) { - ret = -EINVAL; - mlog(ML_ERROR, - "Unable to create locking state debugfs file.\n"); - goto out; - } + debugfs_create_file("locking_state", S_IFREG|S_IRUSR, + osb->osb_debug_root, osb, &ocfs2_dlm_debug_fops); + debugfs_create_u32("locking_filter", 0600, osb->osb_debug_root, + &dlm_debug->d_filter_secs); ocfs2_get_dlm_debug(dlm_debug); -out: - return ret; } static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb) { struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; - if (dlm_debug) { - debugfs_remove(dlm_debug->d_locking_state); + if (dlm_debug) ocfs2_put_dlm_debug(dlm_debug); - } } int ocfs2_dlm_init(struct ocfs2_super *osb) @@ -2979,14 +3330,11 @@ int ocfs2_dlm_init(struct ocfs2_super *osb) goto local; } - status = ocfs2_dlm_init_debug(osb); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_dlm_init_debug(osb); /* launch downconvert thread */ - osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc"); + osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc-%s", + osb->uuid_str); if (IS_ERR(osb->dc_task)) { status = PTR_ERR(osb->dc_task); osb->dc_task = NULL; @@ -2996,6 +3344,8 @@ int ocfs2_dlm_init(struct ocfs2_super *osb) /* for now, uuid == domain */ status = ocfs2_cluster_connect(osb->osb_cluster_stack, + osb->osb_cluster_name, + strlen(osb->osb_cluster_name), osb->uuid_str, strlen(osb->uuid_str), &lproto, ocfs2_do_node_down, osb, @@ -3005,7 +3355,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb) goto bail; } - status = ocfs2_cluster_this_node(&osb->node_num); + status = ocfs2_cluster_this_node(conn, &osb->node_num); if (status < 0) { mlog_errno(status); mlog(ML_ERROR, @@ -3017,12 +3367,10 @@ int ocfs2_dlm_init(struct ocfs2_super *osb) local: ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb); ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb); - ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb); + ocfs2_nfs_sync_lock_init(osb); ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb); osb->cconn = conn; - - status = 0; bail: if (status < 0) { ocfs2_dlm_shutdown_debug(osb); @@ -3054,10 +3402,12 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb, ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres); ocfs2_lock_res_free(&osb->osb_orphan_scan.os_lockres); - ocfs2_cluster_disconnect(osb->cconn, hangup_pending); - osb->cconn = NULL; + if (osb->cconn) { + ocfs2_cluster_disconnect(osb->cconn, hangup_pending); + osb->cconn = NULL; - ocfs2_dlm_shutdown_debug(osb); + ocfs2_dlm_shutdown_debug(osb); + } } static int ocfs2_drop_lock(struct ocfs2_super *osb, @@ -3142,22 +3492,60 @@ out: return 0; } +static void ocfs2_process_blocked_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); + /* Mark the lockres as being dropped. It will no longer be * queued if blocking, but we still may have to wait on it * being dequeued from the downconvert thread before we can consider * it safe to drop. * * You can *not* attempt to call cluster_lock on this lockres anymore. */ -void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres) +void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) { int status; struct ocfs2_mask_waiter mw; - unsigned long flags; + unsigned long flags, flags2; ocfs2_init_mask_waiter(&mw); spin_lock_irqsave(&lockres->l_lock, flags); lockres->l_flags |= OCFS2_LOCK_FREEING; + if (lockres->l_flags & OCFS2_LOCK_QUEUED && current == osb->dc_task) { + /* + * We know the downconvert is queued but not in progress + * because we are the downconvert thread and processing + * different lock. So we can just remove the lock from the + * queue. This is not only an optimization but also a way + * to avoid the following deadlock: + * ocfs2_dentry_post_unlock() + * ocfs2_dentry_lock_put() + * ocfs2_drop_dentry_lock() + * iput() + * ocfs2_evict_inode() + * ocfs2_clear_inode() + * ocfs2_mark_lockres_freeing() + * ... blocks waiting for OCFS2_LOCK_QUEUED + * since we are the downconvert thread which + * should clear the flag. + */ + spin_unlock_irqrestore(&lockres->l_lock, flags); + spin_lock_irqsave(&osb->dc_task_lock, flags2); + list_del_init(&lockres->l_blocked_list); + osb->blocked_lock_count--; + spin_unlock_irqrestore(&osb->dc_task_lock, flags2); + /* + * Warn if we recurse into another post_unlock call. Strictly + * speaking it isn't a problem but we need to be careful if + * that happens (stack overflow, deadlocks, ...) so warn if + * ocfs2 grows a path for which this can happen. + */ + WARN_ON_ONCE(lockres->l_ops->post_unlock); + /* Since the lock is freeing we don't do much in the fn below */ + ocfs2_process_blocked_lock(osb, lockres); + return; + } while (lockres->l_flags & OCFS2_LOCK_QUEUED) { lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0); spin_unlock_irqrestore(&lockres->l_lock, flags); @@ -3178,7 +3566,7 @@ void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, { int ret; - ocfs2_mark_lockres_freeing(lockres); + ocfs2_mark_lockres_freeing(osb, lockres); ret = ocfs2_drop_lock(osb, lockres); if (ret) mlog_errno(ret); @@ -3264,6 +3652,16 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb, mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name, lockres->l_level, new_level); + /* + * On DLM_LKF_VALBLK, fsdlm behaves differently with o2cb. It always + * expects DLM_LKF_VALBLK being set if the LKB has LVB, so that + * we can recover correctly from node failure. Otherwise, we may get + * invalid LVB in LKB, but without DLM_SBF_VALNOTVALID being set. + */ + if (ocfs2_userspace_stack(osb) && + lockres->l_ops->flags & LOCK_TYPE_USES_LVB) + lvb = 1; + if (lvb) dlm_flags |= DLM_LKF_VALBLK; @@ -3403,9 +3801,9 @@ recheck: * set when the ast is received for an upconvert just before the * OCFS2_LOCK_BUSY flag is cleared. Now if the fs received a bast * on the heels of the ast, we want to delay the downconvert just - * enough to allow the up requestor to do its task. Because this + * enough to allow the up requester to do its task. Because this * lock is in the blocked queue, the lock will be downconverted - * as soon as the requestor is done with the lock. + * as soon as the requester is done with the lock. */ if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING) goto leave_requeue; @@ -3516,6 +3914,17 @@ downconvert: spin_unlock_irqrestore(&lockres->l_lock, flags); ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb, gen); + /* The dlm lock convert is being cancelled in background, + * ocfs2_cancel_convert() is asynchronous in fs/dlm, + * requeue it, try again later. + */ + if (ret == -EBUSY) { + ctl->requeue = 1; + mlog(ML_BASTS, "lockres %s, ReQ: Downconvert busy\n", + lockres->l_name); + ret = 0; + msleep(20); + } leave: if (ret) @@ -3543,7 +3952,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, oi = OCFS2_I(inode); oi->ip_dir_lock_gen++; mlog(0, "generation: %u\n", oi->ip_dir_lock_gen); - goto out; + goto out_forget; } if (!S_ISREG(inode->i_mode)) @@ -3574,6 +3983,9 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, filemap_fdatawait(mapping); } +out_forget: + forget_all_cached_acls(inode); + out: return UNBLOCK_CONTINUE; } @@ -3703,8 +4115,10 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres, break; spin_unlock(&dentry_attach_lock); - mlog(0, "d_delete(%.*s);\n", dentry->d_name.len, - dentry->d_name.name); + if (S_ISDIR(dl->dl_inode->i_mode)) + shrink_dcache_parent(dentry); + + mlog(0, "d_delete(%pd);\n", dentry); /* * The following dcache calls may do an @@ -3924,7 +4338,7 @@ unqueue: ocfs2_schedule_blocked_lock(osb, lockres); mlog(ML_BASTS, "lockres %s, requeue = %s.\n", lockres->l_name, - ctl.requeue ? "yes" : "no"); + str_yes_no(ctl.requeue)); spin_unlock_irqrestore(&lockres->l_lock, flags); if (ctl.unblock_action != UNBLOCK_CONTINUE @@ -3971,9 +4385,13 @@ static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb) osb->dc_work_sequence = osb->dc_wake_sequence; processed = osb->blocked_lock_count; - while (processed) { - BUG_ON(list_empty(&osb->blocked_lock_list)); - + /* + * blocked lock processing in this loop might call iput which can + * remove items off osb->blocked_lock_list. Downconvert up to + * 'processed' number of locks, but stop short if we had some + * removed in ocfs2_mark_lockres_freeing when downconverting. + */ + while (processed && !list_empty(&osb->blocked_lock_list)) { lockres = list_entry(osb->blocked_lock_list.next, struct ocfs2_lock_res, l_blocked_list); list_del_init(&lockres->l_blocked_list); @@ -4018,7 +4436,6 @@ static int ocfs2_downconvert_thread_should_wake(struct ocfs2_super *osb) static int ocfs2_downconvert_thread(void *arg) { - int status = 0; struct ocfs2_super *osb = arg; /* only quit once we've been asked to stop and there is no more @@ -4036,7 +4453,7 @@ static int ocfs2_downconvert_thread(void *arg) } osb->dc_task = NULL; - return status; + return 0; } void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb) diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index 1d596d8c4a4a..a3ebd7303ea2 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -1,26 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * dlmglue.h * * description here * * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ @@ -70,6 +54,35 @@ struct ocfs2_orphan_scan_lvb { __be32 lvb_os_seqno; }; +#define OCFS2_TRIMFS_LVB_VERSION 1 + +struct ocfs2_trim_fs_lvb { + __u8 lvb_version; + __u8 lvb_success; + __u8 lvb_reserved[2]; + __be32 lvb_nodenum; + __be64 lvb_start; + __be64 lvb_len; + __be64 lvb_minlen; + __be64 lvb_trimlen; +}; + +struct ocfs2_trim_fs_info { + u8 tf_valid; /* lvb is valid, or not */ + u8 tf_success; /* trim is successful, or not */ + u32 tf_nodenum; /* osb node number */ + u64 tf_start; /* trim start offset in clusters */ + u64 tf_len; /* trim end offset in clusters */ + u64 tf_minlen; /* trim minimum contiguous free clusters */ + u64 tf_trimlen; /* trimmed length in bytes */ +}; + +struct ocfs2_lock_holder { + struct list_head oh_list; + struct pid *oh_owner_pid; + int oh_ex; +}; + /* ocfs2_inode_lock_full() 'arg_flags' flags */ /* don't wait on recovery. */ #define OCFS2_META_LOCK_RECOVERY (0x01) @@ -77,6 +90,8 @@ struct ocfs2_orphan_scan_lvb { #define OCFS2_META_LOCK_NOQUEUE (0x02) /* don't block waiting for the downconvert thread, instead return -EAGAIN */ #define OCFS2_LOCK_NONBLOCK (0x04) +/* just get back disk inode bh if we've got cluster lock. */ +#define OCFS2_META_LOCK_GETBH (0x08) /* Locking subclasses of inode cluster lock */ enum { @@ -109,22 +124,21 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res); int ocfs2_create_new_inode_locks(struct inode *inode); int ocfs2_drop_inode_locks(struct inode *inode); int ocfs2_rw_lock(struct inode *inode, int write); +int ocfs2_try_rw_lock(struct inode *inode, int write); void ocfs2_rw_unlock(struct inode *inode, int write); int ocfs2_open_lock(struct inode *inode); int ocfs2_try_open_lock(struct inode *inode, int write); void ocfs2_open_unlock(struct inode *inode); int ocfs2_inode_lock_atime(struct inode *inode, struct vfsmount *vfsmnt, - int *level); + int *level, int wait); int ocfs2_inode_lock_full_nested(struct inode *inode, struct buffer_head **ret_bh, int ex, int arg_flags, int subclass); -int ocfs2_inode_lock_with_page(struct inode *inode, - struct buffer_head **ret_bh, - int ex, - struct page *page); +int ocfs2_inode_lock_with_folio(struct inode *inode, + struct buffer_head **ret_bh, int ex, struct folio *folio); /* Variants without special locking class or flags */ #define ocfs2_inode_lock_full(i, r, e, f)\ ocfs2_inode_lock_full_nested(i, r, e, f, OI_LS_NORMAL) @@ -133,6 +147,9 @@ int ocfs2_inode_lock_with_page(struct inode *inode, /* 99% of the time we don't want to supply any additional flags -- * those are for very specific cases only. */ #define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full_nested(i, b, e, 0, OI_LS_NORMAL) +#define ocfs2_try_inode_lock(i, b, e)\ + ocfs2_inode_lock_full_nested(i, b, e, OCFS2_META_LOCK_NOQUEUE,\ + OI_LS_NORMAL) void ocfs2_inode_unlock(struct inode *inode, int ex); int ocfs2_super_lock(struct ocfs2_super *osb, @@ -146,6 +163,12 @@ int ocfs2_rename_lock(struct ocfs2_super *osb); void ocfs2_rename_unlock(struct ocfs2_super *osb); int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex); void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex); +void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb); +void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb); +int ocfs2_trim_fs_lock(struct ocfs2_super *osb, + struct ocfs2_trim_fs_info *info, int trylock); +void ocfs2_trim_fs_unlock(struct ocfs2_super *osb, + struct ocfs2_trim_fs_info *info); int ocfs2_dentry_lock(struct dentry *dentry, int ex); void ocfs2_dentry_unlock(struct dentry *dentry, int ex); int ocfs2_file_lock(struct file *file, int ex, int trylock); @@ -157,7 +180,8 @@ int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex); void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex); -void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); +void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres); @@ -169,4 +193,15 @@ void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug); /* To set the locking protocol on module initialization */ void ocfs2_set_locking_protocol(void); + +/* The _tracker pair is used to avoid cluster recursive locking */ +int ocfs2_inode_lock_tracker(struct inode *inode, + struct buffer_head **ret_bh, + int ex, + struct ocfs2_lock_holder *oh); +void ocfs2_inode_unlock_tracker(struct inode *inode, + int ex, + struct ocfs2_lock_holder *oh, + int had_lock); + #endif /* DLMGLUE_H */ diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 29651167190d..b95724b767e1 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -1,26 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * export.c * * Functions to facilitate NFS exporting * * Copyright (C) 2002, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/fs.h> @@ -82,7 +66,6 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb, } status = ocfs2_test_inode_bit(osb, blkno, &set); - trace_ocfs2_get_dentry_test_bit(status, set); if (status < 0) { if (status == -EINVAL) { /* @@ -96,6 +79,7 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb, goto unlock_nfs_sync; } + trace_ocfs2_get_dentry_test_bit(status, set); /* If the inode allocator bit is clear, this inode must be stale */ if (!set) { status = -ESTALE; @@ -119,16 +103,16 @@ check_err: if (IS_ERR(inode)) { mlog_errno(PTR_ERR(inode)); - result = (void *)inode; + result = ERR_CAST(inode); goto bail; } check_gen: if (handle->ih_generation != inode->i_generation) { - iput(inode); trace_ocfs2_get_dentry_generation((unsigned long long)blkno, handle->ih_generation, inode->i_generation); + iput(inode); result = ERR_PTR(-ESTALE); goto bail; } @@ -147,17 +131,25 @@ static struct dentry *ocfs2_get_parent(struct dentry *child) int status; u64 blkno; struct dentry *parent; - struct inode *dir = child->d_inode; + struct inode *dir = d_inode(child); + int set; trace_ocfs2_get_parent(child, child->d_name.len, child->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno); + status = ocfs2_nfs_sync_lock(OCFS2_SB(dir->i_sb), 1); + if (status < 0) { + mlog(ML_ERROR, "getting nfs sync lock(EX) failed %d\n", status); + parent = ERR_PTR(status); + goto bail; + } + status = ocfs2_inode_lock(dir, NULL, 0); if (status < 0) { if (status != -ENOENT) mlog_errno(status); parent = ERR_PTR(status); - goto bail; + goto unlock_nfs_sync; } status = ocfs2_lookup_ino_from_name(dir, "..", 2, &blkno); @@ -166,11 +158,31 @@ static struct dentry *ocfs2_get_parent(struct dentry *child) goto bail_unlock; } + status = ocfs2_test_inode_bit(OCFS2_SB(dir->i_sb), blkno, &set); + if (status < 0) { + if (status == -EINVAL) { + status = -ESTALE; + } else + mlog(ML_ERROR, "test inode bit failed %d\n", status); + parent = ERR_PTR(status); + goto bail_unlock; + } + + trace_ocfs2_get_dentry_test_bit(status, set); + if (!set) { + status = -ESTALE; + parent = ERR_PTR(status); + goto bail_unlock; + } + parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0)); bail_unlock: ocfs2_inode_unlock(dir, 0); +unlock_nfs_sync: + ocfs2_nfs_sync_unlock(OCFS2_SB(dir->i_sb), 1); + bail: trace_ocfs2_get_parent_end(parent); @@ -243,9 +255,9 @@ static struct dentry *ocfs2_fh_to_dentry(struct super_block *sb, if (fh_len < 3 || fh_type > 2) return NULL; - handle.ih_blkno = (u64)le32_to_cpu(fid->raw[0]) << 32; - handle.ih_blkno |= (u64)le32_to_cpu(fid->raw[1]); - handle.ih_generation = le32_to_cpu(fid->raw[2]); + handle.ih_blkno = (u64)le32_to_cpu((__force __le32)fid->raw[0]) << 32; + handle.ih_blkno |= (u64)le32_to_cpu((__force __le32)fid->raw[1]); + handle.ih_generation = le32_to_cpu((__force __le32)fid->raw[2]); return ocfs2_get_dentry(sb, &handle); } @@ -257,9 +269,9 @@ static struct dentry *ocfs2_fh_to_parent(struct super_block *sb, if (fh_type != 2 || fh_len < 6) return NULL; - parent.ih_blkno = (u64)le32_to_cpu(fid->raw[3]) << 32; - parent.ih_blkno |= (u64)le32_to_cpu(fid->raw[4]); - parent.ih_generation = le32_to_cpu(fid->raw[5]); + parent.ih_blkno = (u64)le32_to_cpu((__force __le32)fid->raw[3]) << 32; + parent.ih_blkno |= (u64)le32_to_cpu((__force __le32)fid->raw[4]); + parent.ih_generation = le32_to_cpu((__force __le32)fid->raw[5]); return ocfs2_get_dentry(sb, &parent); } diff --git a/fs/ocfs2/export.h b/fs/ocfs2/export.h index 41a738678c37..636357400505 100644 --- a/fs/ocfs2/export.h +++ b/fs/ocfs2/export.h @@ -1,26 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * export.h * * Function prototypes * * Copyright (C) 2002, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #ifndef OCFS2_EXPORT_H diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 2487116d0d33..ef147e8b3271 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -1,25 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-only +/* * extent_map.c * * Block/Cluster mapping functions * * Copyright (C) 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License, version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/fs.h> @@ -38,6 +23,7 @@ #include "inode.h" #include "super.h" #include "symlink.h" +#include "aops.h" #include "ocfs2_trace.h" #include "buffer_head_io.h" @@ -305,8 +291,8 @@ static int ocfs2_last_eb_is_empty(struct inode *inode, if (el->l_tree_depth) { ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "leaf block %llu\n", inode->i_ino, + "Inode %lu has non zero tree depth in leaf block %llu\n", + inode->i_ino, (unsigned long long)eb_bh->b_blocknr); ret = -EROFS; goto out; @@ -415,7 +401,7 @@ static int ocfs2_get_clusters_nocache(struct inode *inode, { int i, ret, tree_height, len; struct ocfs2_dinode *di; - struct ocfs2_extent_block *uninitialized_var(eb); + struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; struct ocfs2_extent_rec *rec; struct buffer_head *eb_bh = NULL; @@ -441,14 +427,24 @@ static int ocfs2_get_clusters_nocache(struct inode *inode, if (el->l_tree_depth) { ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "leaf block %llu\n", inode->i_ino, + "Inode %lu has non zero tree depth in leaf block %llu\n", + inode->i_ino, (unsigned long long)eb_bh->b_blocknr); ret = -EROFS; goto out; } } + if (le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)) { + ocfs2_error(inode->i_sb, + "Inode %lu has an invalid extent (next_free_rec %u, count %u)\n", + inode->i_ino, + le16_to_cpu(el->l_next_free_rec), + le16_to_cpu(el->l_count)); + ret = -EROFS; + goto out; + } + i = ocfs2_search_extent_list(el, v_cluster); if (i == -1) { /* @@ -475,8 +471,9 @@ static int ocfs2_get_clusters_nocache(struct inode *inode, BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); if (!rec->e_blkno) { - ocfs2_error(inode->i_sb, "Inode %lu has bad extent " - "record (%u, %u, 0)", inode->i_ino, + ocfs2_error(inode->i_sb, + "Inode %lu has bad extent record (%u, %u, 0)\n", + inode->i_ino, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); ret = -EROFS; @@ -564,8 +561,8 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, if (el->l_tree_depth) { ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "xattr leaf block %llu\n", inode->i_ino, + "Inode %lu has non zero tree depth in xattr leaf block %llu\n", + inode->i_ino, (unsigned long long)eb_bh->b_blocknr); ret = -EROFS; goto out; @@ -582,8 +579,9 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); if (!rec->e_blkno) { - ocfs2_error(inode->i_sb, "Inode %lu has bad extent " - "record (%u, %u, 0) in xattr", inode->i_ino, + ocfs2_error(inode->i_sb, + "Inode %lu has bad extent record (%u, %u, 0) in xattr\n", + inode->i_ino, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); ret = -EROFS; @@ -600,8 +598,7 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, *extent_flags = rec->e_flags; } out: - if (eb_bh) - brelse(eb_bh); + brelse(eb_bh); return ret; } @@ -610,7 +607,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, unsigned int *extent_flags) { int ret; - unsigned int uninitialized_var(hole_len), flags = 0; + unsigned int hole_len, flags = 0; struct buffer_head *di_bh = NULL; struct ocfs2_extent_rec rec; @@ -709,6 +706,8 @@ out: * it not only handles the fiemap for inlined files, but also deals * with the fast symlink, cause they have no difference for extent * mapping per se. + * + * Must be called with ip_alloc_sem semaphore held. */ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, struct fiemap_extent_info *fieinfo, @@ -720,6 +719,7 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, u64 phys; u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST; struct ocfs2_inode_info *oi = OCFS2_I(inode); + lockdep_assert_held_read(&oi->ip_alloc_sem); di = (struct ocfs2_dinode *)di_bh->b_data; if (ocfs2_inode_is_fast_symlink(inode)) @@ -735,8 +735,11 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, phys += offsetof(struct ocfs2_dinode, id2.i_data.id_data); + /* Release the ip_alloc_sem to prevent deadlock on page fault */ + up_read(&OCFS2_I(inode)->ip_alloc_sem); ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count, flags); + down_read(&OCFS2_I(inode)->ip_alloc_sem); if (ret < 0) return ret; } @@ -744,8 +747,6 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, return 0; } -#define OCFS2_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) - int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 map_start, u64 map_len) { @@ -757,7 +758,7 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct buffer_head *di_bh = NULL; struct ocfs2_extent_rec rec; - ret = fiemap_check_flags(fieinfo, OCFS2_FIEMAP_FLAGS); + ret = fiemap_prep(inode, fieinfo, map_start, &map_len, 0); if (ret) return ret; @@ -781,7 +782,6 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, cpos = map_start >> osb->s_clustersize_bits; mapping_end = ocfs2_clusters_for_bytes(inode->i_sb, map_start + map_len); - mapping_end -= cpos; is_last = 0; while (cpos < mapping_end && !is_last) { u32 fe_flags; @@ -808,9 +808,11 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits; phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits; virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits; - + /* Release the ip_alloc_sem to prevent deadlock on page fault */ + up_read(&OCFS2_I(inode)->ip_alloc_sem); ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes, len_bytes, fe_flags); + down_read(&OCFS2_I(inode)->ip_alloc_sem); if (ret) break; @@ -831,6 +833,50 @@ out: return ret; } +/* Is IO overwriting allocated blocks? */ +int ocfs2_overwrite_io(struct inode *inode, struct buffer_head *di_bh, + u64 map_start, u64 map_len) +{ + int ret = 0, is_last; + u32 mapping_end, cpos; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_extent_rec rec; + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + if (ocfs2_size_fits_inline_data(di_bh, map_start + map_len)) + return ret; + else + return -EAGAIN; + } + + cpos = map_start >> osb->s_clustersize_bits; + mapping_end = ocfs2_clusters_for_bytes(inode->i_sb, + map_start + map_len); + is_last = 0; + while (cpos < mapping_end && !is_last) { + ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, + NULL, &rec, &is_last); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (rec.e_blkno == 0ULL) + break; + + if (rec.e_flags & OCFS2_EXT_REFCOUNTED) + break; + + cpos = le32_to_cpu(rec.e_cpos) + + le16_to_cpu(rec.e_leaf_clusters); + } + + if (cpos < mapping_end) + ret = -EAGAIN; +out: + return ret; +} + int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence) { struct inode *inode = file->f_mapping->host; @@ -852,20 +898,20 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence) down_read(&OCFS2_I(inode)->ip_alloc_sem); - if (*offset >= inode->i_size) { + if (*offset >= i_size_read(inode)) { ret = -ENXIO; goto out_unlock; } if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { if (whence == SEEK_HOLE) - *offset = inode->i_size; + *offset = i_size_read(inode); goto out_unlock; } clen = 0; cpos = *offset >> cs_bits; - cend = ocfs2_clusters_for_bytes(inode->i_sb, inode->i_size); + cend = ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)); while (cpos < cend && !is_last) { ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, &hole_size, @@ -904,8 +950,8 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence) extlen = clen; extlen <<= cs_bits; - if ((extoff + extlen) > inode->i_size) - extlen = inode->i_size - extoff; + if ((extoff + extlen) > i_size_read(inode)) + extlen = i_size_read(inode) - extoff; extoff += extlen; if (extoff > *offset) *offset = extoff; @@ -945,7 +991,13 @@ int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, } while (done < nr) { - down_read(&OCFS2_I(inode)->ip_alloc_sem); + if (!down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem)) { + rc = -EAGAIN; + mlog(ML_ERROR, + "Inode #%llu ip_alloc_sem is temporarily unavailable\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + break; + } rc = ocfs2_extent_map_get_blocks(inode, v_block + done, &p_block, &p_count, NULL); up_read(&OCFS2_I(inode)->ip_alloc_sem); diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h index 67ea57d2fd59..bc4ed59fb925 100644 --- a/fs/ocfs2/extent_map.h +++ b/fs/ocfs2/extent_map.h @@ -1,25 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-only */ +/* * extent_map.h * * In-memory file extent mappings for OCFS2. * * Copyright (C) 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License, version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #ifndef _EXTENT_MAP_H @@ -53,6 +38,9 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 map_start, u64 map_len); +int ocfs2_overwrite_io(struct inode *inode, struct buffer_head *di_bh, + u64 map_start, u64 map_len); + int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin); int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 41000f223ca4..21d797ccccd0 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1,26 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * file.c * * File open, close, extend, truncate * * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/capability.h> @@ -37,6 +21,7 @@ #include <linux/falloc.h> #include <linux/quotaops.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <cluster/masklog.h> @@ -100,19 +85,22 @@ static int ocfs2_file_open(struct inode *inode, struct file *file) struct ocfs2_inode_info *oi = OCFS2_I(inode); trace_ocfs2_file_open(inode, file, file->f_path.dentry, - (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)oi->ip_blkno, file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name, mode); - if (file->f_mode & FMODE_WRITE) - dquot_initialize(inode); + if (file->f_mode & FMODE_WRITE) { + status = dquot_initialize(inode); + if (status) + goto leave; + } spin_lock(&oi->ip_lock); /* Check that the inode hasn't been wiped from disk by another * node. If it hasn't then we're safe as long as we hold the * spin lock until our increment of open count. */ - if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { + if (oi->ip_flags & OCFS2_INODE_DELETED) { spin_unlock(&oi->ip_lock); status = -ENOENT; @@ -136,6 +124,8 @@ static int ocfs2_file_open(struct inode *inode, struct file *file) spin_unlock(&oi->ip_lock); } + file->f_mode |= FMODE_NOWAIT; + leave: return status; } @@ -175,43 +165,40 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { int err = 0; - journal_t *journal; struct inode *inode = file->f_mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_inode_info *oi = OCFS2_I(inode); + journal_t *journal = osb->journal->j_journal; + int ret; + tid_t commit_tid; + bool needs_barrier = false; trace_ocfs2_sync_file(inode, file, file->f_path.dentry, - OCFS2_I(inode)->ip_blkno, + oi->ip_blkno, file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name, (unsigned long long)datasync); - err = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) + return -EROFS; + + err = file_write_and_wait_range(file, start, end); if (err) return err; - /* - * Probably don't need the i_mutex at all in here, just putting it here - * to be consistent with how fsync used to be called, someone more - * familiar with the fs could possibly remove it. - */ - mutex_lock(&inode->i_mutex); - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) { - /* - * We still have to flush drive's caches to get data to the - * platter - */ - if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) - blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); - goto bail; + commit_tid = datasync ? oi->i_datasync_tid : oi->i_sync_tid; + if (journal->j_flags & JBD2_BARRIER && + !jbd2_trans_will_send_data_barrier(journal, commit_tid)) + needs_barrier = true; + err = jbd2_complete_transaction(journal, commit_tid); + if (needs_barrier) { + ret = blkdev_issue_flush(inode->i_sb->s_bdev); + if (!err) + err = ret; } - journal = osb->journal->j_journal; - err = jbd2_journal_force_commit(journal); - -bail: if (err) mlog_errno(err); - mutex_unlock(&inode->i_mutex); return (err < 0) ? -EIO : 0; } @@ -219,14 +206,14 @@ bail: int ocfs2_should_update_atime(struct inode *inode, struct vfsmount *vfsmnt) { - struct timespec now; + struct timespec64 now; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) return 0; if ((inode->i_flags & S_NOATIME) || - ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))) + ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode))) return 0; /* @@ -245,15 +232,19 @@ int ocfs2_should_update_atime(struct inode *inode, return 0; if (vfsmnt->mnt_flags & MNT_RELATIME) { - if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) || - (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0)) + struct timespec64 ctime = inode_get_ctime(inode); + struct timespec64 atime = inode_get_atime(inode); + struct timespec64 mtime = inode_get_mtime(inode); + + if ((timespec64_compare(&atime, &mtime) <= 0) || + (timespec64_compare(&atime, &ctime) <= 0)) return 1; return 0; } - now = CURRENT_TIME; - if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum)) + now = current_time(inode); + if ((now.tv_sec - inode_get_atime_sec(inode) <= osb->s_atime_quantum)) return 0; else return 1; @@ -283,21 +274,22 @@ int ocfs2_update_inode_atime(struct inode *inode, /* * Don't use ocfs2_mark_inode_dirty() here as we don't always - * have i_mutex to guard against concurrent changes to other + * have i_rwsem to guard against concurrent changes to other * inode fields. */ - inode->i_atime = CURRENT_TIME; - di->i_atime = cpu_to_le64(inode->i_atime.tv_sec); - di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); + inode_set_atime_to_ts(inode, current_time(inode)); + di->i_atime = cpu_to_le64(inode_get_atime_sec(inode)); + di->i_atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode)); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, bh); out_commit: - ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); + ocfs2_commit_trans(osb, handle); out: return ret; } -static int ocfs2_set_inode_size(handle_t *handle, +int ocfs2_set_inode_size(handle_t *handle, struct inode *inode, struct buffer_head *fe_bh, u64 new_i_size) @@ -306,7 +298,7 @@ static int ocfs2_set_inode_size(handle_t *handle, i_size_write(inode, new_i_size); inode->i_blocks = ocfs2_inode_sector_count(inode); - inode->i_ctime = inode->i_mtime = CURRENT_TIME; + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); if (status < 0) { @@ -338,6 +330,7 @@ int ocfs2_simple_size_update(struct inode *inode, if (ret < 0) mlog_errno(ret); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_commit_trans(osb, handle); out: return ret; @@ -370,7 +363,7 @@ static int ocfs2_cow_file_pos(struct inode *inode, if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) goto out; - return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1); + return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1); out: return status; @@ -426,12 +419,13 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, } i_size_write(inode, new_i_size); - inode->i_ctime = inode->i_mtime = CURRENT_TIME; + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); di = (struct ocfs2_dinode *) fe_bh->b_data; di->i_size = cpu_to_le64(new_i_size); - di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); - di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(inode)); + di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, fe_bh); @@ -441,7 +435,7 @@ out: return status; } -static int ocfs2_truncate_file(struct inode *inode, +int ocfs2_truncate_file(struct inode *inode, struct buffer_head *di_bh, u64 new_i_size) { @@ -474,11 +468,6 @@ static int ocfs2_truncate_file(struct inode *inode, goto bail; } - /* lets handle the simple truncate cases before doing any more - * cluster locking. */ - if (new_i_size == le64_to_cpu(fe->i_size)) - goto bail; - down_write(&OCFS2_I(inode)->ip_alloc_sem); ocfs2_resv_discard(&osb->osb_la_resmap, @@ -491,10 +480,11 @@ static int ocfs2_truncate_file(struct inode *inode, * greater than page size, so we have to truncate them * anyway. */ - unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); - truncate_inode_pages(inode->i_mapping, new_i_size); if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + unmap_mapping_range(inode->i_mapping, + new_i_size + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(inode->i_mapping, new_i_size); status = ocfs2_truncate_inline(inode, di_bh, new_i_size, i_size_read(inode), 1); if (status) @@ -513,6 +503,9 @@ static int ocfs2_truncate_file(struct inode *inode, goto bail_unlock_sem; } + unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(inode->i_mapping, new_i_size); + status = ocfs2_commit_truncate(osb, inode, di_bh); if (status < 0) { mlog_errno(status); @@ -551,19 +544,16 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb, struct ocfs2_alloc_context *meta_ac, enum ocfs2_alloc_restarted *reason_ret) { - int ret; struct ocfs2_extent_tree et; ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh); - ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset, - clusters_to_add, mark_unwritten, - data_ac, meta_ac, reason_ret); - - return ret; + return ocfs2_add_clusters_in_btree(handle, &et, logical_offset, + clusters_to_add, mark_unwritten, + data_ac, meta_ac, reason_ret); } -static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, - u32 clusters_to_add, int mark_unwritten) +static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, + u32 clusters_to_add, int mark_unwritten) { int status = 0; int restart_func = 0; @@ -574,13 +564,13 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, handle_t *handle = NULL; struct ocfs2_alloc_context *data_ac = NULL; struct ocfs2_alloc_context *meta_ac = NULL; - enum ocfs2_alloc_restarted why; + enum ocfs2_alloc_restarted why = RESTART_NONE; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_extent_tree et; int did_quota = 0; /* - * This function only exists for file systems which don't + * Unwritten extent only exists for file systems which * support holes. */ BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb)); @@ -603,8 +593,7 @@ restart_all: goto leave; } - credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list, - clusters_to_add); + credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list); handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { status = PTR_ERR(handle); @@ -653,7 +642,7 @@ restarted_transaction: mlog_errno(status); goto leave; } - + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, bh); spin_lock(&OCFS2_I(inode)->ip_lock); @@ -671,11 +660,7 @@ restarted_transaction: } else { BUG_ON(why != RESTART_TRANS); - /* TODO: This can be more intelligent. */ - credits = ocfs2_calc_extend_credits(osb->sb, - &fe->id2.i_list, - clusters_to_add); - status = ocfs2_extend_trans(handle, credits); + status = ocfs2_allocate_extend_trans(handle, 1); if (status < 0) { /* handle still has to be committed at * this point. */ @@ -723,7 +708,10 @@ leave: * While a write will already be ordering the data, a truncate will not. * Thus, we need to explicitly order the zeroed pages. */ -static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode) +static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode, + struct buffer_head *di_bh, + loff_t start_byte, + loff_t length) { struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); handle_t *handle = NULL; @@ -739,9 +727,17 @@ static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode) goto out; } - ret = ocfs2_jbd2_file_inode(handle, inode); - if (ret < 0) + ret = ocfs2_jbd2_inode_add_write(handle, inode, start_byte, length); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) mlog_errno(ret); + ocfs2_update_inode_fsync_trans(handle, inode, 1); out: if (ret) { @@ -756,31 +752,41 @@ out: * to be too fragile to do exactly what we need without us having to * worry about recursive locking in ->write_begin() and ->write_end(). */ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, - u64 abs_to) + u64 abs_to, struct buffer_head *di_bh) { struct address_space *mapping = inode->i_mapping; - struct page *page; - unsigned long index = abs_from >> PAGE_CACHE_SHIFT; - handle_t *handle = NULL; + struct folio *folio; + unsigned long index = abs_from >> PAGE_SHIFT; + handle_t *handle; int ret = 0; unsigned zero_from, zero_to, block_start, block_end; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; BUG_ON(abs_from >= abs_to); - BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT)); + BUG_ON(abs_to > (((u64)index + 1) << PAGE_SHIFT)); BUG_ON(abs_from & (inode->i_blkbits - 1)); - page = find_or_create_page(mapping, index, GFP_NOFS); - if (!page) { - ret = -ENOMEM; - mlog_errno(ret); + handle = ocfs2_zero_start_ordered_transaction(inode, di_bh, + abs_from, + abs_to - abs_from); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); goto out; } - /* Get the offsets within the page that we want to zero */ - zero_from = abs_from & (PAGE_CACHE_SIZE - 1); - zero_to = abs_to & (PAGE_CACHE_SIZE - 1); + folio = __filemap_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_NOFS); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); + mlog_errno(ret); + goto out_commit_trans; + } + + /* Get the offsets within the folio that we want to zero */ + zero_from = offset_in_folio(folio, abs_from); + zero_to = offset_in_folio(folio, abs_to); if (!zero_to) - zero_to = PAGE_CACHE_SIZE; + zero_to = folio_size(folio); trace_ocfs2_write_zero_page( (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -791,44 +797,48 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, /* We know that zero_from is block aligned */ for (block_start = zero_from; block_start < zero_to; block_start = block_end) { - block_end = block_start + (1 << inode->i_blkbits); + block_end = block_start + i_blocksize(inode); /* * block_start is block-aligned. Bump it by one to force * __block_write_begin and block_commit_write to zero the * whole block. */ - ret = __block_write_begin(page, block_start + 1, 0, + ret = __block_write_begin(folio, block_start + 1, 0, ocfs2_get_block); if (ret < 0) { mlog_errno(ret); goto out_unlock; } - if (!handle) { - handle = ocfs2_zero_start_ordered_transaction(inode); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - handle = NULL; - break; - } - } /* must not update i_size! */ - ret = block_commit_write(page, block_start + 1, - block_start + 1); - if (ret < 0) - mlog_errno(ret); - else - ret = 0; + block_commit_write(folio, block_start + 1, block_start + 1); } - if (handle) - ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); + /* + * fs-writeback will release the dirty pages without page lock + * whose offset are over inode size, the release happens at + * block_write_full_folio(). + */ + i_size_write(inode, abs_to); + inode->i_blocks = ocfs2_inode_sector_count(inode); + di->i_size = cpu_to_le64((u64)i_size_read(inode)); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + di->i_mtime = di->i_ctime = cpu_to_le64(inode_get_mtime_sec(inode)); + di->i_ctime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode)); + di->i_mtime_nsec = di->i_ctime_nsec; + if (handle) { + ocfs2_journal_dirty(handle, di_bh); + ocfs2_update_inode_fsync_trans(handle, inode, 1); + } out_unlock: - unlock_page(page); - page_cache_release(page); + folio_unlock(folio); + folio_put(folio); +out_commit_trans: + if (handle) + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); out: return ret; } @@ -899,7 +909,7 @@ static int ocfs2_zero_extend_get_range(struct inode *inode, zero_clusters = last_cpos - zero_cpos; if (needs_cow) { - rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos, + rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters, UINT_MAX); if (rc) { mlog_errno(rc); @@ -920,7 +930,7 @@ out: * has made sure that the entire range needs zeroing. */ static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start, - u64 range_end) + u64 range_end, struct buffer_head *di_bh) { int rc = 0; u64 next_pos; @@ -933,10 +943,10 @@ static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start, BUG_ON(range_start >= range_end); while (zero_pos < range_end) { - next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE; + next_pos = (zero_pos & PAGE_MASK) + PAGE_SIZE; if (next_pos > range_end) next_pos = range_end; - rc = ocfs2_write_zero_page(inode, zero_pos, next_pos); + rc = ocfs2_write_zero_page(inode, zero_pos, next_pos, di_bh); if (rc < 0) { mlog_errno(rc); break; @@ -982,7 +992,7 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, range_end = zero_to_size; ret = ocfs2_zero_extend_range(inode, range_start, - range_end); + range_end, di_bh); if (ret) { mlog_errno(ret); break; @@ -1004,7 +1014,7 @@ int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, * Only quota files call this without a bh, and they can't be * refcounted. */ - BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!di_bh && ocfs2_is_refcount_inode(inode)); BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE)); clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size); @@ -1014,8 +1024,8 @@ int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, clusters_to_add -= oi->ip_clusters; if (clusters_to_add) { - ret = __ocfs2_extend_allocation(inode, oi->ip_clusters, - clusters_to_add, 0); + ret = ocfs2_extend_allocation(inode, oi->ip_clusters, + clusters_to_add, 0); if (ret) { mlog_errno(ret); goto out; @@ -1055,7 +1065,7 @@ static int ocfs2_extend_file(struct inode *inode, /* * The alloc sem blocks people in read/write from reading our * allocation until we're done changing it. We depend on - * i_mutex to block other extend/truncate calls while we're + * i_rwsem to block other extend/truncate calls while we're * here. We even have to hold it for sparse files because there * might be some tail zeroing. */ @@ -1101,23 +1111,30 @@ out: return ret; } -int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) +int ocfs2_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr) { int status = 0, size_change; - struct inode *inode = dentry->d_inode; + int inode_locked = 0; + struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; struct ocfs2_super *osb = OCFS2_SB(sb); struct buffer_head *bh = NULL; handle_t *handle = NULL; struct dquot *transfer_to[MAXQUOTAS] = { }; int qtype; + int had_lock; + struct ocfs2_lock_holder oh; trace_ocfs2_setattr(inode, dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, dentry->d_name.len, dentry->d_name.name, - attr->ia_valid, attr->ia_mode, - from_kuid(&init_user_ns, attr->ia_uid), - from_kgid(&init_user_ns, attr->ia_gid)); + attr->ia_valid, + attr->ia_valid & ATTR_MODE ? attr->ia_mode : 0, + attr->ia_valid & ATTR_UID ? + from_kuid(&init_user_ns, attr->ia_uid) : 0, + attr->ia_valid & ATTR_GID ? + from_kgid(&init_user_ns, attr->ia_gid) : 0); /* ensuring we don't even attempt to truncate a symlink */ if (S_ISLNK(inode->i_mode)) @@ -1128,14 +1145,24 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) return 0; - status = inode_change_ok(inode, attr); + status = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (status) return status; - if (is_quota_modification(inode, attr)) - dquot_initialize(inode); + if (is_quota_modification(&nop_mnt_idmap, inode, attr)) { + status = dquot_initialize(inode); + if (status) + return status; + } size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; if (size_change) { + /* + * Here we should wait dio to finish before inode lock + * to avoid a deadlock between ocfs2_setattr() and + * ocfs2_dio_end_io_write() + */ + inode_dio_wait(inode); + status = ocfs2_rw_lock(inode, 1); if (status < 0) { mlog_errno(status); @@ -1143,21 +1170,39 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) } } - status = ocfs2_inode_lock(inode, &bh, 1); - if (status < 0) { - if (status != -ENOENT) - mlog_errno(status); + had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh); + if (had_lock < 0) { + status = had_lock; goto bail_unlock_rw; + } else if (had_lock) { + /* + * As far as we know, ocfs2_setattr() could only be the first + * VFS entry point in the call chain of recursive cluster + * locking issue. + * + * For instance: + * chmod_common() + * notify_change() + * ocfs2_setattr() + * posix_acl_chmod() + * ocfs2_iop_get_acl() + * + * But, we're not 100% sure if it's always true, because the + * ordering of the VFS entry points in the call chain is out + * of our control. So, we'd better dump the stack here to + * catch the other cases of recursive locking. + */ + mlog(ML_ERROR, "Another case of recursive locking:\n"); + dump_stack(); } + inode_locked = 1; - if (size_change && attr->ia_size != i_size_read(inode)) { + if (size_change) { status = inode_newsize_ok(inode, attr->ia_size); if (status) goto bail_unlock; - inode_dio_wait(inode); - - if (i_size_read(inode) > attr->ia_size) { + if (i_size_read(inode) >= attr->ia_size) { if (ocfs2_should_order_data(inode)) { status = ocfs2_begin_ordered_truncate(inode, attr->ia_size); @@ -1186,8 +1231,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) && OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid)); - if (!transfer_to[USRQUOTA]) { - status = -ESRCH; + if (IS_ERR(transfer_to[USRQUOTA])) { + status = PTR_ERR(transfer_to[USRQUOTA]); + transfer_to[USRQUOTA] = NULL; goto bail_unlock; } } @@ -1195,31 +1241,34 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) && OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid)); - if (!transfer_to[GRPQUOTA]) { - status = -ESRCH; + if (IS_ERR(transfer_to[GRPQUOTA])) { + status = PTR_ERR(transfer_to[GRPQUOTA]); + transfer_to[GRPQUOTA] = NULL; goto bail_unlock; } } + down_write(&OCFS2_I(inode)->ip_alloc_sem); handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS + 2 * ocfs2_quota_trans_credits(sb)); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); - goto bail_unlock; + goto bail_unlock_alloc; } status = __dquot_transfer(inode, transfer_to); if (status < 0) goto bail_commit; } else { + down_write(&OCFS2_I(inode)->ip_alloc_sem); handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); - goto bail_unlock; + goto bail_unlock_alloc; } } - setattr_copy(inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); status = ocfs2_mark_inode_dirty(handle, inode, bh); @@ -1228,44 +1277,58 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) bail_commit: ocfs2_commit_trans(osb, handle); +bail_unlock_alloc: + up_write(&OCFS2_I(inode)->ip_alloc_sem); bail_unlock: - ocfs2_inode_unlock(inode, 1); + if (status && inode_locked) { + ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock); + inode_locked = 0; + } bail_unlock_rw: if (size_change) ocfs2_rw_unlock(inode, 1); bail: - brelse(bh); /* Release quota pointers in case we acquired them */ - for (qtype = 0; qtype < MAXQUOTAS; qtype++) + for (qtype = 0; qtype < OCFS2_MAXQUOTAS; qtype++) dqput(transfer_to[qtype]); if (!status && attr->ia_valid & ATTR_MODE) { - status = ocfs2_acl_chmod(inode); + status = ocfs2_acl_chmod(inode, bh); if (status < 0) mlog_errno(status); } + if (inode_locked) + ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock); + brelse(bh); return status; } -int ocfs2_getattr(struct vfsmount *mnt, - struct dentry *dentry, - struct kstat *stat) +int ocfs2_getattr(struct mnt_idmap *idmap, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags) { - struct inode *inode = dentry->d_inode; - struct super_block *sb = dentry->d_inode->i_sb; + struct inode *inode = d_inode(path->dentry); + struct super_block *sb = path->dentry->d_sb; struct ocfs2_super *osb = sb->s_fs_info; int err; - err = ocfs2_inode_revalidate(dentry); + err = ocfs2_inode_revalidate(path->dentry); if (err) { if (err != -ENOENT) mlog_errno(err); goto bail; } - generic_fillattr(inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); + /* + * If there is inline data in the inode, the inode will normally not + * have data blocks allocated (it may have an external xattr block). + * Report at least one sector for such files, so tools like tar, rsync, + * others don't incorrectly think the file is completely sparse. + */ + if (unlikely(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) + stat->blocks += (stat->size + 511)>>9; /* We set the blksize from the cluster size for performance */ stat->blksize = osb->s_clustersize; @@ -1274,23 +1337,35 @@ bail: return err; } -int ocfs2_permission(struct inode *inode, int mask) +int ocfs2_permission(struct mnt_idmap *idmap, struct inode *inode, + int mask) { - int ret; + int ret, had_lock; + struct ocfs2_lock_holder oh; if (mask & MAY_NOT_BLOCK) return -ECHILD; - ret = ocfs2_inode_lock(inode, NULL, 0); - if (ret) { - if (ret != -ENOENT) - mlog_errno(ret); + had_lock = ocfs2_inode_lock_tracker(inode, NULL, 0, &oh); + if (had_lock < 0) { + ret = had_lock; goto out; + } else if (had_lock) { + /* See comments in ocfs2_setattr() for details. + * The call chain of this case could be: + * do_sys_open() + * may_open() + * inode_permission() + * ocfs2_permission() + * ocfs2_iop_get_acl() + */ + mlog(ML_ERROR, "Another case of recursive locking:\n"); + dump_stack(); } - ret = generic_permission(inode, mask); + ret = generic_permission(&nop_mnt_idmap, inode, mask); - ocfs2_inode_unlock(inode, 0); + ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock); out: return ret; } @@ -1327,6 +1402,7 @@ static int __ocfs2_write_remove_suid(struct inode *inode, di = (struct ocfs2_dinode *) bh->b_data; di->i_mode = cpu_to_le16(inode->i_mode); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, bh); @@ -1336,44 +1412,6 @@ out: return ret; } -/* - * Will look for holes and unwritten extents in the range starting at - * pos for count bytes (inclusive). - */ -static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, - size_t count) -{ - int ret = 0; - unsigned int extent_flags; - u32 cpos, clusters, extent_len, phys_cpos; - struct super_block *sb = inode->i_sb; - - cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; - clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; - - while (clusters) { - ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, - &extent_flags); - if (ret < 0) { - mlog_errno(ret); - goto out; - } - - if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { - ret = 1; - break; - } - - if (extent_len > clusters) - extent_len = clusters; - - clusters -= extent_len; - cpos += extent_len; - } -out: - return ret; -} - static int ocfs2_write_remove_suid(struct inode *inode) { int ret; @@ -1455,7 +1493,7 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode, goto next; } - ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1); + ret = ocfs2_extend_allocation(inode, cpos, alloc_size, 1); if (ret) { if (ret != -ENOSPC) mlog_errno(ret); @@ -1495,14 +1533,55 @@ static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start, } } +/* + * zero out partial blocks of one cluster. + * + * start: file offset where zero starts, will be made upper block aligned. + * len: it will be trimmed to the end of current cluster if "start + len" + * is bigger than it. + */ +static int ocfs2_zeroout_partial_cluster(struct inode *inode, + u64 start, u64 len) +{ + int ret; + u64 start_block, end_block, nr_blocks; + u64 p_block, offset; + u32 cluster, p_cluster, nr_clusters; + struct super_block *sb = inode->i_sb; + u64 end = ocfs2_align_bytes_to_clusters(sb, start); + + if (start + len < end) + end = start + len; + + start_block = ocfs2_blocks_for_bytes(sb, start); + end_block = ocfs2_blocks_for_bytes(sb, end); + nr_blocks = end_block - start_block; + if (!nr_blocks) + return 0; + + cluster = ocfs2_bytes_to_clusters(sb, start); + ret = ocfs2_get_clusters(inode, cluster, &p_cluster, + &nr_clusters, NULL); + if (ret) + return ret; + if (!p_cluster) + return 0; + + offset = start_block - ocfs2_clusters_to_blocks(sb, cluster); + p_block = ocfs2_clusters_to_blocks(sb, p_cluster) + offset; + return sb_issue_zeroout(sb, p_block, nr_blocks, GFP_NOFS); +} + static int ocfs2_zero_partial_clusters(struct inode *inode, u64 start, u64 len) { int ret = 0; - u64 tmpend, end = start + len; + u64 tmpend = 0; + u64 end = start + len; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); unsigned int csize = osb->s_clustersize; handle_t *handle; + loff_t isize = i_size_read(inode); /* * The "start" and "end" values are NOT necessarily part of @@ -1523,6 +1602,26 @@ static int ocfs2_zero_partial_clusters(struct inode *inode, if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0) goto out; + /* No page cache for EOF blocks, issue zero out to disk. */ + if (end > isize) { + /* + * zeroout eof blocks in last cluster starting from + * "isize" even "start" > "isize" because it is + * complicated to zeroout just at "start" as "start" + * may be not aligned with block size, buffer write + * would be required to do that, but out of eof buffer + * write is not supported. + */ + ret = ocfs2_zeroout_partial_cluster(inode, isize, + end - isize); + if (ret) { + mlog_errno(ret); + goto out; + } + if (start >= isize) + goto out; + end = isize; + } handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); if (IS_ERR(handle)) { ret = PTR_ERR(handle); @@ -1531,18 +1630,31 @@ static int ocfs2_zero_partial_clusters(struct inode *inode, } /* - * We want to get the byte offset of the end of the 1st cluster. + * If start is on a cluster boundary and end is somewhere in another + * cluster, we have not COWed the cluster starting at start, unless + * end is also within the same cluster. So, in this case, we skip this + * first call to ocfs2_zero_range_for_truncate() truncate and move on + * to the next one. */ - tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1)); - if (tmpend > end) - tmpend = end; + if ((start & (csize - 1)) != 0) { + /* + * We want to get the byte offset of the end of the 1st + * cluster. + */ + tmpend = (u64)osb->s_clustersize + + (start & ~(osb->s_clustersize - 1)); + if (tmpend > end) + tmpend = end; - trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start, - (unsigned long long)tmpend); + trace_ocfs2_zero_partial_clusters_range1( + (unsigned long long)start, + (unsigned long long)tmpend); - ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend); - if (ret) - mlog_errno(ret); + ret = ocfs2_zero_range_for_truncate(inode, handle, start, + tmpend); + if (ret) + mlog_errno(ret); + } if (tmpend < end) { /* @@ -1559,6 +1671,7 @@ static int ocfs2_zero_partial_clusters(struct inode *inode, if (ret) mlog_errno(ret); } + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_commit_trans(osb, handle); out: @@ -1645,9 +1758,9 @@ static void ocfs2_calc_trunc_pos(struct inode *inode, *done = ret; } -static int ocfs2_remove_inode_range(struct inode *inode, - struct buffer_head *di_bh, u64 byte_start, - u64 byte_len) +int ocfs2_remove_inode_range(struct inode *inode, + struct buffer_head *di_bh, u64 byte_start, + u64 byte_len) { int ret = 0, flags = 0, done = 0, i; u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos; @@ -1674,6 +1787,14 @@ static int ocfs2_remove_inode_range(struct inode *inode, return 0; if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + int id_count = ocfs2_max_inline_data_with_xattr(inode->i_sb, di); + + if (byte_start > id_count || byte_start + byte_len > id_count) { + ret = -EINVAL; + mlog_errno(ret); + goto out; + } + ret = ocfs2_truncate_inline(inode, di_bh, byte_start, byte_start + byte_len, 0); if (ret) { @@ -1697,8 +1818,7 @@ static int ocfs2_remove_inode_range(struct inode *inode, * within one cluster(means is not exactly aligned to clustersize). */ - if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) { - + if (ocfs2_is_refcount_inode(inode)) { ret = ocfs2_cow_file_pos(inode, di_bh, byte_start); if (ret) { mlog_errno(ret); @@ -1786,7 +1906,7 @@ static int ocfs2_remove_inode_range(struct inode *inode, ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos, phys_cpos, trunc_len, flags, - &dealloc, refcount_loc); + &dealloc, refcount_loc, false); if (ret < 0) { mlog_errno(ret); goto out; @@ -1800,6 +1920,7 @@ static int ocfs2_remove_inode_range(struct inode *inode, ocfs2_truncate_cluster_pages(inode, byte_start, byte_len); out: + ocfs2_free_path(path); ocfs2_schedule_truncate_log_flush(osb, 1); ocfs2_run_deallocs(osb, &dealloc); @@ -1816,7 +1937,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, { int ret; s64 llen; - loff_t size; + loff_t size, orig_isize; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct buffer_head *di_bh = NULL; handle_t *handle; @@ -1825,8 +1946,10 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) return -EROFS; - mutex_lock(&inode->i_mutex); + inode_lock(inode); + /* Wait all existing dio workers, newcomers will block on i_rwsem */ + inode_dio_wait(inode); /* * This prevents concurrent writes on other nodes */ @@ -1873,14 +1996,15 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, } size = sr->l_start + sr->l_len; - if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) { + if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64 || + cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) { if (sr->l_len <= 0) { ret = -EINVAL; goto out_inode_unlock; } } - if (file && should_remove_suid(file->f_path.dentry)) { + if (file && setattr_should_drop_suidgid(&nop_mnt_idmap, file_inode(file))) { ret = __ocfs2_write_remove_suid(inode, di_bh); if (ret) { mlog_errno(ret); @@ -1907,6 +2031,15 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, default: ret = -EINVAL; } + + orig_isize = i_size_read(inode); + /* zeroout eof blocks in the cluster. */ + if (!ret && change_size && orig_isize < size) { + ret = ocfs2_zeroout_partial_cluster(inode, orig_isize, + size - orig_isize); + if (!ret) + i_size_write(inode, size); + } up_write(&OCFS2_I(inode)->ip_alloc_sem); if (ret) { mlog_errno(ret); @@ -1923,10 +2056,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, goto out_inode_unlock; } - if (change_size && i_size_read(inode) < size) - i_size_write(inode, size); - - inode->i_ctime = inode->i_mtime = CURRENT_TIME; + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); if (ret < 0) mlog_errno(ret); @@ -1943,7 +2073,7 @@ out_rw_unlock: ocfs2_rw_unlock(inode, 1); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -1983,14 +2113,20 @@ static long ocfs2_fallocate(struct file *file, int mode, loff_t offset, struct ocfs2_space_resv sr; int change_size = 1; int cmd = OCFS2_IOC_RESVSP64; + int ret = 0; if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; if (!ocfs2_writes_unwritten_extents(osb)) return -EOPNOTSUPP; - if (mode & FALLOC_FL_KEEP_SIZE) + if (mode & FALLOC_FL_KEEP_SIZE) { change_size = 0; + } else { + ret = inode_newsize_ok(inode, offset + len); + if (ret) + return ret; + } if (mode & FALLOC_FL_PUNCH_HOLE) cmd = OCFS2_IOC_UNRESVSP64; @@ -2012,7 +2148,7 @@ int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, struct super_block *sb = inode->i_sb; if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) || - !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) || + !ocfs2_is_refcount_inode(inode) || OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) return 0; @@ -2042,13 +2178,6 @@ out: return ret; } -static void ocfs2_aiodio_wait(struct inode *inode) -{ - wait_queue_head_t *wq = ocfs2_ioend_wq(inode); - - wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0)); -} - static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos) { int blockmask = inode->i_sb->s_blocksize - 1; @@ -2059,57 +2188,107 @@ static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos) return 0; } -static int ocfs2_prepare_inode_for_refcount(struct inode *inode, - struct file *file, - loff_t pos, size_t count, - int *meta_level) +static int ocfs2_inode_lock_for_extent_tree(struct inode *inode, + struct buffer_head **di_bh, + int meta_level, + int write_sem, + int wait) { - int ret; - struct buffer_head *di_bh = NULL; - u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; - u32 clusters = - ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos; + int ret = 0; - ret = ocfs2_inode_lock(inode, &di_bh, 1); - if (ret) { - mlog_errno(ret); + if (wait) + ret = ocfs2_inode_lock(inode, di_bh, meta_level); + else + ret = ocfs2_try_inode_lock(inode, di_bh, meta_level); + if (ret < 0) goto out; + + if (wait) { + if (write_sem) + down_write(&OCFS2_I(inode)->ip_alloc_sem); + else + down_read(&OCFS2_I(inode)->ip_alloc_sem); + } else { + if (write_sem) + ret = down_write_trylock(&OCFS2_I(inode)->ip_alloc_sem); + else + ret = down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem); + + if (!ret) { + ret = -EAGAIN; + goto out_unlock; + } } - *meta_level = 1; + return ret; - ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX); - if (ret) - mlog_errno(ret); +out_unlock: + brelse(*di_bh); + *di_bh = NULL; + ocfs2_inode_unlock(inode, meta_level); out: - brelse(di_bh); return ret; } +static void ocfs2_inode_unlock_for_extent_tree(struct inode *inode, + struct buffer_head **di_bh, + int meta_level, + int write_sem) +{ + if (write_sem) + up_write(&OCFS2_I(inode)->ip_alloc_sem); + else + up_read(&OCFS2_I(inode)->ip_alloc_sem); + + brelse(*di_bh); + *di_bh = NULL; + + if (meta_level >= 0) + ocfs2_inode_unlock(inode, meta_level); +} + static int ocfs2_prepare_inode_for_write(struct file *file, - loff_t *ppos, - size_t count, - int appending, - int *direct_io, - int *has_refcount) + loff_t pos, size_t count, int wait) { - int ret = 0, meta_level = 0; + int ret = 0, meta_level = 0, overwrite_io = 0; + int write_sem = 0; struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; - loff_t saved_pos = 0, end; + struct inode *inode = d_inode(dentry); + struct buffer_head *di_bh = NULL; + u32 cpos; + u32 clusters; /* * We start with a read level meta lock and only jump to an ex * if we need to make modifications here. */ for(;;) { - ret = ocfs2_inode_lock(inode, NULL, meta_level); + ret = ocfs2_inode_lock_for_extent_tree(inode, + &di_bh, + meta_level, + write_sem, + wait); if (ret < 0) { - meta_level = -1; - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); goto out; } + /* + * Check if IO will overwrite allocated blocks in case + * IOCB_NOWAIT flag is set. + */ + if (!wait && !overwrite_io) { + overwrite_io = 1; + + ret = ocfs2_overwrite_io(inode, di_bh, pos, count); + if (ret < 0) { + if (ret != -EAGAIN) + mlog_errno(ret); + goto out_unlock; + } + } + /* Clear suid / sgid if necessary. We do this here * instead of later in the write path because * remove_suid() calls ->setattr without any hint that @@ -2119,9 +2298,12 @@ static int ocfs2_prepare_inode_for_write(struct file *file, * inode. There's also the dinode i_size state which * can be lost via setattr during extending writes (we * set inode->i_size at the end of a write. */ - if (should_remove_suid(dentry)) { + if (setattr_should_drop_suidgid(&nop_mnt_idmap, inode)) { if (meta_level == 0) { - ocfs2_inode_unlock(inode, meta_level); + ocfs2_inode_unlock_for_extent_tree(inode, + &di_bh, + meta_level, + write_sem); meta_level = 1; continue; } @@ -2133,146 +2315,106 @@ static int ocfs2_prepare_inode_for_write(struct file *file, } } - /* work on a copy of ppos until we're sure that we won't have - * to recalculate it due to relocking. */ - if (appending) - saved_pos = i_size_read(inode); - else - saved_pos = *ppos; - - end = saved_pos + count; - - ret = ocfs2_check_range_for_refcount(inode, saved_pos, count); + ret = ocfs2_check_range_for_refcount(inode, pos, count); if (ret == 1) { - ocfs2_inode_unlock(inode, meta_level); - meta_level = -1; - - ret = ocfs2_prepare_inode_for_refcount(inode, - file, - saved_pos, - count, - &meta_level); - if (has_refcount) - *has_refcount = 1; - if (direct_io) - *direct_io = 0; + ocfs2_inode_unlock_for_extent_tree(inode, + &di_bh, + meta_level, + write_sem); + meta_level = 1; + write_sem = 1; + ret = ocfs2_inode_lock_for_extent_tree(inode, + &di_bh, + meta_level, + write_sem, + wait); + if (ret < 0) { + if (ret != -EAGAIN) + mlog_errno(ret); + goto out; + } + + cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; + clusters = + ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos; + ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX); } if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); goto out_unlock; } - /* - * Skip the O_DIRECT checks if we don't need - * them. - */ - if (!direct_io || !(*direct_io)) - break; - - /* - * There's no sane way to do direct writes to an inode - * with inline data. - */ - if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { - *direct_io = 0; - break; - } - - /* - * Allowing concurrent direct writes means - * i_size changes wouldn't be synchronized, so - * one node could wind up truncating another - * nodes writes. - */ - if (end > i_size_read(inode)) { - *direct_io = 0; - break; - } - - /* - * We don't fill holes during direct io, so - * check for them here. If any are found, the - * caller will have to retake some cluster - * locks and initiate the io as buffered. - */ - ret = ocfs2_check_range_for_holes(inode, saved_pos, count); - if (ret == 1) { - *direct_io = 0; - ret = 0; - } else if (ret < 0) - mlog_errno(ret); break; } - if (appending) - *ppos = saved_pos; - out_unlock: trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno, - saved_pos, appending, count, - direct_io, has_refcount); + pos, count, wait); - if (meta_level >= 0) - ocfs2_inode_unlock(inode, meta_level); + ocfs2_inode_unlock_for_extent_tree(inode, + &di_bh, + meta_level, + write_sem); out: return ret; } -static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, - const struct iovec *iov, - unsigned long nr_segs, - loff_t pos) +static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, + struct iov_iter *from) { - int ret, direct_io, appending, rw_level, have_alloc_sem = 0; - int can_do_direct, has_refcount = 0; + int rw_level; ssize_t written = 0; - size_t ocount; /* original count */ - size_t count; /* after file limit checks */ - loff_t old_size, *ppos = &iocb->ki_pos; - u32 old_clusters; + ssize_t ret; + size_t count = iov_iter_count(from); struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); int full_coherency = !(osb->s_mount_opt & OCFS2_MOUNT_COHERENCY_BUFFERED); - int unaligned_dio = 0; + void *saved_ki_complete = NULL; + int append_write = ((iocb->ki_pos + count) >= + i_size_read(inode) ? 1 : 0); + int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; + int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0; - trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, + trace_ocfs2_file_write_iter(inode, file, file->f_path.dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name, - (unsigned int)nr_segs); + (unsigned int)from->nr_segs); /* GRRRRR */ - if (iocb->ki_left == 0) - return 0; - - appending = file->f_flags & O_APPEND ? 1 : 0; - direct_io = file->f_flags & O_DIRECT ? 1 : 0; + if (!direct_io && nowait) + return -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); + if (count == 0) + return 0; - ocfs2_iocb_clear_sem_locked(iocb); + if (nowait) { + if (!inode_trylock(inode)) + return -EAGAIN; + } else + inode_lock(inode); -relock: - /* to match setattr's i_mutex -> rw_lock ordering */ - if (direct_io) { - have_alloc_sem = 1; - /* communicate with ocfs2_dio_end_io */ - ocfs2_iocb_set_sem_locked(iocb); - } + ocfs2_iocb_init_rw_locked(iocb); /* * Concurrent O_DIRECT writes are allowed with * mount_option "coherency=buffered". + * For append write, we must take rw EX. */ - rw_level = (!direct_io || full_coherency); + rw_level = (!direct_io || full_coherency || append_write); - ret = ocfs2_rw_lock(inode, rw_level); + if (nowait) + ret = ocfs2_try_rw_lock(inode, rw_level); + else + ret = ocfs2_rw_lock(inode, rw_level); if (ret < 0) { - mlog_errno(ret); - goto out_sems; + if (ret != -EAGAIN) + mlog_errno(ret); + goto out_mutex; } /* @@ -2285,112 +2427,48 @@ relock: * other nodes to drop their caches. Buffered I/O * already does this in write_begin(). */ - ret = ocfs2_inode_lock(inode, NULL, 1); + if (nowait) + ret = ocfs2_try_inode_lock(inode, NULL, 1); + else + ret = ocfs2_inode_lock(inode, NULL, 1); if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); goto out; } ocfs2_inode_unlock(inode, 1); } - can_do_direct = direct_io; - ret = ocfs2_prepare_inode_for_write(file, ppos, - iocb->ki_left, appending, - &can_do_direct, &has_refcount); - if (ret < 0) { - mlog_errno(ret); + ret = generic_write_checks(iocb, from); + if (ret <= 0) { + if (ret) + mlog_errno(ret); goto out; } + count = ret; - if (direct_io && !is_sync_kiocb(iocb)) - unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left, - *ppos); - - /* - * We can't complete the direct I/O as requested, fall back to - * buffered I/O. - */ - if (direct_io && !can_do_direct) { - ocfs2_rw_unlock(inode, rw_level); - - have_alloc_sem = 0; - rw_level = -1; - - direct_io = 0; - goto relock; + ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, !nowait); + if (ret < 0) { + if (ret != -EAGAIN) + mlog_errno(ret); + goto out; } - if (unaligned_dio) { + if (direct_io && !is_sync_kiocb(iocb) && + ocfs2_is_io_unaligned(inode, count, iocb->ki_pos)) { /* - * Wait on previous unaligned aio to complete before - * proceeding. + * Make it a sync io if it's an unaligned aio. */ - ocfs2_aiodio_wait(inode); - - /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */ - atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio); - ocfs2_iocb_set_unaligned_aio(iocb); + saved_ki_complete = xchg(&iocb->ki_complete, NULL); } - /* - * To later detect whether a journal commit for sync writes is - * necessary, we sample i_size, and cluster count here. - */ - old_size = i_size_read(inode); - old_clusters = OCFS2_I(inode)->ip_clusters; - /* communicate with ocfs2_dio_end_io */ ocfs2_iocb_set_rw_locked(iocb, rw_level); - ret = generic_segment_checks(iov, &nr_segs, &ocount, - VERIFY_READ); - if (ret) - goto out_dio; - - count = ocount; - ret = generic_write_checks(file, ppos, &count, - S_ISBLK(inode->i_mode)); - if (ret) - goto out_dio; - - if (direct_io) { - written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, - ppos, count, ocount); - if (written < 0) { - ret = written; - goto out_dio; - } - } else { - current->backing_dev_info = file->f_mapping->backing_dev_info; - written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos, - ppos, count, 0); - current->backing_dev_info = NULL; - } - -out_dio: + written = __generic_file_write_iter(iocb, from); /* buffered aio wouldn't have proper lock coverage today */ - BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); - - if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) || - ((file->f_flags & O_DIRECT) && !direct_io)) { - ret = filemap_fdatawrite_range(file->f_mapping, pos, - pos + count - 1); - if (ret < 0) - written = ret; - - if (!ret && ((old_size != i_size_read(inode)) || - (old_clusters != OCFS2_I(inode)->ip_clusters) || - has_refcount)) { - ret = jbd2_journal_force_commit(osb->journal->j_journal); - if (ret < 0) - written = ret; - } - - if (!ret) - ret = filemap_fdatawait_range(file->f_mapping, pos, - pos + count - 1); - } + BUG_ON(written == -EIOCBQUEUED && !direct_io); /* * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io @@ -2401,153 +2479,62 @@ out_dio: * async dio is going to do it in the future or an end_io after an * error has already done it. */ - if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { + if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { rw_level = -1; - have_alloc_sem = 0; - unaligned_dio = 0; - } - - if (unaligned_dio) { - ocfs2_iocb_clear_unaligned_aio(iocb); - atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio); - } - -out: - if (rw_level != -1) - ocfs2_rw_unlock(inode, rw_level); - -out_sems: - if (have_alloc_sem) - ocfs2_iocb_clear_sem_locked(iocb); - - mutex_unlock(&inode->i_mutex); - - if (written) - ret = written; - return ret; -} - -static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, - struct file *out, - struct splice_desc *sd) -{ - int ret; - - ret = ocfs2_prepare_inode_for_write(out, &sd->pos, - sd->total_len, 0, NULL, NULL); - if (ret < 0) { - mlog_errno(ret); - return ret; } - return splice_from_pipe_feed(pipe, sd, pipe_to_file); -} - -static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, - struct file *out, - loff_t *ppos, - size_t len, - unsigned int flags) -{ - int ret; - struct address_space *mapping = out->f_mapping; - struct inode *inode = mapping->host; - struct splice_desc sd = { - .total_len = len, - .flags = flags, - .pos = *ppos, - .u.file = out, - }; - - - trace_ocfs2_file_splice_write(inode, out, out->f_path.dentry, - (unsigned long long)OCFS2_I(inode)->ip_blkno, - out->f_path.dentry->d_name.len, - out->f_path.dentry->d_name.name, len); - - pipe_lock(pipe); - - splice_from_pipe_begin(&sd); - do { - ret = splice_from_pipe_next(pipe, &sd); - if (ret <= 0) - break; + if (unlikely(written <= 0)) + goto out; - mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); - ret = ocfs2_rw_lock(inode, 1); + if (((file->f_flags & O_DSYNC) && !direct_io) || + IS_SYNC(inode)) { + ret = filemap_fdatawrite_range(file->f_mapping, + iocb->ki_pos - written, + iocb->ki_pos - 1); if (ret < 0) - mlog_errno(ret); - else { - ret = ocfs2_splice_to_file(pipe, out, &sd); - ocfs2_rw_unlock(inode, 1); - } - mutex_unlock(&inode->i_mutex); - } while (ret > 0); - splice_from_pipe_end(pipe, &sd); - - pipe_unlock(pipe); - - if (sd.num_spliced) - ret = sd.num_spliced; - - if (ret > 0) { - int err; + written = ret; - err = generic_write_sync(out, *ppos, ret); - if (err) - ret = err; - else - *ppos += ret; + if (!ret) { + ret = jbd2_journal_force_commit(osb->journal->j_journal); + if (ret < 0) + written = ret; + } - balance_dirty_pages_ratelimited(mapping); + if (!ret) + ret = filemap_fdatawait_range(file->f_mapping, + iocb->ki_pos - written, + iocb->ki_pos - 1); } - return ret; -} - -static ssize_t ocfs2_file_splice_read(struct file *in, - loff_t *ppos, - struct pipe_inode_info *pipe, - size_t len, - unsigned int flags) -{ - int ret = 0, lock_level = 0; - struct inode *inode = file_inode(in); - - trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry, - (unsigned long long)OCFS2_I(inode)->ip_blkno, - in->f_path.dentry->d_name.len, - in->f_path.dentry->d_name.name, len); +out: + if (saved_ki_complete) + xchg(&iocb->ki_complete, saved_ki_complete); - /* - * See the comment in ocfs2_file_aio_read() - */ - ret = ocfs2_inode_lock_atime(inode, in->f_path.mnt, &lock_level); - if (ret < 0) { - mlog_errno(ret); - goto bail; - } - ocfs2_inode_unlock(inode, lock_level); + if (rw_level != -1) + ocfs2_rw_unlock(inode, rw_level); - ret = generic_file_splice_read(in, ppos, pipe, len, flags); +out_mutex: + inode_unlock(inode); -bail: + if (written) + ret = written; return ret; } -static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, - const struct iovec *iov, - unsigned long nr_segs, - loff_t pos) +static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, + struct iov_iter *to) { - int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0; + int ret = 0, rw_level = -1, lock_level = 0; struct file *filp = iocb->ki_filp; struct inode *inode = file_inode(filp); + int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; + int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0; - trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry, + trace_ocfs2_file_read_iter(inode, filp, filp->f_path.dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, filp->f_path.dentry->d_name.len, - filp->f_path.dentry->d_name.name, nr_segs); + filp->f_path.dentry->d_name.name, + to->nr_segs); /* GRRRRR */ if (!inode) { @@ -2556,19 +2543,24 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, goto bail; } - ocfs2_iocb_clear_sem_locked(iocb); + if (!direct_io && nowait) + return -EOPNOTSUPP; + + ocfs2_iocb_init_rw_locked(iocb); /* - * buffered reads protect themselves in ->readpage(). O_DIRECT reads + * buffered reads protect themselves in ->read_folio(). O_DIRECT reads * need locks to protect pending reads from racing with truncate. */ - if (filp->f_flags & O_DIRECT) { - have_alloc_sem = 1; - ocfs2_iocb_set_sem_locked(iocb); + if (direct_io) { + if (nowait) + ret = ocfs2_try_rw_lock(inode, 0); + else + ret = ocfs2_rw_lock(inode, 0); - ret = ocfs2_rw_lock(inode, 0); if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); goto bail; } rw_level = 0; @@ -2583,50 +2575,94 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, * * Take and drop the meta data lock to update inode fields * like i_size. This allows the checks down below - * generic_file_aio_read() a chance of actually working. + * copy_splice_read() a chance of actually working. */ - ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level); + ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level, + !nowait); if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); goto bail; } ocfs2_inode_unlock(inode, lock_level); - ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); - trace_generic_file_aio_read_ret(ret); + ret = generic_file_read_iter(iocb, to); + trace_generic_file_read_iter_ret(ret); /* buffered aio wouldn't have proper lock coverage today */ - BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); + BUG_ON(ret == -EIOCBQUEUED && !direct_io); - /* see ocfs2_file_aio_write */ + /* see ocfs2_file_write_iter */ if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { rw_level = -1; - have_alloc_sem = 0; } bail: - if (have_alloc_sem) - ocfs2_iocb_clear_sem_locked(iocb); - if (rw_level != -1) ocfs2_rw_unlock(inode, rw_level); return ret; } +static ssize_t ocfs2_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct inode *inode = file_inode(in); + ssize_t ret = 0; + int lock_level = 0; + + trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + in->f_path.dentry->d_name.len, + in->f_path.dentry->d_name.name, + flags); + + /* + * We're fine letting folks race truncates and extending writes with + * read across the cluster, just like they can locally. Hence no + * rw_lock during read. + * + * Take and drop the meta data lock to update inode fields like i_size. + * This allows the checks down below filemap_splice_read() a chance of + * actually working. + */ + ret = ocfs2_inode_lock_atime(inode, in->f_path.mnt, &lock_level, 1); + if (ret < 0) { + if (ret != -EAGAIN) + mlog_errno(ret); + goto bail; + } + ocfs2_inode_unlock(inode, lock_level); + + ret = filemap_splice_read(in, ppos, pipe, len, flags); + trace_filemap_splice_read_ret(ret); +bail: + return ret; +} + /* Refer generic_file_llseek_unlocked() */ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; int ret = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); switch (whence) { case SEEK_SET: break; case SEEK_END: - offset += inode->i_size; + /* SEEK_END requires the OCFS2 inode lock for the file + * because it references the file's size. + */ + ret = ocfs2_inode_lock(inode, NULL, 0); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + offset += i_size_read(inode); + ocfs2_inode_unlock(inode, 0); break; case SEEK_CUR: if (offset == 0) { @@ -2649,29 +2685,113 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence) offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (ret) return ret; return offset; } +static loff_t ocfs2_remap_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags) +{ + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); + struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb); + struct buffer_head *in_bh = NULL, *out_bh = NULL; + bool same_inode = (inode_in == inode_out); + loff_t remapped = 0; + ssize_t ret; + + if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) + return -EINVAL; + if (!ocfs2_refcount_tree(osb)) + return -EOPNOTSUPP; + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) + return -EROFS; + + /* Lock both files against IO */ + ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh); + if (ret) + return ret; + + /* Check file eligibility and prepare for block sharing. */ + ret = -EINVAL; + if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) || + (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE)) + goto out_unlock; + + ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, + &len, remap_flags); + if (ret < 0 || len == 0) + goto out_unlock; + + /* Lock out changes to the allocation maps and remap. */ + down_write(&OCFS2_I(inode_in)->ip_alloc_sem); + if (!same_inode) + down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem, + SINGLE_DEPTH_NESTING); + + /* Zap any page cache for the destination file's range. */ + truncate_inode_pages_range(&inode_out->i_data, + round_down(pos_out, PAGE_SIZE), + round_up(pos_out + len, PAGE_SIZE) - 1); + + remapped = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, + inode_out, out_bh, pos_out, len); + up_write(&OCFS2_I(inode_in)->ip_alloc_sem); + if (!same_inode) + up_write(&OCFS2_I(inode_out)->ip_alloc_sem); + if (remapped < 0) { + ret = remapped; + mlog_errno(ret); + goto out_unlock; + } + + /* + * Empty the extent map so that we may get the right extent + * record from the disk. + */ + ocfs2_extent_map_trunc(inode_in, 0); + ocfs2_extent_map_trunc(inode_out, 0); + + ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + +out_unlock: + ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh); + return remapped > 0 ? remapped : ret; +} + +static loff_t ocfs2_dir_llseek(struct file *file, loff_t offset, int whence) +{ + struct ocfs2_file_private *fp = file->private_data; + + return generic_llseek_cookie(file, offset, whence, &fp->cookie); +} + const struct inode_operations ocfs2_file_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, .permission = ocfs2_permission, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = ocfs2_listxattr, - .removexattr = generic_removexattr, .fiemap = ocfs2_fiemap, - .get_acl = ocfs2_iop_get_acl, + .get_inode_acl = ocfs2_iop_get_acl, + .set_acl = ocfs2_iop_set_acl, + .fileattr_get = ocfs2_fileattr_get, + .fileattr_set = ocfs2_fileattr_set, }; const struct inode_operations ocfs2_special_file_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, + .listxattr = ocfs2_listxattr, .permission = ocfs2_permission, - .get_acl = ocfs2_iop_get_acl, + .get_inode_acl = ocfs2_iop_get_acl, + .set_acl = ocfs2_iop_set_acl, }; /* @@ -2680,14 +2800,12 @@ const struct inode_operations ocfs2_special_file_iops = { */ const struct file_operations ocfs2_fops = { .llseek = ocfs2_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .mmap = ocfs2_mmap, + .mmap_prepare = ocfs2_mmap_prepare, .fsync = ocfs2_sync_file, .release = ocfs2_file_release, .open = ocfs2_file_open, - .aio_read = ocfs2_file_aio_read, - .aio_write = ocfs2_file_aio_write, + .read_iter = ocfs2_file_read_iter, + .write_iter = ocfs2_file_write_iter, .unlocked_ioctl = ocfs2_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ocfs2_compat_ioctl, @@ -2695,14 +2813,17 @@ const struct file_operations ocfs2_fops = { .lock = ocfs2_lock, .flock = ocfs2_flock, .splice_read = ocfs2_file_splice_read, - .splice_write = ocfs2_file_splice_write, + .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, + .remap_file_range = ocfs2_remap_file_range, + .fop_flags = FOP_ASYNC_LOCK, }; +WRAP_DIR_ITER(ocfs2_readdir) // FIXME! const struct file_operations ocfs2_dops = { - .llseek = generic_file_llseek, + .llseek = ocfs2_dir_llseek, .read = generic_read_dir, - .iterate = ocfs2_readdir, + .iterate_shared = shared_ocfs2_readdir, .fsync = ocfs2_sync_file, .release = ocfs2_dir_release, .open = ocfs2_dir_open, @@ -2712,6 +2833,7 @@ const struct file_operations ocfs2_dops = { #endif .lock = ocfs2_lock, .flock = ocfs2_flock, + .fop_flags = FOP_ASYNC_LOCK, }; /* @@ -2728,28 +2850,27 @@ const struct file_operations ocfs2_dops = { */ const struct file_operations ocfs2_fops_no_plocks = { .llseek = ocfs2_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .mmap = ocfs2_mmap, + .mmap_prepare = ocfs2_mmap_prepare, .fsync = ocfs2_sync_file, .release = ocfs2_file_release, .open = ocfs2_file_open, - .aio_read = ocfs2_file_aio_read, - .aio_write = ocfs2_file_aio_write, + .read_iter = ocfs2_file_read_iter, + .write_iter = ocfs2_file_write_iter, .unlocked_ioctl = ocfs2_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ocfs2_compat_ioctl, #endif .flock = ocfs2_flock, - .splice_read = ocfs2_file_splice_read, - .splice_write = ocfs2_file_splice_write, + .splice_read = filemap_splice_read, + .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, + .remap_file_range = ocfs2_remap_file_range, }; const struct file_operations ocfs2_dops_no_plocks = { - .llseek = generic_file_llseek, + .llseek = ocfs2_dir_llseek, .read = generic_read_dir, - .iterate = ocfs2_readdir, + .iterate_shared = shared_ocfs2_readdir, .fsync = ocfs2_sync_file, .release = ocfs2_dir_release, .open = ocfs2_dir_open, diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 97bf761c9e7c..41e65e45a9f3 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -1,26 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * file.h * * Function prototypes * * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #ifndef OCFS2_FILE_H @@ -36,6 +20,7 @@ struct ocfs2_alloc_context; enum ocfs2_alloc_restarted; struct ocfs2_file_private { + u64 cookie; struct file *fp_file; struct mutex fp_mutex; struct ocfs2_lock_res fp_flock; @@ -51,17 +36,27 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb, struct ocfs2_alloc_context *data_ac, struct ocfs2_alloc_context *meta_ac, enum ocfs2_alloc_restarted *reason_ret); +int ocfs2_set_inode_size(handle_t *handle, + struct inode *inode, + struct buffer_head *fe_bh, + u64 new_i_size); int ocfs2_simple_size_update(struct inode *inode, struct buffer_head *di_bh, u64 new_i_size); +int ocfs2_truncate_file(struct inode *inode, + struct buffer_head *di_bh, + u64 new_i_size); int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, u64 new_i_size, u64 zero_to); int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, loff_t zero_to); -int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); -int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat); -int ocfs2_permission(struct inode *inode, int mask); +int ocfs2_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr); +int ocfs2_getattr(struct mnt_idmap *idmap, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags); +int ocfs2_permission(struct mnt_idmap *idmap, + struct inode *inode, + int mask); int ocfs2_should_update_atime(struct inode *inode, struct vfsmount *vfsmnt); @@ -73,4 +68,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd, int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, size_t count); +int ocfs2_remove_inode_range(struct inode *inode, + struct buffer_head *di_bh, u64 byte_start, + u64 byte_len); #endif /* OCFS2_FILE_H */ diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c new file mode 100644 index 000000000000..3ad7baf67658 --- /dev/null +++ b/fs/ocfs2/filecheck.c @@ -0,0 +1,509 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * filecheck.c + * + * Code which implements online file check. + * + * Copyright (C) 2016 SuSE. All rights reserved. + */ + +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/kmod.h> +#include <linux/fs.h> +#include <linux/kobject.h> +#include <linux/sysfs.h> +#include <linux/sysctl.h> +#include <cluster/masklog.h> + +#include "ocfs2.h" +#include "ocfs2_fs.h" +#include "stackglue.h" +#include "inode.h" + +#include "filecheck.h" + + +/* File check error strings, + * must correspond with error number in header file. + */ +static const char * const ocfs2_filecheck_errs[] = { + "SUCCESS", + "FAILED", + "INPROGRESS", + "READONLY", + "INJBD", + "INVALIDINO", + "BLOCKECC", + "BLOCKNO", + "VALIDFLAG", + "GENERATION", + "UNSUPPORTED" +}; + +struct ocfs2_filecheck_entry { + struct list_head fe_list; + unsigned long fe_ino; + unsigned int fe_type; + unsigned int fe_done:1; + unsigned int fe_status:31; +}; + +struct ocfs2_filecheck_args { + unsigned int fa_type; + union { + unsigned long fa_ino; + unsigned int fa_len; + }; +}; + +static const char * +ocfs2_filecheck_error(int errno) +{ + if (!errno) + return ocfs2_filecheck_errs[errno]; + + BUG_ON(errno < OCFS2_FILECHECK_ERR_START || + errno > OCFS2_FILECHECK_ERR_END); + return ocfs2_filecheck_errs[errno - OCFS2_FILECHECK_ERR_START + 1]; +} + +static ssize_t ocfs2_filecheck_attr_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf); +static ssize_t ocfs2_filecheck_attr_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count); +static struct kobj_attribute ocfs2_filecheck_attr_chk = + __ATTR(check, S_IRUSR | S_IWUSR, + ocfs2_filecheck_attr_show, + ocfs2_filecheck_attr_store); +static struct kobj_attribute ocfs2_filecheck_attr_fix = + __ATTR(fix, S_IRUSR | S_IWUSR, + ocfs2_filecheck_attr_show, + ocfs2_filecheck_attr_store); +static struct kobj_attribute ocfs2_filecheck_attr_set = + __ATTR(set, S_IRUSR | S_IWUSR, + ocfs2_filecheck_attr_show, + ocfs2_filecheck_attr_store); +static struct attribute *ocfs2_filecheck_attrs[] = { + &ocfs2_filecheck_attr_chk.attr, + &ocfs2_filecheck_attr_fix.attr, + &ocfs2_filecheck_attr_set.attr, + NULL +}; +ATTRIBUTE_GROUPS(ocfs2_filecheck); + +static void ocfs2_filecheck_release(struct kobject *kobj) +{ + struct ocfs2_filecheck_sysfs_entry *entry = container_of(kobj, + struct ocfs2_filecheck_sysfs_entry, fs_kobj); + + complete(&entry->fs_kobj_unregister); +} + +static ssize_t +ocfs2_filecheck_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + ssize_t ret = -EIO; + struct kobj_attribute *kattr = container_of(attr, + struct kobj_attribute, attr); + + kobject_get(kobj); + if (kattr->show) + ret = kattr->show(kobj, kattr, buf); + kobject_put(kobj); + return ret; +} + +static ssize_t +ocfs2_filecheck_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + ssize_t ret = -EIO; + struct kobj_attribute *kattr = container_of(attr, + struct kobj_attribute, attr); + + kobject_get(kobj); + if (kattr->store) + ret = kattr->store(kobj, kattr, buf, count); + kobject_put(kobj); + return ret; +} + +static const struct sysfs_ops ocfs2_filecheck_ops = { + .show = ocfs2_filecheck_show, + .store = ocfs2_filecheck_store, +}; + +static struct kobj_type ocfs2_ktype_filecheck = { + .default_groups = ocfs2_filecheck_groups, + .sysfs_ops = &ocfs2_filecheck_ops, + .release = ocfs2_filecheck_release, +}; + +static void +ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry) +{ + struct ocfs2_filecheck_entry *p; + + spin_lock(&entry->fs_fcheck->fc_lock); + while (!list_empty(&entry->fs_fcheck->fc_head)) { + p = list_first_entry(&entry->fs_fcheck->fc_head, + struct ocfs2_filecheck_entry, fe_list); + list_del(&p->fe_list); + BUG_ON(!p->fe_done); /* To free a undone file check entry */ + kfree(p); + } + spin_unlock(&entry->fs_fcheck->fc_lock); + + kfree(entry->fs_fcheck); + entry->fs_fcheck = NULL; +} + +int ocfs2_filecheck_create_sysfs(struct ocfs2_super *osb) +{ + int ret; + struct ocfs2_filecheck *fcheck; + struct ocfs2_filecheck_sysfs_entry *entry = &osb->osb_fc_ent; + + fcheck = kmalloc(sizeof(struct ocfs2_filecheck), GFP_NOFS); + if (!fcheck) + return -ENOMEM; + + INIT_LIST_HEAD(&fcheck->fc_head); + spin_lock_init(&fcheck->fc_lock); + fcheck->fc_max = OCFS2_FILECHECK_MINSIZE; + fcheck->fc_size = 0; + fcheck->fc_done = 0; + + entry->fs_kobj.kset = osb->osb_dev_kset; + init_completion(&entry->fs_kobj_unregister); + ret = kobject_init_and_add(&entry->fs_kobj, &ocfs2_ktype_filecheck, + NULL, "filecheck"); + if (ret) { + kobject_put(&entry->fs_kobj); + kfree(fcheck); + return ret; + } + + entry->fs_fcheck = fcheck; + return 0; +} + +void ocfs2_filecheck_remove_sysfs(struct ocfs2_super *osb) +{ + if (!osb->osb_fc_ent.fs_fcheck) + return; + + kobject_del(&osb->osb_fc_ent.fs_kobj); + kobject_put(&osb->osb_fc_ent.fs_kobj); + wait_for_completion(&osb->osb_fc_ent.fs_kobj_unregister); + ocfs2_filecheck_sysfs_free(&osb->osb_fc_ent); +} + +static int +ocfs2_filecheck_erase_entries(struct ocfs2_filecheck_sysfs_entry *ent, + unsigned int count); +static int +ocfs2_filecheck_adjust_max(struct ocfs2_filecheck_sysfs_entry *ent, + unsigned int len) +{ + int ret; + + if ((len < OCFS2_FILECHECK_MINSIZE) || (len > OCFS2_FILECHECK_MAXSIZE)) + return -EINVAL; + + spin_lock(&ent->fs_fcheck->fc_lock); + if (len < (ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done)) { + mlog(ML_NOTICE, + "Cannot set online file check maximum entry number " + "to %u due to too many pending entries(%u)\n", + len, ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done); + ret = -EBUSY; + } else { + if (len < ent->fs_fcheck->fc_size) + BUG_ON(!ocfs2_filecheck_erase_entries(ent, + ent->fs_fcheck->fc_size - len)); + + ent->fs_fcheck->fc_max = len; + ret = 0; + } + spin_unlock(&ent->fs_fcheck->fc_lock); + + return ret; +} + +#define OCFS2_FILECHECK_ARGS_LEN 24 +static int +ocfs2_filecheck_args_get_long(const char *buf, size_t count, + unsigned long *val) +{ + char buffer[OCFS2_FILECHECK_ARGS_LEN]; + + memcpy(buffer, buf, count); + buffer[count] = '\0'; + + if (kstrtoul(buffer, 0, val)) + return 1; + + return 0; +} + +static int +ocfs2_filecheck_type_parse(const char *name, unsigned int *type) +{ + if (!strncmp(name, "fix", 4)) + *type = OCFS2_FILECHECK_TYPE_FIX; + else if (!strncmp(name, "check", 6)) + *type = OCFS2_FILECHECK_TYPE_CHK; + else if (!strncmp(name, "set", 4)) + *type = OCFS2_FILECHECK_TYPE_SET; + else + return 1; + + return 0; +} + +static int +ocfs2_filecheck_args_parse(const char *name, const char *buf, size_t count, + struct ocfs2_filecheck_args *args) +{ + unsigned long val = 0; + unsigned int type; + + /* too short/long args length */ + if ((count < 1) || (count >= OCFS2_FILECHECK_ARGS_LEN)) + return 1; + + if (ocfs2_filecheck_type_parse(name, &type)) + return 1; + if (ocfs2_filecheck_args_get_long(buf, count, &val)) + return 1; + + if (val <= 0) + return 1; + + args->fa_type = type; + if (type == OCFS2_FILECHECK_TYPE_SET) + args->fa_len = (unsigned int)val; + else + args->fa_ino = val; + + return 0; +} + +static ssize_t ocfs2_filecheck_attr_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + + ssize_t ret = 0, total = 0, remain = PAGE_SIZE; + unsigned int type; + struct ocfs2_filecheck_entry *p; + struct ocfs2_filecheck_sysfs_entry *ent = container_of(kobj, + struct ocfs2_filecheck_sysfs_entry, fs_kobj); + + if (ocfs2_filecheck_type_parse(attr->attr.name, &type)) + return -EINVAL; + + if (type == OCFS2_FILECHECK_TYPE_SET) { + spin_lock(&ent->fs_fcheck->fc_lock); + total = snprintf(buf, remain, "%u\n", ent->fs_fcheck->fc_max); + spin_unlock(&ent->fs_fcheck->fc_lock); + goto exit; + } + + ret = snprintf(buf, remain, "INO\t\tDONE\tERROR\n"); + total += ret; + remain -= ret; + spin_lock(&ent->fs_fcheck->fc_lock); + list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) { + if (p->fe_type != type) + continue; + + ret = snprintf(buf + total, remain, "%lu\t\t%u\t%s\n", + p->fe_ino, p->fe_done, + ocfs2_filecheck_error(p->fe_status)); + if (ret >= remain) { + /* snprintf() didn't fit */ + total = -E2BIG; + break; + } + total += ret; + remain -= ret; + } + spin_unlock(&ent->fs_fcheck->fc_lock); + +exit: + return total; +} + +static inline int +ocfs2_filecheck_is_dup_entry(struct ocfs2_filecheck_sysfs_entry *ent, + unsigned long ino) +{ + struct ocfs2_filecheck_entry *p; + + list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) { + if (!p->fe_done) { + if (p->fe_ino == ino) + return 1; + } + } + + return 0; +} + +static inline int +ocfs2_filecheck_erase_entry(struct ocfs2_filecheck_sysfs_entry *ent) +{ + struct ocfs2_filecheck_entry *p; + + list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) { + if (p->fe_done) { + list_del(&p->fe_list); + kfree(p); + ent->fs_fcheck->fc_size--; + ent->fs_fcheck->fc_done--; + return 1; + } + } + + return 0; +} + +static int +ocfs2_filecheck_erase_entries(struct ocfs2_filecheck_sysfs_entry *ent, + unsigned int count) +{ + unsigned int i = 0; + unsigned int ret = 0; + + while (i++ < count) { + if (ocfs2_filecheck_erase_entry(ent)) + ret++; + else + break; + } + + return (ret == count ? 1 : 0); +} + +static void +ocfs2_filecheck_done_entry(struct ocfs2_filecheck_sysfs_entry *ent, + struct ocfs2_filecheck_entry *entry) +{ + spin_lock(&ent->fs_fcheck->fc_lock); + entry->fe_done = 1; + ent->fs_fcheck->fc_done++; + spin_unlock(&ent->fs_fcheck->fc_lock); +} + +static unsigned int +ocfs2_filecheck_handle(struct ocfs2_super *osb, + unsigned long ino, unsigned int flags) +{ + unsigned int ret = OCFS2_FILECHECK_ERR_SUCCESS; + struct inode *inode = NULL; + int rc; + + inode = ocfs2_iget(osb, ino, flags, 0); + if (IS_ERR(inode)) { + rc = (int)(-(long)inode); + if (rc >= OCFS2_FILECHECK_ERR_START && + rc < OCFS2_FILECHECK_ERR_END) + ret = rc; + else + ret = OCFS2_FILECHECK_ERR_FAILED; + } else + iput(inode); + + return ret; +} + +static void +ocfs2_filecheck_handle_entry(struct ocfs2_filecheck_sysfs_entry *ent, + struct ocfs2_filecheck_entry *entry) +{ + struct ocfs2_super *osb = container_of(ent, struct ocfs2_super, + osb_fc_ent); + + if (entry->fe_type == OCFS2_FILECHECK_TYPE_CHK) + entry->fe_status = ocfs2_filecheck_handle(osb, + entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_CHK); + else if (entry->fe_type == OCFS2_FILECHECK_TYPE_FIX) + entry->fe_status = ocfs2_filecheck_handle(osb, + entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_FIX); + else + entry->fe_status = OCFS2_FILECHECK_ERR_UNSUPPORTED; + + ocfs2_filecheck_done_entry(ent, entry); +} + +static ssize_t ocfs2_filecheck_attr_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + ssize_t ret = 0; + struct ocfs2_filecheck_args args; + struct ocfs2_filecheck_entry *entry; + struct ocfs2_filecheck_sysfs_entry *ent = container_of(kobj, + struct ocfs2_filecheck_sysfs_entry, fs_kobj); + + if (count == 0) + return count; + + if (ocfs2_filecheck_args_parse(attr->attr.name, buf, count, &args)) + return -EINVAL; + + if (args.fa_type == OCFS2_FILECHECK_TYPE_SET) { + ret = ocfs2_filecheck_adjust_max(ent, args.fa_len); + goto exit; + } + + entry = kmalloc(sizeof(struct ocfs2_filecheck_entry), GFP_NOFS); + if (!entry) { + ret = -ENOMEM; + goto exit; + } + + spin_lock(&ent->fs_fcheck->fc_lock); + if (ocfs2_filecheck_is_dup_entry(ent, args.fa_ino)) { + ret = -EEXIST; + kfree(entry); + } else if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) && + (ent->fs_fcheck->fc_done == 0)) { + mlog(ML_NOTICE, + "Cannot do more file check " + "since file check queue(%u) is full now\n", + ent->fs_fcheck->fc_max); + ret = -EAGAIN; + kfree(entry); + } else { + if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) && + (ent->fs_fcheck->fc_done > 0)) { + /* Delete the oldest entry which was done, + * make sure the entry size in list does + * not exceed maximum value + */ + BUG_ON(!ocfs2_filecheck_erase_entry(ent)); + } + + entry->fe_ino = args.fa_ino; + entry->fe_type = args.fa_type; + entry->fe_done = 0; + entry->fe_status = OCFS2_FILECHECK_ERR_INPROGRESS; + list_add_tail(&entry->fe_list, &ent->fs_fcheck->fc_head); + ent->fs_fcheck->fc_size++; + } + spin_unlock(&ent->fs_fcheck->fc_lock); + + if (!ret) + ocfs2_filecheck_handle_entry(ent, entry); + +exit: + return ret ?: count; +} diff --git a/fs/ocfs2/filecheck.h b/fs/ocfs2/filecheck.h new file mode 100644 index 000000000000..d3bcb8bcfeb0 --- /dev/null +++ b/fs/ocfs2/filecheck.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * filecheck.h + * + * Online file check. + * + * Copyright (C) 2016 SuSE. All rights reserved. + */ + + +#ifndef FILECHECK_H +#define FILECHECK_H + +#include <linux/types.h> +#include <linux/list.h> + + +/* File check errno */ +enum { + OCFS2_FILECHECK_ERR_SUCCESS = 0, /* Success */ + OCFS2_FILECHECK_ERR_FAILED = 1000, /* Other failure */ + OCFS2_FILECHECK_ERR_INPROGRESS, /* In progress */ + OCFS2_FILECHECK_ERR_READONLY, /* Read only */ + OCFS2_FILECHECK_ERR_INJBD, /* Buffer in jbd */ + OCFS2_FILECHECK_ERR_INVALIDINO, /* Invalid ino */ + OCFS2_FILECHECK_ERR_BLOCKECC, /* Block ecc */ + OCFS2_FILECHECK_ERR_BLOCKNO, /* Block number */ + OCFS2_FILECHECK_ERR_VALIDFLAG, /* Inode valid flag */ + OCFS2_FILECHECK_ERR_GENERATION, /* Inode generation */ + OCFS2_FILECHECK_ERR_UNSUPPORTED /* Unsupported */ +}; + +#define OCFS2_FILECHECK_ERR_START OCFS2_FILECHECK_ERR_FAILED +#define OCFS2_FILECHECK_ERR_END OCFS2_FILECHECK_ERR_UNSUPPORTED + +struct ocfs2_filecheck { + struct list_head fc_head; /* File check entry list head */ + spinlock_t fc_lock; + unsigned int fc_max; /* Maximum number of entry in list */ + unsigned int fc_size; /* Current entry count in list */ + unsigned int fc_done; /* Finished entry count in list */ +}; + +#define OCFS2_FILECHECK_MAXSIZE 100 +#define OCFS2_FILECHECK_MINSIZE 10 + +/* File check operation type */ +enum { + OCFS2_FILECHECK_TYPE_CHK = 0, /* Check a file(inode) */ + OCFS2_FILECHECK_TYPE_FIX, /* Fix a file(inode) */ + OCFS2_FILECHECK_TYPE_SET = 100 /* Set entry list maximum size */ +}; + +struct ocfs2_filecheck_sysfs_entry { /* sysfs entry per partition */ + struct kobject fs_kobj; + struct completion fs_kobj_unregister; + struct ocfs2_filecheck *fs_fcheck; +}; + + +int ocfs2_filecheck_create_sysfs(struct ocfs2_super *osb); +void ocfs2_filecheck_remove_sysfs(struct ocfs2_super *osb); + +#endif /* FILECHECK_H */ diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c index d8208b20dc53..22da768e65b7 100644 --- a/fs/ocfs2/heartbeat.c +++ b/fs/ocfs2/heartbeat.c @@ -1,29 +1,14 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * heartbeat.c * - * Register ourselves with the heartbaet service, keep our node maps + * Register ourselves with the heartbeat service, keep our node maps * up to date, and fire off recovery when needed. * * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ +#include <linux/bitmap.h> #include <linux/fs.h> #include <linux/types.h> #include <linux/highmem.h> @@ -40,18 +25,12 @@ #include "buffer_head_io.h" -static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, - int bit); -static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, - int bit); - /* special case -1 for now * TODO: should *really* make sure the calling func never passes -1!! */ static void ocfs2_node_map_init(struct ocfs2_node_map *map) { map->num_nodes = OCFS2_NODE_MAP_MAX_NODES; - memset(map->map, 0, BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES) * - sizeof(unsigned long)); + bitmap_zero(map->map, OCFS2_NODE_MAP_MAX_NODES); } void ocfs2_init_node_maps(struct ocfs2_super *osb) @@ -81,12 +60,6 @@ void ocfs2_do_node_down(int node_num, void *data) ocfs2_recovery_thread(osb, node_num); } -static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, - int bit) -{ - set_bit(bit, map->map); -} - void ocfs2_node_map_set_bit(struct ocfs2_super *osb, struct ocfs2_node_map *map, int bit) @@ -95,16 +68,10 @@ void ocfs2_node_map_set_bit(struct ocfs2_super *osb, return; BUG_ON(bit >= map->num_nodes); spin_lock(&osb->node_map_lock); - __ocfs2_node_map_set_bit(map, bit); + set_bit(bit, map->map); spin_unlock(&osb->node_map_lock); } -static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, - int bit) -{ - clear_bit(bit, map->map); -} - void ocfs2_node_map_clear_bit(struct ocfs2_super *osb, struct ocfs2_node_map *map, int bit) @@ -113,7 +80,7 @@ void ocfs2_node_map_clear_bit(struct ocfs2_super *osb, return; BUG_ON(bit >= map->num_nodes); spin_lock(&osb->node_map_lock); - __ocfs2_node_map_clear_bit(map, bit); + clear_bit(bit, map->map); spin_unlock(&osb->node_map_lock); } diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h index 74b9c5dda28d..f1f8b1802fe4 100644 --- a/fs/ocfs2/heartbeat.h +++ b/fs/ocfs2/heartbeat.h @@ -1,26 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * heartbeat.h * * Function prototypes * * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #ifndef OCFS2_HEARTBEAT_H diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index f87f9bd1edff..8340525e5589 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1,26 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * inode.c * * vfs' aops, fops, dops and iops * * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/fs.h> @@ -28,6 +12,7 @@ #include <linux/highmem.h> #include <linux/pagemap.h> #include <linux/quotaops.h> +#include <linux/iversion.h> #include <asm/byteorder.h> @@ -53,6 +38,7 @@ #include "xattr.h" #include "refcounttree.h" #include "ocfs2_trace.h" +#include "filecheck.h" #include "buffer_head_io.h" @@ -64,8 +50,6 @@ struct ocfs2_find_inode_args unsigned int fi_sysfile_type; }; -static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES]; - static int ocfs2_read_locked_inode(struct inode *inode, struct ocfs2_find_inode_args *args); static int ocfs2_init_locked_inode(struct inode *inode, void *opaque); @@ -74,6 +58,14 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, struct inode *inode, struct buffer_head *fe_bh); +static int ocfs2_filecheck_read_inode_block_full(struct inode *inode, + struct buffer_head **bh, + int flags, int type); +static int ocfs2_filecheck_validate_inode_block(struct super_block *sb, + struct buffer_head *bh); +static int ocfs2_filecheck_repair_inode_block(struct super_block *sb, + struct buffer_head *bh); + void ocfs2_set_inode_flags(struct inode *inode) { unsigned int flags = OCFS2_I(inode)->ip_attr; @@ -127,9 +119,11 @@ struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno) struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, int sysfile_type) { + int rc = -ESTALE; struct inode *inode = NULL; struct super_block *sb = osb->sb; struct ocfs2_find_inode_args args; + journal_t *journal = osb->journal->j_journal; trace_ocfs2_iget_begin((unsigned long long)blkno, flags, sysfile_type); @@ -158,17 +152,43 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, mlog_errno(PTR_ERR(inode)); goto bail; } - trace_ocfs2_iget5_locked(inode->i_state); - if (inode->i_state & I_NEW) { - ocfs2_read_locked_inode(inode, &args); + trace_ocfs2_iget5_locked(inode_state_read_once(inode)); + if (inode_state_read_once(inode) & I_NEW) { + rc = ocfs2_read_locked_inode(inode, &args); unlock_new_inode(inode); } if (is_bad_inode(inode)) { iput(inode); - inode = ERR_PTR(-ESTALE); + inode = ERR_PTR(rc); goto bail; } + /* + * Set transaction id's of transactions that have to be committed + * to finish f[data]sync. We set them to currently running transaction + * as we cannot be sure that the inode or some of its metadata isn't + * part of the transaction - the inode could have been reclaimed and + * now it is reread from disk. + */ + if (journal) { + transaction_t *transaction; + tid_t tid; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + read_lock(&journal->j_state_lock); + if (journal->j_running_transaction) + transaction = journal->j_running_transaction; + else + transaction = journal->j_committing_transaction; + if (transaction) + tid = transaction->t_tid; + else + tid = journal->j_commit_sequence; + read_unlock(&journal->j_state_lock); + oi->i_sync_tid = tid; + oi->i_datasync_tid = tid; + } + bail: if (!IS_ERR(inode)) { trace_ocfs2_iget_end(inode, @@ -178,6 +198,22 @@ bail: return inode; } +static int ocfs2_dinode_has_extents(struct ocfs2_dinode *di) +{ + /* inodes flagged with other stuff in id2 */ + if (le32_to_cpu(di->i_flags) & + (OCFS2_SUPER_BLOCK_FL | OCFS2_LOCAL_ALLOC_FL | OCFS2_CHAIN_FL | + OCFS2_DEALLOC_FL)) + return 0; + /* i_flags doesn't indicate when id2 is a fast symlink */ + if (S_ISLNK(le16_to_cpu(di->i_mode)) && le64_to_cpu(di->i_size) && + !le32_to_cpu(di->i_clusters)) + return 0; + if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) + return 0; + + return 1; +} /* * here's how inodes get read from disk: @@ -214,14 +250,77 @@ bail: static int ocfs2_init_locked_inode(struct inode *inode, void *opaque) { struct ocfs2_find_inode_args *args = opaque; +#ifdef CONFIG_LOCKDEP + static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES]; static struct lock_class_key ocfs2_quota_ip_alloc_sem_key, ocfs2_file_ip_alloc_sem_key; +#endif inode->i_ino = args->fi_ino; OCFS2_I(inode)->ip_blkno = args->fi_blkno; - if (args->fi_sysfile_type != 0) - lockdep_set_class(&inode->i_mutex, - &ocfs2_sysfile_lock_key[args->fi_sysfile_type]); +#ifdef CONFIG_LOCKDEP + switch (args->fi_sysfile_type) { + case BAD_BLOCK_SYSTEM_INODE: + break; + case GLOBAL_INODE_ALLOC_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[GLOBAL_INODE_ALLOC_SYSTEM_INODE]); + break; + case SLOT_MAP_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[SLOT_MAP_SYSTEM_INODE]); + break; + case HEARTBEAT_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[HEARTBEAT_SYSTEM_INODE]); + break; + case GLOBAL_BITMAP_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[GLOBAL_BITMAP_SYSTEM_INODE]); + break; + case USER_QUOTA_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[USER_QUOTA_SYSTEM_INODE]); + break; + case GROUP_QUOTA_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[GROUP_QUOTA_SYSTEM_INODE]); + break; + case ORPHAN_DIR_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[ORPHAN_DIR_SYSTEM_INODE]); + break; + case EXTENT_ALLOC_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[EXTENT_ALLOC_SYSTEM_INODE]); + break; + case INODE_ALLOC_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[INODE_ALLOC_SYSTEM_INODE]); + break; + case JOURNAL_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[JOURNAL_SYSTEM_INODE]); + break; + case LOCAL_ALLOC_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[LOCAL_ALLOC_SYSTEM_INODE]); + break; + case TRUNCATE_LOG_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[TRUNCATE_LOG_SYSTEM_INODE]); + break; + case LOCAL_USER_QUOTA_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[LOCAL_USER_QUOTA_SYSTEM_INODE]); + break; + case LOCAL_GROUP_QUOTA_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[LOCAL_GROUP_QUOTA_SYSTEM_INODE]); + break; + default: + WARN_ONCE(1, "Unknown sysfile type %d\n", args->fi_sysfile_type); + } if (args->fi_sysfile_type == USER_QUOTA_SYSTEM_INODE || args->fi_sysfile_type == GROUP_QUOTA_SYSTEM_INODE || args->fi_sysfile_type == LOCAL_USER_QUOTA_SYSTEM_INODE || @@ -231,6 +330,7 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque) else lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem, &ocfs2_file_ip_alloc_sem_key); +#endif return 0; } @@ -265,7 +365,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features); - inode->i_version = 1; + inode_set_iversion(inode, 1); inode->i_generation = le32_to_cpu(fe->i_generation); inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); inode->i_mode = le16_to_cpu(fe->i_mode); @@ -280,12 +380,12 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, inode->i_blocks = ocfs2_inode_sector_count(inode); inode->i_mapping->a_ops = &ocfs2_aops; } - inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); - inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); - inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); - inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec); - inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime); - inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec); + inode_set_atime(inode, le64_to_cpu(fe->i_atime), + le32_to_cpu(fe->i_atime_nsec)); + inode_set_mtime(inode, le64_to_cpu(fe->i_mtime), + le32_to_cpu(fe->i_mtime_nsec)); + inode_set_ctime(inode, le64_to_cpu(fe->i_ctime), + le32_to_cpu(fe->i_ctime_nsec)); if (OCFS2_I(inode)->ip_blkno != le64_to_cpu(fe->i_blkno)) mlog(ML_ERROR, @@ -334,6 +434,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, break; case S_IFLNK: inode->i_op = &ocfs2_symlink_inode_operations; + inode_nohighmem(inode); i_size_write(inode, le64_to_cpu(fe->i_size)); break; default: @@ -382,23 +483,13 @@ static int ocfs2_read_locked_inode(struct inode *inode, struct ocfs2_super *osb; struct ocfs2_dinode *fe; struct buffer_head *bh = NULL; - int status, can_lock; + int status, can_lock, lock_level = 0; u32 generation = 0; status = -EINVAL; - if (inode == NULL || inode->i_sb == NULL) { - mlog(ML_ERROR, "bad inode\n"); - return status; - } sb = inode->i_sb; osb = OCFS2_SB(sb); - if (!args) { - mlog(ML_ERROR, "bad inode args\n"); - make_bad_inode(inode); - return status; - } - /* * To improve performance of cold-cache inode stats, we take * the cluster lock here if possible. @@ -460,7 +551,7 @@ static int ocfs2_read_locked_inode(struct inode *inode, mlog_errno(status); return status; } - status = ocfs2_inode_lock(inode, NULL, 0); + status = ocfs2_inode_lock(inode, NULL, lock_level); if (status) { make_bad_inode(inode); mlog_errno(status); @@ -477,16 +568,32 @@ static int ocfs2_read_locked_inode(struct inode *inode, } if (can_lock) { - status = ocfs2_read_inode_block_full(inode, &bh, - OCFS2_BH_IGNORE_CACHE); + if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK) + status = ocfs2_filecheck_read_inode_block_full(inode, + &bh, OCFS2_BH_IGNORE_CACHE, 0); + else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX) + status = ocfs2_filecheck_read_inode_block_full(inode, + &bh, OCFS2_BH_IGNORE_CACHE, 1); + else + status = ocfs2_read_inode_block_full(inode, + &bh, OCFS2_BH_IGNORE_CACHE); } else { status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); /* * If buffer is in jbd, then its checksum may not have been * computed as yet. */ - if (!status && !buffer_jbd(bh)) - status = ocfs2_validate_inode_block(osb->sb, bh); + if (!status && !buffer_jbd(bh)) { + if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK) + status = ocfs2_filecheck_validate_inode_block( + osb->sb, bh); + else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX) + status = ocfs2_filecheck_repair_inode_block( + osb->sb, bh); + else + status = ocfs2_validate_inode_block( + osb->sb, bh); + } } if (status < 0) { mlog_errno(status); @@ -503,7 +610,7 @@ static int ocfs2_read_locked_inode(struct inode *inode, */ mlog_bug_on_msg(!!(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) != !!(args->fi_flags & OCFS2_FI_FLAG_SYSFILE), - "Inode %llu: system file state is ambigous\n", + "Inode %llu: system file state is ambiguous\n", (unsigned long long)args->fi_blkno); if (S_ISCHR(le16_to_cpu(fe->i_mode)) || @@ -514,17 +621,29 @@ static int ocfs2_read_locked_inode(struct inode *inode, BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); + if (buffer_dirty(bh) && !buffer_jbd(bh)) { + if (can_lock) { + ocfs2_inode_unlock(inode, lock_level); + lock_level = 1; + ocfs2_inode_lock(inode, NULL, lock_level); + } + status = ocfs2_write_block(osb, bh, INODE_CACHE(inode)); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + status = 0; bail: if (can_lock) - ocfs2_inode_unlock(inode, 0); + ocfs2_inode_unlock(inode, lock_level); if (status < 0) make_bad_inode(inode); - if (args && bh) - brelse(bh); + brelse(bh); return status; } @@ -580,10 +699,8 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, handle = NULL; status = ocfs2_commit_truncate(osb, inode, fe_bh); - if (status < 0) { + if (status < 0) mlog_errno(status); - goto out; - } } out: @@ -608,15 +725,15 @@ static int ocfs2_remove_inode(struct inode *inode, ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, le16_to_cpu(di->i_suballoc_slot)); if (!inode_alloc_inode) { - status = -EEXIST; + status = -ENOENT; mlog_errno(status); goto bail; } - mutex_lock(&inode_alloc_inode->i_mutex); + inode_lock(inode_alloc_inode); status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1); if (status < 0) { - mutex_unlock(&inode_alloc_inode->i_mutex); + inode_unlock(inode_alloc_inode); mlog_errno(status); goto bail; @@ -632,7 +749,7 @@ static int ocfs2_remove_inode(struct inode *inode, if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) { status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, - orphan_dir_bh); + orphan_dir_bh, false); if (status < 0) { mlog_errno(status); goto bail_commit; @@ -647,7 +764,7 @@ static int ocfs2_remove_inode(struct inode *inode, goto bail_commit; } - di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec); + di->i_dtime = cpu_to_le64(ktime_get_real_seconds()); di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL)); ocfs2_journal_dirty(handle, di_bh); @@ -663,7 +780,7 @@ bail_commit: ocfs2_commit_trans(osb, handle); bail_unlock: ocfs2_inode_unlock(inode_alloc_inode, 1); - mutex_unlock(&inode_alloc_inode->i_mutex); + inode_unlock(inode_alloc_inode); brelse(inode_alloc_bh); bail: iput(inode_alloc_inode); @@ -674,7 +791,7 @@ bail: /* * Serialize with orphan dir recovery. If the process doing * recovery on this orphan dir does an iget() with the dir - * i_mutex held, we'll deadlock here. Instead we detect this + * i_rwsem held, we'll deadlock here. Instead we detect this * and exit early - recovery will wipe this inode for us. */ static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb, @@ -726,7 +843,7 @@ static int ocfs2_wipe_inode(struct inode *inode, ORPHAN_DIR_SYSTEM_INODE, orphaned_slot); if (!orphan_dir_inode) { - status = -EEXIST; + status = -ENOENT; mlog_errno(status); goto bail; } @@ -734,10 +851,10 @@ static int ocfs2_wipe_inode(struct inode *inode, /* Lock the orphan dir. The lock will be held for the entire * delete_inode operation. We do this now to avoid races with * recovery completion on other nodes. */ - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); if (status < 0) { - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); mlog_errno(status); goto bail; @@ -786,7 +903,7 @@ bail_unlock_dir: return status; ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); brelse(orphan_dir_bh); bail: iput(orphan_dir_inode); @@ -814,11 +931,13 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode) goto bail; } - /* If we're coming from downconvert_thread we can't go into our own - * voting [hello, deadlock city!], so unforuntately we just - * have to skip deleting this guy. That's OK though because - * the node who's doing the actual deleting should handle it - * anyway. */ + /* + * If we're coming from downconvert_thread we can't go into our own + * voting [hello, deadlock city!] so we cannot delete the inode. But + * since we dropped last inode ref when downconverting dentry lock, + * we cannot have the file open and thus the node doing unlink will + * take care of deleting the inode. + */ if (current == osb->dc_task) goto bail; @@ -832,12 +951,6 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode) goto bail_unlock; } - /* If we have allowd wipe of this inode for another node, it - * will be marked here so we can safely skip it. Recovery will - * cleanup any inodes we might inadvertently skip here. */ - if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) - goto bail_unlock; - ret = 1; bail_unlock: spin_unlock(&oi->ip_lock); @@ -951,7 +1064,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode, (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data); if (sync_data) filemap_write_and_wait(inode->i_mapping); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); } static void ocfs2_delete_inode(struct inode *inode) @@ -959,6 +1072,7 @@ static void ocfs2_delete_inode(struct inode *inode) int wipe, status; sigset_t oldset; struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di = NULL; trace_ocfs2_delete_inode(inode->i_ino, (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -970,8 +1084,6 @@ static void ocfs2_delete_inode(struct inode *inode) if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno) goto bail; - dquot_initialize(inode); - if (!ocfs2_inode_is_valid_to_delete(inode)) { /* It's probably not necessary to truncate_inode_pages * here but we do it for safety anyway (it will most @@ -980,6 +1092,8 @@ static void ocfs2_delete_inode(struct inode *inode) goto bail; } + dquot_initialize(inode); + /* We want to block signals in delete_inode as the lock and * messaging paths may return us -ERESTARTSYS. Which would * cause us to exit early, resulting in inodes being orphaned @@ -1013,6 +1127,14 @@ static void ocfs2_delete_inode(struct inode *inode) goto bail_unlock_nfs_sync; } + di = (struct ocfs2_dinode *)di_bh->b_data; + /* Skip inode deletion and wait for dio orphan entry recovered + * first */ + if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { + ocfs2_cleanup_delete_inode(inode, 0); + goto bail_unlock_inode; + } + /* Query the cluster. This will be the final decision made * before we go ahead and wipe the inode. */ status = ocfs2_query_inode_wipe(inode, di_bh, &wipe); @@ -1067,27 +1189,28 @@ static void ocfs2_clear_inode(struct inode *inode) { int status; struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); clear_inode(inode); trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno, inode->i_nlink); - mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, + mlog_bug_on_msg(osb == NULL, "Inode=%lu\n", inode->i_ino); dquot_drop(inode); - /* To preven remote deletes we hold open lock before, now it + /* To prevent remote deletes we hold open lock before, now it * is time to unlock PR and EX open locks. */ ocfs2_open_unlock(inode); /* Do these before all the other work so that we don't bounce * the downconvert thread while waiting to destroy the locks. */ - ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); - ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres); - ocfs2_mark_lockres_freeing(&oi->ip_open_lockres); + ocfs2_mark_lockres_freeing(osb, &oi->ip_rw_lockres); + ocfs2_mark_lockres_freeing(osb, &oi->ip_inode_lockres); + ocfs2_mark_lockres_freeing(osb, &oi->ip_open_lockres); - ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap, + ocfs2_resv_discard(&osb->osb_la_resmap, &oi->ip_la_data_resv); ocfs2_resv_init_once(&oi->ip_la_data_resv); @@ -1097,12 +1220,15 @@ static void ocfs2_clear_inode(struct inode *inode) * exception here are successfully wiped inodes - their * metadata can now be considered to be part of the system * inodes from which it came. */ - if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED)) + if (!(oi->ip_flags & OCFS2_INODE_DELETED)) ocfs2_checkpoint_inode(inode); mlog_bug_on_msg(!list_empty(&oi->ip_io_markers), "Clear inode of %llu, inode has io markers\n", (unsigned long long)oi->ip_blkno); + mlog_bug_on_msg(!list_empty(&oi->ip_unwritten_list), + "Clear inode of %llu, inode has unwritten extents\n", + (unsigned long long)oi->ip_blkno); ocfs2_extent_map_trunc(inode, 0); @@ -1157,46 +1283,32 @@ static void ocfs2_clear_inode(struct inode *inode) * the journal is flushed before journal shutdown. Thus it is safe to * have inodes get cleaned up after journal shutdown. */ - jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal, + if (!osb->journal) + return; + + jbd2_journal_release_jbd_inode(osb->journal->j_journal, &oi->ip_jinode); } void ocfs2_evict_inode(struct inode *inode) { + write_inode_now(inode, 1); + if (!inode->i_nlink || (OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) { ocfs2_delete_inode(inode); } else { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); } ocfs2_clear_inode(inode); } -/* Called under inode_lock, with no more references on the - * struct inode, so it's safe here to check the flags field - * and to manipulate i_nlink without any other locks. */ -int ocfs2_drop_inode(struct inode *inode) -{ - struct ocfs2_inode_info *oi = OCFS2_I(inode); - int res; - - trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno, - inode->i_nlink, oi->ip_flags); - - if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) - res = 1; - else - res = generic_drop_inode(inode); - - return res; -} - /* * This is called from our getattr. */ int ocfs2_inode_revalidate(struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int status = 0; trace_ocfs2_inode_revalidate(inode, @@ -1262,14 +1374,15 @@ int ocfs2_mark_inode_dirty(handle_t *handle, fe->i_uid = cpu_to_le32(i_uid_read(inode)); fe->i_gid = cpu_to_le32(i_gid_read(inode)); fe->i_mode = cpu_to_le16(inode->i_mode); - fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec); - fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); - fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); - fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); - fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + fe->i_atime = cpu_to_le64(inode_get_atime_sec(inode)); + fe->i_atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode)); + fe->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); + fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); + fe->i_mtime = cpu_to_le64(inode_get_mtime_sec(inode)); + fe->i_mtime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode)); ocfs2_journal_dirty(handle, bh); + ocfs2_update_inode_fsync_trans(handle, inode, 1); leave: return status; } @@ -1297,12 +1410,12 @@ void ocfs2_refresh_inode(struct inode *inode, inode->i_blocks = 0; else inode->i_blocks = ocfs2_inode_sector_count(inode); - inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); - inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); - inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); - inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec); - inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime); - inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec); + inode_set_atime(inode, le64_to_cpu(fe->i_atime), + le32_to_cpu(fe->i_atime_nsec)); + inode_set_mtime(inode, le64_to_cpu(fe->i_mtime), + le32_to_cpu(fe->i_mtime_nsec)); + inode_set_ctime(inode, le64_to_cpu(fe->i_ctime), + le32_to_cpu(fe->i_ctime_nsec)); spin_unlock(&OCFS2_I(inode)->ip_lock); } @@ -1336,41 +1449,256 @@ int ocfs2_validate_inode_block(struct super_block *sb, rc = -EINVAL; if (!OCFS2_IS_VALID_DINODE(di)) { - ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n", - (unsigned long long)bh->b_blocknr, 7, - di->i_signature); + rc = ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + di->i_signature); goto bail; } if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { - ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n", - (unsigned long long)bh->b_blocknr, - (unsigned long long)le64_to_cpu(di->i_blkno)); + rc = ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(di->i_blkno)); goto bail; } - if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { - ocfs2_error(sb, - "Invalid dinode #%llu: OCFS2_VALID_FL not set\n", - (unsigned long long)bh->b_blocknr); + if (!(le32_to_cpu(di->i_flags) & OCFS2_VALID_FL)) { + rc = ocfs2_error(sb, + "Invalid dinode #%llu: OCFS2_VALID_FL not set\n", + (unsigned long long)bh->b_blocknr); goto bail; } if (le32_to_cpu(di->i_fs_generation) != OCFS2_SB(sb)->fs_generation) { - ocfs2_error(sb, - "Invalid dinode #%llu: fs_generation is %u\n", - (unsigned long long)bh->b_blocknr, - le32_to_cpu(di->i_fs_generation)); + rc = ocfs2_error(sb, + "Invalid dinode #%llu: fs_generation is %u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(di->i_fs_generation)); goto bail; } + if (le16_to_cpu(di->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT && + (u32)le16_to_cpu(di->i_suballoc_slot) > OCFS2_SB(sb)->max_slots - 1) { + rc = ocfs2_error(sb, "Invalid dinode %llu: suballoc slot %u\n", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(di->i_suballoc_slot)); + goto bail; + } + + if ((le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) && + le32_to_cpu(di->i_clusters)) { + rc = ocfs2_error(sb, "Invalid dinode %llu: %u clusters\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(di->i_clusters)); + goto bail; + } + + if (le32_to_cpu(di->i_flags) & OCFS2_CHAIN_FL) { + struct ocfs2_chain_list *cl = &di->id2.i_chain; + u16 bpc = 1 << (OCFS2_SB(sb)->s_clustersize_bits - + sb->s_blocksize_bits); + + if (le16_to_cpu(cl->cl_count) != ocfs2_chain_recs_per_inode(sb)) { + rc = ocfs2_error(sb, "Invalid dinode %llu: chain list count %u\n", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(cl->cl_count)); + goto bail; + } + if (le16_to_cpu(cl->cl_next_free_rec) > le16_to_cpu(cl->cl_count)) { + rc = ocfs2_error(sb, "Invalid dinode %llu: chain list index %u\n", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(cl->cl_next_free_rec)); + goto bail; + } + if (OCFS2_SB(sb)->bitmap_blkno && + OCFS2_SB(sb)->bitmap_blkno != le64_to_cpu(di->i_blkno) && + le16_to_cpu(cl->cl_bpc) != bpc) { + rc = ocfs2_error(sb, "Invalid dinode %llu: bits per cluster %u\n", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(cl->cl_bpc)); + goto bail; + } + } + rc = 0; bail: return rc; } +static int ocfs2_filecheck_validate_inode_block(struct super_block *sb, + struct buffer_head *bh) +{ + int rc = 0; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; + + trace_ocfs2_filecheck_validate_inode_block( + (unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * Call ocfs2_validate_meta_ecc() first since it has ecc repair + * function, but we should not return error immediately when ecc + * validation fails, because the reason is quite likely the invalid + * inode number inputted. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check); + if (rc) { + mlog(ML_ERROR, + "Filecheck: checksum failed for dinode %llu\n", + (unsigned long long)bh->b_blocknr); + rc = -OCFS2_FILECHECK_ERR_BLOCKECC; + } + + if (!OCFS2_IS_VALID_DINODE(di)) { + mlog(ML_ERROR, + "Filecheck: invalid dinode #%llu: signature = %.*s\n", + (unsigned long long)bh->b_blocknr, 7, di->i_signature); + rc = -OCFS2_FILECHECK_ERR_INVALIDINO; + goto bail; + } else if (rc) + goto bail; + + if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { + mlog(ML_ERROR, + "Filecheck: invalid dinode #%llu: i_blkno is %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(di->i_blkno)); + rc = -OCFS2_FILECHECK_ERR_BLOCKNO; + goto bail; + } + + if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { + mlog(ML_ERROR, + "Filecheck: invalid dinode #%llu: OCFS2_VALID_FL " + "not set\n", + (unsigned long long)bh->b_blocknr); + rc = -OCFS2_FILECHECK_ERR_VALIDFLAG; + goto bail; + } + + if (le32_to_cpu(di->i_fs_generation) != + OCFS2_SB(sb)->fs_generation) { + mlog(ML_ERROR, + "Filecheck: invalid dinode #%llu: fs_generation is %u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(di->i_fs_generation)); + rc = -OCFS2_FILECHECK_ERR_GENERATION; + } + +bail: + return rc; +} + +static int ocfs2_filecheck_repair_inode_block(struct super_block *sb, + struct buffer_head *bh) +{ + int changed = 0; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; + + if (!ocfs2_filecheck_validate_inode_block(sb, bh)) + return 0; + + trace_ocfs2_filecheck_repair_inode_block( + (unsigned long long)bh->b_blocknr); + + if (ocfs2_is_hard_readonly(OCFS2_SB(sb)) || + ocfs2_is_soft_readonly(OCFS2_SB(sb))) { + mlog(ML_ERROR, + "Filecheck: cannot repair dinode #%llu " + "on readonly filesystem\n", + (unsigned long long)bh->b_blocknr); + return -OCFS2_FILECHECK_ERR_READONLY; + } + + if (buffer_jbd(bh)) { + mlog(ML_ERROR, + "Filecheck: cannot repair dinode #%llu, " + "its buffer is in jbd\n", + (unsigned long long)bh->b_blocknr); + return -OCFS2_FILECHECK_ERR_INJBD; + } + + if (!OCFS2_IS_VALID_DINODE(di)) { + /* Cannot fix invalid inode block */ + return -OCFS2_FILECHECK_ERR_INVALIDINO; + } + + if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { + /* Cannot just add VALID_FL flag back as a fix, + * need more things to check here. + */ + return -OCFS2_FILECHECK_ERR_VALIDFLAG; + } + + if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { + di->i_blkno = cpu_to_le64(bh->b_blocknr); + changed = 1; + mlog(ML_ERROR, + "Filecheck: reset dinode #%llu: i_blkno to %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(di->i_blkno)); + } + + if (le32_to_cpu(di->i_fs_generation) != + OCFS2_SB(sb)->fs_generation) { + di->i_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); + changed = 1; + mlog(ML_ERROR, + "Filecheck: reset dinode #%llu: fs_generation to %u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(di->i_fs_generation)); + } + + if (ocfs2_dinode_has_extents(di) && + le16_to_cpu(di->id2.i_list.l_next_free_rec) > le16_to_cpu(di->id2.i_list.l_count)) { + di->id2.i_list.l_next_free_rec = di->id2.i_list.l_count; + changed = 1; + mlog(ML_ERROR, + "Filecheck: reset dinode #%llu: l_next_free_rec to %u\n", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(di->id2.i_list.l_next_free_rec)); + } + + if (changed || ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check)) { + ocfs2_compute_meta_ecc(sb, bh->b_data, &di->i_check); + mark_buffer_dirty(bh); + mlog(ML_ERROR, + "Filecheck: reset dinode #%llu: compute meta ecc\n", + (unsigned long long)bh->b_blocknr); + } + + return 0; +} + +static int +ocfs2_filecheck_read_inode_block_full(struct inode *inode, + struct buffer_head **bh, + int flags, int type) +{ + int rc; + struct buffer_head *tmp = *bh; + + if (!type) /* Check inode block */ + rc = ocfs2_read_blocks(INODE_CACHE(inode), + OCFS2_I(inode)->ip_blkno, + 1, &tmp, flags, + ocfs2_filecheck_validate_inode_block); + else /* Repair inode block */ + rc = ocfs2_read_blocks(INODE_CACHE(inode), + OCFS2_I(inode)->ip_blkno, + 1, &tmp, flags, + ocfs2_filecheck_repair_inode_block); + + /* If ocfs2_read_blocks() got us a new bh, pass it up. */ + if (!rc && !*bh) + *bh = tmp; + + return rc; +} + int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh, int flags) { @@ -1380,6 +1708,8 @@ int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh, rc = ocfs2_read_blocks(INODE_CACHE(inode), OCFS2_I(inode)->ip_blkno, 1, &tmp, flags, ocfs2_validate_inode_block); + if (rc < 0) + make_bad_inode(inode); /* If ocfs2_read_blocks() got us a new bh, pass it up. */ if (!rc && !*bh) *bh = tmp; @@ -1408,6 +1738,7 @@ static struct super_block *ocfs2_inode_cache_get_super(struct ocfs2_caching_info } static void ocfs2_inode_cache_lock(struct ocfs2_caching_info *ci) +__acquires(&oi->ip_lock) { struct ocfs2_inode_info *oi = cache_info_to_inode(ci); @@ -1415,6 +1746,7 @@ static void ocfs2_inode_cache_lock(struct ocfs2_caching_info *ci) } static void ocfs2_inode_cache_unlock(struct ocfs2_caching_info *ci) +__releases(&oi->ip_lock) { struct ocfs2_inode_info *oi = cache_info_to_inode(ci); diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 621fc73bf23d..07bd838e7843 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -1,26 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * inode.h * * Function prototypes * * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #ifndef OCFS2_INODE_H @@ -43,9 +27,6 @@ struct ocfs2_inode_info /* protects extended attribute changes on this inode */ struct rw_semaphore ip_xattr_sem; - /* Number of outstanding AIO's which are not page aligned */ - atomic_t ip_unaligned_aio; - /* These fields are protected by ip_lock */ spinlock_t ip_lock; u32 ip_open_count; @@ -57,6 +38,9 @@ struct ocfs2_inode_info u32 ip_flags; /* see below */ u32 ip_attr; /* inode attributes */ + /* Record unwritten extents during direct io. */ + struct list_head ip_unwritten_list; + /* protected by recovery_lock. */ struct inode *ip_next_orphan; @@ -73,6 +57,15 @@ struct ocfs2_inode_info u32 ip_dir_lock_gen; struct ocfs2_alloc_reservation ip_la_data_resv; + + /* + * Transactions that contain inode's metadata needed to complete + * fsync and fdatasync, respectively. + */ + tid_t i_sync_tid; + tid_t i_datasync_tid; + + struct dquot __rcu *i_dquot[MAXQUOTAS]; }; /* @@ -84,8 +77,6 @@ struct ocfs2_inode_info #define OCFS2_INODE_BITMAP 0x00000004 /* This inode has been wiped from disk */ #define OCFS2_INODE_DELETED 0x00000008 -/* Another node is deleting, so our delete is a nop */ -#define OCFS2_INODE_SKIP_DELETE 0x00000010 /* Has the inode been orphaned on another node? * * This hints to ocfs2_drop_inode that it should clear i_nlink before @@ -100,11 +91,13 @@ struct ocfs2_inode_info * rely on ocfs2_delete_inode to sort things out under the proper * cluster locks. */ -#define OCFS2_INODE_MAYBE_ORPHANED 0x00000020 +#define OCFS2_INODE_MAYBE_ORPHANED 0x00000010 /* Does someone have the file open O_DIRECT */ -#define OCFS2_INODE_OPEN_DIRECT 0x00000040 +#define OCFS2_INODE_OPEN_DIRECT 0x00000020 /* Tell the inode wipe code it's not in orphan dir */ -#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000080 +#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000040 +/* Entry in orphan dir with 'dio-' prefix */ +#define OCFS2_INODE_DIO_ORPHAN_ENTRY 0x00000080 static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) { @@ -114,8 +107,6 @@ static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) #define INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags & OCFS2_INODE_JOURNAL) #define SET_INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags |= OCFS2_INODE_JOURNAL) -extern struct kmem_cache *ocfs2_inode_cache; - extern const struct address_space_operations ocfs2_aops; extern const struct ocfs2_caching_operations ocfs2_inode_caching_ops; @@ -125,30 +116,25 @@ static inline struct ocfs2_caching_info *INODE_CACHE(struct inode *inode) } void ocfs2_evict_inode(struct inode *inode); -int ocfs2_drop_inode(struct inode *inode); /* Flags for ocfs2_iget() */ #define OCFS2_FI_FLAG_SYSFILE 0x1 #define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2 +#define OCFS2_FI_FLAG_FILECHECK_CHK 0x4 +#define OCFS2_FI_FLAG_FILECHECK_FIX 0x8 + struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff); struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags, int sysfile_type); -int ocfs2_inode_init_private(struct inode *inode); int ocfs2_inode_revalidate(struct dentry *dentry); void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, int create_ino); -void ocfs2_read_inode(struct inode *inode); -void ocfs2_read_inode2(struct inode *inode, void *opaque); -ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf, - size_t size, loff_t *offp); void ocfs2_sync_blockdev(struct super_block *sb); void ocfs2_refresh_inode(struct inode *inode, struct ocfs2_dinode *fe); int ocfs2_mark_inode_dirty(handle_t *handle, struct inode *inode, struct buffer_head *bh); -struct buffer_head *ocfs2_bread(struct inode *inode, - int block, int *err, int reada); void ocfs2_set_inode_flags(struct inode *inode); void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi); @@ -157,7 +143,7 @@ static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode) { int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9; - return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits); + return (blkcnt_t)OCFS2_I(inode)->ip_clusters << c_to_s_bits; } /* Validate that a bh contains a valid inode */ @@ -178,4 +164,10 @@ static inline struct ocfs2_inode_info *cache_info_to_inode(struct ocfs2_caching_ return container_of(ci, struct ocfs2_inode_info, ip_metadata_cache); } +/* Does this inode have the reflink flag set? */ +static inline bool ocfs2_is_refcount_inode(struct inode *inode) +{ + return (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); +} + #endif /* OCFS2_INODE_H */ diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 0c60ef2d8056..b6864602814c 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ocfs2/ioctl.c * @@ -7,7 +8,9 @@ #include <linux/fs.h> #include <linux/mount.h> +#include <linux/blkdev.h> #include <linux/compat.h> +#include <linux/fileattr.h> #include <cluster/masklog.h> @@ -34,9 +37,8 @@ copy_to_user((typeof(a) __user *)b, &(a), sizeof(a)) /* - * This call is void because we are already reporting an error that may - * be -EFAULT. The error will be returned from the ioctl(2) call. It's - * just a best-effort to tell userspace that this request caused the error. + * This is just a best-effort to tell userspace that this request + * caused the error. */ static inline void o2info_set_request_error(struct ocfs2_info_request *kreq, struct ocfs2_info_request __user *req) @@ -60,8 +62,10 @@ static inline int o2info_coherent(struct ocfs2_info_request *req) return (!(req->ir_flags & OCFS2_INFO_FL_NON_COHERENT)); } -static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags) +int ocfs2_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { + struct inode *inode = d_inode(dentry); + unsigned int flags; int status; status = ocfs2_inode_lock(inode, NULL, 0); @@ -70,15 +74,19 @@ static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags) return status; } ocfs2_get_inode_flags(OCFS2_I(inode)); - *flags = OCFS2_I(inode)->ip_attr; + flags = OCFS2_I(inode)->ip_attr; ocfs2_inode_unlock(inode, 0); + fileattr_fill_flags(fa, flags & OCFS2_FL_VISIBLE); + return status; } -static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, - unsigned mask) +int ocfs2_fileattr_set(struct mnt_idmap *idmap, + struct dentry *dentry, struct file_kattr *fa) { + struct inode *inode = d_inode(dentry); + unsigned int flags = fa->flags; struct ocfs2_inode_info *ocfs2_inode = OCFS2_I(inode); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); handle_t *handle = NULL; @@ -86,7 +94,8 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, unsigned oldflags; int status; - mutex_lock(&inode->i_mutex); + if (fileattr_has_fsx(fa)) + return -EOPNOTSUPP; status = ocfs2_inode_lock(inode, &bh, 1); if (status < 0) { @@ -94,27 +103,18 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, goto bail; } - status = -EACCES; - if (!inode_owner_or_capable(inode)) - goto bail_unlock; - if (!S_ISDIR(inode->i_mode)) flags &= ~OCFS2_DIRSYNC_FL; oldflags = ocfs2_inode->ip_attr; - flags = flags & mask; - flags |= oldflags & ~mask; + flags = flags & OCFS2_FL_MODIFIABLE; + flags |= oldflags & ~OCFS2_FL_MODIFIABLE; - /* - * The IMMUTABLE and APPEND_ONLY flags can only be changed by - * the relevant capability. - */ + /* Check already done by VFS, but repeat with ocfs lock */ status = -EPERM; - if ((oldflags & OCFS2_IMMUTABLE_FL) || ((flags ^ oldflags) & - (OCFS2_APPEND_FL | OCFS2_IMMUTABLE_FL))) { - if (!capable(CAP_LINUX_IMMUTABLE)) - goto bail_unlock; - } + if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL) && + !capable(CAP_LINUX_IMMUTABLE)) + goto bail_unlock; handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); if (IS_ERR(handle)) { @@ -125,6 +125,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, ocfs2_inode->ip_attr = flags; ocfs2_set_inode_flags(inode); + inode_set_ctime_current(inode); status = ocfs2_mark_inode_dirty(handle, inode, bh); if (status < 0) @@ -135,146 +136,113 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, bail_unlock: ocfs2_inode_unlock(inode, 1); bail: - mutex_unlock(&inode->i_mutex); - brelse(bh); return status; } -int ocfs2_info_handle_blocksize(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_blocksize(struct inode *inode, + struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_blocksize oib; if (o2info_from_user(oib, req)) - goto bail; + return -EFAULT; oib.ib_blocksize = inode->i_sb->s_blocksize; o2info_set_request_filled(&oib.ib_req); if (o2info_to_user(oib, req)) - goto bail; - - status = 0; -bail: - if (status) - o2info_set_request_error(&oib.ib_req, req); + return -EFAULT; - return status; + return 0; } -int ocfs2_info_handle_clustersize(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_clustersize(struct inode *inode, + struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_clustersize oic; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oic, req)) - goto bail; + return -EFAULT; oic.ic_clustersize = osb->s_clustersize; o2info_set_request_filled(&oic.ic_req); if (o2info_to_user(oic, req)) - goto bail; + return -EFAULT; - status = 0; -bail: - if (status) - o2info_set_request_error(&oic.ic_req, req); - - return status; + return 0; } -int ocfs2_info_handle_maxslots(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_maxslots(struct inode *inode, + struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_maxslots oim; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oim, req)) - goto bail; + return -EFAULT; oim.im_max_slots = osb->max_slots; o2info_set_request_filled(&oim.im_req); if (o2info_to_user(oim, req)) - goto bail; - - status = 0; -bail: - if (status) - o2info_set_request_error(&oim.im_req, req); + return -EFAULT; - return status; + return 0; } -int ocfs2_info_handle_label(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_label(struct inode *inode, + struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_label oil; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oil, req)) - goto bail; + return -EFAULT; memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN); o2info_set_request_filled(&oil.il_req); if (o2info_to_user(oil, req)) - goto bail; + return -EFAULT; - status = 0; -bail: - if (status) - o2info_set_request_error(&oil.il_req, req); - - return status; + return 0; } -int ocfs2_info_handle_uuid(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_uuid(struct inode *inode, + struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_uuid oiu; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oiu, req)) - goto bail; + return -EFAULT; memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1); o2info_set_request_filled(&oiu.iu_req); if (o2info_to_user(oiu, req)) - goto bail; - - status = 0; -bail: - if (status) - o2info_set_request_error(&oiu.iu_req, req); + return -EFAULT; - return status; + return 0; } -int ocfs2_info_handle_fs_features(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_fs_features(struct inode *inode, + struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_fs_features oif; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oif, req)) - goto bail; + return -EFAULT; oif.if_compat_features = osb->s_feature_compat; oif.if_incompat_features = osb->s_feature_incompat; @@ -283,44 +251,34 @@ int ocfs2_info_handle_fs_features(struct inode *inode, o2info_set_request_filled(&oif.if_req); if (o2info_to_user(oif, req)) - goto bail; - - status = 0; -bail: - if (status) - o2info_set_request_error(&oif.if_req, req); + return -EFAULT; - return status; + return 0; } -int ocfs2_info_handle_journal_size(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_journal_size(struct inode *inode, + struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_journal_size oij; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oij, req)) - goto bail; + return -EFAULT; - oij.ij_journal_size = osb->journal->j_inode->i_size; + oij.ij_journal_size = i_size_read(osb->journal->j_inode); o2info_set_request_filled(&oij.ij_req); if (o2info_to_user(oij, req)) - goto bail; + return -EFAULT; - status = 0; -bail: - if (status) - o2info_set_request_error(&oij.ij_req, req); - - return status; + return 0; } -int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, - struct inode *inode_alloc, u64 blkno, - struct ocfs2_info_freeinode *fi, u32 slot) +static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, + struct inode *inode_alloc, u64 blkno, + struct ocfs2_info_freeinode *fi, + u32 slot) { int status = 0, unlock = 0; @@ -328,9 +286,9 @@ int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, struct ocfs2_dinode *dinode_alloc = NULL; if (inode_alloc) - mutex_lock(&inode_alloc->i_mutex); + inode_lock(inode_alloc); - if (o2info_coherent(&fi->ifi_req)) { + if (inode_alloc && o2info_coherent(&fi->ifi_req)) { status = ocfs2_inode_lock(inode_alloc, &bh, 0); if (status < 0) { mlog_errno(status); @@ -358,20 +316,20 @@ bail: ocfs2_inode_unlock(inode_alloc, 0); if (inode_alloc) - mutex_unlock(&inode_alloc->i_mutex); + inode_unlock(inode_alloc); brelse(bh); return status; } -int ocfs2_info_handle_freeinode(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_freeinode(struct inode *inode, + struct ocfs2_info_request __user *req) { u32 i; u64 blkno = -1; char namebuf[40]; - int status = -EFAULT, type = INODE_ALLOC_SYSTEM_INODE; + int status, type = INODE_ALLOC_SYSTEM_INODE; struct ocfs2_info_freeinode *oifi = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct inode *inode_alloc = NULL; @@ -383,8 +341,10 @@ int ocfs2_info_handle_freeinode(struct inode *inode, goto out_err; } - if (o2info_from_user(*oifi, req)) - goto bail; + if (o2info_from_user(*oifi, req)) { + status = -EFAULT; + goto out_free; + } oifi->ifi_slotnum = osb->max_slots; @@ -398,13 +358,11 @@ int ocfs2_info_handle_freeinode(struct inode *inode, goto bail; } } else { - ocfs2_sprintf_system_inode_name(namebuf, - sizeof(namebuf), - type, i); + int len = ocfs2_sprintf_system_inode_name(namebuf, + sizeof(namebuf), + type, i); status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, - namebuf, - strlen(namebuf), - &blkno); + namebuf, len, &blkno); if (status < 0) { status = -ENOENT; goto bail; @@ -412,23 +370,26 @@ int ocfs2_info_handle_freeinode(struct inode *inode, } status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i); - if (status < 0) - goto bail; iput(inode_alloc); inode_alloc = NULL; + + if (status < 0) + goto bail; } o2info_set_request_filled(&oifi->ifi_req); - if (o2info_to_user(*oifi, req)) - goto bail; + if (o2info_to_user(*oifi, req)) { + status = -EFAULT; + goto out_free; + } status = 0; bail: if (status) o2info_set_request_error(&oifi->ifi_req, req); - +out_free: kfree(oifi); out_err: return status; @@ -437,7 +398,7 @@ out_err: static void o2ffg_update_histogram(struct ocfs2_info_free_chunk_list *hist, unsigned int chunksize) { - int index; + u32 index; index = __ilog2_u32(chunksize); if (index >= OCFS2_INFO_MAX_HIST) @@ -460,19 +421,19 @@ static void o2ffg_update_stats(struct ocfs2_info_freefrag_stats *stats, stats->ffs_free_chunks_real++; } -void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg, - unsigned int chunksize) +static void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg, + unsigned int chunksize) { o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize); o2ffg_update_stats(&(ffg->iff_ffs), chunksize); } -int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb, - struct inode *gb_inode, - struct ocfs2_dinode *gb_dinode, - struct ocfs2_chain_rec *rec, - struct ocfs2_info_freefrag *ffg, - u32 chunks_in_group) +static int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb, + struct inode *gb_inode, + struct ocfs2_dinode *gb_dinode, + struct ocfs2_chain_rec *rec, + struct ocfs2_info_freefrag *ffg, + u32 chunks_in_group) { int status = 0, used; u64 blkno; @@ -570,9 +531,9 @@ bail: return status; } -int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb, - struct inode *gb_inode, u64 blkno, - struct ocfs2_info_freefrag *ffg) +static int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb, + struct inode *gb_inode, u64 blkno, + struct ocfs2_info_freefrag *ffg) { u32 chunks_in_group; int status = 0, unlock = 0, i; @@ -583,7 +544,7 @@ int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb, struct ocfs2_dinode *gb_dinode = NULL; if (gb_inode) - mutex_lock(&gb_inode->i_mutex); + inode_lock(gb_inode); if (o2info_coherent(&ffg->iff_req)) { status = ocfs2_inode_lock(gb_inode, &bh, 0); @@ -640,22 +601,20 @@ bail: ocfs2_inode_unlock(gb_inode, 0); if (gb_inode) - mutex_unlock(&gb_inode->i_mutex); - - if (gb_inode) - iput(gb_inode); + inode_unlock(gb_inode); + iput(gb_inode); brelse(bh); return status; } -int ocfs2_info_handle_freefrag(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_freefrag(struct inode *inode, + struct ocfs2_info_request __user *req) { u64 blkno = -1; char namebuf[40]; - int status = -EFAULT, type = GLOBAL_BITMAP_SYSTEM_INODE; + int status, type = GLOBAL_BITMAP_SYSTEM_INODE; struct ocfs2_info_freefrag *oiff; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -668,8 +627,10 @@ int ocfs2_info_handle_freefrag(struct inode *inode, goto out_err; } - if (o2info_from_user(*oiff, req)) - goto bail; + if (o2info_from_user(*oiff, req)) { + status = -EFAULT; + goto out_free; + } /* * chunksize from userspace should be power of 2. */ @@ -688,12 +649,10 @@ int ocfs2_info_handle_freefrag(struct inode *inode, goto bail; } } else { - ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, - OCFS2_INVALID_SLOT); + int len = ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), + type, OCFS2_INVALID_SLOT); status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, - namebuf, - strlen(namebuf), - &blkno); + namebuf, len, &blkno); if (status < 0) { status = -ENOENT; goto bail; @@ -708,39 +667,33 @@ int ocfs2_info_handle_freefrag(struct inode *inode, if (o2info_to_user(*oiff, req)) { status = -EFAULT; - goto bail; + goto out_free; } status = 0; bail: if (status) o2info_set_request_error(&oiff->iff_req, req); - +out_free: kfree(oiff); out_err: return status; } -int ocfs2_info_handle_unknown(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_unknown(struct inode *inode, + struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_request oir; if (o2info_from_user(oir, req)) - goto bail; + return -EFAULT; o2info_clear_request_filled(&oir); if (o2info_to_user(oir, req)) - goto bail; + return -EFAULT; - status = 0; -bail: - if (status) - o2info_set_request_error(&oir, req); - - return status; + return 0; } /* @@ -750,8 +703,8 @@ bail: * - distinguish different requests. * - validate size of different requests. */ -int ocfs2_info_handle_request(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_request(struct inode *inode, + struct ocfs2_info_request __user *req) { int status = -EFAULT; struct ocfs2_info_request oir; @@ -809,8 +762,8 @@ bail: return status; } -int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx, - u64 *req_addr, int compat_flag) +static int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx, + u64 *req_addr, int compat_flag) { int status = -EFAULT; u64 __user *bp = NULL; @@ -839,7 +792,7 @@ bail: /* * OCFS2_IOC_INFO handles an array of requests passed from userspace. * - * ocfs2_info_handle() recevies a large info aggregation, grab and + * ocfs2_info_handle() receives a large info aggregation, grab and * validate the request count from header, then break it into small * pieces, later specific handlers can handle them one by one. * @@ -847,8 +800,8 @@ bail: * a better backward&forward compatibility, since a small piece of * request will be less likely to be broken if disk layout get changed. */ -int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info, - int compat_flag) +static noinline_for_stack int +ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info, int compat_flag) { int i, status = 0; u64 req_addr; @@ -884,46 +837,26 @@ bail: long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); - unsigned int flags; - int new_clusters; - int status; - struct ocfs2_space_resv sr; - struct ocfs2_new_group_input input; - struct reflink_arguments args; - const char __user *old_path; - const char __user *new_path; - bool preserve; - struct ocfs2_info info; void __user *argp = (void __user *)arg; + int status; switch (cmd) { - case OCFS2_IOC_GETFLAGS: - status = ocfs2_get_inode_attr(inode, &flags); - if (status < 0) - return status; - - flags &= OCFS2_FL_VISIBLE; - return put_user(flags, (int __user *) arg); - case OCFS2_IOC_SETFLAGS: - if (get_user(flags, (int __user *) arg)) - return -EFAULT; - - status = mnt_want_write_file(filp); - if (status) - return status; - status = ocfs2_set_inode_attr(inode, flags, - OCFS2_FL_MODIFIABLE); - mnt_drop_write_file(filp); - return status; case OCFS2_IOC_RESVSP: case OCFS2_IOC_RESVSP64: case OCFS2_IOC_UNRESVSP: case OCFS2_IOC_UNRESVSP64: + { + struct ocfs2_space_resv sr; + if (copy_from_user(&sr, (int __user *) arg, sizeof(sr))) return -EFAULT; return ocfs2_change_file_space(filp, cmd, &sr); + } case OCFS2_IOC_GROUP_EXTEND: + { + int new_clusters; + if (!capable(CAP_SYS_RESOURCE)) return -EPERM; @@ -936,8 +869,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) status = ocfs2_group_extend(inode, new_clusters); mnt_drop_write_file(filp); return status; + } case OCFS2_IOC_GROUP_ADD: case OCFS2_IOC_GROUP_ADD64: + { + struct ocfs2_new_group_input input; + if (!capable(CAP_SYS_RESOURCE)) return -EPERM; @@ -950,7 +887,14 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) status = ocfs2_group_add(inode, &input); mnt_drop_write_file(filp); return status; + } case OCFS2_IOC_REFLINK: + { + struct reflink_arguments args; + const char __user *old_path; + const char __user *new_path; + bool preserve; + if (copy_from_user(&args, argp, sizeof(args))) return -EFAULT; old_path = (const char __user *)(unsigned long)args.old_path; @@ -958,11 +902,16 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) preserve = (args.preserve != 0); return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve); + } case OCFS2_IOC_INFO: + { + struct ocfs2_info info; + if (copy_from_user(&info, argp, sizeof(struct ocfs2_info))) return -EFAULT; return ocfs2_info_handle(inode, &info, 0); + } case FITRIM: { struct super_block *sb = inode->i_sb; @@ -972,9 +921,14 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!bdev_max_discard_sectors(sb->s_bdev)) + return -EOPNOTSUPP; + if (copy_from_user(&range, argp, sizeof(range))) return -EFAULT; + range.minlen = max_t(u64, bdev_discard_granularity(sb->s_bdev), + range.minlen); ret = ocfs2_trim_fs(sb, &range); if (ret < 0) return ret; @@ -1001,12 +955,6 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) void __user *argp = (void __user *)arg; switch (cmd) { - case OCFS2_IOC32_GETFLAGS: - cmd = OCFS2_IOC_GETFLAGS; - break; - case OCFS2_IOC32_SETFLAGS: - cmd = OCFS2_IOC_SETFLAGS; - break; case OCFS2_IOC_RESVSP: case OCFS2_IOC_RESVSP64: case OCFS2_IOC_UNRESVSP: @@ -1014,7 +962,6 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) case OCFS2_IOC_GROUP_EXTEND: case OCFS2_IOC_GROUP_ADD: case OCFS2_IOC_GROUP_ADD64: - case FITRIM: break; case OCFS2_IOC_REFLINK: if (copy_from_user(&args, argp, sizeof(args))) @@ -1028,6 +975,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) return -EFAULT; return ocfs2_info_handle(inode, &info, 1); + case FITRIM: case OCFS2_IOC_MOVE_EXT: break; default: diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h index 0cd5323bd3f0..4a1c2313b429 100644 --- a/fs/ocfs2/ioctl.h +++ b/fs/ocfs2/ioctl.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * ioctl.h * @@ -10,6 +11,9 @@ #ifndef OCFS2_IOCTL_PROTO_H #define OCFS2_IOCTL_PROTO_H +int ocfs2_fileattr_get(struct dentry *dentry, struct file_kattr *fa); +int ocfs2_fileattr_set(struct mnt_idmap *idmap, + struct dentry *dentry, struct file_kattr *fa); long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 242170d83971..85239807dec7 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1,26 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * journal.c * * Defines functions of journalling api * * Copyright (C) 2003, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/fs.h> @@ -30,6 +14,8 @@ #include <linux/kthread.h> #include <linux/time.h> #include <linux/random.h> +#include <linux/delay.h> +#include <linux/writeback.h> #include <cluster/masklog.h> @@ -49,6 +35,8 @@ #include "sysfile.h" #include "uptodate.h" #include "quota.h" +#include "file.h" +#include "namei.h" #include "buffer_head_io.h" #include "ocfs2_trace.h" @@ -68,13 +56,15 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, static int ocfs2_trylock_journal(struct ocfs2_super *osb, int slot_num); static int ocfs2_recover_orphans(struct ocfs2_super *osb, - int slot); + int slot, + enum ocfs2_orphan_reco_type orphan_reco_type); static int ocfs2_commit_thread(void *arg); static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, int slot_num, struct ocfs2_dinode *la_dinode, struct ocfs2_dinode *tl_dinode, - struct ocfs2_quota_recovery *qrec); + struct ocfs2_quota_recovery *qrec, + enum ocfs2_orphan_reco_type orphan_reco_type); static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb) { @@ -100,10 +90,10 @@ enum ocfs2_replay_state { struct ocfs2_replay_map { unsigned int rm_slots; enum ocfs2_replay_state rm_state; - unsigned char rm_replay_slots[0]; + unsigned char rm_replay_slots[] __counted_by(rm_slots); }; -void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state) +static void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state) { if (!osb->replay_map) return; @@ -124,9 +114,9 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb) if (osb->replay_map) return 0; - replay_map = kzalloc(sizeof(struct ocfs2_replay_map) + - (osb->max_slots * sizeof(char)), GFP_KERNEL); - + replay_map = kzalloc(struct_size(replay_map, rm_replay_slots, + osb->max_slots), + GFP_KERNEL); if (!replay_map) { mlog_errno(-ENOMEM); return -ENOMEM; @@ -148,7 +138,8 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb) return 0; } -void ocfs2_queue_replay_slots(struct ocfs2_super *osb) +static void ocfs2_queue_replay_slots(struct ocfs2_super *osb, + enum ocfs2_orphan_reco_type orphan_reco_type) { struct ocfs2_replay_map *replay_map = osb->replay_map; int i; @@ -162,7 +153,8 @@ void ocfs2_queue_replay_slots(struct ocfs2_super *osb) for (i = 0; i < replay_map->rm_slots; i++) if (replay_map->rm_replay_slots[i]) ocfs2_queue_recovery_completion(osb->journal, i, NULL, - NULL, NULL); + NULL, NULL, + orphan_reco_type); replay_map->rm_state = REPLAY_DONE; } @@ -182,49 +174,69 @@ int ocfs2_recovery_init(struct ocfs2_super *osb) struct ocfs2_recovery_map *rm; mutex_init(&osb->recovery_lock); - osb->disable_recovery = 0; + osb->recovery_state = OCFS2_REC_ENABLED; osb->recovery_thread_task = NULL; init_waitqueue_head(&osb->recovery_event); - rm = kzalloc(sizeof(struct ocfs2_recovery_map) + - osb->max_slots * sizeof(unsigned int), + rm = kzalloc(struct_size(rm, rm_entries, osb->max_slots), GFP_KERNEL); if (!rm) { mlog_errno(-ENOMEM); return -ENOMEM; } - rm->rm_entries = (unsigned int *)((char *)rm + - sizeof(struct ocfs2_recovery_map)); osb->recovery_map = rm; return 0; } -/* we can't grab the goofy sem lock from inside wait_event, so we use - * memory barriers to make sure that we'll see the null task before - * being woken up */ static int ocfs2_recovery_thread_running(struct ocfs2_super *osb) { - mb(); return osb->recovery_thread_task != NULL; } +static void ocfs2_recovery_disable(struct ocfs2_super *osb, + enum ocfs2_recovery_state state) +{ + mutex_lock(&osb->recovery_lock); + /* + * If recovery thread is not running, we can directly transition to + * final state. + */ + if (!ocfs2_recovery_thread_running(osb)) { + osb->recovery_state = state + 1; + goto out_lock; + } + osb->recovery_state = state; + /* Wait for recovery thread to acknowledge state transition */ + wait_event_cmd(osb->recovery_event, + !ocfs2_recovery_thread_running(osb) || + osb->recovery_state >= state + 1, + mutex_unlock(&osb->recovery_lock), + mutex_lock(&osb->recovery_lock)); +out_lock: + mutex_unlock(&osb->recovery_lock); + + /* + * At this point we know that no more recovery work can be queued so + * wait for any recovery completion work to complete. + */ + if (osb->ocfs2_wq) + flush_workqueue(osb->ocfs2_wq); +} + +void ocfs2_recovery_disable_quota(struct ocfs2_super *osb) +{ + ocfs2_recovery_disable(osb, OCFS2_REC_QUOTA_WANT_DISABLE); +} + void ocfs2_recovery_exit(struct ocfs2_super *osb) { struct ocfs2_recovery_map *rm; /* disable any new recovery threads and wait for any currently * running ones to exit. Do this before setting the vol_state. */ - mutex_lock(&osb->recovery_lock); - osb->disable_recovery = 1; - mutex_unlock(&osb->recovery_lock); - wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb)); - - /* At this point, we know that no more recovery threads can be - * launched, so wait for any recovery completion work to - * complete. */ - flush_workqueue(ocfs2_wq); + ocfs2_recovery_disable(osb, OCFS2_REC_WANT_DISABLE); /* * Now that recovery is shut down, and the osb is about to be @@ -316,7 +328,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb) } jbd2_journal_lock_updates(journal->j_journal); - status = jbd2_journal_flush(journal->j_journal); + status = jbd2_journal_flush(journal->j_journal, 0); jbd2_journal_unlock_updates(journal->j_journal); if (status < 0) { up_write(&journal->j_trans_barrier); @@ -367,7 +379,7 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) mlog_errno(PTR_ERR(handle)); if (is_journal_aborted(journal)) { - ocfs2_abort(osb->sb, "Detected aborted journal"); + ocfs2_abort(osb->sb, "Detected aborted journal\n"); handle = ERR_PTR(-EROFS); } } else { @@ -426,14 +438,14 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks) if (!nblocks) return 0; - old_nblocks = handle->h_buffer_credits; + old_nblocks = jbd2_handle_buffer_credits(handle); trace_ocfs2_extend_trans(old_nblocks, nblocks); #ifdef CONFIG_OCFS2_DEBUG_FS status = 1; #else - status = jbd2_journal_extend(handle, nblocks); + status = jbd2_journal_extend(handle, nblocks, 0); if (status < 0) { mlog_errno(status); goto bail; @@ -455,10 +467,56 @@ bail: return status; } -struct ocfs2_triggers { - struct jbd2_buffer_trigger_type ot_triggers; - int ot_offset; -}; +/* + * Make sure handle has at least 'nblocks' credits available. If it does not + * have that many credits available, we will try to extend the handle to have + * enough credits. If that fails, we will restart transaction to have enough + * credits. Similar notes regarding data consistency and locking implications + * as for ocfs2_extend_trans() apply here. + */ +int ocfs2_assure_trans_credits(handle_t *handle, int nblocks) +{ + int old_nblks = jbd2_handle_buffer_credits(handle); + + trace_ocfs2_assure_trans_credits(old_nblks); + if (old_nblks >= nblocks) + return 0; + return ocfs2_extend_trans(handle, nblocks - old_nblks); +} + +/* + * If we have fewer than thresh credits, extend by OCFS2_MAX_TRANS_DATA. + * If that fails, restart the transaction & regain write access for the + * buffer head which is used for metadata modifications. + * Taken from Ext4: extend_or_restart_transaction() + */ +int ocfs2_allocate_extend_trans(handle_t *handle, int thresh) +{ + int status, old_nblks; + + BUG_ON(!handle); + + old_nblks = jbd2_handle_buffer_credits(handle); + trace_ocfs2_allocate_extend_trans(old_nblks, thresh); + + if (old_nblks < thresh) + return 0; + + status = jbd2_journal_extend(handle, OCFS2_MAX_TRANS_DATA, 0); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (status > 0) { + status = jbd2_journal_restart(handle, OCFS2_MAX_TRANS_DATA); + if (status < 0) + mlog_errno(status); + } + +bail: + return status; +} static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger_type *triggers) { @@ -523,87 +581,76 @@ static void ocfs2_db_frozen_trigger(struct jbd2_buffer_trigger_type *triggers, static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers, struct buffer_head *bh) { + struct ocfs2_triggers *ot = to_ocfs2_trigger(triggers); + mlog(ML_ERROR, "ocfs2_abort_trigger called by JBD2. bh = 0x%lx, " "bh->b_blocknr = %llu\n", (unsigned long)bh, (unsigned long long)bh->b_blocknr); - /* We aren't guaranteed to have the superblock here - but if we - * don't, it'll just crash. */ - ocfs2_error(bh->b_assoc_map->host->i_sb, + ocfs2_error(ot->sb, "JBD2 has aborted our journal, ocfs2 cannot continue\n"); } -static struct ocfs2_triggers di_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, - .ot_offset = offsetof(struct ocfs2_dinode, i_check), -}; - -static struct ocfs2_triggers eb_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, - .ot_offset = offsetof(struct ocfs2_extent_block, h_check), -}; - -static struct ocfs2_triggers rb_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, - .ot_offset = offsetof(struct ocfs2_refcount_block, rf_check), -}; - -static struct ocfs2_triggers gd_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, - .ot_offset = offsetof(struct ocfs2_group_desc, bg_check), -}; - -static struct ocfs2_triggers db_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_db_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, -}; +static void ocfs2_setup_csum_triggers(struct super_block *sb, + enum ocfs2_journal_trigger_type type, + struct ocfs2_triggers *ot) +{ + BUG_ON(type >= OCFS2_JOURNAL_TRIGGER_COUNT); -static struct ocfs2_triggers xb_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, - .ot_offset = offsetof(struct ocfs2_xattr_block, xb_check), -}; + switch (type) { + case OCFS2_JTR_DI: + ot->ot_triggers.t_frozen = ocfs2_frozen_trigger; + ot->ot_offset = offsetof(struct ocfs2_dinode, i_check); + break; + case OCFS2_JTR_EB: + ot->ot_triggers.t_frozen = ocfs2_frozen_trigger; + ot->ot_offset = offsetof(struct ocfs2_extent_block, h_check); + break; + case OCFS2_JTR_RB: + ot->ot_triggers.t_frozen = ocfs2_frozen_trigger; + ot->ot_offset = offsetof(struct ocfs2_refcount_block, rf_check); + break; + case OCFS2_JTR_GD: + ot->ot_triggers.t_frozen = ocfs2_frozen_trigger; + ot->ot_offset = offsetof(struct ocfs2_group_desc, bg_check); + break; + case OCFS2_JTR_DB: + ot->ot_triggers.t_frozen = ocfs2_db_frozen_trigger; + break; + case OCFS2_JTR_XB: + ot->ot_triggers.t_frozen = ocfs2_frozen_trigger; + ot->ot_offset = offsetof(struct ocfs2_xattr_block, xb_check); + break; + case OCFS2_JTR_DQ: + ot->ot_triggers.t_frozen = ocfs2_dq_frozen_trigger; + break; + case OCFS2_JTR_DR: + ot->ot_triggers.t_frozen = ocfs2_frozen_trigger; + ot->ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check); + break; + case OCFS2_JTR_DL: + ot->ot_triggers.t_frozen = ocfs2_frozen_trigger; + ot->ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check); + break; + case OCFS2_JTR_NONE: + /* To make compiler happy... */ + return; + } -static struct ocfs2_triggers dq_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_dq_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, -}; + ot->ot_triggers.t_abort = ocfs2_abort_trigger; + ot->sb = sb; +} -static struct ocfs2_triggers dr_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, - .ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check), -}; +void ocfs2_initialize_journal_triggers(struct super_block *sb, + struct ocfs2_triggers triggers[]) +{ + enum ocfs2_journal_trigger_type type; -static struct ocfs2_triggers dl_triggers = { - .ot_triggers = { - .t_frozen = ocfs2_frozen_trigger, - .t_abort = ocfs2_abort_trigger, - }, - .ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check), -}; + for (type = OCFS2_JTR_DI; type < OCFS2_JOURNAL_TRIGGER_COUNT; type++) + ocfs2_setup_csum_triggers(sb, type, &triggers[type]); +} static int __ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci, @@ -626,9 +673,26 @@ static int __ocfs2_journal_access(handle_t *handle, /* we can safely remove this assertion after testing. */ if (!buffer_uptodate(bh)) { mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n"); - mlog(ML_ERROR, "b_blocknr=%llu\n", - (unsigned long long)bh->b_blocknr); - BUG(); + mlog(ML_ERROR, "b_blocknr=%llu, b_state=0x%lx\n", + (unsigned long long)bh->b_blocknr, bh->b_state); + + lock_buffer(bh); + /* + * A previous transaction with a couple of buffer heads fail + * to checkpoint, so all the bhs are marked as BH_Write_EIO. + * For current transaction, the bh is just among those error + * bhs which previous transaction handle. We can't just clear + * its BH_Write_EIO and reuse directly, since other bhs are + * not written to disk yet and that will cause metadata + * inconsistency. So we should set fs read-only to avoid + * further damage. + */ + if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) { + unlock_buffer(bh); + return ocfs2_error(osb->sb, "A previous attempt to " + "write this buffer head failed\n"); + } + unlock_buffer(bh); } /* Set the current transaction information on the ci so @@ -668,56 +732,91 @@ static int __ocfs2_journal_access(handle_t *handle, int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &di_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_DI], + type); } int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &eb_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_EB], + type); } int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &rb_triggers, + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_RB], type); } int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &gd_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_GD], + type); } int ocfs2_journal_access_db(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &db_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_DB], + type); } int ocfs2_journal_access_xb(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &xb_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_XB], + type); } int ocfs2_journal_access_dq(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &dq_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_DQ], + type); } int ocfs2_journal_access_dr(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &dr_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_DR], + type); } int ocfs2_journal_access_dl(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, ci, bh, &dl_triggers, type); + struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + return __ocfs2_journal_access(handle, ci, bh, + &osb->s_journal_triggers[OCFS2_JTR_DL], + type); } int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci, @@ -733,7 +832,22 @@ void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh) trace_ocfs2_journal_dirty((unsigned long long)bh->b_blocknr); status = jbd2_journal_dirty_metadata(handle, bh); - BUG_ON(status); + if (status) { + mlog_errno(status); + if (!is_handle_aborted(handle)) { + journal_t *journal = handle->h_transaction->t_journal; + + mlog(ML_ERROR, "jbd2_journal_dirty_metadata failed: " + "handle type %u started at line %u, credits %u/%u " + "errcode %d. Aborting transaction and journal.\n", + handle->h_type, handle->h_line_no, + handle->h_requested_credits, + jbd2_handle_buffer_credits(handle), status); + handle->h_err = status; + jbd2_journal_abort_handle(handle); + jbd2_journal_abort(journal, status); + } + } } #define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE) @@ -755,20 +869,54 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb) write_unlock(&journal->j_state_lock); } -int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) +/* + * alloc & initialize skeleton for journal structure. + * ocfs2_journal_init() will make fs have journal ability. + */ +int ocfs2_journal_alloc(struct ocfs2_super *osb) +{ + int status = 0; + struct ocfs2_journal *journal; + + journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL); + if (!journal) { + mlog(ML_ERROR, "unable to alloc journal\n"); + status = -ENOMEM; + goto bail; + } + osb->journal = journal; + journal->j_osb = osb; + + atomic_set(&journal->j_num_trans, 0); + init_rwsem(&journal->j_trans_barrier); + init_waitqueue_head(&journal->j_checkpointed); + spin_lock_init(&journal->j_lock); + journal->j_trans_id = 1UL; + INIT_LIST_HEAD(&journal->j_la_cleanups); + INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); + journal->j_state = OCFS2_JOURNAL_FREE; + +bail: + return status; +} + +static int ocfs2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) +{ + return filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping, + jinode->i_dirty_start, jinode->i_dirty_end); +} + +int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) { int status = -1; struct inode *inode = NULL; /* the journal inode */ journal_t *j_journal = NULL; + struct ocfs2_journal *journal = osb->journal; struct ocfs2_dinode *di = NULL; struct buffer_head *bh = NULL; - struct ocfs2_super *osb; int inode_lock = 0; BUG_ON(!journal); - - osb = journal->j_osb; - /* already have the inode for our journal */ inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, osb->slot_num); @@ -801,31 +949,35 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) inode_lock = 1; di = (struct ocfs2_dinode *)bh->b_data; - if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) { + if (i_size_read(inode) < OCFS2_MIN_JOURNAL_SIZE) { mlog(ML_ERROR, "Journal file size (%lld) is too small!\n", - inode->i_size); + i_size_read(inode)); status = -EINVAL; goto done; } - trace_ocfs2_journal_init(inode->i_size, + trace_ocfs2_journal_init(i_size_read(inode), (unsigned long long)inode->i_blocks, OCFS2_I(inode)->ip_clusters); /* call the kernels journal init function now */ j_journal = jbd2_journal_init_inode(inode); - if (j_journal == NULL) { + if (IS_ERR(j_journal)) { mlog(ML_ERROR, "Linux journal layer error\n"); - status = -EINVAL; + status = PTR_ERR(j_journal); goto done; } - trace_ocfs2_journal_init_maxlen(j_journal->j_maxlen); + trace_ocfs2_journal_init_maxlen(j_journal->j_total_len); *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) & OCFS2_JOURNAL_DIRTY_FL); journal->j_journal = j_journal; + journal->j_journal->j_submit_inode_data_buffers = + ocfs2_journal_submit_inode_data_buffers; + journal->j_journal->j_finish_inode_data_buffers = + jbd2_journal_finish_inode_data_buffers; journal->j_inode = inode; journal->j_bh = bh; @@ -918,7 +1070,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) if (!igrab(inode)) BUG(); - num_running_trans = atomic_read(&(osb->journal->j_num_trans)); + num_running_trans = atomic_read(&(journal->j_num_trans)); trace_ocfs2_journal_shutdown(num_running_trans); /* Do a commit_cache here. It will flush our journal, *and* @@ -937,17 +1089,19 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) osb->commit_task = NULL; } - BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0); + BUG_ON(atomic_read(&(journal->j_num_trans)) != 0); - if (ocfs2_mount_local(osb)) { + if (ocfs2_mount_local(osb) && + (journal->j_journal->j_flags & JBD2_LOADED)) { jbd2_journal_lock_updates(journal->j_journal); - status = jbd2_journal_flush(journal->j_journal); + status = jbd2_journal_flush(journal->j_journal, 0); jbd2_journal_unlock_updates(journal->j_journal); if (status < 0) mlog_errno(status); } - if (status == 0) { + /* Shutdown the kernel journal system */ + if (!jbd2_journal_destroy(journal->j_journal) && !status) { /* * Do not toggle if flush was unsuccessful otherwise * will leave dirty metadata in a "clean" journal @@ -956,9 +1110,6 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) if (status < 0) mlog_errno(status); } - - /* Shutdown the kernel journal system */ - jbd2_journal_destroy(journal->j_journal); journal->j_journal = NULL; OCFS2_I(inode)->ip_open_count--; @@ -971,10 +1122,10 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) journal->j_state = OCFS2_JOURNAL_FREE; -// up_write(&journal->j_trans_barrier); done: - if (inode) - iput(inode); + iput(inode); + kfree(journal); + osb->journal = NULL; } static void ocfs2_clear_journal_error(struct super_block *sb, @@ -1012,6 +1163,14 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed) ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num); + if (replayed) { + jbd2_journal_lock_updates(journal->j_journal); + status = jbd2_journal_flush(journal->j_journal, 0); + jbd2_journal_unlock_updates(journal->j_journal); + if (status < 0) + mlog_errno(status); + } + status = ocfs2_journal_toggle_dirty(osb, 1, replayed); if (status < 0) { mlog_errno(status); @@ -1021,7 +1180,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed) /* Launch the commit thread */ if (!local) { osb->commit_task = kthread_run(ocfs2_commit_thread, osb, - "ocfs2cmt"); + "ocfs2cmt-%s", osb->uuid_str); if (IS_ERR(osb->commit_task)) { status = PTR_ERR(osb->commit_task); osb->commit_task = NULL; @@ -1091,12 +1250,10 @@ static int ocfs2_force_read_journal(struct inode *inode) int status = 0; int i; u64 v_blkno, p_blkno, p_blocks, num_blocks; -#define CONCURRENT_JOURNAL_FILL 32ULL - struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL]; - - memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL); + struct buffer_head *bh = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, inode->i_size); + num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); v_blkno = 0; while (v_blkno < num_blocks) { status = ocfs2_extent_map_get_blocks(inode, v_blkno, @@ -1106,29 +1263,32 @@ static int ocfs2_force_read_journal(struct inode *inode) goto bail; } - if (p_blocks > CONCURRENT_JOURNAL_FILL) - p_blocks = CONCURRENT_JOURNAL_FILL; - - /* We are reading journal data which should not - * be put in the uptodate cache */ - status = ocfs2_read_blocks_sync(OCFS2_SB(inode->i_sb), - p_blkno, p_blocks, bhs); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - for(i = 0; i < p_blocks; i++) { - brelse(bhs[i]); - bhs[i] = NULL; + for (i = 0; i < p_blocks; i++, p_blkno++) { + bh = __find_get_block_nonatomic(osb->sb->s_bdev, p_blkno, + osb->sb->s_blocksize); + /* block not cached. */ + if (!bh) + continue; + + brelse(bh); + bh = NULL; + /* We are reading journal data which should not + * be put in the uptodate cache. + */ + status = ocfs2_read_blocks_sync(osb, p_blkno, 1, &bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + brelse(bh); + bh = NULL; } v_blkno += p_blocks; } bail: - for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++) - brelse(bhs[i]); return status; } @@ -1138,6 +1298,7 @@ struct ocfs2_la_recovery_item { struct ocfs2_dinode *lri_la_dinode; struct ocfs2_dinode *lri_tl_dinode; struct ocfs2_quota_recovery *lri_qrec; + enum ocfs2_orphan_reco_type lri_orphan_reco_type; }; /* Does the second half of the recovery process. By this point, the @@ -1159,6 +1320,7 @@ void ocfs2_complete_recovery(struct work_struct *work) struct ocfs2_dinode *la_dinode, *tl_dinode; struct ocfs2_la_recovery_item *item, *n; struct ocfs2_quota_recovery *qrec; + enum ocfs2_orphan_reco_type orphan_reco_type; LIST_HEAD(tmp_la_list); trace_ocfs2_complete_recovery( @@ -1176,6 +1338,7 @@ void ocfs2_complete_recovery(struct work_struct *work) la_dinode = item->lri_la_dinode; tl_dinode = item->lri_tl_dinode; qrec = item->lri_qrec; + orphan_reco_type = item->lri_orphan_reco_type; trace_ocfs2_complete_recovery_slot(item->lri_slot, la_dinode ? le64_to_cpu(la_dinode->i_blkno) : 0, @@ -1200,7 +1363,8 @@ void ocfs2_complete_recovery(struct work_struct *work) kfree(tl_dinode); } - ret = ocfs2_recover_orphans(osb, item->lri_slot); + ret = ocfs2_recover_orphans(osb, item->lri_slot, + orphan_reco_type); if (ret < 0) mlog_errno(ret); @@ -1225,7 +1389,8 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, int slot_num, struct ocfs2_dinode *la_dinode, struct ocfs2_dinode *tl_dinode, - struct ocfs2_quota_recovery *qrec) + struct ocfs2_quota_recovery *qrec, + enum ocfs2_orphan_reco_type orphan_reco_type) { struct ocfs2_la_recovery_item *item; @@ -1249,10 +1414,11 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, item->lri_slot = slot_num; item->lri_tl_dinode = tl_dinode; item->lri_qrec = qrec; + item->lri_orphan_reco_type = orphan_reco_type; spin_lock(&journal->j_lock); list_add_tail(&item->lri_list, &journal->j_la_cleanups); - queue_work(ocfs2_wq, &journal->j_recovery_work); + queue_work(journal->j_osb->ocfs2_wq, &journal->j_recovery_work); spin_unlock(&journal->j_lock); } @@ -1268,15 +1434,15 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) /* No need to queue up our truncate_log as regular cleanup will catch * that */ ocfs2_queue_recovery_completion(journal, osb->slot_num, - osb->local_alloc_copy, NULL, NULL); + osb->local_alloc_copy, NULL, NULL, + ORPHAN_NEED_TRUNCATE); ocfs2_schedule_truncate_log_flush(osb, 0); osb->local_alloc_copy = NULL; - osb->dirty = 0; /* queue to recover orphan slots for all offline slots */ ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); - ocfs2_queue_replay_slots(osb); + ocfs2_queue_replay_slots(osb, ORPHAN_NEED_TRUNCATE); ocfs2_free_replay_slots(osb); } @@ -1287,7 +1453,8 @@ void ocfs2_complete_quota_recovery(struct ocfs2_super *osb) osb->slot_num, NULL, NULL, - osb->quota_rec); + osb->quota_rec, + ORPHAN_NEED_TRUNCATE); osb->quota_rec = NULL; } } @@ -1301,17 +1468,37 @@ static int __ocfs2_recovery_thread(void *arg) int rm_quota_used = 0, i; struct ocfs2_quota_recovery *qrec; + /* Whether the quota supported. */ + int quota_enabled = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, + OCFS2_FEATURE_RO_COMPAT_USRQUOTA) + || OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA); + status = ocfs2_wait_on_mount(osb); if (status < 0) { goto bail; } - rm_quota = kzalloc(osb->max_slots * sizeof(int), GFP_NOFS); - if (!rm_quota) { - status = -ENOMEM; - goto bail; + if (quota_enabled) { + rm_quota = kcalloc(osb->max_slots, sizeof(int), GFP_NOFS); + if (!rm_quota) { + status = -ENOMEM; + goto bail; + } } restart: + if (quota_enabled) { + mutex_lock(&osb->recovery_lock); + /* Confirm that recovery thread will no longer recover quotas */ + if (osb->recovery_state == OCFS2_REC_QUOTA_WANT_DISABLE) { + osb->recovery_state = OCFS2_REC_QUOTA_DISABLED; + wake_up(&osb->recovery_event); + } + if (osb->recovery_state >= OCFS2_REC_QUOTA_DISABLED) + quota_enabled = 0; + mutex_unlock(&osb->recovery_lock); + } + status = ocfs2_super_lock(osb, 1); if (status < 0) { mlog_errno(status); @@ -1324,7 +1511,7 @@ restart: /* queue recovery for our own slot */ ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL, - NULL, NULL); + NULL, NULL, ORPHAN_NO_NEED_TRUNCATE); spin_lock(&osb->osb_lock); while (rm->rm_used) { @@ -1345,9 +1532,14 @@ restart: * then quota usage would be out of sync until some node takes * the slot. So we remember which nodes need quota recovery * and when everything else is done, we recover quotas. */ - for (i = 0; i < rm_quota_used && rm_quota[i] != slot_num; i++); - if (i == rm_quota_used) - rm_quota[rm_quota_used++] = slot_num; + if (quota_enabled) { + for (i = 0; i < rm_quota_used + && rm_quota[i] != slot_num; i++) + ; + + if (i == rm_quota_used) + rm_quota[rm_quota_used++] = slot_num; + } status = ocfs2_recover_node(osb, node_num, slot_num); skip_recovery: @@ -1375,21 +1567,25 @@ skip_recovery: /* Now it is right time to recover quotas... We have to do this under * superblock lock so that no one can start using the slot (and crash) * before we recover it */ - for (i = 0; i < rm_quota_used; i++) { - qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]); - if (IS_ERR(qrec)) { - status = PTR_ERR(qrec); - mlog_errno(status); - continue; + if (quota_enabled) { + for (i = 0; i < rm_quota_used; i++) { + qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]); + if (IS_ERR(qrec)) { + status = PTR_ERR(qrec); + mlog_errno(status); + continue; + } + ocfs2_queue_recovery_completion(osb->journal, + rm_quota[i], + NULL, NULL, qrec, + ORPHAN_NEED_TRUNCATE); } - ocfs2_queue_recovery_completion(osb->journal, rm_quota[i], - NULL, NULL, qrec); } ocfs2_super_unlock(osb, 1); /* queue recovery for offline slots */ - ocfs2_queue_replay_slots(osb); + ocfs2_queue_replay_slots(osb, ORPHAN_NEED_TRUNCATE); bail: mutex_lock(&osb->recovery_lock); @@ -1400,37 +1596,36 @@ bail: ocfs2_free_replay_slots(osb); osb->recovery_thread_task = NULL; - mb(); /* sync with ocfs2_recovery_thread_running */ + if (osb->recovery_state == OCFS2_REC_WANT_DISABLE) + osb->recovery_state = OCFS2_REC_DISABLED; wake_up(&osb->recovery_event); mutex_unlock(&osb->recovery_lock); kfree(rm_quota); - /* no one is callint kthread_stop() for us so the kthread() api - * requires that we call do_exit(). And it isn't exported, but - * complete_and_exit() seems to be a minimal wrapper around it. */ - complete_and_exit(NULL, status); return status; } void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) { + int was_set = -1; + mutex_lock(&osb->recovery_lock); + if (osb->recovery_state < OCFS2_REC_WANT_DISABLE) + was_set = ocfs2_recovery_map_set(osb, node_num); trace_ocfs2_recovery_thread(node_num, osb->node_num, - osb->disable_recovery, osb->recovery_thread_task, - osb->disable_recovery ? - -1 : ocfs2_recovery_map_set(osb, node_num)); + osb->recovery_state, osb->recovery_thread_task, was_set); - if (osb->disable_recovery) + if (osb->recovery_state >= OCFS2_REC_WANT_DISABLE) goto out; if (osb->recovery_thread_task) goto out; osb->recovery_thread_task = kthread_run(__ocfs2_recovery_thread, osb, - "ocfs2rec"); + "ocfs2rec-%s", osb->uuid_str); if (IS_ERR(osb->recovery_thread_task)) { mlog_errno((int)PTR_ERR(osb->recovery_thread_task)); osb->recovery_thread_task = NULL; @@ -1558,17 +1753,16 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, } journal = jbd2_journal_init_inode(inode); - if (journal == NULL) { + if (IS_ERR(journal)) { mlog(ML_ERROR, "Linux journal layer error\n"); - status = -EIO; + status = PTR_ERR(journal); goto done; } status = jbd2_journal_load(journal); if (status < 0) { mlog_errno(status); - if (!igrab(inode)) - BUG(); + BUG_ON(!igrab(inode)); jbd2_journal_destroy(journal); goto done; } @@ -1577,7 +1771,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, /* wipe the journal */ jbd2_journal_lock_updates(journal); - status = jbd2_journal_flush(journal); + status = jbd2_journal_flush(journal, 0); jbd2_journal_unlock_updates(journal); if (status < 0) mlog_errno(status); @@ -1597,8 +1791,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, if (status < 0) mlog_errno(status); - if (!igrab(inode)) - BUG(); + BUG_ON(!igrab(inode)); jbd2_journal_destroy(journal); @@ -1610,9 +1803,7 @@ done: if (got_lock) ocfs2_inode_unlock(inode, 1); - if (inode) - iput(inode); - + iput(inode); brelse(bh); return status; @@ -1676,7 +1867,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb, /* This will kfree the memory pointed to by la_copy and tl_copy */ ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy, - tl_copy, NULL); + tl_copy, NULL, ORPHAN_NEED_TRUNCATE); status = 0; done: @@ -1719,8 +1910,7 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb, ocfs2_inode_unlock(inode, 1); bail: - if (inode) - iput(inode); + iput(inode); return status; } @@ -1795,7 +1985,7 @@ bail: /* * Scan timer should get fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT. Add some - * randomness to the timeout to minimize multple nodes firing the timer at the + * randomness to the timeout to minimize multiple nodes firing the timer at the * same time. */ static inline unsigned long ocfs2_orphan_scan_timeout(void) @@ -1834,7 +2024,7 @@ static inline unsigned long ocfs2_orphan_scan_timeout(void) * hasn't happened. The node queues a scan and increments the * sequence number in the LVB. */ -void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) +static void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) { struct ocfs2_orphan_scan *os; int status, i; @@ -1866,14 +2056,14 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) for (i = 0; i < osb->max_slots; i++) ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL, - NULL); + NULL, ORPHAN_NO_NEED_TRUNCATE); /* * We queued a recovery on orphan slots, increment the sequence * number and update LVB so other node will skip the scan for a while */ seqno++; os->os_count++; - os->os_scantime = CURRENT_TIME; + os->os_scantime = ktime_get_seconds(); unlock: ocfs2_orphan_scan_unlock(osb, seqno); out: @@ -1883,7 +2073,7 @@ out: } /* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT millsec */ -void ocfs2_orphan_scan_work(struct work_struct *work) +static void ocfs2_orphan_scan_work(struct work_struct *work) { struct ocfs2_orphan_scan *os; struct ocfs2_super *osb; @@ -1895,7 +2085,7 @@ void ocfs2_orphan_scan_work(struct work_struct *work) mutex_lock(&os->os_lock); ocfs2_queue_orphan_scan(osb); if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE) - queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work, + queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work, ocfs2_orphan_scan_timeout()); mutex_unlock(&os->os_lock); } @@ -1930,12 +2120,12 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb) struct ocfs2_orphan_scan *os; os = &osb->osb_orphan_scan; - os->os_scantime = CURRENT_TIME; + os->os_scantime = ktime_get_seconds(); if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb)) atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); else { atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE); - queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work, + queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work, ocfs2_orphan_scan_timeout()); } } @@ -1944,24 +2134,44 @@ struct ocfs2_orphan_filldir_priv { struct dir_context ctx; struct inode *head; struct ocfs2_super *osb; + enum ocfs2_orphan_reco_type orphan_reco_type; }; -static int ocfs2_orphan_filldir(void *priv, const char *name, int name_len, - loff_t pos, u64 ino, unsigned type) +static bool ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, + int name_len, loff_t pos, u64 ino, + unsigned type) { - struct ocfs2_orphan_filldir_priv *p = priv; + struct ocfs2_orphan_filldir_priv *p = + container_of(ctx, struct ocfs2_orphan_filldir_priv, ctx); struct inode *iter; if (name_len == 1 && !strncmp(".", name, 1)) - return 0; + return true; if (name_len == 2 && !strncmp("..", name, 2)) - return 0; + return true; + + /* do not include dio entry in case of orphan scan */ + if ((p->orphan_reco_type == ORPHAN_NO_NEED_TRUNCATE) && + (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX, + OCFS2_DIO_ORPHAN_PREFIX_LEN))) + return true; /* Skip bad inodes so that recovery can continue */ iter = ocfs2_iget(p->osb, ino, OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0); if (IS_ERR(iter)) - return 0; + return true; + + if (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX, + OCFS2_DIO_ORPHAN_PREFIX_LEN)) + OCFS2_I(iter)->ip_flags |= OCFS2_INODE_DIO_ORPHAN_ENTRY; + + /* Skip inodes which are already added to recover list, since dio may + * happen concurrently with unlink/rename */ + if (OCFS2_I(iter)->ip_next_orphan) { + iput(iter); + return true; + } trace_ocfs2_orphan_filldir((unsigned long long)OCFS2_I(iter)->ip_blkno); /* No locking is required for the next_orphan queue as there @@ -1969,19 +2179,21 @@ static int ocfs2_orphan_filldir(void *priv, const char *name, int name_len, OCFS2_I(iter)->ip_next_orphan = p->head; p->head = iter; - return 0; + return true; } static int ocfs2_queue_orphans(struct ocfs2_super *osb, int slot, - struct inode **head) + struct inode **head, + enum ocfs2_orphan_reco_type orphan_reco_type) { int status; struct inode *orphan_dir_inode = NULL; struct ocfs2_orphan_filldir_priv priv = { .ctx.actor = ocfs2_orphan_filldir, .osb = osb, - .head = *head + .head = *head, + .orphan_reco_type = orphan_reco_type }; orphan_dir_inode = ocfs2_get_system_file_inode(osb, @@ -1993,7 +2205,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb, return status; } - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0); if (status < 0) { mlog_errno(status); @@ -2011,7 +2223,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb, out_cluster: ocfs2_inode_unlock(orphan_dir_inode, 0); out: - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); return status; } @@ -2071,17 +2283,20 @@ static void ocfs2_clear_recovering_orphan_dir(struct ocfs2_super *osb, * advertising our state to ocfs2_delete_inode(). */ static int ocfs2_recover_orphans(struct ocfs2_super *osb, - int slot) + int slot, + enum ocfs2_orphan_reco_type orphan_reco_type) { int ret = 0; struct inode *inode = NULL; struct inode *iter; struct ocfs2_inode_info *oi; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di = NULL; trace_ocfs2_recover_orphans(slot); ocfs2_mark_recovering_orphan_dir(osb, slot); - ret = ocfs2_queue_orphans(osb, slot, &inode); + ret = ocfs2_queue_orphans(osb, slot, &inode, orphan_reco_type); ocfs2_clear_recovering_orphan_dir(osb, slot); /* Error here should be noted, but we want to continue with as @@ -2095,21 +2310,61 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, (unsigned long long)oi->ip_blkno); iter = oi->ip_next_orphan; + oi->ip_next_orphan = NULL; - spin_lock(&oi->ip_lock); - /* The remote delete code may have set these on the - * assumption that the other node would wipe them - * successfully. If they are still in the node's - * orphan dir, we need to reset that state. */ - oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE); - - /* Set the proper information to get us going into - * ocfs2_delete_inode. */ - oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; - spin_unlock(&oi->ip_lock); + if (oi->ip_flags & OCFS2_INODE_DIO_ORPHAN_ENTRY) { + inode_lock(inode); + ret = ocfs2_rw_lock(inode, 1); + if (ret < 0) { + mlog_errno(ret); + goto unlock_mutex; + } + /* + * We need to take and drop the inode lock to + * force read inode from disk. + */ + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret) { + mlog_errno(ret); + goto unlock_rw; + } + + di = (struct ocfs2_dinode *)di_bh->b_data; + + if (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)) { + ret = ocfs2_truncate_file(inode, di_bh, + i_size_read(inode)); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto unlock_inode; + } + + ret = ocfs2_del_inode_from_orphan(osb, inode, + di_bh, 0, 0); + if (ret) + mlog_errno(ret); + } +unlock_inode: + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + di_bh = NULL; +unlock_rw: + ocfs2_rw_unlock(inode, 1); +unlock_mutex: + inode_unlock(inode); + + /* clear dio flag in ocfs2_inode_info */ + oi->ip_flags &= ~OCFS2_INODE_DIO_ORPHAN_ENTRY; + } else { + spin_lock(&oi->ip_lock); + /* Set the proper information to get us going into + * ocfs2_delete_inode. */ + oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; + spin_unlock(&oi->ip_lock); + } iput(inode); - inode = iter; } @@ -2156,8 +2411,20 @@ static int ocfs2_commit_thread(void *arg) || kthread_should_stop()); status = ocfs2_commit_cache(osb); - if (status < 0) - mlog_errno(status); + if (status < 0) { + static unsigned long abort_warn_time; + + /* Warn about this once per minute */ + if (printk_timed_ratelimit(&abort_warn_time, 60*HZ)) + mlog(ML_ERROR, "status = %d, journal is " + "already aborted.\n", status); + /* + * After ocfs2_commit_cache() fails, j_num_trans has a + * non-zero value. Sleep here to avoid a busy-wait + * loop. + */ + msleep_interruptible(1000); + } if (kthread_should_stop() && atomic_read(&journal->j_num_trans)){ mlog(ML_KTHREAD, diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 96f9ac237e86..6397170f302f 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -1,26 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * journal.h * * Defines journalling api and structures. * * Copyright (C) 2003, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #ifndef OCFS2_JOURNAL_H @@ -45,7 +29,7 @@ struct ocfs2_dinode; struct ocfs2_recovery_map { unsigned int rm_used; - unsigned int *rm_entries; + unsigned int rm_entries[]; }; @@ -158,19 +142,21 @@ static inline void ocfs2_ci_set_new(struct ocfs2_super *osb, void ocfs2_orphan_scan_init(struct ocfs2_super *osb); void ocfs2_orphan_scan_start(struct ocfs2_super *osb); void ocfs2_orphan_scan_stop(struct ocfs2_super *osb); -void ocfs2_orphan_scan_exit(struct ocfs2_super *osb); void ocfs2_complete_recovery(struct work_struct *work); void ocfs2_wait_for_recovery(struct ocfs2_super *osb); int ocfs2_recovery_init(struct ocfs2_super *osb); void ocfs2_recovery_exit(struct ocfs2_super *osb); +void ocfs2_recovery_disable_quota(struct ocfs2_super *osb); int ocfs2_compute_replay_slots(struct ocfs2_super *osb); +void ocfs2_free_replay_slots(struct ocfs2_super *osb); /* * Journal Control: * Initialize, Load, Shutdown, Wipe a journal. * + * ocfs2_journal_alloc - Initialize skeleton for journal structure. * ocfs2_journal_init - Initialize journal structures in the OSB. * ocfs2_journal_load - Load the given journal off disk. Replay it if * there's transactions still in there. @@ -184,8 +170,8 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb); * ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint. */ void ocfs2_set_journal_params(struct ocfs2_super *osb); -int ocfs2_journal_init(struct ocfs2_journal *journal, - int *dirty); +int ocfs2_journal_alloc(struct ocfs2_super *osb); +int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty); void ocfs2_journal_shutdown(struct ocfs2_super *osb); int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full); @@ -246,8 +232,8 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode) * ocfs2_journal_access_*() unless you intend to * manage the checksum by hand. * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data. - * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before - * the current handle commits. + * ocfs2_jbd2_inode_add_write - Mark an inode with range so that its data goes + * out before the current handle commits. */ /* You must always start_trans with a number of buffs > 0, but it's @@ -258,6 +244,19 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int ocfs2_commit_trans(struct ocfs2_super *osb, handle_t *handle); int ocfs2_extend_trans(handle_t *handle, int nblocks); +int ocfs2_assure_trans_credits(handle_t *handle, + int nblocks); +int ocfs2_allocate_extend_trans(handle_t *handle, + int thresh); + +/* + * Define an arbitrary limit for the amount of data we will anticipate + * writing to any given transaction. For unbounded transactions such as + * fallocate(2) we can write more than this, but we always + * start off at the maximum transaction size and grow the transaction + * optimistically as we go. + */ +#define OCFS2_MAX_TRANS_DATA 64U /* * Create access is for when we get a newly created buffer and we're @@ -444,7 +443,7 @@ static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir, * previous dirblock update in the free list */ static inline int ocfs2_link_credits(struct super_block *sb) { - return 2*OCFS2_INODE_UPDATE_CREDITS + 4 + + return 2 * OCFS2_INODE_UPDATE_CREDITS + 4 + ocfs2_quota_trans_credits(sb); } @@ -461,6 +460,11 @@ static inline int ocfs2_unlink_credits(struct super_block *sb) * orphan dir index leaf */ #define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 4) +/* dinode + orphan dir dinode + extent tree leaf block + orphan dir entry + + * orphan dir index root + orphan dir index leaf */ +#define OCFS2_INODE_ADD_TO_ORPHAN_CREDITS (2 * OCFS2_INODE_UPDATE_CREDITS + 4) +#define OCFS2_INODE_DEL_FROM_ORPHAN_CREDITS OCFS2_INODE_ADD_TO_ORPHAN_CREDITS + /* dinode update, old dir dinode update, new dir dinode update, old * dir dir entry, new dir dir entry, dir entry update for renaming * directory + target unlink + 3 x dir index leaves */ @@ -513,8 +517,7 @@ static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb) * the result may be wrong. */ static inline int ocfs2_calc_extend_credits(struct super_block *sb, - struct ocfs2_extent_list *root_el, - u32 bits_wanted) + struct ocfs2_extent_list *root_el) { int bitmap_blocks, sysfile_bitmap_blocks, extent_blocks; @@ -537,7 +540,7 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb, extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth); return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks + - ocfs2_quota_trans_credits(sb) + bits_wanted; + ocfs2_quota_trans_credits(sb); } static inline int ocfs2_calc_symlink_credits(struct super_block *sb) @@ -574,37 +577,12 @@ static inline int ocfs2_calc_bg_discontig_credits(struct super_block *sb) return ocfs2_extent_recs_per_gd(sb); } -static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, - unsigned int clusters_to_del, - struct ocfs2_dinode *fe, - struct ocfs2_extent_list *last_el) +static inline int ocfs2_jbd2_inode_add_write(handle_t *handle, struct inode *inode, + loff_t start_byte, loff_t length) { - /* for dinode + all headers in this pass + update to next leaf */ - u16 next_free = le16_to_cpu(last_el->l_next_free_rec); - u16 tree_depth = le16_to_cpu(fe->id2.i_list.l_tree_depth); - int credits = 1 + tree_depth + 1; - int i; - - i = next_free - 1; - BUG_ON(i < 0); - - /* We may be deleting metadata blocks, so metadata alloc dinode + - one desc. block for each possible delete. */ - if (tree_depth && next_free == 1 && - ocfs2_rec_clusters(last_el, &last_el->l_recs[i]) == clusters_to_del) - credits += 1 + tree_depth; - - /* update to the truncate log. */ - credits += OCFS2_TRUNCATE_LOG_UPDATE; - - credits += ocfs2_quota_trans_credits(sb); - - return credits; -} - -static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode) -{ - return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode); + return jbd2_journal_inode_ranged_write(handle, + &OCFS2_I(inode)->ip_jinode, + start_byte, length); } static inline int ocfs2_begin_ordered_truncate(struct inode *inode, @@ -616,4 +594,17 @@ static inline int ocfs2_begin_ordered_truncate(struct inode *inode, new_size); } +static inline void ocfs2_update_inode_fsync_trans(handle_t *handle, + struct inode *inode, + int datasync) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + if (!is_handle_aborted(handle)) { + oi->i_sync_tid = handle->h_transaction->t_tid; + if (datasync) + oi->i_datasync_tid = handle->h_transaction->t_tid; + } +} + #endif /* OCFS2_JOURNAL_H */ diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index aebeacd807c3..d1aa04a5af1b 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -1,26 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * localalloc.c * * Node local data allocation * * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/fs.h> @@ -228,14 +212,15 @@ static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb) void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb, unsigned int num_clusters) { - spin_lock(&osb->osb_lock); - if (osb->local_alloc_state == OCFS2_LA_DISABLED || - osb->local_alloc_state == OCFS2_LA_THROTTLED) - if (num_clusters >= osb->local_alloc_default_bits) { + if (num_clusters >= osb->local_alloc_default_bits) { + spin_lock(&osb->osb_lock); + if (osb->local_alloc_state == OCFS2_LA_DISABLED || + osb->local_alloc_state == OCFS2_LA_THROTTLED) { cancel_delayed_work(&osb->la_enable_wq); osb->local_alloc_state = OCFS2_LA_ENABLED; } - spin_unlock(&osb->osb_lock); + spin_unlock(&osb->osb_lock); + } } void ocfs2_la_enable_worker(struct work_struct *work) @@ -345,12 +330,17 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) if (num_used || alloc->id1.bitmap1.i_used || alloc->id1.bitmap1.i_total - || la->la_bm_off) - mlog(ML_ERROR, "Local alloc hasn't been recovered!\n" + || la->la_bm_off) { + mlog(ML_ERROR, "inconsistent detected, clean journal with" + " unrecovered local alloc, please run fsck.ocfs2!\n" "found = %u, set = %u, taken = %u, off = %u\n", num_used, le32_to_cpu(alloc->id1.bitmap1.i_used), le32_to_cpu(alloc->id1.bitmap1.i_total), - OCFS2_LOCAL_ALLOC(alloc)->la_bm_off); + le32_to_cpu(OCFS2_LOCAL_ALLOC(alloc)->la_bm_off)); + + status = -EINVAL; + goto bail; + } osb->local_alloc_bh = alloc_bh; osb->local_alloc_state = OCFS2_LA_ENABLED; @@ -358,8 +348,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) bail: if (status < 0) brelse(alloc_bh); - if (inode) - iput(inode); + iput(inode); trace_ocfs2_load_local_alloc(osb->local_alloc_bits); @@ -387,7 +376,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) struct ocfs2_dinode *alloc = NULL; cancel_delayed_work(&osb->la_enable_wq); - flush_workqueue(ocfs2_wq); + if (osb->ocfs2_wq) + flush_workqueue(osb->ocfs2_wq); if (osb->local_alloc_state == OCFS2_LA_UNUSED) goto out; @@ -415,7 +405,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (status < 0) { @@ -434,12 +424,11 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) bh = osb->local_alloc_bh; alloc = (struct ocfs2_dinode *) bh->b_data; - alloc_copy = kmalloc(bh->b_size, GFP_NOFS); + alloc_copy = kmemdup(alloc, bh->b_size, GFP_NOFS); if (!alloc_copy) { status = -ENOMEM; goto out_commit; } - memcpy(alloc_copy, alloc, bh->b_size); status = ocfs2_journal_access_di(handle, INODE_CACHE(local_alloc_inode), bh, OCFS2_JOURNAL_ACCESS_WRITE); @@ -469,12 +458,11 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); iput(main_bm_inode); out: - if (local_alloc_inode) - iput(local_alloc_inode); + iput(local_alloc_inode); kfree(alloc_copy); } @@ -508,7 +496,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, goto bail; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); status = ocfs2_read_inode_block_full(inode, &alloc_bh, OCFS2_BH_IGNORE_CACHE); @@ -541,7 +529,7 @@ bail: brelse(alloc_bh); if (inode) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); iput(inode); } @@ -573,7 +561,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb, goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (status < 0) { @@ -603,7 +591,7 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); brelse(main_bm_bh); @@ -619,7 +607,7 @@ out: /* * make sure we've got at least bits_wanted contiguous bits in the - * local alloc. You lose them when you drop i_mutex. + * local alloc. You lose them when you drop i_rwsem. * * We will add ourselves to the transaction passed in, but may start * our own in order to shift windows. @@ -645,11 +633,11 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, goto bail; } - mutex_lock(&local_alloc_inode->i_mutex); + inode_lock(local_alloc_inode); /* * We must double check state and allocator bits because - * another process may have changed them while holding i_mutex. + * another process may have changed them while holding i_rwsem. */ spin_lock(&osb->osb_lock); if (!ocfs2_la_state_enabled(osb) || @@ -665,12 +653,10 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, #ifdef CONFIG_OCFS2_DEBUG_FS if (le32_to_cpu(alloc->id1.bitmap1.i_used) != ocfs2_local_alloc_count_bits(alloc)) { - ocfs2_error(osb->sb, "local alloc inode %llu says it has " - "%u free bits, but a count shows %u", - (unsigned long long)le64_to_cpu(alloc->i_blkno), - le32_to_cpu(alloc->id1.bitmap1.i_used), - ocfs2_local_alloc_count_bits(alloc)); - status = -EIO; + status = ocfs2_error(osb->sb, "local alloc inode %llu says it has %u used bits, but a count shows %u\n", + (unsigned long long)le64_to_cpu(alloc->i_blkno), + le32_to_cpu(alloc->id1.bitmap1.i_used), + ocfs2_local_alloc_count_bits(alloc)); goto bail; } #endif @@ -690,7 +676,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, /* * Under certain conditions, the window slide code * might have reduced the number of bits available or - * disabled the the local alloc entirely. Re-check + * disabled the local alloc entirely. Re-check * here and return -ENOSPC if necessary. */ status = -ENOSPC; @@ -712,7 +698,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, status = 0; bail: if (status < 0 && local_alloc_inode) { - mutex_unlock(&local_alloc_inode->i_mutex); + inode_unlock(local_alloc_inode); iput(local_alloc_inode); } @@ -781,6 +767,48 @@ bail: return status; } +int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 bit_off, + u32 num_bits) +{ + int status, start; + u32 clear_bits; + struct inode *local_alloc_inode; + void *bitmap; + struct ocfs2_dinode *alloc; + struct ocfs2_local_alloc *la; + + BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL); + + local_alloc_inode = ac->ac_inode; + alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; + la = OCFS2_LOCAL_ALLOC(alloc); + + bitmap = la->la_bitmap; + start = bit_off - le32_to_cpu(la->la_bm_off); + clear_bits = num_bits; + + status = ocfs2_journal_access_di(handle, + INODE_CACHE(local_alloc_inode), + osb->local_alloc_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + while (clear_bits--) + ocfs2_clear_bit(start++, bitmap); + + le32_add_cpu(&alloc->id1.bitmap1.i_used, -num_bits); + ocfs2_journal_dirty(handle, osb->local_alloc_bh); + +bail: + return status; +} + static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc) { u32 count; @@ -797,7 +825,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, u32 *numbits, struct ocfs2_alloc_reservation *resv) { - int numfound, bitoff, left, startoff, lastzero; + int numfound = 0, bitoff, left, startoff; int local_resv = 0; struct ocfs2_alloc_reservation r; void *bitmap = NULL; @@ -835,16 +863,9 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap; numfound = bitoff = startoff = 0; - lastzero = -1; left = le32_to_cpu(alloc->id1.bitmap1.i_total); - while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) != -1) { - if (bitoff == left) { - /* mlog(0, "bitoff (%d) == left", bitoff); */ - break; - } - /* mlog(0, "Found a zero: bitoff = %d, startoff = %d, " - "numfound = %d\n", bitoff, startoff, numfound);*/ - + while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) < + left) { /* Ok, we found a zero bit... is it contig. or do we * start over?*/ if (bitoff == startoff) { @@ -947,11 +968,11 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, la_start_blk = ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(la->la_bm_off)); bitmap = la->la_bitmap; - start = count = bit_off = 0; + start = count = 0; left = le32_to_cpu(alloc->id1.bitmap1.i_total); - while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start)) - != -1) { + while (1) { + bit_off = ocfs2_find_next_zero_bit(bitmap, left, start); if ((bit_off < left) && (bit_off == start)) { count++; start++; @@ -976,6 +997,7 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, goto bail; } } + if (bit_off >= left) break; count = 1; @@ -1003,7 +1025,7 @@ enum ocfs2_la_event { /* * Given an event, calculate the size of our next local alloc window. * - * This should always be called under i_mutex of the local alloc inode + * This should always be called under i_rwsem of the local alloc inode * so that local alloc disabling doesn't race with processes trying to * use the allocator. * @@ -1046,7 +1068,7 @@ static int ocfs2_recalc_la_window(struct ocfs2_super *osb, } else { osb->local_alloc_state = OCFS2_LA_DISABLED; } - queue_delayed_work(ocfs2_wq, &osb->la_enable_wq, + queue_delayed_work(osb->ocfs2_wq, &osb->la_enable_wq, OCFS2_LA_ENABLE_INTERVAL); goto out_unlock; } @@ -1082,7 +1104,7 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb, } retry_enospc: - (*ac)->ac_bits_wanted = osb->local_alloc_default_bits; + (*ac)->ac_bits_wanted = osb->local_alloc_bits; status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); if (status == -ENOSPC) { if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) == @@ -1154,7 +1176,7 @@ retry_enospc: OCFS2_LA_DISABLED) goto bail; - ac->ac_bits_wanted = osb->local_alloc_default_bits; + ac->ac_bits_wanted = osb->local_alloc_bits; status = ocfs2_claim_clusters(handle, ac, osb->local_alloc_bits, &cluster_off, @@ -1194,7 +1216,7 @@ retry_enospc: OCFS2_LOCAL_ALLOC(alloc)->la_bitmap); trace_ocfs2_local_alloc_new_window_result( - OCFS2_LOCAL_ALLOC(alloc)->la_bm_off, + le32_to_cpu(OCFS2_LOCAL_ALLOC(alloc)->la_bm_off), le32_to_cpu(alloc->id1.bitmap1.i_total)); bail: @@ -1244,13 +1266,12 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, * local alloc shutdown won't try to double free main bitmap * bits. Make a copy so the sync function knows which bits to * free. */ - alloc_copy = kmalloc(osb->local_alloc_bh->b_size, GFP_NOFS); + alloc_copy = kmemdup(alloc, osb->local_alloc_bh->b_size, GFP_NOFS); if (!alloc_copy) { status = -ENOMEM; mlog_errno(status); goto bail; } - memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size); status = ocfs2_journal_access_di(handle, INODE_CACHE(local_alloc_inode), @@ -1286,9 +1307,7 @@ bail: brelse(main_bm_bh); - if (main_bm_inode) - iput(main_bm_inode); - + iput(main_bm_inode); kfree(alloc_copy); if (ac) diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h index 1be9b5864460..08f925b7ec6d 100644 --- a/fs/ocfs2/localalloc.h +++ b/fs/ocfs2/localalloc.h @@ -1,26 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * localalloc.h * * Function prototypes * * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #ifndef OCFS2_LOCALALLOC_H @@ -55,6 +39,12 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, u32 *bit_off, u32 *num_bits); +int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 bit_off, + u32 num_bits); + void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb, unsigned int num_clusters); void ocfs2_la_enable_worker(struct work_struct *work); diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c index e57c804069ea..6de944818c56 100644 --- a/fs/ocfs2/locks.c +++ b/fs/ocfs2/locks.c @@ -1,29 +1,14 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * locks.c * * Userspace file locking support * * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/fcntl.h> #include <cluster/masklog.h> @@ -42,7 +27,7 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode, struct ocfs2_file_private *fp = file->private_data; struct ocfs2_lock_res *lockres = &fp->fp_flock; - if (fl->fl_type == F_WRLCK) + if (lock_is_write(fl)) level = 1; if (!IS_SETLKW(cmd)) trylock = 1; @@ -52,6 +37,7 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode, if (lockres->l_flags & OCFS2_LOCK_ATTACHED && lockres->l_level > LKM_NLMODE) { int old_level = 0; + struct file_lock request; if (lockres->l_level == LKM_EXMODE) old_level = 1; @@ -66,8 +52,10 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode, * level. */ - flock_lock_file_wait(file, - &(struct file_lock){.fl_type = F_UNLCK}); + locks_init_lock(&request); + request.c.flc_type = F_UNLCK; + request.c.flc_flags = FL_FLOCK; + locks_lock_file_wait(file, &request); ocfs2_file_unlock(file); } @@ -81,7 +69,9 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode, goto out; } - ret = flock_lock_file_wait(file, fl); + ret = locks_lock_file_wait(file, fl); + if (ret) + ocfs2_file_unlock(file); out: mutex_unlock(&fp->fp_mutex); @@ -96,7 +86,7 @@ static int ocfs2_do_funlock(struct file *file, int cmd, struct file_lock *fl) mutex_lock(&fp->fp_mutex); ocfs2_file_unlock(file); - ret = flock_lock_file_wait(file, fl); + ret = locks_lock_file_wait(file, fl); mutex_unlock(&fp->fp_mutex); return ret; @@ -110,16 +100,14 @@ int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl) struct inode *inode = file->f_mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - if (!(fl->fl_flags & FL_FLOCK)) - return -ENOLCK; - if (__mandatory_lock(inode)) + if (!(fl->c.flc_flags & FL_FLOCK)) return -ENOLCK; if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) || ocfs2_mount_local(osb)) - return flock_lock_file_wait(file, fl); + return locks_lock_file_wait(file, fl); - if (fl->fl_type == F_UNLCK) + if (lock_is_unlock(fl)) return ocfs2_do_funlock(file, cmd, fl); else return ocfs2_do_flock(file, inode, cmd, fl); @@ -130,9 +118,7 @@ int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl) struct inode *inode = file->f_mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - if (!(fl->fl_flags & FL_POSIX)) - return -ENOLCK; - if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) + if (!(fl->c.flc_flags & FL_POSIX)) return -ENOLCK; return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl); diff --git a/fs/ocfs2/locks.h b/fs/ocfs2/locks.h index 496d488b271f..b52de3947d5f 100644 --- a/fs/ocfs2/locks.h +++ b/fs/ocfs2/locks.h @@ -1,26 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * locks.h * * Function prototypes for Userspace file locking support * * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #ifndef OCFS2_LOCKS_H diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 10d66c75cecb..50e2faf64c19 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -1,26 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * mmap.c * * Code to deal with the mess that is clustered mmap. * * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/fs.h> @@ -44,37 +28,39 @@ #include "ocfs2_trace.h" -static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf) +static vm_fault_t ocfs2_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; sigset_t oldset; - int ret; + vm_fault_t ret; ocfs2_block_signals(&oldset); - ret = filemap_fault(area, vmf); + ret = filemap_fault(vmf); ocfs2_unblock_signals(&oldset); - trace_ocfs2_fault(OCFS2_I(area->vm_file->f_mapping->host)->ip_blkno, - area, vmf->page, vmf->pgoff); + trace_ocfs2_fault(OCFS2_I(vma->vm_file->f_mapping->host)->ip_blkno, + vma, vmf->page, vmf->pgoff); return ret; } -static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, - struct page *page) +static vm_fault_t __ocfs2_page_mkwrite(struct file *file, + struct buffer_head *di_bh, struct folio *folio) { - int ret = VM_FAULT_NOPAGE; + int err; + vm_fault_t ret = VM_FAULT_NOPAGE; struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; - loff_t pos = page_offset(page); - unsigned int len = PAGE_CACHE_SIZE; + loff_t pos = folio_pos(folio); + unsigned int len = PAGE_SIZE; pgoff_t last_index; - struct page *locked_page = NULL; + struct folio *locked_folio = NULL; void *fsdata; loff_t size = i_size_read(inode); - last_index = (size - 1) >> PAGE_CACHE_SHIFT; + last_index = (size - 1) >> PAGE_SHIFT; /* - * There are cases that lead to the page no longer bebongs to the + * There are cases that lead to the page no longer belonging to the * mapping. * 1) pagecache truncates locally due to memory pressure. * 2) pagecache truncates when another is taking EX lock against @@ -86,9 +72,9 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, * * Let VM retry with these cases. */ - if ((page->mapping != inode->i_mapping) || - (!PageUptodate(page)) || - (page_offset(page) >= size)) + if ((folio->mapping != inode->i_mapping) || + !folio_test_uptodate(folio) || + (pos >= size)) goto out; /* @@ -101,40 +87,37 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, * worry about ocfs2_write_begin() skipping some buffer reads * because the "write" would invalidate their data. */ - if (page->index == last_index) - len = ((size - 1) & ~PAGE_CACHE_MASK) + 1; - - ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page, - &fsdata, di_bh, page); - if (ret) { - if (ret != -ENOSPC) - mlog_errno(ret); - if (ret == -ENOMEM) - ret = VM_FAULT_OOM; - else - ret = VM_FAULT_SIGBUS; + if (folio->index == last_index) + len = ((size - 1) & ~PAGE_MASK) + 1; + + err = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP, + &locked_folio, &fsdata, di_bh, folio); + if (err) { + if (err != -ENOSPC) + mlog_errno(err); + ret = vmf_error(err); goto out; } - if (!locked_page) { + if (!locked_folio) { ret = VM_FAULT_NOPAGE; goto out; } - ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page, - fsdata); - BUG_ON(ret != len); + err = ocfs2_write_end_nolock(mapping, pos, len, len, fsdata); + BUG_ON(err != len); ret = VM_FAULT_LOCKED; out: return ret; } -static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +static vm_fault_t ocfs2_page_mkwrite(struct vm_fault *vmf) { - struct page *page = vmf->page; - struct inode *inode = file_inode(vma->vm_file); + struct folio *folio = page_folio(vmf->page); + struct inode *inode = file_inode(vmf->vma->vm_file); struct buffer_head *di_bh = NULL; sigset_t oldset; - int ret; + int err; + vm_fault_t ret; sb_start_pagefault(inode->i_sb); ocfs2_block_signals(&oldset); @@ -144,9 +127,10 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) * node. Taking the data lock will also ensure that we don't * attempt page truncation as part of a downconvert. */ - ret = ocfs2_inode_lock(inode, &di_bh, 1); - if (ret < 0) { - mlog_errno(ret); + err = ocfs2_inode_lock(inode, &di_bh, 1); + if (err < 0) { + mlog_errno(err); + ret = vmf_error(err); goto out; } @@ -157,7 +141,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) */ down_write(&OCFS2_I(inode)->ip_alloc_sem); - ret = __ocfs2_page_mkwrite(vma->vm_file, di_bh, page); + ret = __ocfs2_page_mkwrite(vmf->vma->vm_file, di_bh, folio); up_write(&OCFS2_I(inode)->ip_alloc_sem); @@ -173,22 +157,22 @@ out: static const struct vm_operations_struct ocfs2_file_vm_ops = { .fault = ocfs2_fault, .page_mkwrite = ocfs2_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; -int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) +int ocfs2_mmap_prepare(struct vm_area_desc *desc) { + struct file *file = desc->file; int ret = 0, lock_level = 0; ret = ocfs2_inode_lock_atime(file_inode(file), - file->f_path.mnt, &lock_level); + file->f_path.mnt, &lock_level, 1); if (ret < 0) { mlog_errno(ret); goto out; } ocfs2_inode_unlock(file_inode(file), lock_level); out: - vma->vm_ops = &ocfs2_file_vm_ops; + desc->vm_ops = &ocfs2_file_vm_ops; return 0; } diff --git a/fs/ocfs2/mmap.h b/fs/ocfs2/mmap.h index 1274ee0f1fe2..d21c30de6b8c 100644 --- a/fs/ocfs2/mmap.h +++ b/fs/ocfs2/mmap.h @@ -1,6 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef OCFS2_MMAP_H #define OCFS2_MMAP_H -int ocfs2_mmap(struct file *file, struct vm_area_struct *vma); +int ocfs2_mmap_prepare(struct vm_area_desc *desc); #endif /* OCFS2_MMAP_H */ diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index f1fc172175b6..ce978a2497d9 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -1,18 +1,8 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-only +/* * move_extents.c * * Copyright (C) 2011 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include <linux/fs.h> #include <linux/types.h> @@ -25,6 +15,7 @@ #include "ocfs2_ioctl.h" #include "alloc.h" +#include "localalloc.h" #include "aops.h" #include "dlmglue.h" #include "extent_map.h" @@ -69,7 +60,7 @@ static int __ocfs2_move_extent(handle_t *handle, u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci); u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos); - ret = ocfs2_duplicate_clusters_by_page(handle, context->file, cpos, + ret = ocfs2_duplicate_clusters_by_page(handle, inode, cpos, p_cpos, new_p_cpos, len); if (ret) { mlog_errno(ret); @@ -98,32 +89,28 @@ static int __ocfs2_move_extent(handle_t *handle, el = path_leaf_el(path); index = ocfs2_search_extent_list(el, cpos); - if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { - ocfs2_error(inode->i_sb, - "Inode %llu has an extent at cpos %u which can no " - "longer be found.\n", - (unsigned long long)ino, cpos); - ret = -EROFS; + if (index == -1) { + ret = ocfs2_error(inode->i_sb, + "Inode %llu has an extent at cpos %u which can no longer be found\n", + (unsigned long long)ino, cpos); goto out; } rec = &el->l_recs[index]; - BUG_ON(ext_flags != rec->e_flags); + if (ext_flags != rec->e_flags) { + ret = ocfs2_error(inode->i_sb, + "Inode %llu has corrupted extent %d with flags 0x%x at cpos %u\n", + (unsigned long long)ino, index, rec->e_flags, cpos); + goto out; + } + /* * after moving/defraging to new location, the extent is not going * to be refcounted anymore. */ replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED; - ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), - context->et.et_root_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret) { - mlog_errno(ret); - goto out; - } - ret = ocfs2_split_extent(handle, &context->et, path, index, &replace_rec, context->meta_ac, &context->dealloc); @@ -132,8 +119,6 @@ static int __ocfs2_move_extent(handle_t *handle, goto out; } - ocfs2_journal_dirty(handle, context->et.et_root_bh); - context->new_phys_cpos = new_p_cpos; /* @@ -151,23 +136,21 @@ static int __ocfs2_move_extent(handle_t *handle, old_blkno, len); } + ocfs2_update_inode_fsync_trans(handle, inode, 0); out: + ocfs2_free_path(path); return ret; } /* - * lock allocators, and reserving appropriate number of bits for - * meta blocks and data clusters. - * - * in some cases, we don't need to reserve clusters, just let data_ac - * be NULL. + * lock allocator, and reserve appropriate number of bits for + * meta blocks. */ -static int ocfs2_lock_allocators_move_extents(struct inode *inode, +static int ocfs2_lock_meta_allocator_move_extents(struct inode *inode, struct ocfs2_extent_tree *et, u32 clusters_to_move, u32 extents_to_split, struct ocfs2_alloc_context **meta_ac, - struct ocfs2_alloc_context **data_ac, int extra_blocks, int *credits) { @@ -175,7 +158,7 @@ static int ocfs2_lock_allocators_move_extents(struct inode *inode, unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - num_free_extents = ocfs2_num_free_extents(osb, et); + num_free_extents = ocfs2_num_free_extents(et); if (num_free_extents < 0) { ret = num_free_extents; mlog_errno(ret); @@ -192,16 +175,8 @@ static int ocfs2_lock_allocators_move_extents(struct inode *inode, goto out; } - if (data_ac) { - ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac); - if (ret) { - mlog_errno(ret); - goto out; - } - } - *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el, - clusters_to_move + 2); + *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el); mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n", extra_blocks, clusters_to_move, *credits); @@ -234,12 +209,10 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, struct ocfs2_refcount_tree *ref_tree = NULL; u32 new_phys_cpos, new_len; u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); + int need_free = 0; if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) { - - BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & - OCFS2_HAS_REFCOUNT_FL)); - + BUG_ON(!ocfs2_is_refcount_inode(inode)); BUG_ON(!context->refcount_loc); ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, @@ -261,10 +234,10 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, } } - ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1, - &context->meta_ac, - &context->data_ac, - extra_blocks, &credits); + ret = ocfs2_lock_meta_allocator_move_extents(inode, &context->et, + *len, 1, + &context->meta_ac, + extra_blocks, &credits); if (ret) { mlog_errno(ret); goto out; @@ -277,7 +250,7 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, * context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; */ - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); if (ocfs2_truncate_log_needs_flush(osb)) { ret = __ocfs2_flush_truncate_log(osb); @@ -287,6 +260,21 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, } } + /* + * Make sure ocfs2_reserve_cluster is called after + * __ocfs2_flush_truncate_log, otherwise, dead lock may happen. + * + * If ocfs2_reserve_cluster is called + * before __ocfs2_flush_truncate_log, dead lock on global bitmap + * may happen. + * + */ + ret = ocfs2_reserve_clusters(osb, *len, &context->data_ac); + if (ret) { + mlog_errno(ret); + goto out_unlock_mutex; + } + handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); @@ -312,6 +300,7 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, if (!partial) { context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE; ret = -ENOSPC; + need_free = 1; goto out_commit; } } @@ -336,10 +325,24 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, mlog_errno(ret); out_commit: + if (need_free && context->data_ac) { + struct ocfs2_alloc_context *data_ac = context->data_ac; + + if (context->data_ac->ac_which == OCFS2_AC_USE_LOCAL) + ocfs2_free_local_alloc_bits(osb, handle, data_ac, + new_phys_cpos, new_len); + else + ocfs2_free_clusters(handle, + data_ac->ac_inode, + data_ac->ac_bh, + ocfs2_clusters_to_blocks(osb->sb, new_phys_cpos), + new_len); + } + ocfs2_commit_trans(osb, handle); out_unlock_mutex: - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); if (context->data_ac) { ocfs2_free_alloc_context(context->data_ac); @@ -367,7 +370,7 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode, int *vict_bit, struct buffer_head **ret_bh) { - int ret, i, bits_per_unit = 0; + int ret, i, len, bits_per_unit = 0; u64 blkno; char namebuf[40]; @@ -378,9 +381,9 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode, struct ocfs2_dinode *ac_dinode; struct ocfs2_group_desc *bg; - ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot); - ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, - strlen(namebuf), &blkno); + len = ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot); + ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, len, &blkno); + if (ret) { ret = -ENOENT; goto out; @@ -403,7 +406,7 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode, * 'vict_blkno' was out of the valid range. */ if ((vict_blkno < le64_to_cpu(rec->c_blkno)) || - (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) << + (vict_blkno >= ((u64)le32_to_cpu(ac_dinode->id1.bitmap1.i_total) << bits_per_unit))) { ret = -EINVAL; goto out; @@ -437,7 +440,7 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode, bg = (struct ocfs2_group_desc *)gd_bh->b_data; if (vict_blkno < (le64_to_cpu(bg->bg_blkno) + - le16_to_cpu(bg->bg_bits))) { + (le16_to_cpu(bg->bg_bits) << bits_per_unit))) { *ret_bh = gd_bh; *vict_bit = (vict_blkno - blkno) >> @@ -495,7 +498,7 @@ static int ocfs2_validate_and_adjust_move_goal(struct inode *inode, bg = (struct ocfs2_group_desc *)gd_bh->b_data; /* - * moving goal is not allowd to start with a group desc blok(#0 blk) + * moving goal is not allowed to start with a group desc blok(#0 blk) * let's compromise to the latter cluster. */ if (range->me_goal == le64_to_cpu(bg->bg_blkno)) @@ -552,6 +555,7 @@ static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh, last_free_bits++; if (last_free_bits == move_len) { + i -= move_len; *goal_bit = i; *phys_cpos = base_cpos + i; break; @@ -561,83 +565,6 @@ static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh, mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos); } -static int ocfs2_alloc_dinode_update_counts(struct inode *inode, - handle_t *handle, - struct buffer_head *di_bh, - u32 num_bits, - u16 chain) -{ - int ret; - u32 tmp_used; - struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; - struct ocfs2_chain_list *cl = - (struct ocfs2_chain_list *) &di->id2.i_chain; - - ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret < 0) { - mlog_errno(ret); - goto out; - } - - tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); - di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); - le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); - ocfs2_journal_dirty(handle, di_bh); - -out: - return ret; -} - -static inline int ocfs2_block_group_set_bits(handle_t *handle, - struct inode *alloc_inode, - struct ocfs2_group_desc *bg, - struct buffer_head *group_bh, - unsigned int bit_off, - unsigned int num_bits) -{ - int status; - void *bitmap = bg->bg_bitmap; - int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; - - /* All callers get the descriptor via - * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ - BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); - BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); - - mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off, - num_bits); - - if (ocfs2_is_cluster_bitmap(alloc_inode)) - journal_type = OCFS2_JOURNAL_ACCESS_UNDO; - - status = ocfs2_journal_access_gd(handle, - INODE_CACHE(alloc_inode), - group_bh, - journal_type); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - le16_add_cpu(&bg->bg_free_bits_count, -num_bits); - if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { - ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" - " count %u but claims %u are freed. num_bits %d", - (unsigned long long)le64_to_cpu(bg->bg_blkno), - le16_to_cpu(bg->bg_bits), - le16_to_cpu(bg->bg_free_bits_count), num_bits); - return -EROFS; - } - while (num_bits--) - ocfs2_set_bit(bit_off++, bitmap); - - ocfs2_journal_dirty(handle, group_bh); - -bail: - return status; -} - static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, u32 cpos, u32 phys_cpos, u32 *new_phys_cpos, u32 len, int ext_flags) @@ -659,10 +586,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) { - - BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & - OCFS2_HAS_REFCOUNT_FL)); - + BUG_ON(!ocfs2_is_refcount_inode(inode)); BUG_ON(!context->refcount_loc); ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, @@ -684,9 +608,10 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, } } - ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1, - &context->meta_ac, - NULL, extra_blocks, &credits); + ret = ocfs2_lock_meta_allocator_move_extents(inode, &context->et, + len, 1, + &context->meta_ac, + extra_blocks, &credits); if (ret) { mlog_errno(ret); goto out; @@ -698,6 +623,8 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, */ credits += OCFS2_INODE_UPDATE_CREDITS + 1; + inode_lock(tl_inode); + /* * ocfs2_move_extent() didn't reserve any clusters in lock_allocators() * logic, while we still need to lock the global_bitmap. @@ -707,24 +634,22 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, if (!gb_inode) { mlog(ML_ERROR, "unable to get global_bitmap inode\n"); ret = -EIO; - goto out; + goto out_unlock_tl_inode; } - mutex_lock(&gb_inode->i_mutex); + inode_lock(gb_inode); ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1); if (ret) { mlog_errno(ret); - goto out_unlock_gb_mutex; + goto out_unlock_gb_inode; } - mutex_lock(&tl_inode->i_mutex); - handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); mlog_errno(ret); - goto out_unlock_tl_inode; + goto out_unlock; } new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos); @@ -739,7 +664,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, /* * probe the victim cluster group to find a proper - * region to fit wanted movement, it even will perfrom + * region to fit wanted movement, it even will perform * a best-effort attempt by compromising to a threshold * around the goal. */ @@ -766,9 +691,12 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, } ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh, - goal_bit, len); - if (ret) + goal_bit, len, 0, 0); + if (ret) { + ocfs2_rollback_alloc_dinode_counts(gb_inode, gb_bh, len, + le16_to_cpu(gd->bg_chain)); mlog_errno(ret); + } /* * Here we should write the new page out first if we are @@ -781,15 +709,14 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, out_commit: ocfs2_commit_trans(osb, handle); brelse(gd_bh); - -out_unlock_tl_inode: - mutex_unlock(&tl_inode->i_mutex); - +out_unlock: ocfs2_inode_unlock(gb_inode, 1); -out_unlock_gb_mutex: - mutex_unlock(&gb_inode->i_mutex); +out_unlock_gb_inode: + inode_unlock(gb_inode); brelse(gb_bh); iput(gb_inode); +out_unlock_tl_inode: + inode_unlock(tl_inode); out: if (context->meta_ac) { @@ -845,7 +772,7 @@ static int __ocfs2_move_extents_range(struct buffer_head *di_bh, struct ocfs2_move_extents *range = context->range; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - if ((inode->i_size == 0) || (range->me_len == 0)) + if ((i_size_read(inode) == 0) || (range->me_len == 0)) return 0; if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) @@ -946,6 +873,11 @@ static int __ocfs2_move_extents_range(struct buffer_head *di_bh, mlog_errno(ret); goto out; } + /* + * Invalidate extent cache after moving/defragging to prevent + * stale cached data with outdated extent flags. + */ + ocfs2_extent_map_trunc(inode, cpos); context->clusters_moved += alloc_size; next: @@ -977,13 +909,10 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) struct buffer_head *di_bh = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - if (!inode) - return -ENOENT; - if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) return -EROFS; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * This prevents concurrent writes from other nodes @@ -1001,7 +930,7 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) } /* - * rememer ip_xattr_sem also needs to be held if necessary + * remember ip_xattr_sem also needs to be held if necessary */ down_write(&OCFS2_I(inode)->ip_alloc_sem); @@ -1031,9 +960,10 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) } di = (struct ocfs2_dinode *)di_bh->b_data; - inode->i_ctime = CURRENT_TIME; - di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + inode_set_ctime_current(inode); + di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); + di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, di_bh); @@ -1046,7 +976,7 @@ out_inode_unlock: out_rw_unlock: ocfs2_rw_unlock(inode, 1); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return status; } @@ -1066,8 +996,10 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp) if (status) return status; - if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) + if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) { + status = -EPERM; goto out_drop; + } if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { status = -EPERM; @@ -1089,26 +1021,35 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp) goto out_free; } - if (range.me_start > i_size_read(inode)) + if (range.me_start > i_size_read(inode)) { + status = -EINVAL; goto out_free; + } if (range.me_start + range.me_len > i_size_read(inode)) range.me_len = i_size_read(inode) - range.me_start; context->range = ⦥ + /* + * ok, the default threshold for the defragmentation + * is 1M, since our maximum clustersize was 1M also. + * any thought? + */ + if (!range.me_threshold) + range.me_threshold = 1024 * 1024; + + if (range.me_threshold > i_size_read(inode)) + range.me_threshold = i_size_read(inode); + + if (range.me_flags & ~(OCFS2_MOVE_EXT_FL_AUTO_DEFRAG | + OCFS2_MOVE_EXT_FL_PART_DEFRAG)) { + status = -EINVAL; + goto out_free; + } + if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) { context->auto_defrag = 1; - /* - * ok, the default theshold for the defragmentation - * is 1M, since our maximum clustersize was 1M also. - * any thought? - */ - if (!range.me_threshold) - range.me_threshold = 1024 * 1024; - - if (range.me_threshold > i_size_read(inode)) - range.me_threshold = i_size_read(inode); if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG) context->partial = 1; diff --git a/fs/ocfs2/move_extents.h b/fs/ocfs2/move_extents.h index 4e143e811441..987f9e559f30 100644 --- a/fs/ocfs2/move_extents.h +++ b/fs/ocfs2/move_extents.h @@ -1,18 +1,8 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-only */ +/* * move_extents.h * * Copyright (C) 2011 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #ifndef OCFS2_MOVE_EXTENTS_H #define OCFS2_MOVE_EXTENTS_H diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index be3f8676a438..c90b254da75e 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1,6 +1,5 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * namei.c * * Create and rename file, directory, symlinks @@ -19,21 +18,6 @@ * linux/fs/minix/dir.c * * Copyright (C) 1991, 1992 Linux Torvalds - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/fs.h> @@ -41,6 +25,7 @@ #include <linux/slab.h> #include <linux/highmem.h> #include <linux/quotaops.h> +#include <linux/iversion.h> #include <cluster/masklog.h> @@ -63,6 +48,7 @@ #include "xattr.h" #include "acl.h" #include "ocfs2_trace.h" +#include "ioctl.h" #include "buffer_head_io.h" @@ -79,7 +65,8 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, struct inode **ret_orphan_dir, u64 blkno, char *name, - struct ocfs2_dir_lookup_result *lookup); + struct ocfs2_dir_lookup_result *lookup, + bool dio); static int ocfs2_orphan_add(struct ocfs2_super *osb, handle_t *handle, @@ -87,13 +74,22 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, struct buffer_head *fe_bh, char *name, struct ocfs2_dir_lookup_result *lookup, - struct inode *orphan_dir_inode); + struct inode *orphan_dir_inode, + bool dio); static int ocfs2_create_symlink_data(struct ocfs2_super *osb, handle_t *handle, struct inode *inode, const char *symname); +static int ocfs2_double_lock(struct ocfs2_super *osb, + struct buffer_head **bh1, + struct inode *inode1, + struct buffer_head **bh2, + struct inode *inode2, + int rename); + +static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2); /* An orphan dir name is an 8 byte value, printed as a hex string */ #define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64))) @@ -146,6 +142,8 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, bail_add: ret = d_splice_alias(inode, dentry); + if (IS_ERR(ret)) + goto bail_unlock; if (inode) { /* @@ -165,8 +163,9 @@ bail_add: OCFS2_I(dir)->ip_blkno); if (status) { mlog_errno(status); + if (ret) + dput(ret); ret = ERR_PTR(status); - goto bail_unlock; } } else ocfs2_dentry_attach_gen(dentry); @@ -188,11 +187,12 @@ bail: static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode) { struct inode *inode; + int status; inode = new_inode(dir->i_sb); if (!inode) { mlog(ML_ERROR, "new_inode failed!\n"); - return NULL; + return ERR_PTR(-ENOMEM); } /* populate as many fields early on as possible - many of @@ -200,12 +200,34 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode) * callers. */ if (S_ISDIR(mode)) set_nlink(inode, 2); - inode_init_owner(inode, dir, mode); - dquot_initialize(inode); + mode = mode_strip_sgid(&nop_mnt_idmap, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); + status = dquot_initialize(inode); + if (status) { + iput(inode); + return ERR_PTR(status); + } + return inode; } -static int ocfs2_mknod(struct inode *dir, +static void ocfs2_cleanup_add_entry_failure(struct ocfs2_super *osb, + struct dentry *dentry, struct inode *inode) +{ + struct ocfs2_dentry_lock *dl = dentry->d_fsdata; + + ocfs2_simple_drop_lockres(osb, &dl->dl_lockres); + ocfs2_lock_res_free(&dl->dl_lockres); + BUG_ON(dl->dl_count != 1); + spin_lock(&dentry_attach_lock); + dentry->d_fsdata = NULL; + spin_unlock(&dentry_attach_lock); + kfree(dl); + iput(inode); +} + +static int ocfs2_mknod(struct mnt_idmap *idmap, + struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) @@ -215,6 +237,7 @@ static int ocfs2_mknod(struct inode *dir, handle_t *handle = NULL; struct ocfs2_super *osb; struct ocfs2_dinode *dirfe; + struct ocfs2_dinode *fe = NULL; struct buffer_head *new_fe_bh = NULL; struct inode *inode = NULL; struct ocfs2_alloc_context *inode_ac = NULL; @@ -224,18 +247,24 @@ static int ocfs2_mknod(struct inode *dir, int want_meta = 0; int xattr_credits = 0; struct ocfs2_security_xattr_info si = { + .name = NULL, .enable = 1, }; int did_quota_inode = 0; struct ocfs2_dir_lookup_result lookup = { NULL, }; sigset_t oldset; int did_block_signals = 0; + struct ocfs2_dentry_lock *dl = NULL; trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno, (unsigned long)dev, mode); - dquot_initialize(dir); + status = dquot_initialize(dir); + if (status) { + mlog_errno(status); + return status; + } /* get our super block */ osb = OCFS2_SB(dir->i_sb); @@ -282,8 +311,9 @@ static int ocfs2_mknod(struct inode *dir, } inode = ocfs2_get_init_inode(dir, mode); - if (!inode) { - status = -ENOMEM; + if (IS_ERR(inode)) { + status = PTR_ERR(inode); + inode = NULL; mlog_errno(status); goto leave; } @@ -359,6 +389,7 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } + fe = (struct ocfs2_dinode *) new_fe_bh->b_data; if (S_ISDIR(mode)) { status = ocfs2_fill_new_dir(osb, handle, dir, inode, new_fe_bh, data_ac, meta_ac); @@ -380,10 +411,11 @@ static int ocfs2_mknod(struct inode *dir, } status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh, - meta_ac, data_ac); + meta_ac, data_ac); + if (status < 0) { mlog_errno(status); - goto leave; + goto roll_back; } if (si.enable) { @@ -391,7 +423,7 @@ static int ocfs2_mknod(struct inode *dir, meta_ac, data_ac); if (status < 0) { mlog_errno(status); - goto leave; + goto roll_back; } } @@ -404,25 +436,37 @@ static int ocfs2_mknod(struct inode *dir, OCFS2_I(dir)->ip_blkno); if (status) { mlog_errno(status); - goto leave; + goto roll_back; } + dl = dentry->d_fsdata; + status = ocfs2_add_entry(handle, dentry, inode, OCFS2_I(inode)->ip_blkno, parent_fe_bh, &lookup); if (status < 0) { mlog_errno(status); - goto leave; + goto roll_back; } insert_inode_hash(inode); d_instantiate(dentry, inode); status = 0; + +roll_back: + if (status < 0 && S_ISDIR(mode)) { + ocfs2_add_links_count(dirfe, -1); + drop_nlink(dir); + } + leave: if (status < 0 && did_quota_inode) dquot_free_inode(inode); - if (handle) + if (handle) { + if (status < 0 && fe) + ocfs2_set_links_count(fe, 0); ocfs2_commit_trans(osb, handle); + } ocfs2_inode_unlock(dir, 1); if (did_block_signals) @@ -430,7 +474,6 @@ leave: brelse(new_fe_bh); brelse(parent_fe_bh); - kfree(si.name); kfree(si.value); ocfs2_free_dir_lookup_result(&lookup); @@ -445,11 +488,14 @@ leave: ocfs2_free_alloc_context(meta_ac); /* - * We should call iput after the i_mutex of the bitmap been + * We should call iput after the i_rwsem of the bitmap been * unlocked in ocfs2_free_alloc_context, or the * ocfs2_delete_inode will mutex_lock again. */ if ((status < 0) && inode) { + if (dl) + ocfs2_cleanup_add_entry_failure(osb, dentry, inode); + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR; clear_nlink(inode); iput(inode); @@ -465,7 +511,6 @@ static int __ocfs2_mknod_locked(struct inode *dir, struct inode *inode, dev_t dev, struct buffer_head **new_fe_bh, - struct buffer_head *parent_fe_bh, handle_t *handle, struct ocfs2_alloc_context *inode_ac, u64 fe_blkno, u64 suballoc_loc, u16 suballoc_bit) @@ -475,6 +520,8 @@ static int __ocfs2_mknod_locked(struct inode *dir, struct ocfs2_dinode *fe = NULL; struct ocfs2_extent_list *fel; u16 feat; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct timespec64 ts; *new_fe_bh = NULL; @@ -482,14 +529,14 @@ static int __ocfs2_mknod_locked(struct inode *dir, * these are used by the support functions here and in * callers. */ inode->i_ino = ino_from_blkno(osb->sb, fe_blkno); - OCFS2_I(inode)->ip_blkno = fe_blkno; + oi->ip_blkno = fe_blkno; spin_lock(&osb->osb_lock); inode->i_generation = osb->s_next_generation++; spin_unlock(&osb->osb_lock); *new_fe_bh = sb_getblk(osb->sb, fe_blkno); if (!*new_fe_bh) { - status = -EIO; + status = -ENOMEM; mlog_errno(status); goto leave; } @@ -523,10 +570,11 @@ static int __ocfs2_mknod_locked(struct inode *dir, fe->i_last_eb_blk = 0; strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE); fe->i_flags |= cpu_to_le32(OCFS2_VALID_FL); + ktime_get_coarse_real_ts64(&ts); fe->i_atime = fe->i_ctime = fe->i_mtime = - cpu_to_le64(CURRENT_TIME.tv_sec); + cpu_to_le64(ts.tv_sec); fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec = - cpu_to_le32(CURRENT_TIME.tv_nsec); + cpu_to_le32(ts.tv_nsec); fe->i_dtime = 0; /* @@ -556,8 +604,7 @@ static int __ocfs2_mknod_locked(struct inode *dir, mlog_errno(status); } - status = 0; /* error in ocfs2_create_new_inode_locks is not - * critical */ + ocfs2_update_inode_fsync_trans(handle, inode, 1); leave: if (status < 0) { @@ -596,26 +643,28 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, } return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, - parent_fe_bh, handle, inode_ac, - fe_blkno, suballoc_loc, suballoc_bit); + handle, inode_ac, fe_blkno, + suballoc_loc, suballoc_bit); } -static int ocfs2_mkdir(struct inode *dir, - struct dentry *dentry, - umode_t mode) +static struct dentry *ocfs2_mkdir(struct mnt_idmap *idmap, + struct inode *dir, + struct dentry *dentry, + umode_t mode) { int ret; trace_ocfs2_mkdir(dir, dentry, dentry->d_name.len, dentry->d_name.name, OCFS2_I(dir)->ip_blkno, mode); - ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0); + ret = ocfs2_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0); if (ret) mlog_errno(ret); - return ret; + return ERR_PTR(ret); } -static int ocfs2_create(struct inode *dir, +static int ocfs2_create(struct mnt_idmap *idmap, + struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) @@ -624,7 +673,7 @@ static int ocfs2_create(struct inode *dir, trace_ocfs2_create(dir, dentry, dentry->d_name.len, dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno, mode); - ret = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0); + ret = ocfs2_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFREG, 0); if (ret) mlog_errno(ret); @@ -636,14 +685,17 @@ static int ocfs2_link(struct dentry *old_dentry, struct dentry *dentry) { handle_t *handle; - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); + struct inode *old_dir = d_inode(old_dentry->d_parent); int err; struct buffer_head *fe_bh = NULL; + struct buffer_head *old_dir_bh = NULL; struct buffer_head *parent_fe_bh = NULL; struct ocfs2_dinode *fe = NULL; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); struct ocfs2_dir_lookup_result lookup = { NULL, }; sigset_t oldset; + u64 old_de_ino; trace_ocfs2_link((unsigned long long)OCFS2_I(inode)->ip_blkno, old_dentry->d_name.len, old_dentry->d_name.name, @@ -652,20 +704,54 @@ static int ocfs2_link(struct dentry *old_dentry, if (S_ISDIR(inode->i_mode)) return -EPERM; - dquot_initialize(dir); + err = dquot_initialize(dir); + if (err) { + mlog_errno(err); + return err; + } - err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT); + err = ocfs2_double_lock(osb, &old_dir_bh, old_dir, + &parent_fe_bh, dir, 0); if (err < 0) { if (err != -ENOENT) mlog_errno(err); return err; } + /* make sure both dirs have bhs + * get an extra ref on old_dir_bh if old==new */ + if (!parent_fe_bh) { + if (old_dir_bh) { + parent_fe_bh = old_dir_bh; + get_bh(parent_fe_bh); + } else { + mlog(ML_ERROR, "%s: no old_dir_bh!\n", osb->uuid_str); + err = -EIO; + goto out; + } + } + if (!dir->i_nlink) { err = -ENOENT; goto out; } + err = ocfs2_lookup_ino_from_name(old_dir, old_dentry->d_name.name, + old_dentry->d_name.len, &old_de_ino); + if (err) { + err = -ENOENT; + goto out; + } + + /* + * Check whether another node removed the source inode while we + * were in the vfs. + */ + if (old_de_ino != OCFS2_I(inode)->ip_blkno) { + err = -ENOENT; + goto out; + } + err = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, dentry->d_name.len); if (err) @@ -711,10 +797,11 @@ static int ocfs2_link(struct dentry *old_dentry, } inc_nlink(inode); - inode->i_ctime = CURRENT_TIME; + inode_set_ctime_current(inode); ocfs2_set_links_count(fe, inode->i_nlink); - fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + fe->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); + fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, fe_bh); err = ocfs2_add_entry(handle, dentry, inode, @@ -743,10 +830,11 @@ out_unlock_inode: ocfs2_inode_unlock(inode, 1); out: - ocfs2_inode_unlock(dir, 1); + ocfs2_double_unlock(old_dir, dir); brelse(fe_bh); brelse(parent_fe_bh); + brelse(old_dir_bh); ocfs2_free_dir_lookup_result(&lookup); @@ -792,7 +880,7 @@ static int ocfs2_unlink(struct inode *dir, int status; int child_locked = 0; bool is_unlinkable = false; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct inode *orphan_dir = NULL; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); u64 blkno; @@ -809,9 +897,13 @@ static int ocfs2_unlink(struct inode *dir, (unsigned long long)OCFS2_I(dir)->ip_blkno, (unsigned long long)OCFS2_I(inode)->ip_blkno); - dquot_initialize(dir); + status = dquot_initialize(dir); + if (status) { + mlog_errno(status); + return status; + } - BUG_ON(dentry->d_parent->d_inode != dir); + BUG_ON(d_inode(dentry->d_parent) != dir); if (inode == osb->root_inode) return -EPERM; @@ -869,7 +961,8 @@ static int ocfs2_unlink(struct inode *dir, if (ocfs2_inode_is_unlinkable(inode)) { status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, OCFS2_I(inode)->ip_blkno, - orphan_name, &orphan_insert); + orphan_name, &orphan_insert, + false); if (status < 0) { mlog_errno(status); goto leave; @@ -905,9 +998,10 @@ static int ocfs2_unlink(struct inode *dir, drop_nlink(inode); drop_nlink(inode); ocfs2_set_links_count(fe, inode->i_nlink); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, fe_bh); - dir->i_ctime = dir->i_mtime = CURRENT_TIME; + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); if (S_ISDIR(inode->i_mode)) drop_nlink(dir); @@ -921,7 +1015,7 @@ static int ocfs2_unlink(struct inode *dir, if (is_unlinkable) { status = ocfs2_orphan_add(osb, handle, inode, fe_bh, - orphan_name, &orphan_insert, orphan_dir); + orphan_name, &orphan_insert, orphan_dir, false); if (status < 0) mlog_errno(status); } @@ -930,45 +1024,104 @@ leave: if (handle) ocfs2_commit_trans(osb, handle); - if (child_locked) - ocfs2_inode_unlock(inode, 1); - - ocfs2_inode_unlock(dir, 1); - if (orphan_dir) { /* This was locked for us in ocfs2_prepare_orphan_dir() */ ocfs2_inode_unlock(orphan_dir, 1); - mutex_unlock(&orphan_dir->i_mutex); + inode_unlock(orphan_dir); iput(orphan_dir); } + if (child_locked) + ocfs2_inode_unlock(inode, 1); + + ocfs2_inode_unlock(dir, 1); + brelse(fe_bh); brelse(parent_node_bh); ocfs2_free_dir_lookup_result(&orphan_insert); ocfs2_free_dir_lookup_result(&lookup); - if (status && (status != -ENOTEMPTY)) + if (status && (status != -ENOTEMPTY) && (status != -ENOENT)) mlog_errno(status); return status; } +static int ocfs2_check_if_ancestor(struct ocfs2_super *osb, + u64 src_inode_no, u64 dest_inode_no) +{ + int ret = 0, i = 0; + u64 parent_inode_no = 0; + u64 child_inode_no = src_inode_no; + struct inode *child_inode; + +#define MAX_LOOKUP_TIMES 32 + while (1) { + child_inode = ocfs2_iget(osb, child_inode_no, 0, 0); + if (IS_ERR(child_inode)) { + ret = PTR_ERR(child_inode); + break; + } + + ret = ocfs2_inode_lock(child_inode, NULL, 0); + if (ret < 0) { + iput(child_inode); + if (ret != -ENOENT) + mlog_errno(ret); + break; + } + + ret = ocfs2_lookup_ino_from_name(child_inode, "..", 2, + &parent_inode_no); + ocfs2_inode_unlock(child_inode, 0); + iput(child_inode); + if (ret < 0) { + ret = -ENOENT; + break; + } + + if (parent_inode_no == dest_inode_no) { + ret = 1; + break; + } + + if (parent_inode_no == osb->root_inode->i_ino) { + ret = 0; + break; + } + + child_inode_no = parent_inode_no; + + if (++i >= MAX_LOOKUP_TIMES) { + mlog_ratelimited(ML_NOTICE, "max lookup times reached, " + "filesystem may have nested directories, " + "src inode: %llu, dest inode: %llu.\n", + (unsigned long long)src_inode_no, + (unsigned long long)dest_inode_no); + ret = 0; + break; + } + } + + return ret; +} + /* - * The only place this should be used is rename! + * The only place this should be used is rename and link! * if they have the same id, then the 1st one is the only one locked. */ static int ocfs2_double_lock(struct ocfs2_super *osb, struct buffer_head **bh1, struct inode *inode1, struct buffer_head **bh2, - struct inode *inode2) + struct inode *inode2, + int rename) { int status; + int inode1_is_ancestor, inode2_is_ancestor; struct ocfs2_inode_info *oi1 = OCFS2_I(inode1); struct ocfs2_inode_info *oi2 = OCFS2_I(inode2); - struct buffer_head **tmpbh; - struct inode *tmpinode; trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno, (unsigned long long)oi2->ip_blkno); @@ -978,21 +1131,33 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, if (*bh2) *bh2 = NULL; - /* we always want to lock the one with the lower lockid first. */ + /* we always want to lock the one with the lower lockid first. + * and if they are nested, we lock ancestor first */ if (oi1->ip_blkno != oi2->ip_blkno) { - if (oi1->ip_blkno < oi2->ip_blkno) { - /* switch id1 and id2 around */ - tmpbh = bh2; - bh2 = bh1; - bh1 = tmpbh; + inode1_is_ancestor = ocfs2_check_if_ancestor(osb, oi2->ip_blkno, + oi1->ip_blkno); + if (inode1_is_ancestor < 0) { + status = inode1_is_ancestor; + goto bail; + } - tmpinode = inode2; - inode2 = inode1; - inode1 = tmpinode; + inode2_is_ancestor = ocfs2_check_if_ancestor(osb, oi1->ip_blkno, + oi2->ip_blkno); + if (inode2_is_ancestor < 0) { + status = inode2_is_ancestor; + goto bail; + } + + if ((inode1_is_ancestor == 1) || + (oi1->ip_blkno < oi2->ip_blkno && + inode2_is_ancestor == 0)) { + /* switch id1 and id2 around */ + swap(bh2, bh1); + swap(inode2, inode1); } /* lock id2 */ status = ocfs2_inode_lock_nested(inode2, bh2, 1, - OI_LS_RENAME1); + rename == 1 ? OI_LS_RENAME1 : OI_LS_PARENT); if (status < 0) { if (status != -ENOENT) mlog_errno(status); @@ -1001,7 +1166,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, } /* lock id1 */ - status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_RENAME2); + status = ocfs2_inode_lock_nested(inode1, bh1, 1, + rename == 1 ? OI_LS_RENAME2 : OI_LS_PARENT); if (status < 0) { /* * An error return must mean that no cluster locks @@ -1018,8 +1184,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, } trace_ocfs2_double_lock_end( - (unsigned long long)OCFS2_I(inode1)->ip_blkno, - (unsigned long long)OCFS2_I(inode2)->ip_blkno); + (unsigned long long)oi1->ip_blkno, + (unsigned long long)oi2->ip_blkno); bail: if (status) @@ -1035,15 +1201,17 @@ static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2) ocfs2_inode_unlock(inode2, 1); } -static int ocfs2_rename(struct inode *old_dir, +static int ocfs2_rename(struct mnt_idmap *idmap, + struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, - struct dentry *new_dentry) + struct dentry *new_dentry, + unsigned int flags) { int status = 0, rename_lock = 0, parents_locked = 0, target_exists = 0; int old_child_locked = 0, new_child_locked = 0, update_dot_dot = 0; - struct inode *old_inode = old_dentry->d_inode; - struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); struct inode *orphan_dir = NULL; struct ocfs2_dinode *newfe = NULL; char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; @@ -1061,6 +1229,10 @@ static int ocfs2_rename(struct inode *old_dir, struct ocfs2_dir_lookup_result old_entry_lookup = { NULL, }; struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; struct ocfs2_dir_lookup_result target_insert = { NULL, }; + bool should_add_orphan = false; + + if (flags) + return -EINVAL; /* At some point it might be nice to break this function up a * bit. */ @@ -1069,8 +1241,16 @@ static int ocfs2_rename(struct inode *old_dir, old_dentry->d_name.len, old_dentry->d_name.name, new_dentry->d_name.len, new_dentry->d_name.name); - dquot_initialize(old_dir); - dquot_initialize(new_dir); + status = dquot_initialize(old_dir); + if (status) { + mlog_errno(status); + goto bail; + } + status = dquot_initialize(new_dir); + if (status) { + mlog_errno(status); + goto bail; + } osb = OCFS2_SB(old_dir->i_sb); @@ -1097,17 +1277,37 @@ static int ocfs2_rename(struct inode *old_dir, goto bail; } rename_lock = 1; + + /* here we cannot guarantee the inodes haven't just been + * changed, so check if they are nested again */ + status = ocfs2_check_if_ancestor(osb, new_dir->i_ino, + old_inode->i_ino); + if (status < 0) { + mlog_errno(status); + goto bail; + } else if (status == 1) { + status = -EPERM; + trace_ocfs2_rename_not_permitted( + (unsigned long long)old_inode->i_ino, + (unsigned long long)new_dir->i_ino); + goto bail; + } } /* if old and new are the same, this'll just do one lock. */ status = ocfs2_double_lock(osb, &old_dir_bh, old_dir, - &new_dir_bh, new_dir); + &new_dir_bh, new_dir, 1); if (status < 0) { mlog_errno(status); goto bail; } parents_locked = 1; + if (!new_dir->i_nlink) { + status = -EACCES; + goto bail; + } + /* make sure both dirs have bhs * get an extra ref on old_dir_bh if old==new */ if (!new_dir_bh) { @@ -1142,7 +1342,7 @@ static int ocfs2_rename(struct inode *old_dir, goto bail; } - if (S_ISDIR(old_inode->i_mode)) { + if (S_ISDIR(old_inode->i_mode) && new_dir != old_dir) { u64 old_inode_parent; update_dot_dot = 1; @@ -1159,8 +1359,7 @@ static int ocfs2_rename(struct inode *old_dir, goto bail; } - if (!new_inode && new_dir != old_dir && - new_dir->i_nlink >= ocfs2_link_max(osb)) { + if (!new_inode && new_dir->i_nlink >= ocfs2_link_max(osb)) { status = -EMLINK; goto bail; } @@ -1256,20 +1455,22 @@ static int ocfs2_rename(struct inode *old_dir, newfe = (struct ocfs2_dinode *) newfe_bh->b_data; trace_ocfs2_rename_over_existing( - (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ? - (unsigned long long)newfe_bh->b_blocknr : 0ULL); + (unsigned long long)newfe_blkno, newfe_bh, + (unsigned long long)newfe_bh->b_blocknr); if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) { status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, OCFS2_I(new_inode)->ip_blkno, - orphan_name, &orphan_insert); + orphan_name, &orphan_insert, + false); if (status < 0) { mlog_errno(status); goto bail; } + should_add_orphan = true; } } else { - BUG_ON(new_dentry->d_parent->d_inode != new_dir); + BUG_ON(d_inode(new_dentry->d_parent) != new_dir); status = ocfs2_check_dir_for_entry(new_dir, new_dentry->d_name.name, @@ -1311,17 +1512,6 @@ static int ocfs2_rename(struct inode *old_dir, goto bail; } - if (S_ISDIR(new_inode->i_mode) || - (ocfs2_read_links_count(newfe) == 1)) { - status = ocfs2_orphan_add(osb, handle, new_inode, - newfe_bh, orphan_name, - &orphan_insert, orphan_dir); - if (status < 0) { - mlog_errno(status); - goto bail; - } - } - /* change the dirent to point to the correct inode */ status = ocfs2_update_entry(new_dir, handle, &target_lookup_res, old_inode); @@ -1329,21 +1519,34 @@ static int ocfs2_rename(struct inode *old_dir, mlog_errno(status); goto bail; } - new_dir->i_version++; + inode_inc_iversion(new_dir); if (S_ISDIR(new_inode->i_mode)) ocfs2_set_links_count(newfe, 0); else ocfs2_add_links_count(newfe, -1); ocfs2_journal_dirty(handle, newfe_bh); + if (should_add_orphan) { + status = ocfs2_orphan_add(osb, handle, new_inode, + newfe_bh, orphan_name, + &orphan_insert, orphan_dir, false); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } } else { /* if the name was not found in new_dir, add it now */ status = ocfs2_add_entry(handle, new_dentry, old_inode, OCFS2_I(old_inode)->ip_blkno, new_dir_bh, &target_insert); + if (status < 0) { + mlog_errno(status); + goto bail; + } } - old_inode->i_ctime = CURRENT_TIME; + inode_set_ctime_current(old_inode); mark_inode_dirty(old_inode); status = ocfs2_journal_access_di(handle, INODE_CACHE(old_inode), @@ -1352,8 +1555,8 @@ static int ocfs2_rename(struct inode *old_dir, if (status >= 0) { old_di = (struct ocfs2_dinode *) old_inode_bh->b_data; - old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec); - old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec); + old_di->i_ctime = cpu_to_le64(inode_get_ctime_sec(old_inode)); + old_di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(old_inode)); ocfs2_journal_dirty(handle, old_inode_bh); } else mlog_errno(status); @@ -1368,24 +1571,44 @@ static int ocfs2_rename(struct inode *old_dir, status = ocfs2_find_entry(old_dentry->d_name.name, old_dentry->d_name.len, old_dir, &old_entry_lookup); - if (status) + if (status) { + if (!is_journal_aborted(osb->journal->j_journal)) { + ocfs2_error(osb->sb, "new entry %.*s is added, but old entry %.*s " + "is not deleted.", + new_dentry->d_name.len, new_dentry->d_name.name, + old_dentry->d_name.len, old_dentry->d_name.name); + } goto bail; + } status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup); if (status < 0) { mlog_errno(status); + if (!is_journal_aborted(osb->journal->j_journal)) { + ocfs2_error(osb->sb, "new entry %.*s is added, but old entry %.*s " + "is not deleted.", + new_dentry->d_name.len, new_dentry->d_name.name, + old_dentry->d_name.len, old_dentry->d_name.name); + } goto bail; } if (new_inode) { drop_nlink(new_inode); - new_inode->i_ctime = CURRENT_TIME; + inode_set_ctime_current(new_inode); } - old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; + inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir)); if (update_dot_dot) { status = ocfs2_update_entry(old_inode, handle, &old_inode_dot_dot_res, new_dir); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + if (S_ISDIR(old_inode->i_mode)) { drop_nlink(old_dir); if (new_inode) { drop_nlink(new_inode); @@ -1403,7 +1626,8 @@ static int ocfs2_rename(struct inode *old_dir, if (old_dir != new_dir) { /* Keep the same times on both directories.*/ - new_dir->i_ctime = new_dir->i_mtime = old_dir->i_ctime; + inode_set_mtime_to_ts(new_dir, + inode_set_ctime_to_ts(new_dir, inode_get_ctime(old_dir))); /* * This will also pick up the i_nlink change from the @@ -1424,6 +1648,10 @@ static int ocfs2_rename(struct inode *old_dir, INODE_CACHE(old_dir), old_dir_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } fe = (struct ocfs2_dinode *) old_dir_bh->b_data; ocfs2_set_links_count(fe, old_dir->i_nlink); ocfs2_journal_dirty(handle, old_dir_bh); @@ -1432,33 +1660,32 @@ static int ocfs2_rename(struct inode *old_dir, ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir); status = 0; bail: - if (rename_lock) - ocfs2_rename_unlock(osb); - if (handle) ocfs2_commit_trans(osb, handle); - if (parents_locked) - ocfs2_double_unlock(old_dir, new_dir); - - if (old_child_locked) - ocfs2_inode_unlock(old_inode, 1); - - if (new_child_locked) - ocfs2_inode_unlock(new_inode, 1); - if (orphan_dir) { /* This was locked for us in ocfs2_prepare_orphan_dir() */ ocfs2_inode_unlock(orphan_dir, 1); - mutex_unlock(&orphan_dir->i_mutex); + inode_unlock(orphan_dir); iput(orphan_dir); } + if (new_child_locked) + ocfs2_inode_unlock(new_inode, 1); + + if (old_child_locked) + ocfs2_inode_unlock(old_inode, 1); + + if (parents_locked) + ocfs2_double_unlock(old_dir, new_dir); + + if (rename_lock) + ocfs2_rename_unlock(osb); + if (new_inode) sync_mapping_buffers(old_inode->i_mapping); - if (new_inode) - iput(new_inode); + iput(new_inode); ocfs2_free_dir_lookup_result(&target_lookup_res); ocfs2_free_dir_lookup_result(&old_entry_lookup); @@ -1579,7 +1806,8 @@ bail: return status; } -static int ocfs2_symlink(struct inode *dir, +static int ocfs2_symlink(struct mnt_idmap *idmap, + struct inode *dir, struct dentry *dentry, const char *symname) { @@ -1599,17 +1827,23 @@ static int ocfs2_symlink(struct inode *dir, int want_clusters = 0; int xattr_credits = 0; struct ocfs2_security_xattr_info si = { + .name = NULL, .enable = 1, }; int did_quota = 0, did_quota_inode = 0; struct ocfs2_dir_lookup_result lookup = { NULL, }; sigset_t oldset; int did_block_signals = 0; + struct ocfs2_dentry_lock *dl = NULL; trace_ocfs2_symlink_begin(dir, dentry, symname, dentry->d_name.len, dentry->d_name.name); - dquot_initialize(dir); + status = dquot_initialize(dir); + if (status) { + mlog_errno(status); + goto bail; + } sb = dir->i_sb; osb = OCFS2_SB(sb); @@ -1654,8 +1888,9 @@ static int ocfs2_symlink(struct inode *dir, } inode = ocfs2_get_init_inode(dir, S_IFLNK | S_IRWXUGO); - if (!inode) { - status = -ENOMEM; + if (IS_ERR(inode)) { + status = PTR_ERR(inode); + inode = NULL; mlog_errno(status); goto bail; } @@ -1726,6 +1961,7 @@ static int ocfs2_symlink(struct inode *dir, inode->i_rdev = 0; newsize = l - 1; inode->i_op = &ocfs2_symlink_inode_operations; + inode_nohighmem(inode); if (l > ocfs2_fast_symlink_chars(sb)) { u32 offset = 0; @@ -1793,6 +2029,8 @@ static int ocfs2_symlink(struct inode *dir, goto bail; } + dl = dentry->d_fsdata; + status = ocfs2_add_entry(handle, dentry, inode, le64_to_cpu(fe->i_blkno), parent_fe_bh, &lookup); @@ -1809,8 +2047,11 @@ bail: ocfs2_clusters_to_bytes(osb->sb, 1)); if (status < 0 && did_quota_inode) dquot_free_inode(inode); - if (handle) + if (handle) { + if (status < 0 && fe) + ocfs2_set_links_count(fe, 0); ocfs2_commit_trans(osb, handle); + } ocfs2_inode_unlock(dir, 1); if (did_block_signals) @@ -1818,7 +2059,6 @@ bail: brelse(new_fe_bh); brelse(parent_fe_bh); - kfree(si.name); kfree(si.value); ocfs2_free_dir_lookup_result(&lookup); if (inode_ac) @@ -1828,6 +2068,9 @@ bail: if (xattr_ac) ocfs2_free_alloc_context(xattr_ac); if ((status < 0) && inode) { + if (dl) + ocfs2_cleanup_add_entry_failure(osb, dentry, inode); + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR; clear_nlink(inode); iput(inode); @@ -1885,11 +2128,11 @@ static int ocfs2_lookup_lock_orphan_dir(struct ocfs2_super *osb, return ret; } - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); ret = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); if (ret < 0) { - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); mlog_errno(ret); @@ -1906,12 +2149,28 @@ static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode, struct buffer_head *orphan_dir_bh, u64 blkno, char *name, - struct ocfs2_dir_lookup_result *lookup) + struct ocfs2_dir_lookup_result *lookup, + bool dio) { int ret; struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb); + int namelen = dio ? + (OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN) : + OCFS2_ORPHAN_NAMELEN; + + if (dio) { + ret = snprintf(name, OCFS2_DIO_ORPHAN_PREFIX_LEN + 1, "%s", + OCFS2_DIO_ORPHAN_PREFIX); + if (ret != OCFS2_DIO_ORPHAN_PREFIX_LEN) { + ret = -EINVAL; + mlog_errno(ret); + return ret; + } - ret = ocfs2_blkno_stringify(blkno, name); + ret = ocfs2_blkno_stringify(blkno, + name + OCFS2_DIO_ORPHAN_PREFIX_LEN); + } else + ret = ocfs2_blkno_stringify(blkno, name); if (ret < 0) { mlog_errno(ret); return ret; @@ -1919,7 +2178,7 @@ static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode, ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, orphan_dir_bh, name, - OCFS2_ORPHAN_NAMELEN, lookup); + namelen, lookup); if (ret < 0) { mlog_errno(ret); return ret; @@ -1934,8 +2193,10 @@ static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode, * @osb: ocfs2 file system * @ret_orphan_dir: Orphan dir inode - returned locked! * @blkno: Actual block number of the inode to be inserted into orphan dir. + * @name: Buffer to store the name of the orphan. * @lookup: dir lookup result, to be passed back into functions like * ocfs2_orphan_add + * @dio: Flag indicating if direct IO is being used or not. * * Returns zero on success and the ret_orphan_dir, name and lookup * fields will be populated. @@ -1946,7 +2207,8 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, struct inode **ret_orphan_dir, u64 blkno, char *name, - struct ocfs2_dir_lookup_result *lookup) + struct ocfs2_dir_lookup_result *lookup, + bool dio) { struct inode *orphan_dir_inode = NULL; struct buffer_head *orphan_dir_bh = NULL; @@ -1960,7 +2222,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, } ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh, - blkno, name, lookup); + blkno, name, lookup, dio); if (ret < 0) { mlog_errno(ret); goto out; @@ -1973,7 +2235,7 @@ out: if (ret) { ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); } @@ -1988,12 +2250,16 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, struct buffer_head *fe_bh, char *name, struct ocfs2_dir_lookup_result *lookup, - struct inode *orphan_dir_inode) + struct inode *orphan_dir_inode, + bool dio) { struct buffer_head *orphan_dir_bh = NULL; int status = 0; struct ocfs2_dinode *orphan_fe; struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; + int namelen = dio ? + (OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN) : + OCFS2_ORPHAN_NAMELEN; trace_ocfs2_orphan_add_begin( (unsigned long long)OCFS2_I(inode)->ip_blkno); @@ -2037,7 +2303,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, ocfs2_journal_dirty(handle, orphan_dir_bh); status = __ocfs2_add_entry(handle, orphan_dir_inode, name, - OCFS2_ORPHAN_NAMELEN, inode, + namelen, inode, OCFS2_I(inode)->ip_blkno, orphan_dir_bh, lookup); if (status < 0) { @@ -2045,13 +2311,21 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, goto rollback; } - fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL); - OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR; + if (dio) { + /* Update flag OCFS2_DIO_ORPHANED_FL and record the orphan + * slot. + */ + fe->i_flags |= cpu_to_le32(OCFS2_DIO_ORPHANED_FL); + fe->i_dio_orphaned_slot = cpu_to_le16(osb->slot_num); + } else { + fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL); + OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR; - /* Record which orphan dir our inode now resides - * in. delete_inode will use this to determine which orphan - * dir to lock. */ - fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); + /* Record which orphan dir our inode now resides + * in. delete_inode will use this to determine which orphan + * dir to lock. */ + fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); + } ocfs2_journal_dirty(handle, fe_bh); @@ -2076,14 +2350,27 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, handle_t *handle, struct inode *orphan_dir_inode, struct inode *inode, - struct buffer_head *orphan_dir_bh) + struct buffer_head *orphan_dir_bh, + bool dio) { - char name[OCFS2_ORPHAN_NAMELEN + 1]; + char name[OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN + 1]; struct ocfs2_dinode *orphan_fe; int status = 0; struct ocfs2_dir_lookup_result lookup = { NULL, }; - status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); + if (dio) { + status = snprintf(name, OCFS2_DIO_ORPHAN_PREFIX_LEN + 1, "%s", + OCFS2_DIO_ORPHAN_PREFIX); + if (status != OCFS2_DIO_ORPHAN_PREFIX_LEN) { + status = -EINVAL; + mlog_errno(status); + return status; + } + + status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, + name + OCFS2_DIO_ORPHAN_PREFIX_LEN); + } else + status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); if (status < 0) { mlog_errno(status); goto leave; @@ -2091,10 +2378,19 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, trace_ocfs2_orphan_del( (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno, - name, OCFS2_ORPHAN_NAMELEN); + name, strlen(name)); + + status = ocfs2_journal_access_di(handle, + INODE_CACHE(orphan_dir_inode), + orphan_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } /* find it's spot in the orphan directory */ - status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode, + status = ocfs2_find_entry(name, strlen(name), orphan_dir_inode, &lookup); if (status) { mlog_errno(status); @@ -2108,15 +2404,6 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, goto leave; } - status = ocfs2_journal_access_di(handle, - INODE_CACHE(orphan_dir_inode), - orphan_dir_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto leave; - } - /* do the i_nlink dance! :) */ orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; if (S_ISDIR(inode->i_mode)) @@ -2194,7 +2481,8 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir, } ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh, - di_blkno, orphan_name, orphan_insert); + di_blkno, orphan_name, orphan_insert, + false); if (ret < 0) { mlog_errno(ret); goto out; @@ -2215,7 +2503,7 @@ out: ocfs2_free_alloc_context(inode_ac); /* Unroll orphan dir locking */ - mutex_unlock(&orphan_dir->i_mutex); + inode_unlock(orphan_dir); ocfs2_inode_unlock(orphan_dir, 1); iput(orphan_dir); } @@ -2233,14 +2521,13 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, struct inode *inode = NULL; struct inode *orphan_dir = NULL; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); - struct ocfs2_dinode *di = NULL; handle_t *handle = NULL; char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; struct buffer_head *parent_di_bh = NULL; struct buffer_head *new_di_bh = NULL; struct ocfs2_alloc_context *inode_ac = NULL; struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; - u64 uninitialized_var(di_blkno), suballoc_loc; + u64 di_blkno, suballoc_loc; u16 suballoc_bit; status = ocfs2_inode_lock(dir, &parent_di_bh, 1); @@ -2260,8 +2547,9 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, } inode = ocfs2_get_init_inode(dir, mode); - if (!inode) { - status = -ENOMEM; + if (IS_ERR(inode)) { + status = PTR_ERR(inode); + inode = NULL; mlog_errno(status); goto leave; } @@ -2290,7 +2578,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, clear_nlink(inode); /* do the real work now. */ status = __ocfs2_mknod_locked(dir, inode, - 0, &new_di_bh, parent_di_bh, handle, + 0, &new_di_bh, handle, inode_ac, di_blkno, suballoc_loc, suballoc_bit); if (status < 0) { @@ -2298,9 +2586,8 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, goto leave; } - di = (struct ocfs2_dinode *)new_di_bh->b_data; status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name, - &orphan_insert, orphan_dir); + &orphan_insert, orphan_dir, false); if (status < 0) { mlog_errno(status); goto leave; @@ -2321,7 +2608,7 @@ leave: if (orphan_dir) { /* This was locked for us in ocfs2_prepare_orphan_dir() */ ocfs2_inode_unlock(orphan_dir, 1); - mutex_unlock(&orphan_dir->i_mutex); + inode_unlock(orphan_dir); iput(orphan_dir); } @@ -2345,6 +2632,158 @@ leave: return status; } +int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, + struct inode *inode) +{ + char orphan_name[OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN + 1]; + struct inode *orphan_dir_inode = NULL; + struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; + struct buffer_head *di_bh = NULL; + int status = 0; + handle_t *handle = NULL; + struct ocfs2_dinode *di = NULL; + + status = ocfs2_inode_lock(inode, &di_bh, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + di = (struct ocfs2_dinode *) di_bh->b_data; + /* + * Another append dio crashed? + * If so, manually recover it first. + */ + if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { + status = ocfs2_truncate_file(inode, di_bh, i_size_read(inode)); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail_unlock_inode; + } + + status = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_inode; + } + } + + status = ocfs2_prepare_orphan_dir(osb, &orphan_dir_inode, + OCFS2_I(inode)->ip_blkno, + orphan_name, + &orphan_insert, + true); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_inode; + } + + handle = ocfs2_start_trans(osb, + OCFS2_INODE_ADD_TO_ORPHAN_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + goto bail_unlock_orphan; + } + + status = ocfs2_orphan_add(osb, handle, inode, di_bh, orphan_name, + &orphan_insert, orphan_dir_inode, true); + if (status) + mlog_errno(status); + + ocfs2_commit_trans(osb, handle); + +bail_unlock_orphan: + ocfs2_inode_unlock(orphan_dir_inode, 1); + inode_unlock(orphan_dir_inode); + iput(orphan_dir_inode); + + ocfs2_free_dir_lookup_result(&orphan_insert); + +bail_unlock_inode: + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + +bail: + return status; +} + +int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, + struct inode *inode, struct buffer_head *di_bh, + int update_isize, loff_t end) +{ + struct inode *orphan_dir_inode = NULL; + struct buffer_head *orphan_dir_bh = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + handle_t *handle = NULL; + int status = 0; + + orphan_dir_inode = ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + le16_to_cpu(di->i_dio_orphaned_slot)); + if (!orphan_dir_inode) { + status = -ENOENT; + mlog_errno(status); + goto bail; + } + + inode_lock(orphan_dir_inode); + status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); + if (status < 0) { + inode_unlock(orphan_dir_inode); + iput(orphan_dir_inode); + mlog_errno(status); + goto bail; + } + + handle = ocfs2_start_trans(osb, + OCFS2_INODE_DEL_FROM_ORPHAN_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + goto bail_unlock_orphan; + } + + BUG_ON(!(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))); + + status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, + inode, orphan_dir_bh, true); + if (status < 0) { + mlog_errno(status); + goto bail_commit; + } + + status = ocfs2_journal_access_di(handle, + INODE_CACHE(inode), + di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail_commit; + } + + di->i_flags &= ~cpu_to_le32(OCFS2_DIO_ORPHANED_FL); + di->i_dio_orphaned_slot = 0; + + if (update_isize) { + status = ocfs2_set_inode_size(handle, inode, di_bh, end); + if (status) + mlog_errno(status); + } else + ocfs2_journal_dirty(handle, di_bh); + +bail_commit: + ocfs2_commit_trans(osb, handle); + +bail_unlock_orphan: + ocfs2_inode_unlock(orphan_dir_inode, 1); + inode_unlock(orphan_dir_inode); + brelse(orphan_dir_bh); + iput(orphan_dir_inode); + +bail: + return status; +} + int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, struct inode *inode, struct dentry *dentry) @@ -2396,17 +2835,17 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, ORPHAN_DIR_SYSTEM_INODE, osb->slot_num); if (!orphan_dir_inode) { - status = -EEXIST; + status = -ENOENT; mlog_errno(status); goto leave; } - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); if (status < 0) { mlog_errno(status); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); goto leave; } @@ -2433,7 +2872,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, } status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, - orphan_dir_bh); + orphan_dir_bh, false); if (status < 0) { mlog_errno(status); goto out_commit; @@ -2444,6 +2883,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, di->i_orphaned_slot = 0; set_nlink(inode, 1); ocfs2_set_links_count(di, inode->i_nlink); + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, di_bh); status = ocfs2_add_entry(handle, dentry, inode, @@ -2467,7 +2907,7 @@ out_commit: ocfs2_commit_trans(osb, handle); orphan_unlock: ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); leave: @@ -2498,10 +2938,10 @@ const struct inode_operations ocfs2_dir_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, .permission = ocfs2_permission, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = ocfs2_listxattr, - .removexattr = generic_removexattr, .fiemap = ocfs2_fiemap, - .get_acl = ocfs2_iop_get_acl, + .get_inode_acl = ocfs2_iop_get_acl, + .set_acl = ocfs2_iop_set_acl, + .fileattr_get = ocfs2_fileattr_get, + .fileattr_set = ocfs2_fileattr_set, }; diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h index e5d059d4f115..9cc891eb874e 100644 --- a/fs/ocfs2/namei.h +++ b/fs/ocfs2/namei.h @@ -1,31 +1,18 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * namei.h * * Function prototypes * * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #ifndef OCFS2_NAMEI_H #define OCFS2_NAMEI_H +#define OCFS2_DIO_ORPHAN_PREFIX "dio-" +#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4 + extern const struct inode_operations ocfs2_dir_iops; struct dentry *ocfs2_get_parent(struct dentry *child); @@ -34,10 +21,16 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, handle_t *handle, struct inode *orphan_dir_inode, struct inode *inode, - struct buffer_head *orphan_dir_bh); + struct buffer_head *orphan_dir_bh, + bool dio); int ocfs2_create_inode_in_orphan(struct inode *dir, int mode, struct inode **new_inode); +int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, + struct inode *inode); +int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, + struct inode *inode, struct buffer_head *di_bh, + int update_isize, loff_t end); int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, struct inode *new_inode, struct dentry *new_dentry); diff --git a/fs/ocfs2/ocfs1_fs_compat.h b/fs/ocfs2/ocfs1_fs_compat.h index dfb313bda5dd..6dbcf3d467fb 100644 --- a/fs/ocfs2/ocfs1_fs_compat.h +++ b/fs/ocfs2/ocfs1_fs_compat.h @@ -1,6 +1,5 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-only */ +/* * ocfs1_fs_compat.h * * OCFS1 volume header definitions. OCFS2 creates valid but unmountable @@ -9,20 +8,6 @@ * mount it. * * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License, version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #ifndef _OCFS1_FS_COMPAT_H diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 3a903470c794..6aaa94c554c1 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -1,26 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * ocfs2.h * * Defines macros and structures used in OCFS2 * * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #ifndef OCFS2_H @@ -30,6 +14,7 @@ #include <linux/sched.h> #include <linux/wait.h> #include <linux/list.h> +#include <linux/llist.h> #include <linux/rbtree.h> #include <linux/workqueue.h> #include <linux/kref.h> @@ -49,6 +34,8 @@ #include "reservations.h" +#include "filecheck.h" + /* Caching of metadata buffers */ /* Most user visible OCFS2 inodes will have very few pieces of @@ -143,6 +130,12 @@ enum ocfs2_unlock_action { * before the upconvert * has completed */ +#define OCFS2_LOCK_NONBLOCK_FINISHED (0x00001000) /* NONBLOCK cluster + * lock has already + * returned, do not block + * dc thread from + * downconverting */ + struct ocfs2_lock_res_ops; typedef void (*ocfs2_lock_callback)(int status, unsigned long data); @@ -155,16 +148,18 @@ struct ocfs2_lock_stats { /* Storing max wait in usecs saves 24 bytes per inode */ u32 ls_max; /* Max wait in USEC */ + u64 ls_last; /* Last unlock time in USEC */ }; #endif struct ocfs2_lock_res { void *l_priv; - struct ocfs2_lock_res_ops *l_ops; + const struct ocfs2_lock_res_ops *l_ops; struct list_head l_blocked_list; struct list_head l_mask_waiters; + struct list_head l_holders; unsigned long l_flags; char l_name[OCFS2_LOCK_ID_MAX_LEN]; @@ -195,6 +190,7 @@ struct ocfs2_lock_res { #ifdef CONFIG_OCFS2_FS_STATS struct ocfs2_lock_stats l_lock_prmode; /* PR mode stats */ u32 l_lock_refresh; /* Disk refreshes */ + u64 l_lock_wait; /* First lock wait time */ struct ocfs2_lock_stats l_lock_exmode; /* EX mode stats */ #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -202,6 +198,11 @@ struct ocfs2_lock_res { #endif }; +enum ocfs2_orphan_reco_type { + ORPHAN_NO_NEED_TRUNCATE = 0, + ORPHAN_NEED_TRUNCATE, +}; + enum ocfs2_orphan_scan_state { ORPHAN_SCAN_ACTIVE, ORPHAN_SCAN_INACTIVE @@ -212,7 +213,7 @@ struct ocfs2_orphan_scan { struct ocfs2_super *os_osb; struct ocfs2_lock_res os_lockres; /* lock to synchronize scans */ struct delayed_work os_orphan_scan_work; - struct timespec os_scantime; /* time this node ran the scan */ + time64_t os_scantime; /* time this node ran the scan */ u32 os_count; /* tracks node specific scans */ u32 os_seqno; /* tracks cluster wide scans */ atomic_t os_state; /* ACTIVE or INACTIVE */ @@ -220,7 +221,7 @@ struct ocfs2_orphan_scan { struct ocfs2_dlm_debug { struct kref d_refcnt; - struct dentry *d_locking_state; + u32 d_filter_secs; struct list_head d_lockres_tracking; }; @@ -272,21 +273,61 @@ enum ocfs2_mount_options writes */ OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */ OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ + + OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */ + OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */ + OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */ +}; + +#define OCFS2_OSB_SOFT_RO 0x0001 +#define OCFS2_OSB_HARD_RO 0x0002 +#define OCFS2_OSB_ERROR_FS 0x0004 +#define OCFS2_DEFAULT_ATIME_QUANTUM 60 + +struct ocfs2_triggers { + struct jbd2_buffer_trigger_type ot_triggers; + int ot_offset; + struct super_block *sb; +}; + +enum ocfs2_journal_trigger_type { + OCFS2_JTR_DI, + OCFS2_JTR_EB, + OCFS2_JTR_RB, + OCFS2_JTR_GD, + OCFS2_JTR_DB, + OCFS2_JTR_XB, + OCFS2_JTR_DQ, + OCFS2_JTR_DR, + OCFS2_JTR_DL, + OCFS2_JTR_NONE /* This must be the last entry */ }; -#define OCFS2_OSB_SOFT_RO 0x0001 -#define OCFS2_OSB_HARD_RO 0x0002 -#define OCFS2_OSB_ERROR_FS 0x0004 -#define OCFS2_OSB_DROP_DENTRY_LOCK_IMMED 0x0008 +#define OCFS2_JOURNAL_TRIGGER_COUNT OCFS2_JTR_NONE + +void ocfs2_initialize_journal_triggers(struct super_block *sb, + struct ocfs2_triggers triggers[]); -#define OCFS2_DEFAULT_ATIME_QUANTUM 60 +enum ocfs2_recovery_state { + OCFS2_REC_ENABLED = 0, + OCFS2_REC_QUOTA_WANT_DISABLE, + /* + * Must be OCFS2_REC_QUOTA_WANT_DISABLE + 1 for + * ocfs2_recovery_disable_quota() to work. + */ + OCFS2_REC_QUOTA_DISABLED, + OCFS2_REC_WANT_DISABLE, + /* + * Must be OCFS2_REC_WANT_DISABLE + 1 for ocfs2_recovery_exit() to work + */ + OCFS2_REC_DISABLED, +}; struct ocfs2_journal; struct ocfs2_slot_info; struct ocfs2_recovery_map; struct ocfs2_replay_map; struct ocfs2_quota_recovery; -struct ocfs2_dentry_lock; struct ocfs2_super { struct task_struct *commit_task; @@ -306,7 +347,6 @@ struct ocfs2_super u64 system_dir_blkno; u64 bitmap_blkno; u32 bitmap_cpg; - u8 *uuid; char *uuid_str; u32 uuid_hash; u8 *vol_label; @@ -323,8 +363,8 @@ struct ocfs2_super spinlock_t osb_lock; u32 s_next_generation; unsigned long osb_flags; - s16 s_inode_steal_slot; - s16 s_meta_steal_slot; + u16 s_inode_steal_slot; + u16 s_meta_steal_slot; atomic_t s_num_inodes_stolen; atomic_t s_num_meta_stolen; @@ -345,15 +385,18 @@ struct ocfs2_super struct ocfs2_recovery_map *recovery_map; struct ocfs2_replay_map *replay_map; struct task_struct *recovery_thread_task; - int disable_recovery; + enum ocfs2_recovery_state recovery_state; wait_queue_head_t checkpoint_event; struct ocfs2_journal *journal; unsigned long osb_commit_interval; + /* Journal triggers for checksum */ + struct ocfs2_triggers s_journal_triggers[OCFS2_JOURNAL_TRIGGER_COUNT]; + struct delayed_work la_enable_wq; /* - * Must hold local alloc i_mutex and osb->osb_lock to change + * Must hold local alloc i_rwsem and osb->osb_lock to change * local_alloc_bits. Reads can be done under either lock. */ unsigned int local_alloc_bits; @@ -374,9 +417,8 @@ struct ocfs2_super unsigned int osb_resv_level; unsigned int osb_dir_resv_level; - /* Next three fields are for local node slot recovery during + /* Next two fields are for local node slot recovery during * mount. */ - int dirty; struct ocfs2_dinode *local_alloc_copy; struct ocfs2_quota_recovery *quota_rec; @@ -387,14 +429,17 @@ struct ocfs2_super u8 osb_stackflags; char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; + char osb_cluster_name[OCFS2_CLUSTER_NAME_LEN + 1]; struct ocfs2_cluster_connection *cconn; struct ocfs2_lock_res osb_super_lockres; struct ocfs2_lock_res osb_rename_lockres; struct ocfs2_lock_res osb_nfs_sync_lockres; + struct rw_semaphore nfs_sync_rwlock; + struct ocfs2_lock_res osb_trim_fs_lockres; + struct mutex obs_trim_fs_mutex; struct ocfs2_dlm_debug *osb_dlm_debug; struct dentry *osb_debug_root; - struct dentry *osb_ctxt; wait_queue_head_t recovery_event; @@ -413,10 +458,9 @@ struct ocfs2_super struct list_head blocked_lock_list; unsigned long blocked_lock_count; - /* List of dentry locks to release. Anyone can add locks to - * the list, ocfs2_wq processes the list */ - struct ocfs2_dentry_lock *dentry_lock_list; - struct work_struct dentry_lock_work; + /* List of dquot structures to drop last reference to */ + struct llist_head dquot_drop_list; + struct work_struct dquot_drop_work; wait_queue_head_t osb_mount_event; @@ -424,9 +468,10 @@ struct ocfs2_super struct inode *osb_tl_inode; struct buffer_head *osb_tl_bh; struct delayed_work osb_truncate_log_wq; + atomic_t osb_tl_disable; /* * How many clusters in our truncate log. - * It must be protected by osb_tl_inode->i_mutex. + * It must be protected by osb_tl_inode->i_rwsem. */ unsigned int truncated_clusters; @@ -448,6 +493,22 @@ struct ocfs2_super /* rb tree root for refcount lock. */ struct rb_root osb_rf_lock_tree; struct ocfs2_refcount_tree *osb_ref_tree_lru; + + struct mutex system_file_mutex; + + /* + * OCFS2 needs to schedule several different types of work which + * require cluster locking, disk I/O, recovery waits, etc. Since these + * types of work tend to be heavy we avoid using the kernel events + * workqueue and schedule on our own. + */ + struct workqueue_struct *ocfs2_wq; + + /* sysfs directory per partition */ + struct kset *osb_dev_kset; + + /* file check related stuff */ + struct ocfs2_filecheck_sysfs_entry osb_fc_ent; }; #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) @@ -486,6 +547,14 @@ static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb) return 0; } +static inline int ocfs2_supports_append_dio(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_APPEND_DIO) + return 1; + return 0; +} + + static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INLINE_DATA) @@ -533,8 +602,7 @@ static inline unsigned int ocfs2_read_links_count(struct ocfs2_dinode *di) u32 nlink = le16_to_cpu(di->i_links_count); u32 hi = le16_to_cpu(di->i_links_count_hi); - if (di->i_dyn_features & cpu_to_le16(OCFS2_INDEXED_DIR_FL)) - nlink |= (hi << OCFS2_LINKS_HI_SHIFT); + nlink |= (hi << OCFS2_LINKS_HI_SHIFT); return nlink; } @@ -578,18 +646,6 @@ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb, spin_unlock(&osb->osb_lock); } - -static inline unsigned long ocfs2_test_osb_flag(struct ocfs2_super *osb, - unsigned long flag) -{ - unsigned long ret; - - spin_lock(&osb->osb_lock); - ret = osb->osb_flags & flag; - spin_unlock(&osb->osb_lock); - return ret; -} - static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb, int hard) { @@ -707,6 +763,16 @@ static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb, return (u64)clusters << c_to_b_bits; } +static inline u32 ocfs2_clusters_for_blocks(struct super_block *sb, + u64 blocks) +{ + int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits - + sb->s_blocksize_bits; + + blocks += (1 << b_to_c_bits) - 1; + return (u32)(blocks >> b_to_c_bits); +} + static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb, u64 blocks) { @@ -729,6 +795,16 @@ static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb, return clusters; } +static inline unsigned int ocfs2_bytes_to_clusters(struct super_block *sb, + u64 bytes) +{ + int cl_bits = OCFS2_SB(sb)->s_clustersize_bits; + unsigned int clusters; + + clusters = (unsigned int)(bytes >> cl_bits); + return clusters; +} + static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb, u64 bytes) { @@ -782,10 +858,10 @@ static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb, u32 clusters = pg_index; unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; - if (unlikely(PAGE_CACHE_SHIFT > cbits)) - clusters = pg_index << (PAGE_CACHE_SHIFT - cbits); - else if (PAGE_CACHE_SHIFT < cbits) - clusters = pg_index >> (cbits - PAGE_CACHE_SHIFT); + if (unlikely(PAGE_SHIFT > cbits)) + clusters = pg_index << (PAGE_SHIFT - cbits); + else if (PAGE_SHIFT < cbits) + clusters = pg_index >> (cbits - PAGE_SHIFT); return clusters; } @@ -799,10 +875,10 @@ static inline pgoff_t ocfs2_align_clusters_to_page_index(struct super_block *sb, unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; pgoff_t index = clusters; - if (PAGE_CACHE_SHIFT > cbits) { - index = (pgoff_t)clusters >> (PAGE_CACHE_SHIFT - cbits); - } else if (PAGE_CACHE_SHIFT < cbits) { - index = (pgoff_t)clusters << (cbits - PAGE_CACHE_SHIFT); + if (PAGE_SHIFT > cbits) { + index = (pgoff_t)clusters >> (PAGE_SHIFT - cbits); + } else if (PAGE_SHIFT < cbits) { + index = (pgoff_t)clusters << (cbits - PAGE_SHIFT); } return index; @@ -813,8 +889,8 @@ static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb) unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; unsigned int pages_per_cluster = 1; - if (PAGE_CACHE_SHIFT < cbits) - pages_per_cluster = 1 << (cbits - PAGE_CACHE_SHIFT); + if (PAGE_SHIFT < cbits) + pages_per_cluster = 1 << (cbits - PAGE_SHIFT); return pages_per_cluster; } diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 938387a10d5d..f7763da5c4a2 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -1,30 +1,17 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-only */ +/* * ocfs2_fs.h * * On-disk structures for OCFS2. * * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License, version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #ifndef _OCFS2_FS_H #define _OCFS2_FS_H +#include <linux/magic.h> + /* Version */ #define OCFS2_MAJOR_REV_LEVEL 0 #define OCFS2_MINOR_REV_LEVEL 90 @@ -56,9 +43,6 @@ #define OCFS2_MIN_BLOCKSIZE 512 #define OCFS2_MAX_BLOCKSIZE OCFS2_MIN_CLUSTERSIZE -/* Filesystem magic number */ -#define OCFS2_SUPER_MAGIC 0x7461636f - /* Object signatures */ #define OCFS2_SUPER_BLOCK_SIGNATURE "OCFSV2" #define OCFS2_INODE_SIGNATURE "INODE01" @@ -102,7 +86,8 @@ | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \ | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \ | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG \ - | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO) + | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO \ + | OCFS2_FEATURE_INCOMPAT_APPEND_DIO) #define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) @@ -147,7 +132,7 @@ * well as the name of the cluster being joined. * mount.ocfs2 must pass in a matching stack name. * - * If not set, the classic stack will be used. This is compatbile with + * If not set, the classic stack will be used. This is compatible with * all older versions. */ #define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK 0x0080 @@ -158,7 +143,7 @@ /* Support for extended attributes */ #define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200 -/* Support for indexed directores */ +/* Support for indexed directories */ #define OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS 0x0400 /* Metadata checksum and error correction */ @@ -167,17 +152,22 @@ /* Refcount tree support */ #define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000 -/* Discontigous block groups */ +/* Discontiguous block groups */ #define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000 /* - * Incompat bit to indicate useable clusterinfo with stackflags for all + * Incompat bit to indicate usable clusterinfo with stackflags for all * cluster stacks (userspace adnd o2cb). If this bit is set, * INCOMPAT_USERSPACE_STACK becomes superfluous and thus should not be set. */ #define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO 0x4000 /* + * Append Direct IO support + */ +#define OCFS2_FEATURE_INCOMPAT_APPEND_DIO 0x8000 + +/* * backup superblock flag is used to indicate that this volume * has backup superblocks. */ @@ -199,6 +189,7 @@ #define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002 #define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004 + /* The byte offset of the first backup block will be 1G. * The following will be 4G, 16G, 64G, 256G and 1T. */ @@ -229,6 +220,8 @@ #define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */ #define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */ #define OCFS2_QUOTA_FL (0x00001000) /* Quota file */ +#define OCFS2_DIO_ORPHANED_FL (0X00002000) /* On the orphan list especially + * for dio */ /* * Flags on ocfs2_dinode.i_dyn_features @@ -295,7 +288,7 @@ #define OCFS2_MAX_SLOTS 255 /* Slot map indicator for an empty slot */ -#define OCFS2_INVALID_SLOT -1 +#define OCFS2_INVALID_SLOT ((u16)-1) #define OCFS2_VOL_UUID_LEN 16 #define OCFS2_MAX_VOL_LABEL_LEN 64 @@ -331,8 +324,8 @@ struct ocfs2_system_inode_info { enum { BAD_BLOCK_SYSTEM_INODE = 0, GLOBAL_INODE_ALLOC_SYSTEM_INODE, +#define OCFS2_FIRST_ONLINE_SYSTEM_INODE GLOBAL_INODE_ALLOC_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE, -#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE HEARTBEAT_SYSTEM_INODE, GLOBAL_BITMAP_SYSTEM_INODE, USER_QUOTA_SYSTEM_INODE, @@ -384,21 +377,6 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { #define OCFS2_HB_GLOBAL "heartbeat=global" /* - * OCFS2 directory file types. Only the low 3 bits are used. The - * other bits are reserved for now. - */ -#define OCFS2_FT_UNKNOWN 0 -#define OCFS2_FT_REG_FILE 1 -#define OCFS2_FT_DIR 2 -#define OCFS2_FT_CHRDEV 3 -#define OCFS2_FT_BLKDEV 4 -#define OCFS2_FT_FIFO 5 -#define OCFS2_FT_SOCK 6 -#define OCFS2_FT_SYMLINK 7 - -#define OCFS2_FT_MAX 8 - -/* * OCFS2_DIR_PAD defines the directory entries boundaries * * NOTE: It must be a multiple of 4 @@ -416,17 +394,6 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { #define OCFS2_LINKS_HI_SHIFT 16 #define OCFS2_DX_ENTRIES_MAX (0xffffffffU) -#define S_SHIFT 12 -static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = { - [S_IFREG >> S_SHIFT] = OCFS2_FT_REG_FILE, - [S_IFDIR >> S_SHIFT] = OCFS2_FT_DIR, - [S_IFCHR >> S_SHIFT] = OCFS2_FT_CHRDEV, - [S_IFBLK >> S_SHIFT] = OCFS2_FT_BLKDEV, - [S_IFIFO >> S_SHIFT] = OCFS2_FT_FIFO, - [S_IFSOCK >> S_SHIFT] = OCFS2_FT_SOCK, - [S_IFLNK >> S_SHIFT] = OCFS2_FT_SYMLINK, -}; - /* * Convenience casts @@ -501,7 +468,8 @@ struct ocfs2_extent_list { __le16 l_reserved1; __le64 l_reserved2; /* Pad to sizeof(ocfs2_extent_rec) */ -/*10*/ struct ocfs2_extent_rec l_recs[0]; /* Extent records */ + /* Extent records */ +/*10*/ struct ocfs2_extent_rec l_recs[] __counted_by_le(l_count); }; /* @@ -515,7 +483,8 @@ struct ocfs2_chain_list { __le16 cl_count; /* Total chains in this list */ __le16 cl_next_free_rec; /* Next unused chain slot */ __le64 cl_reserved1; -/*10*/ struct ocfs2_chain_rec cl_recs[0]; /* Chain records */ + /* Chain records */ +/*10*/ struct ocfs2_chain_rec cl_recs[] __counted_by_le(cl_count); }; /* @@ -527,7 +496,8 @@ struct ocfs2_truncate_log { /*00*/ __le16 tl_count; /* Total records in this log */ __le16 tl_used; /* Number of records in use */ __le32 tl_reserved1; -/*08*/ struct ocfs2_truncate_rec tl_recs[0]; /* Truncate records */ + /* Truncate records */ +/*08*/ struct ocfs2_truncate_rec tl_recs[] __counted_by_le(tl_count); }; /* @@ -560,7 +530,7 @@ struct ocfs2_extent_block * value -1 (0xFFFF) is OCFS2_INVALID_SLOT. This marks a slot empty. */ struct ocfs2_slot_map { -/*00*/ __le16 sm_slots[0]; +/*00*/ DECLARE_FLEX_ARRAY(__le16, sm_slots); /* * Actual on-disk size is one block. OCFS2_MAX_SLOTS is 255, * 255 * sizeof(__le16) == 512B, within the 512B block minimum blocksize. @@ -571,7 +541,7 @@ struct ocfs2_extended_slot { /*00*/ __u8 es_valid; __u8 es_reserved1[3]; __le32 es_node_num; -/*10*/ +/*08*/ }; /* @@ -581,7 +551,7 @@ struct ocfs2_extended_slot { * i_size. */ struct ocfs2_slot_map_extended { -/*00*/ struct ocfs2_extended_slot se_slots[0]; +/*00*/ DECLARE_FLEX_ARRAY(struct ocfs2_extended_slot, se_slots); /* * Actual size is i_size of the slot_map system file. It should * match s_max_slots * sizeof(struct ocfs2_extended_slot) @@ -647,7 +617,7 @@ struct ocfs2_super_block { __le16 s_reserved0; __le32 s_dx_seed[3]; /* seed[0-2] for dx dir hash. * s_uuid_hash serves as seed[3]. */ -/*C0*/ __le64 s_reserved2[15]; /* Fill out superblock */ +/*C8*/ __le64 s_reserved2[15]; /* Fill out superblock */ /*140*/ /* @@ -671,7 +641,7 @@ struct ocfs2_local_alloc __le16 la_size; /* Size of included bitmap, in bytes */ __le16 la_reserved1; __le64 la_reserved2; -/*10*/ __u8 la_bitmap[0]; +/*10*/ __u8 la_bitmap[]; }; /* @@ -684,7 +654,7 @@ struct ocfs2_inline_data * for data, starting at id_data */ __le16 id_reserved0; __le32 id_reserved1; - __u8 id_data[0]; /* Start of user data */ + __u8 id_data[]; /* Start of user data */ }; /* @@ -729,7 +699,9 @@ struct ocfs2_dinode { inode belongs to. Only valid if allocated from a discontiguous block group */ -/*A0*/ __le64 i_reserved2[3]; +/*A0*/ __le16 i_dio_orphaned_slot; /* only used for append dio write */ + __le16 i_reserved1[3]; + __le64 i_reserved2[2]; /*B8*/ union { __le64 i_pad1; /* Generic way to refer to this 64bit union */ @@ -758,7 +730,7 @@ struct ocfs2_dinode { struct ocfs2_extent_list i_list; struct ocfs2_truncate_log i_dealloc; struct ocfs2_inline_data i_data; - __u8 i_symlink[0]; + DECLARE_FLEX_ARRAY(__u8, i_symlink); } id2; /* Actual on-disk size is one block */ }; @@ -797,11 +769,11 @@ struct ocfs2_dir_block_trailer { * in this block. (unused) */ /*10*/ __u8 db_signature[8]; /* Signature for verification */ __le64 db_reserved2; - __le64 db_free_next; /* Next block in list (unused) */ -/*20*/ __le64 db_blkno; /* Offset on disk, in blocks */ - __le64 db_parent_dinode; /* dinode which owns me, in +/*20*/ __le64 db_free_next; /* Next block in list (unused) */ + __le64 db_blkno; /* Offset on disk, in blocks */ +/*30*/ __le64 db_parent_dinode; /* dinode which owns me, in blocks */ -/*30*/ struct ocfs2_block_check db_check; /* Error checking */ + struct ocfs2_block_check db_check; /* Error checking */ /*40*/ }; @@ -827,9 +799,10 @@ struct ocfs2_dx_entry_list { * possible in de_entries */ __le16 de_num_used; /* Current number of * de_entries entries */ - struct ocfs2_dx_entry de_entries[0]; /* Indexed dir entries - * in a packed array of - * length de_num_used */ + /* Indexed dir entries in a packed + * array of length de_num_used. + */ + struct ocfs2_dx_entry de_entries[] __counted_by_le(de_count); }; #define OCFS2_DX_FLAG_INLINE 0x01 @@ -914,7 +887,8 @@ struct ocfs2_group_desc __le16 bg_free_bits_count; /* Free bits count */ __le16 bg_chain; /* What chain I am in. */ /*10*/ __le32 bg_generation; - __le32 bg_reserved1; + __le16 bg_contig_free_bits; /* max contig free bits length */ + __le16 bg_reserved1; __le64 bg_next_group; /* Next group in my list, in blocks */ /*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in @@ -923,12 +897,12 @@ struct ocfs2_group_desc /*30*/ struct ocfs2_block_check bg_check; /* Error checking */ __le64 bg_reserved2; /*40*/ union { - __u8 bg_bitmap[0]; + DECLARE_FLEX_ARRAY(__u8, bg_bitmap); struct { /* * Block groups may be discontiguous when * OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG is set. - * The extents of a discontigous block group are + * The extents of a discontiguous block group are * stored in bg_list. It is a flat list. * l_tree_depth must always be zero. A * discontiguous group is signified by a non-zero @@ -964,7 +938,8 @@ struct ocfs2_refcount_list { __le16 rl_used; /* Current number of used records */ __le32 rl_reserved2; __le64 rl_reserved1; /* Pad to sizeof(ocfs2_refcount_record) */ -/*10*/ struct ocfs2_refcount_rec rl_recs[0]; /* Refcount records */ + /* Refcount records */ +/*10*/ struct ocfs2_refcount_rec rl_recs[] __counted_by_le(rl_count); }; @@ -1050,7 +1025,8 @@ struct ocfs2_xattr_header { buckets. A block uses xb_check and sets this field to zero.) */ - struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */ + /* xattr entry list. */ + struct ocfs2_xattr_entry xh_entries[] __counted_by_le(xh_count); }; /* @@ -1113,7 +1089,7 @@ struct ocfs2_xattr_block { struct ocfs2_xattr_header xb_header; /* xattr header if this block contains xattr */ struct ocfs2_xattr_tree_root xb_root;/* xattr tree root if this - block cotains xattr + block contains xattr tree. */ } xb_attrs; }; @@ -1236,7 +1212,7 @@ struct ocfs2_local_disk_dqinfo { /* Header of one chunk of a quota file */ struct ocfs2_local_disk_chunk { __le32 dqc_free; /* Number of free entries in the bitmap */ - __u8 dqc_bitmap[0]; /* Bitmap of entries in the corresponding + __u8 dqc_bitmap[]; /* Bitmap of entries in the corresponding * chunk of quota file */ }; @@ -1619,7 +1595,7 @@ static inline int ocfs2_sprintf_system_inode_name(char *buf, int len, static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de, umode_t mode) { - de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; + de->file_type = fs_umode_to_ftype(mode); } static inline int ocfs2_gd_is_discontig(struct ocfs2_group_desc *gd) diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h index 5b27ff1fa577..2de2f8733283 100644 --- a/fs/ocfs2/ocfs2_ioctl.h +++ b/fs/ocfs2/ocfs2_ioctl.h @@ -1,34 +1,16 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-only */ +/* * ocfs2_ioctl.h * * Defines OCFS2 ioctls. * * Copyright (C) 2010 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License, version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #ifndef OCFS2_IOCTL_H #define OCFS2_IOCTL_H /* - * ioctl commands - */ -#define OCFS2_IOC_GETFLAGS FS_IOC_GETFLAGS -#define OCFS2_IOC_SETFLAGS FS_IOC_SETFLAGS -#define OCFS2_IOC32_GETFLAGS FS_IOC32_GETFLAGS -#define OCFS2_IOC32_SETFLAGS FS_IOC32_SETFLAGS - -/* * Space reservation / allocation / free ioctls and argument structure * are designed to be compatible with XFS. * @@ -233,7 +215,7 @@ struct ocfs2_move_extents { movement less likely to fail, may make fs even more fragmented */ -#define OCFS2_MOVE_EXT_FL_COMPLETE (0x00000004) /* Move or defragmenation +#define OCFS2_MOVE_EXT_FL_COMPLETE (0x00000004) /* Move or defragmentation completely gets done. */ diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h index d277aabf5dfb..9b234c03d693 100644 --- a/fs/ocfs2/ocfs2_lockid.h +++ b/fs/ocfs2/ocfs2_lockid.h @@ -1,26 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * ocfs2_lockid.h * * Defines OCFS2 lockid bits. * * Copyright (C) 2002, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #ifndef OCFS2_LOCKID_H @@ -50,6 +34,7 @@ enum ocfs2_lock_type { OCFS2_LOCK_TYPE_NFS_SYNC, OCFS2_LOCK_TYPE_ORPHAN_SCAN, OCFS2_LOCK_TYPE_REFCOUNT, + OCFS2_LOCK_TYPE_TRIM_FS, OCFS2_NUM_LOCK_TYPES }; @@ -93,6 +78,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type) case OCFS2_LOCK_TYPE_REFCOUNT: c = 'T'; break; + case OCFS2_LOCK_TYPE_TRIM_FS: + c = 'I'; + break; default: c = '\0'; } @@ -105,7 +93,7 @@ static char *ocfs2_lock_type_strings[] = { [OCFS2_LOCK_TYPE_DATA] = "Data", [OCFS2_LOCK_TYPE_SUPER] = "Super", [OCFS2_LOCK_TYPE_RENAME] = "Rename", - /* Need to differntiate from [R]ename.. serializing writes is the + /* Need to differentiate from [R]ename.. serializing writes is the * important job it does, anyway. */ [OCFS2_LOCK_TYPE_RW] = "Write/Read", [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", @@ -115,6 +103,7 @@ static char *ocfs2_lock_type_strings[] = { [OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync", [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan", [OCFS2_LOCK_TYPE_REFCOUNT] = "Refcount", + [OCFS2_LOCK_TYPE_TRIM_FS] = "TrimFs", }; static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) diff --git a/fs/ocfs2/ocfs2_lockingver.h b/fs/ocfs2/ocfs2_lockingver.h index 2e45c8d2ea7e..31a5e1619e7f 100644 --- a/fs/ocfs2/ocfs2_lockingver.h +++ b/fs/ocfs2/ocfs2_lockingver.h @@ -1,20 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-only */ +/* * ocfs2_lockingver.h * * Defines OCFS2 Locking version values. * * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License, version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #ifndef OCFS2_LOCKINGVER_H diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index 3b481f490633..4b32fb5658ad 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM ocfs2 @@ -81,7 +82,7 @@ DECLARE_EVENT_CLASS(ocfs2__string, __string(name,name) ), TP_fast_assign( - __assign_str(name, name); + __assign_str(name); ), TP_printk("%s", __get_str(name)) ); @@ -711,6 +712,8 @@ TRACE_EVENT(ocfs2_trim_extent, DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group); +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_mainbm); + DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs); /* End of trace events for fs/ocfs2/alloc.c. */ @@ -1154,8 +1157,6 @@ DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_get_block_end); DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_readpage); -DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_writepage); - DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_bmap); TRACE_EVENT(ocfs2_try_to_write_inline_data, @@ -1288,7 +1289,7 @@ DECLARE_EVENT_CLASS(ocfs2__file_ops, __entry->dentry = dentry; __entry->ino = ino; __entry->d_len = d_len; - __assign_str(d_name, d_name); + __assign_str(d_name); __entry->para = para; ), TP_printk("%p %p %p %llu %llu %.*s", __entry->inode, __entry->file, @@ -1310,14 +1311,12 @@ DEFINE_OCFS2_FILE_OPS(ocfs2_file_release); DEFINE_OCFS2_FILE_OPS(ocfs2_sync_file); -DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_write); +DEFINE_OCFS2_FILE_OPS(ocfs2_file_write_iter); -DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_write); +DEFINE_OCFS2_FILE_OPS(ocfs2_file_read_iter); DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_read); -DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_read); - DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_truncate_file); DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_truncate_file_error); @@ -1426,7 +1425,7 @@ TRACE_EVENT(ocfs2_setattr, __entry->dentry = dentry; __entry->ino = ino; __entry->d_len = d_len; - __assign_str(d_name, d_name); + __assign_str(d_name); __entry->ia_valid = ia_valid; __entry->ia_mode = ia_mode; __entry->ia_uid = ia_uid; @@ -1450,31 +1449,26 @@ DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range); TRACE_EVENT(ocfs2_prepare_inode_for_write, TP_PROTO(unsigned long long ino, unsigned long long saved_pos, - int appending, unsigned long count, - int *direct_io, int *has_refcount), - TP_ARGS(ino, saved_pos, appending, count, direct_io, has_refcount), + unsigned long count, int wait), + TP_ARGS(ino, saved_pos, count, wait), TP_STRUCT__entry( __field(unsigned long long, ino) __field(unsigned long long, saved_pos) - __field(int, appending) __field(unsigned long, count) - __field(int, direct_io) - __field(int, has_refcount) + __field(int, wait) ), TP_fast_assign( __entry->ino = ino; __entry->saved_pos = saved_pos; - __entry->appending = appending; __entry->count = count; - __entry->direct_io = direct_io ? *direct_io : -1; - __entry->has_refcount = has_refcount ? *has_refcount : -1; + __entry->wait = wait; ), - TP_printk("%llu %llu %d %lu %d %d", __entry->ino, - __entry->saved_pos, __entry->appending, __entry->count, - __entry->direct_io, __entry->has_refcount) + TP_printk("%llu %llu %lu %d", __entry->ino, + __entry->saved_pos, __entry->count, __entry->wait) ); -DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret); +DEFINE_OCFS2_INT_EVENT(generic_file_read_iter_ret); +DEFINE_OCFS2_INT_EVENT(filemap_splice_read_ret); /* End of trace events for fs/ocfs2/file.c. */ @@ -1540,6 +1534,8 @@ DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_read_locked_inode); DEFINE_OCFS2_INT_INT_EVENT(ocfs2_check_orphan_recovery_state); DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_inode_block); +DEFINE_OCFS2_ULL_EVENT(ocfs2_filecheck_validate_inode_block); +DEFINE_OCFS2_ULL_EVENT(ocfs2_filecheck_repair_inode_block); TRACE_EVENT(ocfs2_inode_is_valid_to_delete, TP_PROTO(void *task, void *dc_task, unsigned long long ino, @@ -1573,8 +1569,6 @@ DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_delete_inode); DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_clear_inode); -DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_drop_inode); - TRACE_EVENT(ocfs2_inode_revalidate, TP_PROTO(void *inode, unsigned long long ino, unsigned int flags), @@ -1662,34 +1656,34 @@ TRACE_EVENT(ocfs2_remount, ); TRACE_EVENT(ocfs2_fill_super, - TP_PROTO(void *sb, void *data, int silent), - TP_ARGS(sb, data, silent), + TP_PROTO(void *sb, void *fc, int silent), + TP_ARGS(sb, fc, silent), TP_STRUCT__entry( __field(void *, sb) - __field(void *, data) + __field(void *, fc) __field(int, silent) ), TP_fast_assign( __entry->sb = sb; - __entry->data = data; + __entry->fc = fc; __entry->silent = silent; ), TP_printk("%p %p %d", __entry->sb, - __entry->data, __entry->silent) + __entry->fc, __entry->silent) ); TRACE_EVENT(ocfs2_parse_options, - TP_PROTO(int is_remount, char *options), - TP_ARGS(is_remount, options), + TP_PROTO(int is_remount, const char *option), + TP_ARGS(is_remount, option), TP_STRUCT__entry( __field(int, is_remount) - __string(options, options) + __string(option, option) ), TP_fast_assign( __entry->is_remount = is_remount; - __assign_str(options, options); + __assign_str(option); ), - TP_printk("%d %s", __entry->is_remount, __get_str(options)) + TP_printk("%d %s", __entry->is_remount, __get_str(option)) ); DEFINE_OCFS2_POINTER_EVENT(ocfs2_put_super); @@ -1722,8 +1716,8 @@ TRACE_EVENT(ocfs2_initialize_super, __field(int, cluster_bits) ), TP_fast_assign( - __assign_str(label, label); - __assign_str(uuid_str, uuid_str); + __assign_str(label); + __assign_str(uuid_str); __entry->root_dir = root_dir; __entry->system_dir = system_dir; __entry->cluster_bits = cluster_bits; @@ -1750,7 +1744,7 @@ TRACE_EVENT(ocfs2_init_xattr_set_ctxt, __field(int, credits) ), TP_fast_assign( - __assign_str(name, name); + __assign_str(name); __entry->meta = meta; __entry->clusters = clusters; __entry->credits = credits; @@ -1774,7 +1768,7 @@ DECLARE_EVENT_CLASS(ocfs2__xattr_find, ), TP_fast_assign( __entry->ino = ino; - __assign_str(name, name); + __assign_str(name); __entry->name_index = name_index; __entry->hash = hash; __entry->location = location; @@ -2023,7 +2017,7 @@ TRACE_EVENT(ocfs2_sync_dquot_helper, __entry->dq_id = dq_id; __entry->dq_type = dq_type; __entry->type = type; - __assign_str(s_id, s_id); + __assign_str(s_id); ), TP_printk("%u %u %lu %s", __entry->dq_id, __entry->dq_type, __entry->type, __get_str(s_id)) @@ -2035,6 +2029,8 @@ DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_release_dquot); DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_acquire_dquot); +DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_get_next_id); + DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_mark_dquot_dirty); /* End of trace events for fs/ocfs2/quota_global.c. */ @@ -2062,7 +2058,7 @@ TRACE_EVENT(ocfs2_dx_dir_search, TP_fast_assign( __entry->ino = ino; __entry->namelen = namelen; - __assign_str(name, name); + __assign_str(name); __entry->major_hash = major_hash; __entry->minor_hash = minor_hash; __entry->blkno = blkno; @@ -2090,7 +2086,7 @@ TRACE_EVENT(ocfs2_find_files_on_disk, ), TP_fast_assign( __entry->namelen = namelen; - __assign_str(name, name); + __assign_str(name); __entry->blkno = blkno; __entry->dir = dir; ), @@ -2109,7 +2105,7 @@ TRACE_EVENT(ocfs2_check_dir_for_entry, TP_fast_assign( __entry->dir = dir; __entry->namelen = namelen; - __assign_str(name, name); + __assign_str(name); ), TP_printk("%llu %.*s", __entry->dir, __entry->namelen, __get_str(name)) @@ -2137,7 +2133,7 @@ TRACE_EVENT(ocfs2_dx_dir_index_root_block, __entry->major_hash = major_hash; __entry->minor_hash = minor_hash; __entry->namelen = namelen; - __assign_str(name, name); + __assign_str(name); __entry->num_used = num_used; ), TP_printk("%llu %x %x %.*s %u", __entry->dir, @@ -2173,7 +2169,7 @@ DECLARE_EVENT_CLASS(ocfs2__dentry_ops, __entry->dir = dir; __entry->dentry = dentry; __entry->name_len = name_len; - __assign_str(name, name); + __assign_str(name); __entry->dir_blkno = dir_blkno; __entry->extra = extra; ), @@ -2219,7 +2215,7 @@ TRACE_EVENT(ocfs2_mknod, __entry->dir = dir; __entry->dentry = dentry; __entry->name_len = name_len; - __assign_str(name, name); + __assign_str(name); __entry->dir_blkno = dir_blkno; __entry->dev = dev; __entry->mode = mode; @@ -2243,9 +2239,9 @@ TRACE_EVENT(ocfs2_link, TP_fast_assign( __entry->ino = ino; __entry->old_len = old_len; - __assign_str(old_name, old_name); + __assign_str(old_name); __entry->name_len = name_len; - __assign_str(name, name); + __assign_str(name); ), TP_printk("%llu %.*s %.*s", __entry->ino, __entry->old_len, __get_str(old_name), @@ -2281,9 +2277,9 @@ TRACE_EVENT(ocfs2_rename, __entry->new_dir = new_dir; __entry->new_dentry = new_dentry; __entry->old_len = old_len; - __assign_str(old_name, old_name); + __assign_str(old_name); __entry->new_len = new_len; - __assign_str(new_name, new_name); + __assign_str(new_name); ), TP_printk("%p %p %p %p %.*s %.*s", __entry->old_dir, __entry->old_dentry, @@ -2292,6 +2288,8 @@ TRACE_EVENT(ocfs2_rename, __entry->new_len, __get_str(new_name)) ); +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_rename_not_permitted); + TRACE_EVENT(ocfs2_rename_target_exists, TP_PROTO(int new_len, const char *new_name), TP_ARGS(new_len, new_name), @@ -2301,7 +2299,7 @@ TRACE_EVENT(ocfs2_rename_target_exists, ), TP_fast_assign( __entry->new_len = new_len; - __assign_str(new_name, new_name); + __assign_str(new_name); ), TP_printk("%.*s", __entry->new_len, __get_str(new_name)) ); @@ -2344,7 +2342,7 @@ TRACE_EVENT(ocfs2_symlink_begin, __entry->dentry = dentry; __entry->symname = symname; __entry->len = len; - __assign_str(name, name); + __assign_str(name); ), TP_printk("%p %p %s %.*s", __entry->dir, __entry->dentry, __entry->symname, __entry->len, __get_str(name)) @@ -2360,7 +2358,7 @@ TRACE_EVENT(ocfs2_blkno_stringify, ), TP_fast_assign( __entry->blkno = blkno; - __assign_str(name, name); + __assign_str(name); __entry->namelen = namelen; ), TP_printk("%llu %s %d", __entry->blkno, __get_str(name), @@ -2381,7 +2379,7 @@ TRACE_EVENT(ocfs2_orphan_del, ), TP_fast_assign( __entry->dir = dir; - __assign_str(name, name); + __assign_str(name); __entry->namelen = namelen; ), TP_printk("%llu %s %d", __entry->dir, __get_str(name), @@ -2403,7 +2401,7 @@ TRACE_EVENT(ocfs2_dentry_revalidate, TP_fast_assign( __entry->dentry = dentry; __entry->len = len; - __assign_str(name, name); + __assign_str(name); ), TP_printk("%p %.*s", __entry->dentry, __entry->len, __get_str(name)) ); @@ -2420,7 +2418,7 @@ TRACE_EVENT(ocfs2_dentry_revalidate_negative, ), TP_fast_assign( __entry->len = len; - __assign_str(name, name); + __assign_str(name); __entry->pgen = pgen; __entry->gen = gen; ), @@ -2445,7 +2443,7 @@ TRACE_EVENT(ocfs2_find_local_alias, ), TP_fast_assign( __entry->len = len; - __assign_str(name, name); + __assign_str(name); ), TP_printk("%.*s", __entry->len, __get_str(name)) ); @@ -2462,7 +2460,7 @@ TRACE_EVENT(ocfs2_dentry_attach_lock, ), TP_fast_assign( __entry->len = len; - __assign_str(name, name); + __assign_str(name); __entry->parent = parent; __entry->fsdata = fsdata; ), @@ -2480,7 +2478,7 @@ TRACE_EVENT(ocfs2_dentry_attach_lock_found, __field(unsigned long long, ino) ), TP_fast_assign( - __assign_str(name, name); + __assign_str(name); __entry->parent = parent; __entry->ino = ino; ), @@ -2527,7 +2525,7 @@ TRACE_EVENT(ocfs2_get_parent, TP_fast_assign( __entry->child = child; __entry->len = len; - __assign_str(name, name); + __assign_str(name); __entry->ino = ino; ), TP_printk("%p %.*s %llu", __entry->child, __entry->len, @@ -2551,7 +2549,7 @@ TRACE_EVENT(ocfs2_encode_fh_begin, TP_fast_assign( __entry->dentry = dentry; __entry->name_len = name_len; - __assign_str(name, name); + __assign_str(name); __entry->fh = fh; __entry->len = len; __entry->connectable = connectable; @@ -2577,8 +2575,12 @@ DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_commit_cache_end); DEFINE_OCFS2_INT_INT_EVENT(ocfs2_extend_trans); +DEFINE_OCFS2_INT_EVENT(ocfs2_assure_trans_credits); + DEFINE_OCFS2_INT_EVENT(ocfs2_extend_trans_restart); +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_allocate_extend_trans); + DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_journal_access); DEFINE_OCFS2_ULL_EVENT(ocfs2_journal_dirty); diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h index d5ab56cbe5c5..788a8de922a4 100644 --- a/fs/ocfs2/quota.h +++ b/fs/ocfs2/quota.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * quota.h for OCFS2 * @@ -17,6 +18,9 @@ #include "ocfs2.h" +/* Number of quota types we support */ +#define OCFS2_MAXQUOTAS 2 + /* * In-memory structures */ @@ -28,6 +32,7 @@ struct ocfs2_dquot { unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */ s64 dq_origspace; /* Last globally synced space usage */ s64 dq_originodes; /* Last globally synced inode usage */ + struct llist_node list; /* Member of list of dquots to drop */ }; /* Description of one chunk to recover in memory */ @@ -38,12 +43,13 @@ struct ocfs2_recovery_chunk { }; struct ocfs2_quota_recovery { - struct list_head r_list[MAXQUOTAS]; /* List of chunks to recover */ + struct list_head r_list[OCFS2_MAXQUOTAS]; /* List of chunks to recover */ }; /* In-memory structure with quota header information */ struct ocfs2_mem_dqinfo { unsigned int dqi_type; /* Quota type this structure describes */ + unsigned int dqi_flags; /* Flags OLQF_* */ unsigned int dqi_chunks; /* Number of chunks in local quota file */ unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */ unsigned int dqi_syncms; /* How often should we sync with other nodes */ @@ -77,7 +83,7 @@ struct ocfs2_quota_chunk { extern struct kmem_cache *ocfs2_dquot_cachep; extern struct kmem_cache *ocfs2_qf_chunk_cachep; -extern struct qtree_fmt_operations ocfs2_global_ops; +extern const struct qtree_fmt_operations ocfs2_global_ops; struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery( struct ocfs2_super *osb, int slot_num); @@ -91,7 +97,6 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); int ocfs2_global_read_info(struct super_block *sb, int type); int ocfs2_global_write_info(struct super_block *sb, int type); -int ocfs2_global_read_dquot(struct dquot *dquot); int __ocfs2_sync_dquot(struct dquot *dquot, int freeing); static inline int ocfs2_sync_dquot(struct dquot *dquot) { @@ -110,6 +115,7 @@ int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block, int ocfs2_create_local_dquot(struct dquot *dquot); int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot); int ocfs2_local_write_dquot(struct dquot *dquot); +void ocfs2_drop_dquot_refs(struct work_struct *work); extern const struct dquot_operations ocfs2_quota_operations; extern struct quota_format_type ocfs2_quota_format; diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 332a281f217e..e85b1ccf81be 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Implementation of operations over global quota file */ @@ -10,6 +11,8 @@ #include <linux/jiffies.h> #include <linux/writeback.h> #include <linux/workqueue.h> +#include <linux/llist.h> +#include <linux/iversion.h> #include <cluster/masklog.h> @@ -32,8 +35,8 @@ * Locking of quotas with OCFS2 is rather complex. Here are rules that * should be obeyed by all the functions: * - any write of quota structure (either to local or global file) is protected - * by dqio_mutex or dquot->dq_lock. - * - any modification of global quota file holds inode cluster lock, i_mutex, + * by dqio_sem or dquot->dq_lock. + * - any modification of global quota file holds inode cluster lock, i_rwsem, * and ip_alloc_sem of the global quota file (achieved by * ocfs2_lock_global_qf). It also has to hold qinfo_lock. * - an allocation of new blocks for local quota file is protected by @@ -41,9 +44,9 @@ * * A rough sketch of locking dependencies (lf = local file, gf = global file): * Normal filesystem operation: - * start_trans -> dqio_mutex -> write to lf + * start_trans -> dqio_sem -> write to lf * Syncing of local and global file: - * ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock -> + * ocfs2_lock_global_qf -> start_trans -> dqio_sem -> qinfo_lock -> * write to gf * -> write to lf * Acquire dquot for the first time: @@ -59,7 +62,7 @@ * Recovery: * inode cluster lock of recovered lf * -> read bitmaps -> ip_alloc_sem of lf - * -> ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock -> + * -> ocfs2_lock_global_qf -> start_trans -> dqio_sem -> qinfo_lock -> * write to gf */ @@ -122,7 +125,7 @@ static int ocfs2_global_is_id(void *dp, struct dquot *dquot) dquot->dq_id); } -struct qtree_fmt_operations ocfs2_global_ops = { +const struct qtree_fmt_operations ocfs2_global_ops = { .mem2disk_dqblk = ocfs2_global_mem2diskdqb, .disk2mem_dqblk = ocfs2_global_disk2memdqb, .is_id = ocfs2_global_is_id, @@ -234,7 +237,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type, len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset; } - if (gqinode->i_size < off + len) { + if (i_size_read(gqinode) < off + len) { loff_t rounded_end = ocfs2_align_bytes_to_blocks(sb, off + len); @@ -270,7 +273,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type, if (new) memset(bh->b_data, 0, sb->s_blocksize); memcpy(bh->b_data + offset, data, len); - flush_dcache_page(bh->b_page); + flush_dcache_folio(bh->b_folio); set_buffer_uptodate(bh); unlock_buffer(bh); ocfs2_set_buffer_uptodate(INODE_CACHE(gqinode), bh); @@ -287,7 +290,7 @@ out: mlog_errno(err); return err; } - gqinode->i_version++; + inode_inc_iversion(gqinode); ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh); return len; } @@ -307,7 +310,7 @@ int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) WARN_ON(bh != oinfo->dqi_gqi_bh); spin_unlock(&dq_data_lock); if (ex) { - mutex_lock(&oinfo->dqi_gqinode->i_mutex); + inode_lock(oinfo->dqi_gqinode); down_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); } else { down_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); @@ -319,7 +322,7 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) { if (ex) { up_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); - mutex_unlock(&oinfo->dqi_gqinode->i_mutex); + inode_unlock(oinfo->dqi_gqinode); } else { up_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); } @@ -334,24 +337,14 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) /* Read information header from global quota file */ int ocfs2_global_read_info(struct super_block *sb, int type) { - struct inode *gqinode = NULL; - unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE, - GROUP_QUOTA_SYSTEM_INODE }; + unsigned int ino[OCFS2_MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE, + GROUP_QUOTA_SYSTEM_INODE }; struct ocfs2_global_disk_dqinfo dinfo; struct mem_dqinfo *info = sb_dqinfo(sb, type); struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; u64 pcount; int status; - /* Read global header */ - gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type], - OCFS2_INVALID_SLOT); - if (!gqinode) { - mlog(ML_ERROR, "failed to get global quota inode (type=%d)\n", - type); - status = -EINVAL; - goto out_err; - } oinfo->dqi_gi.dqi_sb = sb; oinfo->dqi_gi.dqi_type = type; ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo); @@ -359,21 +352,35 @@ int ocfs2_global_read_info(struct super_block *sb, int type) oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops; oinfo->dqi_gqi_bh = NULL; oinfo->dqi_gqi_count = 0; - oinfo->dqi_gqinode = gqinode; + + /* Read global header */ + oinfo->dqi_gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type], + OCFS2_INVALID_SLOT); + if (!oinfo->dqi_gqinode) { + mlog(ML_ERROR, "failed to get global quota inode (type=%d)\n", + type); + status = -EINVAL; + goto out_err; + } + status = ocfs2_lock_global_qf(oinfo, 0); if (status < 0) { mlog_errno(status); goto out_err; } - status = ocfs2_extent_map_get_blocks(gqinode, 0, &oinfo->dqi_giblk, + status = ocfs2_extent_map_get_blocks(oinfo->dqi_gqinode, 0, &oinfo->dqi_giblk, &pcount, NULL); - if (status < 0) + if (status < 0) { + mlog_errno(status); goto out_unlock; + } status = ocfs2_qinfo_lock(oinfo, 0); - if (status < 0) + if (status < 0) { + mlog_errno(status); goto out_unlock; + } status = sb->s_op->quota_read(sb, type, (char *)&dinfo, sizeof(struct ocfs2_global_disk_dqinfo), OCFS2_GLOBAL_INFO_OFF); @@ -401,15 +408,14 @@ int ocfs2_global_read_info(struct super_block *sb, int type) schedule_delayed_work(&oinfo->dqi_sync_work, msecs_to_jiffies(oinfo->dqi_syncms)); -out_err: - return status; + return 0; out_unlock: ocfs2_unlock_global_qf(oinfo, 0); - mlog_errno(status); - goto out_err; +out_err: + return status; } -/* Write information to global quota file. Expects exlusive lock on quota +/* Write information to global quota file. Expects exclusive lock on quota * file inode and quota info */ static int __ocfs2_global_write_info(struct super_block *sb, int type) { @@ -442,13 +448,20 @@ static int __ocfs2_global_write_info(struct super_block *sb, int type) int ocfs2_global_write_info(struct super_block *sb, int type) { int err; - struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv; + struct quota_info *dqopt = sb_dqopt(sb); + struct ocfs2_mem_dqinfo *info = dqopt->info[type].dqi_priv; + unsigned int memalloc; + down_write(&dqopt->dqio_sem); + memalloc = memalloc_nofs_save(); err = ocfs2_qinfo_lock(info, 1); if (err < 0) - return err; + goto out_sem; err = __ocfs2_global_write_info(sb, type); ocfs2_qinfo_unlock(info, 1); +out_sem: + memalloc_nofs_restore(memalloc); + up_write(&dqopt->dqio_sem); return err; } @@ -482,7 +495,7 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing) struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv; struct ocfs2_global_disk_dqblk dqblk; s64 spacechange, inodechange; - time_t olditime, oldbtime; + time64_t olditime, oldbtime; err = sb->s_op->quota_read(sb, type, (char *)&dqblk, sizeof(struct ocfs2_global_disk_dqblk), @@ -499,7 +512,7 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing) /* Update space and inode usage. Get also other information from * global quota file so that we don't overwrite any changes there. * We are */ - spin_lock(&dq_data_lock); + spin_lock(&dquot->dq_dqb_lock); spacechange = dquot->dq_dqb.dqb_curspace - OCFS2_DQUOT(dquot)->dq_origspace; inodechange = dquot->dq_dqb.dqb_curinodes - @@ -555,7 +568,7 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing) __clear_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace; OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes; - spin_unlock(&dq_data_lock); + spin_unlock(&dquot->dq_dqb_lock); err = ocfs2_qinfo_lock(info, freeing); if (err < 0) { mlog(ML_ERROR, "Failed to lock quota info, losing quota write" @@ -594,6 +607,7 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type) struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; struct ocfs2_super *osb = OCFS2_SB(sb); int status = 0; + unsigned int memalloc; trace_ocfs2_sync_dquot_helper(from_kqid(&init_user_ns, dquot->dq_id), dquot->dq_id.type, @@ -610,7 +624,8 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type) mlog_errno(status); goto out_ilock; } - mutex_lock(&sb_dqopt(sb)->dqio_mutex); + down_write(&sb_dqopt(sb)->dqio_sem); + memalloc = memalloc_nofs_save(); status = ocfs2_sync_dquot(dquot); if (status < 0) mlog_errno(status); @@ -618,7 +633,8 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type) status = ocfs2_local_write_dquot(dquot); if (status < 0) mlog_errno(status); - mutex_unlock(&sb_dqopt(sb)->dqio_mutex); + memalloc_nofs_restore(memalloc); + up_write(&sb_dqopt(sb)->dqio_sem); ocfs2_commit_trans(osb, handle); out_ilock: ocfs2_unlock_global_qf(oinfo, 1); @@ -633,7 +649,15 @@ static void qsync_work_fn(struct work_struct *work) dqi_sync_work.work); struct super_block *sb = oinfo->dqi_gqinode->i_sb; - dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type); + /* + * We have to be careful here not to deadlock on s_umount as umount + * disabling quotas may be in progress and it waits for this work to + * complete. If trylock fails, we'll do the sync next time... + */ + if (down_read_trylock(&sb->s_umount)) { + dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type); + up_read(&sb->s_umount); + } schedule_delayed_work(&oinfo->dqi_sync_work, msecs_to_jiffies(oinfo->dqi_syncms)); } @@ -647,6 +671,7 @@ static int ocfs2_write_dquot(struct dquot *dquot) handle_t *handle; struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb); int status = 0; + unsigned int memalloc; trace_ocfs2_write_dquot(from_kqid(&init_user_ns, dquot->dq_id), dquot->dq_id.type); @@ -657,9 +682,11 @@ static int ocfs2_write_dquot(struct dquot *dquot) mlog_errno(status); goto out; } - mutex_lock(&sb_dqopt(dquot->dq_sb)->dqio_mutex); + down_write(&sb_dqopt(dquot->dq_sb)->dqio_sem); + memalloc = memalloc_nofs_save(); status = ocfs2_local_write_dquot(dquot); - mutex_unlock(&sb_dqopt(dquot->dq_sb)->dqio_mutex); + memalloc_nofs_restore(memalloc); + up_write(&sb_dqopt(dquot->dq_sb)->dqio_sem); ocfs2_commit_trans(osb, handle); out: return status; @@ -679,6 +706,27 @@ static int ocfs2_calc_qdel_credits(struct super_block *sb, int type) OCFS2_INODE_UPDATE_CREDITS; } +void ocfs2_drop_dquot_refs(struct work_struct *work) +{ + struct ocfs2_super *osb = container_of(work, struct ocfs2_super, + dquot_drop_work); + struct llist_node *list; + struct ocfs2_dquot *odquot, *next_odquot; + + list = llist_del_all(&osb->dquot_drop_list); + llist_for_each_entry_safe(odquot, next_odquot, list, list) { + /* Drop the reference we acquired in ocfs2_dquot_release() */ + dqput(&odquot->dq_dquot); + } +} + +/* + * Called when the last reference to dquot is dropped. If we are called from + * downconvert thread, we cannot do all the handling here because grabbing + * quota lock could deadlock (the node holding the quota lock could need some + * other cluster lock to proceed but with blocked downconvert thread we cannot + * release any lock). + */ static int ocfs2_release_dquot(struct dquot *dquot) { handle_t *handle; @@ -692,14 +740,32 @@ static int ocfs2_release_dquot(struct dquot *dquot) mutex_lock(&dquot->dq_lock); /* Check whether we are not racing with some other dqget() */ - if (atomic_read(&dquot->dq_count) > 1) + if (dquot_is_busy(dquot)) + goto out; + /* Running from downconvert thread? Postpone quota processing to wq */ + if (current == osb->dc_task) { + /* + * Grab our own reference to dquot and queue it for delayed + * dropping. Quota code rechecks after calling + * ->release_dquot() and won't free dquot structure. + */ + dqgrab(dquot); + /* First entry on list -> queue work */ + if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list)) + queue_work(osb->ocfs2_wq, &osb->dquot_drop_work); goto out; + } status = ocfs2_lock_global_qf(oinfo, 1); if (status < 0) goto out; handle = ocfs2_start_trans(osb, ocfs2_calc_qdel_credits(dquot->dq_sb, dquot->dq_id.type)); if (IS_ERR(handle)) { + /* + * Mark dquot as inactive to avoid endless cycle in + * quota_release_workfn(). + */ + clear_bit(DQ_ACTIVE_B, &dquot->dq_flags); status = PTR_ERR(handle); mlog_errno(status); goto out_ilock; @@ -717,6 +783,12 @@ static int ocfs2_release_dquot(struct dquot *dquot) */ if (status < 0) mlog_errno(status); + /* + * Clear dq_off so that we search for the structure in quota file next + * time we acquire it. The structure might be deleted and reallocated + * elsewhere by another node while our dquot structure is on freelist. + */ + dquot->dq_off = 0; clear_bit(DQ_ACTIVE_B, &dquot->dq_flags); out_trans: ocfs2_commit_trans(osb, handle); @@ -756,16 +828,17 @@ static int ocfs2_acquire_dquot(struct dquot *dquot) status = ocfs2_lock_global_qf(info, 1); if (status < 0) goto out; - if (!test_bit(DQ_READ_B, &dquot->dq_flags)) { - status = ocfs2_qinfo_lock(info, 0); - if (status < 0) - goto out_dq; - status = qtree_read_dquot(&info->dqi_gi, dquot); - ocfs2_qinfo_unlock(info, 0); - if (status < 0) - goto out_dq; - } - set_bit(DQ_READ_B, &dquot->dq_flags); + status = ocfs2_qinfo_lock(info, 0); + if (status < 0) + goto out_dq; + /* + * We always want to read dquot structure from disk because we don't + * know what happened with it while it was on freelist. + */ + status = qtree_read_dquot(&info->dqi_gi, dquot); + ocfs2_qinfo_unlock(info, 0); + if (status < 0) + goto out_dq; OCFS2_DQUOT(dquot)->dq_use_count++; OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace; @@ -778,8 +851,8 @@ static int ocfs2_acquire_dquot(struct dquot *dquot) */ WARN_ON(journal_current_handle()); status = ocfs2_extend_no_holes(gqinode, NULL, - gqinode->i_size + (need_alloc << sb->s_blocksize_bits), - gqinode->i_size); + i_size_read(gqinode) + (need_alloc << sb->s_blocksize_bits), + i_size_read(gqinode)); if (status < 0) goto out_dq; } @@ -818,6 +891,37 @@ out: return status; } +static int ocfs2_get_next_id(struct super_block *sb, struct kqid *qid) +{ + int type = qid->type; + struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv; + int status = 0; + + trace_ocfs2_get_next_id(from_kqid(&init_user_ns, *qid), type); + if (!sb_has_quota_active(sb, type)) { + status = -ESRCH; + goto out; + } + status = ocfs2_lock_global_qf(info, 0); + if (status < 0) + goto out; + status = ocfs2_qinfo_lock(info, 0); + if (status < 0) + goto out_global; + status = qtree_get_next_id(&info->dqi_gi, qid); + ocfs2_qinfo_unlock(info, 0); +out_global: + ocfs2_unlock_global_qf(info, 0); +out: + /* + * Avoid logging ENOENT since it just means there isn't next ID and + * ESRCH which means quota isn't enabled for the filesystem. + */ + if (status && status != -ENOENT && status != -ESRCH) + mlog_errno(status); + return status; +} + static int ocfs2_mark_dquot_dirty(struct dquot *dquot) { unsigned long mask = (1 << (DQ_LASTSET_B + QIF_ILIMITS_B)) | @@ -833,16 +937,17 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot) struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; handle_t *handle; struct ocfs2_super *osb = OCFS2_SB(sb); + unsigned int memalloc; trace_ocfs2_mark_dquot_dirty(from_kqid(&init_user_ns, dquot->dq_id), type); /* In case user set some limits, sync dquot immediately to global * quota file so that information propagates quicker */ - spin_lock(&dq_data_lock); + spin_lock(&dquot->dq_dqb_lock); if (dquot->dq_flags & mask) sync = 1; - spin_unlock(&dq_data_lock); + spin_unlock(&dquot->dq_dqb_lock); /* This is a slight hack but we can't afford getting global quota * lock if we already have a transaction started. */ if (!sync || journal_current_handle()) { @@ -858,7 +963,8 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot) mlog_errno(status); goto out_ilock; } - mutex_lock(&sb_dqopt(sb)->dqio_mutex); + down_write(&sb_dqopt(sb)->dqio_sem); + memalloc = memalloc_nofs_save(); status = ocfs2_sync_dquot(dquot); if (status < 0) { mlog_errno(status); @@ -867,7 +973,8 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot) /* Now write updated local dquot structure */ status = ocfs2_local_write_dquot(dquot); out_dlock: - mutex_unlock(&sb_dqopt(sb)->dqio_mutex); + memalloc_nofs_restore(memalloc); + up_write(&sb_dqopt(sb)->dqio_sem); ocfs2_commit_trans(osb, handle); out_ilock: ocfs2_unlock_global_qf(oinfo, 1); @@ -926,4 +1033,5 @@ const struct dquot_operations ocfs2_quota_operations = { .write_info = ocfs2_write_info, .alloc_dquot = ocfs2_alloc_dquot, .destroy_dquot = ocfs2_destroy_dquot, + .get_next_id = ocfs2_get_next_id, }; diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 27fe7ee4874c..de7f12858729 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Implementation of operations over local quota file */ @@ -73,12 +74,6 @@ static loff_t ol_dqblk_off(struct super_block *sb, int c, int off) ol_dqblk_block_off(sb, c, off); } -/* Compute block number from given offset */ -static inline unsigned int ol_dqblk_file_block(struct super_block *sb, loff_t off) -{ - return off >> sb->s_blocksize_bits; -} - static inline unsigned int ol_dqblk_block_offset(struct super_block *sb, loff_t off) { return off & ((1 << sb->s_blocksize_bits) - 1); @@ -142,15 +137,13 @@ static int ocfs2_read_quota_block(struct inode *inode, u64 v_block, int rc = 0; struct buffer_head *tmp = *bh; - if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) { - ocfs2_error(inode->i_sb, - "Quota file %llu is probably corrupted! Requested " - "to read block %Lu but file has size only %Lu\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - (unsigned long long)v_block, - (unsigned long long)i_size_read(inode)); - return -EIO; - } + if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) + return ocfs2_error(inode->i_sb, + "Quota file %llu is probably corrupted! Requested to read block %Lu but file has size only %Lu\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)v_block, + (unsigned long long)i_size_read(inode)); + rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0, ocfs2_validate_quota_block); if (rc) @@ -166,12 +159,12 @@ static int ocfs2_read_quota_block(struct inode *inode, u64 v_block, /* Check whether we understand format of quota files */ static int ocfs2_local_check_quota_file(struct super_block *sb, int type) { - unsigned int lmagics[MAXQUOTAS] = OCFS2_LOCAL_QMAGICS; - unsigned int lversions[MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS; - unsigned int gmagics[MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS; - unsigned int gversions[MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS; - unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE, - GROUP_QUOTA_SYSTEM_INODE }; + unsigned int lmagics[OCFS2_MAXQUOTAS] = OCFS2_LOCAL_QMAGICS; + unsigned int lversions[OCFS2_MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS; + unsigned int gmagics[OCFS2_MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS; + unsigned int gversions[OCFS2_MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS; + unsigned int ino[OCFS2_MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE, + GROUP_QUOTA_SYSTEM_INODE }; struct buffer_head *bh = NULL; struct inode *linode = sb_dqopt(sb)->files[type]; struct inode *ginode = NULL; @@ -292,7 +285,7 @@ static void olq_update_info(struct buffer_head *bh, void *private) ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + OCFS2_LOCAL_INFO_OFF); spin_lock(&dq_data_lock); - ldinfo->dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK); + ldinfo->dqi_flags = cpu_to_le32(oinfo->dqi_flags); ldinfo->dqi_chunks = cpu_to_le32(oinfo->dqi_chunks); ldinfo->dqi_blocks = cpu_to_le32(oinfo->dqi_blocks); spin_unlock(&dq_data_lock); @@ -336,7 +329,7 @@ void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec) { int type; - for (type = 0; type < MAXQUOTAS; type++) + for (type = 0; type < OCFS2_MAXQUOTAS; type++) free_recovery_list(&(rec->r_list[type])); kfree(rec); } @@ -382,7 +375,7 @@ static struct ocfs2_quota_recovery *ocfs2_alloc_quota_recovery(void) rec = kmalloc(sizeof(struct ocfs2_quota_recovery), GFP_NOFS); if (!rec) return NULL; - for (type = 0; type < MAXQUOTAS; type++) + for (type = 0; type < OCFS2_MAXQUOTAS; type++) INIT_LIST_HEAD(&(rec->r_list[type])); return rec; } @@ -392,10 +385,11 @@ struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery( struct ocfs2_super *osb, int slot_num) { - unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, - OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; - unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, - LOCAL_GROUP_QUOTA_SYSTEM_INODE }; + unsigned int feature[OCFS2_MAXQUOTAS] = { + OCFS2_FEATURE_RO_COMPAT_USRQUOTA, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; + unsigned int ino[OCFS2_MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, + LOCAL_GROUP_QUOTA_SYSTEM_INODE }; struct super_block *sb = osb->sb; struct ocfs2_local_disk_dqinfo *ldinfo; struct inode *lqinode; @@ -412,7 +406,7 @@ struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery( return ERR_PTR(-ENOMEM); /* First init... */ - for (type = 0; type < MAXQUOTAS; type++) { + for (type = 0; type < OCFS2_MAXQUOTAS; type++) { if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) continue; /* At this point, journal of the slot is already replayed so @@ -459,8 +453,7 @@ out: /* Sync changes in local quota file into global quota file and * reinitialize local quota file. - * The function expects local quota file to be already locked and - * dqonoff_mutex locked. */ + * The function expects local quota file to be already locked. */ static int ocfs2_recover_local_quota_file(struct inode *lqinode, int type, struct ocfs2_quota_recovery *rec) @@ -476,6 +469,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, int bit, chunk; struct ocfs2_recovery_chunk *rchunk, *next; qsize_t spacechange, inodechange; + unsigned int memalloc; trace_ocfs2_recover_local_quota_file((unsigned long)lqinode->i_ino, type); @@ -504,8 +498,8 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, dquot = dqget(sb, make_kqid(&init_user_ns, type, le64_to_cpu(dqblk->dqb_id))); - if (!dquot) { - status = -EIO; + if (IS_ERR(dquot)) { + status = PTR_ERR(dquot); mlog(ML_ERROR, "Failed to get quota structure " "for id %u, type %d. Cannot finish quota " "file recovery.\n", @@ -526,8 +520,9 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, mlog_errno(status); goto out_drop_lock; } - mutex_lock(&sb_dqopt(sb)->dqio_mutex); - spin_lock(&dq_data_lock); + down_write(&sb_dqopt(sb)->dqio_sem); + memalloc = memalloc_nofs_save(); + spin_lock(&dquot->dq_dqb_lock); /* Add usage from quota entry into quota changes * of our node. Auxiliary variables are important * due to signedness */ @@ -535,7 +530,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, inodechange = le64_to_cpu(dqblk->dqb_inodemod); dquot->dq_dqb.dqb_curspace += spacechange; dquot->dq_dqb.dqb_curinodes += inodechange; - spin_unlock(&dq_data_lock); + spin_unlock(&dquot->dq_dqb_lock); /* We want to drop reference held by the crashed * node. Since we have our own reference we know * global structure actually won't be freed. */ @@ -559,7 +554,8 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, unlock_buffer(qbh); ocfs2_journal_dirty(handle, qbh); out_commit: - mutex_unlock(&sb_dqopt(sb)->dqio_mutex); + memalloc_nofs_restore(memalloc); + up_write(&sb_dqopt(sb)->dqio_sem); ocfs2_commit_trans(OCFS2_SB(sb), handle); out_drop_lock: ocfs2_unlock_global_qf(oinfo, 1); @@ -589,9 +585,8 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, struct ocfs2_quota_recovery *rec, int slot_num) { - unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, - LOCAL_GROUP_QUOTA_SYSTEM_INODE }; - struct super_block *sb = osb->sb; + unsigned int ino[OCFS2_MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, + LOCAL_GROUP_QUOTA_SYSTEM_INODE }; struct ocfs2_local_disk_dqinfo *ldinfo; struct buffer_head *bh; handle_t *handle; @@ -603,8 +598,7 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, printk(KERN_NOTICE "ocfs2: Finishing quota recovery on device (%s) for " "slot %u\n", osb->dev_str, slot_num); - mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); - for (type = 0; type < MAXQUOTAS; type++) { + for (type = 0; type < OCFS2_MAXQUOTAS; type++) { if (list_empty(&(rec->r_list[type]))) continue; trace_ocfs2_finish_quota_recovery(slot_num); @@ -680,8 +674,7 @@ out_put: break; } out: - mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); - kfree(rec); + ocfs2_free_quota_recovery(rec); return status; } @@ -695,17 +688,15 @@ static int ocfs2_local_read_info(struct super_block *sb, int type) int status; struct buffer_head *bh = NULL; struct ocfs2_quota_recovery *rec; - int locked = 0; + int locked = 0, global_read = 0; - /* We don't need the lock and we have to acquire quota file locks - * which will later depend on this lock */ - mutex_unlock(&sb_dqopt(sb)->dqio_mutex); - info->dqi_maxblimit = 0x7fffffffffffffffLL; - info->dqi_maxilimit = 0x7fffffffffffffffLL; + info->dqi_max_spc_limit = 0x7fffffffffffffffLL; + info->dqi_max_ino_limit = 0x7fffffffffffffffLL; oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS); if (!oinfo) { mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota" " info."); + status = -ENOMEM; goto out_err; } info->dqi_priv = oinfo; @@ -718,6 +709,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type) status = ocfs2_global_read_info(sb, type); if (status < 0) goto out_err; + global_read = 1; status = ocfs2_inode_lock(lqinode, &oinfo->dqi_lqi_bh, 1); if (status < 0) { @@ -736,13 +728,13 @@ static int ocfs2_local_read_info(struct super_block *sb, int type) } ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + OCFS2_LOCAL_INFO_OFF); - info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags); + oinfo->dqi_flags = le32_to_cpu(ldinfo->dqi_flags); oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks); oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks); oinfo->dqi_libh = bh; /* We crashed when using local quota file? */ - if (!(info->dqi_flags & OLQF_CLEAN)) { + if (!(oinfo->dqi_flags & OLQF_CLEAN)) { rec = OCFS2_SB(sb)->quota_rec; if (!rec) { rec = ocfs2_alloc_quota_recovery(); @@ -771,14 +763,13 @@ static int ocfs2_local_read_info(struct super_block *sb, int type) } /* Now mark quota file as used */ - info->dqi_flags &= ~OLQF_CLEAN; + oinfo->dqi_flags &= ~OLQF_CLEAN; status = ocfs2_modify_bh(lqinode, bh, olq_update_info, info); if (status < 0) { mlog_errno(status); goto out_err; } - mutex_lock(&sb_dqopt(sb)->dqio_mutex); return 0; out_err: if (oinfo) { @@ -789,11 +780,12 @@ out_err: if (locked) ocfs2_inode_unlock(lqinode, 1); ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk); + if (global_read) + cancel_delayed_work_sync(&oinfo->dqi_sync_work); kfree(oinfo); } brelse(bh); - mutex_lock(&sb_dqopt(sb)->dqio_mutex); - return -1; + return status; } /* Write local info to quota file */ @@ -822,7 +814,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type) struct ocfs2_quota_chunk *chunk; struct ocfs2_local_disk_chunk *dchunk; int mark_clean = 1, len; - int status; + int status = 0; iput(oinfo->dqi_gqinode); ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock); @@ -846,7 +838,9 @@ static int ocfs2_local_free_info(struct super_block *sb, int type) } ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk); - /* dqonoff_mutex protects us against racing with recovery thread... */ + /* + * ocfs2_dismount_volume() has already aborted quota recovery... + */ if (oinfo->dqi_rec) { ocfs2_free_quota_recovery(oinfo->dqi_rec); mark_clean = 0; @@ -856,22 +850,20 @@ static int ocfs2_local_free_info(struct super_block *sb, int type) goto out; /* Mark local file as clean */ - info->dqi_flags |= OLQF_CLEAN; + oinfo->dqi_flags |= OLQF_CLEAN; status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], oinfo->dqi_libh, olq_update_info, info); - if (status < 0) { + if (status < 0) mlog_errno(status); - goto out; - } - out: ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1); brelse(oinfo->dqi_libh); brelse(oinfo->dqi_lqi_bh); kfree(oinfo); - return 0; + info->dqi_priv = NULL; + return status; } static void olq_set_dquot(struct buffer_head *bh, void *private) @@ -885,12 +877,12 @@ static void olq_set_dquot(struct buffer_head *bh, void *private) dqblk->dqb_id = cpu_to_le64(from_kqid(&init_user_ns, od->dq_dquot.dq_id)); - spin_lock(&dq_data_lock); + spin_lock(&od->dq_dquot.dq_dqb_lock); dqblk->dqb_spacemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curspace - od->dq_origspace); dqblk->dqb_inodemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curinodes - od->dq_originodes); - spin_unlock(&dq_data_lock); + spin_unlock(&od->dq_dquot.dq_dqb_lock); trace_olq_set_dquot( (unsigned long long)le64_to_cpu(dqblk->dqb_spacemod), (unsigned long long)le64_to_cpu(dqblk->dqb_inodemod), @@ -929,19 +921,19 @@ static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb, { struct mem_dqinfo *info = sb_dqinfo(sb, type); struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; - struct ocfs2_quota_chunk *chunk; + struct ocfs2_quota_chunk *chunk = NULL, *iter; struct ocfs2_local_disk_chunk *dchunk; int found = 0, len; - list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) { + list_for_each_entry(iter, &oinfo->dqi_chunk, qc_chunk) { dchunk = (struct ocfs2_local_disk_chunk *) - chunk->qc_headerbh->b_data; + iter->qc_headerbh->b_data; if (le32_to_cpu(dchunk->dqc_free) > 0) { - found = 1; + chunk = iter; break; } } - if (!found) + if (!chunk) return NULL; if (chunk->qc_num < oinfo->dqi_chunks - 1) { @@ -982,14 +974,14 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( /* We are protected by dqio_sem so no locking needed */ status = ocfs2_extend_no_holes(lqinode, NULL, - lqinode->i_size + 2 * sb->s_blocksize, - lqinode->i_size); + i_size_read(lqinode) + 2 * sb->s_blocksize, + i_size_read(lqinode)); if (status < 0) { mlog_errno(status); goto out; } status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh, - lqinode->i_size + 2 * sb->s_blocksize); + i_size_read(lqinode) + 2 * sb->s_blocksize); if (status < 0) { mlog_errno(status); goto out; @@ -1125,14 +1117,14 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( /* We are protected by dqio_sem so no locking needed */ status = ocfs2_extend_no_holes(lqinode, NULL, - lqinode->i_size + sb->s_blocksize, - lqinode->i_size); + i_size_read(lqinode) + sb->s_blocksize, + i_size_read(lqinode)); if (status < 0) { mlog_errno(status); goto out; } status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh, - lqinode->i_size + sb->s_blocksize); + i_size_read(lqinode) + sb->s_blocksize); if (status < 0) { mlog_errno(status); goto out; @@ -1251,6 +1243,10 @@ int ocfs2_create_local_dquot(struct dquot *dquot) &od->dq_local_phys_blk, &pcount, NULL); + if (status < 0) { + mlog_errno(status); + goto out; + } /* Initialize dquot structure on disk */ status = ocfs2_local_write_dquot(dquot); @@ -1303,10 +1299,6 @@ int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot) ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh); out: - /* Clear the read bit so that next time someone uses this - * dquot he reads fresh info from disk and allocates local - * dquot structure */ - clear_bit(DQ_READ_B, &dquot->dq_flags); return status; } diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 998b17eda09d..c92e0ea85bca 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -1,18 +1,8 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-only +/* * refcounttree.c * * Copyright (C) 2009 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include <linux/sort.h> @@ -34,6 +24,8 @@ #include "xattr.h" #include "namei.h" #include "ocfs2_trace.h" +#include "file.h" +#include "symlink.h" #include <linux/bio.h> #include <linux/blkdev.h> @@ -42,14 +34,15 @@ #include <linux/pagevec.h> #include <linux/swap.h> #include <linux/security.h> +#include <linux/string.h> #include <linux/fsnotify.h> #include <linux/quotaops.h> #include <linux/namei.h> #include <linux/mount.h> +#include <linux/posix_acl.h> struct ocfs2_cow_context { struct inode *inode; - struct file *file; u32 cow_start; u32 cow_len; struct ocfs2_extent_tree data_et; @@ -66,7 +59,7 @@ struct ocfs2_cow_context { u32 *num_clusters, unsigned int *extent_flags); int (*cow_duplicate_clusters)(handle_t *handle, - struct file *file, + struct inode *inode, u32 cpos, u32 old_cluster, u32 new_cluster, u32 new_len); }; @@ -102,32 +95,30 @@ static int ocfs2_validate_refcount_block(struct super_block *sb, if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) { - ocfs2_error(sb, - "Refcount block #%llu has bad signature %.*s", - (unsigned long long)bh->b_blocknr, 7, - rb->rf_signature); - return -EINVAL; + rc = ocfs2_error(sb, + "Refcount block #%llu has bad signature %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + rb->rf_signature); + goto out; } if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) { - ocfs2_error(sb, - "Refcount block #%llu has an invalid rf_blkno " - "of %llu", - (unsigned long long)bh->b_blocknr, - (unsigned long long)le64_to_cpu(rb->rf_blkno)); - return -EINVAL; + rc = ocfs2_error(sb, + "Refcount block #%llu has an invalid rf_blkno of %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(rb->rf_blkno)); + goto out; } if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) { - ocfs2_error(sb, - "Refcount block #%llu has an invalid " - "rf_fs_generation of #%u", - (unsigned long long)bh->b_blocknr, - le32_to_cpu(rb->rf_fs_generation)); - return -EINVAL; + rc = ocfs2_error(sb, + "Refcount block #%llu has an invalid rf_fs_generation of #%u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(rb->rf_fs_generation)); + goto out; } - - return 0; +out: + return rc; } static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci, @@ -163,6 +154,7 @@ ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci) } static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci) +__acquires(&rf->rf_lock) { struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); @@ -170,6 +162,7 @@ static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci) } static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci) +__releases(&rf->rf_lock) { struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); @@ -412,7 +405,7 @@ static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno) goto out; } - BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!ocfs2_is_refcount_inode(inode)); di = (struct ocfs2_dinode *)di_bh->b_data; *ref_blkno = le64_to_cpu(di->i_refcount_loc); @@ -480,7 +473,6 @@ again: if (ret) { mlog_errno(ret); ocfs2_unlock_refcount_tree(osb, tree, rw); - ocfs2_refcount_tree_put(tree); goto out; } @@ -572,10 +564,10 @@ static int ocfs2_create_refcount_tree(struct inode *inode, u32 num_got; u64 suballoc_loc, first_blkno; - BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); + BUG_ON(ocfs2_is_refcount_inode(inode)); trace_ocfs2_create_refcount_tree( - (unsigned long long)OCFS2_I(inode)->ip_blkno); + (unsigned long long)oi->ip_blkno); ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); if (ret) { @@ -613,6 +605,11 @@ static int ocfs2_create_refcount_tree(struct inode *inode, } new_bh = sb_getblk(inode->i_sb, first_blkno); + if (!new_bh) { + ret = -ENOMEM; + mlog_errno(ret); + goto out_commit; + } ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh); ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh, @@ -625,7 +622,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode, /* Initialize ocfs2_refcount_block. */ rb = (struct ocfs2_refcount_block *)new_bh->b_data; memset(rb, 0, inode->i_sb->s_blocksize); - strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); + strscpy(rb->rf_signature, OCFS2_REFCOUNT_BLOCK_SIGNATURE); rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc); rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); @@ -635,7 +632,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode, rb->rf_records.rl_count = cpu_to_le16(ocfs2_refcount_recs_per_rb(osb->sb)); spin_lock(&osb->osb_lock); - rb->rf_generation = osb->s_next_generation++; + rb->rf_generation = cpu_to_le32(osb->s_next_generation++); spin_unlock(&osb->osb_lock); ocfs2_journal_dirty(handle, new_bh); @@ -705,7 +702,7 @@ static int ocfs2_set_refcount_tree(struct inode *inode, struct ocfs2_refcount_block *rb; struct ocfs2_refcount_tree *ref_tree; - BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); + BUG_ON(ocfs2_is_refcount_inode(inode)); ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, &ref_tree, &ref_root_bh); @@ -772,7 +769,7 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh) u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc); u16 bit = 0; - if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) + if (!ocfs2_is_refcount_inode(inode)) return 0; BUG_ON(!ref_blkno); @@ -804,7 +801,7 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh) mlog_errno(ret); goto out; } - mutex_lock(&alloc_inode->i_mutex); + inode_lock(alloc_inode); ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1); if (ret) { @@ -864,7 +861,7 @@ out_unlock: } out_mutex: if (alloc_inode) { - mutex_unlock(&alloc_inode->i_mutex); + inode_unlock(alloc_inode); iput(alloc_inode); } out: @@ -981,7 +978,7 @@ static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci, return 0; } - if (!eb || (eb && !eb->h_next_leaf_blk)) { + if (!eb || !eb->h_next_leaf_blk) { /* * We are the last extent rec, so any high cpos should * be stored in this leaf refcount block. @@ -1066,7 +1063,7 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci, struct buffer_head **ret_bh) { int ret = 0, i, found; - u32 low_cpos, uninitialized_var(cpos_end); + u32 low_cpos, cpos_end; struct ocfs2_extent_list *el; struct ocfs2_extent_rec *rec = NULL; struct ocfs2_extent_block *eb = NULL; @@ -1097,12 +1094,10 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci, el = &eb->h_list; if (el->l_tree_depth) { - ocfs2_error(sb, - "refcount tree %llu has non zero tree " - "depth in leaf btree tree block %llu\n", - (unsigned long long)ocfs2_metadata_cache_owner(ci), - (unsigned long long)eb_bh->b_blocknr); - ret = -EROFS; + ret = ocfs2_error(sb, + "refcount tree %llu has non zero tree depth in leaf btree tree block %llu\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)eb_bh->b_blocknr); goto out; } } @@ -1311,7 +1306,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle, new_bh = sb_getblk(sb, blkno); if (new_bh == NULL) { - ret = -EIO; + ret = -ENOMEM; mlog_errno(ret); goto out; } @@ -1399,16 +1394,6 @@ static int cmp_refcount_rec_by_cpos(const void *a, const void *b) return 0; } -static void swap_refcount_rec(void *a, void *b, int size) -{ - struct ocfs2_refcount_rec *l = a, *r = b, tmp; - - tmp = *(struct ocfs2_refcount_rec *)l; - *(struct ocfs2_refcount_rec *)l = - *(struct ocfs2_refcount_rec *)r; - *(struct ocfs2_refcount_rec *)r = tmp; -} - /* * The refcount cpos are ordered by their 64bit cpos, * But we will use the low 32 bit to be the e_cpos in the b-tree. @@ -1484,7 +1469,7 @@ static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh, */ sort(&rl->rl_recs, le16_to_cpu(rl->rl_used), sizeof(struct ocfs2_refcount_rec), - cmp_refcount_rec_by_low_cpos, swap_refcount_rec); + cmp_refcount_rec_by_low_cpos, NULL); ret = ocfs2_find_refcount_split_pos(rl, &cpos, &split_index); if (ret) { @@ -1509,11 +1494,11 @@ static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh, sort(&rl->rl_recs, le16_to_cpu(rl->rl_used), sizeof(struct ocfs2_refcount_rec), - cmp_refcount_rec_by_cpos, swap_refcount_rec); + cmp_refcount_rec_by_cpos, NULL); sort(&new_rl->rl_recs, le16_to_cpu(new_rl->rl_used), sizeof(struct ocfs2_refcount_rec), - cmp_refcount_rec_by_cpos, swap_refcount_rec); + cmp_refcount_rec_by_cpos, NULL); *split_cpos = cpos; return 0; @@ -1562,7 +1547,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle, new_bh = sb_getblk(sb, blkno); if (new_bh == NULL) { - ret = -EIO; + ret = -ENOMEM; mlog_errno(ret); goto out; } @@ -1578,7 +1563,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle, /* Initialize ocfs2_refcount_block. */ new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; memset(new_rb, 0, sb->s_blocksize); - strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); + strscpy(new_rb->rf_signature, OCFS2_REFCOUNT_BLOCK_SIGNATURE); new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc); new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); @@ -2301,11 +2286,10 @@ int ocfs2_decrease_refcount(struct inode *inode, { int ret; u64 ref_blkno; - struct ocfs2_inode_info *oi = OCFS2_I(inode); struct buffer_head *ref_root_bh = NULL; struct ocfs2_refcount_tree *tree; - BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!ocfs2_is_refcount_inode(inode)); ret = ocfs2_get_refcount_block(inode, &ref_blkno); if (ret) { @@ -2357,10 +2341,8 @@ static int ocfs2_mark_extent_refcounted(struct inode *inode, cpos, len, phys); if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { - ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " - "tree, but the feature bit is not set in the " - "super block.", inode->i_ino); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", + inode->i_ino); goto out; } @@ -2424,8 +2406,6 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb, get_bh(prev_bh); } - rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; - trace_ocfs2_calc_refcount_meta_credits_iterate( recs_add, (unsigned long long)cpos, clusters, (unsigned long long)le64_to_cpu(rec.r_cpos), @@ -2441,7 +2421,7 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb, * * If we will insert a new one, this is easy and only happens * during adding refcounted flag to the extent, so we don't - * have a chance of spliting. We just need one record. + * have a chance of splitting. We just need one record. * * If the refcount rec already exists, that would be a little * complicated. we may have to: @@ -2503,8 +2483,7 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb, ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); *meta_add += ocfs2_extend_meta_needed(et.et_root_el); *credits += ocfs2_calc_extend_credits(sb, - et.et_root_el, - ref_blocks); + et.et_root_el); } else { *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS; *meta_add += 1; @@ -2540,20 +2519,17 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, int *ref_blocks) { int ret; - struct ocfs2_inode_info *oi = OCFS2_I(inode); struct buffer_head *ref_root_bh = NULL; struct ocfs2_refcount_tree *tree; u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno); if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { - ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " - "tree, but the feature bit is not set in the " - "super block.", inode->i_ino); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", + inode->i_ino); goto out; } - BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!ocfs2_is_refcount_inode(inode)); ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), refcount_loc, &tree); @@ -2633,13 +2609,13 @@ static inline unsigned int ocfs2_cow_align_length(struct super_block *sb, } /* - * Calculate out the start and number of virtual clusters we need to to CoW. + * Calculate out the start and number of virtual clusters we need to CoW. * - * cpos is vitual start cluster position we want to do CoW in a + * cpos is virtual start cluster position we want to do CoW in a * file and write_len is the cluster length. * max_cpos is the place where we want to stop CoW intentionally. * - * Normal we will start CoW from the beginning of extent record cotaining cpos. + * Normal we will start CoW from the beginning of extent record containing cpos. * We try to break up extents on boundaries of MAX_CONTIG_BYTES so that we * get good I/O from the resulting extent tree. */ @@ -2673,11 +2649,10 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode, el = &eb->h_list; if (el->l_tree_depth) { - ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "leaf block %llu\n", inode->i_ino, - (unsigned long long)eb_bh->b_blocknr); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in leaf block %llu\n", + inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); goto out; } } @@ -2863,7 +2838,7 @@ static int ocfs2_lock_refcount_allocators(struct super_block *sb, int *credits) { int ret = 0, meta_add = 0; - int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et); + int num_free_extents = ocfs2_num_free_extents(et); if (num_free_extents < 0) { ret = num_free_extents; @@ -2875,8 +2850,7 @@ static int ocfs2_lock_refcount_allocators(struct super_block *sb, meta_add = ocfs2_extend_meta_needed(et->et_root_el); - *credits += ocfs2_calc_extend_credits(sb, et->et_root_el, - num_clusters + 2); + *credits += ocfs2_calc_extend_credits(sb, et->et_root_el); ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh, p_cluster, num_clusters, @@ -2922,27 +2896,21 @@ static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh) } int ocfs2_duplicate_clusters_by_page(handle_t *handle, - struct file *file, + struct inode *inode, u32 cpos, u32 old_cluster, u32 new_cluster, u32 new_len) { int ret = 0, partial; - struct inode *inode = file_inode(file); - struct ocfs2_caching_info *ci = INODE_CACHE(inode); - struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct super_block *sb = inode->i_sb; u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); - struct page *page; pgoff_t page_index; - unsigned int from, to, readahead_pages; + unsigned int from, to; loff_t offset, end, map_end; struct address_space *mapping = inode->i_mapping; trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster, new_cluster, new_len); - readahead_pages = - (ocfs2_cow_contig_clusters(sb) << - OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT; offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); /* @@ -2953,44 +2921,53 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, end = i_size_read(inode); while (offset < end) { - page_index = offset >> PAGE_CACHE_SHIFT; - map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT; + struct folio *folio; + page_index = offset >> PAGE_SHIFT; + map_end = ((loff_t)page_index + 1) << PAGE_SHIFT; if (map_end > end) map_end = end; /* from, to is the offset within the page. */ - from = offset & (PAGE_CACHE_SIZE - 1); - to = PAGE_CACHE_SIZE; - if (map_end & (PAGE_CACHE_SIZE - 1)) - to = map_end & (PAGE_CACHE_SIZE - 1); - - page = find_or_create_page(mapping, page_index, GFP_NOFS); + from = offset & (PAGE_SIZE - 1); + to = PAGE_SIZE; + if (map_end & (PAGE_SIZE - 1)) + to = map_end & (PAGE_SIZE - 1); + +retry: + folio = __filemap_get_folio(mapping, page_index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_NOFS); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); + mlog_errno(ret); + break; + } /* - * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page - * can't be dirtied before we CoW it out. + * In case PAGE_SIZE <= CLUSTER_SIZE, we do not expect a dirty + * page, so write it back. */ - if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) - BUG_ON(PageDirty(page)); - - if (PageReadahead(page)) { - page_cache_async_readahead(mapping, - &file->f_ra, file, - page, page_index, - readahead_pages); + if (PAGE_SIZE <= OCFS2_SB(sb)->s_clustersize) { + if (folio_test_dirty(folio)) { + folio_unlock(folio); + folio_put(folio); + + ret = filemap_write_and_wait_range(mapping, + offset, map_end - 1); + goto retry; + } } - if (!PageUptodate(page)) { - ret = block_read_full_page(page, ocfs2_get_block); + if (!folio_test_uptodate(folio)) { + ret = block_read_full_folio(folio, ocfs2_get_block); if (ret) { mlog_errno(ret); goto unlock; } - lock_page(page); + folio_lock(folio); } - if (page_has_buffers(page)) { - ret = walk_page_buffers(handle, page_buffers(page), + if (folio_buffers(folio)) { + ret = walk_page_buffers(handle, folio_buffers(folio), from, to, &partial, ocfs2_clear_cow_buffer); if (ret) { @@ -2999,13 +2976,12 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, } } - ocfs2_map_and_dirty_page(inode, handle, from, to, - page, 0, &new_block); - mark_page_accessed(page); + ocfs2_map_and_dirty_folio(inode, handle, from, to, + folio, 0, &new_block); + folio_mark_accessed(folio); unlock: - unlock_page(page); - page_cache_release(page); - page = NULL; + folio_unlock(folio); + folio_put(folio); offset = map_end; if (ret) break; @@ -3015,12 +2991,11 @@ unlock: } int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, - struct file *file, + struct inode *inode, u32 cpos, u32 old_cluster, u32 new_cluster, u32 new_len) { int ret = 0; - struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct ocfs2_caching_info *ci = INODE_CACHE(inode); int i, blocks = ocfs2_clusters_to_blocks(sb, new_len); @@ -3036,7 +3011,7 @@ int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, for (i = 0; i < blocks; i++, old_block++, new_block++) { new_bh = sb_getblk(osb->sb, new_block); if (new_bh == NULL) { - ret = -EIO; + ret = -ENOMEM; mlog_errno(ret); break; } @@ -3111,12 +3086,10 @@ static int ocfs2_clear_ext_refcount(handle_t *handle, el = path_leaf_el(path); index = ocfs2_search_extent_list(el, cpos); - if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { - ocfs2_error(sb, - "Inode %llu has an extent at cpos %u which can no " - "longer be found.\n", - (unsigned long long)ino, cpos); - ret = -EROFS; + if (index == -1) { + ret = ocfs2_error(sb, + "Inode %llu has an extent at cpos %u which can no longer be found\n", + (unsigned long long)ino, cpos); goto out; } @@ -3145,7 +3118,7 @@ static int ocfs2_replace_clusters(handle_t *handle, /*If the old clusters is unwritten, no need to duplicate. */ if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) { - ret = context->cow_duplicate_clusters(handle, context->file, + ret = context->cow_duplicate_clusters(handle, context->inode, cpos, old, new, len); if (ret) { mlog_errno(ret); @@ -3166,48 +3139,18 @@ int ocfs2_cow_sync_writeback(struct super_block *sb, struct inode *inode, u32 cpos, u32 num_clusters) { - int ret = 0; - loff_t offset, end, map_end; - pgoff_t page_index; - struct page *page; + int ret; + loff_t start, end; if (ocfs2_should_order_data(inode)) return 0; - offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; - end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits); + start = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; + end = start + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits) - 1; - ret = filemap_fdatawrite_range(inode->i_mapping, - offset, end - 1); - if (ret < 0) { + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret < 0) mlog_errno(ret); - return ret; - } - - while (offset < end) { - page_index = offset >> PAGE_CACHE_SHIFT; - map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT; - if (map_end > end) - map_end = end; - - page = find_or_create_page(inode->i_mapping, - page_index, GFP_NOFS); - BUG_ON(!page); - - wait_on_page_writeback(page); - if (PageError(page)) { - ret = -EIO; - mlog_errno(ret); - } else - mark_page_accessed(page); - - unlock_page(page); - page_cache_release(page); - page = NULL; - offset = map_end; - if (ret) - break; - } return ret; } @@ -3381,11 +3324,9 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context) unsigned int ext_flags; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { - ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " - "tree, but the feature bit is not set in the " - "super block.", inode->i_ino); - return -EROFS; + if (!ocfs2_refcount_tree(osb)) { + return ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", + inode->i_ino); } ocfs2_init_dealloc_ctxt(&context->dealloc); @@ -3423,48 +3364,24 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context) return ret; } -static void ocfs2_readahead_for_cow(struct inode *inode, - struct file *file, - u32 start, u32 len) -{ - struct address_space *mapping; - pgoff_t index; - unsigned long num_pages; - int cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits; - - if (!file) - return; - - mapping = file->f_mapping; - num_pages = (len << cs_bits) >> PAGE_CACHE_SHIFT; - if (!num_pages) - num_pages = 1; - - index = ((loff_t)start << cs_bits) >> PAGE_CACHE_SHIFT; - page_cache_sync_readahead(mapping, &file->f_ra, file, - index, num_pages); -} - /* * Starting at cpos, try to CoW write_len clusters. Don't CoW * past max_cpos. This will stop when it runs into a hole or an * unrefcounted extent. */ static int ocfs2_refcount_cow_hunk(struct inode *inode, - struct file *file, struct buffer_head *di_bh, u32 cpos, u32 write_len, u32 max_cpos) { int ret; u32 cow_start = 0, cow_len = 0; - struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct buffer_head *ref_root_bh = NULL; struct ocfs2_refcount_tree *ref_tree; struct ocfs2_cow_context *context = NULL; - BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!ocfs2_is_refcount_inode(inode)); ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list, cpos, write_len, max_cpos, @@ -3480,8 +3397,6 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode, BUG_ON(cow_len == 0); - ocfs2_readahead_for_cow(inode, file, cow_start, cow_len); - context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS); if (!context) { ret = -ENOMEM; @@ -3503,7 +3418,6 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode, context->ref_root_bh = ref_root_bh; context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page; context->get_clusters = ocfs2_di_get_clusters; - context->file = file; ocfs2_init_dinode_extent_tree(&context->data_et, INODE_CACHE(inode), di_bh); @@ -3532,7 +3446,6 @@ out: * clusters between cpos and cpos+write_len are safe to modify. */ int ocfs2_refcount_cow(struct inode *inode, - struct file *file, struct buffer_head *di_bh, u32 cpos, u32 write_len, u32 max_cpos) { @@ -3552,7 +3465,7 @@ int ocfs2_refcount_cow(struct inode *inode, num_clusters = write_len; if (ext_flags & OCFS2_EXT_REFCOUNTED) { - ret = ocfs2_refcount_cow_hunk(inode, file, di_bh, cpos, + ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos, num_clusters, max_cpos); if (ret) { mlog_errno(ret); @@ -3657,8 +3570,7 @@ int ocfs2_refcounted_xattr_delete_need(struct inode *inode, ocfs2_init_refcount_extent_tree(&et, ref_ci, ref_root_bh); *credits += ocfs2_calc_extend_credits(inode->i_sb, - et.et_root_el, - ref_blocks); + et.et_root_el); } out: @@ -3679,11 +3591,10 @@ int ocfs2_refcount_cow_xattr(struct inode *inode, { int ret; struct ocfs2_xattr_value_root *xv = vb->vb_xv; - struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_cow_context *context = NULL; u32 cow_start, cow_len; - BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!ocfs2_is_refcount_inode(inode)); ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list, cpos, write_len, UINT_MAX, @@ -3746,6 +3657,9 @@ int ocfs2_add_refcount_flag(struct inode *inode, struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_alloc_context *meta_ac = NULL; + /* We need to be able to handle at least an extent tree split. */ + ref_blocks = ocfs2_extend_meta_needed(data_et->et_root_el); + ret = ocfs2_calc_refcount_meta_credits(inode->i_sb, ref_ci, ref_root_bh, p_cluster, num_clusters, @@ -3758,7 +3672,7 @@ int ocfs2_add_refcount_flag(struct inode *inode, trace_ocfs2_add_refcount_flag(ref_blocks, credits); if (ref_blocks) { - ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb), + ret = ocfs2_reserve_new_metadata_blocks(osb, ref_blocks, &meta_ac); if (ret) { mlog_errno(ret); @@ -3828,9 +3742,9 @@ static int ocfs2_change_ctime(struct inode *inode, goto out_commit; } - inode->i_ctime = CURRENT_TIME; - di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + inode_set_ctime_current(inode); + di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); + di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); ocfs2_journal_dirty(handle, di_bh); @@ -3857,7 +3771,7 @@ static int ocfs2_attach_refcount_tree(struct inode *inode, ocfs2_init_dealloc_ctxt(&dealloc); - if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) { + if (!ocfs2_is_refcount_inode(inode)) { ret = ocfs2_create_refcount_tree(inode, di_bh); if (ret) { mlog_errno(ret); @@ -3886,7 +3800,10 @@ static int ocfs2_attach_refcount_tree(struct inode *inode, while (cpos < clusters) { ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters, &ext_flags); - + if (ret) { + mlog_errno(ret); + goto unlock; + } if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) { ret = ocfs2_add_refcount_flag(inode, &di_et, &ref_tree->rf_ci, @@ -3981,6 +3898,13 @@ static int ocfs2_add_refcounted_extent(struct inode *inode, ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh, p_cluster, num_clusters, meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, num_clusters)); if (ret) mlog_errno(ret); @@ -4057,7 +3981,10 @@ static int ocfs2_duplicate_extent_list(struct inode *s_inode, while (cpos < clusters) { ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster, &num_clusters, &ext_flags); - + if (ret) { + mlog_errno(ret); + goto out; + } if (p_cluster) { ret = ocfs2_add_refcounted_extent(t_inode, &et, ref_ci, ref_root_bh, @@ -4138,12 +4065,12 @@ static int ocfs2_complete_reflink(struct inode *s_inode, * we want mtime to appear identical to the source and * update ctime. */ - t_inode->i_ctime = CURRENT_TIME; + inode_set_ctime_current(t_inode); - di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec); - di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec); + di->i_ctime = cpu_to_le64(inode_get_ctime_sec(t_inode)); + di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(t_inode)); - t_inode->i_mtime = s_inode->i_mtime; + inode_set_mtime_to_ts(t_inode, inode_get_mtime(s_inode)); di->i_mtime = s_di->i_mtime; di->i_mtime_nsec = s_di->i_mtime_nsec; } @@ -4165,7 +4092,6 @@ static int ocfs2_create_reflink_node(struct inode *s_inode, struct buffer_head *ref_root_bh = NULL; struct ocfs2_cached_dealloc_ctxt dealloc; struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb); - struct ocfs2_refcount_block *rb; struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data; struct ocfs2_refcount_tree *ref_tree; @@ -4192,7 +4118,6 @@ static int ocfs2_create_reflink_node(struct inode *s_inode, mlog_errno(ret); goto out; } - rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh, &ref_tree->rf_ci, ref_root_bh, @@ -4220,10 +4145,11 @@ static int __ocfs2_reflink(struct dentry *old_dentry, bool preserve) { int ret; - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); struct buffer_head *new_bh = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(inode); - if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) { + if (oi->ip_flags & OCFS2_INODE_SYSTEM_FILE) { ret = -EINVAL; mlog_errno(ret); goto out; @@ -4241,7 +4167,7 @@ static int __ocfs2_reflink(struct dentry *old_dentry, goto out; } - mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(new_inode, I_MUTEX_CHILD); ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1, OI_LS_REFLINK_TARGET); if (ret) { @@ -4249,6 +4175,26 @@ static int __ocfs2_reflink(struct dentry *old_dentry, goto out_unlock; } + if ((oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) && + (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) { + /* + * Adjust extent record count to reserve space for extended attribute. + * Inline data count had been adjusted in ocfs2_duplicate_inline_data(). + */ + struct ocfs2_inode_info *new_oi = OCFS2_I(new_inode); + + if (!(new_oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) && + !(ocfs2_inode_is_fast_symlink(new_inode))) { + struct ocfs2_dinode *new_di = (struct ocfs2_dinode *)new_bh->b_data; + struct ocfs2_dinode *old_di = (struct ocfs2_dinode *)old_bh->b_data; + struct ocfs2_extent_list *el = &new_di->id2.i_list; + int inline_size = le16_to_cpu(old_di->i_xattr_inline_size); + + le16_add_cpu(&el->l_count, -(inline_size / + sizeof(struct ocfs2_extent_rec))); + } + } + ret = ocfs2_create_reflink_node(inode, old_bh, new_inode, new_bh, preserve); if (ret) { @@ -4256,7 +4202,7 @@ static int __ocfs2_reflink(struct dentry *old_dentry, goto inode_unlock; } - if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_XATTR_FL) { + if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) { ret = ocfs2_reflink_xattrs(inode, old_bh, new_inode, new_bh, preserve); @@ -4275,7 +4221,7 @@ inode_unlock: ocfs2_inode_unlock(new_inode, 1); brelse(new_bh); out_unlock: - mutex_unlock(&new_inode->i_mutex); + inode_unlock(new_inode); out: if (!ret) { ret = filemap_fdatawait(inode->i_mapping); @@ -4288,14 +4234,16 @@ out: static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, bool preserve) { - int error; - struct inode *inode = old_dentry->d_inode; + int error, had_lock; + struct inode *inode = d_inode(old_dentry); struct buffer_head *old_bh = NULL; struct inode *new_orphan_inode = NULL; + struct ocfs2_lock_holder oh; if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) return -EOPNOTSUPP; + error = ocfs2_create_inode_in_orphan(dir, inode->i_mode, &new_orphan_inode); if (error) { @@ -4303,9 +4251,16 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, goto out; } + error = ocfs2_rw_lock(inode, 1); + if (error) { + mlog_errno(error); + goto out; + } + error = ocfs2_inode_lock(inode, &old_bh, 1); if (error) { mlog_errno(error); + ocfs2_rw_unlock(inode, 1); goto out; } @@ -4317,6 +4272,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, up_write(&OCFS2_I(inode)->ip_xattr_sem); ocfs2_inode_unlock(inode, 1); + ocfs2_rw_unlock(inode, 1); brelse(old_bh); if (error) { @@ -4324,6 +4280,14 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, goto out; } + had_lock = ocfs2_inode_lock_tracker(new_orphan_inode, NULL, 1, + &oh); + if (had_lock < 0) { + error = had_lock; + mlog_errno(error); + goto out; + } + /* If the security isn't preserved, we need to re-initialize them. */ if (!preserve) { error = ocfs2_init_security_and_acl(dir, new_orphan_inode, @@ -4331,14 +4295,15 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, if (error) mlog_errno(error); } -out: if (!error) { error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, new_dentry); if (error) mlog_errno(error); } + ocfs2_inode_unlock_tracker(new_orphan_inode, 1, &oh, had_lock); +out: if (new_orphan_inode) { /* * We need to open_unlock the inode no matter whether we @@ -4361,11 +4326,11 @@ out: /* copied from may_create in VFS. */ static inline int ocfs2_may_create(struct inode *dir, struct dentry *child) { - if (child->d_inode) + if (d_really_is_positive(child)) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; - return inode_permission(dir, MAY_WRITE | MAY_EXEC); + return inode_permission(&nop_mnt_idmap, dir, MAY_WRITE | MAY_EXEC); } /** @@ -4379,7 +4344,7 @@ static inline int ocfs2_may_create(struct inode *dir, struct dentry *child) static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, bool preserve) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); int error; if (!inode) @@ -4419,15 +4384,16 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, * file. */ if (!preserve) { - error = inode_permission(inode, MAY_READ); + error = inode_permission(&nop_mnt_idmap, inode, MAY_READ); if (error) return error; } - mutex_lock(&inode->i_mutex); - dquot_initialize(dir); - error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve); - mutex_unlock(&inode->i_mutex); + inode_lock(inode); + error = dquot_initialize(dir); + if (!error) + error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve); + inode_unlock(inode); if (!error) fsnotify_create(dir, new_dentry); return error; @@ -4453,7 +4419,7 @@ int ocfs2_reflink_ioctl(struct inode *inode, return error; } - new_dentry = user_path_create(AT_FDCWD, newname, &new_path, 0); + new_dentry = start_creating_user_path(AT_FDCWD, newname, &new_path, 0); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) { mlog_errno(error); @@ -4467,12 +4433,375 @@ int ocfs2_reflink_ioctl(struct inode *inode, } error = ocfs2_vfs_reflink(old_path.dentry, - new_path.dentry->d_inode, + d_inode(new_path.dentry), new_dentry, preserve); out_dput: - done_path_create(&new_path, new_dentry); + end_creating_path(&new_path, new_dentry); out: path_put(&old_path); return error; } + +/* Update destination inode size, if necessary. */ +int ocfs2_reflink_update_dest(struct inode *dest, + struct buffer_head *d_bh, + loff_t newlen) +{ + handle_t *handle; + int ret; + + dest->i_blocks = ocfs2_inode_sector_count(dest); + + if (newlen <= i_size_read(dest)) + return 0; + + handle = ocfs2_start_trans(OCFS2_SB(dest->i_sb), + OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + return ret; + } + + /* Extend i_size if needed. */ + spin_lock(&OCFS2_I(dest)->ip_lock); + if (newlen > i_size_read(dest)) + i_size_write(dest, newlen); + spin_unlock(&OCFS2_I(dest)->ip_lock); + inode_set_mtime_to_ts(dest, inode_set_ctime_current(dest)); + + ret = ocfs2_mark_inode_dirty(handle, dest, d_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + +out_commit: + ocfs2_commit_trans(OCFS2_SB(dest->i_sb), handle); + return ret; +} + +/* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */ +static loff_t ocfs2_reflink_remap_extent(struct inode *s_inode, + struct buffer_head *s_bh, + loff_t pos_in, + struct inode *t_inode, + struct buffer_head *t_bh, + loff_t pos_out, + loff_t len, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + struct ocfs2_extent_tree s_et; + struct ocfs2_extent_tree t_et; + struct ocfs2_dinode *dis; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_tree *ref_tree; + struct ocfs2_super *osb; + loff_t remapped_bytes = 0; + loff_t pstart, plen; + u32 p_cluster, num_clusters, slast, spos, tpos, remapped_clus = 0; + unsigned int ext_flags; + int ret = 0; + + osb = OCFS2_SB(s_inode->i_sb); + dis = (struct ocfs2_dinode *)s_bh->b_data; + ocfs2_init_dinode_extent_tree(&s_et, INODE_CACHE(s_inode), s_bh); + ocfs2_init_dinode_extent_tree(&t_et, INODE_CACHE(t_inode), t_bh); + + spos = ocfs2_bytes_to_clusters(s_inode->i_sb, pos_in); + tpos = ocfs2_bytes_to_clusters(t_inode->i_sb, pos_out); + slast = ocfs2_clusters_for_bytes(s_inode->i_sb, pos_in + len); + + while (spos < slast) { + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out; + } + + /* Look up the extent. */ + ret = ocfs2_get_clusters(s_inode, spos, &p_cluster, + &num_clusters, &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + num_clusters = min_t(u32, num_clusters, slast - spos); + + /* Punch out the dest range. */ + pstart = ocfs2_clusters_to_bytes(t_inode->i_sb, tpos); + plen = ocfs2_clusters_to_bytes(t_inode->i_sb, num_clusters); + ret = ocfs2_remove_inode_range(t_inode, t_bh, pstart, plen); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (p_cluster == 0) + goto next_loop; + + /* Lock the refcount btree... */ + ret = ocfs2_lock_refcount_tree(osb, + le64_to_cpu(dis->i_refcount_loc), + 1, &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* Mark s_inode's extent as refcounted. */ + if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) { + ret = ocfs2_add_refcount_flag(s_inode, &s_et, + &ref_tree->rf_ci, + ref_root_bh, spos, + p_cluster, num_clusters, + dealloc, NULL); + if (ret) { + mlog_errno(ret); + goto out_unlock_refcount; + } + } + + /* Map in the new extent. */ + ext_flags |= OCFS2_EXT_REFCOUNTED; + ret = ocfs2_add_refcounted_extent(t_inode, &t_et, + &ref_tree->rf_ci, + ref_root_bh, + tpos, p_cluster, + num_clusters, + ext_flags, + dealloc); + if (ret) { + mlog_errno(ret); + goto out_unlock_refcount; + } + + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_root_bh); +next_loop: + spos += num_clusters; + tpos += num_clusters; + remapped_clus += num_clusters; + } + + goto out; +out_unlock_refcount: + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_root_bh); +out: + remapped_bytes = ocfs2_clusters_to_bytes(t_inode->i_sb, remapped_clus); + remapped_bytes = min_t(loff_t, len, remapped_bytes); + + return remapped_bytes > 0 ? remapped_bytes : ret; +} + +/* Set up refcount tree and remap s_inode to t_inode. */ +loff_t ocfs2_reflink_remap_blocks(struct inode *s_inode, + struct buffer_head *s_bh, + loff_t pos_in, + struct inode *t_inode, + struct buffer_head *t_bh, + loff_t pos_out, + loff_t len) +{ + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_super *osb; + struct ocfs2_dinode *dis; + struct ocfs2_dinode *dit; + loff_t ret; + + osb = OCFS2_SB(s_inode->i_sb); + dis = (struct ocfs2_dinode *)s_bh->b_data; + dit = (struct ocfs2_dinode *)t_bh->b_data; + ocfs2_init_dealloc_ctxt(&dealloc); + + /* + * If we're reflinking the entire file and the source is inline + * data, just copy the contents. + */ + if (pos_in == pos_out && pos_in == 0 && len == i_size_read(s_inode) && + i_size_read(t_inode) <= len && + (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) { + ret = ocfs2_duplicate_inline_data(s_inode, s_bh, t_inode, t_bh); + if (ret) + mlog_errno(ret); + goto out; + } + + /* + * If both inodes belong to two different refcount groups then + * forget it because we don't know how (or want) to go merging + * refcount trees. + */ + ret = -EOPNOTSUPP; + if (ocfs2_is_refcount_inode(s_inode) && + ocfs2_is_refcount_inode(t_inode) && + le64_to_cpu(dis->i_refcount_loc) != + le64_to_cpu(dit->i_refcount_loc)) + goto out; + + /* Neither inode has a refcount tree. Add one to s_inode. */ + if (!ocfs2_is_refcount_inode(s_inode) && + !ocfs2_is_refcount_inode(t_inode)) { + ret = ocfs2_create_refcount_tree(s_inode, s_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* Ensure that both inodes end up with the same refcount tree. */ + if (!ocfs2_is_refcount_inode(s_inode)) { + ret = ocfs2_set_refcount_tree(s_inode, s_bh, + le64_to_cpu(dit->i_refcount_loc)); + if (ret) { + mlog_errno(ret); + goto out; + } + } + if (!ocfs2_is_refcount_inode(t_inode)) { + ret = ocfs2_set_refcount_tree(t_inode, t_bh, + le64_to_cpu(dis->i_refcount_loc)); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* Turn off inline data in the dest file. */ + if (OCFS2_I(t_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ret = ocfs2_convert_inline_data_to_extents(t_inode, t_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* Actually remap extents now. */ + ret = ocfs2_reflink_remap_extent(s_inode, s_bh, pos_in, t_inode, t_bh, + pos_out, len, &dealloc); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + +out: + if (ocfs2_dealloc_has_cluster(&dealloc)) { + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &dealloc); + } + + return ret; +} + +/* Lock an inode and grab a bh pointing to the inode. */ +int ocfs2_reflink_inodes_lock(struct inode *s_inode, + struct buffer_head **bh_s, + struct inode *t_inode, + struct buffer_head **bh_t) +{ + struct inode *inode1 = s_inode; + struct inode *inode2 = t_inode; + struct ocfs2_inode_info *oi1; + struct ocfs2_inode_info *oi2; + struct buffer_head *bh1 = NULL; + struct buffer_head *bh2 = NULL; + bool same_inode = (s_inode == t_inode); + bool need_swap = (inode1->i_ino > inode2->i_ino); + int status; + + /* First grab the VFS and rw locks. */ + lock_two_nondirectories(s_inode, t_inode); + if (need_swap) + swap(inode1, inode2); + + status = ocfs2_rw_lock(inode1, 1); + if (status) { + mlog_errno(status); + goto out_i1; + } + if (!same_inode) { + status = ocfs2_rw_lock(inode2, 1); + if (status) { + mlog_errno(status); + goto out_i2; + } + } + + /* Now go for the cluster locks */ + oi1 = OCFS2_I(inode1); + oi2 = OCFS2_I(inode2); + + trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno, + (unsigned long long)oi2->ip_blkno); + + /* We always want to lock the one with the lower lockid first. */ + if (oi1->ip_blkno > oi2->ip_blkno) + mlog_errno(-ENOLCK); + + /* lock id1 */ + status = ocfs2_inode_lock_nested(inode1, &bh1, 1, + OI_LS_REFLINK_TARGET); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto out_rw2; + } + + /* lock id2 */ + if (!same_inode) { + status = ocfs2_inode_lock_nested(inode2, &bh2, 1, + OI_LS_REFLINK_TARGET); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto out_cl1; + } + } else { + bh2 = bh1; + } + + /* + * If we swapped inode order above, we have to swap the buffer heads + * before passing them back to the caller. + */ + if (need_swap) + swap(bh1, bh2); + *bh_s = bh1; + *bh_t = bh2; + + trace_ocfs2_double_lock_end( + (unsigned long long)oi1->ip_blkno, + (unsigned long long)oi2->ip_blkno); + + return 0; + +out_cl1: + ocfs2_inode_unlock(inode1, 1); + brelse(bh1); +out_rw2: + ocfs2_rw_unlock(inode2, 1); +out_i2: + ocfs2_rw_unlock(inode1, 1); +out_i1: + unlock_two_nondirectories(s_inode, t_inode); + return status; +} + +/* Unlock both inodes and release buffers. */ +void ocfs2_reflink_inodes_unlock(struct inode *s_inode, + struct buffer_head *s_bh, + struct inode *t_inode, + struct buffer_head *t_bh) +{ + ocfs2_inode_unlock(s_inode, 1); + ocfs2_rw_unlock(s_inode, 1); + brelse(s_bh); + if (s_inode != t_inode) { + ocfs2_inode_unlock(t_inode, 1); + ocfs2_rw_unlock(t_inode, 1); + brelse(t_bh); + } + unlock_two_nondirectories(s_inode, t_inode); +} diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index 7754608c83a4..8197a94feec0 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -1,18 +1,8 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-only */ +/* * refcounttree.h * * Copyright (C) 2009 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #ifndef OCFS2_REFCOUNTTREE_H #define OCFS2_REFCOUNTTREE_H @@ -53,7 +43,7 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, int *credits, int *ref_blocks); int ocfs2_refcount_cow(struct inode *inode, - struct file *filep, struct buffer_head *di_bh, + struct buffer_head *di_bh, u32 cpos, u32 write_len, u32 max_cpos); typedef int (ocfs2_post_refcount_func)(struct inode *inode, @@ -85,11 +75,11 @@ int ocfs2_refcount_cow_xattr(struct inode *inode, u32 cpos, u32 write_len, struct ocfs2_post_refcount *post); int ocfs2_duplicate_clusters_by_page(handle_t *handle, - struct file *file, + struct inode *inode, u32 cpos, u32 old_cluster, u32 new_cluster, u32 new_len); int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, - struct file *file, + struct inode *inode, u32 cpos, u32 old_cluster, u32 new_cluster, u32 new_len); int ocfs2_cow_sync_writeback(struct super_block *sb, @@ -115,4 +105,23 @@ int ocfs2_reflink_ioctl(struct inode *inode, const char __user *oldname, const char __user *newname, bool preserve); +loff_t ocfs2_reflink_remap_blocks(struct inode *s_inode, + struct buffer_head *s_bh, + loff_t pos_in, + struct inode *t_inode, + struct buffer_head *t_bh, + loff_t pos_out, + loff_t len); +int ocfs2_reflink_inodes_lock(struct inode *s_inode, + struct buffer_head **bh1, + struct inode *t_inode, + struct buffer_head **bh2); +void ocfs2_reflink_inodes_unlock(struct inode *s_inode, + struct buffer_head *s_bh, + struct inode *t_inode, + struct buffer_head *t_bh); +int ocfs2_reflink_update_dest(struct inode *dest, + struct buffer_head *d_bh, + loff_t newlen); + #endif /* OCFS2_REFCOUNTTREE_H */ diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c index 41ffd36c689c..1fe61974d9f0 100644 --- a/fs/ocfs2/reservations.c +++ b/fs/ocfs2/reservations.c @@ -1,6 +1,5 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-only +/* * reservations.c * * Allocation reservations implementation @@ -13,15 +12,6 @@ * Universite Pierre et Marie Curie (Paris VI) * * The rest is copyright (C) 2010 Novell. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include <linux/fs.h> @@ -39,10 +29,7 @@ #define OCFS2_CHECK_RESERVATIONS #endif -DEFINE_SPINLOCK(resv_lock); - -#define OCFS2_MIN_RESV_WINDOW_BITS 8 -#define OCFS2_MAX_RESV_WINDOW_BITS 1024 +static DEFINE_SPINLOCK(resv_lock); int ocfs2_dir_resv_allowed(struct ocfs2_super *osb) { @@ -211,7 +198,7 @@ void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv, resv->r_flags |= flags; } -int ocfs2_resmap_init(struct ocfs2_super *osb, +void ocfs2_resmap_init(struct ocfs2_super *osb, struct ocfs2_reservation_map *resmap) { memset(resmap, 0, sizeof(*resmap)); @@ -220,8 +207,6 @@ int ocfs2_resmap_init(struct ocfs2_super *osb, resmap->m_reservations = RB_ROOT; /* m_bitmap_len is initialized to zero by the above memset. */ INIT_LIST_HEAD(&resmap->m_lru); - - return 0; } static void ocfs2_resv_mark_lru(struct ocfs2_reservation_map *resmap, @@ -429,7 +414,7 @@ static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap, start = search_start; while ((offset = ocfs2_find_next_zero_bit(bitmap, resmap->m_bitmap_len, - start)) != -1) { + start)) < resmap->m_bitmap_len) { /* Search reached end of the region */ if (offset >= (search_start + search_len)) break; diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h index 42c2b804f3fd..4fce17180342 100644 --- a/fs/ocfs2/reservations.h +++ b/fs/ocfs2/reservations.h @@ -1,20 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-only */ +/* * reservations.h * * Allocation reservations function prototypes and structures. * * Copyright (C) 2010 Novell. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #ifndef OCFS2_RESERVATIONS_H @@ -41,7 +31,7 @@ struct ocfs2_alloc_reservation { #define OCFS2_RESV_FLAG_INUSE 0x01 /* Set when r_node is part of a btree */ #define OCFS2_RESV_FLAG_TMP 0x02 /* Temporary reservation, will be - * destroyed immedately after use */ + * destroyed immediately after use */ #define OCFS2_RESV_FLAG_DIR 0x04 /* Reservation is for an unindexed * directory btree */ @@ -83,15 +73,10 @@ void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap, /** * ocfs2_resmap_init() - Initialize fields of a reservations bitmap + * @osb: struct ocfs2_super to be saved in resmap * @resmap: struct ocfs2_reservation_map to initialize - * @obj: unused for now - * @ops: unused for now - * @max_bitmap_bytes: Maximum size of the bitmap (typically blocksize) - * - * Only possible return value other than '0' is -ENOMEM for failure to - * allocation mirror bitmap. */ -int ocfs2_resmap_init(struct ocfs2_super *osb, +void ocfs2_resmap_init(struct ocfs2_super *osb, struct ocfs2_reservation_map *resmap); /** @@ -140,7 +125,7 @@ int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap, /** * ocfs2_resmap_claimed_bits() - Tell the reservation code that bits were used. * @resmap: reservations bitmap - * @resv: optional reservation to recalulate based on new bitmap + * @resv: optional reservation to recalculate based on new bitmap * @cstart: start of allocation in clusters * @clen: end of allocation in clusters. * diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index ec55add7604a..b0733c08ed13 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -1,27 +1,11 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * resize.c * * volume resize. * Inspired by ext3/resize.c. * * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/fs.h> @@ -53,14 +37,13 @@ */ static u16 ocfs2_calc_new_backup_super(struct inode *inode, struct ocfs2_group_desc *gd, - int new_clusters, - u32 first_new_cluster, u16 cl_cpg, + u16 old_bg_clusters, int set) { int i; u16 backups = 0; - u32 cluster; + u32 cluster, lgd_cluster; u64 blkno, gd_blkno, lgd_blkno = le64_to_cpu(gd->bg_blkno); for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) { @@ -73,6 +56,12 @@ static u16 ocfs2_calc_new_backup_super(struct inode *inode, else if (gd_blkno > lgd_blkno) break; + /* check if already done backup super */ + lgd_cluster = ocfs2_blocks_to_clusters(inode->i_sb, lgd_blkno); + lgd_cluster += old_bg_clusters; + if (lgd_cluster >= cluster) + continue; + if (set) ocfs2_set_bit(cluster % cl_cpg, (unsigned long *)gd->bg_bitmap); @@ -101,6 +90,9 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, u16 chain, num_bits, backups = 0; u16 cl_bpc = le16_to_cpu(cl->cl_bpc); u16 cl_cpg = le16_to_cpu(cl->cl_cpg); + u16 old_bg_clusters; + u16 contig_bits; + __le16 old_bg_contig_free_bits; trace_ocfs2_update_last_group_and_inode(new_clusters, first_new_cluster); @@ -114,6 +106,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, group = (struct ocfs2_group_desc *)group_bh->b_data; + old_bg_clusters = le16_to_cpu(group->bg_bits) / cl_bpc; /* update the group first. */ num_bits = new_clusters * cl_bpc; le16_add_cpu(&group->bg_bits, num_bits); @@ -127,12 +120,15 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, OCFS2_FEATURE_COMPAT_BACKUP_SB)) { backups = ocfs2_calc_new_backup_super(bm_inode, group, - new_clusters, - first_new_cluster, - cl_cpg, 1); + cl_cpg, old_bg_clusters, 1); le16_add_cpu(&group->bg_free_bits_count, -1 * backups); } + contig_bits = ocfs2_find_max_contig_free_bits(group->bg_bitmap, + le16_to_cpu(group->bg_bits), 0); + old_bg_contig_free_bits = group->bg_contig_free_bits; + group->bg_contig_free_bits = cpu_to_le16(contig_bits); + ocfs2_journal_dirty(handle, group_bh); /* update the inode accordingly. */ @@ -157,7 +153,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, spin_lock(&OCFS2_I(bm_inode)->ip_lock); OCFS2_I(bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); - le64_add_cpu(&fe->i_size, new_clusters << osb->s_clustersize_bits); + le64_add_cpu(&fe->i_size, (u64)new_clusters << osb->s_clustersize_bits); spin_unlock(&OCFS2_I(bm_inode)->ip_lock); i_size_write(bm_inode, le64_to_cpu(fe->i_size)); @@ -167,12 +163,11 @@ out_rollback: if (ret < 0) { ocfs2_calc_new_backup_super(bm_inode, group, - new_clusters, - first_new_cluster, - cl_cpg, 0); + cl_cpg, old_bg_clusters, 0); le16_add_cpu(&group->bg_free_bits_count, backups); le16_add_cpu(&group->bg_bits, -1 * num_bits); le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits); + group->bg_contig_free_bits = old_bg_contig_free_bits; } out: if (ret) @@ -193,7 +188,7 @@ static int update_backups(struct inode * inode, u32 clusters, char *data) for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) { blkno = ocfs2_backup_super_blkno(inode->i_sb, i); cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno); - if (cluster > clusters) + if (cluster >= clusters) break; ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup); @@ -298,7 +293,7 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters) goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (ret < 0) { @@ -372,7 +367,7 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); iput(main_bm_inode); out: @@ -469,6 +464,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) struct ocfs2_chain_list *cl; struct ocfs2_chain_rec *cr; u16 cl_bpc; + u64 bg_ptr; if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) return -EROFS; @@ -482,7 +478,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (ret < 0) { @@ -513,7 +509,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) ret = ocfs2_verify_group_and_input(main_bm_inode, fe, input, group_bh); if (ret) { mlog_errno(ret); - goto out_unlock; + goto out_free_group_bh; } trace_ocfs2_group_add((unsigned long long)input->group, @@ -523,7 +519,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) if (IS_ERR(handle)) { mlog_errno(PTR_ERR(handle)); ret = -EINVAL; - goto out_unlock; + goto out_free_group_bh; } cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc); @@ -538,12 +534,14 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) } group = (struct ocfs2_group_desc *)group_bh->b_data; + bg_ptr = le64_to_cpu(group->bg_next_group); group->bg_next_group = cr->c_blkno; ocfs2_journal_dirty(handle, group_bh); ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode), main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { + group->bg_next_group = cpu_to_le64(bg_ptr); mlog_errno(ret); goto out_commit; } @@ -566,7 +564,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) spin_lock(&OCFS2_I(main_bm_inode)->ip_lock); OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); - le64_add_cpu(&fe->i_size, input->clusters << osb->s_clustersize_bits); + le64_add_cpu(&fe->i_size, (u64)input->clusters << osb->s_clustersize_bits); spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock); i_size_write(main_bm_inode, le64_to_cpu(fe->i_size)); @@ -574,14 +572,19 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) out_commit: ocfs2_commit_trans(osb, handle); -out_unlock: + +out_free_group_bh: + if (ret < 0) + ocfs2_remove_from_cache(INODE_CACHE(inode), group_bh); brelse(group_bh); + +out_unlock: brelse(main_bm_bh); ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); iput(main_bm_inode); out: diff --git a/fs/ocfs2/resize.h b/fs/ocfs2/resize.h index f38841abf10b..4990637219ef 100644 --- a/fs/ocfs2/resize.h +++ b/fs/ocfs2/resize.h @@ -1,26 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * resize.h * * Function prototypes * * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #ifndef OCFS2_RESIZE_H diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 1424c151cccc..e544c704b583 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -1,26 +1,8 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * slot_map.c * - * - * * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/types.h> @@ -55,7 +37,7 @@ struct ocfs2_slot_info { unsigned int si_blocks; struct buffer_head **si_bh; unsigned int si_num_slots; - struct ocfs2_slot *si_slots; + struct ocfs2_slot si_slots[] __counted_by(si_num_slots); }; @@ -306,7 +288,7 @@ int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num, assert_spin_locked(&osb->osb_lock); BUG_ON(slot_num < 0); - BUG_ON(slot_num > osb->max_slots); + BUG_ON(slot_num >= osb->max_slots); if (!si->si_slots[slot_num].sl_valid) return -ENOENT; @@ -322,8 +304,7 @@ static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si) if (si == NULL) return; - if (si->si_inode) - iput(si->si_inode); + iput(si->si_inode); if (si->si_bh) { for (i = 0; i < si->si_blocks; i++) { if (si->si_bh[i]) { @@ -382,7 +363,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb, trace_ocfs2_map_slot_buffers(bytes, si->si_blocks); - si->si_bh = kzalloc(sizeof(struct buffer_head *) * si->si_blocks, + si->si_bh = kcalloc(si->si_blocks, sizeof(struct buffer_head *), GFP_KERNEL); if (!si->si_bh) { status = -ENOMEM; @@ -421,19 +402,15 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) struct inode *inode = NULL; struct ocfs2_slot_info *si; - si = kzalloc(sizeof(struct ocfs2_slot_info) + - (sizeof(struct ocfs2_slot) * osb->max_slots), - GFP_KERNEL); + si = kzalloc(struct_size(si, si_slots, osb->max_slots), GFP_KERNEL); if (!si) { status = -ENOMEM; mlog_errno(status); - goto bail; + return status; } si->si_extended = ocfs2_uses_extended_slot_map(osb); si->si_num_slots = osb->max_slots; - si->si_slots = (struct ocfs2_slot *)((char *)si + - sizeof(struct ocfs2_slot_info)); inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE, OCFS2_INVALID_SLOT); @@ -452,7 +429,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) osb->slot_info = (struct ocfs2_slot_info *)si; bail: - if (status < 0 && si) + if (status < 0) __ocfs2_free_slot_info(si); return status; @@ -503,8 +480,17 @@ int ocfs2_find_slot(struct ocfs2_super *osb) trace_ocfs2_find_slot(osb->slot_num); status = ocfs2_update_disk_slot(osb, si, osb->slot_num); - if (status < 0) + if (status < 0) { mlog_errno(status); + /* + * if write block failed, invalidate slot to avoid overwrite + * slot during dismount in case another node rightly has mounted + */ + spin_lock(&osb->osb_lock); + ocfs2_invalidate_slot(si, osb->slot_num); + osb->slot_num = OCFS2_INVALID_SLOT; + spin_unlock(&osb->osb_lock); + } bail: return status; @@ -527,12 +513,8 @@ void ocfs2_put_slot(struct ocfs2_super *osb) spin_unlock(&osb->osb_lock); status = ocfs2_update_disk_slot(osb, si, slot_num); - if (status < 0) { + if (status < 0) mlog_errno(status); - goto bail; - } -bail: ocfs2_free_slot_info(osb); } - diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h index 601c95fd7003..a43644570b53 100644 --- a/fs/ocfs2/slot_map.h +++ b/fs/ocfs2/slot_map.h @@ -1,26 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * slotmap.h * * description here * * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index bf1f8930456f..f58e891aa2da 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -1,20 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-only +/* * stack_o2cb.c * * Code which interfaces ocfs2 with the o2cb stack. * * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include <linux/kernel.h> @@ -67,31 +57,31 @@ static inline int mode_to_o2dlm(int mode) return mode; } -#define map_flag(_generic, _o2dlm) \ - if (flags & (_generic)) { \ - flags &= ~(_generic); \ - o2dlm_flags |= (_o2dlm); \ - } static int flags_to_o2dlm(u32 flags) { int o2dlm_flags = 0; - map_flag(DLM_LKF_NOQUEUE, LKM_NOQUEUE); - map_flag(DLM_LKF_CANCEL, LKM_CANCEL); - map_flag(DLM_LKF_CONVERT, LKM_CONVERT); - map_flag(DLM_LKF_VALBLK, LKM_VALBLK); - map_flag(DLM_LKF_IVVALBLK, LKM_INVVALBLK); - map_flag(DLM_LKF_ORPHAN, LKM_ORPHAN); - map_flag(DLM_LKF_FORCEUNLOCK, LKM_FORCE); - map_flag(DLM_LKF_TIMEOUT, LKM_TIMEOUT); - map_flag(DLM_LKF_LOCAL, LKM_LOCAL); - - /* map_flag() should have cleared every flag passed in */ - BUG_ON(flags != 0); + if (flags & DLM_LKF_NOQUEUE) + o2dlm_flags |= LKM_NOQUEUE; + if (flags & DLM_LKF_CANCEL) + o2dlm_flags |= LKM_CANCEL; + if (flags & DLM_LKF_CONVERT) + o2dlm_flags |= LKM_CONVERT; + if (flags & DLM_LKF_VALBLK) + o2dlm_flags |= LKM_VALBLK; + if (flags & DLM_LKF_IVVALBLK) + o2dlm_flags |= LKM_INVVALBLK; + if (flags & DLM_LKF_ORPHAN) + o2dlm_flags |= LKM_ORPHAN; + if (flags & DLM_LKF_FORCEUNLOCK) + o2dlm_flags |= LKM_FORCE; + if (flags & DLM_LKF_TIMEOUT) + o2dlm_flags |= LKM_TIMEOUT; + if (flags & DLM_LKF_LOCAL) + o2dlm_flags |= LKM_LOCAL; return o2dlm_flags; } -#undef map_flag /* * Map an o2dlm status to standard errno values. @@ -237,7 +227,7 @@ static int o2cb_dlm_lock_status(struct ocfs2_dlm_lksb *lksb) } /* - * o2dlm aways has a "valid" LVB. If the dlm loses track of the LVB + * o2dlm always has a "valid" LVB. If the dlm loses track of the LVB * contents, it will zero out the LVB. Thus the caller can always trust * the contents. */ @@ -283,19 +273,19 @@ static int o2cb_cluster_check(void) */ #define O2CB_MAP_STABILIZE_COUNT 60 for (i = 0; i < O2CB_MAP_STABILIZE_COUNT; ++i) { - o2hb_fill_node_map(hbmap, sizeof(hbmap)); + o2hb_fill_node_map(hbmap, O2NM_MAX_NODES); if (!test_bit(node_num, hbmap)) { printk(KERN_ERR "o2cb: %s heartbeat has not been " "started.\n", (o2hb_global_heartbeat_active() ? "Global" : "Local")); return -EINVAL; } - o2net_fill_node_map(netmap, sizeof(netmap)); + o2net_fill_node_map(netmap, O2NM_MAX_NODES); /* Force set the current node to allow easy compare */ set_bit(node_num, netmap); - if (!memcmp(hbmap, netmap, sizeof(hbmap))) + if (bitmap_equal(hbmap, netmap, O2NM_MAX_NODES)) return 0; - if (i < O2CB_MAP_STABILIZE_COUNT) + if (i < O2CB_MAP_STABILIZE_COUNT - 1) msleep(1000); } @@ -398,7 +388,8 @@ static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn) return 0; } -static int o2cb_cluster_this_node(unsigned int *node) +static int o2cb_cluster_this_node(struct ocfs2_cluster_connection *conn, + unsigned int *node) { int node_num; @@ -413,7 +404,7 @@ static int o2cb_cluster_this_node(unsigned int *node) return 0; } -static struct ocfs2_stack_operations o2cb_stack_ops = { +static const struct ocfs2_stack_operations o2cb_stack_ops = { .connect = o2cb_cluster_connect, .disconnect = o2cb_cluster_disconnect, .this_node = o2cb_cluster_this_node, diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 286edf1e231f..be0a5758bd40 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -1,29 +1,21 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-only +/* * stack_user.c * * Code which interfaces ocfs2 with fs/dlm and a userspace stack. * * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include <linux/module.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/miscdevice.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/reboot.h> -#include <asm/uaccess.h> +#include <linux/sched.h> +#include <linux/uaccess.h> #include "stackglue.h" @@ -102,6 +94,12 @@ #define OCFS2_TEXT_UUID_LEN 32 #define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2 #define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8 +#define VERSION_LOCK "version_lock" + +enum ocfs2_connection_type { + WITH_CONTROLD, + NO_CONTROLD +}; /* * ocfs2_live_connection is refcounted because the filesystem and @@ -110,6 +108,13 @@ struct ocfs2_live_connection { struct list_head oc_list; struct ocfs2_cluster_connection *oc_conn; + enum ocfs2_connection_type oc_type; + atomic_t oc_this_node; + int oc_our_slot; + struct dlm_lksb oc_version_lksb; + char oc_lvb[DLM_LVB_LEN]; + struct completion oc_sync_wait; + wait_queue_head_t oc_wait; }; struct ocfs2_control_private { @@ -198,20 +203,15 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name) * mount path. Since the VFS prevents multiple calls to * fill_super(), we can't get dupes here. */ -static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn, - struct ocfs2_live_connection **c_ret) +static int ocfs2_live_connection_attach(struct ocfs2_cluster_connection *conn, + struct ocfs2_live_connection *c) { int rc = 0; - struct ocfs2_live_connection *c; - - c = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL); - if (!c) - return -ENOMEM; mutex_lock(&ocfs2_control_lock); c->oc_conn = conn; - if (atomic_read(&ocfs2_control_opened)) + if ((c->oc_type == NO_CONTROLD) || atomic_read(&ocfs2_control_opened)) list_add(&c->oc_list, &ocfs2_live_connection_list); else { printk(KERN_ERR @@ -220,12 +220,6 @@ static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn, } mutex_unlock(&ocfs2_control_lock); - - if (!rc) - *c_ret = c; - else - kfree(c); - return rc; } @@ -366,7 +360,6 @@ static int ocfs2_control_do_setnode_msg(struct file *file, struct ocfs2_control_message_setn *msg) { long nodenum; - char *ptr = NULL; struct ocfs2_control_private *p = file->private_data; if (ocfs2_control_get_handshake_state(file) != @@ -381,8 +374,7 @@ static int ocfs2_control_do_setnode_msg(struct file *file, return -EINVAL; msg->space = msg->newline = '\0'; - nodenum = simple_strtol(msg->nodestr, &ptr, 16); - if (!ptr || *ptr) + if (kstrtol(msg->nodestr, 16, &nodenum)) return -EINVAL; if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) || @@ -395,9 +387,8 @@ static int ocfs2_control_do_setnode_msg(struct file *file, static int ocfs2_control_do_setversion_msg(struct file *file, struct ocfs2_control_message_setv *msg) - { +{ long major, minor; - char *ptr = NULL; struct ocfs2_control_private *p = file->private_data; struct ocfs2_protocol_version *max = &ocfs2_user_plugin.sp_max_proto; @@ -415,11 +406,9 @@ static int ocfs2_control_do_setversion_msg(struct file *file, return -EINVAL; msg->space1 = msg->space2 = msg->newline = '\0'; - major = simple_strtol(msg->major, &ptr, 16); - if (!ptr || *ptr) + if (kstrtol(msg->major, 16, &major)) return -EINVAL; - minor = simple_strtol(msg->minor, &ptr, 16); - if (!ptr || *ptr) + if (kstrtol(msg->minor, 16, &minor)) return -EINVAL; /* @@ -447,7 +436,6 @@ static int ocfs2_control_do_down_msg(struct file *file, struct ocfs2_control_message_down *msg) { long nodenum; - char *p = NULL; if (ocfs2_control_get_handshake_state(file) != OCFS2_CONTROL_HANDSHAKE_VALID) @@ -462,8 +450,7 @@ static int ocfs2_control_do_down_msg(struct file *file, return -EINVAL; msg->space1 = msg->space2 = msg->newline = '\0'; - nodenum = simple_strtol(msg->nodestr, &p, 16); - if (!p || *p) + if (kstrtol(msg->nodestr, 16, &nodenum)) return -EINVAL; if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) || @@ -588,7 +575,7 @@ static int ocfs2_control_release(struct inode *inode, struct file *file) */ ocfs2_control_this_node = -1; running_proto.pv_major = 0; - running_proto.pv_major = 0; + running_proto.pv_minor = 0; } out: @@ -652,14 +639,7 @@ static int ocfs2_control_init(void) static void ocfs2_control_exit(void) { - int rc; - - rc = misc_deregister(&ocfs2_control_device); - if (rc) - printk(KERN_ERR - "ocfs2: Unable to deregister ocfs2_control device " - "(errno %d)\n", - -rc); + misc_deregister(&ocfs2_control_device); } static void fsdlm_lock_ast_wrapper(void *astarg) @@ -697,28 +677,22 @@ static int user_dlm_lock(struct ocfs2_cluster_connection *conn, void *name, unsigned int namelen) { - int ret; - if (!lksb->lksb_fsdlm.sb_lvbptr) lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb + sizeof(struct dlm_lksb); - ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm, - flags|DLM_LKF_NODLCKWT, name, namelen, 0, - fsdlm_lock_ast_wrapper, lksb, - fsdlm_blocking_ast_wrapper); - return ret; + return dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm, + flags|DLM_LKF_NODLCKWT, name, namelen, 0, + fsdlm_lock_ast_wrapper, lksb, + fsdlm_blocking_ast_wrapper); } static int user_dlm_unlock(struct ocfs2_cluster_connection *conn, struct ocfs2_dlm_lksb *lksb, u32 flags) { - int ret; - - ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid, - flags, &lksb->lksb_fsdlm, lksb); - return ret; + return dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid, + flags, &lksb->lksb_fsdlm, lksb); } static int user_dlm_lock_status(struct ocfs2_dlm_lksb *lksb) @@ -757,20 +731,13 @@ static int user_plock(struct ocfs2_cluster_connection *conn, * * Internally, fs/dlm will pass these to a misc device, which * a userspace daemon will read and write to. - * - * For now, cancel requests (which happen internally only), - * are turned into unlocks. Most of this function taken from - * gfs2_lock. */ - if (cmd == F_CANCELLK) { - cmd = F_SETLK; - fl->fl_type = F_UNLCK; - } - - if (IS_GETLK(cmd)) + if (cmd == F_CANCELLK) + return dlm_posix_cancel(conn->cc_lockspace, ino, file, fl); + else if (IS_GETLK(cmd)) return dlm_posix_get(conn->cc_lockspace, ino, file, fl); - else if (fl->fl_type == F_UNLCK) + else if (lock_is_unlock(fl)) return dlm_posix_unlock(conn->cc_lockspace, ino, file, fl); else return dlm_posix_lock(conn->cc_lockspace, ino, file, cmd, fl); @@ -799,18 +766,257 @@ static int fs_protocol_compare(struct ocfs2_protocol_version *existing, return 0; } +static void lvb_to_version(char *lvb, struct ocfs2_protocol_version *ver) +{ + struct ocfs2_protocol_version *pv = + (struct ocfs2_protocol_version *)lvb; + /* + * ocfs2_protocol_version has two u8 variables, so we don't + * need any endian conversion. + */ + ver->pv_major = pv->pv_major; + ver->pv_minor = pv->pv_minor; +} + +static void version_to_lvb(struct ocfs2_protocol_version *ver, char *lvb) +{ + struct ocfs2_protocol_version *pv = + (struct ocfs2_protocol_version *)lvb; + /* + * ocfs2_protocol_version has two u8 variables, so we don't + * need any endian conversion. + */ + pv->pv_major = ver->pv_major; + pv->pv_minor = ver->pv_minor; +} + +static void sync_wait_cb(void *arg) +{ + struct ocfs2_cluster_connection *conn = arg; + struct ocfs2_live_connection *lc = conn->cc_private; + complete(&lc->oc_sync_wait); +} + +static int sync_unlock(struct ocfs2_cluster_connection *conn, + struct dlm_lksb *lksb, char *name) +{ + int error; + struct ocfs2_live_connection *lc = conn->cc_private; + + error = dlm_unlock(conn->cc_lockspace, lksb->sb_lkid, 0, lksb, conn); + if (error) { + printk(KERN_ERR "%s lkid %x error %d\n", + name, lksb->sb_lkid, error); + return error; + } + + wait_for_completion(&lc->oc_sync_wait); + + if (lksb->sb_status != -DLM_EUNLOCK) { + printk(KERN_ERR "%s lkid %x status %d\n", + name, lksb->sb_lkid, lksb->sb_status); + return -1; + } + return 0; +} + +static int sync_lock(struct ocfs2_cluster_connection *conn, + int mode, uint32_t flags, + struct dlm_lksb *lksb, char *name) +{ + int error, status; + struct ocfs2_live_connection *lc = conn->cc_private; + + error = dlm_lock(conn->cc_lockspace, mode, lksb, flags, + name, strlen(name), + 0, sync_wait_cb, conn, NULL); + if (error) { + printk(KERN_ERR "%s lkid %x flags %x mode %d error %d\n", + name, lksb->sb_lkid, flags, mode, error); + return error; + } + + wait_for_completion(&lc->oc_sync_wait); + + status = lksb->sb_status; + + if (status && status != -EAGAIN) { + printk(KERN_ERR "%s lkid %x flags %x mode %d status %d\n", + name, lksb->sb_lkid, flags, mode, status); + } + + return status; +} + + +static int version_lock(struct ocfs2_cluster_connection *conn, int mode, + int flags) +{ + struct ocfs2_live_connection *lc = conn->cc_private; + return sync_lock(conn, mode, flags, + &lc->oc_version_lksb, VERSION_LOCK); +} + +static int version_unlock(struct ocfs2_cluster_connection *conn) +{ + struct ocfs2_live_connection *lc = conn->cc_private; + return sync_unlock(conn, &lc->oc_version_lksb, VERSION_LOCK); +} + +/* get_protocol_version() + * + * To exchange ocfs2 versioning, we use the LVB of the version dlm lock. + * The algorithm is: + * 1. Attempt to take the lock in EX mode (non-blocking). + * 2. If successful (which means it is the first mount), write the + * version number and downconvert to PR lock. + * 3. If unsuccessful (returns -EAGAIN), read the version from the LVB after + * taking the PR lock. + */ + +static int get_protocol_version(struct ocfs2_cluster_connection *conn) +{ + int ret; + struct ocfs2_live_connection *lc = conn->cc_private; + struct ocfs2_protocol_version pv; + + running_proto.pv_major = + ocfs2_user_plugin.sp_max_proto.pv_major; + running_proto.pv_minor = + ocfs2_user_plugin.sp_max_proto.pv_minor; + + lc->oc_version_lksb.sb_lvbptr = lc->oc_lvb; + ret = version_lock(conn, DLM_LOCK_EX, + DLM_LKF_VALBLK|DLM_LKF_NOQUEUE); + if (!ret) { + conn->cc_version.pv_major = running_proto.pv_major; + conn->cc_version.pv_minor = running_proto.pv_minor; + version_to_lvb(&running_proto, lc->oc_lvb); + version_lock(conn, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_VALBLK); + } else if (ret == -EAGAIN) { + ret = version_lock(conn, DLM_LOCK_PR, DLM_LKF_VALBLK); + if (ret) + goto out; + lvb_to_version(lc->oc_lvb, &pv); + + if ((pv.pv_major != running_proto.pv_major) || + (pv.pv_minor > running_proto.pv_minor)) { + ret = -EINVAL; + goto out; + } + + conn->cc_version.pv_major = pv.pv_major; + conn->cc_version.pv_minor = pv.pv_minor; + } +out: + return ret; +} + +static void user_recover_prep(void *arg) +{ +} + +static void user_recover_slot(void *arg, struct dlm_slot *slot) +{ + struct ocfs2_cluster_connection *conn = arg; + printk(KERN_INFO "ocfs2: Node %d/%d down. Initiating recovery.\n", + slot->nodeid, slot->slot); + conn->cc_recovery_handler(slot->nodeid, conn->cc_recovery_data); + +} + +static void user_recover_done(void *arg, struct dlm_slot *slots, + int num_slots, int our_slot, + uint32_t generation) +{ + struct ocfs2_cluster_connection *conn = arg; + struct ocfs2_live_connection *lc = conn->cc_private; + int i; + + for (i = 0; i < num_slots; i++) + if (slots[i].slot == our_slot) { + atomic_set(&lc->oc_this_node, slots[i].nodeid); + break; + } + + lc->oc_our_slot = our_slot; + wake_up(&lc->oc_wait); +} + +static const struct dlm_lockspace_ops ocfs2_ls_ops = { + .recover_prep = user_recover_prep, + .recover_slot = user_recover_slot, + .recover_done = user_recover_done, +}; + +static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn) +{ + version_unlock(conn); + dlm_release_lockspace(conn->cc_lockspace, DLM_RELEASE_NORMAL); + conn->cc_lockspace = NULL; + ocfs2_live_connection_drop(conn->cc_private); + conn->cc_private = NULL; + return 0; +} + static int user_cluster_connect(struct ocfs2_cluster_connection *conn) { dlm_lockspace_t *fsdlm; - struct ocfs2_live_connection *uninitialized_var(control); - int rc = 0; + struct ocfs2_live_connection *lc; + int rc, ops_rv; BUG_ON(conn == NULL); - rc = ocfs2_live_connection_new(conn, &control); + lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL); + if (!lc) + return -ENOMEM; + + init_waitqueue_head(&lc->oc_wait); + init_completion(&lc->oc_sync_wait); + atomic_set(&lc->oc_this_node, 0); + conn->cc_private = lc; + lc->oc_type = NO_CONTROLD; + + rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name, + DLM_LSFL_NEWEXCL, DLM_LVB_LEN, + &ocfs2_ls_ops, conn, &ops_rv, &fsdlm); + if (rc) { + if (rc == -EEXIST || rc == -EPROTO) + printk(KERN_ERR "ocfs2: Unable to create the " + "lockspace %s (%d), because a ocfs2-tools " + "program is running on this file system " + "with the same name lockspace\n", + conn->cc_name, rc); + goto out; + } + + if (ops_rv == -EOPNOTSUPP) { + lc->oc_type = WITH_CONTROLD; + printk(KERN_NOTICE "ocfs2: You seem to be using an older " + "version of dlm_controld and/or ocfs2-tools." + " Please consider upgrading.\n"); + } else if (ops_rv) { + rc = ops_rv; + goto out; + } + conn->cc_lockspace = fsdlm; + + rc = ocfs2_live_connection_attach(conn, lc); if (rc) goto out; + if (lc->oc_type == NO_CONTROLD) { + rc = get_protocol_version(conn); + if (rc) { + printk(KERN_ERR "ocfs2: Could not determine" + " locking version\n"); + user_cluster_disconnect(conn); + lc = NULL; + goto out; + } + wait_event(lc->oc_wait, (atomic_read(&lc->oc_this_node) > 0)); + } + /* * running_proto must have been set before we allowed any mounts * to proceed. @@ -818,42 +1024,34 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) if (fs_protocol_compare(&running_proto, &conn->cc_version)) { printk(KERN_ERR "Unable to mount with fs locking protocol version " - "%u.%u because the userspace control daemon has " - "negotiated %u.%u\n", + "%u.%u because negotiated protocol is %u.%u\n", conn->cc_version.pv_major, conn->cc_version.pv_minor, running_proto.pv_major, running_proto.pv_minor); rc = -EPROTO; - ocfs2_live_connection_drop(control); - goto out; + ocfs2_live_connection_drop(lc); + lc = NULL; } - rc = dlm_new_lockspace(conn->cc_name, NULL, DLM_LSFL_FS, DLM_LVB_LEN, - NULL, NULL, NULL, &fsdlm); - if (rc) { - ocfs2_live_connection_drop(control); - goto out; - } - - conn->cc_private = control; - conn->cc_lockspace = fsdlm; out: + if (rc) + kfree(lc); return rc; } -static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn) -{ - dlm_release_lockspace(conn->cc_lockspace, 2); - conn->cc_lockspace = NULL; - ocfs2_live_connection_drop(conn->cc_private); - conn->cc_private = NULL; - return 0; -} -static int user_cluster_this_node(unsigned int *this_node) +static int user_cluster_this_node(struct ocfs2_cluster_connection *conn, + unsigned int *this_node) { int rc; + struct ocfs2_live_connection *lc = conn->cc_private; + + if (lc->oc_type == WITH_CONTROLD) + rc = ocfs2_control_get_this_node(); + else if (lc->oc_type == NO_CONTROLD) + rc = atomic_read(&lc->oc_this_node); + else + rc = -EINVAL; - rc = ocfs2_control_get_this_node(); if (rc < 0) return rc; @@ -861,7 +1059,7 @@ static int user_cluster_this_node(unsigned int *this_node) return 0; } -static struct ocfs2_stack_operations ocfs2_user_plugin_ops = { +static const struct ocfs2_stack_operations ocfs2_user_plugin_ops = { .connect = user_cluster_connect, .disconnect = user_cluster_disconnect, .this_node = user_cluster_this_node, diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 39abf89697ed..a28c127b9934 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -1,21 +1,11 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-only +/* * stackglue.c * * Code which implements an OCFS2 specific interface to underlying * cluster stacks. * * Copyright (C) 2007, 2009 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include <linux/list.h> @@ -309,6 +299,8 @@ int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino, EXPORT_SYMBOL_GPL(ocfs2_plock); int ocfs2_cluster_connect(const char *stack_name, + const char *cluster_name, + int cluster_name_len, const char *group, int grouplen, struct ocfs2_locking_protocol *lproto, @@ -342,8 +334,12 @@ int ocfs2_cluster_connect(const char *stack_name, goto out; } - memcpy(new_conn->cc_name, group, grouplen); + strscpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1); new_conn->cc_namelen = grouplen; + if (cluster_name_len) + strscpy(new_conn->cc_cluster_name, cluster_name, + CLUSTER_NAME_MAX + 1); + new_conn->cc_cluster_name_len = cluster_name_len; new_conn->cc_recovery_handler = recovery_handler; new_conn->cc_recovery_data = recovery_data; @@ -386,8 +382,9 @@ int ocfs2_cluster_connect_agnostic(const char *group, if (cluster_stack_name[0]) stack_name = cluster_stack_name; - return ocfs2_cluster_connect(stack_name, group, grouplen, lproto, - recovery_handler, recovery_data, conn); + return ocfs2_cluster_connect(stack_name, NULL, 0, group, grouplen, + lproto, recovery_handler, recovery_data, + conn); } EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic); @@ -460,9 +457,10 @@ void ocfs2_cluster_hangup(const char *group, int grouplen) } EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup); -int ocfs2_cluster_this_node(unsigned int *node) +int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn, + unsigned int *node) { - return active_stack->sp_ops->this_node(node); + return active_stack->sp_ops->this_node(conn, node); } EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node); @@ -488,7 +486,7 @@ static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj, } static struct kobj_attribute ocfs2_attr_max_locking_protocol = - __ATTR(max_locking_protocol, S_IFREG | S_IRUGO, + __ATTR(max_locking_protocol, S_IRUGO, ocfs2_max_locking_protocol_show, NULL); static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj, @@ -502,11 +500,7 @@ static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj, list_for_each_entry(p, &ocfs2_stack_list, sp_list) { ret = snprintf(buf, remain, "%s\n", p->sp_name); - if (ret < 0) { - total = ret; - break; - } - if (ret == remain) { + if (ret >= remain) { /* snprintf() didn't fit */ total = -E2BIG; break; @@ -520,7 +514,7 @@ static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj, } static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins = - __ATTR(loaded_cluster_plugins, S_IFREG | S_IRUGO, + __ATTR(loaded_cluster_plugins, S_IRUGO, ocfs2_loaded_cluster_plugins_show, NULL); static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj, @@ -533,7 +527,7 @@ static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj, if (active_stack) { ret = snprintf(buf, PAGE_SIZE, "%s\n", active_stack->sp_name); - if (ret == PAGE_SIZE) + if (ret >= PAGE_SIZE) ret = -E2BIG; } spin_unlock(&ocfs2_stack_lock); @@ -542,7 +536,7 @@ static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj, } static struct kobj_attribute ocfs2_attr_active_cluster_plugin = - __ATTR(active_cluster_plugin, S_IFREG | S_IRUGO, + __ATTR(active_cluster_plugin, S_IRUGO, ocfs2_active_cluster_plugin_show, NULL); static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj, @@ -591,23 +585,38 @@ static ssize_t ocfs2_cluster_stack_store(struct kobject *kobj, static struct kobj_attribute ocfs2_attr_cluster_stack = - __ATTR(cluster_stack, S_IFREG | S_IRUGO | S_IWUSR, + __ATTR(cluster_stack, S_IRUGO | S_IWUSR, ocfs2_cluster_stack_show, ocfs2_cluster_stack_store); + + +static ssize_t ocfs2_dlm_recover_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "1\n"); +} + +static struct kobj_attribute ocfs2_attr_dlm_recover_support = + __ATTR(dlm_recover_callback_support, S_IRUGO, + ocfs2_dlm_recover_show, NULL); + static struct attribute *ocfs2_attrs[] = { &ocfs2_attr_max_locking_protocol.attr, &ocfs2_attr_loaded_cluster_plugins.attr, &ocfs2_attr_active_cluster_plugin.attr, &ocfs2_attr_cluster_stack.attr, + &ocfs2_attr_dlm_recover_support.attr, NULL, }; -static struct attribute_group ocfs2_attr_group = { +static const struct attribute_group ocfs2_attr_group = { .attrs = ocfs2_attrs, }; -static struct kset *ocfs2_kset; +struct kset *ocfs2_kset; +EXPORT_SYMBOL_GPL(ocfs2_kset); static void ocfs2_sysfs_exit(void) { @@ -641,9 +650,7 @@ error: * and easier to preserve the name. */ -#define FS_OCFS2_NM 1 - -static ctl_table ocfs2_nm_table[] = { +static const struct ctl_table ocfs2_nm_table[] = { { .procname = "hb_ctl_path", .data = ocfs2_hb_ctl_path, @@ -651,44 +658,9 @@ static ctl_table ocfs2_nm_table[] = { .mode = 0644, .proc_handler = proc_dostring, }, - { } }; -static ctl_table ocfs2_mod_table[] = { - { - .procname = "nm", - .data = NULL, - .maxlen = 0, - .mode = 0555, - .child = ocfs2_nm_table - }, - { } -}; - -static ctl_table ocfs2_kern_table[] = { - { - .procname = "ocfs2", - .data = NULL, - .maxlen = 0, - .mode = 0555, - .child = ocfs2_mod_table - }, - { } -}; - -static ctl_table ocfs2_root_table[] = { - { - .procname = "fs", - .data = NULL, - .maxlen = 0, - .mode = 0555, - .child = ocfs2_kern_table - }, - { } -}; - -static struct ctl_table_header *ocfs2_table_header = NULL; - +static struct ctl_table_header *ocfs2_table_header; /* * Initialization @@ -696,31 +668,34 @@ static struct ctl_table_header *ocfs2_table_header = NULL; static int __init ocfs2_stack_glue_init(void) { + int ret; + strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB); - ocfs2_table_header = register_sysctl_table(ocfs2_root_table); + ocfs2_table_header = register_sysctl("fs/ocfs2/nm", ocfs2_nm_table); if (!ocfs2_table_header) { printk(KERN_ERR "ocfs2 stack glue: unable to register sysctl\n"); return -ENOMEM; /* or something. */ } - return ocfs2_sysfs_init(); + ret = ocfs2_sysfs_init(); + if (ret) + unregister_sysctl_table(ocfs2_table_header); + + return ret; } static void __exit ocfs2_stack_glue_exit(void) { memset(&locking_max_version, 0, sizeof(struct ocfs2_protocol_version)); - locking_max_version.pv_major = 0; - locking_max_version.pv_minor = 0; ocfs2_sysfs_exit(); - if (ocfs2_table_header) - unregister_sysctl_table(ocfs2_table_header); + unregister_sysctl_table(ocfs2_table_header); } MODULE_AUTHOR("Oracle"); -MODULE_DESCRIPTION("ocfs2 cluter stack glue layer"); +MODULE_DESCRIPTION("ocfs2 cluster stack glue layer"); MODULE_LICENSE("GPL"); module_init(ocfs2_stack_glue_init); module_exit(ocfs2_stack_glue_exit); diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index 1ec56fdb8d0d..5486a6dce70a 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -1,20 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-only */ +/* * stackglue.h * * Glue to the underlying cluster stack. * * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ @@ -45,6 +35,9 @@ struct file_lock; */ #define GROUP_NAME_MAX 64 +/* This shadows OCFS2_CLUSTER_NAME_LEN */ +#define CLUSTER_NAME_MAX 16 + /* * ocfs2_protocol_version changes when ocfs2 does something different in @@ -97,8 +90,10 @@ struct ocfs2_locking_protocol { * locking compatibility. */ struct ocfs2_cluster_connection { - char cc_name[GROUP_NAME_MAX]; + char cc_name[GROUP_NAME_MAX + 1]; int cc_namelen; + char cc_cluster_name[CLUSTER_NAME_MAX + 1]; + int cc_cluster_name_len; struct ocfs2_protocol_version cc_version; struct ocfs2_locking_protocol *cc_proto; void (*cc_recovery_handler)(int node_num, void *recovery_data); @@ -152,7 +147,8 @@ struct ocfs2_stack_operations { * ->this_node() returns the cluster's unique identifier for the * local node. */ - int (*this_node)(unsigned int *node); + int (*this_node)(struct ocfs2_cluster_connection *conn, + unsigned int *node); /* * Call the underlying dlm lock function. The ->dlm_lock() @@ -214,7 +210,7 @@ struct ocfs2_stack_operations { struct file_lock *fl); /* - * This is an optoinal debugging hook. If provided, the + * This is an optional debugging hook. If provided, the * stack can dump debugging information about this lock. */ void (*dump_lksb)(struct ocfs2_dlm_lksb *lksb); @@ -227,7 +223,7 @@ struct ocfs2_stack_operations { */ struct ocfs2_stack_plugin { char *sp_name; - struct ocfs2_stack_operations *sp_ops; + const struct ocfs2_stack_operations *sp_ops; struct module *sp_owner; /* These are managed by the stackglue code. */ @@ -239,6 +235,8 @@ struct ocfs2_stack_plugin { /* Used by the filesystem */ int ocfs2_cluster_connect(const char *stack_name, + const char *cluster_name, + int cluster_name_len, const char *group, int grouplen, struct ocfs2_locking_protocol *lproto, @@ -260,7 +258,8 @@ int ocfs2_cluster_connect_agnostic(const char *group, int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, int hangup_pending); void ocfs2_cluster_hangup(const char *group, int grouplen); -int ocfs2_cluster_this_node(unsigned int *node); +int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn, + unsigned int *node); struct ocfs2_lock_res; int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, @@ -289,4 +288,6 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin); void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin); +extern struct kset *ocfs2_kset; + #endif /* STACKGLUE_H */ diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 5397c07ce608..6ac4dcd54588 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1,27 +1,11 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * suballoc.c * * metadata alloc and free * Inspired by ext3 block groups. * * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/fs.h> @@ -66,6 +50,10 @@ struct ocfs2_suballoc_result { u64 sr_blkno; /* The first allocated block */ unsigned int sr_bit_offset; /* The bit in the bg */ unsigned int sr_bits; /* How many bits we claimed */ + unsigned int sr_max_contig_bits; /* The length for contiguous + * free bits, only available + * for cluster group + */ }; static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res) @@ -79,8 +67,6 @@ static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res) return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset); } -static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); -static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); static int ocfs2_block_group_fill(handle_t *handle, struct inode *alloc_inode, @@ -113,12 +99,6 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, struct ocfs2_suballoc_result *res); static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, int nr); -static inline int ocfs2_block_group_set_bits(handle_t *handle, - struct inode *alloc_inode, - struct ocfs2_group_desc *bg, - struct buffer_head *group_bh, - unsigned int bit_off, - unsigned int num_bits); static int ocfs2_relink_block_group(handle_t *handle, struct inode *alloc_inode, struct buffer_head *fe_bh, @@ -147,7 +127,7 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) if (ac->ac_which != OCFS2_AC_USE_LOCAL) ocfs2_inode_unlock(inode, 1); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); iput(inode); ac->ac_inode = NULL; @@ -155,10 +135,8 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) brelse(ac->ac_bh); ac->ac_bh = NULL; ac->ac_resv = NULL; - if (ac->ac_find_loc_priv) { - kfree(ac->ac_find_loc_priv); - ac->ac_find_loc_priv = NULL; - } + kfree(ac->ac_find_loc_priv); + ac->ac_find_loc_priv = NULL; } void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) @@ -173,12 +151,12 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl) } #define do_error(fmt, ...) \ - do{ \ - if (resize) \ - mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \ - else \ - ocfs2_error(sb, fmt, ##__VA_ARGS__); \ - } while (0) +do { \ + if (resize) \ + mlog(ML_ERROR, fmt, ##__VA_ARGS__); \ + else \ + return ocfs2_error(sb, fmt, ##__VA_ARGS__); \ +} while (0) static int ocfs2_validate_gd_self(struct super_block *sb, struct buffer_head *bh, @@ -187,44 +165,35 @@ static int ocfs2_validate_gd_self(struct super_block *sb, struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { - do_error("Group descriptor #%llu has bad signature %.*s", + do_error("Group descriptor #%llu has bad signature %.*s\n", (unsigned long long)bh->b_blocknr, 7, gd->bg_signature); - return -EINVAL; } if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) { - do_error("Group descriptor #%llu has an invalid bg_blkno " - "of %llu", + do_error("Group descriptor #%llu has an invalid bg_blkno of %llu\n", (unsigned long long)bh->b_blocknr, (unsigned long long)le64_to_cpu(gd->bg_blkno)); - return -EINVAL; } if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) { - do_error("Group descriptor #%llu has an invalid " - "fs_generation of #%u", + do_error("Group descriptor #%llu has an invalid fs_generation of #%u\n", (unsigned long long)bh->b_blocknr, le32_to_cpu(gd->bg_generation)); - return -EINVAL; } if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) { - do_error("Group descriptor #%llu has bit count %u but " - "claims that %u are free", + do_error("Group descriptor #%llu has bit count %u but claims that %u are free\n", (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_bits), le16_to_cpu(gd->bg_free_bits_count)); - return -EINVAL; } if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) { - do_error("Group descriptor #%llu has bit count %u but " - "max bitmap bits of %u", + do_error("Group descriptor #%llu has bit count %u but max bitmap bits of %u\n", (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_bits), 8 * le16_to_cpu(gd->bg_size)); - return -EINVAL; } return 0; @@ -239,20 +208,17 @@ static int ocfs2_validate_gd_parent(struct super_block *sb, struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; if (di->i_blkno != gd->bg_parent_dinode) { - do_error("Group descriptor #%llu has bad parent " - "pointer (%llu, expected %llu)", + do_error("Group descriptor #%llu has bad parent pointer (%llu, expected %llu)\n", (unsigned long long)bh->b_blocknr, (unsigned long long)le64_to_cpu(gd->bg_parent_dinode), (unsigned long long)le64_to_cpu(di->i_blkno)); - return -EINVAL; } max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc); if (le16_to_cpu(gd->bg_bits) > max_bits) { - do_error("Group descriptor #%llu has bit count of %u", + do_error("Group descriptor #%llu has bit count of %u\n", (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_bits)); - return -EINVAL; } /* In resize, we may meet the case bg_chain == cl_next_free_rec. */ @@ -260,10 +226,9 @@ static int ocfs2_validate_gd_parent(struct super_block *sb, le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) || ((le16_to_cpu(gd->bg_chain) == le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) { - do_error("Group descriptor #%llu has bad chain %u", + do_error("Group descriptor #%llu has bad chain %u\n", (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_chain)); - return -EINVAL; } return 0; @@ -390,11 +355,10 @@ static int ocfs2_block_group_fill(handle_t *handle, struct super_block * sb = alloc_inode->i_sb; if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) { - ocfs2_error(alloc_inode->i_sb, "group block (%llu) != " - "b_blocknr (%llu)", - (unsigned long long)group_blkno, - (unsigned long long) bg_bh->b_blocknr); - status = -EIO; + status = ocfs2_error(alloc_inode->i_sb, + "group block (%llu) != b_blocknr (%llu)\n", + (unsigned long long)group_blkno, + (unsigned long long) bg_bh->b_blocknr); goto bail; } @@ -409,7 +373,7 @@ static int ocfs2_block_group_fill(handle_t *handle, memset(bg, 0, sb->s_blocksize); strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE); - bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); + bg->bg_generation = cpu_to_le32(osb->fs_generation); bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1, osb->s_feature_incompat)); bg->bg_chain = cpu_to_le16(my_chain); @@ -481,7 +445,7 @@ ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle, bg_bh = sb_getblk(osb->sb, bg_blkno); if (!bg_bh) { - status = -EIO; + status = -ENOMEM; mlog_errno(status); goto bail; } @@ -661,7 +625,7 @@ ocfs2_block_group_alloc_discontig(handle_t *handle, bg_bh = sb_getblk(osb->sb, bg_blkno); if (!bg_bh) { - status = -EIO; + status = -ENOMEM; mlog_errno(status); goto bail; } @@ -734,10 +698,12 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode, ac, cl); - if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC)) + if (PTR_ERR(bg_bh) == -ENOSPC) { + ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG; bg_bh = ocfs2_block_group_alloc_discontig(handle, alloc_inode, ac, cl); + } if (IS_ERR(bg_bh)) { status = PTR_ERR(bg_bh); bg_bh = NULL; @@ -777,6 +743,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); + ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0); status = 0; @@ -818,11 +785,11 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, return -EINVAL; } - mutex_lock(&alloc_inode->i_mutex); + inode_lock(alloc_inode); status = ocfs2_inode_lock(alloc_inode, &bh, 1); if (status < 0) { - mutex_unlock(&alloc_inode->i_mutex); + inode_unlock(alloc_inode); iput(alloc_inode); mlog_errno(status); @@ -839,9 +806,9 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) { - ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu", - (unsigned long long)le64_to_cpu(fe->i_blkno)); - status = -EIO; + status = ocfs2_error(alloc_inode->i_sb, + "Invalid chain allocator %llu\n", + (unsigned long long)le64_to_cpu(fe->i_blkno)); goto bail; } @@ -916,9 +883,9 @@ static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type) { spin_lock(&osb->osb_lock); if (type == INODE_ALLOC_SYSTEM_INODE) - osb->s_inode_steal_slot = slot; + osb->s_inode_steal_slot = (u16)slot; else if (type == EXTENT_ALLOC_SYSTEM_INODE) - osb->s_meta_steal_slot = slot; + osb->s_meta_steal_slot = (u16)slot; spin_unlock(&osb->osb_lock); } @@ -1168,12 +1135,9 @@ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, GLOBAL_BITMAP_SYSTEM_INODE, OCFS2_INVALID_SLOT, NULL, ALLOC_NEW_GROUP); - if (status < 0 && status != -ENOSPC) { + if (status < 0 && status != -ENOSPC) mlog_errno(status); - goto bail; - } -bail: return status; } @@ -1185,7 +1149,8 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, int flags, struct ocfs2_alloc_context **ac) { - int status; + int status, ret = 0; + int retried = 0; *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); if (!(*ac)) { @@ -1210,7 +1175,34 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, } if (status == -ENOSPC) { +retry: status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); + /* Retry if there is sufficient space cached in truncate log */ + if (status == -ENOSPC && !retried) { + retried = 1; + ocfs2_inode_unlock((*ac)->ac_inode, 1); + inode_unlock((*ac)->ac_inode); + + ret = ocfs2_try_to_free_truncate_log(osb, bits_wanted); + if (ret == 1) { + iput((*ac)->ac_inode); + (*ac)->ac_inode = NULL; + goto retry; + } + + if (ret < 0) + mlog_errno(ret); + + inode_lock((*ac)->ac_inode); + ret = ocfs2_inode_lock((*ac)->ac_inode, NULL, 1); + if (ret < 0) { + mlog_errno(ret); + inode_unlock((*ac)->ac_inode); + iput((*ac)->ac_inode); + (*ac)->ac_inode = NULL; + goto bail; + } + } if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -1264,25 +1256,48 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, int nr) { struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; + struct journal_head *jh; int ret; if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap)) return 0; - if (!buffer_jbd(bg_bh)) + jh = jbd2_journal_grab_journal_head(bg_bh); + if (!jh) return 1; - jbd_lock_bh_state(bg_bh); - bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data; + spin_lock(&jh->b_state_lock); + bg = (struct ocfs2_group_desc *) jh->b_committed_data; if (bg) ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap); else ret = 1; - jbd_unlock_bh_state(bg_bh); + spin_unlock(&jh->b_state_lock); + jbd2_journal_put_journal_head(jh); return ret; } +u16 ocfs2_find_max_contig_free_bits(void *bitmap, + u16 total_bits, u16 start) +{ + u16 offset, free_bits; + u16 contig_bits = 0; + + while (start < total_bits) { + offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start); + if (offset == total_bits) + break; + + start = ocfs2_find_next_bit(bitmap, total_bits, offset); + free_bits = start - offset; + if (contig_bits < free_bits) + contig_bits = free_bits; + } + + return contig_bits; +} + static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, struct buffer_head *bg_bh, unsigned int bits_wanted, @@ -1291,6 +1306,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, { void *bitmap; u16 best_offset, best_size; + u16 prev_best_size = 0; int offset, start, found, status = 0; struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; @@ -1301,10 +1317,8 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, found = start = best_offset = best_size = 0; bitmap = bg->bg_bitmap; - while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) { - if (offset == total_bits) - break; - + while ((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) < + total_bits) { if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) { /* We found a zero, but we can't use it as it * hasn't been put to disk yet! */ @@ -1319,6 +1333,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, /* got a zero after some ones */ found = 1; start = offset + 1; + prev_best_size = best_size; } if (found > best_size) { best_size = found; @@ -1331,6 +1346,8 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, } } + /* best_size will be allocated, we save prev_best_size */ + res->sr_max_contig_bits = prev_best_size; if (best_size) { res->sr_bit_offset = best_offset; res->sr_bits = best_size; @@ -1343,16 +1360,21 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, return status; } -static inline int ocfs2_block_group_set_bits(handle_t *handle, +int ocfs2_block_group_set_bits(handle_t *handle, struct inode *alloc_inode, struct ocfs2_group_desc *bg, struct buffer_head *group_bh, unsigned int bit_off, - unsigned int num_bits) + unsigned int num_bits, + unsigned int max_contig_bits, + int fastpath) { int status; void *bitmap = bg->bg_bitmap; int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; + unsigned int start = bit_off + num_bits; + u16 contig_bits; + struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); /* All callers get the descriptor via * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ @@ -1375,21 +1397,41 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle, le16_add_cpu(&bg->bg_free_bits_count, -num_bits); if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { - ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" - " count %u but claims %u are freed. num_bits %d", - (unsigned long long)le64_to_cpu(bg->bg_blkno), - le16_to_cpu(bg->bg_bits), - le16_to_cpu(bg->bg_free_bits_count), num_bits); - return -EROFS; + return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n", + (unsigned long long)le64_to_cpu(bg->bg_blkno), + le16_to_cpu(bg->bg_bits), + le16_to_cpu(bg->bg_free_bits_count), + num_bits); } while(num_bits--) ocfs2_set_bit(bit_off++, bitmap); + /* + * this is optimize path, caller set old contig value + * in max_contig_bits to bypass finding action. + */ + if (fastpath) { + bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits); + } else if (ocfs2_is_cluster_bitmap(alloc_inode)) { + /* + * Usually, the block group bitmap allocates only 1 bit + * at a time, while the cluster group allocates n bits + * each time. Therefore, we only save the contig bits for + * the cluster group. + */ + contig_bits = ocfs2_find_max_contig_free_bits(bitmap, + le16_to_cpu(bg->bg_bits), start); + if (contig_bits > max_contig_bits) + max_contig_bits = contig_bits; + bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits); + ocfs2_local_alloc_seen_free_bits(osb, max_contig_bits); + } else { + bg->bg_contig_free_bits = 0; + } + ocfs2_journal_dirty(handle, group_bh); bail: - if (status) - mlog_errno(status); return status; } @@ -1500,7 +1542,12 @@ static int ocfs2_cluster_group_search(struct inode *inode, BUG_ON(!ocfs2_is_cluster_bitmap(inode)); - if (gd->bg_free_bits_count) { + if (le16_to_cpu(gd->bg_contig_free_bits) && + le16_to_cpu(gd->bg_contig_free_bits) < bits_wanted) + return -ENOSPC; + + /* ->bg_contig_free_bits may un-initialized, so compare again */ + if (le16_to_cpu(gd->bg_free_bits_count) >= bits_wanted) { max_bits = le16_to_cpu(gd->bg_bits); /* Tail groups in cluster bitmaps which aren't cpg @@ -1520,7 +1567,7 @@ static int ocfs2_cluster_group_search(struct inode *inode, OCFS2_I(inode)->ip_clusters, max_bits); } - ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), + ret = ocfs2_block_group_find_clear_bits(osb, group_bh, bits_wanted, max_bits, res); if (ret) @@ -1544,13 +1591,6 @@ static int ocfs2_cluster_group_search(struct inode *inode, * of bits. */ if (min_bits <= res->sr_bits) search = 0; /* success */ - else if (res->sr_bits) { - /* - * Don't show bits which we'll be returning - * for allocation to the local alloc bitmap. - */ - ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits); - } } return search; @@ -1569,7 +1609,7 @@ static int ocfs2_block_group_search(struct inode *inode, BUG_ON(min_bits != 1); BUG_ON(ocfs2_is_cluster_bitmap(inode)); - if (bg->bg_free_bits_count) { + if (le16_to_cpu(bg->bg_free_bits_count) >= bits_wanted) { ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), group_bh, bits_wanted, le16_to_cpu(bg->bg_bits), @@ -1588,7 +1628,7 @@ static int ocfs2_block_group_search(struct inode *inode, return ret; } -static int ocfs2_alloc_dinode_update_counts(struct inode *inode, +int ocfs2_alloc_dinode_update_counts(struct inode *inode, handle_t *handle, struct buffer_head *di_bh, u32 num_bits, @@ -1615,6 +1655,21 @@ out: return ret; } +void ocfs2_rollback_alloc_dinode_counts(struct inode *inode, + struct buffer_head *di_bh, + u32 num_bits, + u16 chain) +{ + u32 tmp_used; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; + struct ocfs2_chain_list *cl; + + cl = (struct ocfs2_chain_list *)&di->id2.i_chain; + tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); + di->id1.bitmap1.i_used = cpu_to_le32(tmp_used - num_bits); + le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits); +} + static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res, struct ocfs2_extent_rec *rec, struct ocfs2_chain_list *cl) @@ -1714,9 +1769,14 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, } ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh, - res->sr_bit_offset, res->sr_bits); - if (ret < 0) + res->sr_bit_offset, res->sr_bits, + res->sr_max_contig_bits, 0); + if (ret < 0) { + ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh, + res->sr_bits, + le16_to_cpu(gd->bg_chain)); mlog_errno(ret); + } out_loc_only: *bits_left = le16_to_cpu(gd->bg_free_bits_count); @@ -1736,6 +1796,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, { int status; u16 chain; + u32 contig_bits; u64 next_group; struct inode *alloc_inode = ac->ac_inode; struct buffer_head *group_bh = NULL; @@ -1761,10 +1822,21 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, status = -ENOSPC; /* for now, the chain search is a bit simplistic. We just use * the 1st group with any empty bits. */ - while ((status = ac->ac_group_search(alloc_inode, group_bh, - bits_wanted, min_bits, - ac->ac_max_block, - res)) == -ENOSPC) { + while (1) { + if (ac->ac_which == OCFS2_AC_USE_MAIN_DISCONTIG) { + contig_bits = le16_to_cpu(bg->bg_contig_free_bits); + if (!contig_bits) + contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap, + le16_to_cpu(bg->bg_bits), 0); + if (bits_wanted > contig_bits && contig_bits >= min_bits) + bits_wanted = contig_bits; + } + + status = ac->ac_group_search(alloc_inode, group_bh, + bits_wanted, min_bits, + ac->ac_max_block, res); + if (status != -ENOSPC) + break; if (!bg->bg_next_group) break; @@ -1844,8 +1916,12 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, bg, group_bh, res->sr_bit_offset, - res->sr_bits); + res->sr_bits, + res->sr_max_contig_bits, + 0); if (status < 0) { + ocfs2_rollback_alloc_dinode_counts(alloc_inode, + ac->ac_bh, res->sr_bits, chain); mlog_errno(status); goto bail; } @@ -1891,13 +1967,11 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, if (le32_to_cpu(fe->id1.bitmap1.i_used) >= le32_to_cpu(fe->id1.bitmap1.i_total)) { - ocfs2_error(ac->ac_inode->i_sb, - "Chain allocator dinode %llu has %u used " - "bits but only %u total.", - (unsigned long long)le64_to_cpu(fe->i_blkno), - le32_to_cpu(fe->id1.bitmap1.i_used), - le32_to_cpu(fe->id1.bitmap1.i_total)); - status = -EIO; + status = ocfs2_error(ac->ac_inode->i_sb, + "Chain allocator dinode %llu has %u used bits but only %u total\n", + (unsigned long long)le64_to_cpu(fe->i_blkno), + le32_to_cpu(fe->id1.bitmap1.i_used), + le32_to_cpu(fe->id1.bitmap1.i_total)); goto bail; } @@ -1922,10 +1996,14 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, victim = ocfs2_find_victim_chain(cl); ac->ac_chain = victim; +search: status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, res, &bits_left); if (!status) { - hint = ocfs2_group_from_res(res); + if (ocfs2_is_cluster_bitmap(ac->ac_inode)) + hint = res->sr_bg_blkno; + else + hint = ocfs2_group_from_res(res); goto set_hint; } if (status < 0 && status != -ENOSPC) { @@ -1943,7 +2021,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) { if (i == victim) continue; - if (!cl->cl_recs[i].c_free) + if (le32_to_cpu(cl->cl_recs[i].c_free) < bits_wanted) continue; ac->ac_chain = i; @@ -1959,6 +2037,16 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, } } + /* Chains can't supply the bits_wanted contiguous space. + * We should switch to using every single bit when allocating + * from the global bitmap. */ + if (i == le16_to_cpu(cl->cl_next_free_rec) && + status == -ENOSPC && ac->ac_which == OCFS2_AC_USE_MAIN) { + ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG; + ac->ac_chain = victim; + goto search; + } + set_hint: if (status != -ENOSPC) { /* If the next search of this group is not likely to @@ -2099,7 +2187,7 @@ int ocfs2_find_new_inode_loc(struct inode *dir, ac->ac_find_loc_priv = res; *fe_blkno = res->sr_blkno; - + ocfs2_update_inode_fsync_trans(handle, dir, 0); out: if (handle) ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle); @@ -2155,8 +2243,12 @@ int ocfs2_claim_new_inode_at_loc(handle_t *handle, bg, bg_bh, res->sr_bit_offset, - res->sr_bits); + res->sr_bits, + res->sr_max_contig_bits, + 0); if (ret < 0) { + ocfs2_rollback_alloc_dinode_counts(ac->ac_inode, + ac->ac_bh, res->sr_bits, chain); mlog_errno(ret); goto out; } @@ -2298,7 +2390,8 @@ int __ocfs2_claim_clusters(handle_t *handle, BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL - && ac->ac_which != OCFS2_AC_USE_MAIN); + && ac->ac_which != OCFS2_AC_USE_MAIN + && ac->ac_which != OCFS2_AC_USE_MAIN_DISCONTIG); if (ac->ac_which == OCFS2_AC_USE_LOCAL) { WARN_ON(min_clusters > 1); @@ -2372,12 +2465,15 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, struct buffer_head *group_bh, unsigned int bit_off, unsigned int num_bits, + unsigned int max_contig_bits, void (*undo_fn)(unsigned int bit, unsigned long *bmap)) { int status; unsigned int tmp; + u16 contig_bits; struct ocfs2_group_desc *undo_bg = NULL; + struct journal_head *jh; /* The caller got this descriptor from * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ @@ -2396,10 +2492,10 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, goto bail; } + jh = bh2jh(group_bh); if (undo_fn) { - jbd_lock_bh_state(group_bh); - undo_bg = (struct ocfs2_group_desc *) - bh2jh(group_bh)->b_committed_data; + spin_lock(&jh->b_state_lock); + undo_bg = (struct ocfs2_group_desc *) jh->b_committed_data; BUG_ON(!undo_bg); } @@ -2413,16 +2509,31 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, } le16_add_cpu(&bg->bg_free_bits_count, num_bits); if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { - ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" - " count %u but claims %u are freed. num_bits %d", - (unsigned long long)le64_to_cpu(bg->bg_blkno), - le16_to_cpu(bg->bg_bits), - le16_to_cpu(bg->bg_free_bits_count), num_bits); - return -EROFS; + if (undo_fn) + spin_unlock(&jh->b_state_lock); + return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n", + (unsigned long long)le64_to_cpu(bg->bg_blkno), + le16_to_cpu(bg->bg_bits), + le16_to_cpu(bg->bg_free_bits_count), + num_bits); + } + + /* + * TODO: even 'num_bits == 1' (the worst case, release 1 cluster), + * we still need to rescan whole bitmap. + */ + if (ocfs2_is_cluster_bitmap(alloc_inode)) { + contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap, + le16_to_cpu(bg->bg_bits), 0); + if (contig_bits > max_contig_bits) + max_contig_bits = contig_bits; + bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits); + } else { + bg->bg_contig_free_bits = 0; } if (undo_fn) - jbd_unlock_bh_state(group_bh); + spin_unlock(&jh->b_state_lock); ocfs2_journal_dirty(handle, group_bh); bail: @@ -2447,6 +2558,7 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle, struct ocfs2_chain_list *cl = &fe->id2.i_chain; struct buffer_head *group_bh = NULL; struct ocfs2_group_desc *group; + __le16 old_bg_contig_free_bits = 0; /* The alloc_bh comes from ocfs2_free_dinode() or * ocfs2_free_clusters(). The callers have all locked the @@ -2471,9 +2583,11 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle, BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits)); + if (ocfs2_is_cluster_bitmap(alloc_inode)) + old_bg_contig_free_bits = group->bg_contig_free_bits; status = ocfs2_block_group_clear_bits(handle, alloc_inode, group, group_bh, - start_bit, count, undo_fn); + start_bit, count, 0, undo_fn); if (status < 0) { mlog_errno(status); goto bail; @@ -2483,6 +2597,9 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle, alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); + ocfs2_block_group_set_bits(handle, alloc_inode, group, group_bh, + start_bit, count, + le16_to_cpu(old_bg_contig_free_bits), 1); goto bail; } @@ -2494,9 +2611,6 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle, bail: brelse(group_bh); - - if (status) - mlog_errno(status); return status; } @@ -2537,16 +2651,16 @@ static int _ocfs2_free_clusters(handle_t *handle, int status; u16 bg_start_bit; u64 bg_blkno; - struct ocfs2_dinode *fe; /* You can't ever have a contiguous set of clusters * bigger than a block group bitmap so we never have to worry * about looping on them. * This is expensive. We can safely remove once this stuff has * gotten tested really well. */ - BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk))); + BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, + ocfs2_blocks_to_clusters(bitmap_inode->i_sb, + start_blk))); - fe = (struct ocfs2_dinode *) bitmap_bh->b_data; ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno, &bg_start_bit); @@ -2567,8 +2681,6 @@ static int _ocfs2_free_clusters(handle_t *handle, num_clusters); out: - if (status) - mlog_errno(status); return status; } @@ -2598,53 +2710,6 @@ int ocfs2_release_clusters(handle_t *handle, _ocfs2_clear_bit); } -static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg) -{ - printk("Block Group:\n"); - printk("bg_signature: %s\n", bg->bg_signature); - printk("bg_size: %u\n", bg->bg_size); - printk("bg_bits: %u\n", bg->bg_bits); - printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count); - printk("bg_chain: %u\n", bg->bg_chain); - printk("bg_generation: %u\n", le32_to_cpu(bg->bg_generation)); - printk("bg_next_group: %llu\n", - (unsigned long long)bg->bg_next_group); - printk("bg_parent_dinode: %llu\n", - (unsigned long long)bg->bg_parent_dinode); - printk("bg_blkno: %llu\n", - (unsigned long long)bg->bg_blkno); -} - -static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe) -{ - int i; - - printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno); - printk("i_signature: %s\n", fe->i_signature); - printk("i_size: %llu\n", - (unsigned long long)fe->i_size); - printk("i_clusters: %u\n", fe->i_clusters); - printk("i_generation: %u\n", - le32_to_cpu(fe->i_generation)); - printk("id1.bitmap1.i_used: %u\n", - le32_to_cpu(fe->id1.bitmap1.i_used)); - printk("id1.bitmap1.i_total: %u\n", - le32_to_cpu(fe->id1.bitmap1.i_total)); - printk("id2.i_chain.cl_cpg: %u\n", fe->id2.i_chain.cl_cpg); - printk("id2.i_chain.cl_bpc: %u\n", fe->id2.i_chain.cl_bpc); - printk("id2.i_chain.cl_count: %u\n", fe->id2.i_chain.cl_count); - printk("id2.i_chain.cl_next_free_rec: %u\n", - fe->id2.i_chain.cl_next_free_rec); - for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) { - printk("fe->id2.i_chain.cl_recs[%d].c_free: %u\n", i, - fe->id2.i_chain.cl_recs[i].c_free); - printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i, - fe->id2.i_chain.cl_recs[i].c_total); - printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i, - (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno); - } -} - /* * For a given allocation, determine which allocators will need to be * accessed, and lock them, reserving the appropriate number of bits. @@ -2671,7 +2736,7 @@ int ocfs2_lock_allocators(struct inode *inode, BUG_ON(clusters_to_add != 0 && data_ac == NULL); - num_free_extents = ocfs2_num_free_extents(osb, et); + num_free_extents = ocfs2_num_free_extents(et); if (num_free_extents < 0) { ret = num_free_extents; mlog_errno(ret); @@ -2862,9 +2927,12 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) goto bail; } - inode_alloc_inode = - ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, - suballoc_slot); + if (suballoc_slot == (u16)OCFS2_INVALID_SLOT) + inode_alloc_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_INODE_ALLOC_SYSTEM_INODE, suballoc_slot); + else + inode_alloc_inode = ocfs2_get_system_file_inode(osb, + INODE_ALLOC_SYSTEM_INODE, suballoc_slot); if (!inode_alloc_inode) { /* the error code could be inaccurate, but we are not able to * get the correct one. */ @@ -2874,10 +2942,11 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) goto bail; } - mutex_lock(&inode_alloc_inode->i_mutex); + inode_lock(inode_alloc_inode); status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0); if (status < 0) { - mutex_unlock(&inode_alloc_inode->i_mutex); + inode_unlock(inode_alloc_inode); + iput(inode_alloc_inode); mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n", (u32)suballoc_slot, status); goto bail; @@ -2889,7 +2958,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) mlog(ML_ERROR, "test suballoc bit failed %d\n", status); ocfs2_inode_unlock(inode_alloc_inode, 0); - mutex_unlock(&inode_alloc_inode->i_mutex); + inode_unlock(inode_alloc_inode); iput(inode_alloc_inode); brelse(alloc_bh); diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index a36d0aa50911..bcf2ed4a8631 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -1,26 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * suballoc.h * * Defines sub allocator api * * Copyright (C) 2003, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #ifndef _CHAINALLOC_H_ @@ -45,6 +29,7 @@ struct ocfs2_alloc_context { #define OCFS2_AC_USE_MAIN 2 #define OCFS2_AC_USE_INODE 3 #define OCFS2_AC_USE_META 4 +#define OCFS2_AC_USE_MAIN_DISCONTIG 5 u32 ac_which; /* these are used by the chain search */ @@ -54,7 +39,7 @@ struct ocfs2_alloc_context { u64 ac_last_group; u64 ac_max_block; /* Highest block number to allocate. 0 is - is the same as ~0 - unlimited */ + the same as ~0 - unlimited */ int ac_find_loc_only; /* hack for reflink operation ordering */ struct ocfs2_suballoc_result *ac_find_loc_priv; /* */ @@ -86,6 +71,26 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb, u32 bits_wanted, struct ocfs2_alloc_context **ac); +int ocfs2_alloc_dinode_update_counts(struct inode *inode, + handle_t *handle, + struct buffer_head *di_bh, + u32 num_bits, + u16 chain); +void ocfs2_rollback_alloc_dinode_counts(struct inode *inode, + struct buffer_head *di_bh, + u32 num_bits, + u16 chain); +u16 ocfs2_find_max_contig_free_bits(void *bitmap, + u16 total_bits, u16 start); +int ocfs2_block_group_set_bits(handle_t *handle, + struct inode *alloc_inode, + struct ocfs2_group_desc *bg, + struct buffer_head *group_bh, + unsigned int bit_off, + unsigned int num_bits, + unsigned int max_contig_bits, + int fastpath); + int ocfs2_claim_metadata(handle_t *handle, struct ocfs2_alloc_context *ac, u32 bits_wanted, @@ -106,7 +111,7 @@ int ocfs2_claim_clusters(handle_t *handle, u32 *cluster_start, u32 *num_clusters); /* - * Use this variant of ocfs2_claim_clusters to specify a maxiumum + * Use this variant of ocfs2_claim_clusters to specify a maximum * number of clusters smaller than the allocation reserved. */ int __ocfs2_claim_clusters(handle_t *handle, diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 854d80955bf8..2c7ba1480f7a 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1,26 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * super.c * * load/unload driver, mount/dismount volumes * * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/module.h> @@ -35,13 +19,13 @@ #include <linux/blkdev.h> #include <linux/socket.h> #include <linux/inet.h> -#include <linux/parser.h> +#include <linux/fs_parser.h> +#include <linux/fs_context.h> #include <linux/crc32.h> #include <linux/debugfs.h> -#include <linux/mount.h> #include <linux/seq_file.h> #include <linux/quotaops.h> -#include <linux/cleancache.h> +#include <linux/signal.h> #define CREATE_TRACE_POINTS #include "ocfs2_trace.h" @@ -68,50 +52,43 @@ #include "super.h" #include "sysfile.h" #include "uptodate.h" -#include "ver.h" #include "xattr.h" #include "quota.h" #include "refcounttree.h" #include "suballoc.h" #include "buffer_head_io.h" +#include "filecheck.h" -static struct kmem_cache *ocfs2_inode_cachep = NULL; +static struct kmem_cache *ocfs2_inode_cachep; struct kmem_cache *ocfs2_dquot_cachep; struct kmem_cache *ocfs2_qf_chunk_cachep; -/* OCFS2 needs to schedule several different types of work which - * require cluster locking, disk I/O, recovery waits, etc. Since these - * types of work tend to be heavy we avoid using the kernel events - * workqueue and schedule on our own. */ -struct workqueue_struct *ocfs2_wq = NULL; - -static struct dentry *ocfs2_debugfs_root = NULL; +static struct dentry *ocfs2_debugfs_root; MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("OCFS2 cluster file system"); struct mount_options { unsigned long commit_interval; unsigned long mount_opt; unsigned int atime_quantum; - signed short slot; + unsigned short slot; int localalloc_opt; unsigned int resv_level; int dir_resv_level; char cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; + bool user_stack; }; -static int ocfs2_parse_options(struct super_block *sb, char *options, - struct mount_options *mopt, - int is_remount); +static int ocfs2_parse_param(struct fs_context *fc, struct fs_parameter *param); static int ocfs2_check_set_options(struct super_block *sb, struct mount_options *options); static int ocfs2_show_options(struct seq_file *s, struct dentry *root); static void ocfs2_put_super(struct super_block *sb); static int ocfs2_mount_volume(struct super_block *sb); -static int ocfs2_remount(struct super_block *sb, int *flags, char *data); static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err); static int ocfs2_initialize_mem_caches(void); static void ocfs2_free_mem_caches(void); @@ -138,36 +115,36 @@ static int ocfs2_get_sector(struct super_block *sb, int block, int sect_size); static struct inode *ocfs2_alloc_inode(struct super_block *sb); -static void ocfs2_destroy_inode(struct inode *inode); +static void ocfs2_free_inode(struct inode *inode); static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend); static int ocfs2_enable_quotas(struct ocfs2_super *osb); static void ocfs2_disable_quotas(struct ocfs2_super *osb); +static struct dquot __rcu **ocfs2_get_dquots(struct inode *inode) +{ + return OCFS2_I(inode)->i_dquot; +} + static const struct super_operations ocfs2_sops = { .statfs = ocfs2_statfs, .alloc_inode = ocfs2_alloc_inode, - .destroy_inode = ocfs2_destroy_inode, - .drop_inode = ocfs2_drop_inode, + .free_inode = ocfs2_free_inode, + .drop_inode = inode_just_drop, .evict_inode = ocfs2_evict_inode, .sync_fs = ocfs2_sync_fs, .put_super = ocfs2_put_super, - .remount_fs = ocfs2_remount, .show_options = ocfs2_show_options, .quota_read = ocfs2_quota_read, .quota_write = ocfs2_quota_write, + .get_dquots = ocfs2_get_dquots, }; enum { Opt_barrier, - Opt_err_panic, - Opt_err_ro, + Opt_errors, Opt_intr, - Opt_nointr, - Opt_hb_none, - Opt_hb_local, - Opt_hb_global, - Opt_data_ordered, - Opt_data_writeback, + Opt_heartbeat, + Opt_data, Opt_atime_quantum, Opt_slot, Opt_commit, @@ -175,48 +152,64 @@ enum { Opt_localflocks, Opt_stack, Opt_user_xattr, - Opt_nouser_xattr, Opt_inode64, Opt_acl, - Opt_noacl, Opt_usrquota, Opt_grpquota, - Opt_coherency_buffered, - Opt_coherency_full, + Opt_coherency, Opt_resv_level, Opt_dir_resv_level, - Opt_err, + Opt_journal_async_commit, +}; + +static const struct constant_table ocfs2_param_errors[] = { + {"panic", OCFS2_MOUNT_ERRORS_PANIC}, + {"remount-ro", OCFS2_MOUNT_ERRORS_ROFS}, + {"continue", OCFS2_MOUNT_ERRORS_CONT}, + {} +}; + +static const struct constant_table ocfs2_param_heartbeat[] = { + {"local", OCFS2_MOUNT_HB_LOCAL}, + {"none", OCFS2_MOUNT_HB_NONE}, + {"global", OCFS2_MOUNT_HB_GLOBAL}, + {} +}; + +static const struct constant_table ocfs2_param_data[] = { + {"writeback", OCFS2_MOUNT_DATA_WRITEBACK}, + {"ordered", 0}, + {} }; -static const match_table_t tokens = { - {Opt_barrier, "barrier=%u"}, - {Opt_err_panic, "errors=panic"}, - {Opt_err_ro, "errors=remount-ro"}, - {Opt_intr, "intr"}, - {Opt_nointr, "nointr"}, - {Opt_hb_none, OCFS2_HB_NONE}, - {Opt_hb_local, OCFS2_HB_LOCAL}, - {Opt_hb_global, OCFS2_HB_GLOBAL}, - {Opt_data_ordered, "data=ordered"}, - {Opt_data_writeback, "data=writeback"}, - {Opt_atime_quantum, "atime_quantum=%u"}, - {Opt_slot, "preferred_slot=%u"}, - {Opt_commit, "commit=%u"}, - {Opt_localalloc, "localalloc=%d"}, - {Opt_localflocks, "localflocks"}, - {Opt_stack, "cluster_stack=%s"}, - {Opt_user_xattr, "user_xattr"}, - {Opt_nouser_xattr, "nouser_xattr"}, - {Opt_inode64, "inode64"}, - {Opt_acl, "acl"}, - {Opt_noacl, "noacl"}, - {Opt_usrquota, "usrquota"}, - {Opt_grpquota, "grpquota"}, - {Opt_coherency_buffered, "coherency=buffered"}, - {Opt_coherency_full, "coherency=full"}, - {Opt_resv_level, "resv_level=%u"}, - {Opt_dir_resv_level, "dir_resv_level=%u"}, - {Opt_err, NULL} +static const struct constant_table ocfs2_param_coherency[] = { + {"buffered", OCFS2_MOUNT_COHERENCY_BUFFERED}, + {"full", 0}, + {} +}; + +static const struct fs_parameter_spec ocfs2_param_spec[] = { + fsparam_u32 ("barrier", Opt_barrier), + fsparam_enum ("errors", Opt_errors, ocfs2_param_errors), + fsparam_flag_no ("intr", Opt_intr), + fsparam_enum ("heartbeat", Opt_heartbeat, ocfs2_param_heartbeat), + fsparam_enum ("data", Opt_data, ocfs2_param_data), + fsparam_u32 ("atime_quantum", Opt_atime_quantum), + fsparam_u32 ("preferred_slot", Opt_slot), + fsparam_u32 ("commit", Opt_commit), + fsparam_s32 ("localalloc", Opt_localalloc), + fsparam_flag ("localflocks", Opt_localflocks), + fsparam_string ("cluster_stack", Opt_stack), + fsparam_flag_no ("user_xattr", Opt_user_xattr), + fsparam_flag ("inode64", Opt_inode64), + fsparam_flag_no ("acl", Opt_acl), + fsparam_flag ("usrquota", Opt_usrquota), + fsparam_flag ("grpquota", Opt_grpquota), + fsparam_enum ("coherency", Opt_coherency, ocfs2_param_coherency), + fsparam_u32 ("resv_level", Opt_resv_level), + fsparam_u32 ("dir_resv_level", Opt_dir_resv_level), + fsparam_flag ("journal_async_commit", Opt_journal_async_commit), + {} }; #ifdef CONFIG_DEBUG_FS @@ -226,32 +219,33 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) struct ocfs2_recovery_map *rm = osb->recovery_map; struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan; int i, out = 0; + unsigned long flags; - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n", "Device", osb->dev_str, osb->uuid_str, osb->fs_generation, osb->vol_label); - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s => State: %d Flags: 0x%lX\n", "Volume", atomic_read(&osb->vol_state), osb->osb_flags); - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s => Block: %lu Cluster: %d\n", "Sizes", osb->sb->s_blocksize, osb->s_clustersize); - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s => Compat: 0x%X Incompat: 0x%X " "ROcompat: 0x%X\n", "Features", osb->s_feature_compat, osb->s_feature_incompat, osb->s_feature_ro_compat); - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s => Opts: 0x%lX AtimeQuanta: %u\n", "Mount", osb->s_mount_opt, osb->s_atime_quantum); if (cconn) { - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s => Stack: %s Name: %*s " "Version: %d.%d\n", "Cluster", (*osb->osb_cluster_stack == '\0' ? @@ -261,42 +255,42 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) cconn->cc_version.pv_minor); } - spin_lock(&osb->dc_task_lock); - out += snprintf(buf + out, len - out, + spin_lock_irqsave(&osb->dc_task_lock, flags); + out += scnprintf(buf + out, len - out, "%10s => Pid: %d Count: %lu WakeSeq: %lu " "WorkSeq: %lu\n", "DownCnvt", (osb->dc_task ? task_pid_nr(osb->dc_task) : -1), osb->blocked_lock_count, osb->dc_wake_sequence, osb->dc_work_sequence); - spin_unlock(&osb->dc_task_lock); + spin_unlock_irqrestore(&osb->dc_task_lock, flags); spin_lock(&osb->osb_lock); - out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:", + out += scnprintf(buf + out, len - out, "%10s => Pid: %d Nodes:", "Recovery", (osb->recovery_thread_task ? task_pid_nr(osb->recovery_thread_task) : -1)); if (rm->rm_used == 0) - out += snprintf(buf + out, len - out, " None\n"); + out += scnprintf(buf + out, len - out, " None\n"); else { for (i = 0; i < rm->rm_used; i++) - out += snprintf(buf + out, len - out, " %d", + out += scnprintf(buf + out, len - out, " %d", rm->rm_entries[i]); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); } spin_unlock(&osb->osb_lock); - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s => Pid: %d Interval: %lu\n", "Commit", (osb->commit_task ? task_pid_nr(osb->commit_task) : -1), osb->osb_commit_interval); - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s => State: %d TxnId: %lu NumTxns: %d\n", "Journal", osb->journal->j_state, osb->journal->j_trans_id, atomic_read(&osb->journal->j_num_trans)); - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s => GlobalAllocs: %d LocalAllocs: %d " "SubAllocs: %d LAWinMoves: %d SAExtends: %d\n", "Stats", @@ -306,7 +300,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) atomic_read(&osb->alloc_stats.moves), atomic_read(&osb->alloc_stats.bg_extends)); - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s => State: %u Descriptor: %llu Size: %u bits " "Default: %u bits\n", "LocalAlloc", osb->local_alloc_state, @@ -314,7 +308,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) osb->local_alloc_bits, osb->local_alloc_default_bits); spin_lock(&osb->osb_lock); - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s => InodeSlot: %d StolenInodes: %d, " "MetaSlot: %d StolenMeta: %d\n", "Steal", osb->s_inode_steal_slot, @@ -323,20 +317,20 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) atomic_read(&osb->s_num_meta_stolen)); spin_unlock(&osb->osb_lock); - out += snprintf(buf + out, len - out, "OrphanScan => "); - out += snprintf(buf + out, len - out, "Local: %u Global: %u ", + out += scnprintf(buf + out, len - out, "OrphanScan => "); + out += scnprintf(buf + out, len - out, "Local: %u Global: %u ", os->os_count, os->os_seqno); - out += snprintf(buf + out, len - out, " Last Scan: "); + out += scnprintf(buf + out, len - out, " Last Scan: "); if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE) - out += snprintf(buf + out, len - out, "Disabled\n"); + out += scnprintf(buf + out, len - out, "Disabled\n"); else - out += snprintf(buf + out, len - out, "%lu seconds ago\n", - (get_seconds() - os->os_scantime.tv_sec)); + out += scnprintf(buf + out, len - out, "%lu seconds ago\n", + (unsigned long)(ktime_get_seconds() - os->os_scantime)); - out += snprintf(buf + out, len - out, "%10s => %3s %10s\n", + out += scnprintf(buf + out, len - out, "%10s => %3s %10s\n", "Slots", "Num", "RecoGen"); for (i = 0; i < osb->max_slots; ++i) { - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s %c %3d %10d\n", " ", (i == osb->slot_num ? '*' : ' '), @@ -416,10 +410,10 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait) ocfs2_schedule_truncate_log_flush(osb, 0); } - if (jbd2_journal_start_commit(OCFS2_SB(sb)->journal->j_journal, + if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) { if (wait) - jbd2_log_wait_commit(OCFS2_SB(sb)->journal->j_journal, + jbd2_log_wait_commit(osb->journal->j_journal, target); } return 0; @@ -467,9 +461,8 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); if (!new) { ocfs2_release_system_inodes(osb); - status = -EINVAL; + status = ocfs2_is_soft_readonly(osb) ? -EROFS : -EINVAL; mlog_errno(status); - /* FIXME: Should ERROR_RO_FS */ mlog(ML_ERROR, "Unable to load system inode %d, " "possibly corrupt fs?", i); goto bail; @@ -498,7 +491,7 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb) new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); if (!new) { ocfs2_release_system_inodes(osb); - status = -EINVAL; + status = ocfs2_is_soft_readonly(osb) ? -EROFS : -EINVAL; mlog(ML_ERROR, "status=%d, sysfile=%d, slot=%d\n", status, i, osb->slot_num); goto bail; @@ -557,25 +550,23 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb) { struct ocfs2_inode_info *oi; - oi = kmem_cache_alloc(ocfs2_inode_cachep, GFP_NOFS); + oi = alloc_inode_sb(sb, ocfs2_inode_cachep, GFP_NOFS); if (!oi) return NULL; + oi->i_sync_tid = 0; + oi->i_datasync_tid = 0; + memset(&oi->i_dquot, 0, sizeof(oi->i_dquot)); + jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode); return &oi->vfs_inode; } -static void ocfs2_i_callback(struct rcu_head *head) +static void ocfs2_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode)); } -static void ocfs2_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, ocfs2_i_callback); -} - static unsigned long long ocfs2_max_file_offset(unsigned int bbits, unsigned int cbits) { @@ -590,13 +581,12 @@ static unsigned long long ocfs2_max_file_offset(unsigned int bbits, */ #if BITS_PER_LONG == 32 -# if defined(CONFIG_LBDAF) BUILD_BUG_ON(sizeof(sector_t) != 8); /* * We might be limited by page cache size. */ - if (bytes > PAGE_CACHE_SIZE) { - bytes = PAGE_CACHE_SIZE; + if (bytes > PAGE_SIZE) { + bytes = PAGE_SIZE; trim = 1; /* * Shift by 31 here so that we don't get larger than @@ -604,15 +594,6 @@ static unsigned long long ocfs2_max_file_offset(unsigned int bbits, */ bitshift = 31; } -# else - /* - * We are limited by the size of sector_t. Use block size, as - * that's what we expose to the VFS. - */ - bytes = 1 << bbits; - trim = 1; - bitshift = 31; -# endif #endif /* @@ -623,30 +604,32 @@ static unsigned long long ocfs2_max_file_offset(unsigned int bbits, return (((unsigned long long)bytes) << bitshift) - trim; } -static int ocfs2_remount(struct super_block *sb, int *flags, char *data) +static int ocfs2_reconfigure(struct fs_context *fc) { int incompat_features; int ret = 0; - struct mount_options parsed_options; + struct mount_options *parsed_options = fc->fs_private; + struct super_block *sb = fc->root->d_sb; struct ocfs2_super *osb = OCFS2_SB(sb); u32 tmp; - if (!ocfs2_parse_options(sb, data, &parsed_options, 1) || - !ocfs2_check_set_options(sb, &parsed_options)) { + sync_filesystem(sb); + + if (!ocfs2_check_set_options(sb, parsed_options)) { ret = -EINVAL; goto out; } tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | OCFS2_MOUNT_HB_NONE; - if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) { + if ((osb->s_mount_opt & tmp) != (parsed_options->mount_opt & tmp)) { ret = -EINVAL; mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n"); goto out; } if ((osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) != - (parsed_options.mount_opt & OCFS2_MOUNT_DATA_WRITEBACK)) { + (parsed_options->mount_opt & OCFS2_MOUNT_DATA_WRITEBACK)) { ret = -EINVAL; mlog(ML_ERROR, "Cannot change data mode on remount\n"); goto out; @@ -655,16 +638,16 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) /* Probably don't want this on remount; it might * mess with other nodes */ if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64) && - (parsed_options.mount_opt & OCFS2_MOUNT_INODE64)) { + (parsed_options->mount_opt & OCFS2_MOUNT_INODE64)) { ret = -EINVAL; mlog(ML_ERROR, "Cannot enable inode64 on remount\n"); goto out; } /* We're going to/from readonly mode. */ - if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { + if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) { /* Disable quota accounting before remounting RO */ - if (*flags & MS_RDONLY) { + if (fc->sb_flags & SB_RDONLY) { ret = ocfs2_susp_quotas(osb, 0); if (ret < 0) goto out; @@ -678,8 +661,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) goto unlock_osb; } - if (*flags & MS_RDONLY) { - sb->s_flags |= MS_RDONLY; + if (fc->sb_flags & SB_RDONLY) { + sb->s_flags |= SB_RDONLY; osb->osb_flags |= OCFS2_OSB_SOFT_RO; } else { if (osb->osb_flags & OCFS2_OSB_ERROR_FS) { @@ -696,14 +679,14 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) ret = -EINVAL; goto unlock_osb; } - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; osb->osb_flags &= ~OCFS2_OSB_SOFT_RO; } - trace_ocfs2_remount(sb->s_flags, osb->osb_flags, *flags); + trace_ocfs2_remount(sb->s_flags, osb->osb_flags, fc->sb_flags); unlock_osb: spin_unlock(&osb->osb_lock); /* Enable quota accounting after remounting RW */ - if (!ret && !(*flags & MS_RDONLY)) { + if (!ret && !(fc->sb_flags & SB_RDONLY)) { if (sb_any_quota_suspended(sb)) ret = ocfs2_susp_quotas(osb, 1); else @@ -711,7 +694,7 @@ unlock_osb: if (ret < 0) { /* Return back changes... */ spin_lock(&osb->osb_lock); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; osb->osb_flags |= OCFS2_OSB_SOFT_RO; spin_unlock(&osb->osb_lock); goto out; @@ -722,18 +705,18 @@ unlock_osb: if (!ret) { /* Only save off the new mount options in case of a successful * remount. */ - osb->s_mount_opt = parsed_options.mount_opt; - osb->s_atime_quantum = parsed_options.atime_quantum; - osb->preferred_slot = parsed_options.slot; - if (parsed_options.commit_interval) - osb->osb_commit_interval = parsed_options.commit_interval; + osb->s_mount_opt = parsed_options->mount_opt; + osb->s_atime_quantum = parsed_options->atime_quantum; + osb->preferred_slot = parsed_options->slot; + if (parsed_options->commit_interval) + osb->osb_commit_interval = parsed_options->commit_interval; if (!ocfs2_is_hard_readonly(osb)) ocfs2_set_journal_params(osb); - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? - MS_POSIXACL : 0); + SB_POSIXACL : 0); } out: return ret; @@ -894,11 +877,12 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend) { int type; struct super_block *sb = osb->sb; - unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, - OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; + unsigned int feature[OCFS2_MAXQUOTAS] = { + OCFS2_FEATURE_RO_COMPAT_USRQUOTA, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; int status = 0; - for (type = 0; type < MAXQUOTAS; type++) { + for (type = 0; type < OCFS2_MAXQUOTAS; type++) { if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) continue; if (unsuspend) @@ -922,17 +906,19 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend) static int ocfs2_enable_quotas(struct ocfs2_super *osb) { - struct inode *inode[MAXQUOTAS] = { NULL, NULL }; + struct inode *inode[OCFS2_MAXQUOTAS] = { NULL, NULL }; struct super_block *sb = osb->sb; - unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, - OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; - unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, + unsigned int feature[OCFS2_MAXQUOTAS] = { + OCFS2_FEATURE_RO_COMPAT_USRQUOTA, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; + unsigned int ino[OCFS2_MAXQUOTAS] = { + LOCAL_USER_QUOTA_SYSTEM_INODE, LOCAL_GROUP_QUOTA_SYSTEM_INODE }; int status; int type; sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE; - for (type = 0; type < MAXQUOTAS; type++) { + for (type = 0; type < OCFS2_MAXQUOTAS; type++) { if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) continue; inode[type] = ocfs2_get_system_file_inode(osb, ino[type], @@ -941,18 +927,18 @@ static int ocfs2_enable_quotas(struct ocfs2_super *osb) status = -ENOENT; goto out_quota_off; } - status = dquot_enable(inode[type], type, QFMT_OCFS2, - DQUOT_USAGE_ENABLED); + status = dquot_load_quota_inode(inode[type], type, QFMT_OCFS2, + DQUOT_USAGE_ENABLED); if (status < 0) goto out_quota_off; } - for (type = 0; type < MAXQUOTAS; type++) + for (type = 0; type < OCFS2_MAXQUOTAS; type++) iput(inode[type]); return 0; out_quota_off: ocfs2_disable_quotas(osb); - for (type = 0; type < MAXQUOTAS; type++) + for (type = 0; type < OCFS2_MAXQUOTAS; type++) iput(inode[type]); mlog_errno(status); return status; @@ -967,121 +953,85 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb) /* We mostly ignore errors in this function because there's not much * we can do when we see them */ - for (type = 0; type < MAXQUOTAS; type++) { + for (type = 0; type < OCFS2_MAXQUOTAS; type++) { if (!sb_has_quota_loaded(sb, type)) continue; - /* Cancel periodic syncing before we grab dqonoff_mutex */ - oinfo = sb_dqinfo(sb, type)->dqi_priv; - cancel_delayed_work_sync(&oinfo->dqi_sync_work); + if (!sb_has_quota_suspended(sb, type)) { + oinfo = sb_dqinfo(sb, type)->dqi_priv; + cancel_delayed_work_sync(&oinfo->dqi_sync_work); + } inode = igrab(sb->s_dquot.files[type]); /* Turn off quotas. This will remove all dquot structures from * memory and so they will be automatically synced to global * quota files */ dquot_disable(sb, type, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); - if (!inode) - continue; iput(inode); } } -/* Handle quota on quotactl */ -static int ocfs2_quota_on(struct super_block *sb, int type, int format_id) -{ - unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, - OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; - - if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) - return -EINVAL; - - return dquot_enable(sb_dqopt(sb)->files[type], type, - format_id, DQUOT_LIMITS_ENABLED); -} - -/* Handle quota off quotactl */ -static int ocfs2_quota_off(struct super_block *sb, int type) -{ - return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED); -} - -static const struct quotactl_ops ocfs2_quotactl_ops = { - .quota_on_meta = ocfs2_quota_on, - .quota_off = ocfs2_quota_off, - .quota_sync = dquot_quota_sync, - .get_info = dquot_get_dqinfo, - .set_info = dquot_set_dqinfo, - .get_dqblk = dquot_get_dqblk, - .set_dqblk = dquot_set_dqblk, -}; - -static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) +static int ocfs2_fill_super(struct super_block *sb, struct fs_context *fc) { struct dentry *root; int status, sector_size; - struct mount_options parsed_options; + struct mount_options *parsed_options = fc->fs_private; struct inode *inode = NULL; struct ocfs2_super *osb = NULL; struct buffer_head *bh = NULL; - char nodestr[8]; + char nodestr[12]; struct ocfs2_blockcheck_stats stats; - trace_ocfs2_fill_super(sb, data, silent); - - if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) { - status = -EINVAL; - goto read_super_error; - } + trace_ocfs2_fill_super(sb, fc, fc->sb_flags & SB_SILENT); /* probe for superblock */ status = ocfs2_sb_probe(sb, &bh, §or_size, &stats); if (status < 0) { mlog(ML_ERROR, "superblock probe failed!\n"); - goto read_super_error; + goto out; } status = ocfs2_initialize_super(sb, bh, sector_size, &stats); - osb = OCFS2_SB(sb); - if (status < 0) { - mlog_errno(status); - goto read_super_error; - } brelse(bh); bh = NULL; + if (status < 0) + goto out; + + osb = OCFS2_SB(sb); - if (!ocfs2_check_set_options(sb, &parsed_options)) { + if (!ocfs2_check_set_options(sb, parsed_options)) { status = -EINVAL; - goto read_super_error; - } - osb->s_mount_opt = parsed_options.mount_opt; - osb->s_atime_quantum = parsed_options.atime_quantum; - osb->preferred_slot = parsed_options.slot; - osb->osb_commit_interval = parsed_options.commit_interval; - - ocfs2_la_set_sizes(osb, parsed_options.localalloc_opt); - osb->osb_resv_level = parsed_options.resv_level; - osb->osb_dir_resv_level = parsed_options.resv_level; - if (parsed_options.dir_resv_level == -1) - osb->osb_dir_resv_level = parsed_options.resv_level; + goto out_super; + } + osb->s_mount_opt = parsed_options->mount_opt; + osb->s_atime_quantum = parsed_options->atime_quantum; + osb->preferred_slot = parsed_options->slot; + osb->osb_commit_interval = parsed_options->commit_interval; + + ocfs2_la_set_sizes(osb, parsed_options->localalloc_opt); + osb->osb_resv_level = parsed_options->resv_level; + osb->osb_dir_resv_level = parsed_options->resv_level; + if (parsed_options->dir_resv_level == -1) + osb->osb_dir_resv_level = parsed_options->resv_level; else - osb->osb_dir_resv_level = parsed_options.dir_resv_level; + osb->osb_dir_resv_level = parsed_options->dir_resv_level; - status = ocfs2_verify_userspace_stack(osb, &parsed_options); + status = ocfs2_verify_userspace_stack(osb, parsed_options); if (status) - goto read_super_error; + goto out_super; sb->s_magic = OCFS2_SUPER_MAGIC; - sb->s_flags = (sb->s_flags & ~(MS_POSIXACL | MS_NOSEC)) | - ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + sb->s_flags = (sb->s_flags & ~(SB_POSIXACL | SB_NOSEC)) | + ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? SB_POSIXACL : 0); - /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, + /* Hard readonly mode only if: bdev_read_only, SB_RDONLY, * heartbeat=none */ if (bdev_read_only(sb->s_bdev)) { - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { status = -EACCES; mlog(ML_ERROR, "Readonly device detected but readonly " "mount was not specified.\n"); - goto read_super_error; + goto out_super; } /* You should not be able to start a local heartbeat @@ -1090,7 +1040,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) status = -EROFS; mlog(ML_ERROR, "Local heartbeat specified on readonly " "device.\n"); - goto read_super_error; + goto out_super; } status = ocfs2_check_journals_nolocks(osb); @@ -1099,9 +1049,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) mlog(ML_ERROR, "Recovery required on readonly " "file system, but write access is " "unavailable.\n"); - else - mlog_errno(status); - goto read_super_error; + goto out_super; } ocfs2_set_ro_flag(osb, 1); @@ -1112,64 +1060,59 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) } if (!ocfs2_is_hard_readonly(osb)) { - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) ocfs2_set_ro_flag(osb, 0); } status = ocfs2_verify_heartbeat(osb); - if (status < 0) { - mlog_errno(status); - goto read_super_error; - } + if (status < 0) + goto out_super; osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, ocfs2_debugfs_root); - if (!osb->osb_debug_root) { - status = -EINVAL; - mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n"); - goto read_super_error; - } - osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR, - osb->osb_debug_root, - osb, - &ocfs2_osb_debug_fops); - if (!osb->osb_ctxt) { - status = -EINVAL; - mlog_errno(status); - goto read_super_error; - } + debugfs_create_file("fs_state", S_IFREG|S_IRUSR, osb->osb_debug_root, + osb, &ocfs2_osb_debug_fops); if (ocfs2_meta_ecc(osb)) { - status = ocfs2_blockcheck_stats_debugfs_install( - &osb->osb_ecc_stats, - osb->osb_debug_root); - if (status) { - mlog(ML_ERROR, - "Unable to create blockcheck statistics " - "files\n"); - goto read_super_error; - } + ocfs2_initialize_journal_triggers(sb, osb->s_journal_triggers); + ocfs2_blockcheck_stats_debugfs_install( &osb->osb_ecc_stats, + osb->osb_debug_root); } status = ocfs2_mount_volume(sb); if (status < 0) - goto read_super_error; + goto out_debugfs; if (osb->root_inode) inode = igrab(osb->root_inode); if (!inode) { status = -EIO; - mlog_errno(status); - goto read_super_error; + goto out_dismount; + } + + osb->osb_dev_kset = kset_create_and_add(sb->s_id, NULL, + &ocfs2_kset->kobj); + if (!osb->osb_dev_kset) { + status = -ENOMEM; + mlog(ML_ERROR, "Unable to create device kset %s.\n", sb->s_id); + goto out_dismount; + } + + /* Create filecheck sysfs related directories/files at + * /sys/fs/ocfs2/<devname>/filecheck */ + if (ocfs2_filecheck_create_sysfs(osb)) { + status = -ENOMEM; + mlog(ML_ERROR, "Unable to create filecheck sysfs directory at " + "/sys/fs/ocfs2/%s/filecheck.\n", sb->s_id); + goto out_dismount; } root = d_make_root(inode); if (!root) { status = -ENOMEM; - mlog_errno(status); - goto read_super_error; + goto out_dismount; } sb->s_root = root; @@ -1193,7 +1136,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) /* Now we can initialize quotas because we can afford to wait * for cluster locks recovery now. That also means that truncation * log recovery can happen but that waits for proper quota setup */ - if (!(sb->s_flags & MS_RDONLY)) { + if (!sb_rdonly(sb)) { status = ocfs2_enable_quotas(osb); if (status < 0) { /* We have to err-out specially here because @@ -1216,60 +1159,92 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) return status; -read_super_error: - brelse(bh); +out_dismount: + atomic_set(&osb->vol_state, VOLUME_DISABLED); + wake_up(&osb->osb_mount_event); + ocfs2_free_replay_slots(osb); + ocfs2_dismount_volume(sb, 1); + goto out; - if (osb) { - atomic_set(&osb->vol_state, VOLUME_DISABLED); - wake_up(&osb->osb_mount_event); - ocfs2_dismount_volume(sb, 1); - } +out_debugfs: + debugfs_remove_recursive(osb->osb_debug_root); +out_super: + ocfs2_release_system_inodes(osb); + kfree(osb->recovery_map); + ocfs2_delete_osb(osb); + kfree(osb); +out: + mlog_errno(status); - if (status) - mlog_errno(status); return status; } -static struct dentry *ocfs2_mount(struct file_system_type *fs_type, - int flags, - const char *dev_name, - void *data) +static int ocfs2_get_tree(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super); + return get_tree_bdev(fc, ocfs2_fill_super); } -static void ocfs2_kill_sb(struct super_block *sb) +static void ocfs2_free_fc(struct fs_context *fc) { - struct ocfs2_super *osb = OCFS2_SB(sb); + kfree(fc->fs_private); +} - /* Failed mount? */ - if (!osb || atomic_read(&osb->vol_state) == VOLUME_DISABLED) - goto out; +static const struct fs_context_operations ocfs2_context_ops = { + .parse_param = ocfs2_parse_param, + .get_tree = ocfs2_get_tree, + .reconfigure = ocfs2_reconfigure, + .free = ocfs2_free_fc, +}; - /* Prevent further queueing of inode drop events */ - spin_lock(&dentry_list_lock); - ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED); - spin_unlock(&dentry_list_lock); - /* Wait for work to finish and/or remove it */ - cancel_work_sync(&osb->dentry_lock_work); -out: - kill_block_super(sb); +static int ocfs2_init_fs_context(struct fs_context *fc) +{ + struct mount_options *mopt; + + mopt = kzalloc(sizeof(struct mount_options), GFP_KERNEL); + if (!mopt) + return -EINVAL; + + mopt->commit_interval = 0; + mopt->mount_opt = OCFS2_MOUNT_NOINTR; + mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; + mopt->slot = OCFS2_INVALID_SLOT; + mopt->localalloc_opt = -1; + mopt->cluster_stack[0] = '\0'; + mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL; + mopt->dir_resv_level = -1; + + fc->fs_private = mopt; + fc->ops = &ocfs2_context_ops; + + return 0; } static struct file_system_type ocfs2_fs_type = { .owner = THIS_MODULE, .name = "ocfs2", - .mount = ocfs2_mount, - .kill_sb = ocfs2_kill_sb, - + .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, - .next = NULL + .next = NULL, + .init_fs_context = ocfs2_init_fs_context, + .parameters = ocfs2_param_spec, }; MODULE_ALIAS_FS("ocfs2"); static int ocfs2_check_set_options(struct super_block *sb, struct mount_options *options) { + if (options->user_stack == 0) { + u32 tmp; + + /* Ensure only one heartbeat mode */ + tmp = options->mount_opt & (OCFS2_MOUNT_HB_LOCAL | + OCFS2_MOUNT_HB_GLOBAL | + OCFS2_MOUNT_HB_NONE); + if (hweight32(tmp) != 1) { + mlog(ML_ERROR, "Invalid heartbeat mount options\n"); + return 0; + } + } if (options->mount_opt & OCFS2_MOUNT_USRQUOTA && !OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { @@ -1301,233 +1276,142 @@ static int ocfs2_check_set_options(struct super_block *sb, return 1; } -static int ocfs2_parse_options(struct super_block *sb, - char *options, - struct mount_options *mopt, - int is_remount) +static int ocfs2_parse_param(struct fs_context *fc, struct fs_parameter *param) { - int status, user_stack = 0; - char *p; - u32 tmp; - - trace_ocfs2_parse_options(is_remount, options ? options : "(none)"); - - mopt->commit_interval = 0; - mopt->mount_opt = OCFS2_MOUNT_NOINTR; - mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; - mopt->slot = OCFS2_INVALID_SLOT; - mopt->localalloc_opt = -1; - mopt->cluster_stack[0] = '\0'; - mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL; - mopt->dir_resv_level = -1; - - if (!options) { - status = 1; - goto bail; - } - - while ((p = strsep(&options, ",")) != NULL) { - int token, option; - substring_t args[MAX_OPT_ARGS]; - - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_hb_local: - mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL; - break; - case Opt_hb_none: - mopt->mount_opt |= OCFS2_MOUNT_HB_NONE; - break; - case Opt_hb_global: - mopt->mount_opt |= OCFS2_MOUNT_HB_GLOBAL; - break; - case Opt_barrier: - if (match_int(&args[0], &option)) { - status = 0; - goto bail; - } - if (option) - mopt->mount_opt |= OCFS2_MOUNT_BARRIER; - else - mopt->mount_opt &= ~OCFS2_MOUNT_BARRIER; - break; - case Opt_intr: - mopt->mount_opt &= ~OCFS2_MOUNT_NOINTR; - break; - case Opt_nointr: + struct fs_parse_result result; + int opt; + struct mount_options *mopt = fc->fs_private; + bool is_remount = (fc->purpose & FS_CONTEXT_FOR_RECONFIGURE); + + trace_ocfs2_parse_options(is_remount, param->key); + + opt = fs_parse(fc, ocfs2_param_spec, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_heartbeat: + mopt->mount_opt |= result.uint_32; + break; + case Opt_barrier: + if (result.uint_32) + mopt->mount_opt |= OCFS2_MOUNT_BARRIER; + else + mopt->mount_opt &= ~OCFS2_MOUNT_BARRIER; + break; + case Opt_intr: + if (result.negated) mopt->mount_opt |= OCFS2_MOUNT_NOINTR; - break; - case Opt_err_panic: - mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; - break; - case Opt_err_ro: - mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; - break; - case Opt_data_ordered: - mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK; - break; - case Opt_data_writeback: - mopt->mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK; - break; - case Opt_user_xattr: - mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR; - break; - case Opt_nouser_xattr: + else + mopt->mount_opt &= ~OCFS2_MOUNT_NOINTR; + break; + case Opt_errors: + mopt->mount_opt &= ~(OCFS2_MOUNT_ERRORS_CONT | + OCFS2_MOUNT_ERRORS_ROFS | + OCFS2_MOUNT_ERRORS_PANIC); + mopt->mount_opt |= result.uint_32; + break; + case Opt_data: + mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK; + mopt->mount_opt |= result.uint_32; + break; + case Opt_user_xattr: + if (result.negated) mopt->mount_opt |= OCFS2_MOUNT_NOUSERXATTR; - break; - case Opt_atime_quantum: - if (match_int(&args[0], &option)) { - status = 0; - goto bail; - } - if (option >= 0) - mopt->atime_quantum = option; - break; - case Opt_slot: - option = 0; - if (match_int(&args[0], &option)) { - status = 0; - goto bail; - } - if (option) - mopt->slot = (s16)option; - break; - case Opt_commit: - option = 0; - if (match_int(&args[0], &option)) { - status = 0; - goto bail; - } - if (option < 0) - return 0; - if (option == 0) - option = JBD2_DEFAULT_MAX_COMMIT_AGE; - mopt->commit_interval = HZ * option; - break; - case Opt_localalloc: - option = 0; - if (match_int(&args[0], &option)) { - status = 0; - goto bail; - } - if (option >= 0) - mopt->localalloc_opt = option; - break; - case Opt_localflocks: - /* - * Changing this during remount could race - * flock() requests, or "unbalance" existing - * ones (e.g., a lock is taken in one mode but - * dropped in the other). If users care enough - * to flip locking modes during remount, we - * could add a "local" flag to individual - * flock structures for proper tracking of - * state. - */ - if (!is_remount) - mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS; - break; - case Opt_stack: - /* Check both that the option we were passed - * is of the right length and that it is a proper - * string of the right length. - */ - if (((args[0].to - args[0].from) != - OCFS2_STACK_LABEL_LEN) || - (strnlen(args[0].from, - OCFS2_STACK_LABEL_LEN) != - OCFS2_STACK_LABEL_LEN)) { - mlog(ML_ERROR, - "Invalid cluster_stack option\n"); - status = 0; - goto bail; - } - memcpy(mopt->cluster_stack, args[0].from, - OCFS2_STACK_LABEL_LEN); - mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; - /* - * Open code the memcmp here as we don't have - * an osb to pass to - * ocfs2_userspace_stack(). - */ - if (memcmp(mopt->cluster_stack, - OCFS2_CLASSIC_CLUSTER_STACK, - OCFS2_STACK_LABEL_LEN)) - user_stack = 1; - break; - case Opt_inode64: - mopt->mount_opt |= OCFS2_MOUNT_INODE64; - break; - case Opt_usrquota: - mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA; - break; - case Opt_grpquota: - mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; - break; - case Opt_coherency_buffered: - mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED; - break; - case Opt_coherency_full: - mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED; - break; - case Opt_acl: - mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; - mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL; - break; - case Opt_noacl: + else + mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR; + break; + case Opt_atime_quantum: + mopt->atime_quantum = result.uint_32; + break; + case Opt_slot: + if (result.uint_32) + mopt->slot = (u16)result.uint_32; + break; + case Opt_commit: + if (result.uint_32 == 0) + mopt->commit_interval = HZ * JBD2_DEFAULT_MAX_COMMIT_AGE; + else + mopt->commit_interval = HZ * result.uint_32; + break; + case Opt_localalloc: + if (result.int_32 >= 0) + mopt->localalloc_opt = result.int_32; + break; + case Opt_localflocks: + /* + * Changing this during remount could race flock() requests, or + * "unbalance" existing ones (e.g., a lock is taken in one mode + * but dropped in the other). If users care enough to flip + * locking modes during remount, we could add a "local" flag to + * individual flock structures for proper tracking of state. + */ + if (!is_remount) + mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS; + break; + case Opt_stack: + /* Check both that the option we were passed is of the right + * length and that it is a proper string of the right length. + */ + if (strlen(param->string) != OCFS2_STACK_LABEL_LEN) { + mlog(ML_ERROR, "Invalid cluster_stack option\n"); + return -EINVAL; + } + memcpy(mopt->cluster_stack, param->string, OCFS2_STACK_LABEL_LEN); + mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; + /* + * Open code the memcmp here as we don't have an osb to pass + * to ocfs2_userspace_stack(). + */ + if (memcmp(mopt->cluster_stack, + OCFS2_CLASSIC_CLUSTER_STACK, + OCFS2_STACK_LABEL_LEN)) + mopt->user_stack = 1; + break; + case Opt_inode64: + mopt->mount_opt |= OCFS2_MOUNT_INODE64; + break; + case Opt_usrquota: + mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA; + break; + case Opt_grpquota: + mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; + break; + case Opt_coherency: + mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED; + mopt->mount_opt |= result.uint_32; + break; + case Opt_acl: + if (result.negated) { mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; + } else { + mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; + mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL; + } + break; + case Opt_resv_level: + if (is_remount) break; - case Opt_resv_level: - if (is_remount) - break; - if (match_int(&args[0], &option)) { - status = 0; - goto bail; - } - if (option >= OCFS2_MIN_RESV_LEVEL && - option < OCFS2_MAX_RESV_LEVEL) - mopt->resv_level = option; - break; - case Opt_dir_resv_level: - if (is_remount) - break; - if (match_int(&args[0], &option)) { - status = 0; - goto bail; - } - if (option >= OCFS2_MIN_RESV_LEVEL && - option < OCFS2_MAX_RESV_LEVEL) - mopt->dir_resv_level = option; + if (result.uint_32 >= OCFS2_MIN_RESV_LEVEL && + result.uint_32 < OCFS2_MAX_RESV_LEVEL) + mopt->resv_level = result.uint_32; + break; + case Opt_dir_resv_level: + if (is_remount) break; - default: - mlog(ML_ERROR, - "Unrecognized mount option \"%s\" " - "or missing value\n", p); - status = 0; - goto bail; - } - } - - if (user_stack == 0) { - /* Ensure only one heartbeat mode */ - tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | - OCFS2_MOUNT_HB_GLOBAL | - OCFS2_MOUNT_HB_NONE); - if (hweight32(tmp) != 1) { - mlog(ML_ERROR, "Invalid heartbeat mount options\n"); - status = 0; - goto bail; - } + if (result.uint_32 >= OCFS2_MIN_RESV_LEVEL && + result.uint_32 < OCFS2_MAX_RESV_LEVEL) + mopt->dir_resv_level = result.uint_32; + break; + case Opt_journal_async_commit: + mopt->mount_opt |= OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT; + break; + default: + return -EINVAL; } - status = 1; - -bail: - return status; + return 0; } static int ocfs2_show_options(struct seq_file *s, struct dentry *root) @@ -1558,6 +1442,8 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) if (opts & OCFS2_MOUNT_ERRORS_PANIC) seq_printf(s, ",errors=panic"); + else if (opts & OCFS2_MOUNT_ERRORS_CONT) + seq_printf(s, ",errors=continue"); else seq_printf(s, ",errors=remount-ro"); @@ -1578,8 +1464,7 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",localflocks,"); if (osb->osb_cluster_stack[0]) - seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN, - osb->osb_cluster_stack); + seq_show_option(s, "cluster_stack", osb->osb_cluster_stack); if (opts & OCFS2_MOUNT_USRQUOTA) seq_printf(s, ",usrquota"); if (opts & OCFS2_MOUNT_GRPQUOTA) @@ -1609,19 +1494,15 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) if (osb->osb_dir_resv_level != osb->osb_resv_level) seq_printf(s, ",dir_resv_level=%d", osb->osb_resv_level); + if (opts & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT) + seq_printf(s, ",journal_async_commit"); + return 0; } -wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ]; - static int __init ocfs2_init(void) { - int status, i; - - ocfs2_print_version(); - - for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++) - init_waitqueue_head(&ocfs2__ioend_wq[i]); + int status; status = init_ocfs2_uptodate_cache(); if (status < 0) @@ -1631,32 +1512,18 @@ static int __init ocfs2_init(void) if (status < 0) goto out2; - ocfs2_wq = create_singlethread_workqueue("ocfs2_wq"); - if (!ocfs2_wq) { - status = -ENOMEM; - goto out3; - } - ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); - if (!ocfs2_debugfs_root) { - status = -EFAULT; - mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); - } ocfs2_set_locking_protocol(); - status = register_quota_format(&ocfs2_quota_format); - if (status < 0) - goto out4; + register_quota_format(&ocfs2_quota_format); + status = register_filesystem(&ocfs2_fs_type); if (!status) return 0; unregister_quota_format(&ocfs2_quota_format); -out4: - destroy_workqueue(ocfs2_wq); debugfs_remove(ocfs2_debugfs_root); -out3: ocfs2_free_mem_caches(); out2: exit_ocfs2_uptodate_cache(); @@ -1667,11 +1534,6 @@ out1: static void __exit ocfs2_exit(void) { - if (ocfs2_wq) { - flush_workqueue(ocfs2_wq); - destroy_workqueue(ocfs2_wq); - } - unregister_quota_format(&ocfs2_quota_format); debugfs_remove(ocfs2_debugfs_root); @@ -1744,8 +1606,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) ocfs2_inode_unlock(inode, 0); status = 0; bail: - if (inode) - iput(inode); + iput(inode); if (status) mlog_errno(status); @@ -1762,14 +1623,15 @@ static void ocfs2_inode_init_once(void *data) spin_lock_init(&oi->ip_lock); ocfs2_extent_map_init(&oi->vfs_inode); INIT_LIST_HEAD(&oi->ip_io_markers); + INIT_LIST_HEAD(&oi->ip_unwritten_list); oi->ip_dir_start_lookup = 0; - atomic_set(&oi->ip_unaligned_aio, 0); init_rwsem(&oi->ip_alloc_sem); init_rwsem(&oi->ip_xattr_sem); mutex_init(&oi->ip_io_mutex); oi->ip_blkno = 0ULL; oi->ip_clusters = 0; + oi->ip_next_orphan = NULL; ocfs2_resv_init_once(&oi->ip_la_data_resv); @@ -1789,27 +1651,23 @@ static int ocfs2_initialize_mem_caches(void) sizeof(struct ocfs2_inode_info), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_ACCOUNT), ocfs2_inode_init_once); ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache", sizeof(struct ocfs2_dquot), 0, - (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, NULL); ocfs2_qf_chunk_cachep = kmem_cache_create("ocfs2_qf_chunk_cache", sizeof(struct ocfs2_quota_chunk), 0, - (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), + SLAB_RECLAIM_ACCOUNT, NULL); if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep || !ocfs2_qf_chunk_cachep) { - if (ocfs2_inode_cachep) - kmem_cache_destroy(ocfs2_inode_cachep); - if (ocfs2_dquot_cachep) - kmem_cache_destroy(ocfs2_dquot_cachep); - if (ocfs2_qf_chunk_cachep) - kmem_cache_destroy(ocfs2_qf_chunk_cachep); + kmem_cache_destroy(ocfs2_inode_cachep); + kmem_cache_destroy(ocfs2_dquot_cachep); + kmem_cache_destroy(ocfs2_qf_chunk_cachep); return -ENOMEM; } @@ -1823,16 +1681,13 @@ static void ocfs2_free_mem_caches(void) * destroy cache. */ rcu_barrier(); - if (ocfs2_inode_cachep) - kmem_cache_destroy(ocfs2_inode_cachep); + kmem_cache_destroy(ocfs2_inode_cachep); ocfs2_inode_cachep = NULL; - if (ocfs2_dquot_cachep) - kmem_cache_destroy(ocfs2_dquot_cachep); + kmem_cache_destroy(ocfs2_dquot_cachep); ocfs2_dquot_cachep = NULL; - if (ocfs2_qf_chunk_cachep) - kmem_cache_destroy(ocfs2_qf_chunk_cachep); + kmem_cache_destroy(ocfs2_qf_chunk_cachep); ocfs2_qf_chunk_cachep = NULL; } @@ -1848,16 +1703,14 @@ static int ocfs2_get_sector(struct super_block *sb, *bh = sb_getblk(sb, block); if (!*bh) { - mlog_errno(-EIO); - return -EIO; + mlog_errno(-ENOMEM); + return -ENOMEM; } lock_buffer(*bh); if (!buffer_dirty(*bh)) clear_buffer_uptodate(*bh); unlock_buffer(*bh); - ll_rw_block(READ, 1, bh); - wait_on_buffer(*bh); - if (!buffer_uptodate(*bh)) { + if (bh_read(*bh, 0) < 0) { mlog_errno(-EIO); brelse(*bh); *bh = NULL; @@ -1870,53 +1723,71 @@ static int ocfs2_get_sector(struct super_block *sb, static int ocfs2_mount_volume(struct super_block *sb) { int status = 0; - int unlock_super = 0; struct ocfs2_super *osb = OCFS2_SB(sb); if (ocfs2_is_hard_readonly(osb)) - goto leave; + goto out; + + mutex_init(&osb->obs_trim_fs_mutex); status = ocfs2_dlm_init(osb); if (status < 0) { mlog_errno(status); - goto leave; + if (status == -EBADR && ocfs2_userspace_stack(osb)) + mlog(ML_ERROR, "couldn't mount because cluster name on" + " disk does not match the running cluster name.\n"); + goto out; } status = ocfs2_super_lock(osb, 1); if (status < 0) { mlog_errno(status); - goto leave; + goto out_dlm; } - unlock_super = 1; /* This will load up the node map and add ourselves to it. */ status = ocfs2_find_slot(osb); if (status < 0) { mlog_errno(status); - goto leave; + goto out_super_lock; } /* load all node-local system inodes */ status = ocfs2_init_local_system_inodes(osb); if (status < 0) { mlog_errno(status); - goto leave; + goto out_super_lock; } status = ocfs2_check_volume(osb); if (status < 0) { mlog_errno(status); - goto leave; + goto out_system_inodes; } status = ocfs2_truncate_log_init(osb); - if (status < 0) + if (status < 0) { mlog_errno(status); + goto out_check_volume; + } -leave: - if (unlock_super) - ocfs2_super_unlock(osb, 1); + ocfs2_super_unlock(osb, 1); + return 0; +out_check_volume: + ocfs2_free_replay_slots(osb); +out_system_inodes: + if (osb->local_alloc_state == OCFS2_LA_ENABLED) + ocfs2_shutdown_local_alloc(osb); + ocfs2_release_system_inodes(osb); + /* before journal shutdown, we should release slot_info */ + ocfs2_free_slot_info(osb); + ocfs2_journal_shutdown(osb); +out_super_lock: + ocfs2_super_unlock(osb, 1); +out_dlm: + ocfs2_dlm_shutdown(osb, 0); +out: return status; } @@ -1924,7 +1795,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) { int tmp, hangup_needed = 0; struct ocfs2_super *osb = NULL; - char nodestr[8]; + char nodestr[12]; trace_ocfs2_dismount_volume(sb); @@ -1932,19 +1803,25 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) osb = OCFS2_SB(sb); BUG_ON(!osb); - debugfs_remove(osb->osb_ctxt); + /* Remove file check sysfs related directories/files, + * and wait for the pending file check operations */ + ocfs2_filecheck_remove_sysfs(osb); - /* - * Flush inode dropping work queue so that deletes are - * performed while the filesystem is still working - */ - ocfs2_drop_all_dl_inodes(osb); + kset_unregister(osb->osb_dev_kset); /* Orphan scan should be stopped as early as possible */ ocfs2_orphan_scan_stop(osb); + /* Stop quota recovery so that we can disable quotas */ + ocfs2_recovery_disable_quota(osb); + ocfs2_disable_quotas(osb); + /* All dquots should be freed by now */ + WARN_ON(!llist_empty(&osb->dquot_drop_list)); + /* Wait for worker to be done with the work structure in osb */ + cancel_work_sync(&osb->dquot_drop_work); + ocfs2_shutdown_local_alloc(osb); ocfs2_truncate_log_shutdown(osb); @@ -1952,8 +1829,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) /* This will disable recovery and flush any recovery work. */ ocfs2_recovery_exit(osb); - ocfs2_journal_shutdown(osb); - ocfs2_sync_blockdev(sb); ocfs2_purge_refcount_trees(osb); @@ -1976,6 +1851,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) ocfs2_release_system_inodes(osb); + ocfs2_journal_shutdown(osb); + /* * If we're dismounting due to mount error, mount.ocfs2 will clean * up heartbeat. If we're a local mount, there is no heartbeat. @@ -1986,11 +1863,10 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) !ocfs2_is_hard_readonly(osb)) hangup_needed = 1; - if (osb->cconn) - ocfs2_dlm_shutdown(osb, hangup_needed); + ocfs2_dlm_shutdown(osb, hangup_needed); ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats); - debugfs_remove(osb->osb_debug_root); + debugfs_remove_recursive(osb->osb_debug_root); if (hangup_needed) ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str)); @@ -2074,8 +1950,6 @@ static int ocfs2_initialize_super(struct super_block *sb, int i, cbits, bbits; struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; struct inode *inode = NULL; - struct ocfs2_journal *journal; - __le32 uuid_net_key; struct ocfs2_super *osb; u64 total_blocks; @@ -2083,22 +1957,25 @@ static int ocfs2_initialize_super(struct super_block *sb, if (!osb) { status = -ENOMEM; mlog_errno(status); - goto bail; + goto out; } sb->s_fs_info = osb; sb->s_op = &ocfs2_sops; - sb->s_d_op = &ocfs2_dentry_ops; + set_default_d_op(sb, &ocfs2_dentry_ops); sb->s_export_op = &ocfs2_export_ops; - sb->s_qcop = &ocfs2_quotactl_ops; + sb->s_qcop = &dquot_quotactl_sysfile_ops; sb->dq_op = &ocfs2_quota_operations; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; sb->s_xattr = ocfs2_xattr_handlers; sb->s_time_gran = 1; - sb->s_flags |= MS_NOATIME; + sb->s_flags |= SB_NOATIME; /* this is needed to support O_LARGEFILE */ cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits); bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits); + super_set_uuid(sb, di->id2.i_super.s_uuid, + sizeof(di->id2.i_super.s_uuid)); osb->osb_dx_mask = (1 << (cbits - bbits)) - 1; @@ -2107,7 +1984,6 @@ static int ocfs2_initialize_super(struct super_block *sb, osb->osb_dx_seed[3] = le32_to_cpu(di->id2.i_super.s_uuid_hash); osb->sb = sb; - /* Save off for ocfs2_rw_direct */ osb->s_sectsize_bits = blksize_bits(sector_size); BUG_ON(!osb->s_sectsize_bits); @@ -2121,6 +1997,8 @@ static int ocfs2_initialize_super(struct super_block *sb, spin_lock_init(&osb->osb_xattr_lock); ocfs2_init_steal_slots(osb); + mutex_init(&osb->system_file_mutex); + atomic_set(&osb->alloc_stats.moves, 0); atomic_set(&osb->alloc_stats.local_data, 0); atomic_set(&osb->alloc_stats.bitmap_data, 0); @@ -2140,7 +2018,7 @@ static int ocfs2_initialize_super(struct super_block *sb, mlog(ML_ERROR, "Invalid number of node slots (%u)\n", osb->max_slots); status = -EINVAL; - goto bail; + goto out; } ocfs2_orphan_scan_init(osb); @@ -2149,7 +2027,7 @@ static int ocfs2_initialize_super(struct super_block *sb, if (status) { mlog(ML_ERROR, "Unable to initialize recovery state\n"); mlog_errno(status); - goto bail; + goto out; } init_waitqueue_head(&osb->checkpoint_event); @@ -2167,17 +2045,13 @@ static int ocfs2_initialize_super(struct super_block *sb, init_waitqueue_head(&osb->osb_mount_event); - status = ocfs2_resmap_init(osb, &osb->osb_la_resmap); - if (status) { - mlog_errno(status); - goto bail; - } + ocfs2_resmap_init(osb, &osb->osb_la_resmap); osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); if (!osb->vol_label) { mlog(ML_ERROR, "unable to alloc vol label\n"); status = -ENOMEM; - goto bail; + goto out_recovery_map; } osb->slot_recovery_generations = @@ -2186,7 +2060,7 @@ static int ocfs2_initialize_super(struct super_block *sb, if (!osb->slot_recovery_generations) { status = -ENOMEM; mlog_errno(status); - goto bail; + goto out_vol_label; } init_waitqueue_head(&osb->osb_wipe_event); @@ -2196,7 +2070,7 @@ static int ocfs2_initialize_super(struct super_block *sb, if (!osb->osb_orphan_wipes) { status = -ENOMEM; mlog_errno(status); - goto bail; + goto out_slot_recovery_gen; } osb->osb_rf_lock_tree = RB_ROOT; @@ -2212,31 +2086,38 @@ static int ocfs2_initialize_super(struct super_block *sb, mlog(ML_ERROR, "couldn't mount because of unsupported " "optional features (%x).\n", i); status = -EINVAL; - goto bail; + goto out_orphan_wipes; } - if (!(osb->sb->s_flags & MS_RDONLY) && - (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) { + if (!sb_rdonly(osb->sb) && (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) { mlog(ML_ERROR, "couldn't mount RDWR because of " "unsupported optional features (%x).\n", i); status = -EINVAL; - goto bail; + goto out_orphan_wipes; } if (ocfs2_clusterinfo_valid(osb)) { + /* + * ci_stack and ci_cluster in ocfs2_cluster_info may not be null + * terminated, so make sure no overflow happens here by using + * memcpy. Destination strings will always be null terminated + * because osb is allocated using kzalloc. + */ osb->osb_stackflags = OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags; memcpy(osb->osb_cluster_stack, OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, OCFS2_STACK_LABEL_LEN); - osb->osb_cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) { mlog(ML_ERROR, "couldn't mount because of an invalid " "cluster stack label (%s) \n", osb->osb_cluster_stack); status = -EINVAL; - goto bail; + goto out_orphan_wipes; } + memcpy(osb->osb_cluster_name, + OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster, + OCFS2_CLUSTER_NAME_LEN); } else { /* The empty string is identical with classic tools that * don't know about s_cluster_info. */ @@ -2245,35 +2126,17 @@ static int ocfs2_initialize_super(struct super_block *sb, get_random_bytes(&osb->s_next_generation, sizeof(u32)); - /* FIXME - * This should be done in ocfs2_journal_init(), but unknown - * ordering issues will cause the filesystem to crash. - * If anyone wants to figure out what part of the code - * refers to osb->journal before ocfs2_journal_init() is run, - * be my guest. + /* + * FIXME + * This should be done in ocfs2_journal_init(), but any inode + * writes back operation will cause the filesystem to crash. */ - /* initialize our journal structure */ - - journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL); - if (!journal) { - mlog(ML_ERROR, "unable to alloc journal\n"); - status = -ENOMEM; - goto bail; - } - osb->journal = journal; - journal->j_osb = osb; - - atomic_set(&journal->j_num_trans, 0); - init_rwsem(&journal->j_trans_barrier); - init_waitqueue_head(&journal->j_checkpointed); - spin_lock_init(&journal->j_lock); - journal->j_trans_id = (unsigned long) 1; - INIT_LIST_HEAD(&journal->j_la_cleanups); - INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); - journal->j_state = OCFS2_JOURNAL_FREE; + status = ocfs2_journal_alloc(osb); + if (status < 0) + goto out_orphan_wipes; - INIT_WORK(&osb->dentry_lock_work, ocfs2_drop_dl_inodes); - osb->dentry_lock_list = NULL; + INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs); + init_llist_head(&osb->dquot_drop_list); /* get some pseudo constants for clustersize bits */ osb->s_clustersize_bits = @@ -2285,7 +2148,7 @@ static int ocfs2_initialize_super(struct super_block *sb, mlog(ML_ERROR, "Volume has invalid cluster size (%d)\n", osb->s_clustersize); status = -EINVAL; - goto bail; + goto out_journal; } total_blocks = ocfs2_clusters_to_blocks(osb->sb, @@ -2297,20 +2160,18 @@ static int ocfs2_initialize_super(struct super_block *sb, mlog(ML_ERROR, "Volume too large " "to mount safely on this system"); status = -EFBIG; - goto bail; + goto out_journal; } if (ocfs2_setup_osb_uuid(osb, di->id2.i_super.s_uuid, sizeof(di->id2.i_super.s_uuid))) { mlog(ML_ERROR, "Out of memory trying to setup our uuid.\n"); status = -ENOMEM; - goto bail; + goto out_journal; } - memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key)); - - strncpy(osb->vol_label, di->id2.i_super.s_label, 63); - osb->vol_label[63] = '\0'; + strscpy(osb->vol_label, di->id2.i_super.s_label, + OCFS2_MAX_VOL_LABEL_LEN); osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno); osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno); osb->first_cluster_group_blkno = @@ -2326,7 +2187,7 @@ static int ocfs2_initialize_super(struct super_block *sb, if (!osb->osb_dlm_debug) { status = -ENOMEM; mlog_errno(status); - goto bail; + goto out_uuid_str; } atomic_set(&osb->vol_state, VOLUME_INIT); @@ -2335,7 +2196,7 @@ static int ocfs2_initialize_super(struct super_block *sb, status = ocfs2_init_global_system_inodes(osb); if (status < 0) { mlog_errno(status); - goto bail; + goto out_dlm_out; } /* @@ -2346,7 +2207,7 @@ static int ocfs2_initialize_super(struct super_block *sb, if (!inode) { status = -EINVAL; mlog_errno(status); - goto bail; + goto out_system_inodes; } osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; @@ -2359,11 +2220,39 @@ static int ocfs2_initialize_super(struct super_block *sb, status = ocfs2_init_slot_info(osb); if (status < 0) { mlog_errno(status); - goto bail; + goto out_system_inodes; } - cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb); -bail: + osb->ocfs2_wq = alloc_ordered_workqueue("ocfs2_wq", WQ_MEM_RECLAIM); + if (!osb->ocfs2_wq) { + status = -ENOMEM; + mlog_errno(status); + goto out_slot_info; + } + + return status; + +out_slot_info: + ocfs2_free_slot_info(osb); +out_system_inodes: + ocfs2_release_system_inodes(osb); +out_dlm_out: + ocfs2_put_dlm_debug(osb->osb_dlm_debug); +out_uuid_str: + kfree(osb->uuid_str); +out_journal: + kfree(osb->journal); +out_orphan_wipes: + kfree(osb->osb_orphan_wipes); +out_slot_recovery_gen: + kfree(osb->slot_recovery_generations); +out_vol_label: + kfree(osb->vol_label); +out_recovery_map: + kfree(osb->recovery_map); +out: + kfree(osb); + sb->s_fs_info = NULL; return status; } @@ -2378,6 +2267,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di, struct ocfs2_blockcheck_stats *stats) { int status = -EAGAIN; + u32 blksz_bits; if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE, strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) { @@ -2392,11 +2282,15 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di, goto out; } status = -EINVAL; - if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) { + /* Acceptable block sizes are 512 bytes, 1K, 2K and 4K. */ + blksz_bits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); + if (blksz_bits < 9 || blksz_bits > 12) { mlog(ML_ERROR, "found superblock with incorrect block " - "size: found %u, should be %u\n", - 1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits), - blksz); + "size bits: found %u, should be 9, 10, 11, or 12\n", + blksz_bits); + } else if ((1 << blksz_bits) != blksz) { + mlog(ML_ERROR, "found superblock with incorrect block " + "size: found %u, should be %u\n", 1 << blksz_bits, blksz); } else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) != OCFS2_MAJOR_REV_LEVEL || le16_to_cpu(di->id2.i_super.s_minor_rev_level) != @@ -2414,8 +2308,8 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di, (unsigned long long)bh->b_blocknr); } else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 || le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) { - mlog(ML_ERROR, "bad cluster size found: %u\n", - 1 << le32_to_cpu(di->id2.i_super.s_clustersize_bits)); + mlog(ML_ERROR, "bad cluster size bit found: %u\n", + le32_to_cpu(di->id2.i_super.s_clustersize_bits)); } else if (!le64_to_cpu(di->id2.i_super.s_root_blkno)) { mlog(ML_ERROR, "bad root_blkno: 0\n"); } else if (!le64_to_cpu(di->id2.i_super.s_system_dir_blkno)) { @@ -2448,7 +2342,7 @@ static int ocfs2_check_volume(struct ocfs2_super *osb) * ourselves. */ /* Init our journal object. */ - status = ocfs2_journal_init(osb->journal, &dirty); + status = ocfs2_journal_init(osb, &dirty); if (status < 0) { mlog(ML_ERROR, "Could not initialize journal!\n"); goto finally; @@ -2483,6 +2377,15 @@ static int ocfs2_check_volume(struct ocfs2_super *osb) goto finally; } + if (osb->s_mount_opt & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT) + jbd2_journal_set_features(osb->journal->j_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + else + jbd2_journal_clear_features(osb->journal->j_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + if (dirty) { /* recover my local alloc if we didn't unmount cleanly. */ status = ocfs2_begin_local_alloc_recovery(osb, @@ -2505,7 +2408,6 @@ static int ocfs2_check_volume(struct ocfs2_super *osb) if (dirty) { /* Recovery will be completed after we've mounted the * rest of the volume. */ - osb->dirty = 1; osb->local_alloc_copy = local_alloc; local_alloc = NULL; } @@ -2541,81 +2443,99 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) { /* This function assumes that the caller has the main osb resource */ + /* ocfs2_initializer_super have already created this workqueue */ + if (osb->ocfs2_wq) + destroy_workqueue(osb->ocfs2_wq); + ocfs2_free_slot_info(osb); kfree(osb->osb_orphan_wipes); kfree(osb->slot_recovery_generations); /* FIXME * This belongs in journal shutdown, but because we have to - * allocate osb->journal at the start of ocfs2_initialize_osb(), + * allocate osb->journal at the middle of ocfs2_initialize_super(), * we free it here. */ kfree(osb->journal); kfree(osb->local_alloc_copy); kfree(osb->uuid_str); + kfree(osb->vol_label); ocfs2_put_dlm_debug(osb->osb_dlm_debug); memset(osb, 0, sizeof(struct ocfs2_super)); } -/* Put OCFS2 into a readonly state, or (if the user specifies it), - * panic(). We do not support continue-on-error operation. */ -static void ocfs2_handle_error(struct super_block *sb) +/* Depending on the mount option passed, perform one of the following: + * Put OCFS2 into a readonly state (default) + * Return EIO so that only the process errs + * Fix the error as if fsck.ocfs2 -y + * panic + */ +static int ocfs2_handle_error(struct super_block *sb) { struct ocfs2_super *osb = OCFS2_SB(sb); + int rv = 0; + + ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS); + pr_crit("On-disk corruption discovered. " + "Please run fsck.ocfs2 once the filesystem is unmounted.\n"); - if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) + if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) { panic("OCFS2: (device %s): panic forced after error\n", sb->s_id); + } else if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_CONT) { + pr_crit("OCFS2: Returning error to the calling process.\n"); + rv = -EIO; + } else { /* default option */ + rv = -EROFS; + if (sb_rdonly(sb) && (ocfs2_is_soft_readonly(osb) || ocfs2_is_hard_readonly(osb))) + return rv; - ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS); - - if (sb->s_flags & MS_RDONLY && - (ocfs2_is_soft_readonly(osb) || - ocfs2_is_hard_readonly(osb))) - return; + pr_crit("OCFS2: File system is now read-only.\n"); + sb->s_flags |= SB_RDONLY; + ocfs2_set_ro_flag(osb, 0); + } - printk(KERN_CRIT "File system is now read-only due to the potential " - "of on-disk corruption. Please run fsck.ocfs2 once the file " - "system is unmounted.\n"); - sb->s_flags |= MS_RDONLY; - ocfs2_set_ro_flag(osb, 0); + return rv; } -static char error_buf[1024]; - -void __ocfs2_error(struct super_block *sb, - const char *function, - const char *fmt, ...) +int __ocfs2_error(struct super_block *sb, const char *function, + const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - vsnprintf(error_buf, sizeof(error_buf), fmt, args); - va_end(args); + vaf.fmt = fmt; + vaf.va = &args; /* Not using mlog here because we want to show the actual * function the error came from. */ - printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %s\n", - sb->s_id, function, error_buf); + printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV", + sb->s_id, function, &vaf); - ocfs2_handle_error(sb); + va_end(args); + + return ocfs2_handle_error(sb); } /* Handle critical errors. This is intentionally more drastic than * ocfs2_handle_error, so we only use for things like journal errors, * etc. */ -void __ocfs2_abort(struct super_block* sb, - const char *function, +void __ocfs2_abort(struct super_block *sb, const char *function, const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - vsnprintf(error_buf, sizeof(error_buf), fmt, args); - va_end(args); - printk(KERN_CRIT "OCFS2: abort (device %s): %s: %s\n", - sb->s_id, function, error_buf); + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV", + sb->s_id, function, &vaf); + + va_end(args); /* We don't have the cluster support yet to go straight to * hard readonly in here. Until then, we want to keep diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h index 74ff74cf78fe..8312651135b9 100644 --- a/fs/ocfs2/super.h +++ b/fs/ocfs2/super.h @@ -1,47 +1,28 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * super.h * * Function prototypes * * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #ifndef OCFS2_SUPER_H #define OCFS2_SUPER_H -extern struct workqueue_struct *ocfs2_wq; - -int ocfs2_publish_get_mount_state(struct ocfs2_super *osb, - int node_num); - __printf(3, 4) -void __ocfs2_error(struct super_block *sb, const char *function, +int __ocfs2_error(struct super_block *sb, const char *function, const char *fmt, ...); -#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args) +#define ocfs2_error(sb, fmt, ...) \ + __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##__VA_ARGS__) __printf(3, 4) void __ocfs2_abort(struct super_block *sb, const char *function, const char *fmt, ...); -#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) +#define ocfs2_abort(sb, fmt, ...) \ + __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##__VA_ARGS__) /* * Void signal blockers, because in-kernel sigprocmask() only fails diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index 66edce7ecfd7..ad8be3300b49 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -1,6 +1,4 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * linux/cluster/ssi/cfs/symlink.c * * This program is free software; you can redistribute it and/or @@ -54,47 +52,39 @@ #include "buffer_head_io.h" -static int ocfs2_fast_symlink_readpage(struct file *unused, struct page *page) +static int ocfs2_fast_symlink_read_folio(struct file *f, struct folio *folio) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct buffer_head *bh = NULL; int status = ocfs2_read_inode_block(inode, &bh); struct ocfs2_dinode *fe; const char *link; - void *kaddr; size_t len; if (status < 0) { mlog_errno(status); - return status; + goto out; } fe = (struct ocfs2_dinode *) bh->b_data; link = (char *) fe->id2.i_symlink; /* will be less than a page size */ len = strnlen(link, ocfs2_fast_symlink_chars(inode->i_sb)); - kaddr = kmap_atomic(page); - memcpy(kaddr, link, len + 1); - kunmap_atomic(kaddr); - SetPageUptodate(page); - unlock_page(page); + memcpy_to_folio(folio, 0, link, len + 1); +out: + folio_end_read(folio, status == 0); brelse(bh); - return 0; + return status; } const struct address_space_operations ocfs2_fast_symlink_aops = { - .readpage = ocfs2_fast_symlink_readpage, + .read_folio = ocfs2_fast_symlink_read_folio, }; const struct inode_operations ocfs2_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, + .get_link = page_get_link, .getattr = ocfs2_getattr, .setattr = ocfs2_setattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = ocfs2_listxattr, - .removexattr = generic_removexattr, .fiemap = ocfs2_fiemap, }; diff --git a/fs/ocfs2/symlink.h b/fs/ocfs2/symlink.h index 71ee4245e919..ffcf0210545c 100644 --- a/fs/ocfs2/symlink.h +++ b/fs/ocfs2/symlink.h @@ -1,26 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * symlink.h * * Function prototypes * * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #ifndef OCFS2_SYMLINK_H diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c index f053688d22a3..d53a6cc866be 100644 --- a/fs/ocfs2/sysfile.c +++ b/fs/ocfs2/sysfile.c @@ -1,26 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * sysfile.c * * Initialize, read, write, etc. system files. * * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/fs.h> @@ -69,10 +53,11 @@ static struct inode **get_local_system_inode(struct ocfs2_super *osb, spin_unlock(&osb->osb_lock); if (unlikely(!local_system_inodes)) { - local_system_inodes = kzalloc(sizeof(struct inode *) * - NUM_LOCAL_SYSTEM_INODES * - osb->max_slots, - GFP_NOFS); + local_system_inodes = + kzalloc(array3_size(sizeof(struct inode *), + NUM_LOCAL_SYSTEM_INODES, + osb->max_slots), + GFP_NOFS); if (!local_system_inodes) { mlog_errno(-ENOMEM); /* @@ -113,9 +98,11 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, } else arr = get_local_system_inode(osb, type, slot); + mutex_lock(&osb->system_file_mutex); if (arr && ((inode = *arr) != NULL)) { /* get a ref in addition to the array ref */ inode = igrab(inode); + mutex_unlock(&osb->system_file_mutex); BUG_ON(!inode); return inode; @@ -129,6 +116,7 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, *arr = igrab(inode); BUG_ON(!*arr); } + mutex_unlock(&osb->system_file_mutex); return inode; } @@ -139,14 +127,14 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb, char namebuf[40]; struct inode *inode = NULL; u64 blkno; - int status = 0; + int len, status = 0; - ocfs2_sprintf_system_inode_name(namebuf, - sizeof(namebuf), - type, slot); + len = ocfs2_sprintf_system_inode_name(namebuf, + sizeof(namebuf), + type, slot); - status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, - strlen(namebuf), &blkno); + status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, + namebuf, len, &blkno); if (status < 0) { goto bail; } diff --git a/fs/ocfs2/sysfile.h b/fs/ocfs2/sysfile.h index cc9ea661ffc1..2b38c75990fd 100644 --- a/fs/ocfs2/sysfile.h +++ b/fs/ocfs2/sysfile.h @@ -1,26 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * sysfile.h * * Function prototypes * * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #ifndef OCFS2_SYSFILE_H diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c index 52eaf33d346f..09854925fa5c 100644 --- a/fs/ocfs2/uptodate.c +++ b/fs/ocfs2/uptodate.c @@ -1,6 +1,5 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * uptodate.c * * Tracking the up-to-date-ness of a local buffer_head with respect to @@ -8,21 +7,6 @@ * * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * * Standard buffer head caching flags (uptodate, etc) are insufficient * in a clustered environment - a buffer may be marked up to date on * our local node but could have been modified by another cluster @@ -67,7 +51,7 @@ struct ocfs2_meta_cache_item { sector_t c_block; }; -static struct kmem_cache *ocfs2_uptodate_cachep = NULL; +static struct kmem_cache *ocfs2_uptodate_cachep; u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci) { @@ -633,6 +617,5 @@ int __init init_ocfs2_uptodate_cache(void) void exit_ocfs2_uptodate_cache(void) { - if (ocfs2_uptodate_cachep) - kmem_cache_destroy(ocfs2_uptodate_cachep); + kmem_cache_destroy(ocfs2_uptodate_cachep); } diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h index 0d826fe2da0d..85d94134001b 100644 --- a/fs/ocfs2/uptodate.h +++ b/fs/ocfs2/uptodate.h @@ -1,26 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * uptodate.h * * Cluster uptodate tracking * * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #ifndef OCFS2_UPTODATE_H diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c deleted file mode 100644 index e2488f4128a2..000000000000 --- a/fs/ocfs2/ver.c +++ /dev/null @@ -1,43 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * ver.c - * - * version string - * - * Copyright (C) 2002, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/module.h> -#include <linux/string.h> -#include <linux/kernel.h> - -#include "ver.h" - -#define OCFS2_BUILD_VERSION "1.5.0" - -#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION - -void ocfs2_print_version(void) -{ - printk(KERN_INFO "%s\n", VERSION_STR); -} - -MODULE_DESCRIPTION(VERSION_STR); - -MODULE_VERSION(OCFS2_BUILD_VERSION); diff --git a/fs/ocfs2/ver.h b/fs/ocfs2/ver.h deleted file mode 100644 index d7395cb91d2f..000000000000 --- a/fs/ocfs2/ver.h +++ /dev/null @@ -1,31 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * ver.h - * - * Function prototypes - * - * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef OCFS2_VER_H -#define OCFS2_VER_H - -void ocfs2_print_version(void); - -#endif /* OCFS2_VER_H */ diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 317ef0abccbb..dc1761e84814 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -1,6 +1,5 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-only +/* * xattr.c * * Copyright (C) 2004, 2008 Oracle. All rights reserved. @@ -8,15 +7,6 @@ * CREDITS: * Lots of code in this file is copy from linux/fs/ext3/xattr.c. * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include <linux/capability.h> @@ -97,23 +87,19 @@ static struct ocfs2_xattr_def_value_root def_xv = { .xv.xr_list.l_count = cpu_to_le16(1), }; -const struct xattr_handler *ocfs2_xattr_handlers[] = { +const struct xattr_handler * const ocfs2_xattr_handlers[] = { &ocfs2_xattr_user_handler, - &ocfs2_xattr_acl_access_handler, - &ocfs2_xattr_acl_default_handler, &ocfs2_xattr_trusted_handler, &ocfs2_xattr_security_handler, NULL }; -static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { - [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, - [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] - = &ocfs2_xattr_acl_access_handler, - [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT] - = &ocfs2_xattr_acl_default_handler, - [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler, - [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler, +static const struct xattr_handler * const ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { + [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, + [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access, + [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default, + [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler, + [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler, }; struct ocfs2_xattr_info { @@ -369,7 +355,7 @@ static void ocfs2_xattr_bucket_free(struct ocfs2_xattr_bucket *bucket) * them fully. */ static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket, - u64 xb_blkno) + u64 xb_blkno, int new) { int i, rc = 0; @@ -377,15 +363,22 @@ static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket, bucket->bu_bhs[i] = sb_getblk(bucket->bu_inode->i_sb, xb_blkno + i); if (!bucket->bu_bhs[i]) { - rc = -EIO; + rc = -ENOMEM; mlog_errno(rc); break; } if (!ocfs2_buffer_uptodate(INODE_CACHE(bucket->bu_inode), - bucket->bu_bhs[i])) - ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode), - bucket->bu_bhs[i]); + bucket->bu_bhs[i])) { + if (new) + ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode), + bucket->bu_bhs[i]); + else { + set_buffer_uptodate(bucket->bu_bhs[i]); + ocfs2_set_buffer_uptodate(INODE_CACHE(bucket->bu_inode), + bucket->bu_bhs[i]); + } + } } if (rc) @@ -492,30 +485,24 @@ static int ocfs2_validate_xattr_block(struct super_block *sb, */ if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) { - ocfs2_error(sb, - "Extended attribute block #%llu has bad " - "signature %.*s", - (unsigned long long)bh->b_blocknr, 7, - xb->xb_signature); - return -EINVAL; + return ocfs2_error(sb, + "Extended attribute block #%llu has bad signature %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + xb->xb_signature); } if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) { - ocfs2_error(sb, - "Extended attribute block #%llu has an " - "invalid xb_blkno of %llu", - (unsigned long long)bh->b_blocknr, - (unsigned long long)le64_to_cpu(xb->xb_blkno)); - return -EINVAL; + return ocfs2_error(sb, + "Extended attribute block #%llu has an invalid xb_blkno of %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(xb->xb_blkno)); } if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) { - ocfs2_error(sb, - "Extended attribute block #%llu has an invalid " - "xb_fs_generation of #%u", - (unsigned long long)bh->b_blocknr, - le32_to_cpu(xb->xb_fs_generation)); - return -EINVAL; + return ocfs2_error(sb, + "Extended attribute block #%llu has an invalid xb_fs_generation of #%u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(xb->xb_fs_generation)); } return 0; @@ -543,8 +530,7 @@ static inline const char *ocfs2_xattr_prefix(int name_index) if (name_index > 0 && name_index < OCFS2_XATTR_MAX) handler = ocfs2_xattr_handler_map[name_index]; - - return handler ? handler->prefix : NULL; + return handler ? xattr_prefix(handler) : NULL; } static u32 ocfs2_xattr_name_hash(struct inode *inode, @@ -638,14 +624,17 @@ int ocfs2_calc_xattr_init(struct inode *dir, si->value_len); if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { + down_read(&OCFS2_I(dir)->ip_xattr_sem); acl_len = ocfs2_xattr_get_nolock(dir, dir_bh, OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT, "", NULL, 0); + up_read(&OCFS2_I(dir)->ip_xattr_sem); if (acl_len > 0) { a_size = ocfs2_xattr_entry_real_size(0, acl_len); if (S_ISDIR(mode)) a_size <<= 1; } else if (acl_len != 0 && acl_len != -ENODATA) { + ret = acl_len; mlog_errno(ret); return ret; } @@ -659,7 +648,7 @@ int ocfs2_calc_xattr_init(struct inode *dir, * 256(name) + 80(value) + 16(entry) = 352 bytes, * The max space of acl xattr taken inline is * 80(value) + 16(entry) * 2(if directory) = 192 bytes, - * when blocksize = 512, may reserve one more cluser for + * when blocksize = 512, may reserve one more cluster for * xattr bucket, otherwise reserve one metadata block * for them is ok. * If this is a new directory with inline data, @@ -754,8 +743,7 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode, BUG_ON(why == RESTART_META); credits = ocfs2_calc_extend_credits(inode->i_sb, - &vb->vb_xv->xr_list, - clusters_to_add); + &vb->vb_xv->xr_list); status = ocfs2_extend_trans(handle, credits); if (status < 0) { status = -ENOMEM; @@ -884,14 +872,39 @@ static int ocfs2_xattr_value_truncate(struct inode *inode, return ret; } -static int ocfs2_xattr_list_entry(char *buffer, size_t size, - size_t *result, const char *prefix, +static int ocfs2_xattr_list_entry(struct super_block *sb, + char *buffer, size_t size, + size_t *result, int type, const char *name, int name_len) { char *p = buffer + *result; - int prefix_len = strlen(prefix); - int total_len = prefix_len + name_len + 1; + const char *prefix; + int prefix_len; + int total_len; + + switch(type) { + case OCFS2_XATTR_INDEX_USER: + if (OCFS2_SB(sb)->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) + return 0; + break; + case OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS: + case OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT: + if (!(sb->s_flags & SB_POSIXACL)) + return 0; + break; + + case OCFS2_XATTR_INDEX_TRUSTED: + if (!capable(CAP_SYS_ADMIN)) + return 0; + break; + } + + prefix = ocfs2_xattr_prefix(type); + if (!prefix) + return 0; + prefix_len = strlen(prefix); + total_len = prefix_len + name_len + 1; *result += total_len; /* we are just looking for how big our buffer needs to be */ @@ -914,23 +927,20 @@ static int ocfs2_xattr_list_entries(struct inode *inode, { size_t result = 0; int i, type, ret; - const char *prefix, *name; + const char *name; for (i = 0 ; i < le16_to_cpu(header->xh_count); i++) { struct ocfs2_xattr_entry *entry = &header->xh_entries[i]; type = ocfs2_xattr_get_type(entry); - prefix = ocfs2_xattr_prefix(type); - - if (prefix) { - name = (const char *)header + - le16_to_cpu(entry->xe_name_offset); + name = (const char *)header + + le16_to_cpu(entry->xe_name_offset); - ret = ocfs2_xattr_list_entry(buffer, buffer_size, - &result, prefix, name, - entry->xe_name_len); - if (ret) - return ret; - } + ret = ocfs2_xattr_list_entry(inode->i_sb, + buffer, buffer_size, + &result, type, name, + entry->xe_name_len); + if (ret) + return ret; } return result; @@ -1014,7 +1024,7 @@ ssize_t ocfs2_listxattr(struct dentry *dentry, int ret = 0, i_ret = 0, b_ret = 0; struct buffer_head *di_bh = NULL; struct ocfs2_dinode *di = NULL; - struct ocfs2_inode_info *oi = OCFS2_I(dentry->d_inode); + struct ocfs2_inode_info *oi = OCFS2_I(d_inode(dentry)); if (!ocfs2_supports_xattr(OCFS2_SB(dentry->d_sb))) return -EOPNOTSUPP; @@ -1022,7 +1032,7 @@ ssize_t ocfs2_listxattr(struct dentry *dentry, if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) return ret; - ret = ocfs2_inode_lock(dentry->d_inode, &di_bh, 0); + ret = ocfs2_inode_lock(d_inode(dentry), &di_bh, 0); if (ret < 0) { mlog_errno(ret); return ret; @@ -1031,7 +1041,7 @@ ssize_t ocfs2_listxattr(struct dentry *dentry, di = (struct ocfs2_dinode *)di_bh->b_data; down_read(&oi->ip_xattr_sem); - i_ret = ocfs2_xattr_ibody_list(dentry->d_inode, di, buffer, size); + i_ret = ocfs2_xattr_ibody_list(d_inode(dentry), di, buffer, size); if (i_ret < 0) b_ret = 0; else { @@ -1039,26 +1049,26 @@ ssize_t ocfs2_listxattr(struct dentry *dentry, buffer += i_ret; size -= i_ret; } - b_ret = ocfs2_xattr_block_list(dentry->d_inode, di, + b_ret = ocfs2_xattr_block_list(d_inode(dentry), di, buffer, size); if (b_ret < 0) i_ret = 0; } up_read(&oi->ip_xattr_sem); - ocfs2_inode_unlock(dentry->d_inode, 0); + ocfs2_inode_unlock(d_inode(dentry), 0); brelse(di_bh); return i_ret + b_ret; } -static int ocfs2_xattr_find_entry(int name_index, +static int ocfs2_xattr_find_entry(struct inode *inode, int name_index, const char *name, struct ocfs2_xattr_search *xs) { struct ocfs2_xattr_entry *entry; size_t name_len; - int i, cmp = 1; + int i, name_offset, cmp = 1; if (name == NULL) return -EINVAL; @@ -1066,13 +1076,22 @@ static int ocfs2_xattr_find_entry(int name_index, name_len = strlen(name); entry = xs->here; for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) { + if ((void *)entry >= xs->end) { + ocfs2_error(inode->i_sb, "corrupted xattr entries"); + return -EFSCORRUPTED; + } cmp = name_index - ocfs2_xattr_get_type(entry); if (!cmp) cmp = name_len - entry->xe_name_len; - if (!cmp) - cmp = memcmp(name, (xs->base + - le16_to_cpu(entry->xe_name_offset)), - name_len); + if (!cmp) { + name_offset = le16_to_cpu(entry->xe_name_offset); + if ((xs->base + name_offset + name_len) > xs->end) { + ocfs2_error(inode->i_sb, + "corrupted xattr entries"); + return -EFSCORRUPTED; + } + cmp = memcmp(name, (xs->base + name_offset), name_len); + } if (cmp == 0) break; entry += 1; @@ -1156,7 +1175,7 @@ static int ocfs2_xattr_ibody_get(struct inode *inode, xs->base = (void *)xs->header; xs->here = xs->header->xh_entries; - ret = ocfs2_xattr_find_entry(name_index, name, xs); + ret = ocfs2_xattr_find_entry(inode, name_index, name, xs); if (ret) return ret; size = le64_to_cpu(xs->here->xe_value_size); @@ -1195,7 +1214,7 @@ static int ocfs2_xattr_block_get(struct inode *inode, struct ocfs2_xattr_value_root *xv; size_t size; int ret = -ENODATA, name_offset, name_len, i; - int uninitialized_var(block_off); + int block_off; xs->bucket = ocfs2_xattr_bucket_new(inode); if (!xs->bucket) { @@ -1232,6 +1251,10 @@ static int ocfs2_xattr_block_get(struct inode *inode, i, &block_off, &name_offset); + if (ret) { + mlog_errno(ret); + goto cleanup; + } xs->base = bucket_block(xs->bucket, block_off); } if (ocfs2_xattr_is_local(xs->here)) { @@ -1278,7 +1301,7 @@ int ocfs2_xattr_get_nolock(struct inode *inode, return -EOPNOTSUPP; if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) - ret = -ENODATA; + return -ENODATA; xis.inode_bh = xbs.inode_bh = di_bh; di = (struct ocfs2_dinode *)di_bh->b_data; @@ -1303,20 +1326,21 @@ static int ocfs2_xattr_get(struct inode *inode, void *buffer, size_t buffer_size) { - int ret; + int ret, had_lock; struct buffer_head *di_bh = NULL; + struct ocfs2_lock_holder oh; - ret = ocfs2_inode_lock(inode, &di_bh, 0); - if (ret < 0) { - mlog_errno(ret); - return ret; + had_lock = ocfs2_inode_lock_tracker(inode, &di_bh, 0, &oh); + if (had_lock < 0) { + mlog_errno(had_lock); + return had_lock; } down_read(&OCFS2_I(inode)->ip_xattr_sem); ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index, name, buffer, buffer_size); up_read(&OCFS2_I(inode)->ip_xattr_sem); - ocfs2_inode_unlock(inode, 0); + ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock); brelse(di_bh); @@ -2012,8 +2036,7 @@ static int ocfs2_xa_remove(struct ocfs2_xa_loc *loc, rc = 0; ocfs2_xa_cleanup_value_truncate(loc, "removing", orig_clusters); - if (rc) - goto out; + goto out; } } @@ -2499,7 +2522,7 @@ static int ocfs2_xattr_free_block(struct inode *inode, mlog_errno(ret); goto out; } - mutex_lock(&xb_alloc_inode->i_mutex); + inode_lock(xb_alloc_inode); ret = ocfs2_inode_lock(xb_alloc_inode, &xb_alloc_bh, 1); if (ret < 0) { @@ -2524,7 +2547,7 @@ out_unlock: ocfs2_inode_unlock(xb_alloc_inode, 1); brelse(xb_alloc_bh); out_mutex: - mutex_unlock(&xb_alloc_inode->i_mutex); + inode_unlock(xb_alloc_inode); iput(xb_alloc_inode); out: brelse(blk_bh); @@ -2552,7 +2575,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh) if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) return 0; - if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) { + if (ocfs2_is_refcount_inode(inode)) { ret = ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb), le64_to_cpu(di->i_refcount_loc), 1, &ref_tree, &ref_root_bh); @@ -2603,6 +2626,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh) oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL); di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); spin_unlock(&oi->ip_lock); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, di_bh); out_commit: @@ -2682,7 +2706,7 @@ static int ocfs2_xattr_ibody_find(struct inode *inode, /* Find the named attribute. */ if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) { - ret = ocfs2_xattr_find_entry(name_index, name, xs); + ret = ocfs2_xattr_find_entry(inode, name_index, name, xs); if (ret && ret != -ENODATA) return ret; xs->not_found = ret; @@ -2817,7 +2841,7 @@ static int ocfs2_xattr_block_find(struct inode *inode, xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size; xs->here = xs->header->xh_entries; - ret = ocfs2_xattr_find_entry(name_index, name, xs); + ret = ocfs2_xattr_find_entry(inode, name_index, name, xs); } else ret = ocfs2_xattr_index_block_find(inode, blk_bh, name_index, @@ -2865,6 +2889,12 @@ static int ocfs2_create_xattr_block(struct inode *inode, } new_bh = sb_getblk(inode->i_sb, first_blkno); + if (!new_bh) { + ret = -ENOMEM; + mlog_errno(ret); + goto end; + } + ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh); ret = ocfs2_journal_access_xb(ctxt->handle, INODE_CACHE(inode), @@ -2878,7 +2908,7 @@ static int ocfs2_create_xattr_block(struct inode *inode, /* Initialize ocfs2_xattr_block */ xblk = (struct ocfs2_xattr_block *)new_bh->b_data; memset(xblk, 0, inode->i_sb->s_blocksize); - strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE); + strscpy(xblk->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE); xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot); xblk->xb_suballoc_loc = cpu_to_le64(suballoc_loc); xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start); @@ -3040,8 +3070,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode, if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { clusters_add += new_clusters; credits += ocfs2_calc_extend_credits(inode->i_sb, - &def_xv.xv.xr_list, - new_clusters); + &def_xv.xv.xr_list); } goto meta_guess; @@ -3106,8 +3135,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode, if (!ocfs2_xattr_is_local(xe)) credits += ocfs2_calc_extend_credits( inode->i_sb, - &def_xv.xv.xr_list, - new_clusters); + &def_xv.xv.xr_list); goto out; } } @@ -3132,9 +3160,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode, meta_add += ocfs2_extend_meta_needed(&xv->xr_list); clusters_add += new_clusters - old_clusters; credits += ocfs2_calc_extend_credits(inode->i_sb, - &xv->xr_list, - new_clusters - - old_clusters); + &xv->xr_list); if (value_size >= OCFS2_XATTR_ROOT_SIZE) goto out; } @@ -3180,7 +3206,7 @@ meta_guess: &xb->xb_attrs.xb_root.xt_list; meta_add += ocfs2_extend_meta_needed(el); credits += ocfs2_calc_extend_credits(inode->i_sb, - el, 1); + el); } else credits += OCFS2_SUBALLOC_ALLOC + 1; @@ -3199,8 +3225,15 @@ meta_guess: clusters_add += 1; } } else { - meta_add += 1; credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; + if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { + struct ocfs2_extent_list *el = &def_xv.xv.xr_list; + meta_add += ocfs2_extend_meta_needed(el); + credits += ocfs2_calc_extend_credits(inode->i_sb, + el); + } else { + meta_add += 1; + } } out: if (clusters_need) @@ -3396,9 +3429,9 @@ static int __ocfs2_xattr_set_handle(struct inode *inode, goto out; } - inode->i_ctime = CURRENT_TIME; - di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + inode_set_ctime_current(inode); + di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); + di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); ocfs2_journal_dirty(ctxt->handle, xis->inode_bh); } out: @@ -3502,11 +3535,12 @@ int ocfs2_xattr_set(struct inode *inode, { struct buffer_head *di_bh = NULL; struct ocfs2_dinode *di; - int ret, credits, ref_meta = 0, ref_credits = 0; + int ret, credits, had_lock, ref_meta = 0, ref_credits = 0; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct inode *tl_inode = osb->osb_tl_inode; - struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, }; + struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, NULL, }; struct ocfs2_refcount_tree *ref_tree = NULL; + struct ocfs2_lock_holder oh; struct ocfs2_xattr_info xi = { .xi_name_index = name_index, @@ -3524,7 +3558,7 @@ int ocfs2_xattr_set(struct inode *inode, .not_found = -ENODATA, }; - if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) + if (!ocfs2_supports_xattr(osb)) return -EOPNOTSUPP; /* @@ -3537,8 +3571,9 @@ int ocfs2_xattr_set(struct inode *inode, return -ENOMEM; } - ret = ocfs2_inode_lock(inode, &di_bh, 1); - if (ret < 0) { + had_lock = ocfs2_inode_lock_tracker(inode, &di_bh, 1, &oh); + if (had_lock < 0) { + ret = had_lock; mlog_errno(ret); goto cleanup_nolock; } @@ -3573,7 +3608,7 @@ int ocfs2_xattr_set(struct inode *inode, } /* Check whether the value is refcounted and do some preparation. */ - if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL && + if (ocfs2_is_refcount_inode(inode) && (!xis.not_found || !xbs.not_found)) { ret = ocfs2_prepare_refcount_xattr(inode, di, &xi, &xis, &xbs, &ref_tree, @@ -3584,17 +3619,17 @@ int ocfs2_xattr_set(struct inode *inode, } } - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); if (ocfs2_truncate_log_needs_flush(osb)) { ret = __ocfs2_flush_truncate_log(osb); if (ret < 0) { - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); mlog_errno(ret); goto cleanup; } } - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis, &xbs, &ctxt, ref_meta, &credits); @@ -3609,13 +3644,15 @@ int ocfs2_xattr_set(struct inode *inode, if (IS_ERR(ctxt.handle)) { ret = PTR_ERR(ctxt.handle); mlog_errno(ret); - goto cleanup; + goto out_free_ac; } ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt); + ocfs2_update_inode_fsync_trans(ctxt.handle, inode, 0); ocfs2_commit_trans(osb, ctxt.handle); +out_free_ac: if (ctxt.data_ac) ocfs2_free_alloc_context(ctxt.data_ac); if (ctxt.meta_ac) @@ -3633,7 +3670,7 @@ cleanup: if (ret) mlog_errno(ret); } - ocfs2_inode_unlock(inode, 1); + ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock); cleanup_nolock: brelse(di_bh); brelse(xbs.xattr_bh); @@ -3672,11 +3709,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode, el = &eb->h_list; if (el->l_tree_depth) { - ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "xattr tree block %llu\n", inode->i_ino, - (unsigned long long)eb_bh->b_blocknr); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in xattr tree block %llu\n", + inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); goto out; } } @@ -3691,11 +3727,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode, } if (!e_blkno) { - ocfs2_error(inode->i_sb, "Inode %lu has bad extent " - "record (%u, %u, 0) in xattr", inode->i_ino, - le32_to_cpu(rec->e_cpos), - ocfs2_rec_clusters(el, rec)); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, "Inode %lu has bad extent record (%u, %u, 0) in xattr\n", + inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); goto out; } @@ -3792,7 +3827,6 @@ static int ocfs2_xattr_bucket_find(struct inode *inode, u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); int low_bucket = 0, bucket, high_bucket; struct ocfs2_xattr_bucket *search; - u32 last_hash; u64 blkno, lower_blkno = 0; search = ocfs2_xattr_bucket_new(inode); @@ -3836,8 +3870,6 @@ static int ocfs2_xattr_bucket_find(struct inode *inode, if (xh->xh_count) xe = &xh->xh_entries[le16_to_cpu(xh->xh_count) - 1]; - last_hash = le32_to_cpu(xe->xe_name_hash); - /* record lower_blkno which may be the insert place. */ lower_blkno = blkno; @@ -4019,32 +4051,30 @@ static int ocfs2_list_xattr_bucket(struct inode *inode, int ret = 0, type; struct ocfs2_xattr_tree_list *xl = (struct ocfs2_xattr_tree_list *)para; int i, block_off, new_offset; - const char *prefix, *name; + const char *name; for (i = 0 ; i < le16_to_cpu(bucket_xh(bucket)->xh_count); i++) { struct ocfs2_xattr_entry *entry = &bucket_xh(bucket)->xh_entries[i]; type = ocfs2_xattr_get_type(entry); - prefix = ocfs2_xattr_prefix(type); - if (prefix) { - ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, - bucket_xh(bucket), - i, - &block_off, - &new_offset); - if (ret) - break; + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, + bucket_xh(bucket), + i, + &block_off, + &new_offset); + if (ret) + break; - name = (const char *)bucket_block(bucket, block_off) + - new_offset; - ret = ocfs2_xattr_list_entry(xl->buffer, - xl->buffer_size, - &xl->result, - prefix, name, - entry->xe_name_len); - if (ret) - break; - } + name = (const char *)bucket_block(bucket, block_off) + + new_offset; + ret = ocfs2_xattr_list_entry(inode->i_sb, + xl->buffer, + xl->buffer_size, + &xl->result, + type, name, + entry->xe_name_len); + if (ret) + break; } return ret; @@ -4136,15 +4166,6 @@ static int cmp_xe(const void *a, const void *b) return 0; } -static void swap_xe(void *a, void *b, int size) -{ - struct ocfs2_xattr_entry *l = a, *r = b, tmp; - - tmp = *l; - memcpy(l, r, sizeof(struct ocfs2_xattr_entry)); - memcpy(r, &tmp, sizeof(struct ocfs2_xattr_entry)); -} - /* * When the ocfs2_xattr_block is filled up, new bucket will be created * and all the xattr entries will be moved to the new bucket. @@ -4210,7 +4231,7 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode, trace_ocfs2_cp_xattr_block_to_bucket_end(offset, size, off_change); sort(target + offset, count, sizeof(struct ocfs2_xattr_entry), - cmp_xe, swap_xe); + cmp_xe, NULL); } /* @@ -4292,7 +4313,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode, trace_ocfs2_xattr_create_index_block((unsigned long long)blkno); - ret = ocfs2_init_xattr_bucket(xs->bucket, blkno); + ret = ocfs2_init_xattr_bucket(xs->bucket, blkno, 1); if (ret) { mlog_errno(ret); goto out; @@ -4350,7 +4371,7 @@ static int cmp_xe_offset(const void *a, const void *b) /* * defrag a xattr bucket if we find that the bucket has some - * holes beteen name/value pairs. + * holes between name/value pairs. * We will move all the name/value pairs to the end of the bucket * so that we can spare some space for insertion. */ @@ -4405,7 +4426,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode, */ sort(entries, le16_to_cpu(xh->xh_count), sizeof(struct ocfs2_xattr_entry), - cmp_xe_offset, swap_xe); + cmp_xe_offset, NULL); /* Move all name/values to the end of the bucket. */ xe = xh->xh_entries; @@ -4447,7 +4468,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode, /* sort the entries by their name_hash. */ sort(entries, le16_to_cpu(xh->xh_count), sizeof(struct ocfs2_xattr_entry), - cmp_xe, swap_xe); + cmp_xe, NULL); buf = bucket_buf; for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize) @@ -4636,7 +4657,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode, * Even if !new_bucket_head, we're overwriting t_bucket. Thus, * there's no need to read it. */ - ret = ocfs2_init_xattr_bucket(t_bucket, new_blk); + ret = ocfs2_init_xattr_bucket(t_bucket, new_blk, new_bucket_head); if (ret) { mlog_errno(ret); goto out; @@ -4802,7 +4823,7 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode, * Even if !t_is_new, we're overwriting t_bucket. Thus, * there's no need to read it. */ - ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno); + ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno, t_is_new); if (ret) goto out; @@ -4990,7 +5011,7 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode, * 2. If cluster_size == bucket_size: * a) If the previous extent rec has more than one cluster and the insert * place isn't in the last cluster, copy the entire last cluster to the - * new one. This time, we don't need to upate the first_bh and header_bh + * new one. This time, we don't need to update the first_bh and header_bh * since they will not be moved into the new cluster. * b) Otherwise, move the bottom half of the xattrs in the last cluster into * the new one. And we set the extend flag to zero if the insert place is @@ -5316,16 +5337,6 @@ out: return ret; } -static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode, - struct ocfs2_xattr_bucket *bucket, - int offs) -{ - int block_off = offs >> inode->i_sb->s_blocksize_bits; - - offs = offs % inode->i_sb->s_blocksize; - return bucket_block(bucket, block_off) + offs; -} - /* * Truncate the specified xe_off entry in xattr bucket. * bucket is indicated by header_bh and len is the new length. @@ -5437,7 +5448,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode, return ret; } - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); if (ocfs2_truncate_log_needs_flush(osb)) { ret = __ocfs2_flush_truncate_log(osb); @@ -5474,13 +5485,14 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode, ret = ocfs2_truncate_log_append(osb, handle, blkno, len); if (ret) mlog_errno(ret); + ocfs2_update_inode_fsync_trans(handle, inode, 0); out_commit: ocfs2_commit_trans(osb, handle); out: ocfs2_schedule_truncate_log_flush(osb, 1); - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); if (meta_ac) ocfs2_free_alloc_context(meta_ac); @@ -5656,6 +5668,10 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode, ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket, i, &xv, NULL); + if (ret) { + mlog_errno(ret); + break; + } ret = ocfs2_lock_xattr_remove_allocators(inode, xv, args->ref_ci, @@ -5881,6 +5897,10 @@ static int ocfs2_xattr_value_attach_refcount(struct inode *inode, while (cpos < clusters) { ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, &num_clusters, el, &ext_flags); + if (ret) { + mlog_errno(ret); + break; + } cpos += num_clusters; if ((ext_flags & OCFS2_EXT_REFCOUNTED)) @@ -6169,7 +6189,7 @@ struct ocfs2_xattr_reflink { /* * Given a xattr header and xe offset, * return the proper xv and the corresponding bh. - * xattr in inode, block and xattr tree have different implementaions. + * xattr in inode, block and xattr tree have different implementations. */ typedef int (get_xattr_value_root)(struct super_block *sb, struct buffer_head *bh, @@ -6211,8 +6231,7 @@ static int ocfs2_value_metas_in_xattr_header(struct super_block *sb, le16_to_cpu(xv->xr_list.l_next_free_rec); *credits += ocfs2_calc_extend_credits(sb, - &def_xv.xv.xr_list, - le32_to_cpu(xv->xr_clusters)); + &def_xv.xv.xr_list); /* * If the value is a tree with depth > 1, We don't go deep @@ -6250,7 +6269,7 @@ static int ocfs2_get_xattr_value_root(struct super_block *sb, } /* - * Lock the meta_ac and caculate how much credits we need for reflink xattrs. + * Lock the meta_ac and calculate how much credits we need for reflink xattrs. * It is only used for inline xattr and xattr block. */ static int ocfs2_reflink_lock_xattr_allocators(struct ocfs2_super *osb, @@ -6332,7 +6351,7 @@ static int ocfs2_reflink_xattr_header(handle_t *handle, trace_ocfs2_reflink_xattr_header((unsigned long long)old_bh->b_blocknr, le16_to_cpu(xh->xh_count)); - last = &new_xh->xh_entries[le16_to_cpu(new_xh->xh_count)]; + last = &new_xh->xh_entries[le16_to_cpu(new_xh->xh_count)] - 1; for (i = 0, j = 0; i < le16_to_cpu(xh->xh_count); i++, j++) { xe = &xh->xh_entries[i]; @@ -6381,7 +6400,7 @@ static int ocfs2_reflink_xattr_header(handle_t *handle, * and then insert the extents one by one. */ if (xv->xr_list.l_tree_depth) { - memcpy(new_xv, &def_xv, sizeof(def_xv)); + memcpy(new_xv, &def_xv, OCFS2_XATTR_ROOT_SIZE); vb->vb_xv = new_xv; vb->vb_bh = value_bh; ocfs2_init_xattr_value_extent_tree(&data_et, @@ -6491,16 +6510,7 @@ static int ocfs2_reflink_xattr_inline(struct ocfs2_xattr_reflink *args) } new_oi = OCFS2_I(args->new_inode); - /* - * Adjust extent record count to reserve space for extended attribute. - * Inline data count had been adjusted in ocfs2_duplicate_inline_data(). - */ - if (!(new_oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) && - !(ocfs2_inode_is_fast_symlink(args->new_inode))) { - struct ocfs2_extent_list *el = &new_di->id2.i_list; - le16_add_cpu(&el->l_count, -(inline_size / - sizeof(struct ocfs2_extent_rec))); - } + spin_lock(&new_oi->ip_lock); new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL | OCFS2_INLINE_XATTR_FL; new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features); @@ -6766,7 +6776,7 @@ static int ocfs2_lock_reflink_xattr_rec_allocators( *credits += 1; /* count in the xattr tree change. */ - num_free_extents = ocfs2_num_free_extents(osb, xt_et); + num_free_extents = ocfs2_num_free_extents(xt_et); if (num_free_extents < 0) { ret = num_free_extents; mlog_errno(ret); @@ -6777,7 +6787,7 @@ static int ocfs2_lock_reflink_xattr_rec_allocators( metas.num_metas += ocfs2_extend_meta_needed(xt_et->et_root_el); *credits += ocfs2_calc_extend_credits(osb->sb, - xt_et->et_root_el, len); + xt_et->et_root_el); if (metas.num_metas) { ret = ocfs2_reserve_new_metadata_blocks(osb, metas.num_metas, @@ -6797,7 +6807,7 @@ out: if (ret) { if (*meta_ac) { ocfs2_free_alloc_context(*meta_ac); - meta_ac = NULL; + *meta_ac = NULL; } } @@ -6825,7 +6835,7 @@ static int ocfs2_reflink_xattr_bucket(handle_t *handle, break; } - ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno); + ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno, 1); if (ret) { mlog_errno(ret); break; @@ -7181,7 +7191,7 @@ out: * Used for reflink a non-preserve-security file. * * It uses common api like ocfs2_xattr_set, so the caller - * must not hold any lock expect i_mutex. + * must not hold any lock expect i_rwsem. */ int ocfs2_init_security_and_acl(struct inode *dir, struct inode *inode, @@ -7201,7 +7211,6 @@ int ocfs2_init_security_and_acl(struct inode *dir, mlog_errno(ret); goto leave; } - ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL); if (ret) mlog_errno(ret); @@ -7211,49 +7220,46 @@ int ocfs2_init_security_and_acl(struct inode *dir, leave: return ret; } + /* * 'security' attributes support */ -static size_t ocfs2_xattr_security_list(struct dentry *dentry, char *list, - size_t list_size, const char *name, - size_t name_len, int type) -{ - const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); - memcpy(list + prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; -} - -static int ocfs2_xattr_security_get(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) +static int ocfs2_xattr_security_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) { - if (strcmp(name, "") == 0) - return -EINVAL; - return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_SECURITY, + return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_SECURITY, name, buffer, size); } -static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) +static int ocfs2_xattr_security_set(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - if (strcmp(name, "") == 0) - return -EINVAL; - - return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_SECURITY, + return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, name, value, size, flags); } -int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, +static int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, void *fs_info) { + struct ocfs2_security_xattr_info *si = fs_info; const struct xattr *xattr; int err = 0; + if (si) { + si->value = kmemdup(xattr_array->value, xattr_array->value_len, + GFP_KERNEL); + if (!si->value) + return -ENOMEM; + + si->name = xattr_array->name; + si->value_len = xattr_array->value_len; + return 0; + } + for (xattr = xattr_array; xattr->name != NULL; xattr++) { err = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, xattr->name, xattr->value, @@ -7269,13 +7275,23 @@ int ocfs2_init_security_get(struct inode *inode, const struct qstr *qstr, struct ocfs2_security_xattr_info *si) { + int ret; + /* check whether ocfs2 support feature xattr */ if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb))) return -EOPNOTSUPP; - if (si) - return security_old_inode_init_security(inode, dir, qstr, - &si->name, &si->value, - &si->value_len); + if (si) { + ret = security_inode_init_security(inode, dir, qstr, + &ocfs2_initxattrs, si); + /* + * security_inode_init_security() does not return -EOPNOTSUPP, + * we have to check the xattr ourselves. + */ + if (!ret && !si->name) + si->enable = 0; + + return ret; + } return security_inode_init_security(inode, dir, qstr, &ocfs2_initxattrs, NULL); @@ -7296,7 +7312,6 @@ int ocfs2_init_security_set(handle_t *handle, const struct xattr_handler ocfs2_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, - .list = ocfs2_xattr_security_list, .get = ocfs2_xattr_security_get, .set = ocfs2_xattr_security_set, }; @@ -7304,43 +7319,26 @@ const struct xattr_handler ocfs2_xattr_security_handler = { /* * 'trusted' attributes support */ -static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list, - size_t list_size, const char *name, - size_t name_len, int type) +static int ocfs2_xattr_trusted_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) { - const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); - memcpy(list + prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; -} - -static int ocfs2_xattr_trusted_get(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) -{ - if (strcmp(name, "") == 0) - return -EINVAL; - return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_TRUSTED, + return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_TRUSTED, name, buffer, size); } -static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) +static int ocfs2_xattr_trusted_set(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - if (strcmp(name, "") == 0) - return -EINVAL; - - return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_TRUSTED, + return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_TRUSTED, name, value, size, flags); } const struct xattr_handler ocfs2_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, - .list = ocfs2_xattr_trusted_list, .get = ocfs2_xattr_trusted_get, .set = ocfs2_xattr_trusted_set, }; @@ -7348,55 +7346,35 @@ const struct xattr_handler ocfs2_xattr_trusted_handler = { /* * 'user' attributes support */ -static size_t ocfs2_xattr_user_list(struct dentry *dentry, char *list, - size_t list_size, const char *name, - size_t name_len, int type) +static int ocfs2_xattr_user_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) { - const size_t prefix_len = XATTR_USER_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); - - if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) - return 0; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_USER_PREFIX, prefix_len); - memcpy(list + prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; -} - -static int ocfs2_xattr_user_get(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) -{ - struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - if (strcmp(name, "") == 0) - return -EINVAL; if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) return -EOPNOTSUPP; - return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_USER, name, + return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_USER, name, buffer, size); } -static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) +static int ocfs2_xattr_user_set(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - if (strcmp(name, "") == 0) - return -EINVAL; if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) return -EOPNOTSUPP; - return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_USER, + return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_USER, name, value, size, flags); } const struct xattr_handler ocfs2_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, - .list = ocfs2_xattr_user_list, .get = ocfs2_xattr_user_get, .set = ocfs2_xattr_user_set, }; diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index e5c7f15465b4..65e9aa743919 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -1,18 +1,8 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-only */ +/* * xattr.h * * Copyright (C) 2004, 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #ifndef OCFS2_XATTR_H @@ -32,7 +22,7 @@ enum ocfs2_xattr_type { struct ocfs2_security_xattr_info { int enable; - char *name; + const char *name; void *value; size_t value_len; }; @@ -40,9 +30,7 @@ struct ocfs2_security_xattr_info { extern const struct xattr_handler ocfs2_xattr_user_handler; extern const struct xattr_handler ocfs2_xattr_trusted_handler; extern const struct xattr_handler ocfs2_xattr_security_handler; -extern const struct xattr_handler ocfs2_xattr_acl_access_handler; -extern const struct xattr_handler ocfs2_xattr_acl_default_handler; -extern const struct xattr_handler *ocfs2_xattr_handlers[]; +extern const struct xattr_handler * const ocfs2_xattr_handlers[]; ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int, |
