summaryrefslogtreecommitdiff
path: root/fs/ocfs2
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ocfs2')
-rw-r--r--fs/ocfs2/Kconfig13
-rw-r--r--fs/ocfs2/Makefile9
-rw-r--r--fs/ocfs2/acl.c248
-rw-r--r--fs/ocfs2/acl.h20
-rw-r--r--fs/ocfs2/alloc.c1329
-rw-r--r--fs/ocfs2/alloc.h38
-rw-r--r--fs/ocfs2/aops.c1473
-rw-r--r--fs/ocfs2/aops.h75
-rw-r--r--fs/ocfs2/blockcheck.c84
-rw-r--r--fs/ocfs2/blockcheck.h21
-rw-r--r--fs/ocfs2/buffer_head_io.c122
-rw-r--r--fs/ocfs2/buffer_head_io.h23
-rw-r--r--fs/ocfs2/cluster/Makefile3
-rw-r--r--fs/ocfs2/cluster/heartbeat.c1088
-rw-r--r--fs/ocfs2/cluster/heartbeat.h28
-rw-r--r--fs/ocfs2/cluster/masklog.c73
-rw-r--r--fs/ocfs2/cluster/masklog.h79
-rw-r--r--fs/ocfs2/cluster/netdebug.c187
-rw-r--r--fs/ocfs2/cluster/nodemanager.c430
-rw-r--r--fs/ocfs2/cluster/nodemanager.h21
-rw-r--r--fs/ocfs2/cluster/ocfs2_heartbeat.h20
-rw-r--r--fs/ocfs2/cluster/ocfs2_nodemanager.h21
-rw-r--r--fs/ocfs2/cluster/quorum.c73
-rw-r--r--fs/ocfs2/cluster/quorum.h21
-rw-r--r--fs/ocfs2/cluster/sys.c23
-rw-r--r--fs/ocfs2/cluster/sys.h21
-rw-r--r--fs/ocfs2/cluster/tcp.c431
-rw-r--r--fs/ocfs2/cluster/tcp.h29
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h36
-rw-r--r--fs/ocfs2/cluster/ver.c42
-rw-r--r--fs/ocfs2/cluster/ver.h31
-rw-r--r--fs/ocfs2/dcache.c145
-rw-r--r--fs/ocfs2/dcache.h32
-rw-r--r--fs/ocfs2/dir.c407
-rw-r--r--fs/ocfs2/dir.h22
-rw-r--r--fs/ocfs2/dlm/Makefile6
-rw-r--r--fs/ocfs2/dlm/dlmapi.h25
-rw-r--r--fs/ocfs2/dlm/dlmast.c55
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h95
-rw-r--r--fs/ocfs2/dlm/dlmconvert.c79
-rw-r--r--fs/ocfs2/dlm/dlmconvert.h21
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c390
-rw-r--r--fs/ocfs2/dlm/dlmdebug.h48
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c352
-rw-r--r--fs/ocfs2/dlm/dlmdomain.h47
-rw-r--r--fs/ocfs2/dlm/dlmlock.c43
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c475
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c285
-rw-r--r--fs/ocfs2/dlm/dlmthread.c147
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c97
-rw-r--r--fs/ocfs2/dlm/dlmver.c42
-rw-r--r--fs/ocfs2/dlm/dlmver.h31
-rw-r--r--fs/ocfs2/dlmfs/Makefile5
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c256
-rw-r--r--fs/ocfs2/dlmfs/dlmfsver.c42
-rw-r--r--fs/ocfs2/dlmfs/dlmfsver.h31
-rw-r--r--fs/ocfs2/dlmfs/userdlm.c58
-rw-r--r--fs/ocfs2/dlmfs/userdlm.h26
-rw-r--r--fs/ocfs2/dlmglue.c793
-rw-r--r--fs/ocfs2/dlmglue.h83
-rw-r--r--fs/ocfs2/export.c70
-rw-r--r--fs/ocfs2/export.h20
-rw-r--r--fs/ocfs2/extent_map.c136
-rw-r--r--fs/ocfs2/extent_map.h22
-rw-r--r--fs/ocfs2/file.c1385
-rw-r--r--fs/ocfs2/file.h42
-rw-r--r--fs/ocfs2/filecheck.c509
-rw-r--r--fs/ocfs2/filecheck.h64
-rw-r--r--fs/ocfs2/heartbeat.c47
-rw-r--r--fs/ocfs2/heartbeat.h20
-rw-r--r--fs/ocfs2/inode.c606
-rw-r--r--fs/ocfs2/inode.h66
-rw-r--r--fs/ocfs2/ioctl.c362
-rw-r--r--fs/ocfs2/ioctl.h4
-rw-r--r--fs/ocfs2/journal.c789
-rw-r--r--fs/ocfs2/journal.h107
-rw-r--r--fs/ocfs2/localalloc.c165
-rw-r--r--fs/ocfs2/localalloc.h26
-rw-r--r--fs/ocfs2/locks.c48
-rw-r--r--fs/ocfs2/locks.h20
-rw-r--r--fs/ocfs2/mmap.c108
-rw-r--r--fs/ocfs2/mmap.h3
-rw-r--r--fs/ocfs2/move_extents.c311
-rw-r--r--fs/ocfs2/move_extents.h14
-rw-r--r--fs/ocfs2/namei.c824
-rw-r--r--fs/ocfs2/namei.h31
-rw-r--r--fs/ocfs2/ocfs1_fs_compat.h19
-rw-r--r--fs/ocfs2/ocfs2.h204
-rw-r--r--fs/ocfs2/ocfs2_fs.h134
-rw-r--r--fs/ocfs2/ocfs2_ioctl.h24
-rw-r--r--fs/ocfs2/ocfs2_lockid.h27
-rw-r--r--fs/ocfs2/ocfs2_lockingver.h14
-rw-r--r--fs/ocfs2/ocfs2_trace.h122
-rw-r--r--fs/ocfs2/quota.h12
-rw-r--r--fs/ocfs2/quota_global.c226
-rw-r--r--fs/ocfs2/quota_local.c154
-rw-r--r--fs/ocfs2/refcounttree.c829
-rw-r--r--fs/ocfs2/refcounttree.h39
-rw-r--r--fs/ocfs2/reservations.c25
-rw-r--r--fs/ocfs2/reservations.h27
-rw-r--r--fs/ocfs2/resize.c77
-rw-r--r--fs/ocfs2/resize.h20
-rw-r--r--fs/ocfs2/slot_map.c58
-rw-r--r--fs/ocfs2/slot_map.h20
-rw-r--r--fs/ocfs2/stack_o2cb.c65
-rw-r--r--fs/ocfs2/stack_user.c394
-rw-r--r--fs/ocfs2/stackglue.c125
-rw-r--r--fs/ocfs2/stackglue.h35
-rw-r--r--fs/ocfs2/suballoc.c469
-rw-r--r--fs/ocfs2/suballoc.h45
-rw-r--r--fs/ocfs2/super.c1332
-rw-r--r--fs/ocfs2/super.h33
-rw-r--r--fs/ocfs2/symlink.c30
-rw-r--r--fs/ocfs2/symlink.h20
-rw-r--r--fs/ocfs2/sysfile.c44
-rw-r--r--fs/ocfs2/sysfile.h20
-rw-r--r--fs/ocfs2/uptodate.c25
-rw-r--r--fs/ocfs2/uptodate.h20
-rw-r--r--fs/ocfs2/ver.c43
-rw-r--r--fs/ocfs2/ver.h31
-rw-r--r--fs/ocfs2/xattr.c564
-rw-r--r--fs/ocfs2/xattr.h20
122 files changed, 11620 insertions, 9775 deletions
diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig
index 77a8de5f7119..2514d36cbe01 100644
--- a/fs/ocfs2/Kconfig
+++ b/fs/ocfs2/Kconfig
@@ -1,11 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0-only
config OCFS2_FS
tristate "OCFS2 file system support"
- depends on NET && SYSFS && CONFIGFS_FS
+ depends on INET && SYSFS && CONFIGFS_FS
+ select BUFFER_HEAD
select JBD2
select CRC32
select QUOTA
select QUOTA_TREE
select FS_POSIX_ACL
+ select LEGACY_DIRECT_IO
help
OCFS2 is a general purpose extent based shared disk cluster file
system with many similarities to ext3. It supports 64 bit inode
@@ -15,12 +18,12 @@ config OCFS2_FS
You'll want to install the ocfs2-tools package in order to at least
get "mount.ocfs2".
- Project web page: http://oss.oracle.com/projects/ocfs2
- Tools web page: http://oss.oracle.com/projects/ocfs2-tools
- OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+ Project web page: https://ocfs2.wiki.kernel.org/
+ Tools web page: https://github.com/markfasheh/ocfs2-tools
+ OCFS2 mailing lists: https://subspace.kernel.org/lists.linux.dev.html
For more information on OCFS2, see the file
- <file:Documentation/filesystems/ocfs2.txt>.
+ <file:Documentation/filesystems/ocfs2.rst>.
config OCFS2_FS_O2CB
tristate "O2CB Kernelspace Clustering"
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index f17e58b32989..cc9b32b9db7c 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -1,6 +1,5 @@
-ccflags-y := -Ifs/ocfs2
-
-ccflags-y += -DCATCH_BH_JBD_RACES
+# SPDX-License-Identifier: GPL-2.0
+ccflags-y := -I$(src)
obj-$(CONFIG_OCFS2_FS) += \
ocfs2.o \
@@ -38,11 +37,11 @@ ocfs2-objs := \
symlink.o \
sysfile.o \
uptodate.o \
- ver.o \
quota_local.o \
quota_global.o \
xattr.o \
- acl.o
+ acl.o \
+ filecheck.o
ocfs2_stackglue-objs := stackglue.o
ocfs2_stack_o2cb-objs := stack_o2cb.o
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 8a404576fb26..af1e2cedb217 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -1,6 +1,5 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-only
+/*
* acl.c
*
* Copyright (C) 2004, 2008 Oracle. All rights reserved.
@@ -8,21 +7,13 @@
* CREDITS:
* Lots of code in this file is copy from linux/fs/ext3/acl.c.
* Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/string.h>
+#include <linux/fs_struct.h>
#include <cluster/masklog.h>
@@ -51,10 +42,6 @@ static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size)
return ERR_PTR(-EINVAL);
count = size / sizeof(struct posix_acl_entry);
- if (count < 0)
- return ERR_PTR(-EINVAL);
- if (count == 0)
- return NULL;
acl = posix_acl_alloc(count, GFP_NOFS);
if (!acl)
@@ -164,36 +151,6 @@ static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode,
return acl;
}
-
-/*
- * Get posix acl.
- */
-static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type)
-{
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct buffer_head *di_bh = NULL;
- struct posix_acl *acl;
- int ret;
-
- if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
- return NULL;
-
- ret = ocfs2_inode_lock(inode, &di_bh, 0);
- if (ret < 0) {
- mlog_errno(ret);
- acl = ERR_PTR(ret);
- return acl;
- }
-
- acl = ocfs2_get_acl_nolock(inode, type, di_bh);
-
- ocfs2_inode_unlock(inode, 0);
-
- brelse(di_bh);
-
- return acl;
-}
-
/*
* Helper function to set i_mode in memory and disk. Some call paths
* will not have di_bh or a journal handle to pass, in which case it
@@ -235,10 +192,11 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
}
inode->i_mode = new_mode;
- inode->i_ctime = CURRENT_TIME;
+ inode_set_ctime_current(inode);
di->i_mode = cpu_to_le16(inode->i_mode);
- di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
- di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+ di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_journal_dirty(handle, di_bh);
@@ -273,22 +231,6 @@ static int ocfs2_set_acl(handle_t *handle,
switch (type) {
case ACL_TYPE_ACCESS:
name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
- if (acl) {
- umode_t mode = inode->i_mode;
- ret = posix_acl_equiv_mode(acl, &mode);
- if (ret < 0)
- return ret;
- else {
- if (ret == 0)
- acl = NULL;
-
- ret = ocfs2_acl_set_mode(inode, di_bh,
- handle, mode);
- if (ret)
- return ret;
-
- }
- }
break;
case ACL_TYPE_DEFAULT:
name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
@@ -313,33 +255,71 @@ static int ocfs2_set_acl(handle_t *handle,
ret = ocfs2_xattr_set(inode, name_index, "", value, size, 0);
kfree(value);
+ if (!ret)
+ set_cached_acl(inode, type, acl);
return ret;
}
-struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
+int ocfs2_iop_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct posix_acl *acl, int type)
+{
+ struct buffer_head *bh = NULL;
+ int status, had_lock;
+ struct ocfs2_lock_holder oh;
+ struct inode *inode = d_inode(dentry);
+
+ had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh);
+ if (had_lock < 0)
+ return had_lock;
+ if (type == ACL_TYPE_ACCESS && acl) {
+ umode_t mode;
+
+ status = posix_acl_update_mode(&nop_mnt_idmap, inode, &mode,
+ &acl);
+ if (status)
+ goto unlock;
+
+ status = ocfs2_acl_set_mode(inode, bh, NULL, mode);
+ if (status)
+ goto unlock;
+ }
+ status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL);
+unlock:
+ ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
+ brelse(bh);
+ return status;
+}
+
+struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type, bool rcu)
{
struct ocfs2_super *osb;
struct buffer_head *di_bh = NULL;
struct posix_acl *acl;
- int ret = -EAGAIN;
+ int had_lock;
+ struct ocfs2_lock_holder oh;
+
+ if (rcu)
+ return ERR_PTR(-ECHILD);
osb = OCFS2_SB(inode->i_sb);
if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
return NULL;
- ret = ocfs2_read_inode_block(inode, &di_bh);
- if (ret < 0)
- return ERR_PTR(ret);
+ had_lock = ocfs2_inode_lock_tracker(inode, &di_bh, 0, &oh);
+ if (had_lock < 0)
+ return ERR_PTR(had_lock);
+ down_read(&OCFS2_I(inode)->ip_xattr_sem);
acl = ocfs2_get_acl_nolock(inode, type, di_bh);
+ up_read(&OCFS2_I(inode)->ip_xattr_sem);
+ ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock);
brelse(di_bh);
-
return acl;
}
-int ocfs2_acl_chmod(struct inode *inode)
+int ocfs2_acl_chmod(struct inode *inode, struct buffer_head *bh)
{
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct posix_acl *acl;
@@ -351,10 +331,12 @@ int ocfs2_acl_chmod(struct inode *inode)
if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
return 0;
- acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
- if (IS_ERR(acl) || !acl)
- return PTR_ERR(acl);
- ret = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
+ down_read(&OCFS2_I(inode)->ip_xattr_sem);
+ acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, bh);
+ up_read(&OCFS2_I(inode)->ip_xattr_sem);
+ if (IS_ERR_OR_NULL(acl))
+ return PTR_ERR_OR_ZERO(acl);
+ ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
if (ret)
return ret;
ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS,
@@ -382,8 +364,10 @@ int ocfs2_init_acl(handle_t *handle,
if (!S_ISLNK(inode->i_mode)) {
if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
+ down_read(&OCFS2_I(dir)->ip_xattr_sem);
acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT,
dir_bh);
+ up_read(&OCFS2_I(dir)->ip_xattr_sem);
if (IS_ERR(acl))
return PTR_ERR(acl);
}
@@ -405,7 +389,7 @@ int ocfs2_init_acl(handle_t *handle,
goto cleanup;
}
mode = inode->i_mode;
- ret = posix_acl_create(&acl, GFP_NOFS, &mode);
+ ret = __posix_acl_create(&acl, GFP_NOFS, &mode);
if (ret < 0)
return ret;
@@ -425,113 +409,3 @@ cleanup:
posix_acl_release(acl);
return ret;
}
-
-static size_t ocfs2_xattr_list_acl_access(struct dentry *dentry,
- char *list,
- size_t list_len,
- const char *name,
- size_t name_len,
- int type)
-{
- struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
- const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
-
- if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
- return 0;
-
- if (list && size <= list_len)
- memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
- return size;
-}
-
-static size_t ocfs2_xattr_list_acl_default(struct dentry *dentry,
- char *list,
- size_t list_len,
- const char *name,
- size_t name_len,
- int type)
-{
- struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
- const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
-
- if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
- return 0;
-
- if (list && size <= list_len)
- memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
- return size;
-}
-
-static int ocfs2_xattr_get_acl(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
-{
- struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
- struct posix_acl *acl;
- int ret;
-
- if (strcmp(name, "") != 0)
- return -EINVAL;
- if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
- return -EOPNOTSUPP;
-
- acl = ocfs2_get_acl(dentry->d_inode, type);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl == NULL)
- return -ENODATA;
- ret = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
- posix_acl_release(acl);
-
- return ret;
-}
-
-static int ocfs2_xattr_set_acl(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
-{
- struct inode *inode = dentry->d_inode;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct posix_acl *acl;
- int ret = 0;
-
- if (strcmp(name, "") != 0)
- return -EINVAL;
- if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
- return -EOPNOTSUPP;
-
- if (!inode_owner_or_capable(inode))
- return -EPERM;
-
- if (value) {
- acl = posix_acl_from_xattr(&init_user_ns, value, size);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- else if (acl) {
- ret = posix_acl_valid(acl);
- if (ret)
- goto cleanup;
- }
- } else
- acl = NULL;
-
- ret = ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL);
-
-cleanup:
- posix_acl_release(acl);
- return ret;
-}
-
-const struct xattr_handler ocfs2_xattr_acl_access_handler = {
- .prefix = POSIX_ACL_XATTR_ACCESS,
- .flags = ACL_TYPE_ACCESS,
- .list = ocfs2_xattr_list_acl_access,
- .get = ocfs2_xattr_get_acl,
- .set = ocfs2_xattr_set_acl,
-};
-
-const struct xattr_handler ocfs2_xattr_acl_default_handler = {
- .prefix = POSIX_ACL_XATTR_DEFAULT,
- .flags = ACL_TYPE_DEFAULT,
- .list = ocfs2_xattr_list_acl_default,
- .get = ocfs2_xattr_get_acl,
- .set = ocfs2_xattr_set_acl,
-};
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 071fbd380f2f..667c6f03fa60 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -1,18 +1,8 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
* acl.h
*
* Copyright (C) 2004, 2008 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#ifndef OCFS2_ACL_H
@@ -26,8 +16,10 @@ struct ocfs2_acl_entry {
__le32 e_id;
};
-struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type);
-extern int ocfs2_acl_chmod(struct inode *);
+struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type, bool rcu);
+int ocfs2_iop_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct posix_acl *acl, int type);
+extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *);
extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
struct buffer_head *, struct buffer_head *,
struct ocfs2_alloc_context *,
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 17e6bdde96c5..b267ec580da9 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1,26 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* alloc.c
*
* Extent allocs and frees
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
@@ -30,6 +14,7 @@
#include <linux/swap.h>
#include <linux/quotaops.h>
#include <linux/blkdev.h>
+#include <linux/sched/signal.h>
#include <cluster/masklog.h>
@@ -164,7 +149,14 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
struct ocfs2_extent_rec *rec);
static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
-static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
+
+static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ struct buffer_head **new_eb_bh,
+ int blk_wanted, int *blk_given);
+static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et);
+
+static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
.eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk,
.eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk,
.eo_update_clusters = ocfs2_dinode_update_clusters,
@@ -286,7 +278,7 @@ static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et,
le32_add_cpu(&vb->vb_xv->xr_clusters, clusters);
}
-static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
+static const struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
.eo_set_last_eb_blk = ocfs2_xattr_value_set_last_eb_blk,
.eo_get_last_eb_blk = ocfs2_xattr_value_get_last_eb_blk,
.eo_update_clusters = ocfs2_xattr_value_update_clusters,
@@ -332,7 +324,7 @@ static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et,
le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
}
-static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
+static const struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
.eo_set_last_eb_blk = ocfs2_xattr_tree_set_last_eb_blk,
.eo_get_last_eb_blk = ocfs2_xattr_tree_get_last_eb_blk,
.eo_update_clusters = ocfs2_xattr_tree_update_clusters,
@@ -379,7 +371,7 @@ static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et)
et->et_root_el = &dx_root->dr_list;
}
-static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
+static const struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
.eo_set_last_eb_blk = ocfs2_dx_root_set_last_eb_blk,
.eo_get_last_eb_blk = ocfs2_dx_root_get_last_eb_blk,
.eo_update_clusters = ocfs2_dx_root_update_clusters,
@@ -425,7 +417,7 @@ ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et,
return CONTIG_NONE;
}
-static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = {
+static const struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = {
.eo_set_last_eb_blk = ocfs2_refcount_tree_set_last_eb_blk,
.eo_get_last_eb_blk = ocfs2_refcount_tree_get_last_eb_blk,
.eo_update_clusters = ocfs2_refcount_tree_update_clusters,
@@ -438,7 +430,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
struct buffer_head *bh,
ocfs2_journal_access_func access,
void *obj,
- struct ocfs2_extent_tree_operations *ops)
+ const struct ocfs2_extent_tree_operations *ops)
{
et->et_ops = ops;
et->et_root_bh = bh;
@@ -447,6 +439,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
if (!obj)
obj = (void *)bh->b_data;
et->et_object = obj;
+ et->et_dealloc = NULL;
et->et_ops->eo_fill_root_el(et);
if (!et->et_ops->eo_fill_max_leaf_clusters)
@@ -573,7 +566,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle,
struct ocfs2_path *path,
struct ocfs2_extent_rec *insert_rec);
/*
- * Reset the actual path elements so that we can re-use the structure
+ * Reset the actual path elements so that we can reuse the structure
* to build another path. Generally, this involves freeing the buffer
* heads.
*/
@@ -908,32 +901,28 @@ static int ocfs2_validate_extent_block(struct super_block *sb,
*/
if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- ocfs2_error(sb,
- "Extent block #%llu has bad signature %.*s",
- (unsigned long long)bh->b_blocknr, 7,
- eb->h_signature);
- return -EINVAL;
+ rc = ocfs2_error(sb,
+ "Extent block #%llu has bad signature %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7,
+ eb->h_signature);
+ goto bail;
}
if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
- ocfs2_error(sb,
- "Extent block #%llu has an invalid h_blkno "
- "of %llu",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)le64_to_cpu(eb->h_blkno));
- return -EINVAL;
- }
-
- if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
- ocfs2_error(sb,
- "Extent block #%llu has an invalid "
- "h_fs_generation of #%u",
- (unsigned long long)bh->b_blocknr,
- le32_to_cpu(eb->h_fs_generation));
- return -EINVAL;
+ rc = ocfs2_error(sb,
+ "Extent block #%llu has an invalid h_blkno of %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(eb->h_blkno));
+ goto bail;
}
- return 0;
+ if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation)
+ rc = ocfs2_error(sb,
+ "Extent block #%llu has an invalid h_fs_generation of #%u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(eb->h_fs_generation));
+bail:
+ return rc;
}
int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
@@ -956,8 +945,7 @@ int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
/*
* How many free extents have we got before we need more meta data?
*/
-int ocfs2_num_free_extents(struct ocfs2_super *osb,
- struct ocfs2_extent_tree *et)
+int ocfs2_num_free_extents(struct ocfs2_extent_tree *et)
{
int retval;
struct ocfs2_extent_list *el = NULL;
@@ -979,7 +967,14 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
el = &eb->h_list;
}
- BUG_ON(el->l_tree_depth != 0);
+ if (el->l_tree_depth != 0) {
+ retval = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu has leaf extent block %llu with an invalid l_tree_depth of %u\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ (unsigned long long)last_eb_blk,
+ le16_to_cpu(el->l_tree_depth));
+ goto bail;
+ }
retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
bail:
@@ -1025,7 +1020,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
for(i = count; i < (num_got + count); i++) {
bhs[i] = sb_getblk(osb->sb, first_blkno);
if (bhs[i] == NULL) {
- status = -EIO;
+ status = -ENOMEM;
mlog_errno(status);
goto bail;
}
@@ -1070,7 +1065,6 @@ bail:
brelse(bhs[i]);
bhs[i] = NULL;
}
- mlog_errno(status);
}
return status;
}
@@ -1160,7 +1154,7 @@ static int ocfs2_add_branch(handle_t *handle,
struct buffer_head **last_eb_bh,
struct ocfs2_alloc_context *meta_ac)
{
- int status, new_blocks, i;
+ int status, new_blocks, i, block_given = 0;
u64 next_blkno, new_last_eb_blk;
struct buffer_head *bh;
struct buffer_head **new_eb_bhs = NULL;
@@ -1188,7 +1182,7 @@ static int ocfs2_add_branch(handle_t *handle,
/*
* If there is a gap before the root end and the real end
- * of the righmost leaf block, we need to remove the gap
+ * of the rightmost leaf block, we need to remove the gap
* between new_cpos and root_end first so that the tree
* is consistent after we add a new branch(it will start
* from new_cpos).
@@ -1215,16 +1209,36 @@ static int ocfs2_add_branch(handle_t *handle,
goto bail;
}
- status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
- meta_ac, new_eb_bhs);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+ /* Firstyly, try to reuse dealloc since we have already estimated how
+ * many extent blocks we may use.
+ */
+ if (!ocfs2_is_dealloc_empty(et)) {
+ status = ocfs2_reuse_blk_from_dealloc(handle, et,
+ new_eb_bhs, new_blocks,
+ &block_given);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
+ BUG_ON(block_given > new_blocks);
+
+ if (block_given < new_blocks) {
+ BUG_ON(!meta_ac);
+ status = ocfs2_create_new_meta_bhs(handle, et,
+ new_blocks - block_given,
+ meta_ac,
+ &new_eb_bhs[block_given]);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
}
/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
* linked with the rest of the tree.
- * conversly, new_eb_bhs[0] is the new bottommost leaf.
+ * conversely, new_eb_bhs[0] is the new bottommost leaf.
*
* when we leave the loop, new_last_eb_blk will point to the
* newest leaf, and next_blkno will point to the topmost extent
@@ -1342,15 +1356,25 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
struct ocfs2_alloc_context *meta_ac,
struct buffer_head **ret_new_eb_bh)
{
- int status, i;
+ int status, i, block_given = 0;
u32 new_clusters;
struct buffer_head *new_eb_bh = NULL;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *root_el;
struct ocfs2_extent_list *eb_el;
- status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
- &new_eb_bh);
+ if (!ocfs2_is_dealloc_empty(et)) {
+ status = ocfs2_reuse_blk_from_dealloc(handle, et,
+ &new_eb_bh, 1,
+ &block_given);
+ } else if (meta_ac) {
+ status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
+ &new_eb_bh);
+
+ } else {
+ BUG();
+ }
+
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1445,22 +1469,17 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
while(le16_to_cpu(el->l_tree_depth) > 1) {
if (le16_to_cpu(el->l_next_free_rec) == 0) {
- ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has empty "
- "extent list (next_free_rec == 0)",
- (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
- status = -EIO;
+ status = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu has empty extent list (next_free_rec == 0)\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
goto bail;
}
i = le16_to_cpu(el->l_next_free_rec) - 1;
blkno = le64_to_cpu(el->l_recs[i].e_blkno);
if (!blkno) {
- ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has extent "
- "list where extent # %d has no physical "
- "block start",
- (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i);
- status = -EIO;
+ status = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu has extent list where extent # %d has no physical block start\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i);
goto bail;
}
@@ -1516,7 +1535,7 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
int depth = le16_to_cpu(el->l_tree_depth);
struct buffer_head *bh = NULL;
- BUG_ON(meta_ac == NULL);
+ BUG_ON(meta_ac == NULL && ocfs2_is_dealloc_empty(et));
shift = ocfs2_find_branch_target(et, &bh);
if (shift < 0) {
@@ -1565,10 +1584,8 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
* the new data. */
ret = ocfs2_add_branch(handle, et, bh, last_eb_bh,
meta_ac);
- if (ret < 0) {
+ if (ret < 0)
mlog_errno(ret);
- goto out;
- }
out:
if (final_depth)
@@ -1786,10 +1803,17 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
el = root_el;
while (el->l_tree_depth) {
+ if (unlikely(le16_to_cpu(el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH)) {
+ ocfs2_error(ocfs2_metadata_cache_get_super(ci),
+ "Owner %llu has invalid tree depth %u in extent list\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ le16_to_cpu(el->l_tree_depth));
+ ret = -EROFS;
+ goto out;
+ }
if (le16_to_cpu(el->l_next_free_rec) == 0) {
ocfs2_error(ocfs2_metadata_cache_get_super(ci),
- "Owner %llu has empty extent list at "
- "depth %u\n",
+ "Owner %llu has empty extent list at depth %u\n",
(unsigned long long)ocfs2_metadata_cache_owner(ci),
le16_to_cpu(el->l_tree_depth));
ret = -EROFS;
@@ -1814,8 +1838,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
blkno = le64_to_cpu(el->l_recs[i].e_blkno);
if (blkno == 0) {
ocfs2_error(ocfs2_metadata_cache_get_super(ci),
- "Owner %llu has bad blkno in extent list "
- "at depth %u (index %d)\n",
+ "Owner %llu has bad blkno in extent list at depth %u (index %d)\n",
(unsigned long long)ocfs2_metadata_cache_owner(ci),
le16_to_cpu(el->l_tree_depth), i);
ret = -EROFS;
@@ -1836,8 +1859,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
if (le16_to_cpu(el->l_next_free_rec) >
le16_to_cpu(el->l_count)) {
ocfs2_error(ocfs2_metadata_cache_get_super(ci),
- "Owner %llu has bad count in extent list "
- "at block %llu (next free=%u, count=%u)\n",
+ "Owner %llu has bad count in extent list at block %llu (next free=%u, count=%u)\n",
(unsigned long long)ocfs2_metadata_cache_owner(ci),
(unsigned long long)bh->b_blocknr,
le16_to_cpu(el->l_next_free_rec),
@@ -1940,14 +1962,12 @@ out:
* the new changes.
*
* left_rec: the record on the left.
- * left_child_el: is the child list pointed to by left_rec
* right_rec: the record to the right of left_rec
* right_child_el: is the child list pointed to by right_rec
*
* By definition, this only works on interior nodes.
*/
static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
- struct ocfs2_extent_list *left_child_el,
struct ocfs2_extent_rec *right_rec,
struct ocfs2_extent_list *right_child_el)
{
@@ -2010,7 +2030,7 @@ static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
*/
BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
- ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
+ ocfs2_adjust_adjacent_records(&root_el->l_recs[i],
&root_el->l_recs[i + 1], right_el);
}
@@ -2035,7 +2055,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
int i, idx;
struct ocfs2_extent_list *el, *left_el, *right_el;
struct ocfs2_extent_rec *left_rec, *right_rec;
- struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
+ struct buffer_head *root_bh;
/*
* Update the counts and position values within all the
@@ -2067,8 +2087,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
el = right_path->p_node[i].el;
right_rec = &el->l_recs[0];
- ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
- right_el);
+ ocfs2_adjust_adjacent_records(left_rec, right_rec, right_el);
ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
@@ -2116,8 +2135,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle,
if (left_el->l_next_free_rec != left_el->l_count) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Inode %llu has non-full interior leaf node %llu"
- "(next free = %u)",
+ "Inode %llu has non-full interior leaf node %llu (next free = %u)\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
(unsigned long long)left_leaf_bh->b_blocknr,
le16_to_cpu(left_el->l_next_free_rec));
@@ -2256,8 +2274,7 @@ int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
* If we got here, we never found a valid node where
* the tree indicated one should be.
*/
- ocfs2_error(sb,
- "Invalid extent tree at extent block %llu\n",
+ ocfs2_error(sb, "Invalid extent tree at extent block %llu\n",
(unsigned long long)blkno);
ret = -EROFS;
goto out;
@@ -2283,9 +2300,9 @@ static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
int ret = 0;
int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
- if (handle->h_buffer_credits < credits)
+ if (jbd2_handle_buffer_credits(handle) < credits)
ret = ocfs2_extend_trans(handle,
- credits - handle->h_buffer_credits);
+ credits - jbd2_handle_buffer_credits(handle));
return ret;
}
@@ -2362,7 +2379,7 @@ static int ocfs2_rotate_tree_right(handle_t *handle,
struct ocfs2_path *right_path,
struct ocfs2_path **ret_left_path)
{
- int ret, start, orig_credits = handle->h_buffer_credits;
+ int ret, start, orig_credits = jbd2_handle_buffer_credits(handle);
u32 cpos;
struct ocfs2_path *left_path = NULL;
struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
@@ -2518,7 +2535,7 @@ out_ret_path:
static int ocfs2_update_edge_lengths(handle_t *handle,
struct ocfs2_extent_tree *et,
- int subtree_index, struct ocfs2_path *path)
+ struct ocfs2_path *path)
{
int i, idx, ret;
struct ocfs2_extent_rec *rec;
@@ -2526,21 +2543,6 @@ static int ocfs2_update_edge_lengths(handle_t *handle,
struct ocfs2_extent_block *eb;
u32 range;
- /*
- * In normal tree rotation process, we will never touch the
- * tree branch above subtree_index and ocfs2_extend_rotate_transaction
- * doesn't reserve the credits for them either.
- *
- * But we do have a special case here which will update the rightmost
- * records for all the bh in the path.
- * So we have to allocate extra credits and access them.
- */
- ret = ocfs2_extend_trans(handle, subtree_index);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
-
ret = ocfs2_journal_access_path(et->et_ci, handle, path);
if (ret) {
mlog_errno(ret);
@@ -2626,11 +2628,8 @@ static void ocfs2_unlink_subtree(handle_t *handle,
int i;
struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
- struct ocfs2_extent_list *el;
struct ocfs2_extent_block *eb;
- el = path_leaf_el(left_path);
-
eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
@@ -2779,8 +2778,7 @@ static int ocfs2_rotate_subtree_left(handle_t *handle,
if (del_right_subtree) {
ocfs2_unlink_subtree(handle, et, left_path, right_path,
subtree_index, dealloc);
- ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
- left_path);
+ ret = ocfs2_update_edge_lengths(handle, et, left_path);
if (ret) {
mlog_errno(ret);
goto out;
@@ -2872,8 +2870,7 @@ int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
* If we got here, we never found a valid node where
* the tree indicated one should be.
*/
- ocfs2_error(sb,
- "Invalid extent tree at extent block %llu\n",
+ ocfs2_error(sb, "Invalid extent tree at extent block %llu\n",
(unsigned long long)blkno);
ret = -EROFS;
goto out;
@@ -2925,7 +2922,8 @@ static int __ocfs2_rotate_tree_left(handle_t *handle,
struct ocfs2_path *right_path = NULL;
struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
- BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
+ if (!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])))
+ return 0;
*empty_extent_path = NULL;
@@ -2966,7 +2964,7 @@ static int __ocfs2_rotate_tree_left(handle_t *handle,
right_path->p_node[subtree_root].bh->b_blocknr,
right_path->p_tree_depth);
- ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
orig_credits, left_path);
if (ret) {
mlog_errno(ret);
@@ -3039,21 +3037,9 @@ static int ocfs2_remove_rightmost_path(handle_t *handle,
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
-
ret = ocfs2_et_sanity_check(et);
if (ret)
goto out;
- /*
- * There's two ways we handle this depending on
- * whether path is the only existing one.
- */
- ret = ocfs2_extend_rotate_transaction(handle, 0,
- handle->h_buffer_credits,
- path);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
ret = ocfs2_journal_access_path(et->et_ci, handle, path);
if (ret) {
@@ -3096,8 +3082,7 @@ static int ocfs2_remove_rightmost_path(handle_t *handle,
ocfs2_unlink_subtree(handle, et, left_path, path,
subtree_index, dealloc);
- ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
- left_path);
+ ret = ocfs2_update_edge_lengths(handle, et, left_path);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3130,6 +3115,30 @@ out:
return ret;
}
+static int ocfs2_remove_rightmost_empty_extent(struct ocfs2_super *osb,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_path *path,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ handle_t *handle;
+ int ret;
+ int credits = path->p_tree_depth * 2 + 1;
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ return ret;
+ }
+
+ ret = ocfs2_remove_rightmost_path(handle, et, path, dealloc);
+ if (ret)
+ mlog_errno(ret);
+
+ ocfs2_commit_trans(osb, handle);
+ return ret;
+}
+
/*
* Left rotation of btree records.
*
@@ -3151,7 +3160,7 @@ static int ocfs2_rotate_tree_left(handle_t *handle,
struct ocfs2_path *path,
struct ocfs2_cached_dealloc_ctxt *dealloc)
{
- int ret, orig_credits = handle->h_buffer_credits;
+ int ret, orig_credits = jbd2_handle_buffer_credits(handle);
struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
@@ -3197,11 +3206,10 @@ rightmost_no_delete:
goto rightmost_no_delete;
if (le16_to_cpu(el->l_next_free_rec) == 0) {
- ret = -EIO;
- ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has empty extent block at %llu",
- (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
- (unsigned long long)le64_to_cpu(eb->h_blkno));
+ ret = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu has empty extent block at %llu\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ (unsigned long long)le64_to_cpu(eb->h_blkno));
goto out;
}
@@ -3370,7 +3378,7 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
ret = ocfs2_get_right_path(et, left_path, &right_path);
if (ret) {
mlog_errno(ret);
- goto out;
+ return ret;
}
right_el = path_leaf_el(right_path);
@@ -3390,8 +3398,8 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
right_path);
ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
- handle->h_buffer_credits,
- right_path);
+ jbd2_handle_buffer_credits(handle),
+ right_path);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3453,8 +3461,7 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
subtree_index);
}
out:
- if (right_path)
- ocfs2_free_path(right_path);
+ ocfs2_free_path(right_path);
return ret;
}
@@ -3536,7 +3543,7 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
ret = ocfs2_get_left_path(et, right_path, &left_path);
if (ret) {
mlog_errno(ret);
- goto out;
+ return ret;
}
left_el = path_leaf_el(left_path);
@@ -3553,8 +3560,8 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
right_path);
ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
- handle->h_buffer_credits,
- left_path);
+ jbd2_handle_buffer_credits(handle),
+ left_path);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3604,8 +3611,6 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
* The easy case - we can just plop the record right in.
*/
*left_rec = *split_rec;
-
- has_empty_extent = 0;
} else
le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
@@ -3628,6 +3633,14 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
*/
if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
le16_to_cpu(el->l_next_free_rec) == 1) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ jbd2_handle_buffer_credits(handle),
+ right_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
ret = ocfs2_remove_rightmost_path(handle, et,
right_path,
@@ -3647,8 +3660,7 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
right_path, subtree_index);
}
out:
- if (left_path)
- ocfs2_free_path(left_path);
+ ocfs2_free_path(left_path);
return ret;
}
@@ -3667,6 +3679,14 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ jbd2_handle_buffer_credits(handle),
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
/*
* The merge code will need to create an empty
* extent to take the place of the newly
@@ -3700,7 +3720,7 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
* update split_index here.
*
* When the split_index is zero, we need to merge it to the
- * prevoius extent block. It is more efficient and easier
+ * previous extent block. It is more efficient and easier
* if we do merge_right first and merge_left later.
*/
ret = ocfs2_merge_rec_right(path, handle, et, split_rec,
@@ -3715,6 +3735,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
*/
BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ jbd2_handle_buffer_credits(handle),
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
/* The merge left us with an empty extent, remove it. */
ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
if (ret) {
@@ -3736,6 +3765,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
goto out;
}
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ jbd2_handle_buffer_credits(handle),
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
/*
* Error from this last rotate is not critical, so
@@ -3771,6 +3809,16 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
}
if (ctxt->c_split_covers_rec) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ jbd2_handle_buffer_credits(handle),
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ ret = 0;
+ goto out;
+ }
+
/*
* The merge may have left an empty extent in
* our leaf. Try to rotate it away.
@@ -3906,7 +3954,7 @@ rotate:
* above.
*
* This leaf needs to have space, either by the empty 1st
- * extent record, or by virtue of an l_next_rec < l_count.
+ * extent record, or by virtue of an l_next_free_rec < l_count.
*/
ocfs2_rotate_leaf(el, insert_rec);
}
@@ -3916,7 +3964,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle,
struct ocfs2_path *path,
struct ocfs2_extent_rec *insert_rec)
{
- int ret, i, next_free;
+ int i, next_free;
struct buffer_head *bh;
struct ocfs2_extent_list *el;
struct ocfs2_extent_rec *rec;
@@ -3931,9 +3979,8 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle,
next_free = le16_to_cpu(el->l_next_free_rec);
if (next_free == 0) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has a bad extent list",
+ "Owner %llu has a bad extent list\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
- ret = -EIO;
return;
}
@@ -4313,13 +4360,13 @@ out:
return ret;
}
-static enum ocfs2_contig_type
-ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
+static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
struct ocfs2_path *path,
struct ocfs2_extent_list *el, int index,
- struct ocfs2_extent_rec *split_rec)
+ struct ocfs2_extent_rec *split_rec,
+ struct ocfs2_merge_ctxt *ctxt)
{
- int status;
+ int status = 0;
enum ocfs2_contig_type ret = CONTIG_NONE;
u32 left_cpos, right_cpos;
struct ocfs2_extent_rec *rec = NULL;
@@ -4334,17 +4381,20 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
} else if (path->p_tree_depth > 0) {
status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
if (status)
- goto out;
+ goto exit;
if (left_cpos != 0) {
left_path = ocfs2_new_path_from_path(path);
- if (!left_path)
- goto out;
+ if (!left_path) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto exit;
+ }
status = ocfs2_find_path(et->et_ci, left_path,
left_cpos);
if (status)
- goto out;
+ goto free_left_path;
new_el = path_leaf_el(left_path);
@@ -4352,16 +4402,12 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
le16_to_cpu(new_el->l_count)) {
bh = path_leaf_bh(left_path);
eb = (struct ocfs2_extent_block *)bh->b_data;
- ocfs2_error(sb,
- "Extent block #%llu has an "
- "invalid l_next_free_rec of "
- "%d. It should have "
- "matched the l_count of %d",
- (unsigned long long)le64_to_cpu(eb->h_blkno),
- le16_to_cpu(new_el->l_next_free_rec),
- le16_to_cpu(new_el->l_count));
- status = -EINVAL;
- goto out;
+ status = ocfs2_error(sb,
+ "Extent block #%llu has an invalid l_next_free_rec of %d. It should have matched the l_count of %d\n",
+ (unsigned long long)le64_to_cpu(eb->h_blkno),
+ le16_to_cpu(new_el->l_next_free_rec),
+ le16_to_cpu(new_el->l_count));
+ goto free_left_path;
}
rec = &new_el->l_recs[
le16_to_cpu(new_el->l_next_free_rec) - 1];
@@ -4388,18 +4434,21 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
path->p_tree_depth > 0) {
status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
if (status)
- goto out;
+ goto free_left_path;
if (right_cpos == 0)
- goto out;
+ goto free_left_path;
right_path = ocfs2_new_path_from_path(path);
- if (!right_path)
- goto out;
+ if (!right_path) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto free_left_path;
+ }
status = ocfs2_find_path(et->et_ci, right_path, right_cpos);
if (status)
- goto out;
+ goto free_right_path;
new_el = path_leaf_el(right_path);
rec = &new_el->l_recs[0];
@@ -4407,13 +4456,11 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
bh = path_leaf_bh(right_path);
eb = (struct ocfs2_extent_block *)bh->b_data;
- ocfs2_error(sb,
- "Extent block #%llu has an "
- "invalid l_next_free_rec of %d",
- (unsigned long long)le64_to_cpu(eb->h_blkno),
- le16_to_cpu(new_el->l_next_free_rec));
- status = -EINVAL;
- goto out;
+ status = ocfs2_error(sb,
+ "Extent block #%llu has an invalid l_next_free_rec of %d\n",
+ (unsigned long long)le64_to_cpu(eb->h_blkno),
+ le16_to_cpu(new_el->l_next_free_rec));
+ goto free_right_path;
}
rec = &new_el->l_recs[1];
}
@@ -4430,13 +4477,15 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
ret = contig_type;
}
-out:
- if (left_path)
- ocfs2_free_path(left_path);
- if (right_path)
- ocfs2_free_path(right_path);
+free_right_path:
+ ocfs2_free_path(right_path);
+free_left_path:
+ ocfs2_free_path(left_path);
+exit:
+ if (status == 0)
+ ctxt->c_contig_type = ret;
- return ret;
+ return status;
}
static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et,
@@ -4476,7 +4525,7 @@ static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et,
}
/*
- * This should only be called against the righmost leaf extent list.
+ * This should only be called against the rightmost leaf extent list.
*
* ocfs2_figure_appending_type() will figure out whether we'll have to
* insert at the tail of the rightmost leaf.
@@ -4671,7 +4720,7 @@ int ocfs2_insert_extent(handle_t *handle,
struct ocfs2_alloc_context *meta_ac)
{
int status;
- int uninitialized_var(free_records);
+ int free_records;
struct buffer_head *last_eb_bh = NULL;
struct ocfs2_insert_type insert = {0, };
struct ocfs2_extent_rec rec;
@@ -4726,7 +4775,7 @@ bail:
}
/*
- * Allcate and add clusters into the extent b-tree.
+ * Allocate and add clusters into the extent b-tree.
* The new clusters(clusters_to_add) will be inserted at logical_offset.
* The extent b-tree's root is specified by et, and
* it is not limited to the file storage. Any extent tree can use this
@@ -4742,6 +4791,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
enum ocfs2_alloc_restarted *reason_ret)
{
int status = 0, err = 0;
+ int need_free = 0;
int free_extents;
enum ocfs2_alloc_restarted reason = RESTART_NONE;
u32 bit_off, num_bits;
@@ -4755,7 +4805,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
if (mark_unwritten)
flags = OCFS2_EXT_UNWRITTEN;
- free_extents = ocfs2_num_free_extents(osb, et);
+ free_extents = ocfs2_num_free_extents(et);
if (free_extents < 0) {
status = free_extents;
mlog_errno(status);
@@ -4796,7 +4846,8 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ need_free = 1;
+ goto bail;
}
block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
@@ -4807,7 +4858,8 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
num_bits, flags, meta_ac);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ need_free = 1;
+ goto bail;
}
ocfs2_journal_dirty(handle, et->et_root_bh);
@@ -4821,6 +4873,19 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
reason = RESTART_TRANS;
}
+bail:
+ if (need_free) {
+ if (data_ac->ac_which == OCFS2_AC_USE_LOCAL)
+ ocfs2_free_local_alloc_bits(osb, handle, data_ac,
+ bit_off, num_bits);
+ else
+ ocfs2_free_clusters(handle,
+ data_ac->ac_inode,
+ data_ac->ac_bh,
+ ocfs2_clusters_to_blocks(osb->sb, bit_off),
+ num_bits);
+ }
+
leave:
if (reason_ret)
*reason_ret = reason;
@@ -4945,6 +5010,14 @@ leftright:
el = path_leaf_el(path);
split_index = ocfs2_search_extent_list(el, cpos);
+ if (split_index == -1) {
+ ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu has an extent at cpos %u which can no longer be found\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ cpos);
+ ret = -EROFS;
+ goto out;
+ }
goto leftright;
}
out:
@@ -5007,7 +5080,6 @@ int ocfs2_split_extent(handle_t *handle,
struct buffer_head *last_eb_bh = NULL;
struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
struct ocfs2_merge_ctxt ctxt;
- struct ocfs2_extent_list *rightmost_el;
if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
@@ -5017,9 +5089,14 @@ int ocfs2_split_extent(handle_t *handle,
goto out;
}
- ctxt.c_contig_type = ocfs2_figure_merge_contig_type(et, path, el,
- split_index,
- split_rec);
+ ret = ocfs2_figure_merge_contig_type(et, path, el,
+ split_index,
+ split_rec,
+ &ctxt);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
/*
* The core merge / split code wants to know how much room is
@@ -5027,8 +5104,6 @@ int ocfs2_split_extent(handle_t *handle,
* rightmost extent list.
*/
if (path->p_tree_depth) {
- struct ocfs2_extent_block *eb;
-
ret = ocfs2_read_extent_block(et->et_ci,
ocfs2_et_get_last_eb_blk(et),
&last_eb_bh);
@@ -5036,11 +5111,7 @@ int ocfs2_split_extent(handle_t *handle,
mlog_errno(ret);
goto out;
}
-
- eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
- rightmost_el = &eb->h_list;
- } else
- rightmost_el = path_root_el(path);
+ }
if (rec->e_cpos == split_rec->e_cpos &&
rec->e_leaf_clusters == split_rec->e_leaf_clusters)
@@ -5119,12 +5190,11 @@ int ocfs2_change_extent_flag(handle_t *handle,
el = path_leaf_el(left_path);
index = ocfs2_search_extent_list(el, cpos);
- if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+ if (index == -1) {
ocfs2_error(sb,
- "Owner %llu has an extent at cpos %u which can no "
- "longer be found.\n",
- (unsigned long long)
- ocfs2_metadata_cache_owner(et->et_ci), cpos);
+ "Owner %llu has an extent at cpos %u which can no longer be found\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ cpos);
ret = -EROFS;
goto out;
}
@@ -5133,7 +5203,7 @@ int ocfs2_change_extent_flag(handle_t *handle,
rec = &el->l_recs[index];
if (new_flags && (rec->e_flags & new_flags)) {
mlog(ML_ERROR, "Owner %llu tried to set %d flags on an "
- "extent that already had them",
+ "extent that already had them\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
new_flags);
goto out;
@@ -5141,7 +5211,7 @@ int ocfs2_change_extent_flag(handle_t *handle,
if (clear_flags && !(rec->e_flags & clear_flags)) {
mlog(ML_ERROR, "Owner %llu tried to clear %d flags on an "
- "extent that didn't have them",
+ "extent that didn't have them\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
clear_flags);
goto out;
@@ -5191,9 +5261,7 @@ int ocfs2_mark_extent_written(struct inode *inode,
cpos, len, phys);
if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
- ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
- "that are being written to, but the feature bit "
- "is not set in the super block.",
+ ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents that are being written to, but the feature bit is not set in the super block\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
ret = -EROFS;
goto out;
@@ -5292,7 +5360,7 @@ static int ocfs2_truncate_rec(handle_t *handle,
{
int ret;
u32 left_cpos, rec_range, trunc_range;
- int wants_rotate = 0, is_rightmost_tree_rec = 0;
+ int is_rightmost_tree_rec = 0;
struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
struct ocfs2_path *left_path = NULL;
struct ocfs2_extent_list *el = path_leaf_el(path);
@@ -5300,6 +5368,15 @@ static int ocfs2_truncate_rec(handle_t *handle,
struct ocfs2_extent_block *eb;
if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ jbd2_handle_buffer_credits(handle),
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
if (ret) {
mlog_errno(ret);
@@ -5362,8 +5439,8 @@ static int ocfs2_truncate_rec(handle_t *handle,
}
ret = ocfs2_extend_rotate_transaction(handle, 0,
- handle->h_buffer_credits,
- path);
+ jbd2_handle_buffer_credits(handle),
+ path);
if (ret) {
mlog_errno(ret);
goto out;
@@ -5389,7 +5466,6 @@ static int ocfs2_truncate_rec(handle_t *handle,
memset(rec, 0, sizeof(*rec));
ocfs2_cleanup_merge(el, index);
- wants_rotate = 1;
next_free = le16_to_cpu(el->l_next_free_rec);
if (is_rightmost_tree_rec && next_free > 1) {
@@ -5432,10 +5508,8 @@ static int ocfs2_truncate_rec(handle_t *handle,
ocfs2_journal_dirty(handle, path_leaf_bh(path));
ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
- if (ret) {
+ if (ret)
mlog_errno(ret);
- goto out;
- }
out:
ocfs2_free_path(left_path);
@@ -5475,10 +5549,9 @@ int ocfs2_remove_extent(handle_t *handle,
el = path_leaf_el(path);
index = ocfs2_search_extent_list(el, cpos);
- if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+ if (index == -1) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has an extent at cpos %u which can no "
- "longer be found.\n",
+ "Owner %llu has an extent at cpos %u which can no longer be found\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
cpos);
ret = -EROFS;
@@ -5541,9 +5614,9 @@ int ocfs2_remove_extent(handle_t *handle,
el = path_leaf_el(path);
index = ocfs2_search_extent_list(el, cpos);
- if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+ if (index == -1) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu: split at cpos %u lost record.",
+ "Owner %llu: split at cpos %u lost record\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
cpos);
ret = -EROFS;
@@ -5559,8 +5632,7 @@ int ocfs2_remove_extent(handle_t *handle,
ocfs2_rec_clusters(el, rec);
if (rec_range != trunc_range) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu: error after split at cpos %u"
- "trunc len %u, existing record is (%u,%u)",
+ "Owner %llu: error after split at cpos %u trunc len %u, existing record is (%u,%u)\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
cpos, len, le32_to_cpu(rec->e_cpos),
ocfs2_rec_clusters(el, rec));
@@ -5570,10 +5642,8 @@ int ocfs2_remove_extent(handle_t *handle,
ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
cpos, len);
- if (ret) {
+ if (ret)
mlog_errno(ret);
- goto out;
- }
}
out:
@@ -5602,7 +5672,7 @@ static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode,
*ac = NULL;
- num_free_extents = ocfs2_num_free_extents(osb, et);
+ num_free_extents = ocfs2_num_free_extents(et);
if (num_free_extents < 0) {
ret = num_free_extents;
mlog_errno(ret);
@@ -5618,7 +5688,6 @@ static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode,
if (ret < 0) {
if (ret != -ENOSPC)
mlog_errno(ret);
- goto out;
}
}
@@ -5637,7 +5706,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
struct ocfs2_extent_tree *et,
u32 cpos, u32 phys_cpos, u32 len, int flags,
struct ocfs2_cached_dealloc_ctxt *dealloc,
- u64 refcount_loc)
+ u64 refcount_loc, bool refcount_tree_locked)
{
int ret, credits = 0, extra_blocks = 0;
u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
@@ -5648,14 +5717,15 @@ int ocfs2_remove_btree_range(struct inode *inode,
struct ocfs2_refcount_tree *ref_tree = NULL;
if ((flags & OCFS2_EXT_REFCOUNTED) && len) {
- BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
- OCFS2_HAS_REFCOUNT_FL));
+ BUG_ON(!ocfs2_is_refcount_inode(inode));
- ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
- &ref_tree, NULL);
- if (ret) {
- mlog_errno(ret);
- goto bail;
+ if (!refcount_tree_locked) {
+ ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
+ &ref_tree, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto bail;
+ }
}
ret = ocfs2_prepare_refcount_change_for_del(inode,
@@ -5677,7 +5747,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
goto bail;
}
- mutex_lock(&tl_inode->i_mutex);
+ inode_lock(tl_inode);
if (ocfs2_truncate_log_needs_flush(osb)) {
ret = __ocfs2_flush_truncate_log(osb);
@@ -5712,6 +5782,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
}
ocfs2_et_update_clusters(et, -len);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_journal_dirty(handle, et->et_root_bh);
@@ -5733,7 +5804,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
out_commit:
ocfs2_commit_trans(osb, handle);
out:
- mutex_unlock(&tl_inode->i_mutex);
+ inode_unlock(tl_inode);
bail:
if (meta_ac)
ocfs2_free_alloc_context(meta_ac);
@@ -5789,7 +5860,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
struct ocfs2_dinode *di;
struct ocfs2_truncate_log *tl;
- BUG_ON(mutex_trylock(&tl_inode->i_mutex));
+ BUG_ON(inode_trylock(tl_inode));
start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
@@ -5854,7 +5925,6 @@ bail:
}
static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
- handle_t *handle,
struct inode *data_alloc_inode,
struct buffer_head *data_alloc_bh)
{
@@ -5867,16 +5937,25 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
struct ocfs2_truncate_log *tl;
struct inode *tl_inode = osb->osb_tl_inode;
struct buffer_head *tl_bh = osb->osb_tl_bh;
+ handle_t *handle;
di = (struct ocfs2_dinode *) tl_bh->b_data;
tl = &di->id2.i_dealloc;
i = le16_to_cpu(tl->tl_used) - 1;
while (i >= 0) {
+ handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto bail;
+ }
+
/* Caller has given us at least enough credits to
* update the truncate log dinode */
status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
+ ocfs2_commit_trans(osb, handle);
mlog_errno(status);
goto bail;
}
@@ -5885,16 +5964,6 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
ocfs2_journal_dirty(handle, tl_bh);
- /* TODO: Perhaps we can calculate the bulk of the
- * credits up front rather than extending like
- * this. */
- status = ocfs2_extend_trans(handle,
- OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
rec = tl->tl_recs[i];
start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
le32_to_cpu(rec.t_start));
@@ -5911,10 +5980,13 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
data_alloc_bh, start_blk,
num_clusters);
if (status < 0) {
+ ocfs2_commit_trans(osb, handle);
mlog_errno(status);
goto bail;
}
}
+
+ ocfs2_commit_trans(osb, handle);
i--;
}
@@ -5924,20 +5996,20 @@ bail:
return status;
}
-/* Expects you to already be holding tl_inode->i_mutex */
+/* Expects you to already be holding tl_inode->i_rwsem */
int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
{
int status;
unsigned int num_to_flush;
- handle_t *handle;
struct inode *tl_inode = osb->osb_tl_inode;
struct inode *data_alloc_inode = NULL;
struct buffer_head *tl_bh = osb->osb_tl_bh;
struct buffer_head *data_alloc_bh = NULL;
struct ocfs2_dinode *di;
struct ocfs2_truncate_log *tl;
+ struct ocfs2_journal *journal = osb->journal;
- BUG_ON(mutex_trylock(&tl_inode->i_mutex));
+ BUG_ON(inode_trylock(tl_inode));
di = (struct ocfs2_dinode *) tl_bh->b_data;
@@ -5956,6 +6028,20 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
goto out;
}
+ /* Appending truncate log(TA) and flushing truncate log(TF) are
+ * two separated transactions. They can be both committed but not
+ * checkpointed. If crash occurs then, both two transaction will be
+ * replayed with several already released to global bitmap clusters.
+ * Then truncate log will be replayed resulting in cluster double free.
+ */
+ jbd2_journal_lock_updates(journal->j_journal);
+ status = jbd2_journal_flush(journal->j_journal, 0);
+ jbd2_journal_unlock_updates(journal->j_journal);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+
data_alloc_inode = ocfs2_get_system_file_inode(osb,
GLOBAL_BITMAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
@@ -5965,7 +6051,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
goto out;
}
- mutex_lock(&data_alloc_inode->i_mutex);
+ inode_lock(data_alloc_inode);
status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
if (status < 0) {
@@ -5973,26 +6059,16 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
goto out_mutex;
}
- handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
- if (IS_ERR(handle)) {
- status = PTR_ERR(handle);
- mlog_errno(status);
- goto out_unlock;
- }
-
- status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
+ status = ocfs2_replay_truncate_records(osb, data_alloc_inode,
data_alloc_bh);
if (status < 0)
mlog_errno(status);
- ocfs2_commit_trans(osb, handle);
-
-out_unlock:
brelse(data_alloc_bh);
ocfs2_inode_unlock(data_alloc_inode, 1);
out_mutex:
- mutex_unlock(&data_alloc_inode->i_mutex);
+ inode_unlock(data_alloc_inode);
iput(data_alloc_inode);
out:
@@ -6004,9 +6080,9 @@ int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
int status;
struct inode *tl_inode = osb->osb_tl_inode;
- mutex_lock(&tl_inode->i_mutex);
+ inode_lock(tl_inode);
status = __ocfs2_flush_truncate_log(osb);
- mutex_unlock(&tl_inode->i_mutex);
+ inode_unlock(tl_inode);
return status;
}
@@ -6029,17 +6105,55 @@ static void ocfs2_truncate_log_worker(struct work_struct *work)
void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
int cancel)
{
- if (osb->osb_tl_inode) {
+ if (osb->osb_tl_inode &&
+ atomic_read(&osb->osb_tl_disable) == 0) {
/* We want to push off log flushes while truncates are
* still running. */
if (cancel)
cancel_delayed_work(&osb->osb_truncate_log_wq);
- queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
+ queue_delayed_work(osb->ocfs2_wq, &osb->osb_truncate_log_wq,
OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
}
}
+/*
+ * Try to flush truncate logs if we can free enough clusters from it.
+ * As for return value, "< 0" means error, "0" no space and "1" means
+ * we have freed enough spaces and let the caller try to allocate again.
+ */
+int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
+ unsigned int needed)
+{
+ tid_t target;
+ int ret = 0;
+ unsigned int truncated_clusters;
+
+ inode_lock(osb->osb_tl_inode);
+ truncated_clusters = osb->truncated_clusters;
+ inode_unlock(osb->osb_tl_inode);
+
+ /*
+ * Check whether we can succeed in allocating if we free
+ * the truncate log.
+ */
+ if (truncated_clusters < needed)
+ goto out;
+
+ ret = ocfs2_flush_truncate_log(osb);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) {
+ jbd2_log_wait_commit(osb->journal->j_journal, target);
+ ret = 1;
+ }
+out:
+ return ret;
+}
+
static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
int slot_num,
struct inode **tl_inode,
@@ -6048,6 +6162,9 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
int status;
struct inode *inode = NULL;
struct buffer_head *bh = NULL;
+ struct ocfs2_dinode *di;
+ struct ocfs2_truncate_log *tl;
+ unsigned int tl_count;
inode = ocfs2_get_system_file_inode(osb,
TRUNCATE_LOG_SYSTEM_INODE,
@@ -6065,6 +6182,18 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
goto bail;
}
+ di = (struct ocfs2_dinode *)bh->b_data;
+ tl = &di->id2.i_dealloc;
+ tl_count = le16_to_cpu(tl->tl_count);
+ if (unlikely(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
+ tl_count == 0)) {
+ status = -EFSCORRUPTED;
+ iput(inode);
+ brelse(bh);
+ mlog_errno(status);
+ goto bail;
+ }
+
*tl_inode = inode;
*tl_bh = bh;
bail:
@@ -6106,17 +6235,17 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
if (le16_to_cpu(tl->tl_used)) {
trace_ocfs2_truncate_log_recovery_num(le16_to_cpu(tl->tl_used));
- *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
+ /*
+ * Assuming the write-out below goes well, this copy will be
+ * passed back to recovery for processing.
+ */
+ *tl_copy = kmemdup(tl_bh->b_data, tl_bh->b_size, GFP_KERNEL);
if (!(*tl_copy)) {
status = -ENOMEM;
mlog_errno(status);
goto bail;
}
- /* Assuming the write-out below goes well, this copy
- * will be passed back to recovery for processing. */
- memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
-
/* All we need to do to clear the truncate log is set
* tl_used. */
tl->tl_used = 0;
@@ -6130,11 +6259,10 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
}
bail:
- if (tl_inode)
- iput(tl_inode);
+ iput(tl_inode);
brelse(tl_bh);
- if (status < 0 && (*tl_copy)) {
+ if (status < 0) {
kfree(*tl_copy);
*tl_copy = NULL;
mlog_errno(status);
@@ -6165,7 +6293,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
(unsigned long long)le64_to_cpu(tl_copy->i_blkno),
num_recs);
- mutex_lock(&tl_inode->i_mutex);
+ inode_lock(tl_inode);
for(i = 0; i < num_recs; i++) {
if (ocfs2_truncate_log_needs_flush(osb)) {
status = __ocfs2_flush_truncate_log(osb);
@@ -6196,7 +6324,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
}
bail_up:
- mutex_unlock(&tl_inode->i_mutex);
+ inode_unlock(tl_inode);
return status;
}
@@ -6206,9 +6334,11 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
int status;
struct inode *tl_inode = osb->osb_tl_inode;
+ atomic_set(&osb->osb_tl_disable, 1);
+
if (tl_inode) {
cancel_delayed_work(&osb->osb_truncate_log_wq);
- flush_workqueue(ocfs2_wq);
+ flush_workqueue(osb->ocfs2_wq);
status = ocfs2_flush_truncate_log(osb);
if (status < 0)
@@ -6237,6 +6367,7 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
* until we're sure all is well. */
INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
ocfs2_truncate_log_worker);
+ atomic_set(&osb->osb_tl_disable, 0);
osb->osb_tl_bh = tl_bh;
osb->osb_tl_inode = tl_inode;
@@ -6300,7 +6431,7 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
goto out;
}
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ret = ocfs2_inode_lock(inode, &di_bh, 1);
if (ret) {
@@ -6308,48 +6439,39 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
goto out_mutex;
}
- handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- mlog_errno(ret);
- goto out_unlock;
- }
-
while (head) {
if (head->free_bg)
bg_blkno = head->free_bg;
else
bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
head->free_bit);
+ handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
trace_ocfs2_free_cached_blocks(
(unsigned long long)head->free_blk, head->free_bit);
ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
head->free_bit, bg_blkno, 1);
- if (ret) {
+ if (ret)
mlog_errno(ret);
- goto out_journal;
- }
- ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
- if (ret) {
- mlog_errno(ret);
- goto out_journal;
- }
+ ocfs2_commit_trans(osb, handle);
tmp = head;
head = head->free_next;
kfree(tmp);
}
-out_journal:
- ocfs2_commit_trans(osb, handle);
-
out_unlock:
ocfs2_inode_unlock(inode, 1);
brelse(di_bh);
out_mutex:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
iput(inode);
out:
while(head) {
@@ -6393,7 +6515,7 @@ static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
handle_t *handle;
int ret = 0;
- mutex_lock(&tl_inode->i_mutex);
+ inode_lock(tl_inode);
while (head) {
if (ocfs2_truncate_log_needs_flush(osb)) {
@@ -6425,7 +6547,7 @@ static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
}
}
- mutex_unlock(&tl_inode->i_mutex);
+ inode_unlock(tl_inode);
while (head) {
/* Premature exit may have left some dangling items. */
@@ -6506,6 +6628,154 @@ ocfs2_find_per_slot_free_list(int type,
return fl;
}
+static struct ocfs2_per_slot_free_list *
+ocfs2_find_preferred_free_list(int type,
+ int preferred_slot,
+ int *real_slot,
+ struct ocfs2_cached_dealloc_ctxt *ctxt)
+{
+ struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
+
+ while (fl) {
+ if (fl->f_inode_type == type && fl->f_slot == preferred_slot) {
+ *real_slot = fl->f_slot;
+ return fl;
+ }
+
+ fl = fl->f_next_suballocator;
+ }
+
+ /* If we can't find any free list matching preferred slot, just use
+ * the first one.
+ */
+ fl = ctxt->c_first_suballocator;
+ *real_slot = fl->f_slot;
+
+ return fl;
+}
+
+/* Return Value 1 indicates empty */
+static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et)
+{
+ struct ocfs2_per_slot_free_list *fl = NULL;
+
+ if (!et->et_dealloc)
+ return 1;
+
+ fl = et->et_dealloc->c_first_suballocator;
+ if (!fl)
+ return 1;
+
+ if (!fl->f_first)
+ return 1;
+
+ return 0;
+}
+
+/* If extent was deleted from tree due to extent rotation and merging, and
+ * no metadata is reserved ahead of time. Try to reuse some extents
+ * just deleted. This is only used to reuse extent blocks.
+ * It is supposed to find enough extent blocks in dealloc if our estimation
+ * on metadata is accurate.
+ */
+static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ struct buffer_head **new_eb_bh,
+ int blk_wanted, int *blk_given)
+{
+ int i, status = 0, real_slot;
+ struct ocfs2_cached_dealloc_ctxt *dealloc;
+ struct ocfs2_per_slot_free_list *fl;
+ struct ocfs2_cached_block_free *bf;
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_super *osb =
+ OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
+
+ *blk_given = 0;
+
+ /* If extent tree doesn't have a dealloc, this is not faulty. Just
+ * tell upper caller dealloc can't provide any block and it should
+ * ask for alloc to claim more space.
+ */
+ dealloc = et->et_dealloc;
+ if (!dealloc)
+ goto bail;
+
+ for (i = 0; i < blk_wanted; i++) {
+ /* Prefer to use local slot */
+ fl = ocfs2_find_preferred_free_list(EXTENT_ALLOC_SYSTEM_INODE,
+ osb->slot_num, &real_slot,
+ dealloc);
+ /* If no more block can be reused, we should claim more
+ * from alloc. Just return here normally.
+ */
+ if (!fl) {
+ status = 0;
+ break;
+ }
+
+ bf = fl->f_first;
+ fl->f_first = bf->free_next;
+
+ new_eb_bh[i] = sb_getblk(osb->sb, bf->free_blk);
+ if (new_eb_bh[i] == NULL) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto bail;
+ }
+
+ mlog(0, "Reusing block(%llu) from "
+ "dealloc(local slot:%d, real slot:%d)\n",
+ bf->free_blk, osb->slot_num, real_slot);
+
+ ocfs2_set_new_buffer_uptodate(et->et_ci, new_eb_bh[i]);
+
+ status = ocfs2_journal_access_eb(handle, et->et_ci,
+ new_eb_bh[i],
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ memset(new_eb_bh[i]->b_data, 0, osb->sb->s_blocksize);
+ eb = (struct ocfs2_extent_block *) new_eb_bh[i]->b_data;
+
+ /* We can't guarantee that buffer head is still cached, so
+ * polutlate the extent block again.
+ */
+ strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
+ eb->h_blkno = cpu_to_le64(bf->free_blk);
+ eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
+ eb->h_suballoc_slot = cpu_to_le16(real_slot);
+ eb->h_suballoc_loc = cpu_to_le64(bf->free_bg);
+ eb->h_suballoc_bit = cpu_to_le16(bf->free_bit);
+ eb->h_list.l_count =
+ cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
+
+ /* We'll also be dirtied by the caller, so
+ * this isn't absolutely necessary.
+ */
+ ocfs2_journal_dirty(handle, new_eb_bh[i]);
+
+ if (!fl->f_first) {
+ dealloc->c_first_suballocator = fl->f_next_suballocator;
+ kfree(fl);
+ }
+ kfree(bf);
+ }
+
+ *blk_given = i;
+
+bail:
+ if (unlikely(status < 0)) {
+ for (i = 0; i < blk_wanted; i++)
+ brelse(new_eb_bh[i]);
+ }
+
+ return status;
+}
+
int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
int type, int slot, u64 suballoc,
u64 blkno, unsigned int bit)
@@ -6561,138 +6831,136 @@ static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
return 0;
}
-void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
- unsigned int from, unsigned int to,
- struct page *page, int zero, u64 *phys)
+void ocfs2_map_and_dirty_folio(struct inode *inode, handle_t *handle,
+ size_t from, size_t to, struct folio *folio, int zero,
+ u64 *phys)
{
int ret, partial = 0;
+ loff_t start_byte = folio_pos(folio) + from;
+ loff_t length = to - from;
- ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
+ ret = ocfs2_map_folio_blocks(folio, phys, inode, from, to, 0);
if (ret)
mlog_errno(ret);
if (zero)
- zero_user_segment(page, from, to);
+ folio_zero_segment(folio, from, to);
/*
* Need to set the buffers we zero'd into uptodate
* here if they aren't - ocfs2_map_page_blocks()
* might've skipped some
*/
- ret = walk_page_buffers(handle, page_buffers(page),
+ ret = walk_page_buffers(handle, folio_buffers(folio),
from, to, &partial,
ocfs2_zero_func);
if (ret < 0)
mlog_errno(ret);
else if (ocfs2_should_order_data(inode)) {
- ret = ocfs2_jbd2_file_inode(handle, inode);
+ ret = ocfs2_jbd2_inode_add_write(handle, inode,
+ start_byte, length);
if (ret < 0)
mlog_errno(ret);
}
if (!partial)
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
- flush_dcache_page(page);
+ flush_dcache_folio(folio);
}
-static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
- loff_t end, struct page **pages,
- int numpages, u64 phys, handle_t *handle)
+static void ocfs2_zero_cluster_folios(struct inode *inode, loff_t start,
+ loff_t end, struct folio **folios, int numfolios,
+ u64 phys, handle_t *handle)
{
int i;
- struct page *page;
- unsigned int from, to = PAGE_CACHE_SIZE;
struct super_block *sb = inode->i_sb;
BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
- if (numpages == 0)
+ if (numfolios == 0)
goto out;
- to = PAGE_CACHE_SIZE;
- for(i = 0; i < numpages; i++) {
- page = pages[i];
-
- from = start & (PAGE_CACHE_SIZE - 1);
- if ((end >> PAGE_CACHE_SHIFT) == page->index)
- to = end & (PAGE_CACHE_SIZE - 1);
+ for (i = 0; i < numfolios; i++) {
+ struct folio *folio = folios[i];
+ size_t to = folio_size(folio);
+ size_t from = offset_in_folio(folio, start);
- BUG_ON(from > PAGE_CACHE_SIZE);
- BUG_ON(to > PAGE_CACHE_SIZE);
+ if (to > end - folio_pos(folio))
+ to = end - folio_pos(folio);
- ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
- &phys);
+ ocfs2_map_and_dirty_folio(inode, handle, from, to, folio, 1,
+ &phys);
- start = (page->index + 1) << PAGE_CACHE_SHIFT;
+ start = folio_next_pos(folio);
}
out:
- if (pages)
- ocfs2_unlock_and_free_pages(pages, numpages);
+ if (folios)
+ ocfs2_unlock_and_free_folios(folios, numfolios);
}
-int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
- struct page **pages, int *num)
+static int ocfs2_grab_folios(struct inode *inode, loff_t start, loff_t end,
+ struct folio **folios, int *num)
{
- int numpages, ret = 0;
+ int numfolios, ret = 0;
struct address_space *mapping = inode->i_mapping;
unsigned long index;
loff_t last_page_bytes;
BUG_ON(start > end);
- numpages = 0;
+ numfolios = 0;
last_page_bytes = PAGE_ALIGN(end);
- index = start >> PAGE_CACHE_SHIFT;
+ index = start >> PAGE_SHIFT;
do {
- pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS);
- if (!pages[numpages]) {
- ret = -ENOMEM;
+ folios[numfolios] = __filemap_get_folio(mapping, index,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_NOFS);
+ if (IS_ERR(folios[numfolios])) {
+ ret = PTR_ERR(folios[numfolios]);
mlog_errno(ret);
+ folios[numfolios] = NULL;
goto out;
}
- numpages++;
- index++;
- } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
+ index = folio_next_index(folios[numfolios]);
+ numfolios++;
+ } while (index < (last_page_bytes >> PAGE_SHIFT));
out:
if (ret != 0) {
- if (pages)
- ocfs2_unlock_and_free_pages(pages, numpages);
- numpages = 0;
+ ocfs2_unlock_and_free_folios(folios, numfolios);
+ numfolios = 0;
}
- *num = numpages;
+ *num = numfolios;
return ret;
}
-static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
- struct page **pages, int *num)
+static int ocfs2_grab_eof_folios(struct inode *inode, loff_t start, loff_t end,
+ struct folio **folios, int *num)
{
struct super_block *sb = inode->i_sb;
BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
(end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
- return ocfs2_grab_pages(inode, start, end, pages, num);
+ return ocfs2_grab_folios(inode, start, end, folios, num);
}
/*
- * Zero the area past i_size but still within an allocated
- * cluster. This avoids exposing nonzero data on subsequent file
- * extends.
+ * Zero partial cluster for a hole punch or truncate. This avoids exposing
+ * nonzero data on subsequent file extends.
*
* We need to call this before i_size is updated on the inode because
- * otherwise block_write_full_page() will skip writeout of pages past
- * i_size. The new_i_size parameter is passed for this reason.
+ * otherwise block_write_full_folio() will skip writeout of pages past
+ * i_size.
*/
int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
u64 range_start, u64 range_end)
{
- int ret = 0, numpages;
- struct page **pages = NULL;
+ int ret = 0, numfolios;
+ struct folio **folios = NULL;
u64 phys;
unsigned int ext_flags;
struct super_block *sb = inode->i_sb;
@@ -6704,17 +6972,23 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
return 0;
- pages = kcalloc(ocfs2_pages_per_cluster(sb),
- sizeof(struct page *), GFP_NOFS);
- if (pages == NULL) {
+ /*
+ * Avoid zeroing folios fully beyond current i_size. It is pointless as
+ * underlying blocks of those folios should be already zeroed out and
+ * page writeback will skip them anyway.
+ */
+ range_end = min_t(u64, range_end, i_size_read(inode));
+ if (range_start >= range_end)
+ return 0;
+
+ folios = kcalloc(ocfs2_pages_per_cluster(sb),
+ sizeof(struct folio *), GFP_NOFS);
+ if (folios == NULL) {
ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
- if (range_start == range_end)
- goto out;
-
ret = ocfs2_extent_map_get_blocks(inode,
range_start >> sb->s_blocksize_bits,
&phys, NULL, &ext_flags);
@@ -6730,18 +7004,18 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN)
goto out;
- ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
- &numpages);
+ ret = ocfs2_grab_eof_folios(inode, range_start, range_end, folios,
+ &numfolios);
if (ret) {
mlog_errno(ret);
goto out;
}
- ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
- numpages, phys, handle);
+ ocfs2_zero_cluster_folios(inode, range_start, range_end, folios,
+ numfolios, phys, handle);
/*
- * Initiate writeout of the pages we zero'd here. We don't
+ * Initiate writeout of the folios we zero'd here. We don't
* wait on them - the truncate_inode_pages() call later will
* do that for us.
*/
@@ -6751,7 +7025,7 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
mlog_errno(ret);
out:
- kfree(pages);
+ kfree(folios);
return ret;
}
@@ -6804,29 +7078,22 @@ void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
int ocfs2_convert_inline_data_to_extents(struct inode *inode,
struct buffer_head *di_bh)
{
- int ret, i, has_data, num_pages = 0;
+ int ret, has_data, num_folios = 0;
+ int need_free = 0;
+ u32 bit_off, num;
handle_t *handle;
- u64 uninitialized_var(block);
+ u64 block;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
struct ocfs2_alloc_context *data_ac = NULL;
- struct page **pages = NULL;
- loff_t end = osb->s_clustersize;
+ struct folio *folio = NULL;
struct ocfs2_extent_tree et;
int did_quota = 0;
has_data = i_size_read(inode) ? 1 : 0;
if (has_data) {
- pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
- sizeof(struct page *), GFP_NOFS);
- if (pages == NULL) {
- ret = -ENOMEM;
- mlog_errno(ret);
- goto out;
- }
-
ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
if (ret) {
mlog_errno(ret);
@@ -6839,7 +7106,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
mlog_errno(ret);
- goto out_unlock;
+ goto out;
}
ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
@@ -6850,8 +7117,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
}
if (has_data) {
- u32 bit_off, num;
- unsigned int page_end;
+ unsigned int page_end = min_t(unsigned, PAGE_SIZE,
+ osb->s_clustersize);
u64 phys;
ret = dquot_alloc_space_nodirty(inode,
@@ -6860,7 +7127,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
goto out_commit;
did_quota = 1;
- data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
+ data_ac->ac_resv = &oi->ip_la_data_resv;
ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
&num);
@@ -6871,21 +7138,15 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
/*
* Save two copies, one for insert, and one that can
- * be changed by ocfs2_map_and_dirty_page() below.
+ * be changed by ocfs2_map_and_dirty_folio() below.
*/
block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
- /*
- * Non sparse file systems zero on extend, so no need
- * to do that now.
- */
- if (!ocfs2_sparse_alloc(osb) &&
- PAGE_CACHE_SIZE < osb->s_clustersize)
- end = PAGE_CACHE_SIZE;
-
- ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
+ ret = ocfs2_grab_eof_folios(inode, 0, page_end, &folio,
+ &num_folios);
if (ret) {
mlog_errno(ret);
+ need_free = 1;
goto out_commit;
}
@@ -6893,19 +7154,15 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
* This should populate the 1st page for us and mark
* it up to date.
*/
- ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
+ ret = ocfs2_read_inline_data(inode, folio, di_bh);
if (ret) {
mlog_errno(ret);
- goto out_commit;
+ need_free = 1;
+ goto out_unlock;
}
- page_end = PAGE_CACHE_SIZE;
- if (PAGE_CACHE_SIZE > osb->s_clustersize)
- page_end = osb->s_clustersize;
-
- for (i = 0; i < num_pages; i++)
- ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
- pages[i], i > 0, &phys);
+ ocfs2_map_and_dirty_folio(inode, handle, 0, page_end, folio, 0,
+ &phys);
}
spin_lock(&oi->ip_lock);
@@ -6913,6 +7170,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
spin_unlock(&oi->ip_lock);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_dinode_new_extent_list(inode, di);
ocfs2_journal_dirty(handle, di_bh);
@@ -6927,29 +7185,39 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL);
if (ret) {
mlog_errno(ret);
- goto out_commit;
+ need_free = 1;
+ goto out_unlock;
}
inode->i_blocks = ocfs2_inode_sector_count(inode);
}
+out_unlock:
+ if (folio)
+ ocfs2_unlock_and_free_folios(&folio, num_folios);
+
out_commit:
if (ret < 0 && did_quota)
dquot_free_space_nodirty(inode,
ocfs2_clusters_to_bytes(osb->sb, 1));
+ if (need_free) {
+ if (data_ac->ac_which == OCFS2_AC_USE_LOCAL)
+ ocfs2_free_local_alloc_bits(osb, handle, data_ac,
+ bit_off, num);
+ else
+ ocfs2_free_clusters(handle,
+ data_ac->ac_inode,
+ data_ac->ac_bh,
+ ocfs2_clusters_to_blocks(osb->sb, bit_off),
+ num);
+ }
+
ocfs2_commit_trans(osb, handle);
-out_unlock:
+out:
if (data_ac)
ocfs2_free_alloc_context(data_ac);
-
-out:
- if (pages) {
- ocfs2_unlock_and_free_pages(pages, num_pages);
- kfree(pages);
- }
-
return ret;
}
@@ -6974,6 +7242,7 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
u64 refcount_loc = le64_to_cpu(di->i_refcount_loc);
struct ocfs2_extent_tree et;
struct ocfs2_cached_dealloc_ctxt dealloc;
+ struct ocfs2_refcount_tree *ref_tree = NULL;
ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
ocfs2_init_dealloc_ctxt(&dealloc);
@@ -7047,15 +7316,23 @@ start:
* to check it up here before changing the tree.
*/
if (root_el->l_tree_depth && rec->e_int_clusters == 0) {
- ocfs2_error(inode->i_sb, "Inode %lu has an empty "
+ mlog(ML_ERROR, "Inode %lu has an empty "
"extent record, depth %u\n", inode->i_ino,
le16_to_cpu(root_el->l_tree_depth));
- status = -EROFS;
- goto bail;
+ status = ocfs2_remove_rightmost_empty_extent(osb,
+ &et, path, &dealloc);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ ocfs2_reinit_path(path, 1);
+ goto start;
+ } else {
+ trunc_cpos = le32_to_cpu(rec->e_cpos);
+ trunc_len = 0;
+ blkno = 0;
}
- trunc_cpos = le32_to_cpu(rec->e_cpos);
- trunc_len = 0;
- blkno = 0;
} else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) {
/*
* Truncate entire record.
@@ -7083,9 +7360,18 @@ start:
phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
+ if ((flags & OCFS2_EXT_REFCOUNTED) && trunc_len && !ref_tree) {
+ status = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
+ &ref_tree, NULL);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
status = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
phys_cpos, trunc_len, flags, &dealloc,
- refcount_loc);
+ refcount_loc, true);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -7100,6 +7386,8 @@ start:
goto start;
bail:
+ if (ref_tree)
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
ocfs2_schedule_truncate_log_flush(osb, 1);
@@ -7123,17 +7411,20 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
struct ocfs2_inline_data *idata = &di->id2.i_data;
+ /* No need to punch hole beyond i_size. */
+ if (start >= i_size_read(inode))
+ return 0;
+
if (end > i_size_read(inode))
end = i_size_read(inode);
- BUG_ON(start >= end);
+ BUG_ON(start > end);
if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
!ocfs2_supports_inline_data(osb)) {
ocfs2_error(inode->i_sb,
- "Inline data flags for inode %llu don't agree! "
- "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
+ "Inline data flags for inode %llu don't agree! Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
le16_to_cpu(di->i_dyn_features),
OCFS2_I(inode)->ip_dyn_features,
@@ -7162,7 +7453,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
/*
* No need to worry about the data page here - it's been
* truncated already and inline data doesn't need it for
- * pushing zero's to disk, so we'll let readpage pick it up
+ * pushing zero's to disk, so we'll let read_folio pick it up
* later.
*/
if (trunc) {
@@ -7171,11 +7462,12 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
}
inode->i_blocks = ocfs2_inode_sector_count(inode);
- inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
- di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
- di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(inode));
+ di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_journal_dirty(handle, di_bh);
out_commit:
@@ -7187,13 +7479,24 @@ out:
static int ocfs2_trim_extent(struct super_block *sb,
struct ocfs2_group_desc *gd,
- u32 start, u32 count)
+ u64 group, u32 start, u32 count)
{
u64 discard, bcount;
+ struct ocfs2_super *osb = OCFS2_SB(sb);
bcount = ocfs2_clusters_to_blocks(sb, count);
- discard = le64_to_cpu(gd->bg_blkno) +
- ocfs2_clusters_to_blocks(sb, start);
+ discard = ocfs2_clusters_to_blocks(sb, start);
+
+ /*
+ * For the first cluster group, the gd->bg_blkno is not at the start
+ * of the group, but at an offset from the start. If we add it while
+ * calculating discard for first group, we will wrongly start fstrim a
+ * few blocks after the desried start block and the range can cross
+ * over into the next cluster group. So, add it only if this is not
+ * the first cluster group.
+ */
+ if (group != osb->first_cluster_group_blkno)
+ discard += le64_to_cpu(gd->bg_blkno);
trace_ocfs2_trim_extent(sb, (unsigned long long)discard, bcount);
@@ -7201,7 +7504,7 @@ static int ocfs2_trim_extent(struct super_block *sb,
}
static int ocfs2_trim_group(struct super_block *sb,
- struct ocfs2_group_desc *gd,
+ struct ocfs2_group_desc *gd, u64 group,
u32 start, u32 max, u32 minbits)
{
int ret = 0, count = 0, next;
@@ -7220,7 +7523,7 @@ static int ocfs2_trim_group(struct super_block *sb,
next = ocfs2_find_next_bit(bitmap, max, start);
if ((next - start) >= minbits) {
- ret = ocfs2_trim_extent(sb, gd,
+ ret = ocfs2_trim_extent(sb, gd, group,
start, next - start);
if (ret < 0) {
mlog_errno(ret);
@@ -7245,10 +7548,11 @@ static int ocfs2_trim_group(struct super_block *sb,
return count;
}
-int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
+static
+int ocfs2_trim_mainbm(struct super_block *sb, struct fstrim_range *range)
{
struct ocfs2_super *osb = OCFS2_SB(sb);
- u64 start, len, trimmed, first_group, last_group, group;
+ u64 start, len, trimmed = 0, first_group, last_group = 0, group = 0;
int ret, cnt;
u32 first_bit, last_bit, minlen;
struct buffer_head *main_bm_bh = NULL;
@@ -7260,16 +7564,13 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
start = range->start >> osb->s_clustersize_bits;
len = range->len >> osb->s_clustersize_bits;
minlen = range->minlen >> osb->s_clustersize_bits;
- trimmed = 0;
- if (!len) {
- range->len = 0;
- return 0;
- }
-
- if (minlen >= osb->bitmap_cpg)
+ if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
return -EINVAL;
+ trace_ocfs2_trim_mainbm(start, len, minlen);
+
+next_group:
main_bm_inode = ocfs2_get_system_file_inode(osb,
GLOBAL_BITMAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
@@ -7279,7 +7580,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
goto out;
}
- mutex_lock(&main_bm_inode->i_mutex);
+ inode_lock(main_bm_inode);
ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
if (ret < 0) {
@@ -7288,26 +7589,34 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
}
main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
- if (start >= le32_to_cpu(main_bm->i_clusters)) {
- ret = -EINVAL;
- goto out_unlock;
- }
-
- if (start + len > le32_to_cpu(main_bm->i_clusters))
- len = le32_to_cpu(main_bm->i_clusters) - start;
+ /*
+ * Do some check before trim the first group.
+ */
+ if (!group) {
+ if (start >= le32_to_cpu(main_bm->i_clusters)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
- trace_ocfs2_trim_fs(start, len, minlen);
+ if (start + len > le32_to_cpu(main_bm->i_clusters))
+ len = le32_to_cpu(main_bm->i_clusters) - start;
- /* Determine first and last group to examine based on start and len */
- first_group = ocfs2_which_cluster_group(main_bm_inode, start);
- if (first_group == osb->first_cluster_group_blkno)
- first_bit = start;
- else
- first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
- last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
- last_bit = osb->bitmap_cpg;
+ /*
+ * Determine first and last group to examine based on
+ * start and len
+ */
+ first_group = ocfs2_which_cluster_group(main_bm_inode, start);
+ if (first_group == osb->first_cluster_group_blkno)
+ first_bit = start;
+ else
+ first_bit = start - ocfs2_blocks_to_clusters(sb,
+ first_group);
+ last_group = ocfs2_which_cluster_group(main_bm_inode,
+ start + len - 1);
+ group = first_group;
+ }
- for (group = first_group; group <= last_group;) {
+ do {
if (first_bit + len >= osb->bitmap_cpg)
last_bit = osb->bitmap_cpg;
else
@@ -7322,7 +7631,8 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
}
gd = (struct ocfs2_group_desc *)gd_bh->b_data;
- cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen);
+ cnt = ocfs2_trim_group(sb, gd, group,
+ first_bit, last_bit, minlen);
brelse(gd_bh);
gd_bh = NULL;
if (cnt < 0) {
@@ -7338,14 +7648,83 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
else
group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
- }
- range->len = trimmed * sb->s_blocksize;
+ } while (0);
+
out_unlock:
ocfs2_inode_unlock(main_bm_inode, 0);
brelse(main_bm_bh);
+ main_bm_bh = NULL;
out_mutex:
- mutex_unlock(&main_bm_inode->i_mutex);
+ inode_unlock(main_bm_inode);
iput(main_bm_inode);
+
+ /*
+ * If all the groups trim are not done or failed, but we should release
+ * main_bm related locks for avoiding the current IO starve, then go to
+ * trim the next group
+ */
+ if (ret >= 0 && group <= last_group) {
+ cond_resched();
+ goto next_group;
+ }
+out:
+ range->len = trimmed * osb->s_clustersize;
+ return ret;
+}
+
+int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+ int ret;
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+ struct ocfs2_trim_fs_info info, *pinfo = NULL;
+
+ ocfs2_trim_fs_lock_res_init(osb);
+
+ trace_ocfs2_trim_fs(range->start, range->len, range->minlen);
+
+ ret = ocfs2_trim_fs_lock(osb, NULL, 1);
+ if (ret < 0) {
+ if (ret != -EAGAIN) {
+ mlog_errno(ret);
+ ocfs2_trim_fs_lock_res_uninit(osb);
+ return ret;
+ }
+
+ mlog(ML_NOTICE, "Wait for trim on device (%s) to "
+ "finish, which is running from another node.\n",
+ osb->dev_str);
+ ret = ocfs2_trim_fs_lock(osb, &info, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ ocfs2_trim_fs_lock_res_uninit(osb);
+ return ret;
+ }
+
+ if (info.tf_valid && info.tf_success &&
+ info.tf_start == range->start &&
+ info.tf_len == range->len &&
+ info.tf_minlen == range->minlen) {
+ /* Avoid sending duplicated trim to a shared device */
+ mlog(ML_NOTICE, "The same trim on device (%s) was "
+ "just done from node (%u), return.\n",
+ osb->dev_str, info.tf_nodenum);
+ range->len = info.tf_trimlen;
+ goto out;
+ }
+ }
+
+ info.tf_nodenum = osb->node_num;
+ info.tf_start = range->start;
+ info.tf_len = range->len;
+ info.tf_minlen = range->minlen;
+
+ ret = ocfs2_trim_mainbm(sb, range);
+
+ info.tf_trimlen = range->len;
+ info.tf_success = (ret < 0 ? 0 : 1);
+ pinfo = &info;
out:
+ ocfs2_trim_fs_unlock(osb, pinfo);
+ ocfs2_trim_fs_lock_res_uninit(osb);
return ret;
}
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index ca381c584127..1c0c83362904 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -1,26 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* alloc.h
*
* Function prototypes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_ALLOC_H
@@ -54,13 +38,14 @@
*/
struct ocfs2_extent_tree_operations;
struct ocfs2_extent_tree {
- struct ocfs2_extent_tree_operations *et_ops;
+ const struct ocfs2_extent_tree_operations *et_ops;
struct buffer_head *et_root_bh;
struct ocfs2_extent_list *et_root_el;
struct ocfs2_caching_info *et_ci;
ocfs2_journal_access_func et_root_journal_access;
void *et_object;
unsigned int et_max_leaf_clusters;
+ struct ocfs2_cached_dealloc_ctxt *et_dealloc;
};
/*
@@ -142,10 +127,9 @@ int ocfs2_remove_btree_range(struct inode *inode,
struct ocfs2_extent_tree *et,
u32 cpos, u32 phys_cpos, u32 len, int flags,
struct ocfs2_cached_dealloc_ctxt *dealloc,
- u64 refcount_loc);
+ u64 refcount_loc, bool refcount_tree_locked);
-int ocfs2_num_free_extents(struct ocfs2_super *osb,
- struct ocfs2_extent_tree *et);
+int ocfs2_num_free_extents(struct ocfs2_extent_tree *et);
/*
* how many new metadata chunks would an allocation need at maximum?
@@ -188,6 +172,8 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
u64 start_blk,
unsigned int num_clusters);
int __ocfs2_flush_truncate_log(struct ocfs2_super *osb);
+int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
+ unsigned int needed);
/*
* Process local structure which describes the block unlinks done
@@ -268,11 +254,9 @@ static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
return !rec->e_leaf_clusters;
}
-int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
- struct page **pages, int *num);
-void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
- unsigned int from, unsigned int to,
- struct page *page, int zero, u64 *phys);
+void ocfs2_map_and_dirty_folio(struct inode *inode, handle_t *handle,
+ size_t from, size_t to, struct folio *folio, int zero,
+ u64 *phys);
/*
* Structures which describe a path through a btree, and functions to
* manipulate them.
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 79736a28d84f..76c86f1c2b1c 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1,22 +1,6 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
@@ -25,9 +9,11 @@
#include <linux/pagemap.h>
#include <asm/byteorder.h>
#include <linux/swap.h>
-#include <linux/pipe_fs_i.h>
#include <linux/mpage.h>
#include <linux/quotaops.h>
+#include <linux/blkdev.h>
+#include <linux/uio.h>
+#include <linux/mm.h>
#include <cluster/masklog.h>
@@ -47,6 +33,9 @@
#include "ocfs2_trace.h"
#include "buffer_head_io.h"
+#include "dir.h"
+#include "namei.h"
+#include "sysfile.h"
static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
@@ -57,7 +46,6 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh = NULL;
struct buffer_head *buffer_cache_bh = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- void *kaddr;
trace_ocfs2_symlink_get_block(
(unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -80,6 +68,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
le32_to_cpu(fe->i_clusters))) {
+ err = -ENOMEM;
mlog(ML_ERROR, "block offset is outside the allocated size: "
"%llu\n", (unsigned long long)iblock);
goto bail;
@@ -92,6 +81,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
iblock;
buffer_cache_bh = sb_getblk(osb->sb, blkno);
if (!buffer_cache_bh) {
+ err = -ENOMEM;
mlog(ML_ERROR, "couldn't getblock for symlink!\n");
goto bail;
}
@@ -100,17 +90,11 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
* could've happened. Since we've got a reference on
* the bh, even if it commits while we're doing the
* copy, the data is still good. */
- if (buffer_jbd(buffer_cache_bh)
- && ocfs2_inode_is_new(inode)) {
- kaddr = kmap_atomic(bh_result->b_page);
- if (!kaddr) {
- mlog(ML_ERROR, "couldn't kmap!\n");
- goto bail;
- }
- memcpy(kaddr + (bh_result->b_size * iblock),
- buffer_cache_bh->b_data,
- bh_result->b_size);
- kunmap_atomic(kaddr);
+ if (buffer_jbd(buffer_cache_bh) && ocfs2_inode_is_new(inode)) {
+ memcpy_to_folio(bh_result->b_folio,
+ bh_result->b_size * iblock,
+ buffer_cache_bh->b_data,
+ bh_result->b_size);
set_buffer_uptodate(bh_result);
}
brelse(buffer_cache_bh);
@@ -127,6 +111,19 @@ bail:
return err;
}
+static int ocfs2_lock_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int ret = 0;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+ down_read(&oi->ip_alloc_sem);
+ ret = ocfs2_get_block(inode, iblock, bh_result, create);
+ up_read(&oi->ip_alloc_sem);
+
+ return ret;
+}
+
int ocfs2_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
@@ -152,9 +149,8 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &count,
&ext_flags);
if (err) {
- mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
- "%llu, NULL)\n", err, inode, (unsigned long long)iblock,
- (unsigned long long)p_blkno);
+ mlog(ML_ERROR, "get_blocks() failed, inode: 0x%p, "
+ "block: %llu\n", inode, (unsigned long long)iblock);
goto bail;
}
@@ -212,49 +208,41 @@ bail:
return err;
}
-int ocfs2_read_inline_data(struct inode *inode, struct page *page,
+int ocfs2_read_inline_data(struct inode *inode, struct folio *folio,
struct buffer_head *di_bh)
{
- void *kaddr;
loff_t size;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) {
- ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag",
+ ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
return -EROFS;
}
size = i_size_read(inode);
- if (size > PAGE_CACHE_SIZE ||
+ if (size > folio_size(folio) ||
size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) {
ocfs2_error(inode->i_sb,
- "Inode %llu has with inline data has bad size: %Lu",
+ "Inode %llu has with inline data has bad size: %Lu\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)size);
return -EROFS;
}
- kaddr = kmap_atomic(page);
- if (size)
- memcpy(kaddr, di->id2.i_data.id_data, size);
- /* Clear the remaining part of the page */
- memset(kaddr + size, 0, PAGE_CACHE_SIZE - size);
- flush_dcache_page(page);
- kunmap_atomic(kaddr);
-
- SetPageUptodate(page);
+ folio_fill_tail(folio, 0, di->id2.i_data.id_data, size);
+ folio_mark_uptodate(folio);
return 0;
}
-static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
+static int ocfs2_readpage_inline(struct inode *inode, struct folio *folio)
{
int ret;
struct buffer_head *di_bh = NULL;
- BUG_ON(!PageLocked(page));
+ BUG_ON(!folio_test_locked(folio));
BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
ret = ocfs2_read_inode_block(inode, &di_bh);
@@ -263,25 +251,24 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
goto out;
}
- ret = ocfs2_read_inline_data(inode, page, di_bh);
+ ret = ocfs2_read_inline_data(inode, folio, di_bh);
out:
- unlock_page(page);
+ folio_unlock(folio);
brelse(di_bh);
return ret;
}
-static int ocfs2_readpage(struct file *file, struct page *page)
+static int ocfs2_read_folio(struct file *file, struct folio *folio)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
+ loff_t start = folio_pos(folio);
int ret, unlock = 1;
- trace_ocfs2_readpage((unsigned long long)oi->ip_blkno,
- (page ? page->index : 0));
+ trace_ocfs2_readpage((unsigned long long)oi->ip_blkno, folio->index);
- ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page);
+ ret = ocfs2_inode_lock_with_folio(inode, NULL, 0, folio);
if (ret != 0) {
if (ret == AOP_TRUNCATED_PAGE)
unlock = 0;
@@ -291,11 +278,11 @@ static int ocfs2_readpage(struct file *file, struct page *page)
if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
/*
- * Unlock the page and cycle ip_alloc_sem so that we don't
+ * Unlock the folio and cycle ip_alloc_sem so that we don't
* busyloop waiting for ip_alloc_sem to unlock
*/
ret = AOP_TRUNCATED_PAGE;
- unlock_page(page);
+ folio_unlock(folio);
unlock = 0;
down_read(&oi->ip_alloc_sem);
up_read(&oi->ip_alloc_sem);
@@ -303,35 +290,35 @@ static int ocfs2_readpage(struct file *file, struct page *page)
}
/*
- * i_size might have just been updated as we grabed the meta lock. We
+ * i_size might have just been updated as we grabbed the meta lock. We
* might now be discovering a truncate that hit on another node.
- * block_read_full_page->get_block freaks out if it is asked to read
+ * block_read_full_folio->get_block freaks out if it is asked to read
* beyond the end of a file, so we check here. Callers
* (generic_file_read, vm_ops->fault) are clever enough to check i_size
- * and notice that the page they just read isn't needed.
+ * and notice that the folio they just read isn't needed.
*
* XXX sys_readahead() seems to get that wrong?
*/
if (start >= i_size_read(inode)) {
- zero_user(page, 0, PAGE_SIZE);
- SetPageUptodate(page);
+ folio_zero_segment(folio, 0, folio_size(folio));
+ folio_mark_uptodate(folio);
ret = 0;
goto out_alloc;
}
if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
- ret = ocfs2_readpage_inline(inode, page);
+ ret = ocfs2_readpage_inline(inode, folio);
else
- ret = block_read_full_page(page, ocfs2_get_block);
+ ret = block_read_full_folio(folio, ocfs2_get_block);
unlock = 0;
out_alloc:
- up_read(&OCFS2_I(inode)->ip_alloc_sem);
+ up_read(&oi->ip_alloc_sem);
out_inode_unlock:
ocfs2_inode_unlock(inode, 0);
out:
if (unlock)
- unlock_page(page);
+ folio_unlock(folio);
return ret;
}
@@ -344,14 +331,11 @@ out:
* grow out to a tree. If need be, detecting boundary extents could
* trivially be added in a future version of ocfs2_get_block().
*/
-static int ocfs2_readpages(struct file *filp, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+static void ocfs2_readahead(struct readahead_control *rac)
{
- int ret, err = -EIO;
- struct inode *inode = mapping->host;
+ int ret;
+ struct inode *inode = rac->mapping->host;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- loff_t start;
- struct page *last;
/*
* Use the nonblocking flag for the dlm code to avoid page
@@ -359,56 +343,48 @@ static int ocfs2_readpages(struct file *filp, struct address_space *mapping,
*/
ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK);
if (ret)
- return err;
+ return;
- if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
- ocfs2_inode_unlock(inode, 0);
- return err;
- }
+ if (down_read_trylock(&oi->ip_alloc_sem) == 0)
+ goto out_unlock;
/*
* Don't bother with inline-data. There isn't anything
* to read-ahead in that case anyway...
*/
if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
- goto out_unlock;
+ goto out_up;
/*
* Check whether a remote node truncated this file - we just
* drop out in that case as it's not worth handling here.
*/
- last = list_entry(pages->prev, struct page, lru);
- start = (loff_t)last->index << PAGE_CACHE_SHIFT;
- if (start >= i_size_read(inode))
- goto out_unlock;
+ if (readahead_pos(rac) >= i_size_read(inode))
+ goto out_up;
- err = mpage_readpages(mapping, pages, nr_pages, ocfs2_get_block);
+ mpage_readahead(rac, ocfs2_get_block);
-out_unlock:
+out_up:
up_read(&oi->ip_alloc_sem);
+out_unlock:
ocfs2_inode_unlock(inode, 0);
-
- return err;
}
/* Note: Because we don't support holes, our allocation has
* already happened (allocation writes zeros to the file data)
* so we don't have to worry about ordered writes in
- * ocfs2_writepage.
+ * ocfs2_writepages.
*
- * ->writepage is called during the process of invalidating the page cache
+ * ->writepages is called during the process of invalidating the page cache
* during blocked lock processing. It can't block on any cluster locks
* to during block mapping. It's relying on the fact that the block
* mapping can't have disappeared under the dirty pages that it is
* being asked to write back.
*/
-static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
+static int ocfs2_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- trace_ocfs2_writepage(
- (unsigned long long)OCFS2_I(page->mapping->host)->ip_blkno,
- page->index);
-
- return block_write_full_page(page, ocfs2_get_block, wbc);
+ return mpage_writepages(mapping, wbc, ocfs2_get_block);
}
/* Taken from ext3. We don't necessarily need the full blown
@@ -457,6 +433,15 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
trace_ocfs2_bmap((unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)block);
+ /*
+ * The swap code (ab-)uses ->bmap to get a block mapping and then
+ * bypasseѕ the file system for actual I/O. We really can't allow
+ * that on refcounted inodes, so we have to skip out here. And yes,
+ * 0 is the magic code for a bmap error..
+ */
+ if (ocfs2_is_refcount_inode(inode))
+ return 0;
+
/* We don't need to lock journal system files, since they aren't
* accessed concurrently from multiple nodes.
*/
@@ -492,158 +477,11 @@ bail:
return status;
}
-/*
- * TODO: Make this into a generic get_blocks function.
- *
- * From do_direct_io in direct-io.c:
- * "So what we do is to permit the ->get_blocks function to populate
- * bh.b_size with the size of IO which is permitted at this offset and
- * this i_blkbits."
- *
- * This function is called directly from get_more_blocks in direct-io.c.
- *
- * called like this: dio->get_blocks(dio->inode, fs_startblk,
- * fs_count, map_bh, dio->rw == WRITE);
- *
- * Note that we never bother to allocate blocks here, and thus ignore the
- * create argument.
- */
-static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
+static bool ocfs2_release_folio(struct folio *folio, gfp_t wait)
{
- int ret;
- u64 p_blkno, inode_blocks, contig_blocks;
- unsigned int ext_flags;
- unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
- unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
-
- /* This function won't even be called if the request isn't all
- * nicely aligned and of the right size, so there's no need
- * for us to check any of that. */
-
- inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
-
- /* This figures out the size of the next contiguous block, and
- * our logical offset */
- ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
- &contig_blocks, &ext_flags);
- if (ret) {
- mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
- (unsigned long long)iblock);
- ret = -EIO;
- goto bail;
- }
-
- /* We should already CoW the refcounted extent in case of create. */
- BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
-
- /*
- * get_more_blocks() expects us to describe a hole by clearing
- * the mapped bit on bh_result().
- *
- * Consider an unwritten extent as a hole.
- */
- if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
- map_bh(bh_result, inode->i_sb, p_blkno);
- else
- clear_buffer_mapped(bh_result);
-
- /* make sure we don't map more than max_blocks blocks here as
- that's all the kernel will handle at this point. */
- if (max_blocks < contig_blocks)
- contig_blocks = max_blocks;
- bh_result->b_size = contig_blocks << blocksize_bits;
-bail:
- return ret;
-}
-
-/*
- * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
- * particularly interested in the aio/dio case. We use the rw_lock DLM lock
- * to protect io on one node from truncation on another.
- */
-static void ocfs2_dio_end_io(struct kiocb *iocb,
- loff_t offset,
- ssize_t bytes,
- void *private,
- int ret,
- bool is_async)
-{
- struct inode *inode = file_inode(iocb->ki_filp);
- int level;
- wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
-
- /* this io's submitter should not have unlocked this before we could */
- BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
-
- if (ocfs2_iocb_is_sem_locked(iocb))
- ocfs2_iocb_clear_sem_locked(iocb);
-
- if (ocfs2_iocb_is_unaligned_aio(iocb)) {
- ocfs2_iocb_clear_unaligned_aio(iocb);
-
- if (atomic_dec_and_test(&OCFS2_I(inode)->ip_unaligned_aio) &&
- waitqueue_active(wq)) {
- wake_up_all(wq);
- }
- }
-
- ocfs2_iocb_clear_rw_locked(iocb);
-
- level = ocfs2_iocb_rw_locked_level(iocb);
- ocfs2_rw_unlock(inode, level);
-
- inode_dio_done(inode);
- if (is_async)
- aio_complete(iocb, ret, 0);
-}
-
-/*
- * ocfs2_invalidatepage() and ocfs2_releasepage() are shamelessly stolen
- * from ext3. PageChecked() bits have been removed as OCFS2 does not
- * do journalled data.
- */
-static void ocfs2_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
-{
- journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
-
- jbd2_journal_invalidatepage(journal, page, offset, length);
-}
-
-static int ocfs2_releasepage(struct page *page, gfp_t wait)
-{
- journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
-
- if (!page_has_buffers(page))
- return 0;
- return jbd2_journal_try_to_free_buffers(journal, page, wait);
-}
-
-static ssize_t ocfs2_direct_IO(int rw,
- struct kiocb *iocb,
- const struct iovec *iov,
- loff_t offset,
- unsigned long nr_segs)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file_inode(file)->i_mapping->host;
-
- /*
- * Fallback to buffered I/O if we see an inode without
- * extents.
- */
- if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
- return 0;
-
- /* Fallback to buffered I/O if we are appending. */
- if (i_size_read(inode) <= offset)
- return 0;
-
- return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
- iov, offset, nr_segs,
- ocfs2_direct_IO_get_blocks,
- ocfs2_dio_end_io, NULL, 0);
+ if (!folio_buffers(folio))
+ return false;
+ return try_to_free_buffers(folio);
}
static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
@@ -651,12 +489,12 @@ static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
unsigned int *start,
unsigned int *end)
{
- unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE;
+ unsigned int cluster_start = 0, cluster_end = PAGE_SIZE;
- if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) {
+ if (unlikely(PAGE_SHIFT > osb->s_clustersize_bits)) {
unsigned int cpp;
- cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits);
+ cpp = 1 << (PAGE_SHIFT - osb->s_clustersize_bits);
cluster_start = cpos % cpp;
cluster_start = cluster_start << osb->s_clustersize_bits;
@@ -681,7 +519,7 @@ static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
*
* from == to == 0 is code for "zero the entire cluster region"
*/
-static void ocfs2_clear_page_regions(struct page *page,
+static void ocfs2_clear_folio_regions(struct folio *folio,
struct ocfs2_super *osb, u32 cpos,
unsigned from, unsigned to)
{
@@ -690,7 +528,7 @@ static void ocfs2_clear_page_regions(struct page *page,
ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end);
- kaddr = kmap_atomic(page);
+ kaddr = kmap_local_folio(folio, 0);
if (from || to) {
if (from > cluster_start)
@@ -701,20 +539,20 @@ static void ocfs2_clear_page_regions(struct page *page,
memset(kaddr + cluster_start, 0, cluster_end - cluster_start);
}
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
}
/*
* Nonsparse file systems fully allocate before we get to the write
* code. This prevents ocfs2_write() from tagging the write as an
- * allocating one, which means ocfs2_map_page_blocks() might try to
+ * allocating one, which means ocfs2_map_folio_blocks() might try to
* read-in the blocks at the tail of our file. Avoid reading them by
* testing i_size against each block offset.
*/
-static int ocfs2_should_read_blk(struct inode *inode, struct page *page,
+static int ocfs2_should_read_blk(struct inode *inode, struct folio *folio,
unsigned int block_start)
{
- u64 offset = page_offset(page) + block_start;
+ u64 offset = folio_pos(folio) + block_start;
if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
return 1;
@@ -732,19 +570,19 @@ static int ocfs2_should_read_blk(struct inode *inode, struct page *page,
*
* This will also skip zeroing, which is handled externally.
*/
-int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
+int ocfs2_map_folio_blocks(struct folio *folio, u64 *p_blkno,
struct inode *inode, unsigned int from,
unsigned int to, int new)
{
int ret = 0;
struct buffer_head *head, *bh, *wait[2], **wait_bh = wait;
unsigned int block_end, block_start;
- unsigned int bsize = 1 << inode->i_blkbits;
+ unsigned int bsize = i_blocksize(inode);
- if (!page_has_buffers(page))
- create_empty_buffers(page, bsize, 0);
+ head = folio_buffers(folio);
+ if (!head)
+ head = create_empty_buffers(folio, bsize, 0);
- head = page_buffers(page);
for (bh = head, block_start = 0; bh != head || !block_start;
bh = bh->b_this_page, block_start += bsize) {
block_end = block_start + bsize;
@@ -756,7 +594,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
* they may belong to unallocated clusters.
*/
if (block_start >= to || block_end <= from) {
- if (PageUptodate(page))
+ if (folio_test_uptodate(folio))
set_buffer_uptodate(bh);
continue;
}
@@ -770,17 +608,16 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
if (!buffer_mapped(bh)) {
map_bh(bh, inode->i_sb, *p_blkno);
- unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+ clean_bdev_bh_alias(bh);
}
- if (PageUptodate(page)) {
- if (!buffer_uptodate(bh))
- set_buffer_uptodate(bh);
+ if (folio_test_uptodate(folio)) {
+ set_buffer_uptodate(bh);
} else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
!buffer_new(bh) &&
- ocfs2_should_read_blk(inode, page, block_start) &&
+ ocfs2_should_read_blk(inode, folio, block_start) &&
(block_start < from || block_end > to)) {
- ll_rw_block(READ, 1, &bh);
+ bh_read_nowait(bh, 0);
*wait_bh++=bh;
}
@@ -812,7 +649,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
if (block_start >= to)
break;
- zero_user(page, block_start, bh->b_size);
+ folio_zero_range(folio, block_start, bh->b_size);
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
@@ -824,13 +661,20 @@ next_bh:
return ret;
}
-#if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE)
+#if (PAGE_SIZE >= OCFS2_MAX_CLUSTERSIZE)
#define OCFS2_MAX_CTXT_PAGES 1
#else
-#define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE)
+#define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_SIZE)
#endif
-#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
+#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_SIZE / OCFS2_MIN_CLUSTERSIZE)
+
+struct ocfs2_unwritten_extent {
+ struct list_head ue_node;
+ struct list_head ue_ip_node;
+ u32 ue_cpos;
+ u32 ue_phys;
+};
/*
* Describe the state of a single cluster to be written to.
@@ -843,7 +687,7 @@ struct ocfs2_write_cluster_desc {
* filled.
*/
unsigned c_new;
- unsigned c_unwritten;
+ unsigned c_clear_unwritten;
unsigned c_needs_zero;
};
@@ -855,6 +699,9 @@ struct ocfs2_write_ctxt {
/* First cluster allocated in a nonsparse extend */
u32 w_first_new_cpos;
+ /* Type of caller. Must be one of buffer, mmap, direct. */
+ ocfs2_write_type_t w_type;
+
struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
/*
@@ -866,24 +713,24 @@ struct ocfs2_write_ctxt {
unsigned int w_large_pages;
/*
- * Pages involved in this write.
+ * Folios involved in this write.
*
- * w_target_page is the page being written to by the user.
+ * w_target_folio is the folio being written to by the user.
*
- * w_pages is an array of pages which always contains
- * w_target_page, and in the case of an allocating write with
+ * w_folios is an array of folios which always contains
+ * w_target_folio, and in the case of an allocating write with
* page_size < cluster size, it will contain zero'd and mapped
- * pages adjacent to w_target_page which need to be written
+ * pages adjacent to w_target_folio which need to be written
* out in so that future reads from that region will get
* zero's.
*/
- unsigned int w_num_pages;
- struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
- struct page *w_target_page;
+ unsigned int w_num_folios;
+ struct folio *w_folios[OCFS2_MAX_CTXT_PAGES];
+ struct folio *w_target_folio;
/*
* w_target_locked is used for page_mkwrite path indicating no unlocking
- * against w_target_page in ocfs2_write_end_nolock.
+ * against w_target_folio in ocfs2_write_end_nolock.
*/
unsigned int w_target_locked:1;
@@ -903,50 +750,75 @@ struct ocfs2_write_ctxt {
struct buffer_head *w_di_bh;
struct ocfs2_cached_dealloc_ctxt w_dealloc;
+
+ struct list_head w_unwritten_list;
+ unsigned int w_unwritten_count;
};
-void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
+void ocfs2_unlock_and_free_folios(struct folio **folios, int num_folios)
{
int i;
- for(i = 0; i < num_pages; i++) {
- if (pages[i]) {
- unlock_page(pages[i]);
- mark_page_accessed(pages[i]);
- page_cache_release(pages[i]);
- }
+ for(i = 0; i < num_folios; i++) {
+ if (!folios[i])
+ continue;
+ folio_unlock(folios[i]);
+ folio_mark_accessed(folios[i]);
+ folio_put(folios[i]);
}
}
-static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
+static void ocfs2_unlock_folios(struct ocfs2_write_ctxt *wc)
{
int i;
/*
* w_target_locked is only set to true in the page_mkwrite() case.
* The intent is to allow us to lock the target page from write_begin()
- * to write_end(). The caller must hold a ref on w_target_page.
+ * to write_end(). The caller must hold a ref on w_target_folio.
*/
if (wc->w_target_locked) {
- BUG_ON(!wc->w_target_page);
- for (i = 0; i < wc->w_num_pages; i++) {
- if (wc->w_target_page == wc->w_pages[i]) {
- wc->w_pages[i] = NULL;
+ BUG_ON(!wc->w_target_folio);
+ for (i = 0; i < wc->w_num_folios; i++) {
+ if (wc->w_target_folio == wc->w_folios[i]) {
+ wc->w_folios[i] = NULL;
break;
}
}
- mark_page_accessed(wc->w_target_page);
- page_cache_release(wc->w_target_page);
+ folio_mark_accessed(wc->w_target_folio);
+ folio_put(wc->w_target_folio);
}
- ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
+ ocfs2_unlock_and_free_folios(wc->w_folios, wc->w_num_folios);
+}
+
+static void ocfs2_free_unwritten_list(struct inode *inode,
+ struct list_head *head)
+{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_unwritten_extent *ue = NULL, *tmp = NULL;
+ list_for_each_entry_safe(ue, tmp, head, ue_node) {
+ list_del(&ue->ue_node);
+ spin_lock(&oi->ip_lock);
+ list_del(&ue->ue_ip_node);
+ spin_unlock(&oi->ip_lock);
+ kfree(ue);
+ }
+}
+
+static void ocfs2_free_write_ctxt(struct inode *inode,
+ struct ocfs2_write_ctxt *wc)
+{
+ ocfs2_free_unwritten_list(inode, &wc->w_unwritten_list);
+ ocfs2_unlock_folios(wc);
brelse(wc->w_di_bh);
kfree(wc);
}
static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
struct ocfs2_super *osb, loff_t pos,
- unsigned len, struct buffer_head *di_bh)
+ unsigned len, ocfs2_write_type_t type,
+ struct buffer_head *di_bh)
{
u32 cend;
struct ocfs2_write_ctxt *wc;
@@ -961,13 +833,15 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
wc->w_clen = cend - wc->w_cpos + 1;
get_bh(di_bh);
wc->w_di_bh = di_bh;
+ wc->w_type = type;
- if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
+ if (unlikely(PAGE_SHIFT > osb->s_clustersize_bits))
wc->w_large_pages = 1;
else
wc->w_large_pages = 0;
ocfs2_init_dealloc_ctxt(&wc->w_dealloc);
+ INIT_LIST_HEAD(&wc->w_unwritten_list);
*wcp = wc;
@@ -979,29 +853,30 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
* and dirty so they'll be written out (in order to prevent uninitialised
* block data from leaking). And clear the new bit.
*/
-static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to)
+static void ocfs2_zero_new_buffers(struct folio *folio, size_t from, size_t to)
{
unsigned int block_start, block_end;
struct buffer_head *head, *bh;
- BUG_ON(!PageLocked(page));
- if (!page_has_buffers(page))
+ BUG_ON(!folio_test_locked(folio));
+ head = folio_buffers(folio);
+ if (!head)
return;
- bh = head = page_buffers(page);
+ bh = head;
block_start = 0;
do {
block_end = block_start + bh->b_size;
if (buffer_new(bh)) {
if (block_end > from && block_start < to) {
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
unsigned start, end;
start = max(from, block_start);
end = min(to, block_end);
- zero_user_segment(page, start, end);
+ folio_zero_segment(folio, start, end);
set_buffer_uptodate(bh);
}
@@ -1024,29 +899,28 @@ static void ocfs2_write_failure(struct inode *inode,
loff_t user_pos, unsigned user_len)
{
int i;
- unsigned from = user_pos & (PAGE_CACHE_SIZE - 1),
+ unsigned from = user_pos & (PAGE_SIZE - 1),
to = user_pos + user_len;
- struct page *tmppage;
- ocfs2_zero_new_buffers(wc->w_target_page, from, to);
+ if (wc->w_target_folio)
+ ocfs2_zero_new_buffers(wc->w_target_folio, from, to);
- for(i = 0; i < wc->w_num_pages; i++) {
- tmppage = wc->w_pages[i];
+ for (i = 0; i < wc->w_num_folios; i++) {
+ struct folio *folio = wc->w_folios[i];
- if (page_has_buffers(tmppage)) {
+ if (folio && folio_buffers(folio)) {
if (ocfs2_should_order_data(inode))
- ocfs2_jbd2_file_inode(wc->w_handle, inode);
+ ocfs2_jbd2_inode_add_write(wc->w_handle, inode,
+ user_pos, user_len);
- block_commit_write(tmppage, from, to);
+ block_commit_write(folio, from, to);
}
}
}
-static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
- struct ocfs2_write_ctxt *wc,
- struct page *page, u32 cpos,
- loff_t user_pos, unsigned user_len,
- int new)
+static int ocfs2_prepare_folio_for_write(struct inode *inode, u64 *p_blkno,
+ struct ocfs2_write_ctxt *wc, struct folio *folio, u32 cpos,
+ loff_t user_pos, unsigned user_len, int new)
{
int ret;
unsigned int map_from = 0, map_to = 0;
@@ -1059,20 +933,19 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
/* treat the write as new if the a hole/lseek spanned across
* the page boundary.
*/
- new = new | ((i_size_read(inode) <= page_offset(page)) &&
- (page_offset(page) <= user_pos));
+ new = new | ((i_size_read(inode) <= folio_pos(folio)) &&
+ (folio_pos(folio) <= user_pos));
- if (page == wc->w_target_page) {
- map_from = user_pos & (PAGE_CACHE_SIZE - 1);
+ if (folio == wc->w_target_folio) {
+ map_from = user_pos & (PAGE_SIZE - 1);
map_to = map_from + user_len;
if (new)
- ret = ocfs2_map_page_blocks(page, p_blkno, inode,
- cluster_start, cluster_end,
- new);
+ ret = ocfs2_map_folio_blocks(folio, p_blkno, inode,
+ cluster_start, cluster_end, new);
else
- ret = ocfs2_map_page_blocks(page, p_blkno, inode,
- map_from, map_to, new);
+ ret = ocfs2_map_folio_blocks(folio, p_blkno, inode,
+ map_from, map_to, new);
if (ret) {
mlog_errno(ret);
goto out;
@@ -1086,7 +959,7 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
}
} else {
/*
- * If we haven't allocated the new page yet, we
+ * If we haven't allocated the new folio yet, we
* shouldn't be writing it out without copying user
* data. This is likely a math error from the caller.
*/
@@ -1095,8 +968,8 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
map_from = cluster_start;
map_to = cluster_end;
- ret = ocfs2_map_page_blocks(page, p_blkno, inode,
- cluster_start, cluster_end, new);
+ ret = ocfs2_map_folio_blocks(folio, p_blkno, inode,
+ cluster_start, cluster_end, new);
if (ret) {
mlog_errno(ret);
goto out;
@@ -1104,20 +977,20 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
}
/*
- * Parts of newly allocated pages need to be zero'd.
+ * Parts of newly allocated folios need to be zero'd.
*
* Above, we have also rewritten 'to' and 'from' - as far as
* the rest of the function is concerned, the entire cluster
- * range inside of a page needs to be written.
+ * range inside of a folio needs to be written.
*
- * We can skip this if the page is up to date - it's already
+ * We can skip this if the folio is uptodate - it's already
* been zero'd from being read in as a hole.
*/
- if (new && !PageUptodate(page))
- ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
+ if (new && !folio_test_uptodate(folio))
+ ocfs2_clear_folio_regions(folio, OCFS2_SB(inode->i_sb),
cpos, user_data_from, user_data_to);
- flush_dcache_page(page);
+ flush_dcache_folio(folio);
out:
return ret;
@@ -1126,18 +999,16 @@ out:
/*
* This function will only grab one clusters worth of pages.
*/
-static int ocfs2_grab_pages_for_write(struct address_space *mapping,
- struct ocfs2_write_ctxt *wc,
- u32 cpos, loff_t user_pos,
- unsigned user_len, int new,
- struct page *mmap_page)
+static int ocfs2_grab_folios_for_write(struct address_space *mapping,
+ struct ocfs2_write_ctxt *wc, u32 cpos, loff_t user_pos,
+ unsigned user_len, int new, struct folio *mmap_folio)
{
int ret = 0, i;
unsigned long start, target_index, end_index, index;
struct inode *inode = mapping->host;
loff_t last_byte;
- target_index = user_pos >> PAGE_CACHE_SHIFT;
+ target_index = user_pos >> PAGE_SHIFT;
/*
* Figure out how many pages we'll be manipulating here. For
@@ -1147,7 +1018,7 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
* last page of the write.
*/
if (new) {
- wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
+ wc->w_num_folios = ocfs2_pages_per_cluster(inode->i_sb);
start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
/*
* We need the index *past* the last page we could possibly
@@ -1156,49 +1027,58 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
*/
last_byte = max(user_pos + user_len, i_size_read(inode));
BUG_ON(last_byte < 1);
- end_index = ((last_byte - 1) >> PAGE_CACHE_SHIFT) + 1;
- if ((start + wc->w_num_pages) > end_index)
- wc->w_num_pages = end_index - start;
+ end_index = ((last_byte - 1) >> PAGE_SHIFT) + 1;
+ if ((start + wc->w_num_folios) > end_index)
+ wc->w_num_folios = end_index - start;
} else {
- wc->w_num_pages = 1;
+ wc->w_num_folios = 1;
start = target_index;
}
+ end_index = (user_pos + user_len - 1) >> PAGE_SHIFT;
- for(i = 0; i < wc->w_num_pages; i++) {
+ for(i = 0; i < wc->w_num_folios; i++) {
index = start + i;
- if (index == target_index && mmap_page) {
+ if (index >= target_index && index <= end_index &&
+ wc->w_type == OCFS2_WRITE_MMAP) {
/*
* ocfs2_pagemkwrite() is a little different
* and wants us to directly use the page
* passed in.
*/
- lock_page(mmap_page);
+ folio_lock(mmap_folio);
/* Exit and let the caller retry */
- if (mmap_page->mapping != mapping) {
- WARN_ON(mmap_page->mapping);
- unlock_page(mmap_page);
+ if (mmap_folio->mapping != mapping) {
+ WARN_ON(mmap_folio->mapping);
+ folio_unlock(mmap_folio);
ret = -EAGAIN;
goto out;
}
- page_cache_get(mmap_page);
- wc->w_pages[i] = mmap_page;
+ folio_get(mmap_folio);
+ wc->w_folios[i] = mmap_folio;
wc->w_target_locked = true;
+ } else if (index >= target_index && index <= end_index &&
+ wc->w_type == OCFS2_WRITE_DIRECT) {
+ /* Direct write has no mapping page. */
+ wc->w_folios[i] = NULL;
+ continue;
} else {
- wc->w_pages[i] = find_or_create_page(mapping, index,
- GFP_NOFS);
- if (!wc->w_pages[i]) {
- ret = -ENOMEM;
+ wc->w_folios[i] = __filemap_get_folio(mapping, index,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+ GFP_NOFS);
+ if (IS_ERR(wc->w_folios[i])) {
+ ret = PTR_ERR(wc->w_folios[i]);
mlog_errno(ret);
+ wc->w_folios[i] = NULL;
goto out;
}
}
- wait_for_stable_page(wc->w_pages[i]);
+ folio_wait_stable(wc->w_folios[i]);
if (index == target_index)
- wc->w_target_page = wc->w_pages[i];
+ wc->w_target_folio = wc->w_folios[i];
}
out:
if (ret)
@@ -1210,19 +1090,20 @@ out:
* Prepare a single cluster for write one cluster into the file.
*/
static int ocfs2_write_cluster(struct address_space *mapping,
- u32 phys, unsigned int unwritten,
+ u32 *phys, unsigned int new,
+ unsigned int clear_unwritten,
unsigned int should_zero,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_write_ctxt *wc, u32 cpos,
loff_t user_pos, unsigned user_len)
{
- int ret, i, new;
- u64 v_blkno, p_blkno;
+ int ret, i;
+ u64 p_blkno;
struct inode *inode = mapping->host;
struct ocfs2_extent_tree et;
+ int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
- new = phys == 0 ? 1 : 0;
if (new) {
u32 tmp_pos;
@@ -1232,9 +1113,9 @@ static int ocfs2_write_cluster(struct address_space *mapping,
*/
tmp_pos = cpos;
ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode,
- &tmp_pos, 1, 0, wc->w_di_bh,
- wc->w_handle, data_ac,
- meta_ac, NULL);
+ &tmp_pos, 1, !clear_unwritten,
+ wc->w_di_bh, wc->w_handle,
+ data_ac, meta_ac, NULL);
/*
* This shouldn't happen because we must have already
* calculated the correct meta data allocation required. The
@@ -1251,11 +1132,11 @@ static int ocfs2_write_cluster(struct address_space *mapping,
mlog_errno(ret);
goto out;
}
- } else if (unwritten) {
+ } else if (clear_unwritten) {
ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
wc->w_di_bh);
ret = ocfs2_mark_extent_written(inode, &et,
- wc->w_handle, cpos, 1, phys,
+ wc->w_handle, cpos, 1, *phys,
meta_ac, &wc->w_dealloc);
if (ret < 0) {
mlog_errno(ret);
@@ -1263,34 +1144,36 @@ static int ocfs2_write_cluster(struct address_space *mapping,
}
}
- if (should_zero)
- v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
- else
- v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
-
/*
* The only reason this should fail is due to an inability to
* find the extent added.
*/
- ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
- NULL);
+ ret = ocfs2_get_clusters(inode, cpos, phys, NULL, NULL);
if (ret < 0) {
- ocfs2_error(inode->i_sb, "Corrupting extend for inode %llu, "
- "at logical block %llu",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- (unsigned long long)v_blkno);
+ mlog(ML_ERROR, "Get physical blkno failed for inode %llu, "
+ "at logical cluster %u",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
goto out;
}
- BUG_ON(p_blkno == 0);
+ BUG_ON(*phys == 0);
+
+ p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *phys);
+ if (!should_zero)
+ p_blkno += (user_pos >> inode->i_sb->s_blocksize_bits) & (u64)(bpc - 1);
- for(i = 0; i < wc->w_num_pages; i++) {
+ for (i = 0; i < wc->w_num_folios; i++) {
int tmpret;
- tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
- wc->w_pages[i], cpos,
- user_pos, user_len,
- should_zero);
+ /* This is the direct io target page. */
+ if (wc->w_folios[i] == NULL) {
+ p_blkno += (1 << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits));
+ continue;
+ }
+
+ tmpret = ocfs2_prepare_folio_for_write(inode, &p_blkno, wc,
+ wc->w_folios[i], cpos, user_pos, user_len,
+ should_zero);
if (tmpret) {
mlog_errno(tmpret);
if (ret == 0)
@@ -1333,8 +1216,9 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
if ((cluster_off + local_len) > osb->s_clustersize)
local_len = osb->s_clustersize - cluster_off;
- ret = ocfs2_write_cluster(mapping, desc->c_phys,
- desc->c_unwritten,
+ ret = ocfs2_write_cluster(mapping, &desc->c_phys,
+ desc->c_new,
+ desc->c_clear_unwritten,
desc->c_needs_zero,
data_ac, meta_ac,
wc, desc->c_cpos, pos, local_len);
@@ -1363,7 +1247,7 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
{
struct ocfs2_write_cluster_desc *desc;
- wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1);
+ wc->w_target_from = pos & (PAGE_SIZE - 1);
wc->w_target_to = wc->w_target_from + len;
if (alloc == 0)
@@ -1400,11 +1284,71 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
&wc->w_target_to);
} else {
wc->w_target_from = 0;
- wc->w_target_to = PAGE_CACHE_SIZE;
+ wc->w_target_to = PAGE_SIZE;
}
}
/*
+ * Check if this extent is marked UNWRITTEN by direct io. If so, we need not to
+ * do the zero work. And should not to clear UNWRITTEN since it will be cleared
+ * by the direct io procedure.
+ * If this is a new extent that allocated by direct io, we should mark it in
+ * the ip_unwritten_list.
+ */
+static int ocfs2_unwritten_check(struct inode *inode,
+ struct ocfs2_write_ctxt *wc,
+ struct ocfs2_write_cluster_desc *desc)
+{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_unwritten_extent *ue = NULL, *new = NULL;
+ int ret = 0;
+
+ if (!desc->c_needs_zero)
+ return 0;
+
+retry:
+ spin_lock(&oi->ip_lock);
+ /* Needs not to zero no metter buffer or direct. The one who is zero
+ * the cluster is doing zero. And he will clear unwritten after all
+ * cluster io finished. */
+ list_for_each_entry(ue, &oi->ip_unwritten_list, ue_ip_node) {
+ if (desc->c_cpos == ue->ue_cpos) {
+ BUG_ON(desc->c_new);
+ desc->c_needs_zero = 0;
+ desc->c_clear_unwritten = 0;
+ goto unlock;
+ }
+ }
+
+ if (wc->w_type != OCFS2_WRITE_DIRECT)
+ goto unlock;
+
+ if (new == NULL) {
+ spin_unlock(&oi->ip_lock);
+ new = kmalloc(sizeof(struct ocfs2_unwritten_extent),
+ GFP_NOFS);
+ if (new == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ goto retry;
+ }
+ /* This direct write will doing zero. */
+ new->ue_cpos = desc->c_cpos;
+ new->ue_phys = desc->c_phys;
+ desc->c_clear_unwritten = 0;
+ list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list);
+ list_add_tail(&new->ue_node, &wc->w_unwritten_list);
+ wc->w_unwritten_count++;
+ new = NULL;
+unlock:
+ spin_unlock(&oi->ip_lock);
+out:
+ kfree(new);
+ return ret;
+}
+
+/*
* Populate each single-cluster write descriptor in the write context
* with information about the i/o to be done.
*
@@ -1479,14 +1423,21 @@ static int ocfs2_populate_write_desc(struct inode *inode,
if (phys == 0) {
desc->c_new = 1;
desc->c_needs_zero = 1;
+ desc->c_clear_unwritten = 1;
*clusters_to_alloc = *clusters_to_alloc + 1;
}
if (ext_flags & OCFS2_EXT_UNWRITTEN) {
- desc->c_unwritten = 1;
+ desc->c_clear_unwritten = 1;
desc->c_needs_zero = 1;
}
+ ret = ocfs2_unwritten_check(inode, wc, desc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
num_clusters--;
}
@@ -1501,29 +1452,32 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
{
int ret;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct page *page;
+ struct folio *folio;
handle_t *handle;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
- page = find_or_create_page(mapping, 0, GFP_NOFS);
- if (!page) {
- ret = -ENOMEM;
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
mlog_errno(ret);
goto out;
}
- /*
- * If we don't set w_num_pages then this page won't get unlocked
- * and freed on cleanup of the write context.
- */
- wc->w_pages[0] = wc->w_target_page = page;
- wc->w_num_pages = 1;
- handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
+ folio = __filemap_get_folio(mapping, 0,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_NOFS);
+ if (IS_ERR(folio)) {
+ ocfs2_commit_trans(osb, handle);
+ ret = PTR_ERR(folio);
mlog_errno(ret);
goto out;
}
+ /*
+ * If we don't set w_num_folios then this folio won't get unlocked
+ * and freed on cleanup of the write context.
+ */
+ wc->w_target_folio = folio;
+ wc->w_folios[0] = folio;
+ wc->w_num_folios = 1;
ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1537,8 +1491,8 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
ocfs2_set_inode_data_inline(inode, di);
- if (!PageUptodate(page)) {
- ret = ocfs2_read_inline_data(inode, page, wc->w_di_bh);
+ if (!folio_test_uptodate(folio)) {
+ ret = ocfs2_read_inline_data(inode, folio, wc->w_di_bh);
if (ret) {
ocfs2_commit_trans(osb, handle);
@@ -1561,9 +1515,8 @@ int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size)
}
static int ocfs2_try_to_write_inline_data(struct address_space *mapping,
- struct inode *inode, loff_t pos,
- unsigned len, struct page *mmap_page,
- struct ocfs2_write_ctxt *wc)
+ struct inode *inode, loff_t pos, size_t len,
+ struct folio *mmap_folio, struct ocfs2_write_ctxt *wc)
{
int ret, written = 0;
loff_t end = pos + len;
@@ -1578,7 +1531,7 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping,
* Handle inodes which already have inline data 1st.
*/
if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
- if (mmap_page == NULL &&
+ if (mmap_folio == NULL &&
ocfs2_size_fits_inline_data(wc->w_di_bh, end))
goto do_inline_write;
@@ -1602,7 +1555,7 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping,
* Check whether the write can fit.
*/
di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
- if (mmap_page ||
+ if (mmap_folio ||
end > ocfs2_max_inline_data_with_xattr(inode->i_sb, di))
return 0;
@@ -1648,8 +1601,10 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode,
if (ret)
mlog_errno(ret);
- wc->w_first_new_cpos =
- ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
+ /* There is no wc if this is call from direct. */
+ if (wc)
+ wc->w_first_new_cpos =
+ ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
return ret;
}
@@ -1666,48 +1621,10 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
return ret;
}
-/*
- * Try to flush truncate logs if we can free enough clusters from it.
- * As for return value, "< 0" means error, "0" no space and "1" means
- * we have freed enough spaces and let the caller try to allocate again.
- */
-static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
- unsigned int needed)
-{
- tid_t target;
- int ret = 0;
- unsigned int truncated_clusters;
-
- mutex_lock(&osb->osb_tl_inode->i_mutex);
- truncated_clusters = osb->truncated_clusters;
- mutex_unlock(&osb->osb_tl_inode->i_mutex);
-
- /*
- * Check whether we can succeed in allocating if we free
- * the truncate log.
- */
- if (truncated_clusters < needed)
- goto out;
-
- ret = ocfs2_flush_truncate_log(osb);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
-
- if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) {
- jbd2_log_wait_commit(osb->journal->j_journal, target);
- ret = 1;
- }
-out:
- return ret;
-}
-
-int ocfs2_write_begin_nolock(struct file *filp,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata,
- struct buffer_head *di_bh, struct page *mmap_page)
+int ocfs2_write_begin_nolock(struct address_space *mapping,
+ loff_t pos, unsigned len, ocfs2_write_type_t type,
+ struct folio **foliop, void **fsdata,
+ struct buffer_head *di_bh, struct folio *mmap_folio)
{
int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS;
unsigned int clusters_to_alloc, extents_to_split, clusters_need = 0;
@@ -1722,7 +1639,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
int try_free = 1, ret1;
try_again:
- ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
+ ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, type, di_bh);
if (ret) {
mlog_errno(ret);
return ret;
@@ -1730,7 +1647,7 @@ try_again:
if (ocfs2_supports_inline_data(osb)) {
ret = ocfs2_try_to_write_inline_data(mapping, inode, pos, len,
- mmap_page, wc);
+ mmap_folio, wc);
if (ret == 1) {
ret = 0;
goto success;
@@ -1741,14 +1658,17 @@ try_again:
}
}
- if (ocfs2_sparse_alloc(osb))
- ret = ocfs2_zero_tail(inode, di_bh, pos);
- else
- ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len,
- wc);
- if (ret) {
- mlog_errno(ret);
- goto out;
+ /* Direct io change i_size late, should not zero tail here. */
+ if (type != OCFS2_WRITE_DIRECT) {
+ if (ocfs2_sparse_alloc(osb))
+ ret = ocfs2_zero_tail(inode, di_bh, pos);
+ else
+ ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
+ len, wc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
}
ret = ocfs2_check_range_for_refcount(inode, pos, len);
@@ -1757,7 +1677,7 @@ try_again:
goto out;
} else if (ret == 1) {
clusters_need = wc->w_clen;
- ret = ocfs2_refcount_cow(inode, filp, di_bh,
+ ret = ocfs2_refcount_cow(inode, di_bh,
wc->w_cpos, wc->w_clen, UINT_MAX);
if (ret) {
mlog_errno(ret);
@@ -1779,7 +1699,7 @@ try_again:
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(long long)i_size_read(inode),
le32_to_cpu(di->i_clusters),
- pos, len, flags, mmap_page,
+ pos, len, type, mmap_folio,
clusters_to_alloc, extents_to_split);
/*
@@ -1808,19 +1728,18 @@ try_again:
data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
credits = ocfs2_calc_extend_credits(inode->i_sb,
- &di->id2.i_list,
- clusters_to_alloc);
-
- }
+ &di->id2.i_list);
+ } else if (type == OCFS2_WRITE_DIRECT)
+ /* direct write needs not to start trans if no extents alloc. */
+ goto success;
/*
* We have to zero sparse allocated clusters, unwritten extent clusters,
* and non-sparse clusters we just extended. For non-sparse writes,
* we know zeros will only be needed in the first and/or last cluster.
*/
- if (clusters_to_alloc || extents_to_split ||
- (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
- wc->w_desc[wc->w_clen - 1].c_needs_zero)))
+ if (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
+ wc->w_desc[wc->w_clen - 1].c_needs_zero))
cluster_of_pages = 1;
else
cluster_of_pages = 0;
@@ -1842,10 +1761,7 @@ try_again:
if (ret)
goto out_commit;
}
- /*
- * We don't want this to fail in ocfs2_write_end(), so do it
- * here.
- */
+
ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
@@ -1854,26 +1770,26 @@ try_again:
}
/*
- * Fill our page array first. That way we've grabbed enough so
+ * Fill our folio array first. That way we've grabbed enough so
* that we can zero and flush if we error after adding the
* extent.
*/
- ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
- cluster_of_pages, mmap_page);
- if (ret && ret != -EAGAIN) {
- mlog_errno(ret);
- goto out_quota;
- }
+ ret = ocfs2_grab_folios_for_write(mapping, wc, wc->w_cpos, pos, len,
+ cluster_of_pages, mmap_folio);
+ if (ret) {
+ /*
+ * ocfs2_grab_folios_for_write() returns -EAGAIN if it
+ * could not lock the target folio. In this case, we exit
+ * with no error and no target folio. This will trigger
+ * the caller, page_mkwrite(), to re-try the operation.
+ */
+ if (type == OCFS2_WRITE_MMAP && ret == -EAGAIN) {
+ BUG_ON(wc->w_target_folio);
+ ret = 0;
+ goto out_quota;
+ }
- /*
- * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock
- * the target page. In this case, we exit with no error and no target
- * page. This will trigger the caller, page_mkwrite(), to re-try
- * the operation.
- */
- if (ret == -EAGAIN) {
- BUG_ON(wc->w_target_page);
- ret = 0;
+ mlog_errno(ret);
goto out_quota;
}
@@ -1890,7 +1806,8 @@ try_again:
ocfs2_free_alloc_context(meta_ac);
success:
- *pagep = wc->w_target_page;
+ if (foliop)
+ *foliop = wc->w_target_folio;
*fsdata = wc;
return 0;
out_quota:
@@ -1901,12 +1818,26 @@ out_commit:
ocfs2_commit_trans(osb, handle);
out:
- ocfs2_free_write_ctxt(wc);
+ /*
+ * The mmapped page won't be unlocked in ocfs2_free_write_ctxt(),
+ * even in case of error here like ENOSPC and ENOMEM. So, we need
+ * to unlock the target page manually to prevent deadlocks when
+ * retrying again on ENOSPC, or when returning non-VM_FAULT_LOCKED
+ * to VM code.
+ */
+ if (wc->w_target_locked)
+ folio_unlock(mmap_folio);
- if (data_ac)
+ ocfs2_free_write_ctxt(inode, wc);
+
+ if (data_ac) {
ocfs2_free_alloc_context(data_ac);
- if (meta_ac)
+ data_ac = NULL;
+ }
+ if (meta_ac) {
ocfs2_free_alloc_context(meta_ac);
+ meta_ac = NULL;
+ }
if (ret == -ENOSPC && try_free) {
/*
@@ -1926,9 +1857,10 @@ out:
return ret;
}
-static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
+static int ocfs2_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct folio **foliop, void **fsdata)
{
int ret;
struct buffer_head *di_bh = NULL;
@@ -1943,14 +1875,14 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
/*
* Take alloc sem here to prevent concurrent lookups. That way
* the mapping, zeroing and tree manipulation within
- * ocfs2_write() will be safe against ->readpage(). This
+ * ocfs2_write() will be safe against ->read_folio(). This
* should also serve to lock out allocation from a shared
* writeable region.
*/
down_write(&OCFS2_I(inode)->ip_alloc_sem);
- ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep,
- fsdata, di_bh, NULL);
+ ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_BUFFER,
+ foliop, fsdata, di_bh, NULL);
if (ret) {
mlog_errno(ret);
goto out_fail;
@@ -1974,18 +1906,15 @@ static void ocfs2_write_end_inline(struct inode *inode, loff_t pos,
struct ocfs2_dinode *di,
struct ocfs2_write_ctxt *wc)
{
- void *kaddr;
-
if (unlikely(*copied < len)) {
- if (!PageUptodate(wc->w_target_page)) {
+ if (!folio_test_uptodate(wc->w_target_folio)) {
*copied = 0;
return;
}
}
- kaddr = kmap_atomic(wc->w_target_page);
- memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied);
- kunmap_atomic(kaddr);
+ memcpy_from_folio(di->id2.i_data.id_data + pos, wc->w_target_folio,
+ pos, *copied);
trace_ocfs2_write_end_inline(
(unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -1994,42 +1923,72 @@ static void ocfs2_write_end_inline(struct inode *inode, loff_t pos,
le16_to_cpu(di->i_dyn_features));
}
-int ocfs2_write_end_nolock(struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+int ocfs2_write_end_nolock(struct address_space *mapping, loff_t pos,
+ unsigned len, unsigned copied, void *fsdata)
{
- int i;
- unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
+ int i, ret;
+ size_t from, to, start = pos & (PAGE_SIZE - 1);
struct inode *inode = mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_write_ctxt *wc = fsdata;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
handle_t *handle = wc->w_handle;
- struct page *tmppage;
+
+ BUG_ON(!list_empty(&wc->w_unwritten_list));
+
+ if (handle) {
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
+ wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ copied = ret;
+ mlog_errno(ret);
+ goto out;
+ }
+ }
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
ocfs2_write_end_inline(inode, pos, len, &copied, di, wc);
goto out_write_size;
}
- if (unlikely(copied < len)) {
- if (!PageUptodate(wc->w_target_page))
+ if (unlikely(copied < len) && wc->w_target_folio) {
+ loff_t new_isize;
+
+ if (!folio_test_uptodate(wc->w_target_folio))
copied = 0;
- ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
- start+len);
+ new_isize = max_t(loff_t, i_size_read(inode), pos + copied);
+ if (new_isize > folio_pos(wc->w_target_folio))
+ ocfs2_zero_new_buffers(wc->w_target_folio, start+copied,
+ start+len);
+ else {
+ /*
+ * When folio is fully beyond new isize (data copy
+ * failed), do not bother zeroing the folio. Invalidate
+ * it instead so that writeback does not get confused
+ * put page & buffer dirty bits into inconsistent
+ * state.
+ */
+ block_invalidate_folio(wc->w_target_folio, 0,
+ folio_size(wc->w_target_folio));
+ }
}
- flush_dcache_page(wc->w_target_page);
+ if (wc->w_target_folio)
+ flush_dcache_folio(wc->w_target_folio);
- for(i = 0; i < wc->w_num_pages; i++) {
- tmppage = wc->w_pages[i];
+ for (i = 0; i < wc->w_num_folios; i++) {
+ struct folio *folio = wc->w_folios[i];
- if (tmppage == wc->w_target_page) {
+ /* This is the direct io target folio */
+ if (folio == NULL)
+ continue;
+
+ if (folio == wc->w_target_folio) {
from = wc->w_target_from;
to = wc->w_target_to;
- BUG_ON(from > PAGE_CACHE_SIZE ||
- to > PAGE_CACHE_SIZE ||
+ BUG_ON(from > folio_size(folio) ||
+ to > folio_size(folio) ||
to < from);
} else {
/*
@@ -2038,46 +1997,67 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
* to flush their entire range.
*/
from = 0;
- to = PAGE_CACHE_SIZE;
+ to = folio_size(folio);
}
- if (page_has_buffers(tmppage)) {
- if (ocfs2_should_order_data(inode))
- ocfs2_jbd2_file_inode(wc->w_handle, inode);
- block_commit_write(tmppage, from, to);
+ if (folio_buffers(folio)) {
+ if (handle && ocfs2_should_order_data(inode)) {
+ loff_t start_byte = folio_pos(folio) + from;
+ loff_t length = to - from;
+ ocfs2_jbd2_inode_add_write(handle, inode,
+ start_byte, length);
+ }
+ block_commit_write(folio, from, to);
}
}
out_write_size:
- pos += copied;
- if (pos > inode->i_size) {
- i_size_write(inode, pos);
- mark_inode_dirty(inode);
- }
- inode->i_blocks = ocfs2_inode_sector_count(inode);
- di->i_size = cpu_to_le64((u64)i_size_read(inode));
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
- di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
- ocfs2_journal_dirty(handle, wc->w_di_bh);
+ /* Direct io do not update i_size here. */
+ if (wc->w_type != OCFS2_WRITE_DIRECT) {
+ pos += copied;
+ if (pos > i_size_read(inode)) {
+ i_size_write(inode, pos);
+ mark_inode_dirty(inode);
+ }
+ inode->i_blocks = ocfs2_inode_sector_count(inode);
+ di->i_size = cpu_to_le64((u64)i_size_read(inode));
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+ di->i_mtime = di->i_ctime = cpu_to_le64(inode_get_mtime_sec(inode));
+ di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode));
+ if (handle)
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
+ }
+ if (handle)
+ ocfs2_journal_dirty(handle, wc->w_di_bh);
- ocfs2_commit_trans(osb, handle);
+out:
+ /* unlock pages before dealloc since it needs acquiring j_trans_barrier
+ * lock, or it will cause a deadlock since journal commit threads holds
+ * this lock and will ask for the page lock when flushing the data.
+ * put it here to preserve the unlock order.
+ */
+ ocfs2_unlock_folios(wc);
+
+ if (handle)
+ ocfs2_commit_trans(osb, handle);
ocfs2_run_deallocs(osb, &wc->w_dealloc);
- ocfs2_free_write_ctxt(wc);
+ brelse(wc->w_di_bh);
+ kfree(wc);
return copied;
}
-static int ocfs2_write_end(struct file *file, struct address_space *mapping,
+static int ocfs2_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct folio *folio, void *fsdata)
{
int ret;
struct inode *inode = mapping->host;
- ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
+ ret = ocfs2_write_end_nolock(mapping, pos, len, copied, fsdata);
up_write(&OCFS2_I(inode)->ip_alloc_sem);
ocfs2_inode_unlock(inode, 1);
@@ -2085,17 +2065,396 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
return ret;
}
+struct ocfs2_dio_write_ctxt {
+ struct list_head dw_zero_list;
+ unsigned dw_zero_count;
+ int dw_orphaned;
+ pid_t dw_writer_pid;
+};
+
+static struct ocfs2_dio_write_ctxt *
+ocfs2_dio_alloc_write_ctx(struct buffer_head *bh, int *alloc)
+{
+ struct ocfs2_dio_write_ctxt *dwc = NULL;
+
+ if (bh->b_private)
+ return bh->b_private;
+
+ dwc = kmalloc(sizeof(struct ocfs2_dio_write_ctxt), GFP_NOFS);
+ if (dwc == NULL)
+ return NULL;
+ INIT_LIST_HEAD(&dwc->dw_zero_list);
+ dwc->dw_zero_count = 0;
+ dwc->dw_orphaned = 0;
+ dwc->dw_writer_pid = task_pid_nr(current);
+ bh->b_private = dwc;
+ *alloc = 1;
+
+ return dwc;
+}
+
+static void ocfs2_dio_free_write_ctx(struct inode *inode,
+ struct ocfs2_dio_write_ctxt *dwc)
+{
+ ocfs2_free_unwritten_list(inode, &dwc->dw_zero_list);
+ kfree(dwc);
+}
+
+/*
+ * TODO: Make this into a generic get_blocks function.
+ *
+ * From do_direct_io in direct-io.c:
+ * "So what we do is to permit the ->get_blocks function to populate
+ * bh.b_size with the size of IO which is permitted at this offset and
+ * this i_blkbits."
+ *
+ * This function is called directly from get_more_blocks in direct-io.c.
+ *
+ * called like this: dio->get_blocks(dio->inode, fs_startblk,
+ * fs_count, map_bh, dio->rw == WRITE);
+ */
+static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_write_ctxt *wc;
+ struct ocfs2_write_cluster_desc *desc = NULL;
+ struct ocfs2_dio_write_ctxt *dwc = NULL;
+ struct buffer_head *di_bh = NULL;
+ u64 p_blkno;
+ unsigned int i_blkbits = inode->i_sb->s_blocksize_bits;
+ loff_t pos = iblock << i_blkbits;
+ sector_t endblk = (i_size_read(inode) - 1) >> i_blkbits;
+ unsigned len, total_len = bh_result->b_size;
+ int ret = 0, first_get_block = 0;
+
+ len = osb->s_clustersize - (pos & (osb->s_clustersize - 1));
+ len = min(total_len, len);
+
+ /*
+ * bh_result->b_size is count in get_more_blocks according to write
+ * "pos" and "end", we need map twice to return different buffer state:
+ * 1. area in file size, not set NEW;
+ * 2. area out file size, set NEW.
+ *
+ * iblock endblk
+ * |--------|---------|---------|---------
+ * |<-------area in file------->|
+ */
+
+ if ((iblock <= endblk) &&
+ ((iblock + ((len - 1) >> i_blkbits)) > endblk))
+ len = (endblk - iblock + 1) << i_blkbits;
+
+ mlog(0, "get block of %lu at %llu:%u req %u\n",
+ inode->i_ino, pos, len, total_len);
+
+ /*
+ * Because we need to change file size in ocfs2_dio_end_io_write(), or
+ * we may need to add it to orphan dir. So can not fall to fast path
+ * while file size will be changed.
+ */
+ if (pos + total_len <= i_size_read(inode)) {
+
+ /* This is the fast path for re-write. */
+ ret = ocfs2_lock_get_block(inode, iblock, bh_result, create);
+ if (buffer_mapped(bh_result) &&
+ !buffer_new(bh_result) &&
+ ret == 0)
+ goto out;
+
+ /* Clear state set by ocfs2_get_block. */
+ bh_result->b_state = 0;
+ }
+
+ dwc = ocfs2_dio_alloc_write_ctx(bh_result, &first_get_block);
+ if (unlikely(dwc == NULL)) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (ocfs2_clusters_for_bytes(inode->i_sb, pos + total_len) >
+ ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)) &&
+ !dwc->dw_orphaned) {
+ /*
+ * when we are going to alloc extents beyond file size, add the
+ * inode to orphan dir, so we can recall those spaces when
+ * system crashed during write.
+ */
+ ret = ocfs2_add_inode_to_orphan(osb, inode);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ dwc->dw_orphaned = 1;
+ }
+
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ down_write(&oi->ip_alloc_sem);
+
+ if (first_get_block) {
+ if (ocfs2_sparse_alloc(osb))
+ ret = ocfs2_zero_tail(inode, di_bh, pos);
+ else
+ ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
+ total_len, NULL);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto unlock;
+ }
+ }
+
+ ret = ocfs2_write_begin_nolock(inode->i_mapping, pos, len,
+ OCFS2_WRITE_DIRECT, NULL,
+ (void **)&wc, di_bh, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto unlock;
+ }
+
+ desc = &wc->w_desc[0];
+
+ p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, desc->c_phys);
+ BUG_ON(p_blkno == 0);
+ p_blkno += iblock & (u64)(ocfs2_clusters_to_blocks(inode->i_sb, 1) - 1);
+
+ map_bh(bh_result, inode->i_sb, p_blkno);
+ bh_result->b_size = len;
+ if (desc->c_needs_zero)
+ set_buffer_new(bh_result);
+
+ if (iblock > endblk)
+ set_buffer_new(bh_result);
+
+ /* May sleep in end_io. It should not happen in a irq context. So defer
+ * it to dio work queue. */
+ set_buffer_defer_completion(bh_result);
+
+ if (!list_empty(&wc->w_unwritten_list)) {
+ struct ocfs2_unwritten_extent *ue = NULL;
+
+ ue = list_first_entry(&wc->w_unwritten_list,
+ struct ocfs2_unwritten_extent,
+ ue_node);
+ BUG_ON(ue->ue_cpos != desc->c_cpos);
+ /* The physical address may be 0, fill it. */
+ ue->ue_phys = desc->c_phys;
+
+ list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list);
+ dwc->dw_zero_count += wc->w_unwritten_count;
+ }
+
+ ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, wc);
+ BUG_ON(ret != len);
+ ret = 0;
+unlock:
+ up_write(&oi->ip_alloc_sem);
+ ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
+out:
+ return ret;
+}
+
+static int ocfs2_dio_end_io_write(struct inode *inode,
+ struct ocfs2_dio_write_ctxt *dwc,
+ loff_t offset,
+ ssize_t bytes)
+{
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+ struct ocfs2_extent_tree et;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_unwritten_extent *ue = NULL;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_dinode *di;
+ struct ocfs2_alloc_context *data_ac = NULL;
+ struct ocfs2_alloc_context *meta_ac = NULL;
+ handle_t *handle = NULL;
+ loff_t end = offset + bytes;
+ int ret = 0, credits = 0;
+
+ ocfs2_init_dealloc_ctxt(&dealloc);
+
+ /* We do clear unwritten, delete orphan, change i_size here. If neither
+ * of these happen, we can skip all this. */
+ if (list_empty(&dwc->dw_zero_list) &&
+ end <= i_size_read(inode) &&
+ !dwc->dw_orphaned)
+ goto out;
+
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ down_write(&oi->ip_alloc_sem);
+
+ /* Delete orphan before acquire i_rwsem. */
+ if (dwc->dw_orphaned) {
+ BUG_ON(dwc->dw_writer_pid != task_pid_nr(current));
+
+ end = end > i_size_read(inode) ? end : 0;
+
+ ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
+ !!end, end);
+ if (ret < 0)
+ mlog_errno(ret);
+ }
+
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
+
+ /* Attach dealloc with extent tree in case that we may reuse extents
+ * which are already unlinked from current extent tree due to extent
+ * rotation and merging.
+ */
+ et.et_dealloc = &dealloc;
+
+ ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
+ &data_ac, &meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto unlock;
+ }
+
+ credits = ocfs2_calc_extend_credits(inode->i_sb, &di->id2.i_list);
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto unlock;
+ }
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto commit;
+ }
+
+ list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) {
+ ret = ocfs2_assure_trans_credits(handle, credits);
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
+ ret = ocfs2_mark_extent_written(inode, &et, handle,
+ ue->ue_cpos, 1,
+ ue->ue_phys,
+ meta_ac, &dealloc);
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
+ }
+
+ if (end > i_size_read(inode)) {
+ ret = ocfs2_set_inode_size(handle, inode, di_bh, end);
+ if (ret < 0)
+ mlog_errno(ret);
+ }
+commit:
+ ocfs2_commit_trans(osb, handle);
+unlock:
+ up_write(&oi->ip_alloc_sem);
+ ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
+out:
+ if (data_ac)
+ ocfs2_free_alloc_context(data_ac);
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+ ocfs2_run_deallocs(osb, &dealloc);
+ ocfs2_dio_free_write_ctx(inode, dwc);
+
+ return ret;
+}
+
+/*
+ * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
+ * particularly interested in the aio/dio case. We use the rw_lock DLM lock
+ * to protect io on one node from truncation on another.
+ */
+static int ocfs2_dio_end_io(struct kiocb *iocb,
+ loff_t offset,
+ ssize_t bytes,
+ void *private)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ int level;
+ int ret = 0;
+
+ /* this io's submitter should not have unlocked this before we could */
+ BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
+
+ if (bytes <= 0)
+ mlog_ratelimited(ML_ERROR, "Direct IO failed, bytes = %lld",
+ (long long)bytes);
+ if (private) {
+ if (bytes > 0)
+ ret = ocfs2_dio_end_io_write(inode, private, offset,
+ bytes);
+ else
+ ocfs2_dio_free_write_ctx(inode, private);
+ }
+
+ ocfs2_iocb_clear_rw_locked(iocb);
+
+ level = ocfs2_iocb_rw_locked_level(iocb);
+ ocfs2_rw_unlock(inode, level);
+ return ret;
+}
+
+static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ get_block_t *get_block;
+
+ /*
+ * Fallback to buffered I/O if we see an inode without
+ * extents.
+ */
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+ return 0;
+
+ /* Fallback to buffered I/O if we do not support append dio. */
+ if (iocb->ki_pos + iter->count > i_size_read(inode) &&
+ !ocfs2_supports_append_dio(osb))
+ return 0;
+
+ if (iov_iter_rw(iter) == READ)
+ get_block = ocfs2_lock_get_block;
+ else
+ get_block = ocfs2_dio_wr_get_block;
+
+ return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
+ iter, get_block,
+ ocfs2_dio_end_io, 0);
+}
+
const struct address_space_operations ocfs2_aops = {
- .readpage = ocfs2_readpage,
- .readpages = ocfs2_readpages,
- .writepage = ocfs2_writepage,
+ .dirty_folio = block_dirty_folio,
+ .read_folio = ocfs2_read_folio,
+ .readahead = ocfs2_readahead,
+ .writepages = ocfs2_writepages,
.write_begin = ocfs2_write_begin,
.write_end = ocfs2_write_end,
.bmap = ocfs2_bmap,
.direct_IO = ocfs2_direct_IO,
- .invalidatepage = ocfs2_invalidatepage,
- .releasepage = ocfs2_releasepage,
- .migratepage = buffer_migrate_page,
+ .invalidate_folio = block_invalidate_folio,
+ .release_folio = ocfs2_release_folio,
+ .migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
};
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index f671e49beb34..114efc9111e4 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -1,39 +1,18 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_AOPS_H
#define OCFS2_AOPS_H
-#include <linux/aio.h>
-
-handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
- struct page *page,
- unsigned from,
- unsigned to);
+#include <linux/fs.h>
-int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
+int ocfs2_map_folio_blocks(struct folio *folio, u64 *p_blkno,
struct inode *inode, unsigned int from,
unsigned int to, int new);
-void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages);
+void ocfs2_unlock_and_free_folios(struct folio **folios, int num_folios);
int walk_page_buffers( handle_t *handle,
struct buffer_head *head,
@@ -44,16 +23,20 @@ int walk_page_buffers( handle_t *handle,
struct buffer_head *bh));
int ocfs2_write_end_nolock(struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata);
+ loff_t pos, unsigned len, unsigned copied, void *fsdata);
+
+typedef enum {
+ OCFS2_WRITE_BUFFER = 0,
+ OCFS2_WRITE_DIRECT,
+ OCFS2_WRITE_MMAP,
+} ocfs2_write_type_t;
-int ocfs2_write_begin_nolock(struct file *filp,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata,
- struct buffer_head *di_bh, struct page *mmap_page);
+int ocfs2_write_begin_nolock(struct address_space *mapping,
+ loff_t pos, unsigned len, ocfs2_write_type_t type,
+ struct folio **foliop, void **fsdata,
+ struct buffer_head *di_bh, struct folio *mmap_folio);
-int ocfs2_read_inline_data(struct inode *inode, struct page *page,
+int ocfs2_read_inline_data(struct inode *inode, struct folio *folio,
struct buffer_head *di_bh);
int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size);
@@ -74,37 +57,19 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
/*
* Using a named enum representing lock types in terms of #N bit stored in
* iocb->private, which is going to be used for communication between
- * ocfs2_dio_end_io() and ocfs2_file_aio_write/read().
+ * ocfs2_dio_end_io() and ocfs2_file_write/read_iter().
*/
enum ocfs2_iocb_lock_bits {
OCFS2_IOCB_RW_LOCK = 0,
OCFS2_IOCB_RW_LOCK_LEVEL,
- OCFS2_IOCB_SEM,
- OCFS2_IOCB_UNALIGNED_IO,
OCFS2_IOCB_NUM_LOCKS
};
+#define ocfs2_iocb_init_rw_locked(iocb) \
+ (iocb->private = NULL)
#define ocfs2_iocb_clear_rw_locked(iocb) \
clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private)
#define ocfs2_iocb_rw_locked_level(iocb) \
test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private)
-#define ocfs2_iocb_set_sem_locked(iocb) \
- set_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
-#define ocfs2_iocb_clear_sem_locked(iocb) \
- clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
-#define ocfs2_iocb_is_sem_locked(iocb) \
- test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
-
-#define ocfs2_iocb_set_unaligned_aio(iocb) \
- set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
-#define ocfs2_iocb_clear_unaligned_aio(iocb) \
- clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
-#define ocfs2_iocb_is_unaligned_aio(iocb) \
- test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
-
-#define OCFS2_IOEND_WQ_HASH_SZ 37
-#define ocfs2_ioend_wq(v) (&ocfs2__ioend_wq[((unsigned long)(v)) %\
- OCFS2_IOEND_WQ_HASH_SZ])
-extern wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index 0725e6054650..863a5316030b 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -1,20 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-only
+/*
* blockcheck.c
*
* Checksum and ECC codes for the OCFS2 userspace library.
*
* Copyright (C) 2006, 2008 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License, version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#include <linux/kernel.h>
@@ -132,7 +122,7 @@ u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, unsigned int nr
* parity bits that are part of the bit number
* representation. Huh?
*
- * <wikipedia href="http://en.wikipedia.org/wiki/Hamming_code">
+ * <wikipedia href="https://en.wikipedia.org/wiki/Hamming_code">
* In other words, the parity bit at position 2^k
* checks bits in positions having bit k set in
* their binary representation. Conversely, for
@@ -237,70 +227,38 @@ static int blockcheck_u64_get(void *data, u64 *val)
*val = *(u64 *)data;
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(blockcheck_fops, blockcheck_u64_get, NULL, "%llu\n");
-
-static struct dentry *blockcheck_debugfs_create(const char *name,
- struct dentry *parent,
- u64 *value)
-{
- return debugfs_create_file(name, S_IFREG | S_IRUSR, parent, value,
- &blockcheck_fops);
-}
+DEFINE_DEBUGFS_ATTRIBUTE(blockcheck_fops, blockcheck_u64_get, NULL, "%llu\n");
static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats)
{
if (stats) {
- debugfs_remove(stats->b_debug_check);
- stats->b_debug_check = NULL;
- debugfs_remove(stats->b_debug_failure);
- stats->b_debug_failure = NULL;
- debugfs_remove(stats->b_debug_recover);
- stats->b_debug_recover = NULL;
- debugfs_remove(stats->b_debug_dir);
+ debugfs_remove_recursive(stats->b_debug_dir);
stats->b_debug_dir = NULL;
}
}
-static int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats,
- struct dentry *parent)
+static void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats,
+ struct dentry *parent)
{
- int rc = -EINVAL;
+ struct dentry *dir;
- if (!stats)
- goto out;
-
- stats->b_debug_dir = debugfs_create_dir("blockcheck", parent);
- if (!stats->b_debug_dir)
- goto out;
+ dir = debugfs_create_dir("blockcheck", parent);
+ stats->b_debug_dir = dir;
- stats->b_debug_check =
- blockcheck_debugfs_create("blocks_checked",
- stats->b_debug_dir,
- &stats->b_check_count);
+ debugfs_create_file("blocks_checked", S_IFREG | S_IRUSR, dir,
+ &stats->b_check_count, &blockcheck_fops);
- stats->b_debug_failure =
- blockcheck_debugfs_create("checksums_failed",
- stats->b_debug_dir,
- &stats->b_failure_count);
+ debugfs_create_file("checksums_failed", S_IFREG | S_IRUSR, dir,
+ &stats->b_failure_count, &blockcheck_fops);
- stats->b_debug_recover =
- blockcheck_debugfs_create("ecc_recoveries",
- stats->b_debug_dir,
- &stats->b_recover_count);
- if (stats->b_debug_check && stats->b_debug_failure &&
- stats->b_debug_recover)
- rc = 0;
+ debugfs_create_file("ecc_recoveries", S_IFREG | S_IRUSR, dir,
+ &stats->b_recover_count, &blockcheck_fops);
-out:
- if (rc)
- ocfs2_blockcheck_debug_remove(stats);
- return rc;
}
#else
-static inline int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats,
- struct dentry *parent)
+static inline void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats,
+ struct dentry *parent)
{
- return 0;
}
static inline void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats)
@@ -309,10 +267,10 @@ static inline void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *
#endif /* CONFIG_DEBUG_FS */
/* Always-called wrappers for starting and stopping the debugfs files */
-int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats,
- struct dentry *parent)
+void ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats,
+ struct dentry *parent)
{
- return ocfs2_blockcheck_debug_install(stats, parent);
+ ocfs2_blockcheck_debug_install(stats, parent);
}
void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats)
diff --git a/fs/ocfs2/blockcheck.h b/fs/ocfs2/blockcheck.h
index d4b69febf70a..d0578e98ee8d 100644
--- a/fs/ocfs2/blockcheck.h
+++ b/fs/ocfs2/blockcheck.h
@@ -1,20 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
* blockcheck.h
*
* Checksum and ECC codes for the OCFS2 userspace library.
*
* Copyright (C) 2004, 2008 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License, version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#ifndef OCFS2_BLOCKCHECK_H
@@ -33,9 +23,6 @@ struct ocfs2_blockcheck_stats {
* ocfs2_blockcheck_stats_debugfs_install()
*/
struct dentry *b_debug_dir; /* Parent of the debugfs files */
- struct dentry *b_debug_check; /* Exposes b_check_count */
- struct dentry *b_debug_failure; /* Exposes b_failure_count */
- struct dentry *b_debug_recover; /* Exposes b_recover_count */
};
@@ -64,8 +51,8 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
struct ocfs2_blockcheck_stats *stats);
/* Debug Initialization */
-int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats,
- struct dentry *parent);
+void ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats,
+ struct dentry *parent);
void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats);
/*
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 5d18ad10c27f..8f714406528d 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -1,31 +1,16 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* io.c
*
* Buffer cache handling
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/highmem.h>
+#include <linux/bio.h>
#include <cluster/masklog.h>
@@ -79,7 +64,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
get_bh(bh); /* for end_buffer_write_sync() */
bh->b_end_io = end_buffer_write_sync;
- submit_bh(WRITE, bh);
+ submit_bh(REQ_OP_WRITE, bh);
wait_on_buffer(bh);
@@ -90,7 +75,6 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
* information for this bh as it's not marked locally
* uptodate. */
ret = -EIO;
- put_bh(bh);
mlog_errno(ret);
}
@@ -99,25 +83,34 @@ out:
return ret;
}
+/* Caller must provide a bhs[] with all NULL or non-NULL entries, so it
+ * will be easier to handle read failure.
+ */
int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
unsigned int nr, struct buffer_head *bhs[])
{
int status = 0;
unsigned int i;
struct buffer_head *bh;
+ int new_bh = 0;
trace_ocfs2_read_blocks_sync((unsigned long long)block, nr);
if (!nr)
goto bail;
+ /* Don't put buffer head and re-assign it to NULL if it is allocated
+ * outside since the caller can't be aware of this alternation!
+ */
+ new_bh = (bhs[0] == NULL);
+
for (i = 0 ; i < nr ; i++) {
if (bhs[i] == NULL) {
bhs[i] = sb_getblk(osb->sb, block++);
if (bhs[i] == NULL) {
- status = -EIO;
+ status = -ENOMEM;
mlog_errno(status);
- goto bail;
+ break;
}
}
bh = bhs[i];
@@ -140,22 +133,43 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
lock_buffer(bh);
if (buffer_jbd(bh)) {
+#ifdef CATCH_BH_JBD_RACES
mlog(ML_ERROR,
"block %llu had the JBD bit set "
"while I was in lock_buffer!",
(unsigned long long)bh->b_blocknr);
BUG();
+#else
+ unlock_buffer(bh);
+ continue;
+#endif
}
- clear_buffer_uptodate(bh);
get_bh(bh); /* for end_buffer_read_sync() */
bh->b_end_io = end_buffer_read_sync;
- submit_bh(READ, bh);
+ submit_bh(REQ_OP_READ, bh);
}
+read_failure:
for (i = nr; i > 0; i--) {
bh = bhs[i - 1];
+ if (unlikely(status)) {
+ if (new_bh && bh) {
+ /* If middle bh fails, let previous bh
+ * finish its read and then put it to
+ * avoid bh leak
+ */
+ if (!buffer_jbd(bh))
+ wait_on_buffer(bh);
+ put_bh(bh);
+ bhs[i - 1] = NULL;
+ } else if (bh && buffer_uptodate(bh)) {
+ clear_buffer_uptodate(bh);
+ }
+ continue;
+ }
+
/* No need to wait on the buffer if it's managed by JBD. */
if (!buffer_jbd(bh))
wait_on_buffer(bh);
@@ -165,8 +179,7 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
* so we can safely record this and loop back
* to cleanup the other buffers. */
status = -EIO;
- put_bh(bh);
- bhs[i - 1] = NULL;
+ goto read_failure;
}
}
@@ -174,6 +187,9 @@ bail:
return status;
}
+/* Caller must provide a bhs[] with all NULL or non-NULL entries, so it
+ * will be easier to handle read failure.
+ */
int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
struct buffer_head *bhs[], int flags,
int (*validate)(struct super_block *sb,
@@ -183,6 +199,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
int i, ignore_cache = 0;
struct buffer_head *bh;
struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+ int new_bh = 0;
trace_ocfs2_read_blocks_begin(ci, (unsigned long long)block, nr, flags);
@@ -208,15 +225,20 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
goto bail;
}
+ /* Don't put buffer head and re-assign it to NULL if it is allocated
+ * outside since the caller can't be aware of this alternation!
+ */
+ new_bh = (bhs[0] == NULL);
+
ocfs2_metadata_cache_io_lock(ci);
for (i = 0 ; i < nr ; i++) {
if (bhs[i] == NULL) {
bhs[i] = sb_getblk(sb, block++);
if (bhs[i] == NULL) {
- ocfs2_metadata_cache_io_unlock(ci);
- status = -EIO;
+ status = -ENOMEM;
mlog_errno(status);
- goto bail;
+ /* Don't forget to put previous bh! */
+ break;
}
}
bh = bhs[i];
@@ -301,22 +323,38 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
continue;
}
- clear_buffer_uptodate(bh);
get_bh(bh); /* for end_buffer_read_sync() */
if (validate)
set_buffer_needs_validate(bh);
bh->b_end_io = end_buffer_read_sync;
- submit_bh(READ, bh);
+ submit_bh(REQ_OP_READ, bh);
continue;
}
}
- status = 0;
-
+read_failure:
for (i = (nr - 1); i >= 0; i--) {
bh = bhs[i];
if (!(flags & OCFS2_BH_READAHEAD)) {
+ if (unlikely(status)) {
+ /* Clear the buffers on error including those
+ * ever succeeded in reading
+ */
+ if (new_bh && bh) {
+ /* If middle bh fails, let previous bh
+ * finish its read and then put it to
+ * avoid bh leak
+ */
+ if (!buffer_jbd(bh))
+ wait_on_buffer(bh);
+ put_bh(bh);
+ bhs[i] = NULL;
+ } else if (bh && buffer_uptodate(bh)) {
+ clear_buffer_uptodate(bh);
+ }
+ continue;
+ }
/* We know this can't have changed as we hold the
* owner sem. Avoid doing any work on the bh if the
* journal has it. */
@@ -331,9 +369,8 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
* for this bh as it's not marked locally
* uptodate. */
status = -EIO;
- put_bh(bh);
- bhs[i] = NULL;
- continue;
+ clear_buffer_needs_validate(bh);
+ goto read_failure;
}
if (buffer_needs_validate(bh)) {
@@ -343,18 +380,16 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
BUG_ON(buffer_jbd(bh));
clear_buffer_needs_validate(bh);
status = validate(sb, bh);
- if (status) {
- put_bh(bh);
- bhs[i] = NULL;
- continue;
- }
+ if (status)
+ goto read_failure;
}
}
/* Always set the buffer in the cache, even if it was
* a forced read, or read-ahead which hasn't yet
* completed. */
- ocfs2_set_buffer_uptodate(ci, bh);
+ if (bh)
+ ocfs2_set_buffer_uptodate(ci, bh);
}
ocfs2_metadata_cache_io_unlock(ci);
@@ -414,13 +449,12 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
get_bh(bh); /* for end_buffer_write_sync() */
bh->b_end_io = end_buffer_write_sync;
ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &di->i_check);
- submit_bh(WRITE, bh);
+ submit_bh(REQ_OP_WRITE, bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
ret = -EIO;
- put_bh(bh);
mlog_errno(ret);
}
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index b97bcc6dde7c..2d51649fc090 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -1,26 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* ocfs2_buffer_head.h
*
* Buffer cache handling functions defined
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_BUFFER_HEAD_IO_H
@@ -28,9 +12,6 @@
#include <linux/buffer_head.h>
-void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
- int uptodate);
-
int ocfs2_write_block(struct ocfs2_super *osb,
struct buffer_head *bh,
struct ocfs2_caching_info *ci);
diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile
index bc8c5e7d8608..0e5b73215819 100644
--- a/fs/ocfs2/cluster/Makefile
+++ b/fs/ocfs2/cluster/Makefile
@@ -1,4 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o
ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \
- quorum.o tcp.o netdebug.o ver.o
+ quorum.o tcp.o netdebug.o
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 5c1c864e81cc..724350925aff 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1,24 +1,9 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* Copyright (C) 2004, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
+#include "linux/kstrtox.h"
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/jiffies.h>
@@ -35,7 +20,8 @@
#include <linux/time.h>
#include <linux/debugfs.h>
#include <linux/slab.h>
-
+#include <linux/bitmap.h>
+#include <linux/ktime.h>
#include "heartbeat.h"
#include "tcp.h"
#include "nodemanager.h"
@@ -105,10 +91,6 @@ static struct o2hb_debug_buf *o2hb_db_failedregions;
#define O2HB_DEBUG_REGION_PINNED "pinned"
static struct dentry *o2hb_debug_dir;
-static struct dentry *o2hb_debug_livenodes;
-static struct dentry *o2hb_debug_liveregions;
-static struct dentry *o2hb_debug_quorumregions;
-static struct dentry *o2hb_debug_failedregions;
static LIST_HEAD(o2hb_all_regions);
@@ -118,21 +100,19 @@ static struct o2hb_callback {
static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type);
-#define O2HB_DEFAULT_BLOCK_BITS 9
-
enum o2hb_heartbeat_modes {
O2HB_HEARTBEAT_LOCAL = 0,
O2HB_HEARTBEAT_GLOBAL,
O2HB_HEARTBEAT_NUM_MODES,
};
-char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = {
- "local", /* O2HB_HEARTBEAT_LOCAL */
- "global", /* O2HB_HEARTBEAT_GLOBAL */
+static const char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = {
+ "local", /* O2HB_HEARTBEAT_LOCAL */
+ "global", /* O2HB_HEARTBEAT_GLOBAL */
};
unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
-unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL;
+static unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL;
/*
* o2hb_dependent_users tracks the number of registered callbacks that depend
@@ -140,7 +120,7 @@ unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL;
* However only o2dlm depends on the heartbeat. It does not want the heartbeat
* to stop while a dlm domain is still active.
*/
-unsigned int o2hb_dependent_users;
+static unsigned int o2hb_dependent_users;
/*
* In global heartbeat mode, all regions are pinned if there are one or more
@@ -218,7 +198,8 @@ struct o2hb_region {
unsigned hr_unclean_stop:1,
hr_aborted_start:1,
hr_item_pinned:1,
- hr_item_dropped:1;
+ hr_item_dropped:1,
+ hr_node_deleted:1;
/* protected by the hr_callback_sem */
struct task_struct *hr_task;
@@ -233,7 +214,7 @@ struct o2hb_region {
unsigned int hr_num_pages;
struct page **hr_slot_data;
- struct block_device *hr_bdev;
+ struct file *hr_bdev_file;
struct o2hb_disk_slot *hr_slots;
/* live node map of this region */
@@ -241,10 +222,6 @@ struct o2hb_region {
unsigned int hr_region_num;
struct dentry *hr_debug_dir;
- struct dentry *hr_debug_livenodes;
- struct dentry *hr_debug_regnum;
- struct dentry *hr_debug_elapsed_time;
- struct dentry *hr_debug_pinned;
struct o2hb_debug_buf *hr_db_livenodes;
struct o2hb_debug_buf *hr_db_regnum;
struct o2hb_debug_buf *hr_db_elapsed_time;
@@ -259,8 +236,6 @@ struct o2hb_region {
* (hr_steady_iterations == 0) within hr_unsteady_iterations */
atomic_t hr_unsteady_iterations;
- char hr_dev_name[BDEVNAME_SIZE];
-
unsigned int hr_timeout_ms;
/* randomized as the region goes up and down so that a node
@@ -270,48 +245,65 @@ struct o2hb_region {
struct delayed_work hr_write_timeout_work;
unsigned long hr_last_timeout_start;
+ /* negotiate timer, used to negotiate extending hb timeout. */
+ struct delayed_work hr_nego_timeout_work;
+ unsigned long hr_nego_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+
/* Used during o2hb_check_slot to hold a copy of the block
* being checked because we temporarily have to zero out the
* crc field. */
struct o2hb_disk_heartbeat_block *hr_tmp_block;
+
+ /* Message key for negotiate timeout message. */
+ unsigned int hr_key;
+ struct list_head hr_handler_list;
+
+ /* last hb status, 0 for success, other value for error. */
+ int hr_last_hb_status;
};
+static inline struct block_device *reg_bdev(struct o2hb_region *reg)
+{
+ return reg->hr_bdev_file ? file_bdev(reg->hr_bdev_file) : NULL;
+}
+
struct o2hb_bio_wait_ctxt {
atomic_t wc_num_reqs;
struct completion wc_io_complete;
int wc_error;
};
-static int o2hb_pop_count(void *map, int count)
-{
- int i = -1, pop = 0;
+#define O2HB_NEGO_TIMEOUT_MS (O2HB_MAX_WRITE_TIMEOUT_MS/2)
- while ((i = find_next_bit(map, count, i + 1)) < count)
- pop++;
- return pop;
-}
+enum {
+ O2HB_NEGO_TIMEOUT_MSG = 1,
+ O2HB_NEGO_APPROVE_MSG = 2,
+};
+
+struct o2hb_nego_msg {
+ u8 node_num;
+};
static void o2hb_write_timeout(struct work_struct *work)
{
int failed, quorum;
- unsigned long flags;
struct o2hb_region *reg =
container_of(work, struct o2hb_region,
hr_write_timeout_work.work);
- mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u "
- "milliseconds\n", reg->hr_dev_name,
+ mlog(ML_ERROR, "Heartbeat write timeout to device %pg after %u "
+ "milliseconds\n", reg_bdev(reg),
jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
if (o2hb_global_heartbeat_active()) {
- spin_lock_irqsave(&o2hb_live_lock, flags);
+ spin_lock(&o2hb_live_lock);
if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
set_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
- failed = o2hb_pop_count(&o2hb_failed_region_bitmap,
+ failed = bitmap_weight(o2hb_failed_region_bitmap,
O2NM_MAX_REGIONS);
- quorum = o2hb_pop_count(&o2hb_quorum_region_bitmap,
+ quorum = bitmap_weight(o2hb_quorum_region_bitmap,
O2NM_MAX_REGIONS);
- spin_unlock_irqrestore(&o2hb_live_lock, flags);
+ spin_unlock(&o2hb_live_lock);
mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n",
quorum, failed);
@@ -327,7 +319,7 @@ static void o2hb_write_timeout(struct work_struct *work)
o2quo_disk_timeout();
}
-static void o2hb_arm_write_timeout(struct o2hb_region *reg)
+static void o2hb_arm_timeout(struct o2hb_region *reg)
{
/* Arm writeout only after thread reaches steady state */
if (atomic_read(&reg->hr_steady_iterations) != 0)
@@ -342,14 +334,134 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg)
spin_unlock(&o2hb_live_lock);
}
cancel_delayed_work(&reg->hr_write_timeout_work);
- reg->hr_last_timeout_start = jiffies;
schedule_delayed_work(&reg->hr_write_timeout_work,
msecs_to_jiffies(O2HB_MAX_WRITE_TIMEOUT_MS));
+
+ cancel_delayed_work(&reg->hr_nego_timeout_work);
+ /* negotiate timeout must be less than write timeout. */
+ schedule_delayed_work(&reg->hr_nego_timeout_work,
+ msecs_to_jiffies(O2HB_NEGO_TIMEOUT_MS));
+ bitmap_zero(reg->hr_nego_node_bitmap, O2NM_MAX_NODES);
}
-static void o2hb_disarm_write_timeout(struct o2hb_region *reg)
+static void o2hb_disarm_timeout(struct o2hb_region *reg)
{
cancel_delayed_work_sync(&reg->hr_write_timeout_work);
+ cancel_delayed_work_sync(&reg->hr_nego_timeout_work);
+}
+
+static int o2hb_send_nego_msg(int key, int type, u8 target)
+{
+ struct o2hb_nego_msg msg;
+ int status, ret;
+
+ msg.node_num = o2nm_this_node();
+again:
+ ret = o2net_send_message(type, key, &msg, sizeof(msg),
+ target, &status);
+
+ if (ret == -EAGAIN || ret == -ENOMEM) {
+ msleep(100);
+ goto again;
+ }
+
+ return ret;
+}
+
+static void o2hb_nego_timeout(struct work_struct *work)
+{
+ unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ int master_node, i, ret;
+ struct o2hb_region *reg;
+
+ reg = container_of(work, struct o2hb_region, hr_nego_timeout_work.work);
+ /* don't negotiate timeout if last hb failed since it is very
+ * possible io failed. Should let write timeout fence self.
+ */
+ if (reg->hr_last_hb_status)
+ return;
+
+ o2hb_fill_node_map(live_node_bitmap, O2NM_MAX_NODES);
+ /* lowest node as master node to make negotiate decision. */
+ master_node = find_first_bit(live_node_bitmap, O2NM_MAX_NODES);
+
+ if (master_node == o2nm_this_node()) {
+ if (!test_bit(master_node, reg->hr_nego_node_bitmap)) {
+ printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%pg).\n",
+ o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000,
+ config_item_name(&reg->hr_item), reg_bdev(reg));
+ set_bit(master_node, reg->hr_nego_node_bitmap);
+ }
+ if (!bitmap_equal(reg->hr_nego_node_bitmap, live_node_bitmap,
+ O2NM_MAX_NODES)) {
+ /* check negotiate bitmap every second to do timeout
+ * approve decision.
+ */
+ schedule_delayed_work(&reg->hr_nego_timeout_work,
+ msecs_to_jiffies(1000));
+
+ return;
+ }
+
+ printk(KERN_NOTICE "o2hb: all nodes hb write hung, maybe region %s (%pg) is down.\n",
+ config_item_name(&reg->hr_item),
+ reg_bdev(reg));
+ /* approve negotiate timeout request. */
+ o2hb_arm_timeout(reg);
+
+ i = -1;
+ while ((i = find_next_bit(live_node_bitmap,
+ O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
+ if (i == master_node)
+ continue;
+
+ mlog(ML_HEARTBEAT, "send NEGO_APPROVE msg to node %d\n", i);
+ ret = o2hb_send_nego_msg(reg->hr_key,
+ O2HB_NEGO_APPROVE_MSG, i);
+ if (ret)
+ mlog(ML_ERROR, "send NEGO_APPROVE msg to node %d fail %d\n",
+ i, ret);
+ }
+ } else {
+ /* negotiate timeout with master node. */
+ printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%pg), negotiate timeout with node %d.\n",
+ o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, config_item_name(&reg->hr_item),
+ reg_bdev(reg), master_node);
+ ret = o2hb_send_nego_msg(reg->hr_key, O2HB_NEGO_TIMEOUT_MSG,
+ master_node);
+ if (ret)
+ mlog(ML_ERROR, "send NEGO_TIMEOUT msg to node %d fail %d\n",
+ master_node, ret);
+ }
+}
+
+static int o2hb_nego_timeout_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
+{
+ struct o2hb_region *reg = data;
+ struct o2hb_nego_msg *nego_msg;
+
+ nego_msg = (struct o2hb_nego_msg *)msg->buf;
+ printk(KERN_NOTICE "o2hb: receive negotiate timeout message from node %d on region %s (%pg).\n",
+ nego_msg->node_num, config_item_name(&reg->hr_item),
+ reg_bdev(reg));
+ if (nego_msg->node_num < O2NM_MAX_NODES)
+ set_bit(nego_msg->node_num, reg->hr_nego_node_bitmap);
+ else
+ mlog(ML_ERROR, "got nego timeout message from bad node.\n");
+
+ return 0;
+}
+
+static int o2hb_nego_approve_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
+{
+ struct o2hb_region *reg = data;
+
+ printk(KERN_NOTICE "o2hb: negotiate timeout approved by master node on region %s (%pg).\n",
+ config_item_name(&reg->hr_item), reg_bdev(reg));
+ o2hb_arm_timeout(reg);
+ return 0;
}
static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc)
@@ -373,21 +485,19 @@ static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc,
}
}
-static void o2hb_wait_on_io(struct o2hb_region *reg,
- struct o2hb_bio_wait_ctxt *wc)
+static void o2hb_wait_on_io(struct o2hb_bio_wait_ctxt *wc)
{
o2hb_bio_wait_dec(wc, 1);
wait_for_completion(&wc->wc_io_complete);
}
-static void o2hb_bio_end_io(struct bio *bio,
- int error)
+static void o2hb_bio_end_io(struct bio *bio)
{
struct o2hb_bio_wait_ctxt *wc = bio->bi_private;
- if (error) {
- mlog(ML_ERROR, "IO Error %d\n", error);
- wc->wc_error = error;
+ if (bio->bi_status) {
+ mlog(ML_ERROR, "IO Error %d\n", bio->bi_status);
+ wc->wc_error = blk_status_to_errno(bio->bi_status);
}
o2hb_bio_wait_dec(wc, 1);
@@ -399,7 +509,7 @@ static void o2hb_bio_end_io(struct bio *bio,
static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
struct o2hb_bio_wait_ctxt *wc,
unsigned int *current_slot,
- unsigned int max_slots)
+ unsigned int max_slots, blk_opf_t opf)
{
int len, current_page;
unsigned int vec_len, vec_start;
@@ -413,7 +523,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
* GFP_KERNEL that the local node can get fenced. It would be
* nicest if we could pre-allocate these bios and avoid this
* all together. */
- bio = bio_alloc(GFP_ATOMIC, 16);
+ bio = bio_alloc(reg_bdev(reg), 16, opf, GFP_ATOMIC);
if (!bio) {
mlog(ML_ERROR, "Could not alloc slots BIO!\n");
bio = ERR_PTR(-ENOMEM);
@@ -421,18 +531,17 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
}
/* Must put everything in 512 byte sectors for the bio... */
- bio->bi_sector = (reg->hr_start_block + cs) << (bits - 9);
- bio->bi_bdev = reg->hr_bdev;
+ bio->bi_iter.bi_sector = (reg->hr_start_block + cs) << (bits - 9);
bio->bi_private = wc;
bio->bi_end_io = o2hb_bio_end_io;
- vec_start = (cs << bits) % PAGE_CACHE_SIZE;
+ vec_start = (cs << bits) % PAGE_SIZE;
while(cs < max_slots) {
current_page = cs / spp;
page = reg->hr_slot_data[current_page];
- vec_len = min(PAGE_CACHE_SIZE - vec_start,
- (max_slots-cs) * (PAGE_CACHE_SIZE/spp) );
+ vec_len = min(PAGE_SIZE - vec_start,
+ (max_slots-cs) * (PAGE_SIZE/spp) );
mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n",
current_page, vec_len, vec_start);
@@ -440,7 +549,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
len = bio_add_page(bio, page, vec_len, vec_start);
if (len != vec_len) break;
- cs += vec_len / (PAGE_CACHE_SIZE/spp);
+ cs += vec_len / (PAGE_SIZE/spp);
vec_start = 0;
}
@@ -450,9 +559,10 @@ bail:
}
static int o2hb_read_slots(struct o2hb_region *reg,
+ unsigned int begin_slot,
unsigned int max_slots)
{
- unsigned int current_slot=0;
+ unsigned int current_slot = begin_slot;
int status;
struct o2hb_bio_wait_ctxt wc;
struct bio *bio;
@@ -460,7 +570,8 @@ static int o2hb_read_slots(struct o2hb_region *reg,
o2hb_bio_wait_init(&wc);
while(current_slot < max_slots) {
- bio = o2hb_setup_one_bio(reg, &wc, &current_slot, max_slots);
+ bio = o2hb_setup_one_bio(reg, &wc, &current_slot, max_slots,
+ REQ_OP_READ);
if (IS_ERR(bio)) {
status = PTR_ERR(bio);
mlog_errno(status);
@@ -468,13 +579,13 @@ static int o2hb_read_slots(struct o2hb_region *reg,
}
atomic_inc(&wc.wc_num_reqs);
- submit_bio(READ, bio);
+ submit_bio(bio);
}
status = 0;
bail_and_wait:
- o2hb_wait_on_io(reg, &wc);
+ o2hb_wait_on_io(&wc);
if (wc.wc_error && !status)
status = wc.wc_error;
@@ -492,7 +603,8 @@ static int o2hb_issue_node_write(struct o2hb_region *reg,
slot = o2nm_this_node();
- bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1);
+ bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1,
+ REQ_OP_WRITE | REQ_SYNC);
if (IS_ERR(bio)) {
status = PTR_ERR(bio);
mlog_errno(status);
@@ -500,7 +612,7 @@ static int o2hb_issue_node_write(struct o2hb_region *reg,
}
atomic_inc(&write_wc->wc_num_reqs);
- submit_bio(WRITE_SYNC, bio);
+ submit_bio(bio);
status = 0;
bail:
@@ -582,8 +694,8 @@ static int o2hb_check_own_slot(struct o2hb_region *reg)
else
errstr = ERRSTR3;
- mlog(ML_ERROR, "%s (%s): expected(%u:0x%llx, 0x%llx), "
- "ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg->hr_dev_name,
+ mlog(ML_ERROR, "%s (%pg): expected(%u:0x%llx, 0x%llx), "
+ "ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg_bdev(reg),
slot->ds_node_num, (unsigned long long)slot->ds_last_generation,
(unsigned long long)slot->ds_last_time, hb_block->hb_node,
(unsigned long long)le64_to_cpu(hb_block->hb_generation),
@@ -606,7 +718,7 @@ static inline void o2hb_prepare_block(struct o2hb_region *reg,
hb_block = (struct o2hb_disk_heartbeat_block *)slot->ds_raw_block;
memset(hb_block, 0, reg->hr_block_bytes);
/* TODO: time stuff */
- cputime = CURRENT_TIME.tv_sec;
+ cputime = ktime_get_real_seconds();
if (!cputime)
cputime = 1;
@@ -628,11 +740,9 @@ static void o2hb_fire_callbacks(struct o2hb_callback *hbcall,
struct o2nm_node *node,
int idx)
{
- struct list_head *iter;
struct o2hb_callback_func *f;
- list_for_each(iter, &hbcall->list) {
- f = list_entry(iter, struct o2hb_callback_func, hc_item);
+ list_for_each_entry(f, &hbcall->list, hc_item) {
mlog(ML_HEARTBEAT, "calling funcs %p\n", f);
(f->hc_func)(node, idx, f->hc_data);
}
@@ -641,16 +751,9 @@ static void o2hb_fire_callbacks(struct o2hb_callback *hbcall,
/* Will run the list in order until we process the passed event */
static void o2hb_run_event_list(struct o2hb_node_event *queued_event)
{
- int empty;
struct o2hb_callback *hbcall;
struct o2hb_node_event *event;
- spin_lock(&o2hb_live_lock);
- empty = list_empty(&queued_event->hn_item);
- spin_unlock(&o2hb_live_lock);
- if (empty)
- return;
-
/* Holding callback sem assures we don't alter the callback
* lists when doing this, and serializes ourselves with other
* processes wanting callbacks. */
@@ -709,6 +812,7 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
struct o2hb_node_event event =
{ .hn_item = LIST_HEAD_INIT(event.hn_item), };
struct o2nm_node *node;
+ int queued = 0;
node = o2nm_get_node_by_num(slot->ds_node_num);
if (!node)
@@ -726,11 +830,13 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node,
slot->ds_node_num);
+ queued = 1;
}
}
spin_unlock(&o2hb_live_lock);
- o2hb_run_event_list(&event);
+ if (queued)
+ o2hb_run_event_list(&event);
o2nm_node_put(node);
}
@@ -758,12 +864,12 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg)
* live nodes heartbeat on it. In other words, the region has been
* added to all nodes.
*/
- if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap,
- sizeof(o2hb_live_node_bitmap)))
+ if (!bitmap_equal(reg->hr_live_node_bitmap, o2hb_live_node_bitmap,
+ O2NM_MAX_NODES))
goto unlock;
- printk(KERN_NOTICE "o2hb: Region %s (%s) is now a quorum device\n",
- config_item_name(&reg->hr_item), reg->hr_dev_name);
+ printk(KERN_NOTICE "o2hb: Region %s (%pg) is now a quorum device\n",
+ config_item_name(&reg->hr_item), reg_bdev(reg));
set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
@@ -771,7 +877,7 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg)
* If global heartbeat active, unpin all regions if the
* region count > CUT_OFF
*/
- if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
+ if (bitmap_weight(o2hb_quorum_region_bitmap,
O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF)
o2hb_region_unpin(NULL);
unlock:
@@ -790,6 +896,7 @@ static int o2hb_check_slot(struct o2hb_region *reg,
unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS;
unsigned int slot_dead_ms;
int tmp;
+ int queued = 0;
memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes);
@@ -820,8 +927,8 @@ static int o2hb_check_slot(struct o2hb_region *reg,
/* The node is live but pushed out a bad crc. We
* consider it a transient miss but don't populate any
* other values as they may be junk. */
- mlog(ML_ERROR, "Node %d has written a bad crc to %s\n",
- slot->ds_node_num, reg->hr_dev_name);
+ mlog(ML_ERROR, "Node %d has written a bad crc to %pg\n",
+ slot->ds_node_num, reg_bdev(reg));
o2hb_dump_slot(hb_block);
slot->ds_equal_samples++;
@@ -883,6 +990,7 @@ fire_callbacks:
slot->ds_node_num);
changed = 1;
+ queued = 1;
}
list_add_tail(&slot->ds_live_item,
@@ -899,12 +1007,12 @@ fire_callbacks:
slot_dead_ms = le32_to_cpu(hb_block->hb_dead_ms);
if (slot_dead_ms && slot_dead_ms != dead_ms) {
/* TODO: Perhaps we can fail the region here. */
- mlog(ML_ERROR, "Node %d on device %s has a dead count "
+ mlog(ML_ERROR, "Node %d on device %pg has a dead count "
"of %u ms, but our count is %u ms.\n"
"Please double check your configuration values "
"for 'O2CB_HEARTBEAT_THRESHOLD'\n",
- slot->ds_node_num, reg->hr_dev_name, slot_dead_ms,
- dead_ms);
+ slot->ds_node_num, reg_bdev(reg),
+ slot_dead_ms, dead_ms);
}
goto out;
}
@@ -913,7 +1021,7 @@ fire_callbacks:
if (list_empty(&slot->ds_live_item))
goto out;
- /* live nodes only go dead after enough consequtive missed
+ /* live nodes only go dead after enough consecutive missed
* samples.. reset the missed counter whenever we see
* activity */
if (slot->ds_equal_samples >= o2hb_dead_threshold || gen_changed) {
@@ -934,6 +1042,7 @@ fire_callbacks:
node, slot->ds_node_num);
changed = 1;
+ queued = 1;
}
/* We don't clear this because the node is still
@@ -949,35 +1058,27 @@ fire_callbacks:
out:
spin_unlock(&o2hb_live_lock);
- o2hb_run_event_list(&event);
+ if (queued)
+ o2hb_run_event_list(&event);
if (node)
o2nm_node_put(node);
return changed;
}
-/* This could be faster if we just implmented a find_last_bit, but I
- * don't think the circumstances warrant it. */
-static int o2hb_highest_node(unsigned long *nodes,
- int numbits)
+static int o2hb_highest_node(unsigned long *nodes, int numbits)
{
- int highest, node;
-
- highest = numbits;
- node = -1;
- while ((node = find_next_bit(nodes, numbits, node + 1)) != -1) {
- if (node >= numbits)
- break;
-
- highest = node;
- }
+ return find_last_bit(nodes, numbits);
+}
- return highest;
+static int o2hb_lowest_node(unsigned long *nodes, int numbits)
+{
+ return find_first_bit(nodes, numbits);
}
static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
{
- int i, ret, highest_node;
+ int i, ret, highest_node, lowest_node;
int membership_change = 0, own_slot_ok = 0;
unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
@@ -994,7 +1095,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
* If a node is not configured but is in the livemap, we still need
* to read the slot so as to be able to remove it from the livemap.
*/
- o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap));
+ o2hb_fill_node_map(live_node_bitmap, O2NM_MAX_NODES);
i = -1;
while ((i = find_next_bit(live_node_bitmap,
O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
@@ -1002,7 +1103,8 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
}
highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
- if (highest_node >= O2NM_MAX_NODES) {
+ lowest_node = o2hb_lowest_node(configured_nodes, O2NM_MAX_NODES);
+ if (highest_node >= O2NM_MAX_NODES || lowest_node >= O2NM_MAX_NODES) {
mlog(ML_NOTICE, "o2hb: No configured nodes found!\n");
ret = -EINVAL;
goto bail;
@@ -1012,7 +1114,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
* yet. Of course, if the node definitions have holes in them
* then we're reading an empty slot anyway... Consider this
* best-effort. */
- ret = o2hb_read_slots(reg, highest_node + 1);
+ ret = o2hb_read_slots(reg, lowest_node, highest_node + 1);
if (ret < 0) {
mlog_errno(ret);
goto bail;
@@ -1043,13 +1145,13 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
* before we can go to steady state. This ensures that
* people we find in our steady state have seen us.
*/
- o2hb_wait_on_io(reg, &write_wc);
+ o2hb_wait_on_io(&write_wc);
if (write_wc.wc_error) {
/* Do not re-arm the write timeout on I/O error - we
* can't be sure that the new block ever made it to
* disk */
- mlog(ML_ERROR, "Write error %d on device \"%s\"\n",
- write_wc.wc_error, reg->hr_dev_name);
+ mlog(ML_ERROR, "Write error %d on device \"%pg\"\n",
+ write_wc.wc_error, reg_bdev(reg));
ret = write_wc.wc_error;
goto bail;
}
@@ -1057,7 +1159,8 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
/* Skip disarming the timeout if own slot has stale/bad data */
if (own_slot_ok) {
o2hb_set_quorum_device(reg);
- o2hb_arm_write_timeout(reg);
+ o2hb_arm_timeout(reg);
+ reg->hr_last_timeout_start = jiffies;
}
bail:
@@ -1072,9 +1175,9 @@ bail:
if (atomic_read(&reg->hr_steady_iterations) != 0) {
if (atomic_dec_and_test(&reg->hr_unsteady_iterations)) {
printk(KERN_NOTICE "o2hb: Unable to stabilize "
- "heartbeart on region %s (%s)\n",
+ "heartbeat on region %s (%pg)\n",
config_item_name(&reg->hr_item),
- reg->hr_dev_name);
+ reg_bdev(reg));
atomic_set(&reg->hr_steady_iterations, 0);
reg->hr_aborted_start = 1;
wake_up(&o2hb_steady_queue);
@@ -1085,37 +1188,6 @@ bail:
return ret;
}
-/* Subtract b from a, storing the result in a. a *must* have a larger
- * value than b. */
-static void o2hb_tv_subtract(struct timeval *a,
- struct timeval *b)
-{
- /* just return 0 when a is after b */
- if (a->tv_sec < b->tv_sec ||
- (a->tv_sec == b->tv_sec && a->tv_usec < b->tv_usec)) {
- a->tv_sec = 0;
- a->tv_usec = 0;
- return;
- }
-
- a->tv_sec -= b->tv_sec;
- a->tv_usec -= b->tv_usec;
- while ( a->tv_usec < 0 ) {
- a->tv_sec--;
- a->tv_usec += 1000000;
- }
-}
-
-static unsigned int o2hb_elapsed_msecs(struct timeval *start,
- struct timeval *end)
-{
- struct timeval res = *end;
-
- o2hb_tv_subtract(&res, start);
-
- return res.tv_sec * 1000 + res.tv_usec / 1000;
-}
-
/*
* we ride the region ref that the region dir holds. before the region
* dir is removed and drops it ref it will wait to tear down this
@@ -1126,15 +1198,21 @@ static int o2hb_thread(void *data)
int i, ret;
struct o2hb_region *reg = data;
struct o2hb_bio_wait_ctxt write_wc;
- struct timeval before_hb, after_hb;
+ ktime_t before_hb, after_hb;
unsigned int elapsed_msec;
mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n");
- set_user_nice(current, -20);
+ set_user_nice(current, MIN_NICE);
/* Pin node */
- o2nm_depend_this_node();
+ ret = o2nm_depend_this_node();
+ if (ret) {
+ mlog(ML_ERROR, "Node has been deleted, ret = %d\n", ret);
+ reg->hr_node_deleted = 1;
+ wake_up(&o2hb_steady_queue);
+ return 0;
+ }
while (!kthread_should_stop() &&
!reg->hr_unclean_stop && !reg->hr_aborted_start) {
@@ -1143,18 +1221,19 @@ static int o2hb_thread(void *data)
* hr_timeout_ms between disk writes. On busy systems
* this should result in a heartbeat which is less
* likely to time itself out. */
- do_gettimeofday(&before_hb);
+ before_hb = ktime_get_real();
ret = o2hb_do_disk_heartbeat(reg);
+ reg->hr_last_hb_status = ret;
+
+ after_hb = ktime_get_real();
- do_gettimeofday(&after_hb);
- elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
+ elapsed_msec = (unsigned int)
+ ktime_ms_delta(after_hb, before_hb);
mlog(ML_HEARTBEAT,
- "start = %lu.%lu, end = %lu.%lu, msec = %u\n",
- before_hb.tv_sec, (unsigned long) before_hb.tv_usec,
- after_hb.tv_sec, (unsigned long) after_hb.tv_usec,
- elapsed_msec);
+ "start = %lld, end = %lld, msec = %u, ret = %d\n",
+ before_hb, after_hb, elapsed_msec, ret);
if (!kthread_should_stop() &&
elapsed_msec < reg->hr_timeout_ms) {
@@ -1164,7 +1243,7 @@ static int o2hb_thread(void *data)
}
}
- o2hb_disarm_write_timeout(reg);
+ o2hb_disarm_timeout(reg);
/* unclean stop is only used in very bad situation */
for(i = 0; !reg->hr_unclean_stop && i < reg->hr_blocks; i++)
@@ -1179,7 +1258,7 @@ static int o2hb_thread(void *data)
o2hb_prepare_block(reg, 0);
ret = o2hb_issue_node_write(reg, &write_wc);
if (ret == 0)
- o2hb_wait_on_io(reg, &write_wc);
+ o2hb_wait_on_io(&write_wc);
else
mlog_errno(ret);
}
@@ -1229,7 +1308,7 @@ static int o2hb_debug_open(struct inode *inode, struct file *file)
case O2HB_DB_TYPE_REGION_NUMBER:
reg = (struct o2hb_region *)db->db_data;
- out += snprintf(buf + out, PAGE_SIZE - out, "%d\n",
+ out += scnprintf(buf + out, PAGE_SIZE - out, "%d\n",
reg->hr_region_num);
goto done;
@@ -1239,12 +1318,12 @@ static int o2hb_debug_open(struct inode *inode, struct file *file)
/* If 0, it has never been set before */
if (lts)
lts = jiffies_to_msecs(jiffies - lts);
- out += snprintf(buf + out, PAGE_SIZE - out, "%lu\n", lts);
+ out += scnprintf(buf + out, PAGE_SIZE - out, "%lu\n", lts);
goto done;
case O2HB_DB_TYPE_REGION_PINNED:
reg = (struct o2hb_region *)db->db_data;
- out += snprintf(buf + out, PAGE_SIZE - out, "%u\n",
+ out += scnprintf(buf + out, PAGE_SIZE - out, "%u\n",
!!reg->hr_item_pinned);
goto done;
@@ -1253,8 +1332,8 @@ static int o2hb_debug_open(struct inode *inode, struct file *file)
}
while ((i = find_next_bit(map, db->db_len, i + 1)) < db->db_len)
- out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
- out += snprintf(buf + out, PAGE_SIZE - out, "\n");
+ out += scnprintf(buf + out, PAGE_SIZE - out, "%d ", i);
+ out += scnprintf(buf + out, PAGE_SIZE - out, "\n");
done:
i_size_write(inode, out);
@@ -1303,107 +1382,60 @@ static const struct file_operations o2hb_debug_fops = {
void o2hb_exit(void)
{
+ debugfs_remove_recursive(o2hb_debug_dir);
kfree(o2hb_db_livenodes);
kfree(o2hb_db_liveregions);
kfree(o2hb_db_quorumregions);
kfree(o2hb_db_failedregions);
- debugfs_remove(o2hb_debug_failedregions);
- debugfs_remove(o2hb_debug_quorumregions);
- debugfs_remove(o2hb_debug_liveregions);
- debugfs_remove(o2hb_debug_livenodes);
- debugfs_remove(o2hb_debug_dir);
}
-static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir,
- struct o2hb_debug_buf **db, int db_len,
- int type, int size, int len, void *data)
+static void o2hb_debug_create(const char *name, struct dentry *dir,
+ struct o2hb_debug_buf **db, int db_len, int type,
+ int size, int len, void *data)
{
*db = kmalloc(db_len, GFP_KERNEL);
if (!*db)
- return NULL;
+ return;
(*db)->db_type = type;
(*db)->db_size = size;
(*db)->db_len = len;
(*db)->db_data = data;
- return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db,
- &o2hb_debug_fops);
+ debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db, &o2hb_debug_fops);
}
-static int o2hb_debug_init(void)
+static void o2hb_debug_init(void)
{
- int ret = -ENOMEM;
-
o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
- if (!o2hb_debug_dir) {
- mlog_errno(ret);
- goto bail;
- }
-
- o2hb_debug_livenodes = o2hb_debug_create(O2HB_DEBUG_LIVENODES,
- o2hb_debug_dir,
- &o2hb_db_livenodes,
- sizeof(*o2hb_db_livenodes),
- O2HB_DB_TYPE_LIVENODES,
- sizeof(o2hb_live_node_bitmap),
- O2NM_MAX_NODES,
- o2hb_live_node_bitmap);
- if (!o2hb_debug_livenodes) {
- mlog_errno(ret);
- goto bail;
- }
- o2hb_debug_liveregions = o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS,
- o2hb_debug_dir,
- &o2hb_db_liveregions,
- sizeof(*o2hb_db_liveregions),
- O2HB_DB_TYPE_LIVEREGIONS,
- sizeof(o2hb_live_region_bitmap),
- O2NM_MAX_REGIONS,
- o2hb_live_region_bitmap);
- if (!o2hb_debug_liveregions) {
- mlog_errno(ret);
- goto bail;
- }
+ o2hb_debug_create(O2HB_DEBUG_LIVENODES, o2hb_debug_dir,
+ &o2hb_db_livenodes, sizeof(*o2hb_db_livenodes),
+ O2HB_DB_TYPE_LIVENODES, sizeof(o2hb_live_node_bitmap),
+ O2NM_MAX_NODES, o2hb_live_node_bitmap);
- o2hb_debug_quorumregions =
- o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS,
- o2hb_debug_dir,
- &o2hb_db_quorumregions,
- sizeof(*o2hb_db_quorumregions),
- O2HB_DB_TYPE_QUORUMREGIONS,
- sizeof(o2hb_quorum_region_bitmap),
- O2NM_MAX_REGIONS,
- o2hb_quorum_region_bitmap);
- if (!o2hb_debug_quorumregions) {
- mlog_errno(ret);
- goto bail;
- }
+ o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS, o2hb_debug_dir,
+ &o2hb_db_liveregions, sizeof(*o2hb_db_liveregions),
+ O2HB_DB_TYPE_LIVEREGIONS,
+ sizeof(o2hb_live_region_bitmap), O2NM_MAX_REGIONS,
+ o2hb_live_region_bitmap);
- o2hb_debug_failedregions =
- o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS,
- o2hb_debug_dir,
- &o2hb_db_failedregions,
- sizeof(*o2hb_db_failedregions),
- O2HB_DB_TYPE_FAILEDREGIONS,
- sizeof(o2hb_failed_region_bitmap),
- O2NM_MAX_REGIONS,
- o2hb_failed_region_bitmap);
- if (!o2hb_debug_failedregions) {
- mlog_errno(ret);
- goto bail;
- }
+ o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS, o2hb_debug_dir,
+ &o2hb_db_quorumregions,
+ sizeof(*o2hb_db_quorumregions),
+ O2HB_DB_TYPE_QUORUMREGIONS,
+ sizeof(o2hb_quorum_region_bitmap), O2NM_MAX_REGIONS,
+ o2hb_quorum_region_bitmap);
- ret = 0;
-bail:
- if (ret)
- o2hb_exit();
-
- return ret;
+ o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS, o2hb_debug_dir,
+ &o2hb_db_failedregions,
+ sizeof(*o2hb_db_failedregions),
+ O2HB_DB_TYPE_FAILEDREGIONS,
+ sizeof(o2hb_failed_region_bitmap), O2NM_MAX_REGIONS,
+ o2hb_failed_region_bitmap);
}
-int o2hb_init(void)
+void o2hb_init(void)
{
int i;
@@ -1413,38 +1445,34 @@ int o2hb_init(void)
for (i = 0; i < ARRAY_SIZE(o2hb_live_slots); i++)
INIT_LIST_HEAD(&o2hb_live_slots[i]);
- INIT_LIST_HEAD(&o2hb_node_events);
-
- memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
- memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap));
- memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap));
- memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap));
- memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap));
+ bitmap_zero(o2hb_live_node_bitmap, O2NM_MAX_NODES);
+ bitmap_zero(o2hb_region_bitmap, O2NM_MAX_REGIONS);
+ bitmap_zero(o2hb_live_region_bitmap, O2NM_MAX_REGIONS);
+ bitmap_zero(o2hb_quorum_region_bitmap, O2NM_MAX_REGIONS);
+ bitmap_zero(o2hb_failed_region_bitmap, O2NM_MAX_REGIONS);
o2hb_dependent_users = 0;
- return o2hb_debug_init();
+ o2hb_debug_init();
}
/* if we're already in a callback then we're already serialized by the sem */
static void o2hb_fill_node_map_from_callback(unsigned long *map,
- unsigned bytes)
+ unsigned int bits)
{
- BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long)));
-
- memcpy(map, &o2hb_live_node_bitmap, bytes);
+ bitmap_copy(map, o2hb_live_node_bitmap, bits);
}
/*
* get a map of all nodes that are heartbeating in any regions
*/
-void o2hb_fill_node_map(unsigned long *map, unsigned bytes)
+void o2hb_fill_node_map(unsigned long *map, unsigned int bits)
{
/* callers want to serialize this map and callbacks so that they
* can trust that they don't miss nodes coming to the party */
down_read(&o2hb_callback_sem);
spin_lock(&o2hb_live_lock);
- o2hb_fill_node_map_from_callback(map, bytes);
+ o2hb_fill_node_map_from_callback(map, bits);
spin_unlock(&o2hb_live_lock);
up_read(&o2hb_callback_sem);
}
@@ -1469,7 +1497,7 @@ static void o2hb_region_release(struct config_item *item)
struct page *page;
struct o2hb_region *reg = to_o2hb_region(item);
- mlog(ML_HEARTBEAT, "hb region release (%s)\n", reg->hr_dev_name);
+ mlog(ML_HEARTBEAT, "hb region release (%pg)\n", reg_bdev(reg));
kfree(reg->hr_tmp_block);
@@ -1482,38 +1510,37 @@ static void o2hb_region_release(struct config_item *item)
kfree(reg->hr_slot_data);
}
- if (reg->hr_bdev)
- blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE);
+ if (reg->hr_bdev_file)
+ fput(reg->hr_bdev_file);
kfree(reg->hr_slots);
- kfree(reg->hr_db_regnum);
+ debugfs_remove_recursive(reg->hr_debug_dir);
kfree(reg->hr_db_livenodes);
- debugfs_remove(reg->hr_debug_livenodes);
- debugfs_remove(reg->hr_debug_regnum);
- debugfs_remove(reg->hr_debug_elapsed_time);
- debugfs_remove(reg->hr_debug_pinned);
- debugfs_remove(reg->hr_debug_dir);
+ kfree(reg->hr_db_regnum);
+ kfree(reg->hr_db_elapsed_time);
+ kfree(reg->hr_db_pinned);
spin_lock(&o2hb_live_lock);
list_del(&reg->hr_all_item);
spin_unlock(&o2hb_live_lock);
+ o2net_unregister_handler_list(&reg->hr_handler_list);
kfree(reg);
}
static int o2hb_read_block_input(struct o2hb_region *reg,
const char *page,
- size_t count,
unsigned long *ret_bytes,
unsigned int *ret_bits)
{
unsigned long bytes;
char *p = (char *)page;
+ int ret;
- bytes = simple_strtoul(p, &p, 0);
- if (!p || (*p && (*p != '\n')))
- return -EINVAL;
+ ret = kstrtoul(p, 0, &bytes);
+ if (ret)
+ return ret;
/* Heartbeat and fs min / max block sizes are the same. */
if (bytes > 4096 || bytes < 512)
@@ -1529,25 +1556,26 @@ static int o2hb_read_block_input(struct o2hb_region *reg,
return 0;
}
-static ssize_t o2hb_region_block_bytes_read(struct o2hb_region *reg,
+static ssize_t o2hb_region_block_bytes_show(struct config_item *item,
char *page)
{
- return sprintf(page, "%u\n", reg->hr_block_bytes);
+ return sprintf(page, "%u\n", to_o2hb_region(item)->hr_block_bytes);
}
-static ssize_t o2hb_region_block_bytes_write(struct o2hb_region *reg,
+static ssize_t o2hb_region_block_bytes_store(struct config_item *item,
const char *page,
size_t count)
{
+ struct o2hb_region *reg = to_o2hb_region(item);
int status;
unsigned long block_bytes;
unsigned int block_bits;
- if (reg->hr_bdev)
+ if (reg->hr_bdev_file)
return -EINVAL;
- status = o2hb_read_block_input(reg, page, count,
- &block_bytes, &block_bits);
+ status = o2hb_read_block_input(reg, page, &block_bytes,
+ &block_bits);
if (status)
return status;
@@ -1557,24 +1585,26 @@ static ssize_t o2hb_region_block_bytes_write(struct o2hb_region *reg,
return count;
}
-static ssize_t o2hb_region_start_block_read(struct o2hb_region *reg,
+static ssize_t o2hb_region_start_block_show(struct config_item *item,
char *page)
{
- return sprintf(page, "%llu\n", reg->hr_start_block);
+ return sprintf(page, "%llu\n", to_o2hb_region(item)->hr_start_block);
}
-static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg,
+static ssize_t o2hb_region_start_block_store(struct config_item *item,
const char *page,
size_t count)
{
+ struct o2hb_region *reg = to_o2hb_region(item);
unsigned long long tmp;
char *p = (char *)page;
+ ssize_t ret;
- if (reg->hr_bdev)
+ if (reg->hr_bdev_file)
return -EINVAL;
- tmp = simple_strtoull(p, &p, 0);
- if (!p || (*p && (*p != '\n')))
+ ret = kstrtoull(p, 0, &tmp);
+ if (ret)
return -EINVAL;
reg->hr_start_block = tmp;
@@ -1582,25 +1612,26 @@ static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg,
return count;
}
-static ssize_t o2hb_region_blocks_read(struct o2hb_region *reg,
- char *page)
+static ssize_t o2hb_region_blocks_show(struct config_item *item, char *page)
{
- return sprintf(page, "%d\n", reg->hr_blocks);
+ return sprintf(page, "%d\n", to_o2hb_region(item)->hr_blocks);
}
-static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg,
+static ssize_t o2hb_region_blocks_store(struct config_item *item,
const char *page,
size_t count)
{
+ struct o2hb_region *reg = to_o2hb_region(item);
unsigned long tmp;
char *p = (char *)page;
+ int ret;
- if (reg->hr_bdev)
+ if (reg->hr_bdev_file)
return -EINVAL;
- tmp = simple_strtoul(p, &p, 0);
- if (!p || (*p && (*p != '\n')))
- return -EINVAL;
+ ret = kstrtoul(p, 0, &tmp);
+ if (ret)
+ return ret;
if (tmp > O2NM_MAX_NODES || tmp == 0)
return -ERANGE;
@@ -1610,20 +1641,19 @@ static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg,
return count;
}
-static ssize_t o2hb_region_dev_read(struct o2hb_region *reg,
- char *page)
+static ssize_t o2hb_region_dev_show(struct config_item *item, char *page)
{
unsigned int ret = 0;
- if (reg->hr_bdev)
- ret = sprintf(page, "%s\n", reg->hr_dev_name);
+ if (to_o2hb_region(item)->hr_bdev_file)
+ ret = sprintf(page, "%pg\n", reg_bdev(to_o2hb_region(item)));
return ret;
}
static void o2hb_init_region_params(struct o2hb_region *reg)
{
- reg->hr_slots_per_page = PAGE_CACHE_SIZE >> reg->hr_block_bits;
+ reg->hr_slots_per_page = PAGE_SIZE >> reg->hr_block_bits;
reg->hr_timeout_ms = O2HB_REGION_TIMEOUT_MS;
mlog(ML_HEARTBEAT, "hr_start_block = %llu, hr_blocks = %u\n",
@@ -1644,17 +1674,13 @@ static int o2hb_map_slot_data(struct o2hb_region *reg)
struct o2hb_disk_slot *slot;
reg->hr_tmp_block = kmalloc(reg->hr_block_bytes, GFP_KERNEL);
- if (reg->hr_tmp_block == NULL) {
- mlog_errno(-ENOMEM);
+ if (reg->hr_tmp_block == NULL)
return -ENOMEM;
- }
reg->hr_slots = kcalloc(reg->hr_blocks,
sizeof(struct o2hb_disk_slot), GFP_KERNEL);
- if (reg->hr_slots == NULL) {
- mlog_errno(-ENOMEM);
+ if (reg->hr_slots == NULL)
return -ENOMEM;
- }
for(i = 0; i < reg->hr_blocks; i++) {
slot = &reg->hr_slots[i];
@@ -1670,17 +1696,13 @@ static int o2hb_map_slot_data(struct o2hb_region *reg)
reg->hr_slot_data = kcalloc(reg->hr_num_pages, sizeof(struct page *),
GFP_KERNEL);
- if (!reg->hr_slot_data) {
- mlog_errno(-ENOMEM);
+ if (!reg->hr_slot_data)
return -ENOMEM;
- }
for(i = 0; i < reg->hr_num_pages; i++) {
page = alloc_page(GFP_KERNEL);
- if (!page) {
- mlog_errno(-ENOMEM);
+ if (!page)
return -ENOMEM;
- }
reg->hr_slot_data[i] = page;
@@ -1711,11 +1733,9 @@ static int o2hb_populate_slot_data(struct o2hb_region *reg)
struct o2hb_disk_slot *slot;
struct o2hb_disk_heartbeat_block *hb_block;
- ret = o2hb_read_slots(reg, reg->hr_blocks);
- if (ret) {
- mlog_errno(ret);
+ ret = o2hb_read_slots(reg, 0, reg->hr_blocks);
+ if (ret)
goto out;
- }
/* We only want to get an idea of the values initially in each
* slot, so we do no verification - o2hb_check_slot will
@@ -1735,61 +1755,57 @@ out:
return ret;
}
-/* this is acting as commit; we set up all of hr_bdev and hr_task or nothing */
-static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
+/*
+ * this is acting as commit; we set up all of hr_bdev_file and hr_task or
+ * nothing
+ */
+static ssize_t o2hb_region_dev_store(struct config_item *item,
const char *page,
size_t count)
{
+ struct o2hb_region *reg = to_o2hb_region(item);
struct task_struct *hb_task;
long fd;
int sectsize;
char *p = (char *)page;
- struct fd f;
- struct inode *inode;
ssize_t ret = -EINVAL;
int live_threshold;
- if (reg->hr_bdev)
- goto out;
+ if (reg->hr_bdev_file)
+ return -EINVAL;
/* We can't heartbeat without having had our node number
* configured yet. */
if (o2nm_this_node() == O2NM_MAX_NODES)
- goto out;
+ return -EINVAL;
- fd = simple_strtol(p, &p, 0);
- if (!p || (*p && (*p != '\n')))
- goto out;
+ ret = kstrtol(p, 0, &fd);
+ if (ret < 0)
+ return -EINVAL;
if (fd < 0 || fd >= INT_MAX)
- goto out;
+ return -EINVAL;
- f = fdget(fd);
- if (f.file == NULL)
- goto out;
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return -EINVAL;
if (reg->hr_blocks == 0 || reg->hr_start_block == 0 ||
reg->hr_block_bytes == 0)
- goto out2;
-
- inode = igrab(f.file->f_mapping->host);
- if (inode == NULL)
- goto out2;
+ return -EINVAL;
- if (!S_ISBLK(inode->i_mode))
- goto out3;
+ if (!S_ISBLK(fd_file(f)->f_mapping->host->i_mode))
+ return -EINVAL;
- reg->hr_bdev = I_BDEV(f.file->f_mapping->host);
- ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL);
- if (ret) {
- reg->hr_bdev = NULL;
- goto out3;
+ reg->hr_bdev_file = bdev_file_open_by_dev(fd_file(f)->f_mapping->host->i_rdev,
+ BLK_OPEN_WRITE | BLK_OPEN_READ, NULL, NULL);
+ if (IS_ERR(reg->hr_bdev_file)) {
+ ret = PTR_ERR(reg->hr_bdev_file);
+ reg->hr_bdev_file = NULL;
+ return ret;
}
- inode = NULL;
-
- bdevname(reg->hr_bdev, reg->hr_dev_name);
- sectsize = bdev_logical_block_size(reg->hr_bdev);
+ sectsize = bdev_logical_block_size(reg_bdev(reg));
if (sectsize != reg->hr_block_bytes) {
mlog(ML_ERROR,
"blocksize %u incorrect for device, expected %d",
@@ -1819,6 +1835,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
}
INIT_DELAYED_WORK(&reg->hr_write_timeout_work, o2hb_write_timeout);
+ INIT_DELAYED_WORK(&reg->hr_nego_timeout_work, o2hb_nego_timeout);
/*
* A node is considered live after it has beat LIVE_THRESHOLD
@@ -1831,14 +1848,14 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
live_threshold = O2HB_LIVE_THRESHOLD;
if (o2hb_global_heartbeat_active()) {
spin_lock(&o2hb_live_lock);
- if (o2hb_pop_count(&o2hb_region_bitmap, O2NM_MAX_REGIONS) == 1)
+ if (bitmap_weight(o2hb_region_bitmap, O2NM_MAX_REGIONS) == 1)
live_threshold <<= 1;
spin_unlock(&o2hb_live_lock);
}
++live_threshold;
atomic_set(&reg->hr_steady_iterations, live_threshold);
- /* unsteady_iterations is double the steady_iterations */
- atomic_set(&reg->hr_unsteady_iterations, (live_threshold << 1));
+ /* unsteady_iterations is triple the steady_iterations */
+ atomic_set(&reg->hr_unsteady_iterations, (live_threshold * 3));
hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s",
reg->hr_item.ci_name);
@@ -1853,7 +1870,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
spin_unlock(&o2hb_live_lock);
ret = wait_event_interruptible(o2hb_steady_queue,
- atomic_read(&reg->hr_steady_iterations) == 0);
+ atomic_read(&reg->hr_steady_iterations) == 0 ||
+ reg->hr_node_deleted);
if (ret) {
atomic_set(&reg->hr_steady_iterations, 0);
reg->hr_aborted_start = 1;
@@ -1864,6 +1882,11 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
goto out3;
}
+ if (reg->hr_node_deleted) {
+ ret = -EINVAL;
+ goto out3;
+ }
+
/* Ok, we were woken. Make sure it wasn't by drop_item() */
spin_lock(&o2hb_live_lock);
hb_task = reg->hr_task;
@@ -1877,26 +1900,20 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
ret = -EIO;
if (hb_task && o2hb_global_heartbeat_active())
- printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%s)\n",
- config_item_name(&reg->hr_item), reg->hr_dev_name);
+ printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%pg)\n",
+ config_item_name(&reg->hr_item), reg_bdev(reg));
out3:
- iput(inode);
-out2:
- fdput(f);
-out:
if (ret < 0) {
- if (reg->hr_bdev) {
- blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE);
- reg->hr_bdev = NULL;
- }
+ fput(reg->hr_bdev_file);
+ reg->hr_bdev_file = NULL;
}
return ret;
}
-static ssize_t o2hb_region_pid_read(struct o2hb_region *reg,
- char *page)
+static ssize_t o2hb_region_pid_show(struct config_item *item, char *page)
{
+ struct o2hb_region *reg = to_o2hb_region(item);
pid_t pid = 0;
spin_lock(&o2hb_live_lock);
@@ -1910,95 +1927,26 @@ static ssize_t o2hb_region_pid_read(struct o2hb_region *reg,
return sprintf(page, "%u\n", pid);
}
-struct o2hb_region_attribute {
- struct configfs_attribute attr;
- ssize_t (*show)(struct o2hb_region *, char *);
- ssize_t (*store)(struct o2hb_region *, const char *, size_t);
-};
-
-static struct o2hb_region_attribute o2hb_region_attr_block_bytes = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "block_bytes",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2hb_region_block_bytes_read,
- .store = o2hb_region_block_bytes_write,
-};
-
-static struct o2hb_region_attribute o2hb_region_attr_start_block = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "start_block",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2hb_region_start_block_read,
- .store = o2hb_region_start_block_write,
-};
-
-static struct o2hb_region_attribute o2hb_region_attr_blocks = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "blocks",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2hb_region_blocks_read,
- .store = o2hb_region_blocks_write,
-};
-
-static struct o2hb_region_attribute o2hb_region_attr_dev = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "dev",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2hb_region_dev_read,
- .store = o2hb_region_dev_write,
-};
-
-static struct o2hb_region_attribute o2hb_region_attr_pid = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "pid",
- .ca_mode = S_IRUGO | S_IRUSR },
- .show = o2hb_region_pid_read,
-};
+CONFIGFS_ATTR(o2hb_region_, block_bytes);
+CONFIGFS_ATTR(o2hb_region_, start_block);
+CONFIGFS_ATTR(o2hb_region_, blocks);
+CONFIGFS_ATTR(o2hb_region_, dev);
+CONFIGFS_ATTR_RO(o2hb_region_, pid);
static struct configfs_attribute *o2hb_region_attrs[] = {
- &o2hb_region_attr_block_bytes.attr,
- &o2hb_region_attr_start_block.attr,
- &o2hb_region_attr_blocks.attr,
- &o2hb_region_attr_dev.attr,
- &o2hb_region_attr_pid.attr,
+ &o2hb_region_attr_block_bytes,
+ &o2hb_region_attr_start_block,
+ &o2hb_region_attr_blocks,
+ &o2hb_region_attr_dev,
+ &o2hb_region_attr_pid,
NULL,
};
-static ssize_t o2hb_region_show(struct config_item *item,
- struct configfs_attribute *attr,
- char *page)
-{
- struct o2hb_region *reg = to_o2hb_region(item);
- struct o2hb_region_attribute *o2hb_region_attr =
- container_of(attr, struct o2hb_region_attribute, attr);
- ssize_t ret = 0;
-
- if (o2hb_region_attr->show)
- ret = o2hb_region_attr->show(reg, page);
- return ret;
-}
-
-static ssize_t o2hb_region_store(struct config_item *item,
- struct configfs_attribute *attr,
- const char *page, size_t count)
-{
- struct o2hb_region *reg = to_o2hb_region(item);
- struct o2hb_region_attribute *o2hb_region_attr =
- container_of(attr, struct o2hb_region_attribute, attr);
- ssize_t ret = -EINVAL;
-
- if (o2hb_region_attr->store)
- ret = o2hb_region_attr->store(reg, page, count);
- return ret;
-}
-
static struct configfs_item_operations o2hb_region_item_ops = {
.release = o2hb_region_release,
- .show_attribute = o2hb_region_show,
- .store_attribute = o2hb_region_store,
};
-static struct config_item_type o2hb_region_type = {
+static const struct config_item_type o2hb_region_type = {
.ct_item_ops = &o2hb_region_item_ops,
.ct_attrs = o2hb_region_attrs,
.ct_owner = THIS_MODULE,
@@ -2018,69 +1966,33 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group
: NULL;
}
-static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
+static void o2hb_debug_region_init(struct o2hb_region *reg,
+ struct dentry *parent)
{
- int ret = -ENOMEM;
+ struct dentry *dir;
- reg->hr_debug_dir =
- debugfs_create_dir(config_item_name(&reg->hr_item), dir);
- if (!reg->hr_debug_dir) {
- mlog_errno(ret);
- goto bail;
- }
+ dir = debugfs_create_dir(config_item_name(&reg->hr_item), parent);
+ reg->hr_debug_dir = dir;
- reg->hr_debug_livenodes =
- o2hb_debug_create(O2HB_DEBUG_LIVENODES,
- reg->hr_debug_dir,
- &(reg->hr_db_livenodes),
- sizeof(*(reg->hr_db_livenodes)),
- O2HB_DB_TYPE_REGION_LIVENODES,
- sizeof(reg->hr_live_node_bitmap),
- O2NM_MAX_NODES, reg);
- if (!reg->hr_debug_livenodes) {
- mlog_errno(ret);
- goto bail;
- }
+ o2hb_debug_create(O2HB_DEBUG_LIVENODES, dir, &(reg->hr_db_livenodes),
+ sizeof(*(reg->hr_db_livenodes)),
+ O2HB_DB_TYPE_REGION_LIVENODES,
+ sizeof(reg->hr_live_node_bitmap), O2NM_MAX_NODES,
+ reg);
- reg->hr_debug_regnum =
- o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER,
- reg->hr_debug_dir,
- &(reg->hr_db_regnum),
- sizeof(*(reg->hr_db_regnum)),
- O2HB_DB_TYPE_REGION_NUMBER,
- 0, O2NM_MAX_NODES, reg);
- if (!reg->hr_debug_regnum) {
- mlog_errno(ret);
- goto bail;
- }
+ o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER, dir, &(reg->hr_db_regnum),
+ sizeof(*(reg->hr_db_regnum)),
+ O2HB_DB_TYPE_REGION_NUMBER, 0, O2NM_MAX_NODES, reg);
- reg->hr_debug_elapsed_time =
- o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME,
- reg->hr_debug_dir,
- &(reg->hr_db_elapsed_time),
- sizeof(*(reg->hr_db_elapsed_time)),
- O2HB_DB_TYPE_REGION_ELAPSED_TIME,
- 0, 0, reg);
- if (!reg->hr_debug_elapsed_time) {
- mlog_errno(ret);
- goto bail;
- }
+ o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME, dir,
+ &(reg->hr_db_elapsed_time),
+ sizeof(*(reg->hr_db_elapsed_time)),
+ O2HB_DB_TYPE_REGION_ELAPSED_TIME, 0, 0, reg);
- reg->hr_debug_pinned =
- o2hb_debug_create(O2HB_DEBUG_REGION_PINNED,
- reg->hr_debug_dir,
- &(reg->hr_db_pinned),
- sizeof(*(reg->hr_db_pinned)),
- O2HB_DB_TYPE_REGION_PINNED,
- 0, 0, reg);
- if (!reg->hr_debug_pinned) {
- mlog_errno(ret);
- goto bail;
- }
+ o2hb_debug_create(O2HB_DEBUG_REGION_PINNED, dir, &(reg->hr_db_pinned),
+ sizeof(*(reg->hr_db_pinned)),
+ O2HB_DB_TYPE_REGION_PINNED, 0, 0, reg);
- ret = 0;
-bail:
- return ret;
}
static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group,
@@ -2115,13 +2027,39 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
- ret = o2hb_debug_region_init(reg, o2hb_debug_dir);
- if (ret) {
- config_item_put(&reg->hr_item);
- goto free;
- }
+ /* this is the same way to generate msg key as dlm, for local heartbeat,
+ * name is also the same, so make initial crc value different to avoid
+ * message key conflict.
+ */
+ reg->hr_key = crc32_le(reg->hr_region_num + O2NM_MAX_REGIONS,
+ name, strlen(name));
+ INIT_LIST_HEAD(&reg->hr_handler_list);
+ ret = o2net_register_handler(O2HB_NEGO_TIMEOUT_MSG, reg->hr_key,
+ sizeof(struct o2hb_nego_msg),
+ o2hb_nego_timeout_handler,
+ reg, NULL, &reg->hr_handler_list);
+ if (ret)
+ goto remove_item;
+
+ ret = o2net_register_handler(O2HB_NEGO_APPROVE_MSG, reg->hr_key,
+ sizeof(struct o2hb_nego_msg),
+ o2hb_nego_approve_handler,
+ reg, NULL, &reg->hr_handler_list);
+ if (ret)
+ goto unregister_handler;
+
+ o2hb_debug_region_init(reg, o2hb_debug_dir);
return &reg->hr_item;
+
+unregister_handler:
+ o2net_unregister_handler_list(&reg->hr_handler_list);
+remove_item:
+ spin_lock(&o2hb_live_lock);
+ list_del(&reg->hr_all_item);
+ if (o2hb_global_heartbeat_active())
+ clear_bit(reg->hr_region_num, o2hb_region_bitmap);
+ spin_unlock(&o2hb_live_lock);
free:
kfree(reg);
return ERR_PTR(ret);
@@ -2152,10 +2090,10 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
quorum_region = 1;
clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
spin_unlock(&o2hb_live_lock);
- printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%s)\n",
+ printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%pg)\n",
((atomic_read(&reg->hr_steady_iterations) == 0) ?
"stopped" : "start aborted"), config_item_name(item),
- reg->hr_dev_name);
+ reg_bdev(reg));
}
/*
@@ -2182,7 +2120,7 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
if (!o2hb_dependent_users)
goto unlock;
- if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
+ if (bitmap_weight(o2hb_quorum_region_bitmap,
O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
o2hb_region_pin(NULL);
@@ -2190,56 +2128,22 @@ unlock:
spin_unlock(&o2hb_live_lock);
}
-struct o2hb_heartbeat_group_attribute {
- struct configfs_attribute attr;
- ssize_t (*show)(struct o2hb_heartbeat_group *, char *);
- ssize_t (*store)(struct o2hb_heartbeat_group *, const char *, size_t);
-};
-
-static ssize_t o2hb_heartbeat_group_show(struct config_item *item,
- struct configfs_attribute *attr,
- char *page)
-{
- struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item));
- struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr =
- container_of(attr, struct o2hb_heartbeat_group_attribute, attr);
- ssize_t ret = 0;
-
- if (o2hb_heartbeat_group_attr->show)
- ret = o2hb_heartbeat_group_attr->show(reg, page);
- return ret;
-}
-
-static ssize_t o2hb_heartbeat_group_store(struct config_item *item,
- struct configfs_attribute *attr,
- const char *page, size_t count)
-{
- struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item));
- struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr =
- container_of(attr, struct o2hb_heartbeat_group_attribute, attr);
- ssize_t ret = -EINVAL;
-
- if (o2hb_heartbeat_group_attr->store)
- ret = o2hb_heartbeat_group_attr->store(reg, page, count);
- return ret;
-}
-
-static ssize_t o2hb_heartbeat_group_threshold_show(struct o2hb_heartbeat_group *group,
- char *page)
+static ssize_t o2hb_heartbeat_group_dead_threshold_show(struct config_item *item,
+ char *page)
{
return sprintf(page, "%u\n", o2hb_dead_threshold);
}
-static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group *group,
- const char *page,
- size_t count)
+static ssize_t o2hb_heartbeat_group_dead_threshold_store(struct config_item *item,
+ const char *page, size_t count)
{
unsigned long tmp;
char *p = (char *)page;
+ int ret;
- tmp = simple_strtoul(p, &p, 10);
- if (!p || (*p && (*p != '\n')))
- return -EINVAL;
+ ret = kstrtoul(p, 10, &tmp);
+ if (ret)
+ return ret;
/* this will validate ranges for us. */
o2hb_dead_threshold_set((unsigned int) tmp);
@@ -2247,17 +2151,15 @@ static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group
return count;
}
-static
-ssize_t o2hb_heartbeat_group_mode_show(struct o2hb_heartbeat_group *group,
- char *page)
+static ssize_t o2hb_heartbeat_group_mode_show(struct config_item *item,
+ char *page)
{
return sprintf(page, "%s\n",
o2hb_heartbeat_mode_desc[o2hb_heartbeat_mode]);
}
-static
-ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
- const char *page, size_t count)
+static ssize_t o2hb_heartbeat_group_mode_store(struct config_item *item,
+ const char *page, size_t count)
{
unsigned int i;
int ret;
@@ -2268,7 +2170,7 @@ ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
return -EINVAL;
for (i = 0; i < O2HB_HEARTBEAT_NUM_MODES; ++i) {
- if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len))
+ if (strncasecmp(page, o2hb_heartbeat_mode_desc[i], len))
continue;
ret = o2hb_global_heartbeat_mode_set(i);
@@ -2282,41 +2184,22 @@ ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
}
-static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "dead_threshold",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2hb_heartbeat_group_threshold_show,
- .store = o2hb_heartbeat_group_threshold_store,
-};
-
-static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_mode = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "mode",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2hb_heartbeat_group_mode_show,
- .store = o2hb_heartbeat_group_mode_store,
-};
+CONFIGFS_ATTR(o2hb_heartbeat_group_, dead_threshold);
+CONFIGFS_ATTR(o2hb_heartbeat_group_, mode);
static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
- &o2hb_heartbeat_group_attr_threshold.attr,
- &o2hb_heartbeat_group_attr_mode.attr,
+ &o2hb_heartbeat_group_attr_dead_threshold,
+ &o2hb_heartbeat_group_attr_mode,
NULL,
};
-static struct configfs_item_operations o2hb_heartbeat_group_item_ops = {
- .show_attribute = o2hb_heartbeat_group_show,
- .store_attribute = o2hb_heartbeat_group_store,
-};
-
static struct configfs_group_operations o2hb_heartbeat_group_group_ops = {
.make_item = o2hb_heartbeat_group_make_item,
.drop_item = o2hb_heartbeat_group_drop_item,
};
-static struct config_item_type o2hb_heartbeat_group_type = {
+static const struct config_item_type o2hb_heartbeat_group_type = {
.ct_group_ops = &o2hb_heartbeat_group_group_ops,
- .ct_item_ops = &o2hb_heartbeat_group_item_ops,
.ct_attrs = o2hb_heartbeat_group_attrs,
.ct_owner = THIS_MODULE,
};
@@ -2482,7 +2365,7 @@ static int o2hb_region_inc_user(const char *region_uuid)
if (o2hb_dependent_users > 1)
goto unlock;
- if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
+ if (bitmap_weight(o2hb_quorum_region_bitmap,
O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
ret = o2hb_region_pin(NULL);
@@ -2491,7 +2374,7 @@ unlock:
return ret;
}
-void o2hb_region_dec_user(const char *region_uuid)
+static void o2hb_region_dec_user(const char *region_uuid)
{
spin_lock(&o2hb_live_lock);
@@ -2516,8 +2399,7 @@ unlock:
int o2hb_register_callback(const char *region_uuid,
struct o2hb_callback_func *hc)
{
- struct o2hb_callback_func *tmp;
- struct list_head *iter;
+ struct o2hb_callback_func *f;
struct o2hb_callback *hbcall;
int ret;
@@ -2540,10 +2422,9 @@ int o2hb_register_callback(const char *region_uuid,
down_write(&o2hb_callback_sem);
- list_for_each(iter, &hbcall->list) {
- tmp = list_entry(iter, struct o2hb_callback_func, hc_item);
- if (hc->hc_priority < tmp->hc_priority) {
- list_add_tail(&hc->hc_item, iter);
+ list_for_each_entry(f, &hbcall->list, hc_item) {
+ if (hc->hc_priority < f->hc_priority) {
+ list_add_tail(&hc->hc_item, &f->hc_item);
break;
}
}
@@ -2582,11 +2463,13 @@ void o2hb_unregister_callback(const char *region_uuid,
}
EXPORT_SYMBOL_GPL(o2hb_unregister_callback);
-int o2hb_check_node_heartbeating(u8 node_num)
+int o2hb_check_node_heartbeating_no_sem(u8 node_num)
{
unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
- o2hb_fill_node_map(testing_map, sizeof(testing_map));
+ spin_lock(&o2hb_live_lock);
+ o2hb_fill_node_map_from_callback(testing_map, O2NM_MAX_NODES);
+ spin_unlock(&o2hb_live_lock);
if (!test_bit(node_num, testing_map)) {
mlog(ML_HEARTBEAT,
"node (%u) does not have heartbeating enabled.\n",
@@ -2596,13 +2479,13 @@ int o2hb_check_node_heartbeating(u8 node_num)
return 1;
}
-EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating);
+EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_no_sem);
int o2hb_check_node_heartbeating_from_callback(u8 node_num)
{
unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
- o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map));
+ o2hb_fill_node_map_from_callback(testing_map, O2NM_MAX_NODES);
if (!test_bit(node_num, testing_map)) {
mlog(ML_HEARTBEAT,
"node (%u) does not have heartbeating enabled.\n",
@@ -2614,23 +2497,6 @@ int o2hb_check_node_heartbeating_from_callback(u8 node_num)
}
EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_from_callback);
-/* Makes sure our local node is configured with a node number, and is
- * heartbeating. */
-int o2hb_check_local_node_heartbeating(void)
-{
- u8 node_num;
-
- /* if this node was set then we have networking */
- node_num = o2nm_this_node();
- if (node_num == O2NM_MAX_NODES) {
- mlog(ML_HEARTBEAT, "this node has not been configured.\n");
- return 0;
- }
-
- return o2hb_check_node_heartbeating(node_num);
-}
-EXPORT_SYMBOL_GPL(o2hb_check_local_node_heartbeating);
-
/*
* this is just a hack until we get the plumbing which flips file systems
* read only and drops the hb ref instead of killing the node dead.
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index 00ad8e8fea51..8ef8c1b9eeb7 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -1,27 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* heartbeat.h
*
* Function prototypes
*
* Copyright (C) 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
*/
#ifndef O2CLUSTER_HEARTBEAT_H
@@ -76,12 +59,11 @@ int o2hb_register_callback(const char *region_uuid,
void o2hb_unregister_callback(const char *region_uuid,
struct o2hb_callback_func *hc);
void o2hb_fill_node_map(unsigned long *map,
- unsigned bytes);
+ unsigned int bits);
void o2hb_exit(void);
-int o2hb_init(void);
-int o2hb_check_node_heartbeating(u8 node_num);
+void o2hb_init(void);
+int o2hb_check_node_heartbeating_no_sem(u8 node_num);
int o2hb_check_node_heartbeating_from_callback(u8 node_num);
-int o2hb_check_local_node_heartbeating(void);
void o2hb_stop_all_regions(void);
int o2hb_get_all_regions(char *region_uuids, u8 numregions);
int o2hb_global_heartbeat_active(void);
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 07ac24fd9252..563881ddbf00 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -1,22 +1,6 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* Copyright (C) 2004, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/module.h>
@@ -24,7 +8,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/string.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "masklog.h"
@@ -49,13 +33,13 @@ static ssize_t mlog_mask_show(u64 mask, char *buf)
static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count)
{
- if (!strnicmp(buf, "allow", 5)) {
+ if (!strncasecmp(buf, "allow", 5)) {
__mlog_set_u64(mask, mlog_and_bits);
__mlog_clear_u64(mask, mlog_not_bits);
- } else if (!strnicmp(buf, "deny", 4)) {
+ } else if (!strncasecmp(buf, "deny", 4)) {
__mlog_set_u64(mask, mlog_not_bits);
__mlog_clear_u64(mask, mlog_and_bits);
- } else if (!strnicmp(buf, "off", 3)) {
+ } else if (!strncasecmp(buf, "off", 3)) {
__mlog_clear_u64(mask, mlog_not_bits);
__mlog_clear_u64(mask, mlog_and_bits);
} else
@@ -64,6 +48,40 @@ static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count)
return count;
}
+void __mlog_printk(const u64 *mask, const char *func, int line,
+ const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+ const char *level;
+ const char *prefix = "";
+
+ if (!__mlog_test_u64(*mask, mlog_and_bits) ||
+ __mlog_test_u64(*mask, mlog_not_bits))
+ return;
+
+ if (*mask & ML_ERROR) {
+ level = KERN_ERR;
+ prefix = "ERROR: ";
+ } else if (*mask & ML_NOTICE) {
+ level = KERN_NOTICE;
+ } else {
+ level = KERN_INFO;
+ }
+
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ printk("%s(%s,%u,%u):%s:%d %s%pV",
+ level, current->comm, task_pid_nr(current),
+ raw_smp_processor_id(), func, line, prefix, &vaf);
+
+ va_end(args);
+}
+EXPORT_SYMBOL_GPL(__mlog_printk);
+
struct mlog_attribute {
struct attribute attr;
u64 mask;
@@ -102,7 +120,8 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
define_mask(KTHREAD),
};
-static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, };
+static struct attribute *mlog_default_attrs[MLOG_MAX_BITS] = {NULL, };
+ATTRIBUTE_GROUPS(mlog_default);
static ssize_t mlog_show(struct kobject *obj, struct attribute *attr,
char *buf)
@@ -126,8 +145,8 @@ static const struct sysfs_ops mlog_attr_ops = {
};
static struct kobj_type mlog_ktype = {
- .default_attrs = mlog_attr_ptrs,
- .sysfs_ops = &mlog_attr_ops,
+ .default_groups = mlog_default_groups,
+ .sysfs_ops = &mlog_attr_ops,
};
static struct kset mlog_kset = {
@@ -139,10 +158,10 @@ int mlog_sys_init(struct kset *o2cb_kset)
int i = 0;
while (mlog_attrs[i].attr.mode) {
- mlog_attr_ptrs[i] = &mlog_attrs[i].attr;
+ mlog_default_attrs[i] = &mlog_attrs[i].attr;
i++;
}
- mlog_attr_ptrs[i] = NULL;
+ mlog_default_attrs[i] = NULL;
kobject_set_name(&mlog_kset.kobj, "logmask");
mlog_kset.kobj.kset = o2cb_kset;
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index baa2b9ef7eef..630bd5a3dd0d 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -1,22 +1,6 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* Copyright (C) 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#ifndef O2CLUSTER_MASKLOG_H
@@ -45,7 +29,7 @@
* just calling printk() so that this can eventually make its way through
* relayfs along with the debugging messages. Everything else gets KERN_DEBUG.
* The inline tests and macro dance give GCC the opportunity to quite cleverly
- * only emit the appropriage printk() when the caller passes in a constant
+ * only emit the appropriate printk() when the caller passes in a constant
* mask, as is almost always the case.
*
* All this bitmask nonsense is managed from the files under
@@ -162,46 +146,39 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
#endif
+__printf(4, 5)
+void __mlog_printk(const u64 *m, const char *func, int line,
+ const char *fmt, ...);
+
/*
- * smp_processor_id() "helpfully" screams when called outside preemptible
- * regions in current kernels. sles doesn't have the variants that don't
- * scream. just do this instead of trying to guess which we're building
- * against.. *sigh*.
+ * Testing before the __mlog_printk call lets the compiler eliminate the
+ * call completely when (m & ML_ALLOWED_BITS) is 0.
*/
-#define __mlog_cpu_guess ({ \
- unsigned long _cpu = get_cpu(); \
- put_cpu(); \
- _cpu; \
-})
+#define mlog(mask, fmt, ...) \
+do { \
+ u64 _m = MLOG_MASK_PREFIX | (mask); \
+ if (_m & ML_ALLOWED_BITS) \
+ __mlog_printk(&_m, __func__, __LINE__, fmt, \
+ ##__VA_ARGS__); \
+} while (0)
-/* In the following two macros, the whitespace after the ',' just
- * before ##args is intentional. Otherwise, gcc 2.95 will eat the
- * previous token if args expands to nothing.
- */
-#define __mlog_printk(level, fmt, args...) \
- printk(level "(%s,%u,%lu):%s:%d " fmt, current->comm, \
- task_pid_nr(current), __mlog_cpu_guess, \
- __PRETTY_FUNCTION__, __LINE__ , ##args)
-
-#define mlog(mask, fmt, args...) do { \
- u64 __m = MLOG_MASK_PREFIX | (mask); \
- if ((__m & ML_ALLOWED_BITS) && \
- __mlog_test_u64(__m, mlog_and_bits) && \
- !__mlog_test_u64(__m, mlog_not_bits)) { \
- if (__m & ML_ERROR) \
- __mlog_printk(KERN_ERR, "ERROR: "fmt , ##args); \
- else if (__m & ML_NOTICE) \
- __mlog_printk(KERN_NOTICE, fmt , ##args); \
- else __mlog_printk(KERN_INFO, fmt , ##args); \
- } \
+#define mlog_ratelimited(mask, fmt, ...) \
+do { \
+ static DEFINE_RATELIMIT_STATE(_rs, \
+ DEFAULT_RATELIMIT_INTERVAL, \
+ DEFAULT_RATELIMIT_BURST); \
+ if (__ratelimit(&_rs)) \
+ mlog(mask, fmt, ##__VA_ARGS__); \
} while (0)
-#define mlog_errno(st) do { \
+#define mlog_errno(st) ({ \
int _st = (st); \
if (_st != -ERESTARTSYS && _st != -EINTR && \
- _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC) \
+ _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC && \
+ _st != -EDQUOT) \
mlog(ML_ERROR, "status = %lld\n", (long long)_st); \
-} while (0)
+ _st; \
+})
#define mlog_bug_on_msg(cond, fmt, args...) do { \
if (cond) { \
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index 73ba81928bce..bc27301eab6d 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -1,27 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* netdebug.c
*
* debug functionality for o2net
*
* Copyright (C) 2005, 2008 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
*/
#ifdef CONFIG_DEBUG_FS
@@ -53,10 +36,6 @@
#define SHOW_SOCK_STATS 1
static struct dentry *o2net_dentry;
-static struct dentry *sc_dentry;
-static struct dentry *nst_dentry;
-static struct dentry *stats_dentry;
-static struct dentry *nodes_dentry;
static DEFINE_SPINLOCK(o2net_debug_lock);
@@ -65,17 +44,17 @@ static LIST_HEAD(send_tracking);
void o2net_debug_add_nst(struct o2net_send_tracking *nst)
{
- spin_lock(&o2net_debug_lock);
+ spin_lock_bh(&o2net_debug_lock);
list_add(&nst->st_net_debug_item, &send_tracking);
- spin_unlock(&o2net_debug_lock);
+ spin_unlock_bh(&o2net_debug_lock);
}
void o2net_debug_del_nst(struct o2net_send_tracking *nst)
{
- spin_lock(&o2net_debug_lock);
+ spin_lock_bh(&o2net_debug_lock);
if (!list_empty(&nst->st_net_debug_item))
list_del_init(&nst->st_net_debug_item);
- spin_unlock(&o2net_debug_lock);
+ spin_unlock_bh(&o2net_debug_lock);
}
static struct o2net_send_tracking
@@ -105,9 +84,9 @@ static void *nst_seq_start(struct seq_file *seq, loff_t *pos)
{
struct o2net_send_tracking *nst, *dummy_nst = seq->private;
- spin_lock(&o2net_debug_lock);
+ spin_lock_bh(&o2net_debug_lock);
nst = next_nst(dummy_nst);
- spin_unlock(&o2net_debug_lock);
+ spin_unlock_bh(&o2net_debug_lock);
return nst;
}
@@ -116,13 +95,13 @@ static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct o2net_send_tracking *nst, *dummy_nst = seq->private;
- spin_lock(&o2net_debug_lock);
+ spin_lock_bh(&o2net_debug_lock);
nst = next_nst(dummy_nst);
list_del_init(&dummy_nst->st_net_debug_item);
if (nst)
list_add(&dummy_nst->st_net_debug_item,
&nst->st_net_debug_item);
- spin_unlock(&o2net_debug_lock);
+ spin_unlock_bh(&o2net_debug_lock);
return nst; /* unused, just needs to be null when done */
}
@@ -133,7 +112,7 @@ static int nst_seq_show(struct seq_file *seq, void *v)
ktime_t now;
s64 sock, send, status;
- spin_lock(&o2net_debug_lock);
+ spin_lock_bh(&o2net_debug_lock);
nst = next_nst(dummy_nst);
if (!nst)
goto out;
@@ -166,7 +145,7 @@ static int nst_seq_show(struct seq_file *seq, void *v)
(long long)status);
out:
- spin_unlock(&o2net_debug_lock);
+ spin_unlock_bh(&o2net_debug_lock);
return 0;
}
@@ -185,29 +164,13 @@ static const struct seq_operations nst_seq_ops = {
static int nst_fop_open(struct inode *inode, struct file *file)
{
struct o2net_send_tracking *dummy_nst;
- struct seq_file *seq;
- int ret;
-
- dummy_nst = kmalloc(sizeof(struct o2net_send_tracking), GFP_KERNEL);
- if (dummy_nst == NULL) {
- ret = -ENOMEM;
- goto out;
- }
- dummy_nst->st_task = NULL;
-
- ret = seq_open(file, &nst_seq_ops);
- if (ret)
- goto out;
- seq = file->private_data;
- seq->private = dummy_nst;
+ dummy_nst = __seq_open_private(file, &nst_seq_ops, sizeof(*dummy_nst));
+ if (!dummy_nst)
+ return -ENOMEM;
o2net_debug_add_nst(dummy_nst);
- dummy_nst = NULL;
-
-out:
- kfree(dummy_nst);
- return ret;
+ return 0;
}
static int nst_fop_release(struct inode *inode, struct file *file)
@@ -228,16 +191,16 @@ static const struct file_operations nst_seq_fops = {
void o2net_debug_add_sc(struct o2net_sock_container *sc)
{
- spin_lock(&o2net_debug_lock);
+ spin_lock_bh(&o2net_debug_lock);
list_add(&sc->sc_net_debug_item, &sock_containers);
- spin_unlock(&o2net_debug_lock);
+ spin_unlock_bh(&o2net_debug_lock);
}
void o2net_debug_del_sc(struct o2net_sock_container *sc)
{
- spin_lock(&o2net_debug_lock);
+ spin_lock_bh(&o2net_debug_lock);
list_del_init(&sc->sc_net_debug_item);
- spin_unlock(&o2net_debug_lock);
+ spin_unlock_bh(&o2net_debug_lock);
}
struct o2net_sock_debug {
@@ -273,9 +236,9 @@ static void *sc_seq_start(struct seq_file *seq, loff_t *pos)
struct o2net_sock_debug *sd = seq->private;
struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
- spin_lock(&o2net_debug_lock);
+ spin_lock_bh(&o2net_debug_lock);
sc = next_sc(dummy_sc);
- spin_unlock(&o2net_debug_lock);
+ spin_unlock_bh(&o2net_debug_lock);
return sc;
}
@@ -285,12 +248,12 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
struct o2net_sock_debug *sd = seq->private;
struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
- spin_lock(&o2net_debug_lock);
+ spin_lock_bh(&o2net_debug_lock);
sc = next_sc(dummy_sc);
list_del_init(&dummy_sc->sc_net_debug_item);
if (sc)
list_add(&dummy_sc->sc_net_debug_item, &sc->sc_net_debug_item);
- spin_unlock(&o2net_debug_lock);
+ spin_unlock_bh(&o2net_debug_lock);
return sc; /* unused, just needs to be null when done */
}
@@ -365,7 +328,7 @@ static void sc_show_sock_container(struct seq_file *seq,
" func key: 0x%08x\n"
" func type: %u\n",
sc,
- atomic_read(&sc->sc_kref.refcount),
+ kref_read(&sc->sc_kref),
&saddr, inet ? ntohs(sport) : 0,
&daddr, inet ? ntohs(dport) : 0,
sc->sc_node->nd_name,
@@ -386,7 +349,7 @@ static int sc_seq_show(struct seq_file *seq, void *v)
struct o2net_sock_debug *sd = seq->private;
struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
- spin_lock(&o2net_debug_lock);
+ spin_lock_bh(&o2net_debug_lock);
sc = next_sc(dummy_sc);
if (sc) {
@@ -396,7 +359,7 @@ static int sc_seq_show(struct seq_file *seq, void *v)
sc_show_sock_stats(seq, sc);
}
- spin_unlock(&o2net_debug_lock);
+ spin_unlock_bh(&o2net_debug_lock);
return 0;
}
@@ -412,33 +375,27 @@ static const struct seq_operations sc_seq_ops = {
.show = sc_seq_show,
};
-static int sc_common_open(struct file *file, struct o2net_sock_debug *sd)
+static int sc_common_open(struct file *file, int ctxt)
{
+ struct o2net_sock_debug *sd;
struct o2net_sock_container *dummy_sc;
- struct seq_file *seq;
- int ret;
- dummy_sc = kmalloc(sizeof(struct o2net_sock_container), GFP_KERNEL);
- if (dummy_sc == NULL) {
- ret = -ENOMEM;
- goto out;
- }
- dummy_sc->sc_page = NULL;
+ dummy_sc = kzalloc(sizeof(*dummy_sc), GFP_KERNEL);
+ if (!dummy_sc)
+ return -ENOMEM;
- ret = seq_open(file, &sc_seq_ops);
- if (ret)
- goto out;
+ sd = __seq_open_private(file, &sc_seq_ops, sizeof(*sd));
+ if (!sd) {
+ kfree(dummy_sc);
+ return -ENOMEM;
+ }
- seq = file->private_data;
- seq->private = sd;
+ sd->dbg_ctxt = ctxt;
sd->dbg_sock = dummy_sc;
- o2net_debug_add_sc(dummy_sc);
- dummy_sc = NULL;
+ o2net_debug_add_sc(dummy_sc);
-out:
- kfree(dummy_sc);
- return ret;
+ return 0;
}
static int sc_fop_release(struct inode *inode, struct file *file)
@@ -448,21 +405,13 @@ static int sc_fop_release(struct inode *inode, struct file *file)
struct o2net_sock_container *dummy_sc = sd->dbg_sock;
o2net_debug_del_sc(dummy_sc);
+ kfree(dummy_sc);
return seq_release_private(inode, file);
}
static int stats_fop_open(struct inode *inode, struct file *file)
{
- struct o2net_sock_debug *sd;
-
- sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
- if (sd == NULL)
- return -ENOMEM;
-
- sd->dbg_ctxt = SHOW_SOCK_STATS;
- sd->dbg_sock = NULL;
-
- return sc_common_open(file, sd);
+ return sc_common_open(file, SHOW_SOCK_STATS);
}
static const struct file_operations stats_seq_fops = {
@@ -474,16 +423,7 @@ static const struct file_operations stats_seq_fops = {
static int sc_fop_open(struct inode *inode, struct file *file)
{
- struct o2net_sock_debug *sd;
-
- sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
- if (sd == NULL)
- return -ENOMEM;
-
- sd->dbg_ctxt = SHOW_SOCK_CONTAINERS;
- sd->dbg_sock = NULL;
-
- return sc_common_open(file, sd);
+ return sc_common_open(file, SHOW_SOCK_CONTAINERS);
}
static const struct file_operations sc_seq_fops = {
@@ -498,11 +438,11 @@ static int o2net_fill_bitmap(char *buf, int len)
unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
int i = -1, out = 0;
- o2net_fill_node_map(map, sizeof(map));
+ o2net_fill_node_map(map, O2NM_MAX_NODES);
while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
- out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
- out += snprintf(buf + out, PAGE_SIZE - out, "\n");
+ out += scnprintf(buf + out, PAGE_SIZE - out, "%d ", i);
+ out += scnprintf(buf + out, PAGE_SIZE - out, "\n");
return out;
}
@@ -544,36 +484,23 @@ static const struct file_operations nodes_fops = {
void o2net_debugfs_exit(void)
{
- debugfs_remove(nodes_dentry);
- debugfs_remove(stats_dentry);
- debugfs_remove(sc_dentry);
- debugfs_remove(nst_dentry);
- debugfs_remove(o2net_dentry);
+ debugfs_remove_recursive(o2net_dentry);
}
-int o2net_debugfs_init(void)
+void o2net_debugfs_init(void)
{
umode_t mode = S_IFREG|S_IRUSR;
o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL);
- if (o2net_dentry)
- nst_dentry = debugfs_create_file(NST_DEBUG_NAME, mode,
- o2net_dentry, NULL, &nst_seq_fops);
- if (nst_dentry)
- sc_dentry = debugfs_create_file(SC_DEBUG_NAME, mode,
- o2net_dentry, NULL, &sc_seq_fops);
- if (sc_dentry)
- stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, mode,
- o2net_dentry, NULL, &stats_seq_fops);
- if (stats_dentry)
- nodes_dentry = debugfs_create_file(NODES_DEBUG_NAME, mode,
- o2net_dentry, NULL, &nodes_fops);
- if (nodes_dentry)
- return 0;
-
- o2net_debugfs_exit();
- mlog_errno(-ENOMEM);
- return -ENOMEM;
+
+ debugfs_create_file(NST_DEBUG_NAME, mode, o2net_dentry, NULL,
+ &nst_seq_fops);
+ debugfs_create_file(SC_DEBUG_NAME, mode, o2net_dentry, NULL,
+ &sc_seq_fops);
+ debugfs_create_file(STATS_DEBUG_NAME, mode, o2net_dentry, NULL,
+ &stats_seq_fops);
+ debugfs_create_file(NODES_DEBUG_NAME, mode, o2net_dentry, NULL,
+ &nodes_fops);
}
#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index bb240647ca5f..2f61d39e4e50 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -1,22 +1,6 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* Copyright (C) 2004, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/slab.h>
@@ -29,18 +13,20 @@
#include "heartbeat.h"
#include "masklog.h"
#include "sys.h"
-#include "ver.h"
/* for now we operate under the assertion that there can be only one
* cluster active at a time. Changing this will require trickling
* cluster references throughout where nodes are looked up */
struct o2nm_cluster *o2nm_single_cluster = NULL;
-char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = {
- "reset", /* O2NM_FENCE_RESET */
- "panic", /* O2NM_FENCE_PANIC */
+static const char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = {
+ "reset", /* O2NM_FENCE_RESET */
+ "panic", /* O2NM_FENCE_PANIC */
};
+static inline void o2nm_lock_subsystem(void);
+static inline void o2nm_unlock_subsystem(void);
+
struct o2nm_node *o2nm_get_node_by_num(u8 node_num)
{
struct o2nm_node *node = NULL;
@@ -68,7 +54,7 @@ int o2nm_configured_node_map(unsigned long *map, unsigned bytes)
return -EINVAL;
read_lock(&cluster->cl_nodes_lock);
- memcpy(map, cluster->cl_nodes_bitmap, sizeof(cluster->cl_nodes_bitmap));
+ bitmap_copy(map, cluster->cl_nodes_bitmap, O2NM_MAX_NODES);
read_unlock(&cluster->cl_nodes_lock);
return 0;
@@ -173,31 +159,35 @@ static void o2nm_node_release(struct config_item *item)
kfree(node);
}
-static ssize_t o2nm_node_num_read(struct o2nm_node *node, char *page)
+static ssize_t o2nm_node_num_show(struct config_item *item, char *page)
{
- return sprintf(page, "%d\n", node->nd_num);
+ return sprintf(page, "%d\n", to_o2nm_node(item)->nd_num);
}
static struct o2nm_cluster *to_o2nm_cluster_from_node(struct o2nm_node *node)
{
/* through the first node_set .parent
* mycluster/nodes/mynode == o2nm_cluster->o2nm_node_group->o2nm_node */
- return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent);
+ if (node->nd_item.ci_parent)
+ return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent);
+ else
+ return NULL;
}
enum {
O2NM_NODE_ATTR_NUM = 0,
O2NM_NODE_ATTR_PORT,
O2NM_NODE_ATTR_ADDRESS,
- O2NM_NODE_ATTR_LOCAL,
};
-static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page,
+static ssize_t o2nm_node_num_store(struct config_item *item, const char *page,
size_t count)
{
- struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
+ struct o2nm_node *node = to_o2nm_node(item);
+ struct o2nm_cluster *cluster;
unsigned long tmp;
char *p = (char *)page;
+ int ret = 0;
tmp = simple_strtoul(p, &p, 0);
if (!p || (*p && (*p != '\n')))
@@ -214,28 +204,41 @@ static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page,
!test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes))
return -EINVAL; /* XXX */
+ o2nm_lock_subsystem();
+ cluster = to_o2nm_cluster_from_node(node);
+ if (!cluster) {
+ o2nm_unlock_subsystem();
+ return -EINVAL;
+ }
+
write_lock(&cluster->cl_nodes_lock);
if (cluster->cl_nodes[tmp])
- p = NULL;
+ ret = -EEXIST;
+ else if (test_and_set_bit(O2NM_NODE_ATTR_NUM,
+ &node->nd_set_attributes))
+ ret = -EBUSY;
else {
cluster->cl_nodes[tmp] = node;
node->nd_num = tmp;
set_bit(tmp, cluster->cl_nodes_bitmap);
}
write_unlock(&cluster->cl_nodes_lock);
- if (p == NULL)
- return -EEXIST;
+ o2nm_unlock_subsystem();
+
+ if (ret)
+ return ret;
return count;
}
-static ssize_t o2nm_node_ipv4_port_read(struct o2nm_node *node, char *page)
+static ssize_t o2nm_node_ipv4_port_show(struct config_item *item, char *page)
{
- return sprintf(page, "%u\n", ntohs(node->nd_ipv4_port));
+ return sprintf(page, "%u\n", ntohs(to_o2nm_node(item)->nd_ipv4_port));
}
-static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node,
+static ssize_t o2nm_node_ipv4_port_store(struct config_item *item,
const char *page, size_t count)
{
+ struct o2nm_node *node = to_o2nm_node(item);
unsigned long tmp;
char *p = (char *)page;
@@ -248,21 +251,24 @@ static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node,
if (tmp >= (u16)-1)
return -ERANGE;
+ if (test_and_set_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes))
+ return -EBUSY;
node->nd_ipv4_port = htons(tmp);
return count;
}
-static ssize_t o2nm_node_ipv4_address_read(struct o2nm_node *node, char *page)
+static ssize_t o2nm_node_ipv4_address_show(struct config_item *item, char *page)
{
- return sprintf(page, "%pI4\n", &node->nd_ipv4_address);
+ return sprintf(page, "%pI4\n", &to_o2nm_node(item)->nd_ipv4_address);
}
-static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node,
+static ssize_t o2nm_node_ipv4_address_store(struct config_item *item,
const char *page,
size_t count)
{
- struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
+ struct o2nm_node *node = to_o2nm_node(item);
+ struct o2nm_cluster *cluster;
int ret, i;
struct rb_node **p, *parent;
unsigned int octets[4];
@@ -279,15 +285,27 @@ static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node,
be32_add_cpu(&ipv4_addr, octets[i] << (i * 8));
}
+ o2nm_lock_subsystem();
+ cluster = to_o2nm_cluster_from_node(node);
+ if (!cluster) {
+ o2nm_unlock_subsystem();
+ return -EINVAL;
+ }
+
ret = 0;
write_lock(&cluster->cl_nodes_lock);
if (o2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent))
ret = -EEXIST;
+ else if (test_and_set_bit(O2NM_NODE_ATTR_ADDRESS,
+ &node->nd_set_attributes))
+ ret = -EBUSY;
else {
rb_link_node(&node->nd_ip_node, parent, p);
rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree);
}
write_unlock(&cluster->cl_nodes_lock);
+ o2nm_unlock_subsystem();
+
if (ret)
return ret;
@@ -296,15 +314,16 @@ static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node,
return count;
}
-static ssize_t o2nm_node_local_read(struct o2nm_node *node, char *page)
+static ssize_t o2nm_node_local_show(struct config_item *item, char *page)
{
- return sprintf(page, "%d\n", node->nd_local);
+ return sprintf(page, "%d\n", to_o2nm_node(item)->nd_local);
}
-static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page,
+static ssize_t o2nm_node_local_store(struct config_item *item, const char *page,
size_t count)
{
- struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
+ struct o2nm_node *node = to_o2nm_node(item);
+ struct o2nm_cluster *cluster;
unsigned long tmp;
char *p = (char *)page;
ssize_t ret;
@@ -322,17 +341,26 @@ static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page,
!test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes))
return -EINVAL; /* XXX */
+ o2nm_lock_subsystem();
+ cluster = to_o2nm_cluster_from_node(node);
+ if (!cluster) {
+ ret = -EINVAL;
+ goto out;
+ }
+
/* the only failure case is trying to set a new local node
* when a different one is already set */
if (tmp && tmp == cluster->cl_has_local &&
- cluster->cl_local_node != node->nd_num)
- return -EBUSY;
+ cluster->cl_local_node != node->nd_num) {
+ ret = -EBUSY;
+ goto out;
+ }
/* bring up the rx thread if we're setting the new local node. */
if (tmp && !cluster->cl_has_local) {
ret = o2net_start_listening(node);
if (ret)
- return ret;
+ goto out;
}
if (!tmp && cluster->cl_has_local &&
@@ -347,114 +375,31 @@ static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page,
cluster->cl_local_node = node->nd_num;
}
- return count;
-}
-
-struct o2nm_node_attribute {
- struct configfs_attribute attr;
- ssize_t (*show)(struct o2nm_node *, char *);
- ssize_t (*store)(struct o2nm_node *, const char *, size_t);
-};
+ ret = count;
-static struct o2nm_node_attribute o2nm_node_attr_num = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "num",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2nm_node_num_read,
- .store = o2nm_node_num_write,
-};
-
-static struct o2nm_node_attribute o2nm_node_attr_ipv4_port = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "ipv4_port",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2nm_node_ipv4_port_read,
- .store = o2nm_node_ipv4_port_write,
-};
-
-static struct o2nm_node_attribute o2nm_node_attr_ipv4_address = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "ipv4_address",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2nm_node_ipv4_address_read,
- .store = o2nm_node_ipv4_address_write,
-};
+out:
+ o2nm_unlock_subsystem();
+ return ret;
+}
-static struct o2nm_node_attribute o2nm_node_attr_local = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "local",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2nm_node_local_read,
- .store = o2nm_node_local_write,
-};
+CONFIGFS_ATTR(o2nm_node_, num);
+CONFIGFS_ATTR(o2nm_node_, ipv4_port);
+CONFIGFS_ATTR(o2nm_node_, ipv4_address);
+CONFIGFS_ATTR(o2nm_node_, local);
static struct configfs_attribute *o2nm_node_attrs[] = {
- [O2NM_NODE_ATTR_NUM] = &o2nm_node_attr_num.attr,
- [O2NM_NODE_ATTR_PORT] = &o2nm_node_attr_ipv4_port.attr,
- [O2NM_NODE_ATTR_ADDRESS] = &o2nm_node_attr_ipv4_address.attr,
- [O2NM_NODE_ATTR_LOCAL] = &o2nm_node_attr_local.attr,
+ &o2nm_node_attr_num,
+ &o2nm_node_attr_ipv4_port,
+ &o2nm_node_attr_ipv4_address,
+ &o2nm_node_attr_local,
NULL,
};
-static int o2nm_attr_index(struct configfs_attribute *attr)
-{
- int i;
- for (i = 0; i < ARRAY_SIZE(o2nm_node_attrs); i++) {
- if (attr == o2nm_node_attrs[i])
- return i;
- }
- BUG();
- return 0;
-}
-
-static ssize_t o2nm_node_show(struct config_item *item,
- struct configfs_attribute *attr,
- char *page)
-{
- struct o2nm_node *node = to_o2nm_node(item);
- struct o2nm_node_attribute *o2nm_node_attr =
- container_of(attr, struct o2nm_node_attribute, attr);
- ssize_t ret = 0;
-
- if (o2nm_node_attr->show)
- ret = o2nm_node_attr->show(node, page);
- return ret;
-}
-
-static ssize_t o2nm_node_store(struct config_item *item,
- struct configfs_attribute *attr,
- const char *page, size_t count)
-{
- struct o2nm_node *node = to_o2nm_node(item);
- struct o2nm_node_attribute *o2nm_node_attr =
- container_of(attr, struct o2nm_node_attribute, attr);
- ssize_t ret;
- int attr_index = o2nm_attr_index(attr);
-
- if (o2nm_node_attr->store == NULL) {
- ret = -EINVAL;
- goto out;
- }
-
- if (test_bit(attr_index, &node->nd_set_attributes))
- return -EBUSY;
-
- ret = o2nm_node_attr->store(node, page, count);
- if (ret < count)
- goto out;
-
- set_bit(attr_index, &node->nd_set_attributes);
-out:
- return ret;
-}
-
static struct configfs_item_operations o2nm_node_item_ops = {
.release = o2nm_node_release,
- .show_attribute = o2nm_node_show,
- .store_attribute = o2nm_node_store,
};
-static struct config_item_type o2nm_node_type = {
+static const struct config_item_type o2nm_node_type = {
.ct_item_ops = &o2nm_node_item_ops,
.ct_attrs = o2nm_node_attrs,
.ct_owner = THIS_MODULE,
@@ -476,12 +421,6 @@ static struct o2nm_node_group *to_o2nm_node_group(struct config_group *group)
}
#endif
-struct o2nm_cluster_attribute {
- struct configfs_attribute attr;
- ssize_t (*show)(struct o2nm_cluster *, char *);
- ssize_t (*store)(struct o2nm_cluster *, const char *, size_t);
-};
-
static ssize_t o2nm_cluster_attr_write(const char *page, ssize_t count,
unsigned int *val)
{
@@ -502,15 +441,16 @@ static ssize_t o2nm_cluster_attr_write(const char *page, ssize_t count,
return count;
}
-static ssize_t o2nm_cluster_attr_idle_timeout_ms_read(
- struct o2nm_cluster *cluster, char *page)
+static ssize_t o2nm_cluster_idle_timeout_ms_show(struct config_item *item,
+ char *page)
{
- return sprintf(page, "%u\n", cluster->cl_idle_timeout_ms);
+ return sprintf(page, "%u\n", to_o2nm_cluster(item)->cl_idle_timeout_ms);
}
-static ssize_t o2nm_cluster_attr_idle_timeout_ms_write(
- struct o2nm_cluster *cluster, const char *page, size_t count)
+static ssize_t o2nm_cluster_idle_timeout_ms_store(struct config_item *item,
+ const char *page, size_t count)
{
+ struct o2nm_cluster *cluster = to_o2nm_cluster(item);
ssize_t ret;
unsigned int val;
@@ -537,15 +477,17 @@ static ssize_t o2nm_cluster_attr_idle_timeout_ms_write(
return ret;
}
-static ssize_t o2nm_cluster_attr_keepalive_delay_ms_read(
- struct o2nm_cluster *cluster, char *page)
+static ssize_t o2nm_cluster_keepalive_delay_ms_show(
+ struct config_item *item, char *page)
{
- return sprintf(page, "%u\n", cluster->cl_keepalive_delay_ms);
+ return sprintf(page, "%u\n",
+ to_o2nm_cluster(item)->cl_keepalive_delay_ms);
}
-static ssize_t o2nm_cluster_attr_keepalive_delay_ms_write(
- struct o2nm_cluster *cluster, const char *page, size_t count)
+static ssize_t o2nm_cluster_keepalive_delay_ms_store(
+ struct config_item *item, const char *page, size_t count)
{
+ struct o2nm_cluster *cluster = to_o2nm_cluster(item);
ssize_t ret;
unsigned int val;
@@ -572,22 +514,24 @@ static ssize_t o2nm_cluster_attr_keepalive_delay_ms_write(
return ret;
}
-static ssize_t o2nm_cluster_attr_reconnect_delay_ms_read(
- struct o2nm_cluster *cluster, char *page)
+static ssize_t o2nm_cluster_reconnect_delay_ms_show(
+ struct config_item *item, char *page)
{
- return sprintf(page, "%u\n", cluster->cl_reconnect_delay_ms);
+ return sprintf(page, "%u\n",
+ to_o2nm_cluster(item)->cl_reconnect_delay_ms);
}
-static ssize_t o2nm_cluster_attr_reconnect_delay_ms_write(
- struct o2nm_cluster *cluster, const char *page, size_t count)
+static ssize_t o2nm_cluster_reconnect_delay_ms_store(
+ struct config_item *item, const char *page, size_t count)
{
return o2nm_cluster_attr_write(page, count,
- &cluster->cl_reconnect_delay_ms);
+ &to_o2nm_cluster(item)->cl_reconnect_delay_ms);
}
-static ssize_t o2nm_cluster_attr_fence_method_read(
- struct o2nm_cluster *cluster, char *page)
+static ssize_t o2nm_cluster_fence_method_show(
+ struct config_item *item, char *page)
{
+ struct o2nm_cluster *cluster = to_o2nm_cluster(item);
ssize_t ret = 0;
if (cluster)
@@ -596,8 +540,8 @@ static ssize_t o2nm_cluster_attr_fence_method_read(
return ret;
}
-static ssize_t o2nm_cluster_attr_fence_method_write(
- struct o2nm_cluster *cluster, const char *page, size_t count)
+static ssize_t o2nm_cluster_fence_method_store(
+ struct config_item *item, const char *page, size_t count)
{
unsigned int i;
@@ -609,10 +553,10 @@ static ssize_t o2nm_cluster_attr_fence_method_write(
continue;
if (strncasecmp(page, o2nm_fence_method_desc[i], count - 1))
continue;
- if (cluster->cl_fence_method != i) {
+ if (to_o2nm_cluster(item)->cl_fence_method != i) {
printk(KERN_INFO "ocfs2: Changing fence method to %s\n",
o2nm_fence_method_desc[i]);
- cluster->cl_fence_method = i;
+ to_o2nm_cluster(item)->cl_fence_method = i;
}
return count;
}
@@ -621,79 +565,18 @@ bail:
return -EINVAL;
}
-static struct o2nm_cluster_attribute o2nm_cluster_attr_idle_timeout_ms = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "idle_timeout_ms",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2nm_cluster_attr_idle_timeout_ms_read,
- .store = o2nm_cluster_attr_idle_timeout_ms_write,
-};
-
-static struct o2nm_cluster_attribute o2nm_cluster_attr_keepalive_delay_ms = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "keepalive_delay_ms",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2nm_cluster_attr_keepalive_delay_ms_read,
- .store = o2nm_cluster_attr_keepalive_delay_ms_write,
-};
-
-static struct o2nm_cluster_attribute o2nm_cluster_attr_reconnect_delay_ms = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "reconnect_delay_ms",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2nm_cluster_attr_reconnect_delay_ms_read,
- .store = o2nm_cluster_attr_reconnect_delay_ms_write,
-};
-
-static struct o2nm_cluster_attribute o2nm_cluster_attr_fence_method = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "fence_method",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2nm_cluster_attr_fence_method_read,
- .store = o2nm_cluster_attr_fence_method_write,
-};
+CONFIGFS_ATTR(o2nm_cluster_, idle_timeout_ms);
+CONFIGFS_ATTR(o2nm_cluster_, keepalive_delay_ms);
+CONFIGFS_ATTR(o2nm_cluster_, reconnect_delay_ms);
+CONFIGFS_ATTR(o2nm_cluster_, fence_method);
static struct configfs_attribute *o2nm_cluster_attrs[] = {
- &o2nm_cluster_attr_idle_timeout_ms.attr,
- &o2nm_cluster_attr_keepalive_delay_ms.attr,
- &o2nm_cluster_attr_reconnect_delay_ms.attr,
- &o2nm_cluster_attr_fence_method.attr,
+ &o2nm_cluster_attr_idle_timeout_ms,
+ &o2nm_cluster_attr_keepalive_delay_ms,
+ &o2nm_cluster_attr_reconnect_delay_ms,
+ &o2nm_cluster_attr_fence_method,
NULL,
};
-static ssize_t o2nm_cluster_show(struct config_item *item,
- struct configfs_attribute *attr,
- char *page)
-{
- struct o2nm_cluster *cluster = to_o2nm_cluster(item);
- struct o2nm_cluster_attribute *o2nm_cluster_attr =
- container_of(attr, struct o2nm_cluster_attribute, attr);
- ssize_t ret = 0;
-
- if (o2nm_cluster_attr->show)
- ret = o2nm_cluster_attr->show(cluster, page);
- return ret;
-}
-
-static ssize_t o2nm_cluster_store(struct config_item *item,
- struct configfs_attribute *attr,
- const char *page, size_t count)
-{
- struct o2nm_cluster *cluster = to_o2nm_cluster(item);
- struct o2nm_cluster_attribute *o2nm_cluster_attr =
- container_of(attr, struct o2nm_cluster_attribute, attr);
- ssize_t ret;
-
- if (o2nm_cluster_attr->store == NULL) {
- ret = -EINVAL;
- goto out;
- }
-
- ret = o2nm_cluster_attr->store(cluster, page, count);
- if (ret < count)
- goto out;
-out:
- return ret;
-}
static struct config_item *o2nm_node_group_make_item(struct config_group *group,
const char *name)
@@ -722,13 +605,15 @@ static void o2nm_node_group_drop_item(struct config_group *group,
struct o2nm_node *node = to_o2nm_node(item);
struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent);
- o2net_disconnect_node(node);
+ if (cluster->cl_nodes[node->nd_num] == node) {
+ o2net_disconnect_node(node);
- if (cluster->cl_has_local &&
- (cluster->cl_local_node == node->nd_num)) {
- cluster->cl_has_local = 0;
- cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
- o2net_stop_listening(node);
+ if (cluster->cl_has_local &&
+ (cluster->cl_local_node == node->nd_num)) {
+ cluster->cl_has_local = 0;
+ cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
+ o2net_stop_listening(node);
+ }
}
/* XXX call into net to stop this node from trading messages */
@@ -757,7 +642,7 @@ static struct configfs_group_operations o2nm_node_group_group_ops = {
.drop_item = o2nm_node_group_drop_item,
};
-static struct config_item_type o2nm_node_group_type = {
+static const struct config_item_type o2nm_node_group_type = {
.ct_group_ops = &o2nm_node_group_group_ops,
.ct_owner = THIS_MODULE,
};
@@ -768,17 +653,14 @@ static void o2nm_cluster_release(struct config_item *item)
{
struct o2nm_cluster *cluster = to_o2nm_cluster(item);
- kfree(cluster->cl_group.default_groups);
kfree(cluster);
}
static struct configfs_item_operations o2nm_cluster_item_ops = {
.release = o2nm_cluster_release,
- .show_attribute = o2nm_cluster_show,
- .store_attribute = o2nm_cluster_store,
};
-static struct config_item_type o2nm_cluster_type = {
+static const struct config_item_type o2nm_cluster_type = {
.ct_item_ops = &o2nm_cluster_item_ops,
.ct_attrs = o2nm_cluster_attrs,
.ct_owner = THIS_MODULE,
@@ -806,29 +688,26 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g
struct o2nm_cluster *cluster = NULL;
struct o2nm_node_group *ns = NULL;
struct config_group *o2hb_group = NULL, *ret = NULL;
- void *defs = NULL;
- /* this runs under the parent dir's i_mutex; there can be only
+ /* this runs under the parent dir's i_rwsem; there can be only
* one caller in here at a time */
if (o2nm_single_cluster)
return ERR_PTR(-ENOSPC);
cluster = kzalloc(sizeof(struct o2nm_cluster), GFP_KERNEL);
ns = kzalloc(sizeof(struct o2nm_node_group), GFP_KERNEL);
- defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
o2hb_group = o2hb_alloc_hb_set();
- if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL)
+ if (cluster == NULL || ns == NULL || o2hb_group == NULL)
goto out;
config_group_init_type_name(&cluster->cl_group, name,
&o2nm_cluster_type);
+ configfs_add_default_group(&ns->ns_group, &cluster->cl_group);
+
config_group_init_type_name(&ns->ns_group, "node",
&o2nm_node_group_type);
+ configfs_add_default_group(o2hb_group, &cluster->cl_group);
- cluster->cl_group.default_groups = defs;
- cluster->cl_group.default_groups[0] = &ns->ns_group;
- cluster->cl_group.default_groups[1] = o2hb_group;
- cluster->cl_group.default_groups[2] = NULL;
rwlock_init(&cluster->cl_nodes_lock);
cluster->cl_node_ip_tree = RB_ROOT;
cluster->cl_reconnect_delay_ms = O2NET_RECONNECT_DELAY_MS_DEFAULT;
@@ -844,7 +723,6 @@ out:
kfree(cluster);
kfree(ns);
o2hb_free_hb_set(o2hb_group);
- kfree(defs);
ret = ERR_PTR(-ENOMEM);
}
@@ -854,18 +732,11 @@ out:
static void o2nm_cluster_group_drop_item(struct config_group *group, struct config_item *item)
{
struct o2nm_cluster *cluster = to_o2nm_cluster(item);
- int i;
- struct config_item *killme;
BUG_ON(o2nm_single_cluster != cluster);
o2nm_single_cluster = NULL;
- for (i = 0; cluster->cl_group.default_groups[i]; i++) {
- killme = &cluster->cl_group.default_groups[i]->cg_item;
- cluster->cl_group.default_groups[i] = NULL;
- config_item_put(killme);
- }
-
+ configfs_remove_default_groups(&cluster->cl_group);
config_item_put(item);
}
@@ -874,7 +745,7 @@ static struct configfs_group_operations o2nm_cluster_group_group_ops = {
.drop_item = o2nm_cluster_group_drop_item,
};
-static struct config_item_type o2nm_cluster_group_type = {
+static const struct config_item_type o2nm_cluster_group_type = {
.ct_group_ops = &o2nm_cluster_group_group_ops,
.ct_owner = THIS_MODULE,
};
@@ -890,6 +761,16 @@ static struct o2nm_cluster_group o2nm_cluster_group = {
},
};
+static inline void o2nm_lock_subsystem(void)
+{
+ mutex_lock(&o2nm_cluster_group.cs_subsys.su_mutex);
+}
+
+static inline void o2nm_unlock_subsystem(void)
+{
+ mutex_unlock(&o2nm_cluster_group.cs_subsys.su_mutex);
+}
+
int o2nm_depend_item(struct config_item *item)
{
return configfs_depend_item(&o2nm_cluster_group.cs_subsys, item);
@@ -897,7 +778,7 @@ int o2nm_depend_item(struct config_item *item)
void o2nm_undepend_item(struct config_item *item)
{
- configfs_undepend_item(&o2nm_cluster_group.cs_subsys, item);
+ configfs_undepend_item(item);
}
int o2nm_depend_this_node(void)
@@ -943,13 +824,9 @@ static void __exit exit_o2nm(void)
static int __init init_o2nm(void)
{
- int ret = -1;
+ int ret;
- cluster_print_version();
-
- ret = o2hb_init();
- if (ret)
- goto out;
+ o2hb_init();
ret = o2net_init();
if (ret)
@@ -984,6 +861,7 @@ out:
MODULE_AUTHOR("Oracle");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("OCFS2 cluster management");
module_init(init_o2nm)
module_exit(exit_o2nm)
diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h
index 09ea2d388bbb..3490e77a952d 100644
--- a/fs/ocfs2/cluster/nodemanager.h
+++ b/fs/ocfs2/cluster/nodemanager.h
@@ -1,27 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* nodemanager.h
*
* Function prototypes
*
* Copyright (C) 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
*/
#ifndef O2CLUSTER_NODEMANAGER_H
diff --git a/fs/ocfs2/cluster/ocfs2_heartbeat.h b/fs/ocfs2/cluster/ocfs2_heartbeat.h
index 3f4151da9709..6088c9f974dd 100644
--- a/fs/ocfs2/cluster/ocfs2_heartbeat.h
+++ b/fs/ocfs2/cluster/ocfs2_heartbeat.h
@@ -1,26 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* ocfs2_heartbeat.h
*
* On-disk structures for ocfs2_heartbeat
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#ifndef _OCFS2_HEARTBEAT_H
diff --git a/fs/ocfs2/cluster/ocfs2_nodemanager.h b/fs/ocfs2/cluster/ocfs2_nodemanager.h
index 49b594325bec..c9a0b77443e7 100644
--- a/fs/ocfs2/cluster/ocfs2_nodemanager.h
+++ b/fs/ocfs2/cluster/ocfs2_nodemanager.h
@@ -1,28 +1,11 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* ocfs2_nodemanager.h
*
* Header describing the interface between userspace and the kernel
* for the ocfs2_nodemanager module.
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
*/
#ifndef _OCFS2_NODEMANAGER_H
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index 1ec141e758d7..bfb8b456876c 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -1,23 +1,7 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- *
- * vim: noexpandtab sw=8 ts=8 sts=0:
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
*
* Copyright (C) 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
/* This quorum hack is only here until we transition to some more rational
@@ -39,7 +23,7 @@
* race between when we see a node start heartbeating and when we connect
* to it.
*
- * So nodes that are in this transtion put a hold on the quorum decision
+ * So nodes that are in this transition put a hold on the quorum decision
* with a counter. As they fall out of this transition they drop the count
* and if they're the last, they fire off the decision.
*/
@@ -76,20 +60,21 @@ static void o2quo_fence_self(void)
switch (o2nm_single_cluster->cl_fence_method) {
case O2NM_FENCE_PANIC:
panic("*** ocfs2 is very sorry to be fencing this system by "
- "panicing ***\n");
+ "panicking ***\n");
break;
default:
WARN_ON(o2nm_single_cluster->cl_fence_method >=
O2NM_FENCE_METHODS);
+ fallthrough;
case O2NM_FENCE_RESET:
printk(KERN_ERR "*** ocfs2 is very sorry to be fencing this "
"system by restarting ***\n");
emergency_restart();
break;
- };
+ }
}
-/* Indicate that a timeout occurred on a hearbeat region write. The
+/* Indicate that a timeout occurred on a heartbeat region write. The
* other nodes in the cluster may consider us dead at that time so we
* want to "fence" ourselves so that we don't scribble on the disk
* after they think they've recovered us. This can't solve all
@@ -108,7 +93,7 @@ static void o2quo_make_decision(struct work_struct *work)
int lowest_hb, lowest_reachable = 0, fence = 0;
struct o2quo_state *qs = &o2quo_state;
- spin_lock(&qs->qs_lock);
+ spin_lock_bh(&qs->qs_lock);
lowest_hb = find_first_bit(qs->qs_hb_bm, O2NM_MAX_NODES);
if (lowest_hb != O2NM_MAX_NODES)
@@ -160,9 +145,18 @@ static void o2quo_make_decision(struct work_struct *work)
}
out:
- spin_unlock(&qs->qs_lock);
- if (fence)
+ if (fence) {
+ spin_unlock_bh(&qs->qs_lock);
o2quo_fence_self();
+ } else {
+ mlog(ML_NOTICE, "not fencing this node, heartbeating: %d, "
+ "connected: %d, lowest: %d (%sreachable)\n",
+ qs->qs_heartbeating, qs->qs_connected, lowest_hb,
+ lowest_reachable ? "" : "un");
+ spin_unlock_bh(&qs->qs_lock);
+
+ }
+
}
static void o2quo_set_hold(struct o2quo_state *qs, u8 node)
@@ -195,14 +189,14 @@ static void o2quo_clear_hold(struct o2quo_state *qs, u8 node)
}
/* as a node comes up we delay the quorum decision until we know the fate of
- * the connection. the hold will be droped in conn_up or hb_down. it might be
+ * the connection. the hold will be dropped in conn_up or hb_down. it might be
* perpetuated by con_err until hb_down. if we already have a conn, we might
* be dropping a hold that conn_up got. */
void o2quo_hb_up(u8 node)
{
struct o2quo_state *qs = &o2quo_state;
- spin_lock(&qs->qs_lock);
+ spin_lock_bh(&qs->qs_lock);
qs->qs_heartbeating++;
mlog_bug_on_msg(qs->qs_heartbeating == O2NM_MAX_NODES,
@@ -217,7 +211,7 @@ void o2quo_hb_up(u8 node)
else
o2quo_clear_hold(qs, node);
- spin_unlock(&qs->qs_lock);
+ spin_unlock_bh(&qs->qs_lock);
}
/* hb going down releases any holds we might have had due to this node from
@@ -226,7 +220,7 @@ void o2quo_hb_down(u8 node)
{
struct o2quo_state *qs = &o2quo_state;
- spin_lock(&qs->qs_lock);
+ spin_lock_bh(&qs->qs_lock);
qs->qs_heartbeating--;
mlog_bug_on_msg(qs->qs_heartbeating < 0,
@@ -239,7 +233,7 @@ void o2quo_hb_down(u8 node)
o2quo_clear_hold(qs, node);
- spin_unlock(&qs->qs_lock);
+ spin_unlock_bh(&qs->qs_lock);
}
/* this tells us that we've decided that the node is still heartbeating
@@ -251,18 +245,18 @@ void o2quo_hb_still_up(u8 node)
{
struct o2quo_state *qs = &o2quo_state;
- spin_lock(&qs->qs_lock);
+ spin_lock_bh(&qs->qs_lock);
mlog(0, "node %u\n", node);
qs->qs_pending = 1;
o2quo_clear_hold(qs, node);
- spin_unlock(&qs->qs_lock);
+ spin_unlock_bh(&qs->qs_lock);
}
/* This is analogous to hb_up. as a node's connection comes up we delay the
- * quorum decision until we see it heartbeating. the hold will be droped in
+ * quorum decision until we see it heartbeating. the hold will be dropped in
* hb_up or hb_down. it might be perpetuated by con_err until hb_down. if
* it's already heartbeating we might be dropping a hold that conn_up got.
* */
@@ -270,7 +264,7 @@ void o2quo_conn_up(u8 node)
{
struct o2quo_state *qs = &o2quo_state;
- spin_lock(&qs->qs_lock);
+ spin_lock_bh(&qs->qs_lock);
qs->qs_connected++;
mlog_bug_on_msg(qs->qs_connected == O2NM_MAX_NODES,
@@ -285,7 +279,7 @@ void o2quo_conn_up(u8 node)
else
o2quo_clear_hold(qs, node);
- spin_unlock(&qs->qs_lock);
+ spin_unlock_bh(&qs->qs_lock);
}
/* we've decided that we won't ever be connecting to the node again. if it's
@@ -296,7 +290,7 @@ void o2quo_conn_err(u8 node)
{
struct o2quo_state *qs = &o2quo_state;
- spin_lock(&qs->qs_lock);
+ spin_lock_bh(&qs->qs_lock);
if (test_bit(node, qs->qs_conn_bm)) {
qs->qs_connected--;
@@ -305,14 +299,15 @@ void o2quo_conn_err(u8 node)
node, qs->qs_connected);
clear_bit(node, qs->qs_conn_bm);
+
+ if (test_bit(node, qs->qs_hb_bm))
+ o2quo_set_hold(qs, node);
}
mlog(0, "node %u, %d total\n", node, qs->qs_connected);
- if (test_bit(node, qs->qs_hb_bm))
- o2quo_set_hold(qs, node);
- spin_unlock(&qs->qs_lock);
+ spin_unlock_bh(&qs->qs_lock);
}
void o2quo_init(void)
diff --git a/fs/ocfs2/cluster/quorum.h b/fs/ocfs2/cluster/quorum.h
index 6649cc6f67c9..d64bf4482a4a 100644
--- a/fs/ocfs2/cluster/quorum.h
+++ b/fs/ocfs2/cluster/quorum.h
@@ -1,23 +1,6 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* Copyright (C) 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
*/
#ifndef O2CLUSTER_QUORUM_H
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index a4b07730b2e1..022f716c74ff 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -1,27 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-only
+/*
* sys.c
*
* OCFS2 cluster sysfs interface
*
* Copyright (C) 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation,
- * version 2 of the License.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
*/
#include <linux/kernel.h>
@@ -41,7 +24,7 @@ static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr,
return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION);
}
static struct kobj_attribute attr_version =
- __ATTR(interface_revision, S_IFREG | S_IRUGO, version_show, NULL);
+ __ATTR(interface_revision, S_IRUGO, version_show, NULL);
static struct attribute *o2cb_attrs[] = {
&attr_version.attr,
diff --git a/fs/ocfs2/cluster/sys.h b/fs/ocfs2/cluster/sys.h
index d66b8ab0045e..70aaba65317e 100644
--- a/fs/ocfs2/cluster/sys.h
+++ b/fs/ocfs2/cluster/sys.h
@@ -1,27 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
* sys.h
*
* Function prototypes for o2cb sysfs interface
*
* Copyright (C) 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation,
- * version 2 of the License.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
*/
#ifndef O2CLUSTER_SYS_H
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index d644dc611425..79b281e32f4c 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1,33 +1,17 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- *
- * vim: noexpandtab sw=8 ts=8 sts=0:
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
*
* Copyright (C) 2004 Oracle. All rights reserved.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
* ----
*
- * Callers for this were originally written against a very simple synchronus
+ * Callers for this were originally written against a very simple synchronous
* API. This implementation reflects those simple callers. Some day I'm sure
* we'll need to move to a more robust posting/callback mechanism.
*
* Transmit calls pass in kernel virtual addresses and block copying this into
* the socket's tx buffers via a usual blocking sendmsg. They'll block waiting
- * for a failed socket to timeout. TX callers can also pass in a poniter to an
+ * for a failed socket to timeout. TX callers can also pass in a pointer to an
* 'int' which gets filled with an errno off the wire in response to the
* message they send.
*
@@ -54,6 +38,7 @@
*/
#include <linux/kernel.h>
+#include <linux/sched/mm.h>
#include <linux/jiffies.h>
#include <linux/slab.h>
#include <linux/idr.h>
@@ -61,8 +46,9 @@
#include <linux/net.h>
#include <linux/export.h>
#include <net/tcp.h>
+#include <trace/events/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "heartbeat.h"
#include "tcp.h"
@@ -97,7 +83,7 @@
typeof(sc) __sc = (sc); \
mlog(ML_SOCKET, "[sc %p refs %d sock %p node %u page %p " \
"pg_off %zu] " fmt, __sc, \
- atomic_read(&__sc->sc_kref.refcount), __sc->sc_sock, \
+ kref_read(&__sc->sc_kref), __sc->sc_sock, \
__sc->sc_node->nd_num, __sc->sc_page, __sc->sc_page_off , \
##args); \
} while (0)
@@ -108,14 +94,14 @@ static struct rb_root o2net_handler_tree = RB_ROOT;
static struct o2net_node o2net_nodes[O2NM_MAX_NODES];
/* XXX someday we'll need better accounting */
-static struct socket *o2net_listen_sock = NULL;
+static struct socket *o2net_listen_sock;
/*
* listen work is only queued by the listening socket callbacks on the
* o2net_wq. teardown detaches the callbacks before destroying the workqueue.
* quorum work is queued as sock containers are shutdown.. stop_listening
* tears down all the node's sock containers, preventing future shutdowns
- * and queued quroum work, before canceling delayed quorum work and
+ * and queued quorum work, before canceling delayed quorum work and
* destroying the work queue.
*/
static struct workqueue_struct *o2net_wq;
@@ -137,9 +123,9 @@ static int o2net_sys_err_translations[O2NET_ERR_MAX] =
static void o2net_sc_connect_completed(struct work_struct *work);
static void o2net_rx_until_empty(struct work_struct *work);
static void o2net_shutdown_sc(struct work_struct *work);
-static void o2net_listen_data_ready(struct sock *sk, int bytes);
+static void o2net_listen_data_ready(struct sock *sk);
static void o2net_sc_send_keep_req(struct work_struct *work);
-static void o2net_idle_timer(unsigned long data);
+static void o2net_idle_timer(struct timer_list *t);
static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc);
@@ -262,17 +248,17 @@ static void o2net_update_recv_stats(struct o2net_sock_container *sc)
#endif /* CONFIG_OCFS2_FS_STATS */
-static inline int o2net_reconnect_delay(void)
+static inline unsigned int o2net_reconnect_delay(void)
{
return o2nm_single_cluster->cl_reconnect_delay_ms;
}
-static inline int o2net_keepalive_delay(void)
+static inline unsigned int o2net_keepalive_delay(void)
{
return o2nm_single_cluster->cl_keepalive_delay_ms;
}
-static inline int o2net_idle_timeout(void)
+static inline unsigned int o2net_idle_timeout(void)
{
return o2nm_single_cluster->cl_idle_timeout_ms;
}
@@ -449,9 +435,7 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc);
INIT_DELAYED_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req);
- init_timer(&sc->sc_idle_timeout);
- sc->sc_idle_timeout.function = o2net_idle_timer;
- sc->sc_idle_timeout.data = (unsigned long)sc;
+ timer_setup(&sc->sc_idle_timeout, o2net_idle_timer, 0);
sclog(sc, "alloced\n");
@@ -536,15 +520,16 @@ static void o2net_set_nn_state(struct o2net_node *nn,
if (nn->nn_persistent_error || nn->nn_sc_valid)
wake_up(&nn->nn_sc_wq);
- if (!was_err && nn->nn_persistent_error) {
+ if (was_valid && !was_err && nn->nn_persistent_error) {
o2quo_conn_err(o2net_num_from_nn(nn));
queue_delayed_work(o2net_wq, &nn->nn_still_up,
msecs_to_jiffies(O2NET_QUORUM_DELAY_MS));
}
if (was_valid && !valid) {
- printk(KERN_NOTICE "o2net: No longer connected to "
- SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc));
+ if (old_sc)
+ printk(KERN_NOTICE "o2net: No longer connected to "
+ SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc));
o2net_complete_nodes_nsw(nn);
}
@@ -596,13 +581,16 @@ static void o2net_set_nn_state(struct o2net_node *nn,
}
/* see o2net_register_callbacks() */
-static void o2net_data_ready(struct sock *sk, int bytes)
+static void o2net_data_ready(struct sock *sk)
{
- void (*ready)(struct sock *sk, int bytes);
+ void (*ready)(struct sock *sk);
+ struct o2net_sock_container *sc;
- read_lock(&sk->sk_callback_lock);
- if (sk->sk_user_data) {
- struct o2net_sock_container *sc = sk->sk_user_data;
+ trace_sk_data_ready(sk);
+
+ read_lock_bh(&sk->sk_callback_lock);
+ sc = sk->sk_user_data;
+ if (sc) {
sclog(sc, "data_ready hit\n");
o2net_set_data_ready_time(sc);
o2net_sc_queue_work(sc, &sc->sc_rx_work);
@@ -610,9 +598,9 @@ static void o2net_data_ready(struct sock *sk, int bytes)
} else {
ready = sk->sk_data_ready;
}
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
- ready(sk, bytes);
+ ready(sk);
}
/* see o2net_register_callbacks() */
@@ -621,7 +609,7 @@ static void o2net_state_change(struct sock *sk)
void (*state_change)(struct sock *sk);
struct o2net_sock_container *sc;
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
sc = sk->sk_user_data;
if (sc == NULL) {
state_change = sk->sk_state_change;
@@ -648,7 +636,7 @@ static void o2net_state_change(struct sock *sk)
break;
}
out:
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
state_change(sk);
}
@@ -736,7 +724,7 @@ static void o2net_shutdown_sc(struct work_struct *work)
if (o2net_unregister_callbacks(sc->sc_sock->sk, sc)) {
/* we shouldn't flush as we're in the thread, the
* races with pending sc work structs are harmless */
- del_timer_sync(&sc->sc_idle_timeout);
+ timer_delete_sync(&sc->sc_idle_timeout);
o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
sc_put(sc);
kernel_sock_shutdown(sc->sc_sock, SHUT_RDWR);
@@ -765,32 +753,32 @@ static struct o2net_msg_handler *
o2net_handler_tree_lookup(u32 msg_type, u32 key, struct rb_node ***ret_p,
struct rb_node **ret_parent)
{
- struct rb_node **p = &o2net_handler_tree.rb_node;
- struct rb_node *parent = NULL;
+ struct rb_node **p = &o2net_handler_tree.rb_node;
+ struct rb_node *parent = NULL;
struct o2net_msg_handler *nmh, *ret = NULL;
int cmp;
- while (*p) {
- parent = *p;
- nmh = rb_entry(parent, struct o2net_msg_handler, nh_node);
+ while (*p) {
+ parent = *p;
+ nmh = rb_entry(parent, struct o2net_msg_handler, nh_node);
cmp = o2net_handler_cmp(nmh, msg_type, key);
- if (cmp < 0)
- p = &(*p)->rb_left;
- else if (cmp > 0)
- p = &(*p)->rb_right;
- else {
+ if (cmp < 0)
+ p = &(*p)->rb_left;
+ else if (cmp > 0)
+ p = &(*p)->rb_right;
+ else {
ret = nmh;
- break;
+ break;
}
- }
+ }
- if (ret_p != NULL)
- *ret_p = p;
- if (ret_parent != NULL)
- *ret_parent = parent;
+ if (ret_p != NULL)
+ *ret_p = p;
+ if (ret_parent != NULL)
+ *ret_parent = parent;
- return ret;
+ return ret;
}
static void o2net_handler_kref_release(struct kref *kref)
@@ -871,8 +859,6 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
"for type %u key %08x\n", msg_type, key);
}
write_unlock(&o2net_handler_lock);
- if (ret)
- goto out;
out:
if (ret)
@@ -915,74 +901,51 @@ static struct o2net_msg_handler *o2net_handler_get(u32 msg_type, u32 key)
static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len)
{
- int ret;
- mm_segment_t oldfs;
- struct kvec vec = {
- .iov_len = len,
- .iov_base = data,
- };
- struct msghdr msg = {
- .msg_iovlen = 1,
- .msg_iov = (struct iovec *)&vec,
- .msg_flags = MSG_DONTWAIT,
- };
-
- oldfs = get_fs();
- set_fs(get_ds());
- ret = sock_recvmsg(sock, &msg, len, msg.msg_flags);
- set_fs(oldfs);
-
- return ret;
+ struct kvec vec = { .iov_len = len, .iov_base = data, };
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT, };
+ iov_iter_kvec(&msg.msg_iter, ITER_DEST, &vec, 1, len);
+ return sock_recvmsg(sock, &msg, MSG_DONTWAIT);
}
static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec,
size_t veclen, size_t total)
{
int ret;
- mm_segment_t oldfs;
- struct msghdr msg = {
- .msg_iov = (struct iovec *)vec,
- .msg_iovlen = veclen,
- };
+ struct msghdr msg = {.msg_flags = 0,};
if (sock == NULL) {
ret = -EINVAL;
goto out;
}
- oldfs = get_fs();
- set_fs(get_ds());
- ret = sock_sendmsg(sock, &msg, total);
- set_fs(oldfs);
- if (ret != total) {
- mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret,
- total);
- if (ret >= 0)
- ret = -EPIPE; /* should be smarter, I bet */
- goto out;
- }
-
- ret = 0;
+ ret = kernel_sendmsg(sock, &msg, vec, veclen, total);
+ if (likely(ret == total))
+ return 0;
+ mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, total);
+ if (ret >= 0)
+ ret = -EPIPE; /* should be smarter, I bet */
out:
- if (ret < 0)
- mlog(0, "returning error: %d\n", ret);
+ mlog(0, "returning error: %d\n", ret);
return ret;
}
static void o2net_sendpage(struct o2net_sock_container *sc,
- void *kmalloced_virt,
- size_t size)
+ void *virt, size_t size)
{
struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+ struct msghdr msg = {};
+ struct bio_vec bv;
ssize_t ret;
+ bvec_set_virt(&bv, virt, size);
+ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bv, 1, size);
+
while (1) {
+ msg.msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES;
mutex_lock(&sc->sc_send_lock);
- ret = sc->sc_sock->ops->sendpage(sc->sc_sock,
- virt_to_page(kmalloced_virt),
- (long)kmalloced_virt & ~PAGE_MASK,
- size, MSG_DONTWAIT);
+ ret = sock_sendmsg(sc->sc_sock, &msg);
mutex_unlock(&sc->sc_send_lock);
+
if (ret == size)
break;
if (ret == (ssize_t)-EAGAIN) {
@@ -1033,16 +996,15 @@ static int o2net_tx_can_proceed(struct o2net_node *nn,
}
/* Get a map of all nodes to which this node is currently connected to */
-void o2net_fill_node_map(unsigned long *map, unsigned bytes)
+void o2net_fill_node_map(unsigned long *map, unsigned int bits)
{
struct o2net_sock_container *sc;
int node, ret;
- BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long)));
-
- memset(map, 0, bytes);
+ bitmap_zero(map, bits);
for (node = 0; node < O2NM_MAX_NODES; ++node) {
- o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret);
+ if (!o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret))
+ continue;
if (!ret) {
set_bit(node, map);
sc_put(sc);
@@ -1102,7 +1064,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
o2net_set_nst_sock_container(&nst, sc);
veclen = caller_veclen + 1;
- vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC);
+ vec = kmalloc_array(veclen, sizeof(struct kvec), GFP_ATOMIC);
if (vec == NULL) {
mlog(0, "failed to %zu element kvec!\n", veclen);
ret = -ENOMEM;
@@ -1238,7 +1200,6 @@ static int o2net_process_message(struct o2net_sock_container *sc,
msglog(hdr, "bad magic\n");
ret = -EINVAL;
goto out;
- break;
}
/* find a handler for it */
@@ -1458,7 +1419,7 @@ out:
return ret;
}
-/* this work func is triggerd by data ready. it reads until it can read no
+/* this work func is triggered by data ready. it reads until it can read no
* more. it interprets 0, eof, as fatal. if data_ready hits while we're doing
* our work the work struct will be marked and we'll be called again. */
static void o2net_rx_until_empty(struct work_struct *work)
@@ -1481,31 +1442,6 @@ static void o2net_rx_until_empty(struct work_struct *work)
sc_put(sc);
}
-static int o2net_set_nodelay(struct socket *sock)
-{
- int ret, val = 1;
- mm_segment_t oldfs;
-
- oldfs = get_fs();
- set_fs(KERNEL_DS);
-
- /*
- * Dear unsuspecting programmer,
- *
- * Don't use sock_setsockopt() for SOL_TCP. It doesn't check its level
- * argument and assumes SOL_SOCKET so, say, your TCP_NODELAY will
- * silently turn into SO_DEBUG.
- *
- * Yours,
- * Keeper of hilariously fragile interfaces.
- */
- ret = sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY,
- (char __user *)&val, sizeof(val));
-
- set_fs(oldfs);
- return ret;
-}
-
static void o2net_initialize_handshake(void)
{
o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32(
@@ -1547,12 +1483,13 @@ static void o2net_sc_send_keep_req(struct work_struct *work)
sc_put(sc);
}
-/* socket shutdown does a del_timer_sync against this as it tears down.
+/* socket shutdown does a timer_delete_sync against this as it tears down.
* we can't start this timer until we've got to the point in sc buildup
* where shutdown is going to be involved */
-static void o2net_idle_timer(unsigned long data)
+static void o2net_idle_timer(struct timer_list *t)
{
- struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
+ struct o2net_sock_container *sc = timer_container_of(sc, t,
+ sc_idle_timeout);
struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
#ifdef CONFIG_DEBUG_FS
unsigned long msecs = ktime_to_ms(ktime_get()) -
@@ -1562,16 +1499,20 @@ static void o2net_idle_timer(unsigned long data)
#endif
printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been "
- "idle for %lu.%lu secs, shutting it down.\n", SC_NODEF_ARGS(sc),
- msecs / 1000, msecs % 1000);
+ "idle for %lu.%lu secs.\n",
+ SC_NODEF_ARGS(sc), msecs / 1000, msecs % 1000);
- /*
- * Initialize the nn_timeout so that the next connection attempt
- * will continue in o2net_start_connect.
+ /* idle timerout happen, don't shutdown the connection, but
+ * make fence decision. Maybe the connection can recover before
+ * the decision is made.
*/
atomic_set(&nn->nn_timeout, 1);
+ o2quo_conn_err(o2net_num_from_nn(nn));
+ queue_delayed_work(o2net_wq, &nn->nn_still_up,
+ msecs_to_jiffies(O2NET_QUORUM_DELAY_MS));
+
+ o2net_sc_reset_idle_timer(sc);
- o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
}
static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
@@ -1586,6 +1527,15 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
static void o2net_sc_postpone_idle(struct o2net_sock_container *sc)
{
+ struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+
+ /* clear fence decision since the connection recover from timeout*/
+ if (atomic_read(&nn->nn_timeout)) {
+ o2quo_conn_up(o2net_num_from_nn(nn));
+ cancel_delayed_work(&nn->nn_still_up);
+ atomic_set(&nn->nn_timeout, 0);
+ }
+
/* Only push out an existing timer */
if (timer_pending(&sc->sc_idle_timeout))
o2net_sc_reset_idle_timer(sc);
@@ -1606,23 +1556,25 @@ static void o2net_start_connect(struct work_struct *work)
struct sockaddr_in myaddr = {0, }, remoteaddr = {0, };
int ret = 0, stop;
unsigned int timeout;
+ unsigned int nofs_flag;
+ /*
+ * sock_create allocates the sock with GFP_KERNEL. We must
+ * prevent the filesystem from being reentered by memory reclaim.
+ */
+ nofs_flag = memalloc_nofs_save();
/* if we're greater we initiate tx, otherwise we accept */
if (o2nm_this_node() <= o2net_num_from_nn(nn))
goto out;
/* watch for racing with tearing a node down */
node = o2nm_get_node_by_num(o2net_num_from_nn(nn));
- if (node == NULL) {
- ret = 0;
+ if (node == NULL)
goto out;
- }
mynode = o2nm_get_node_by_num(o2nm_this_node());
- if (mynode == NULL) {
- ret = 0;
+ if (mynode == NULL)
goto out;
- }
spin_lock(&nn->nn_lock);
/*
@@ -1657,12 +1609,13 @@ static void o2net_start_connect(struct work_struct *work)
sc->sc_sock = sock; /* freed by sc_kref_release */
sock->sk->sk_allocation = GFP_ATOMIC;
+ sock->sk->sk_use_task_frag = false;
myaddr.sin_family = AF_INET;
myaddr.sin_addr.s_addr = mynode->nd_ipv4_address;
myaddr.sin_port = htons(0); /* any port */
- ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr,
+ ret = sock->ops->bind(sock, (struct sockaddr_unsized *)&myaddr,
sizeof(myaddr));
if (ret) {
mlog(ML_ERROR, "bind failed with %d at address %pI4\n",
@@ -1670,11 +1623,8 @@ static void o2net_start_connect(struct work_struct *work)
goto out;
}
- ret = o2net_set_nodelay(sc->sc_sock);
- if (ret) {
- mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret);
- goto out;
- }
+ tcp_sock_set_nodelay(sc->sc_sock->sk);
+ tcp_sock_set_user_timeout(sock->sk, O2NET_TCP_USER_TIMEOUT);
o2net_register_callbacks(sc->sc_sock->sk, sc);
@@ -1688,20 +1638,19 @@ static void o2net_start_connect(struct work_struct *work)
remoteaddr.sin_port = node->nd_ipv4_port;
ret = sc->sc_sock->ops->connect(sc->sc_sock,
- (struct sockaddr *)&remoteaddr,
+ (struct sockaddr_unsized *)&remoteaddr,
sizeof(remoteaddr),
O_NONBLOCK);
if (ret == -EINPROGRESS)
ret = 0;
out:
- if (ret) {
+ if (ret && sc) {
printk(KERN_NOTICE "o2net: Connect attempt to " SC_NODEF_FMT
" failed with errno %d\n", SC_NODEF_ARGS(sc), ret);
/* 0 err so that another will be queued and attempted
* from set_nn_state */
- if (sc)
- o2net_ensure_shutdown(nn, sc, 0);
+ o2net_ensure_shutdown(nn, sc, 0);
}
if (sc)
sc_put(sc);
@@ -1710,6 +1659,7 @@ out:
if (mynode)
o2nm_node_put(mynode);
+ memalloc_nofs_restore(nofs_flag);
return;
}
@@ -1721,12 +1671,13 @@ static void o2net_connect_expired(struct work_struct *work)
spin_lock(&nn->nn_lock);
if (!nn->nn_sc_valid) {
printk(KERN_NOTICE "o2net: No connection established with "
- "node %u after %u.%u seconds, giving up.\n",
+ "node %u after %u.%u seconds, check network and"
+ " cluster configuration.\n",
o2net_num_from_nn(nn),
o2net_idle_timeout() / 1000,
o2net_idle_timeout() % 1000);
- o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
+ o2net_set_nn_state(nn, NULL, 0, 0);
}
spin_unlock(&nn->nn_lock);
}
@@ -1787,7 +1738,7 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
(msecs_to_jiffies(o2net_reconnect_delay()) + 1);
if (node_num != o2nm_this_node()) {
- /* believe it or not, accept and node hearbeating testing
+ /* believe it or not, accept and node heartbeating testing
* can succeed for this node before we got here.. so
* only use set_nn_state to clear the persistent error
* if that hasn't already happened */
@@ -1826,17 +1777,28 @@ int o2net_register_hb_callbacks(void)
/* ------------------------------------------------------------ */
-static int o2net_accept_one(struct socket *sock)
+static int o2net_accept_one(struct socket *sock, int *more)
{
- int ret, slen;
+ int ret;
struct sockaddr_in sin;
struct socket *new_sock = NULL;
struct o2nm_node *node = NULL;
struct o2nm_node *local_node = NULL;
struct o2net_sock_container *sc = NULL;
+ struct proto_accept_arg arg = {
+ .flags = O_NONBLOCK,
+ };
struct o2net_node *nn;
+ unsigned int nofs_flag;
+
+ /*
+ * sock_create_lite allocates the sock with GFP_KERNEL. We must
+ * prevent the filesystem from being reentered by memory reclaim.
+ */
+ nofs_flag = memalloc_nofs_save();
BUG_ON(sock == NULL);
+ *more = 0;
ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
sock->sk->sk_protocol, &new_sock);
if (ret)
@@ -1844,21 +1806,17 @@ static int o2net_accept_one(struct socket *sock)
new_sock->type = sock->type;
new_sock->ops = sock->ops;
- ret = sock->ops->accept(sock, new_sock, O_NONBLOCK);
+ ret = sock->ops->accept(sock, new_sock, &arg);
if (ret < 0)
goto out;
+ *more = 1;
new_sock->sk->sk_allocation = GFP_ATOMIC;
- ret = o2net_set_nodelay(new_sock);
- if (ret) {
- mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret);
- goto out;
- }
+ tcp_sock_set_nodelay(new_sock->sk);
+ tcp_sock_set_user_timeout(new_sock->sk, O2NET_TCP_USER_TIMEOUT);
- slen = sizeof(sin);
- ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin,
- &slen, 1);
+ ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin, 1);
if (ret < 0)
goto out;
@@ -1873,12 +1831,16 @@ static int o2net_accept_one(struct socket *sock)
if (o2nm_this_node() >= node->nd_num) {
local_node = o2nm_get_node_by_num(o2nm_this_node());
- printk(KERN_NOTICE "o2net: Unexpected connect attempt seen "
- "at node '%s' (%u, %pI4:%d) from node '%s' (%u, "
- "%pI4:%d)\n", local_node->nd_name, local_node->nd_num,
- &(local_node->nd_ipv4_address),
- ntohs(local_node->nd_ipv4_port), node->nd_name,
- node->nd_num, &sin.sin_addr.s_addr, ntohs(sin.sin_port));
+ if (local_node)
+ printk(KERN_NOTICE "o2net: Unexpected connect attempt "
+ "seen at node '%s' (%u, %pI4:%d) from "
+ "node '%s' (%u, %pI4:%d)\n",
+ local_node->nd_name, local_node->nd_num,
+ &(local_node->nd_ipv4_address),
+ ntohs(local_node->nd_ipv4_port),
+ node->nd_name,
+ node->nd_num, &sin.sin_addr.s_addr,
+ ntohs(sin.sin_port));
ret = -EINVAL;
goto out;
}
@@ -1939,39 +1901,78 @@ out:
o2nm_node_put(local_node);
if (sc)
sc_put(sc);
+
+ memalloc_nofs_restore(nofs_flag);
return ret;
}
+/*
+ * This function is invoked in response to one or more
+ * pending accepts at softIRQ level. We must drain the
+ * entire que before returning.
+ */
+
static void o2net_accept_many(struct work_struct *work)
{
struct socket *sock = o2net_listen_sock;
- while (o2net_accept_one(sock) == 0)
+ int more;
+
+ /*
+ * It is critical to note that due to interrupt moderation
+ * at the network driver level, we can't assume to get a
+ * softIRQ for every single conn since tcp SYN packets
+ * can arrive back-to-back, and therefore many pending
+ * accepts may result in just 1 softIRQ. If we terminate
+ * the o2net_accept_one() loop upon seeing an err, what happens
+ * to the rest of the conns in the queue? If no new SYN
+ * arrives for hours, no softIRQ will be delivered,
+ * and the connections will just sit in the queue.
+ */
+
+ for (;;) {
+ o2net_accept_one(sock, &more);
+ if (!more)
+ break;
cond_resched();
+ }
}
-static void o2net_listen_data_ready(struct sock *sk, int bytes)
+static void o2net_listen_data_ready(struct sock *sk)
{
- void (*ready)(struct sock *sk, int bytes);
+ void (*ready)(struct sock *sk);
+
+ trace_sk_data_ready(sk);
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
ready = sk->sk_user_data;
if (ready == NULL) { /* check for teardown race */
ready = sk->sk_data_ready;
goto out;
}
- /* ->sk_data_ready is also called for a newly established child socket
- * before it has been accepted and the acceptor has set up their
- * data_ready.. we only want to queue listen work for our listening
- * socket */
+ /* This callback may called twice when a new connection
+ * is being established as a child socket inherits everything
+ * from a parent LISTEN socket, including the data_ready cb of
+ * the parent. This leads to a hazard. In o2net_accept_one()
+ * we are still initializing the child socket but have not
+ * changed the inherited data_ready callback yet when
+ * data starts arriving.
+ * We avoid this hazard by checking the state.
+ * For the listening socket, the state will be TCP_LISTEN; for the new
+ * socket, will be TCP_ESTABLISHED. Also, in this case,
+ * sk->sk_user_data is not a valid function pointer.
+ */
+
if (sk->sk_state == TCP_LISTEN) {
- mlog(ML_TCP, "bytes: %d\n", bytes);
queue_work(o2net_wq, &o2net_listen_work);
+ } else {
+ ready = NULL;
}
out:
- read_unlock(&sk->sk_callback_lock);
- ready(sk, bytes);
+ read_unlock_bh(&sk->sk_callback_lock);
+ if (ready != NULL)
+ ready(sk);
}
static int o2net_open_listening_sock(__be32 addr, __be16 port)
@@ -2001,7 +2002,7 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port)
INIT_WORK(&o2net_listen_work, o2net_accept_many);
sock->sk->sk_reuse = SK_CAN_REUSE;
- ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
+ ret = sock->ops->bind(sock, (struct sockaddr_unsized *)&sin, sizeof(sin));
if (ret < 0) {
printk(KERN_ERR "o2net: Error %d while binding socket at "
"%pI4:%u\n", ret, &addr, ntohs(port));
@@ -2037,7 +2038,7 @@ int o2net_start_listening(struct o2nm_node *node)
BUG_ON(o2net_listen_sock != NULL);
mlog(ML_KTHREAD, "starting o2net thread...\n");
- o2net_wq = create_singlethread_workqueue("o2net");
+ o2net_wq = alloc_ordered_workqueue("o2net", WQ_MEM_RECLAIM);
if (o2net_wq == NULL) {
mlog(ML_ERROR, "unable to launch o2net thread\n");
return -ENOMEM; /* ? */
@@ -2093,22 +2094,23 @@ void o2net_stop_listening(struct o2nm_node *node)
int o2net_init(void)
{
+ struct folio *folio;
+ void *p;
unsigned long i;
o2quo_init();
+ o2net_debugfs_init();
- if (o2net_debugfs_init())
- return -ENOMEM;
+ folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, 0);
+ if (!folio)
+ goto out;
- o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL);
- o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
- o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
- if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp) {
- kfree(o2net_hand);
- kfree(o2net_keep_req);
- kfree(o2net_keep_resp);
- return -ENOMEM;
- }
+ p = folio_address(folio);
+ o2net_hand = p;
+ p += sizeof(struct o2net_handshake);
+ o2net_keep_req = p;
+ p += sizeof(struct o2net_msg);
+ o2net_keep_resp = p;
o2net_hand->protocol_version = cpu_to_be64(O2NET_PROTOCOL_VERSION);
o2net_hand->connector_id = cpu_to_be64(1);
@@ -2133,13 +2135,16 @@ int o2net_init(void)
}
return 0;
+
+out:
+ o2net_debugfs_exit();
+ o2quo_exit();
+ return -ENOMEM;
}
void o2net_exit(void)
{
o2quo_exit();
- kfree(o2net_hand);
- kfree(o2net_keep_req);
- kfree(o2net_keep_resp);
o2net_debugfs_exit();
+ folio_put(virt_to_folio(o2net_hand));
}
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index 5bada2a69b50..a75b551d31c7 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -1,27 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* tcp.h
*
* Function prototypes
*
* Copyright (C) 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
*/
#ifndef O2CLUSTER_TCP_H
@@ -47,7 +30,7 @@ struct o2net_msg
__be32 status;
__be32 key;
__be32 msg_num;
- __u8 buf[0];
+ __u8 buf[];
};
typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data,
@@ -63,6 +46,7 @@ typedef void (o2net_post_msg_handler_func)(int status, void *data,
#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 2000
#define O2NET_IDLE_TIMEOUT_MS_DEFAULT 30000
+#define O2NET_TCP_USER_TIMEOUT 0x7fffffff
/* TODO: figure this out.... */
static inline int o2net_link_down(int err, struct socket *sock)
@@ -123,16 +107,15 @@ struct o2net_send_tracking;
struct o2net_sock_container;
#ifdef CONFIG_DEBUG_FS
-int o2net_debugfs_init(void);
+void o2net_debugfs_init(void);
void o2net_debugfs_exit(void);
void o2net_debug_add_nst(struct o2net_send_tracking *nst);
void o2net_debug_del_nst(struct o2net_send_tracking *nst);
void o2net_debug_add_sc(struct o2net_sock_container *sc);
void o2net_debug_del_sc(struct o2net_sock_container *sc);
#else
-static inline int o2net_debugfs_init(void)
+static inline void o2net_debugfs_init(void)
{
- return 0;
}
static inline void o2net_debugfs_exit(void)
{
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 4cbcb65784a3..601c99bd2611 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -1,22 +1,6 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* Copyright (C) 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#ifndef O2CLUSTER_TCP_INTERNAL_H
@@ -107,12 +91,12 @@ struct o2net_node {
struct list_head nn_status_list;
/* connects are attempted from when heartbeat comes up until either hb
- * goes down, the node is unconfigured, no connect attempts succeed
- * before O2NET_CONN_IDLE_DELAY, or a connect succeeds. connect_work
- * is queued from set_nn_state both from hb up and from itself if a
- * connect attempt fails and so can be self-arming. shutdown is
- * careful to first mark the nn such that no connects will be attempted
- * before canceling delayed connect work and flushing the queue. */
+ * goes down, the node is unconfigured, or a connect succeeds.
+ * connect_work is queued from set_nn_state both from hb up and from
+ * itself if a connect attempt fails and so can be self-arming.
+ * shutdown is careful to first mark the nn such that no connects will
+ * be attempted before canceling delayed connect work and flushing the
+ * queue. */
struct delayed_work nn_connect_work;
unsigned long nn_last_connect_attempt;
@@ -165,7 +149,7 @@ struct o2net_sock_container {
/* original handlers for the sockets */
void (*sc_state_change)(struct sock *sk);
- void (*sc_data_ready)(struct sock *sk, int bytes);
+ void (*sc_data_ready)(struct sock *sk);
u32 sc_msg_key;
u16 sc_msg_type;
@@ -196,7 +180,7 @@ struct o2net_msg_handler {
u32 nh_msg_type;
u32 nh_key;
o2net_msg_handler_func *nh_func;
- o2net_msg_handler_func *nh_func_data;
+ void *nh_func_data;
o2net_post_msg_handler_func
*nh_post_func;
struct kref nh_kref;
diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c
deleted file mode 100644
index a56eee6abad3..000000000000
--- a/fs/ocfs2/cluster/ver.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ver.c
- *
- * version string
- *
- * Copyright (C) 2002, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include "ver.h"
-
-#define CLUSTER_BUILD_VERSION "1.5.0"
-
-#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION
-
-void cluster_print_version(void)
-{
- printk(KERN_INFO "%s\n", VERSION_STR);
-}
-
-MODULE_DESCRIPTION(VERSION_STR);
-
-MODULE_VERSION(CLUSTER_BUILD_VERSION);
diff --git a/fs/ocfs2/cluster/ver.h b/fs/ocfs2/cluster/ver.h
deleted file mode 100644
index 32554c3382c2..000000000000
--- a/fs/ocfs2/cluster/ver.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ver.h
- *
- * Function prototypes
- *
- * Copyright (C) 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef O2CLUSTER_VER_H
-#define O2CLUSTER_VER_H
-
-void cluster_print_version(void);
-
-#endif /* O2CLUSTER_VER_H */
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index ef999729e274..1873bbbb7e5b 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -1,26 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* dcache.c
*
* dentry cache handling code
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
@@ -37,19 +21,19 @@
#include "dlmglue.h"
#include "file.h"
#include "inode.h"
-#include "super.h"
#include "ocfs2_trace.h"
void ocfs2_dentry_attach_gen(struct dentry *dentry)
{
unsigned long gen =
- OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
- BUG_ON(dentry->d_inode);
+ OCFS2_I(d_inode(dentry->d_parent))->ip_dir_lock_gen;
+ BUG_ON(d_inode(dentry));
dentry->d_fsdata = (void *)gen;
}
-static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags)
+static int ocfs2_dentry_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
struct inode *inode;
int ret = 0; /* if all else fails, just return false */
@@ -58,11 +42,10 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags)
if (flags & LOOKUP_RCU)
return -ECHILD;
- inode = dentry->d_inode;
+ inode = d_inode(dentry);
osb = OCFS2_SB(dentry->d_sb);
- trace_ocfs2_dentry_revalidate(dentry, dentry->d_name.len,
- dentry->d_name.name);
+ trace_ocfs2_dentry_revalidate(dentry, name->len, name->name);
/* For a negative dentry -
* check the generation number of the parent and compare with the
@@ -70,11 +53,8 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags)
*/
if (inode == NULL) {
unsigned long gen = (unsigned long) dentry->d_fsdata;
- unsigned long pgen =
- OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
-
- trace_ocfs2_dentry_revalidate_negative(dentry->d_name.len,
- dentry->d_name.name,
+ unsigned long pgen = OCFS2_I(dir)->ip_dir_lock_gen;
+ trace_ocfs2_dentry_revalidate_negative(name->len, name->name,
pgen, gen);
if (gen != pgen)
goto bail;
@@ -140,17 +120,10 @@ static int ocfs2_match_dentry(struct dentry *dentry,
if (!dentry->d_fsdata)
return 0;
- if (!dentry->d_parent)
- return 0;
-
if (skip_unhashed && d_unhashed(dentry))
return 0;
- parent = dentry->d_parent->d_inode;
- /* Negative parent dentry? */
- if (!parent)
- return 0;
-
+ parent = d_inode(dentry->d_parent);
/* Name is in a different directory. */
if (OCFS2_I(parent)->ip_blkno != parent_blkno)
return 0;
@@ -172,7 +145,7 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode,
struct dentry *dentry;
spin_lock(&inode->i_lock);
- hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {
+ hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) {
spin_lock(&dentry->d_lock);
if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) {
trace_ocfs2_find_local_alias(dentry->d_name.len,
@@ -243,7 +216,7 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
if (!inode)
return 0;
- if (!dentry->d_inode && dentry->d_fsdata) {
+ if (d_really_is_negative(dentry) && dentry->d_fsdata) {
/* Converting a negative dentry to positive
Clear dentry->d_fsdata */
dentry->d_fsdata = dl = NULL;
@@ -251,8 +224,8 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
if (dl) {
mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno,
- " \"%.*s\": old parent: %llu, new: %llu\n",
- dentry->d_name.len, dentry->d_name.name,
+ " \"%pd\": old parent: %llu, new: %llu\n",
+ dentry,
(unsigned long long)parent_blkno,
(unsigned long long)dl->dl_parent_blkno);
return 0;
@@ -277,8 +250,8 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
(unsigned long long)OCFS2_I(inode)->ip_blkno);
mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno,
- " \"%.*s\": old parent: %llu, new: %llu\n",
- dentry->d_name.len, dentry->d_name.name,
+ " \"%pd\": old parent: %llu, new: %llu\n",
+ dentry,
(unsigned long long)parent_blkno,
(unsigned long long)dl->dl_parent_blkno);
@@ -310,6 +283,18 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
out_attach:
spin_lock(&dentry_attach_lock);
+ if (unlikely(dentry->d_fsdata && !alias)) {
+ /* d_fsdata is set by a racing thread which is doing
+ * the same thing as this thread is doing. Leave the racing
+ * thread going ahead and we return here.
+ */
+ spin_unlock(&dentry_attach_lock);
+ iput(dl->dl_inode);
+ ocfs2_lock_res_free(&dl->dl_lockres);
+ kfree(dl);
+ return 0;
+ }
+
dentry->d_fsdata = dl;
dl->dl_count++;
spin_unlock(&dentry_attach_lock);
@@ -345,52 +330,6 @@ out_attach:
return ret;
}
-DEFINE_SPINLOCK(dentry_list_lock);
-
-/* We limit the number of dentry locks to drop in one go. We have
- * this limit so that we don't starve other users of ocfs2_wq. */
-#define DL_INODE_DROP_COUNT 64
-
-/* Drop inode references from dentry locks */
-static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count)
-{
- struct ocfs2_dentry_lock *dl;
-
- spin_lock(&dentry_list_lock);
- while (osb->dentry_lock_list && (drop_count < 0 || drop_count--)) {
- dl = osb->dentry_lock_list;
- osb->dentry_lock_list = dl->dl_next;
- spin_unlock(&dentry_list_lock);
- iput(dl->dl_inode);
- kfree(dl);
- spin_lock(&dentry_list_lock);
- }
- spin_unlock(&dentry_list_lock);
-}
-
-void ocfs2_drop_dl_inodes(struct work_struct *work)
-{
- struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
- dentry_lock_work);
-
- __ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT);
- /*
- * Don't queue dropping if umount is in progress. We flush the
- * list in ocfs2_dismount_volume
- */
- spin_lock(&dentry_list_lock);
- if (osb->dentry_lock_list &&
- !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED))
- queue_work(ocfs2_wq, &osb->dentry_lock_work);
- spin_unlock(&dentry_list_lock);
-}
-
-/* Flush the whole work queue */
-void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb)
-{
- __ocfs2_drop_dl_inodes(osb, -1);
-}
-
/*
* ocfs2_dentry_iput() and friends.
*
@@ -415,24 +354,16 @@ void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb)
static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb,
struct ocfs2_dentry_lock *dl)
{
+ iput(dl->dl_inode);
ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
ocfs2_lock_res_free(&dl->dl_lockres);
-
- /* We leave dropping of inode reference to ocfs2_wq as that can
- * possibly lead to inode deletion which gets tricky */
- spin_lock(&dentry_list_lock);
- if (!osb->dentry_lock_list &&
- !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED))
- queue_work(ocfs2_wq, &osb->dentry_lock_work);
- dl->dl_next = osb->dentry_lock_list;
- osb->dentry_lock_list = dl;
- spin_unlock(&dentry_list_lock);
+ kfree(dl);
}
void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
struct ocfs2_dentry_lock *dl)
{
- int unlock;
+ int unlock = 0;
BUG_ON(dl->dl_count == 0);
@@ -460,17 +391,15 @@ static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode)
if (inode)
ino = (unsigned long long)OCFS2_I(inode)->ip_blkno;
mlog(ML_ERROR, "Dentry is missing cluster lock. "
- "inode: %llu, d_flags: 0x%x, d_name: %.*s\n",
- ino, dentry->d_flags, dentry->d_name.len,
- dentry->d_name.name);
+ "inode: %llu, d_flags: 0x%x, d_name: %pd\n",
+ ino, dentry->d_flags, dentry);
}
goto out;
}
- mlog_bug_on_msg(dl->dl_count == 0, "dentry: %.*s, count: %u\n",
- dentry->d_name.len, dentry->d_name.name,
- dl->dl_count);
+ mlog_bug_on_msg(dl->dl_count == 0, "dentry: %pd, count: %u\n",
+ dentry, dl->dl_count);
ocfs2_dentry_lock_put(OCFS2_SB(dentry->d_sb), dl);
@@ -502,7 +431,7 @@ void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target,
{
int ret;
struct ocfs2_super *osb = OCFS2_SB(old_dir->i_sb);
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
/*
* Move within the same directory, so the actual lock info won't
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index b79eff709958..7f246c5692d8 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -1,26 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* dcache.h
*
* Function prototypes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_DCACHE_H
@@ -29,13 +13,8 @@
extern const struct dentry_operations ocfs2_dentry_ops;
struct ocfs2_dentry_lock {
- /* Use count of dentry lock */
unsigned int dl_count;
- union {
- /* Linked list of dentry locks to release */
- struct ocfs2_dentry_lock *dl_next;
- u64 dl_parent_blkno;
- };
+ u64 dl_parent_blkno;
/*
* The ocfs2_dentry_lock keeps an inode reference until
@@ -49,14 +28,9 @@ struct ocfs2_dentry_lock {
int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode,
u64 parent_blkno);
-extern spinlock_t dentry_list_lock;
-
void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
struct ocfs2_dentry_lock *dl);
-void ocfs2_drop_dl_inodes(struct work_struct *work);
-void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb);
-
struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno,
int skip_unhashed);
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index eb760d8acd50..2785ff245e79 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1,6 +1,5 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* dir.c
*
* Creates, reads, walks and deletes directory-nodes
@@ -18,22 +17,7 @@
*
* linux/fs/minix/dir.c
*
- * Copyright (C) 1991, 1992 Linux Torvalds
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
+ * Copyright (C) 1991, 1992 Linus Torvalds
*/
#include <linux/fs.h>
@@ -42,6 +26,7 @@
#include <linux/highmem.h>
#include <linux/quotaops.h>
#include <linux/sort.h>
+#include <linux/iversion.h>
#include <cluster/masklog.h>
@@ -68,10 +53,6 @@
#define NAMEI_RA_BLOCKS 4
#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
-static unsigned char ocfs2_filetype_table[] = {
- DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
-};
-
static int ocfs2_do_extend_dir(struct super_block *sb,
handle_t *handle,
struct inode *dir,
@@ -313,13 +294,29 @@ out:
* bh passed here can be an inode block or a dir data block, depending
* on the inode inline data flag.
*/
-static int ocfs2_check_dir_entry(struct inode * dir,
- struct ocfs2_dir_entry * de,
- struct buffer_head * bh,
+static int ocfs2_check_dir_entry(struct inode *dir,
+ struct ocfs2_dir_entry *de,
+ struct buffer_head *bh,
+ char *buf,
+ unsigned int size,
unsigned long offset)
{
const char *error_msg = NULL;
- const int rlen = le16_to_cpu(de->rec_len);
+ unsigned long next_offset;
+ int rlen;
+
+ if (offset > size - OCFS2_DIR_REC_LEN(1)) {
+ /* Dirent is (maybe partially) beyond the buffer
+ * boundaries so touching 'de' members is unsafe.
+ */
+ mlog(ML_ERROR, "directory entry (#%llu: offset=%lu) "
+ "too close to end or out-of-bounds",
+ (unsigned long long)OCFS2_I(dir)->ip_blkno, offset);
+ return 0;
+ }
+
+ rlen = le16_to_cpu(de->rec_len);
+ next_offset = ((char *) de - buf) + rlen;
if (unlikely(rlen < OCFS2_DIR_REC_LEN(1)))
error_msg = "rec_len is smaller than minimal";
@@ -327,9 +324,11 @@ static int ocfs2_check_dir_entry(struct inode * dir,
error_msg = "rec_len % 4 != 0";
else if (unlikely(rlen < OCFS2_DIR_REC_LEN(de->name_len)))
error_msg = "rec_len is too small for name_len";
- else if (unlikely(
- ((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize))
- error_msg = "directory entry across blocks";
+ else if (unlikely(next_offset > size))
+ error_msg = "directory entry overrun";
+ else if (unlikely(next_offset > size - OCFS2_DIR_REC_LEN(1)) &&
+ next_offset != size)
+ error_msg = "directory entry too close to end";
if (unlikely(error_msg != NULL))
mlog(ML_ERROR, "bad entry in directory #%llu: %s - "
@@ -371,16 +370,17 @@ static inline int ocfs2_search_dirblock(struct buffer_head *bh,
de_buf = first_de;
dlimit = de_buf + bytes;
- while (de_buf < dlimit) {
+ while (de_buf < dlimit - OCFS2_DIR_MEMBER_LEN) {
/* this code is executed quadratically often */
/* do minimal checking `by hand' */
de = (struct ocfs2_dir_entry *) de_buf;
- if (de_buf + namelen <= dlimit &&
+ if (de->name + namelen <= dlimit &&
ocfs2_match(namelen, name, de)) {
/* found a match - just to be sure, do a full check */
- if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
+ if (!ocfs2_check_dir_entry(dir, de, bh, first_de,
+ bytes, offset)) {
ret = -1;
goto bail;
}
@@ -480,33 +480,26 @@ static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh)
trailer = ocfs2_trailer_from_bh(bh, dir->i_sb);
if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
- rc = -EINVAL;
- ocfs2_error(dir->i_sb,
- "Invalid dirblock #%llu: "
- "signature = %.*s\n",
- (unsigned long long)bh->b_blocknr, 7,
- trailer->db_signature);
+ rc = ocfs2_error(dir->i_sb,
+ "Invalid dirblock #%llu: signature = %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7,
+ trailer->db_signature);
goto out;
}
if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) {
- rc = -EINVAL;
- ocfs2_error(dir->i_sb,
- "Directory block #%llu has an invalid "
- "db_blkno of %llu",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)le64_to_cpu(trailer->db_blkno));
+ rc = ocfs2_error(dir->i_sb,
+ "Directory block #%llu has an invalid db_blkno of %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(trailer->db_blkno));
goto out;
}
if (le64_to_cpu(trailer->db_parent_dinode) !=
OCFS2_I(dir)->ip_blkno) {
- rc = -EINVAL;
- ocfs2_error(dir->i_sb,
- "Directory block #%llu on dinode "
- "#%llu has an invalid parent_dinode "
- "of %llu",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)OCFS2_I(dir)->ip_blkno,
- (unsigned long long)le64_to_cpu(trailer->db_blkno));
+ rc = ocfs2_error(dir->i_sb,
+ "Directory block #%llu on dinode #%llu has an invalid parent_dinode of %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)OCFS2_I(dir)->ip_blkno,
+ (unsigned long long)le64_to_cpu(trailer->db_blkno));
goto out;
}
out:
@@ -604,14 +597,13 @@ static int ocfs2_validate_dx_root(struct super_block *sb,
}
if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) {
- ocfs2_error(sb,
- "Dir Index Root # %llu has bad signature %.*s",
- (unsigned long long)le64_to_cpu(dx_root->dr_blkno),
- 7, dx_root->dr_signature);
- return -EINVAL;
+ ret = ocfs2_error(sb,
+ "Dir Index Root # %llu has bad signature %.*s\n",
+ (unsigned long long)le64_to_cpu(dx_root->dr_blkno),
+ 7, dx_root->dr_signature);
}
- return 0;
+ return ret;
}
static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di,
@@ -648,12 +640,11 @@ static int ocfs2_validate_dx_leaf(struct super_block *sb,
}
if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) {
- ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s",
- 7, dx_leaf->dl_signature);
- return -EROFS;
+ ret = ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s\n",
+ 7, dx_leaf->dl_signature);
}
- return 0;
+ return ret;
}
static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno,
@@ -702,7 +693,7 @@ static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
int ra_ptr = 0; /* Current index into readahead
buffer */
int num = 0;
- int nblocks, i, err;
+ int nblocks, i;
sb = dir->i_sb;
@@ -734,7 +725,7 @@ restart:
num++;
bh = NULL;
- err = ocfs2_read_dir_block(dir, b++, &bh,
+ ocfs2_read_dir_block(dir, b++, &bh,
OCFS2_BH_READAHEAD);
bh_use[ra_max] = bh;
}
@@ -744,7 +735,7 @@ restart:
if (ocfs2_read_dir_block(dir, block, &bh, 0)) {
/* read error, skip block & hope for the best.
* ocfs2_read_dir_block() has released the bh. */
- ocfs2_error(dir->i_sb, "reading directory %llu, "
+ mlog(ML_ERROR, "reading directory %llu, "
"offset %lu\n",
(unsigned long long)OCFS2_I(dir)->ip_blkno,
block);
@@ -800,6 +791,14 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
struct ocfs2_extent_block *eb;
struct ocfs2_extent_rec *rec = NULL;
+ if (le16_to_cpu(el->l_count) !=
+ ocfs2_extent_recs_per_dx_root(inode->i_sb)) {
+ ret = ocfs2_error(inode->i_sb,
+ "Inode %lu has invalid extent list length %u\n",
+ inode->i_ino, le16_to_cpu(el->l_count));
+ goto out;
+ }
+
if (el->l_tree_depth) {
ret = ocfs2_find_leaf(INODE_CACHE(inode), el, major_hash,
&eb_bh);
@@ -812,15 +811,22 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
el = &eb->h_list;
if (el->l_tree_depth) {
- ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "btree tree block %llu\n", inode->i_ino,
- (unsigned long long)eb_bh->b_blocknr);
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb,
+ "Inode %lu has non zero tree depth in btree tree block %llu\n",
+ inode->i_ino,
+ (unsigned long long)eb_bh->b_blocknr);
goto out;
}
}
+ if (le16_to_cpu(el->l_next_free_rec) == 0) {
+ ret = ocfs2_error(inode->i_sb,
+ "Inode %lu has empty extent list at depth %u\n",
+ inode->i_ino,
+ le16_to_cpu(el->l_tree_depth));
+ goto out;
+ }
+
found = 0;
for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
rec = &el->l_recs[i];
@@ -832,11 +838,11 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
}
if (!found) {
- ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
- "record (%u, %u, 0) in btree", inode->i_ino,
- le32_to_cpu(rec->e_cpos),
- ocfs2_rec_clusters(el, rec));
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb,
+ "Inode %lu has bad extent record (%u, %u, 0) in btree\n",
+ inode->i_ino,
+ le32_to_cpu(rec->e_cpos),
+ ocfs2_rec_clusters(el, rec));
goto out;
}
@@ -875,9 +881,9 @@ static int ocfs2_dx_dir_lookup(struct inode *inode,
u64 *ret_phys_blkno)
{
int ret = 0;
- unsigned int cend, uninitialized_var(clen);
- u32 uninitialized_var(cpos);
- u64 uninitialized_var(blkno);
+ unsigned int cend, clen;
+ u32 cpos;
+ u64 blkno;
u32 name_hash = hinfo->major_hash;
ret = ocfs2_dx_dir_lookup_rec(inode, el, name_hash, &cpos, &blkno,
@@ -921,7 +927,7 @@ static int ocfs2_dx_dir_search(const char *name, int namelen,
struct ocfs2_dir_lookup_result *res)
{
int ret, i, found;
- u64 uninitialized_var(phys);
+ u64 phys;
struct buffer_head *dx_leaf_bh = NULL;
struct ocfs2_dx_leaf *dx_leaf;
struct ocfs2_dx_entry *dx_entry = NULL;
@@ -1088,26 +1094,39 @@ int ocfs2_find_entry(const char *name, int namelen,
{
struct buffer_head *bh;
struct ocfs2_dir_entry *res_dir = NULL;
+ int ret = 0;
if (ocfs2_dir_indexed(dir))
return ocfs2_find_entry_dx(name, namelen, dir, lookup);
+ if (unlikely(i_size_read(dir) <= 0)) {
+ ret = -EFSCORRUPTED;
+ mlog_errno(ret);
+ goto out;
+ }
/*
* The unindexed dir code only uses part of the lookup
* structure, so there's no reason to push it down further
* than this.
*/
- if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+ if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ if (unlikely(i_size_read(dir) > dir->i_sb->s_blocksize)) {
+ ret = -EFSCORRUPTED;
+ mlog_errno(ret);
+ goto out;
+ }
bh = ocfs2_find_entry_id(name, namelen, dir, &res_dir);
- else
+ } else {
bh = ocfs2_find_entry_el(name, namelen, dir, &res_dir);
+ }
if (bh == NULL)
return -ENOENT;
lookup->dl_leaf_bh = bh;
lookup->dl_entry = res_dir;
- return 0;
+out:
+ return ret;
}
/*
@@ -1167,7 +1186,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
pde = NULL;
de = (struct ocfs2_dir_entry *) first_de;
while (i < bytes) {
- if (!ocfs2_check_dir_entry(dir, de, bh, i)) {
+ if (!ocfs2_check_dir_entry(dir, de, bh, first_de, bytes, i)) {
status = -EIO;
mlog_errno(status);
goto bail;
@@ -1184,7 +1203,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
le16_add_cpu(&pde->rec_len,
le16_to_cpu(de->rec_len));
de->inode = 0;
- dir->i_version++;
+ inode_inc_iversion(dir);
ocfs2_journal_dirty(handle, bh);
goto bail;
}
@@ -1617,14 +1636,11 @@ int __ocfs2_add_entry(handle_t *handle,
struct ocfs2_dir_entry *de, *de1;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_fe_bh->b_data;
struct super_block *sb = dir->i_sb;
- int retval, status;
+ int retval;
unsigned int size = sb->s_blocksize;
struct buffer_head *insert_bh = lookup->dl_leaf_bh;
char *data_start = insert_bh->b_data;
- if (!namelen)
- return -EINVAL;
-
if (ocfs2_dir_indexed(dir)) {
struct buffer_head *bh;
@@ -1667,7 +1683,8 @@ int __ocfs2_add_entry(handle_t *handle,
/* These checks should've already been passed by the
* prepare function, but I guess we can leave them
* here anyway. */
- if (!ocfs2_check_dir_entry(dir, de, insert_bh, offset)) {
+ if (!ocfs2_check_dir_entry(dir, de, insert_bh, data_start,
+ size, offset)) {
retval = -ENOENT;
goto bail;
}
@@ -1687,7 +1704,8 @@ int __ocfs2_add_entry(handle_t *handle,
offset, ocfs2_dir_trailer_blk_off(dir->i_sb));
if (ocfs2_dirent_would_fit(de, rec_len)) {
- dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_current(dir));
retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
if (retval < 0) {
mlog_errno(retval);
@@ -1695,25 +1713,25 @@ int __ocfs2_add_entry(handle_t *handle,
}
if (insert_bh == parent_fe_bh)
- status = ocfs2_journal_access_di(handle,
+ retval = ocfs2_journal_access_di(handle,
INODE_CACHE(dir),
insert_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
else {
- status = ocfs2_journal_access_db(handle,
+ retval = ocfs2_journal_access_db(handle,
INODE_CACHE(dir),
insert_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
- if (ocfs2_dir_indexed(dir)) {
- status = ocfs2_dx_dir_insert(dir,
+ if (!retval && ocfs2_dir_indexed(dir))
+ retval = ocfs2_dx_dir_insert(dir,
handle,
lookup);
- if (status) {
- mlog_errno(status);
- goto bail;
- }
- }
+ }
+
+ if (retval) {
+ mlog_errno(retval);
+ goto bail;
}
/* By now the buffer is marked for journaling */
@@ -1727,7 +1745,7 @@ int __ocfs2_add_entry(handle_t *handle,
de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
de = de1;
}
- de->file_type = OCFS2_FT_UNKNOWN;
+ de->file_type = FT_UNKNOWN;
if (blkno) {
de->inode = cpu_to_le64(blkno);
ocfs2_set_de_type(de, inode->i_mode);
@@ -1739,7 +1757,7 @@ int __ocfs2_add_entry(handle_t *handle,
if (ocfs2_dir_indexed(dir))
ocfs2_recalc_free_list(dir, handle, lookup);
- dir->i_version++;
+ inode_inc_iversion(dir);
ocfs2_journal_dirty(handle, insert_bh);
retval = 0;
goto bail;
@@ -1785,7 +1803,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
* readdir(2), then we might be pointing to an invalid
* dirent right now. Scan from the start of the block
* to make sure. */
- if (*f_version != inode->i_version) {
+ if (!inode_eq_iversion(inode, *f_version)) {
for (i = 0; i < i_size_read(inode) && i < offset; ) {
de = (struct ocfs2_dir_entry *)
(data->id_data + i);
@@ -1801,24 +1819,21 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
i += le16_to_cpu(de->rec_len);
}
ctx->pos = offset = i;
- *f_version = inode->i_version;
+ *f_version = inode_query_iversion(inode);
}
de = (struct ocfs2_dir_entry *) (data->id_data + ctx->pos);
- if (!ocfs2_check_dir_entry(inode, de, di_bh, ctx->pos)) {
+ if (!ocfs2_check_dir_entry(inode, de, di_bh, (char *)data->id_data,
+ i_size_read(inode), ctx->pos)) {
/* On error, skip the f_pos to the end. */
ctx->pos = i_size_read(inode);
break;
}
offset += le16_to_cpu(de->rec_len);
if (le64_to_cpu(de->inode)) {
- unsigned char d_type = DT_UNKNOWN;
-
- if (de->file_type < OCFS2_FT_MAX)
- d_type = ocfs2_filetype_table[de->file_type];
-
if (!dir_emit(ctx, de->name, de->name_len,
- le64_to_cpu(de->inode), d_type))
+ le64_to_cpu(de->inode),
+ fs_ftype_to_dtype(de->file_type)))
goto out;
}
ctx->pos += le16_to_cpu(de->rec_len);
@@ -1879,7 +1894,7 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
* readdir(2), then we might be pointing to an invalid
* dirent right now. Scan from the start of the block
* to make sure. */
- if (*f_version != inode->i_version) {
+ if (!inode_eq_iversion(inode, *f_version)) {
for (i = 0; i < sb->s_blocksize && i < offset; ) {
de = (struct ocfs2_dir_entry *) (bh->b_data + i);
/* It's too expensive to do a full
@@ -1896,28 +1911,24 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
offset = i;
ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
| offset;
- *f_version = inode->i_version;
+ *f_version = inode_query_iversion(inode);
}
while (ctx->pos < i_size_read(inode)
&& offset < sb->s_blocksize) {
de = (struct ocfs2_dir_entry *) (bh->b_data + offset);
- if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
+ if (!ocfs2_check_dir_entry(inode, de, bh, bh->b_data,
+ sb->s_blocksize, offset)) {
/* On error, skip the f_pos to the
next block. */
ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1;
- brelse(bh);
- continue;
+ break;
}
if (le64_to_cpu(de->inode)) {
- unsigned char d_type = DT_UNKNOWN;
-
- if (de->file_type < OCFS2_FT_MAX)
- d_type = ocfs2_filetype_table[de->file_type];
if (!dir_emit(ctx, de->name,
de->name_len,
le64_to_cpu(de->inode),
- d_type)) {
+ fs_ftype_to_dtype(de->file_type))) {
brelse(bh);
return 0;
}
@@ -1950,7 +1961,7 @@ static int ocfs2_dir_foreach_blk(struct inode *inode, u64 *f_version,
*/
int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx)
{
- u64 version = inode->i_version;
+ u64 version = inode_query_iversion(inode);
ocfs2_dir_foreach_blk(inode, &version, ctx, true);
return 0;
}
@@ -1963,11 +1974,12 @@ int ocfs2_readdir(struct file *file, struct dir_context *ctx)
{
int error = 0;
struct inode *inode = file_inode(file);
+ struct ocfs2_file_private *fp = file->private_data;
int lock_level = 0;
trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno);
- error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level);
+ error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level, 1);
if (lock_level && error >= 0) {
/* We release EX lock which used to update atime
* and get PR lock again to reduce contention
@@ -1983,7 +1995,7 @@ int ocfs2_readdir(struct file *file, struct dir_context *ctx)
goto bail_nolock;
}
- error = ocfs2_dir_foreach_blk(inode, &file->f_version, ctx, false);
+ error = ocfs2_dir_foreach_blk(inode, &fp->cookie, ctx, false);
ocfs2_inode_unlock(inode, lock_level);
if (error)
@@ -1995,7 +2007,7 @@ bail_nolock:
}
/*
- * NOTE: this should always be called with parent dir i_mutex taken.
+ * NOTE: this should always be called with parent dir i_rwsem taken.
*/
int ocfs2_find_files_on_disk(const char *name,
int namelen,
@@ -2040,29 +2052,30 @@ int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
*
* Return 0 if the name does not exist
* Return -EEXIST if the directory contains the name
+ * Return -EFSCORRUPTED if found corruption
*
- * Callers should have i_mutex + a cluster lock on dir
+ * Callers should have i_rwsem + a cluster lock on dir
*/
int ocfs2_check_dir_for_entry(struct inode *dir,
const char *name,
int namelen)
{
- int ret;
+ int ret = 0;
struct ocfs2_dir_lookup_result lookup = { NULL, };
trace_ocfs2_check_dir_for_entry(
(unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name);
- ret = -EEXIST;
- if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0)
- goto bail;
+ ret = ocfs2_find_entry(name, namelen, dir, &lookup);
+ if (ret == 0) {
+ ret = -EEXIST;
+ mlog_errno(ret);
+ } else if (ret == -ENOENT) {
+ ret = 0;
+ }
- ret = 0;
-bail:
ocfs2_free_dir_lookup_result(&lookup);
- if (ret)
- mlog_errno(ret);
return ret;
}
@@ -2073,10 +2086,12 @@ struct ocfs2_empty_dir_priv {
unsigned seen_other;
unsigned dx_dir;
};
-static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
- loff_t pos, u64 ino, unsigned type)
+static bool ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name,
+ int name_len, loff_t pos, u64 ino,
+ unsigned type)
{
- struct ocfs2_empty_dir_priv *p = priv;
+ struct ocfs2_empty_dir_priv *p =
+ container_of(ctx, struct ocfs2_empty_dir_priv, ctx);
/*
* Check the positions of "." and ".." records to be sure
@@ -2091,7 +2106,7 @@ static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
*/
if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) {
p->seen_dot = 1;
- return 0;
+ return true;
}
if (name_len == 2 && !strncmp("..", name, 2) &&
@@ -2099,13 +2114,13 @@ static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
p->seen_dot_dot = 1;
if (p->dx_dir && p->seen_dot)
- return 1;
+ return false;
- return 0;
+ return true;
}
p->seen_other = 1;
- return 1;
+ return false;
}
static int ocfs2_empty_dir_dx(struct inode *inode,
@@ -2153,11 +2168,9 @@ int ocfs2_empty_dir(struct inode *inode)
{
int ret;
struct ocfs2_empty_dir_priv priv = {
- .ctx.actor = ocfs2_empty_dir_filldir
+ .ctx.actor = ocfs2_empty_dir_filldir,
};
- memset(&priv, 0, sizeof(priv));
-
if (ocfs2_dir_indexed(inode)) {
ret = ocfs2_empty_dir_dx(inode, &priv);
if (ret)
@@ -2351,7 +2364,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
dx_root_bh = sb_getblk(osb->sb, dr_blkno);
if (dx_root_bh == NULL) {
- ret = -EIO;
+ ret = -ENOMEM;
goto out;
}
ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dx_root_bh);
@@ -2424,7 +2437,7 @@ static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb,
for (i = 0; i < num_dx_leaves; i++) {
bh = sb_getblk(osb->sb, start_blk + i);
if (bh == NULL) {
- ret = -EIO;
+ ret = -ENOMEM;
goto out;
}
dx_leaves[i] = bh;
@@ -2931,7 +2944,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
dirdata_bh = sb_getblk(sb, blkno);
if (!dirdata_bh) {
- ret = -EIO;
+ ret = -ENOMEM;
mlog_errno(ret);
goto out_commit;
}
@@ -2959,6 +2972,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
ocfs2_init_dir_trailer(dir, dirdata_bh, i);
}
+ ocfs2_update_inode_fsync_trans(handle, dir, 1);
ocfs2_journal_dirty(handle, dirdata_bh);
if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
@@ -3002,11 +3016,12 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
ocfs2_dinode_new_extent_list(dir, di);
i_size_write(dir, sb->s_blocksize);
- dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
di->i_size = cpu_to_le64(sb->s_blocksize);
- di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec);
- di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec);
+ di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(dir));
+ di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(dir));
+ ocfs2_update_inode_fsync_trans(handle, dir, 1);
/*
* This should never fail as our extent list is empty and all
@@ -3082,7 +3097,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
* We need to return the correct block within the
* cluster which should hold our entry.
*/
- off = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb),
+ off = ocfs2_dx_dir_hash_idx(osb,
&lookup->dl_hinfo);
get_bh(dx_leaves[off]);
lookup->dl_dx_leaf_bh = dx_leaves[off];
@@ -3161,7 +3176,7 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
*new_bh = sb_getblk(sb, p_blkno);
if (!*new_bh) {
- status = -EIO;
+ status = -ENOMEM;
mlog_errno(status);
goto bail;
}
@@ -3260,7 +3275,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
spin_unlock(&OCFS2_I(dir)->ip_lock);
ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir),
parent_fe_bh);
- num_free_extents = ocfs2_num_free_extents(osb, &et);
+ num_free_extents = ocfs2_num_free_extents(&et);
if (num_free_extents < 0) {
status = num_free_extents;
mlog_errno(status);
@@ -3286,7 +3301,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
if (ocfs2_dir_resv_allowed(osb))
data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv;
- credits = ocfs2_calc_extend_credits(sb, el, 1);
+ credits = ocfs2_calc_extend_credits(sb, el);
} else {
spin_unlock(&OCFS2_I(dir)->ip_lock);
credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
@@ -3340,6 +3355,7 @@ do_extend:
} else {
de->rec_len = cpu_to_le16(sb->s_blocksize);
}
+ ocfs2_update_inode_fsync_trans(handle, dir, 1);
ocfs2_journal_dirty(handle, new_bh);
dir_i_size += dir->i_sb->s_blocksize;
@@ -3379,9 +3395,9 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
struct super_block *sb = dir->i_sb;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
struct ocfs2_dir_entry *de, *last_de = NULL;
- char *de_buf, *limit;
+ char *first_de, *de_buf, *limit;
unsigned long offset = 0;
- unsigned int rec_len, new_rec_len, free_space = dir->i_sb->s_blocksize;
+ unsigned int rec_len, new_rec_len, free_space;
/*
* This calculates how many free bytes we'd have in block zero, should
@@ -3392,14 +3408,16 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
else
free_space = dir->i_sb->s_blocksize - i_size_read(dir);
- de_buf = di->id2.i_data.id_data;
+ first_de = di->id2.i_data.id_data;
+ de_buf = first_de;
limit = de_buf + i_size_read(dir);
rec_len = OCFS2_DIR_REC_LEN(namelen);
while (de_buf < limit) {
de = (struct ocfs2_dir_entry *)de_buf;
- if (!ocfs2_check_dir_entry(dir, de, di_bh, offset)) {
+ if (!ocfs2_check_dir_entry(dir, de, di_bh, first_de,
+ i_size_read(dir), offset)) {
ret = -ENOENT;
goto out;
}
@@ -3426,6 +3444,14 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
offset += le16_to_cpu(de->rec_len);
}
+ if (!last_de) {
+ ret = ocfs2_error(sb, "Directory entry (#%llu: size=%lld) "
+ "is unexpectedly short",
+ (unsigned long long)OCFS2_I(dir)->ip_blkno,
+ i_size_read(dir));
+ goto out;
+ }
+
/*
* We're going to require expansion of the directory - figure
* out how many blocks we'll need so that a place for the
@@ -3453,10 +3479,8 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
int blocksize = dir->i_sb->s_blocksize;
status = ocfs2_read_dir_block(dir, 0, &bh, 0);
- if (status) {
- mlog_errno(status);
+ if (status)
goto bail;
- }
rec_len = OCFS2_DIR_REC_LEN(namelen);
offset = 0;
@@ -3477,14 +3501,14 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
status = ocfs2_read_dir_block(dir,
offset >> sb->s_blocksize_bits,
&bh, 0);
- if (status) {
- mlog_errno(status);
+ if (status)
goto bail;
- }
+
/* move to next block */
de = (struct ocfs2_dir_entry *) bh->b_data;
}
- if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
+ if (!ocfs2_check_dir_entry(dir, de, bh, bh->b_data, blocksize,
+ offset)) {
status = -ENOENT;
goto bail;
}
@@ -3510,7 +3534,6 @@ next:
de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
}
- status = 0;
bail:
brelse(bh);
if (status)
@@ -3543,19 +3566,6 @@ static int dx_leaf_sort_cmp(const void *a, const void *b)
return 0;
}
-static void dx_leaf_sort_swap(void *a, void *b, int size)
-{
- struct ocfs2_dx_entry *entry1 = a;
- struct ocfs2_dx_entry *entry2 = b;
- struct ocfs2_dx_entry tmp;
-
- BUG_ON(size != sizeof(*entry1));
-
- tmp = *entry1;
- *entry1 = *entry2;
- *entry2 = tmp;
-}
-
static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf)
{
struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
@@ -3679,7 +3689,7 @@ static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash,
int i, j, num_used;
u32 major_hash;
struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf;
- struct ocfs2_dx_entry_list *orig_list, *new_list, *tmp_list;
+ struct ocfs2_dx_entry_list *orig_list, *tmp_list;
struct ocfs2_dx_entry *dx_entry;
tmp_list = &tmp_dx_leaf->dl_list;
@@ -3688,7 +3698,6 @@ static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash,
orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data;
orig_list = &orig_dx_leaf->dl_list;
new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data;
- new_list = &new_dx_leaf->dl_list;
num_used = le16_to_cpu(orig_list->de_num_used);
@@ -3716,9 +3725,9 @@ static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash,
static int ocfs2_dx_dir_rebalance_credits(struct ocfs2_super *osb,
struct ocfs2_dx_root_block *dx_root)
{
- int credits = ocfs2_clusters_to_blocks(osb->sb, 2);
+ int credits = ocfs2_clusters_to_blocks(osb->sb, 3);
- credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list, 1);
+ credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list);
credits += ocfs2_quota_trans_credits(osb->sb);
return credits;
}
@@ -3817,7 +3826,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
*/
sort(dx_leaf->dl_list.de_entries, num_used,
sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp,
- dx_leaf_sort_swap);
+ NULL);
ocfs2_journal_dirty(handle, dx_leaf_bh);
@@ -3898,6 +3907,7 @@ out_commit:
dquot_free_space_nodirty(dir,
ocfs2_clusters_to_bytes(dir->i_sb, 1));
+ ocfs2_update_inode_fsync_trans(handle, dir, 1);
ocfs2_commit_trans(osb, handle);
out:
@@ -4123,10 +4133,15 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
}
dx_root->dr_flags &= ~OCFS2_DX_FLAG_INLINE;
- memset(&dx_root->dr_list, 0, osb->sb->s_blocksize -
- offsetof(struct ocfs2_dx_root_block, dr_list));
+
+ dx_root->dr_list.l_tree_depth = 0;
dx_root->dr_list.l_count =
cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
+ dx_root->dr_list.l_next_free_rec = 0;
+ memset(&dx_root->dr_list.l_recs, 0,
+ osb->sb->s_blocksize -
+ (offsetof(struct ocfs2_dx_root_block, dr_list) +
+ offsetof(struct ocfs2_extent_list, l_recs)));
/* This should never fail considering we start with an empty
* dx_root. */
@@ -4136,6 +4151,7 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
mlog_errno(ret);
did_quota = 0;
+ ocfs2_update_inode_fsync_trans(handle, dir, 1);
ocfs2_journal_dirty(handle, dx_root_bh);
out_commit:
@@ -4288,12 +4304,6 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
trace_ocfs2_prepare_dir_for_insert(
(unsigned long long)OCFS2_I(dir)->ip_blkno, namelen);
- if (!namelen) {
- ret = -EINVAL;
- mlog_errno(ret);
- goto out;
- }
-
/*
* Do this up front to reduce confusion.
*
@@ -4376,7 +4386,7 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,
mlog_errno(ret);
goto out;
}
- mutex_lock(&dx_alloc_inode->i_mutex);
+ inode_lock(dx_alloc_inode);
ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1);
if (ret) {
@@ -4403,6 +4413,7 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,
di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
spin_unlock(&OCFS2_I(dir)->ip_lock);
di->i_dx_root = cpu_to_le64(0ULL);
+ ocfs2_update_inode_fsync_trans(handle, dir, 1);
ocfs2_journal_dirty(handle, di_bh);
@@ -4424,7 +4435,7 @@ out_unlock:
ocfs2_inode_unlock(dx_alloc_inode, 1);
out_mutex:
- mutex_unlock(&dx_alloc_inode->i_mutex);
+ inode_unlock(dx_alloc_inode);
brelse(dx_alloc_bh);
out:
iput(dx_alloc_inode);
@@ -4434,9 +4445,9 @@ out:
int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
{
int ret;
- unsigned int uninitialized_var(clen);
- u32 major_hash = UINT_MAX, p_cpos, uninitialized_var(cpos);
- u64 uninitialized_var(blkno);
+ unsigned int clen;
+ u32 major_hash = UINT_MAX, p_cpos, cpos;
+ u64 blkno;
struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
struct buffer_head *dx_root_bh = NULL;
struct ocfs2_dx_root_block *dx_root;
@@ -4473,7 +4484,7 @@ int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno);
ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 0,
- &dealloc, 0);
+ &dealloc, 0, false);
if (ret) {
mlog_errno(ret);
goto out;
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index f0344b75b14d..4b9f5a12c7d2 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -1,26 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* dir.h
*
* Function prototypes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_DIR_H
@@ -72,7 +56,7 @@ static inline int ocfs2_add_entry(handle_t *handle,
struct buffer_head *parent_fe_bh,
struct ocfs2_dir_lookup_result *lookup)
{
- return __ocfs2_add_entry(handle, dentry->d_parent->d_inode,
+ return __ocfs2_add_entry(handle, d_inode(dentry->d_parent),
dentry->d_name.name, dentry->d_name.len,
inode, blkno, parent_fe_bh, lookup);
}
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index c8a044efbb15..5e700b45d32d 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -1,7 +1,5 @@
-ccflags-y := -Ifs/ocfs2
-
+# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o
ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
- dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o
-
+ dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o
diff --git a/fs/ocfs2/dlm/dlmapi.h b/fs/ocfs2/dlm/dlmapi.h
index 3cfa114aa391..1969db8ffa9c 100644
--- a/fs/ocfs2/dlm/dlmapi.h
+++ b/fs/ocfs2/dlm/dlmapi.h
@@ -1,27 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* dlmapi.h
*
* externally exported dlm interfaces
*
* Copyright (C) 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
*/
#ifndef DLMAPI_H
@@ -79,8 +62,6 @@ enum dlm_status {
DLM_MAXSTATS, /* 41: upper limit for return code validation */
};
-/* for pretty-printing dlm_status error messages */
-const char *dlm_errmsg(enum dlm_status err);
/* for pretty-printing dlm_status error names */
const char *dlm_errname(enum dlm_status err);
@@ -137,7 +118,7 @@ struct dlm_lockstatus {
#define LKM_VALBLK 0x00000100 /* lock value block request */
#define LKM_NOQUEUE 0x00000200 /* non blocking request */
#define LKM_CONVERT 0x00000400 /* conversion request */
-#define LKM_NODLCKWT 0x00000800 /* this lock wont deadlock (U) */
+#define LKM_NODLCKWT 0x00000800 /* this lock won't deadlock (U) */
#define LKM_UNLOCK 0x00001000 /* deallocate this lock */
#define LKM_CANCEL 0x00002000 /* cancel conversion request */
#define LKM_DEQALL 0x00004000 /* remove all locks held by proc (U) */
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index fbec0be62326..c681ba957932 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -1,27 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* dlmast.c
*
* AST and BAST functionality for local and remote nodes
*
* Copyright (C) 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
*/
@@ -38,15 +21,15 @@
#include <linux/spinlock.h>
-#include "cluster/heartbeat.h"
-#include "cluster/nodemanager.h"
-#include "cluster/tcp.h"
+#include "../cluster/heartbeat.h"
+#include "../cluster/nodemanager.h"
+#include "../cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
#define MLOG_MASK_PREFIX ML_DLM
-#include "cluster/masklog.h"
+#include "../cluster/masklog.h"
static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
struct dlm_lock *lock);
@@ -180,16 +163,6 @@ void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
spin_unlock(&lock->spinlock);
}
-void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
-{
- BUG_ON(!dlm);
- BUG_ON(!lock);
-
- spin_lock(&dlm->ast_lock);
- __dlm_queue_bast(dlm, lock);
- spin_unlock(&dlm->ast_lock);
-}
-
static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
struct dlm_lock *lock)
{
@@ -224,14 +197,12 @@ void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
struct dlm_lock *lock)
{
dlm_astlockfunc_t *fn;
- struct dlm_lockstatus *lksb;
mlog(0, "%s: res %.*s, lock %u:%llu, Local AST\n", dlm->name,
res->lockname.len, res->lockname.name,
dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
- lksb = lock->lksb;
fn = lock->ast;
BUG_ON(lock->ml.node != dlm->node_num);
@@ -292,7 +263,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
struct dlm_lock *lock = NULL;
struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf;
char *name;
- struct list_head *iter, *head=NULL;
+ struct list_head *head = NULL;
__be64 cookie;
u32 flags;
u8 node;
@@ -373,8 +344,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
/* try convert queue for both ast/bast */
head = &res->converting;
lock = NULL;
- list_for_each(iter, head) {
- lock = list_entry (iter, struct dlm_lock, list);
+ list_for_each_entry(lock, head, list) {
if (lock->ml.cookie == cookie)
goto do_ast;
}
@@ -385,10 +355,13 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
else
head = &res->granted;
- list_for_each(iter, head) {
- lock = list_entry (iter, struct dlm_lock, list);
- if (lock->ml.cookie == cookie)
+ list_for_each_entry(lock, head, list) {
+ /* if lock is found but unlock is pending ignore the bast */
+ if (lock->ml.cookie == cookie) {
+ if (lock->unlock_pending)
+ break;
goto do_ast;
+ }
}
mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, "
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index de854cca12a2..20f790a47484 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -1,25 +1,8 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* dlmcommon.h
*
* Copyright (C) 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
*/
#ifndef DLMCOMMON_H
@@ -32,10 +15,7 @@
#define DLM_LOCKID_NAME_MAX 32
-#define DLM_DOMAIN_NAME_MAX_LEN 255
#define DLM_LOCK_RES_OWNER_UNKNOWN O2NM_MAX_NODES
-#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes
-#define DLM_THREAD_MS 200 // flush at least every 200 ms
#define DLM_HASH_SIZE_DEFAULT (1 << 17)
#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE
@@ -47,7 +27,7 @@
#define DLM_HASH_BUCKETS (DLM_HASH_PAGES * DLM_BUCKETS_PER_PAGE)
/* Intended to make it easier for us to switch out hash functions */
-#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l)
+#define dlm_lockid_hash(_n, _l) full_name_hash(NULL, _n, _l)
enum dlm_mle_type {
DLM_MLE_BLOCK = 0,
@@ -108,7 +88,6 @@ static inline int dlm_is_recovery_lock(const char *lock_name, int name_len)
struct dlm_recovery_ctxt
{
struct list_head resources;
- struct list_head received;
struct list_head node_data;
u8 new_master;
u8 dead_node;
@@ -141,6 +120,7 @@ struct dlm_ctxt
u8 node_num;
u32 key;
u8 joining_node;
+ u8 migrate_done; /* set to 1 means node has migrated all lock resources */
wait_queue_head_t dlm_join_events;
unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
@@ -157,7 +137,6 @@ struct dlm_ctxt
atomic_t res_tot_count;
atomic_t res_cur_count;
- struct dlm_debug_ctxt *dlm_debug_ctxt;
struct dentry *dlm_debugfs_subroot;
/* NOTE: Next three are protected by dlm_domain_lock */
@@ -283,6 +262,7 @@ static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm,
#define DLM_LOCK_RES_DROPPING_REF 0x00000040
#define DLM_LOCK_RES_BLOCK_DIRTY 0x00001000
#define DLM_LOCK_RES_SETREF_INPROG 0x00002000
+#define DLM_LOCK_RES_RECOVERY_WAITING 0x00004000
/* max milliseconds to wait to sync up a network failure with a node death */
#define DLM_NODE_DEATH_WAIT_MAX (5 * 1000)
@@ -332,6 +312,7 @@ struct dlm_lock_resource
u16 state;
char lvb[DLM_LVB_LEN];
unsigned int inflight_locks;
+ unsigned int inflight_assert_workers;
unsigned long refmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
};
@@ -376,17 +357,6 @@ struct dlm_lock
lksb_kernel_allocated:1;
};
-
-#define DLM_LKSB_UNUSED1 0x01
-#define DLM_LKSB_PUT_LVB 0x02
-#define DLM_LKSB_GET_LVB 0x04
-#define DLM_LKSB_UNUSED2 0x08
-#define DLM_LKSB_UNUSED3 0x10
-#define DLM_LKSB_UNUSED4 0x20
-#define DLM_LKSB_UNUSED5 0x40
-#define DLM_LKSB_UNUSED6 0x80
-
-
enum dlm_lockres_list {
DLM_GRANTED_LIST = 0,
DLM_CONVERTING_LIST = 1,
@@ -462,6 +432,7 @@ enum {
DLM_QUERY_REGION = 519,
DLM_QUERY_NODEINFO = 520,
DLM_BEGIN_EXIT_DOMAIN_MSG = 521,
+ DLM_DEREF_LOCKRES_DONE = 522,
};
struct dlm_reco_node_data
@@ -556,7 +527,7 @@ struct dlm_master_requery
* };
*
* from ../cluster/tcp.h
- * NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg))
+ * O2NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg))
* (roughly 4080 bytes)
* and sizeof(dlm_migratable_lockres) = 112 bytes
* and sizeof(dlm_migratable_lock) = 16 bytes
@@ -588,7 +559,7 @@ struct dlm_migratable_lockres
// 48 bytes
u8 lvb[DLM_LVB_LEN];
// 112 bytes
- struct dlm_migratable_lock ml[0]; // 16 bytes each, begins at byte 112
+ struct dlm_migratable_lock ml[]; // 16 bytes each, begins at byte 112
};
#define DLM_MIG_LOCKRES_MAX_LEN \
(sizeof(struct dlm_migratable_lockres) + \
@@ -597,7 +568,7 @@ struct dlm_migratable_lockres
/* from above, 128 bytes
* for some undetermined future use */
-#define DLM_MIG_LOCKRES_RESERVED (NET_MAX_PAYLOAD_BYTES - \
+#define DLM_MIG_LOCKRES_RESERVED (O2NET_MAX_PAYLOAD_BYTES - \
DLM_MIG_LOCKRES_MAX_LEN)
struct dlm_create_lock
@@ -625,7 +596,7 @@ struct dlm_convert_lock
u8 name[O2NM_MAX_NAME_LEN];
- s8 lvb[0];
+ s8 lvb[];
};
#define DLM_CONVERT_LOCK_MAX_LEN (sizeof(struct dlm_convert_lock)+DLM_LVB_LEN)
@@ -640,7 +611,7 @@ struct dlm_unlock_lock
u8 name[O2NM_MAX_NAME_LEN];
- s8 lvb[0];
+ s8 lvb[];
};
#define DLM_UNLOCK_LOCK_MAX_LEN (sizeof(struct dlm_unlock_lock)+DLM_LVB_LEN)
@@ -656,7 +627,7 @@ struct dlm_proxy_ast
u8 name[O2NM_MAX_NAME_LEN];
- s8 lvb[0];
+ s8 lvb[];
};
#define DLM_PROXY_AST_MAX_LEN (sizeof(struct dlm_proxy_ast)+DLM_LVB_LEN)
@@ -712,10 +683,6 @@ struct dlm_begin_reco
__be32 pad2;
};
-
-#define BITS_PER_BYTE 8
-#define BITS_TO_BYTES(bits) (((bits)+BITS_PER_BYTE-1)/BITS_PER_BYTE)
-
struct dlm_query_join_request
{
u8 node_idx;
@@ -793,6 +760,20 @@ struct dlm_deref_lockres
u8 name[O2NM_MAX_NAME_LEN];
};
+enum {
+ DLM_DEREF_RESPONSE_DONE = 0,
+ DLM_DEREF_RESPONSE_INPROG = 1,
+};
+
+struct dlm_deref_lockres_done {
+ u32 pad1;
+ u16 pad2;
+ u8 node_idx;
+ u8 namelen;
+
+ u8 name[O2NM_MAX_NAME_LEN];
+};
+
static inline enum dlm_status
__dlm_lockres_state_to_status(struct dlm_lock_resource *res)
{
@@ -800,7 +781,8 @@ __dlm_lockres_state_to_status(struct dlm_lock_resource *res)
assert_spin_locked(&res->spinlock);
- if (res->state & DLM_LOCK_RES_RECOVERING)
+ if (res->state & (DLM_LOCK_RES_RECOVERING|
+ DLM_LOCK_RES_RECOVERY_WAITING))
status = DLM_RECOVERING;
else if (res->state & DLM_LOCK_RES_MIGRATING)
status = DLM_MIGRATING;
@@ -911,8 +893,10 @@ void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res);
+void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res);
+
void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
-void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
void dlm_do_local_ast(struct dlm_ctxt *dlm,
@@ -951,13 +935,10 @@ static inline int dlm_send_proxy_ast(struct dlm_ctxt *dlm,
void dlm_print_one_lock_resource(struct dlm_lock_resource *res);
void __dlm_print_one_lock_resource(struct dlm_lock_resource *res);
-u8 dlm_nm_this_node(struct dlm_ctxt *dlm);
void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
-int dlm_nm_init(struct dlm_ctxt *dlm);
-int dlm_heartbeat_init(struct dlm_ctxt *dlm);
void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data);
void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data);
@@ -976,6 +957,8 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
void dlm_assert_master_post_handler(int status, void *data, void *ret_data);
int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data);
+int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data);
int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data);
int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
@@ -993,6 +976,8 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
u8 nodenum, u8 *real_master);
+void __dlm_do_purge_lockres(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res);
int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
@@ -1011,13 +996,13 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
/* will exit holding res->spinlock, but may drop in function */
void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags);
-void __dlm_wait_on_lockres_flags_set(struct dlm_lock_resource *res, int flags);
/* will exit holding res->spinlock, but may drop in function */
static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
{
__dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS|
DLM_LOCK_RES_RECOVERING|
+ DLM_LOCK_RES_RECOVERY_WAITING|
DLM_LOCK_RES_MIGRATING));
}
@@ -1079,11 +1064,9 @@ static inline int dlm_lock_compatible(int existing, int request)
static inline int dlm_lock_on_list(struct list_head *head,
struct dlm_lock *lock)
{
- struct list_head *iter;
struct dlm_lock *tmplock;
- list_for_each(iter, head) {
- tmplock = list_entry(iter, struct dlm_lock, list);
+ list_for_each_entry(tmplock, head, list) {
if (tmplock == lock)
return 1;
}
@@ -1111,7 +1094,7 @@ static inline enum dlm_status dlm_err_to_dlm_status(int err)
static inline void dlm_node_iter_init(unsigned long *map,
struct dlm_node_iter *iter)
{
- memcpy(iter->node_map, map, sizeof(iter->node_map));
+ bitmap_copy(iter->node_map, map, O2NM_MAX_NODES);
iter->curnode = -1;
}
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 29a886d1e82c..450d46eefab3 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -1,27 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* dlmconvert.c
*
* underlying calls for lock conversion
*
* Copyright (C) 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
*/
@@ -38,9 +21,9 @@
#include <linux/spinlock.h>
-#include "cluster/heartbeat.h"
-#include "cluster/nodemanager.h"
-#include "cluster/tcp.h"
+#include "../cluster/heartbeat.h"
+#include "../cluster/nodemanager.h"
+#include "../cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
@@ -48,7 +31,7 @@
#include "dlmconvert.h"
#define MLOG_MASK_PREFIX ML_DLM
-#include "cluster/masklog.h"
+#include "../cluster/masklog.h"
/* NOTE: __dlmconvert_master is the only function in here that
* needs a spinlock held on entry (res->spinlock) and it is the
@@ -123,7 +106,6 @@ static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm,
int *kick_thread)
{
enum dlm_status status = DLM_NORMAL;
- struct list_head *iter;
struct dlm_lock *tmplock=NULL;
assert_spin_locked(&res->spinlock);
@@ -185,16 +167,14 @@ static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm,
/* upconvert from here on */
status = DLM_NORMAL;
- list_for_each(iter, &res->granted) {
- tmplock = list_entry(iter, struct dlm_lock, list);
+ list_for_each_entry(tmplock, &res->granted, list) {
if (tmplock == lock)
continue;
if (!dlm_lock_compatible(tmplock->ml.type, type))
goto switch_queues;
}
- list_for_each(iter, &res->converting) {
- tmplock = list_entry(iter, struct dlm_lock, list);
+ list_for_each_entry(tmplock, &res->converting, list) {
if (!dlm_lock_compatible(tmplock->ml.type, type))
goto switch_queues;
/* existing conversion requests take precedence */
@@ -215,6 +195,12 @@ grant:
if (lock->lksb->flags & DLM_LKSB_PUT_LVB)
memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN);
+ /*
+ * Move the lock to the tail because it may be the only lock which has
+ * an invalid lvb.
+ */
+ list_move_tail(&lock->list, &res->granted);
+
status = DLM_NORMAL;
*call_ast = 1;
goto unlock_exit;
@@ -290,6 +276,19 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
status = DLM_DENIED;
goto bail;
}
+
+ if (lock->ml.type == type && lock->ml.convert_type == LKM_IVMODE) {
+ mlog(0, "last convert request returned DLM_RECOVERING, but "
+ "owner has already queued and sent ast to me. res %.*s, "
+ "(cookie=%u:%llu, type=%d, conv=%d)\n",
+ res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+ lock->ml.type, lock->ml.convert_type);
+ status = DLM_NORMAL;
+ goto bail;
+ }
+
res->state |= DLM_LOCK_RES_IN_PROGRESS;
/* move lock to local convert queue */
/* do not alter lock refcount. switching lists. */
@@ -318,13 +317,22 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
spin_lock(&res->spinlock);
res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
- lock->convert_pending = 0;
- /* if it failed, move it back to granted queue */
+ /* if it failed, move it back to granted queue.
+ * if master returns DLM_NORMAL and then down before sending ast,
+ * it may have already been moved to granted queue, reset to
+ * DLM_RECOVERING and retry convert */
if (status != DLM_NORMAL) {
if (status != DLM_NOTQUEUED)
dlm_error(status);
dlm_revert_pending_convert(res, lock);
+ } else if (!lock->convert_pending) {
+ mlog(0, "%s: res %.*s, owner died and lock has been moved back "
+ "to granted list, retry convert.\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ status = DLM_RECOVERING;
}
+
+ lock->convert_pending = 0;
bail:
spin_unlock(&res->spinlock);
@@ -424,8 +432,8 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data,
struct dlm_ctxt *dlm = data;
struct dlm_convert_lock *cnv = (struct dlm_convert_lock *)msg->buf;
struct dlm_lock_resource *res = NULL;
- struct list_head *iter;
struct dlm_lock *lock = NULL;
+ struct dlm_lock *tmp_lock;
struct dlm_lockstatus *lksb;
enum dlm_status status = DLM_NORMAL;
u32 flags;
@@ -471,14 +479,13 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data,
dlm_error(status);
goto leave;
}
- list_for_each(iter, &res->granted) {
- lock = list_entry(iter, struct dlm_lock, list);
- if (lock->ml.cookie == cnv->cookie &&
- lock->ml.node == cnv->node_idx) {
+ list_for_each_entry(tmp_lock, &res->granted, list) {
+ if (tmp_lock->ml.cookie == cnv->cookie &&
+ tmp_lock->ml.node == cnv->node_idx) {
+ lock = tmp_lock;
dlm_lock_get(lock);
break;
}
- lock = NULL;
}
spin_unlock(&res->spinlock);
if (!lock) {
diff --git a/fs/ocfs2/dlm/dlmconvert.h b/fs/ocfs2/dlm/dlmconvert.h
index b2e3677df878..1f371716513b 100644
--- a/fs/ocfs2/dlm/dlmconvert.h
+++ b/fs/ocfs2/dlm/dlmconvert.h
@@ -1,25 +1,8 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* dlmconvert.h
*
* Copyright (C) 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
*/
#ifndef DLMCONVERT_H
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 0e28e242226d..fe4fdd09bae3 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -1,27 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* dlmdebug.c
*
* debug functionality for the dlm
*
* Copyright (C) 2004, 2008 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
*/
#include <linux/types.h>
@@ -31,10 +14,11 @@
#include <linux/spinlock.h>
#include <linux/debugfs.h>
#include <linux/export.h>
+#include <linux/string_choices.h>
-#include "cluster/heartbeat.h"
-#include "cluster/nodemanager.h"
-#include "cluster/tcp.h"
+#include "../cluster/heartbeat.h"
+#include "../cluster/nodemanager.h"
+#include "../cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
@@ -42,7 +26,7 @@
#include "dlmdebug.h"
#define MLOG_MASK_PREFIX ML_DLM
-#include "cluster/masklog.h"
+#include "../cluster/masklog.h"
static int stringify_lockname(const char *lockname, int locklen, char *buf,
int len);
@@ -81,7 +65,7 @@ static void __dlm_print_lock(struct dlm_lock *lock)
lock->ml.type, lock->ml.convert_type, lock->ml.node,
dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
- atomic_read(&lock->lock_refs.refcount),
+ kref_read(&lock->lock_refs),
(list_empty(&lock->ast_list) ? 'y' : 'n'),
(lock->ast_pending ? 'y' : 'n'),
(list_empty(&lock->bast_list) ? 'y' : 'n'),
@@ -96,7 +80,6 @@ static void __dlm_print_lock(struct dlm_lock *lock)
void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
{
- struct list_head *iter2;
struct dlm_lock *lock;
char buf[DLM_LOCKID_NAME_MAX];
@@ -107,29 +90,26 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
printk("lockres: %s, owner=%u, state=%u\n",
buf, res->owner, res->state);
printk(" last used: %lu, refcnt: %u, on purge list: %s\n",
- res->last_used, atomic_read(&res->refs.refcount),
- list_empty(&res->purge) ? "no" : "yes");
+ res->last_used, kref_read(&res->refs),
+ str_no_yes(list_empty(&res->purge)));
printk(" on dirty list: %s, on reco list: %s, "
"migrating pending: %s\n",
- list_empty(&res->dirty) ? "no" : "yes",
- list_empty(&res->recovering) ? "no" : "yes",
- res->migration_pending ? "yes" : "no");
+ str_no_yes(list_empty(&res->dirty)),
+ str_no_yes(list_empty(&res->recovering)),
+ str_yes_no(res->migration_pending));
printk(" inflight locks: %d, asts reserved: %d\n",
res->inflight_locks, atomic_read(&res->asts_reserved));
dlm_print_lockres_refmap(res);
printk(" granted queue:\n");
- list_for_each(iter2, &res->granted) {
- lock = list_entry(iter2, struct dlm_lock, list);
+ list_for_each_entry(lock, &res->granted, list) {
__dlm_print_lock(lock);
}
printk(" converting queue:\n");
- list_for_each(iter2, &res->converting) {
- lock = list_entry(iter2, struct dlm_lock, list);
+ list_for_each_entry(lock, &res->converting, list) {
__dlm_print_lock(lock);
}
printk(" blocked queue:\n");
- list_for_each(iter2, &res->blocked) {
- lock = list_entry(iter2, struct dlm_lock, list);
+ list_for_each_entry(lock, &res->blocked, list) {
__dlm_print_lock(lock);
}
}
@@ -185,59 +165,6 @@ static const char *dlm_errnames[] = {
[DLM_MAXSTATS] = "DLM_MAXSTATS",
};
-static const char *dlm_errmsgs[] = {
- [DLM_NORMAL] = "request in progress",
- [DLM_GRANTED] = "request granted",
- [DLM_DENIED] = "request denied",
- [DLM_DENIED_NOLOCKS] = "request denied, out of system resources",
- [DLM_WORKING] = "async request in progress",
- [DLM_BLOCKED] = "lock request blocked",
- [DLM_BLOCKED_ORPHAN] = "lock request blocked by a orphan lock",
- [DLM_DENIED_GRACE_PERIOD] = "topological change in progress",
- [DLM_SYSERR] = "system error",
- [DLM_NOSUPPORT] = "unsupported",
- [DLM_CANCELGRANT] = "can't cancel convert: already granted",
- [DLM_IVLOCKID] = "bad lockid",
- [DLM_SYNC] = "synchronous request granted",
- [DLM_BADTYPE] = "bad resource type",
- [DLM_BADRESOURCE] = "bad resource handle",
- [DLM_MAXHANDLES] = "no more resource handles",
- [DLM_NOCLINFO] = "can't contact cluster manager",
- [DLM_NOLOCKMGR] = "can't contact lock manager",
- [DLM_NOPURGED] = "can't contact purge daemon",
- [DLM_BADARGS] = "bad api args",
- [DLM_VOID] = "no status",
- [DLM_NOTQUEUED] = "NOQUEUE was specified and request failed",
- [DLM_IVBUFLEN] = "invalid resource name length",
- [DLM_CVTUNGRANT] = "attempted to convert ungranted lock",
- [DLM_BADPARAM] = "invalid lock mode specified",
- [DLM_VALNOTVALID] = "value block has been invalidated",
- [DLM_REJECTED] = "request rejected, unrecognized client",
- [DLM_ABORT] = "blocked lock request cancelled",
- [DLM_CANCEL] = "conversion request cancelled",
- [DLM_IVRESHANDLE] = "invalid resource handle",
- [DLM_DEADLOCK] = "deadlock recovery refused this request",
- [DLM_DENIED_NOASTS] = "failed to allocate AST",
- [DLM_FORWARD] = "request must wait for primary's response",
- [DLM_TIMEOUT] = "timeout value for lock has expired",
- [DLM_IVGROUPID] = "invalid group specification",
- [DLM_VERS_CONFLICT] = "version conflicts prevent request handling",
- [DLM_BAD_DEVICE_PATH] = "Locks device does not exist or path wrong",
- [DLM_NO_DEVICE_PERMISSION] = "Client has insufficient perms for device",
- [DLM_NO_CONTROL_DEVICE] = "Cannot set options on opened device ",
- [DLM_RECOVERING] = "lock resource being recovered",
- [DLM_MIGRATING] = "lock resource being migrated",
- [DLM_MAXSTATS] = "invalid error number",
-};
-
-const char *dlm_errmsg(enum dlm_status err)
-{
- if (err >= DLM_MAXSTATS || err < 0)
- return dlm_errmsgs[DLM_MAXSTATS];
- return dlm_errmsgs[err];
-}
-EXPORT_SYMBOL_GPL(dlm_errmsg);
-
const char *dlm_errname(enum dlm_status err)
{
if (err >= DLM_MAXSTATS || err < 0)
@@ -263,11 +190,11 @@ static int stringify_lockname(const char *lockname, int locklen, char *buf,
memcpy((__be64 *)&inode_blkno_be,
(char *)&lockname[OCFS2_DENTRY_LOCK_INO_START],
sizeof(__be64));
- out += snprintf(buf + out, len - out, "%.*s%08x",
+ out += scnprintf(buf + out, len - out, "%.*s%08x",
OCFS2_DENTRY_LOCK_INO_START - 1, lockname,
(unsigned int)be64_to_cpu(inode_blkno_be));
} else
- out += snprintf(buf + out, len - out, "%.*s",
+ out += scnprintf(buf + out, len - out, "%.*s",
locklen, lockname);
return out;
}
@@ -279,7 +206,7 @@ static int stringify_nodemap(unsigned long *nodemap, int maxnodes,
int i = -1;
while ((i = find_next_bit(nodemap, maxnodes, i + 1)) < maxnodes)
- out += snprintf(buf + out, len - out, "%d ", i);
+ out += scnprintf(buf + out, len - out, "%d ", i);
return out;
}
@@ -297,34 +224,34 @@ static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
mle_type = "MIG";
out += stringify_lockname(mle->mname, mle->mnamelen, buf + out, len - out);
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n",
mle_type, mle->master, mle->new_master,
!list_empty(&mle->hb_events),
!!mle->inuse,
- atomic_read(&mle->mle_refs.refcount));
+ kref_read(&mle->mle_refs));
- out += snprintf(buf + out, len - out, "Maybe=");
+ out += scnprintf(buf + out, len - out, "Maybe=");
out += stringify_nodemap(mle->maybe_map, O2NM_MAX_NODES,
buf + out, len - out);
- out += snprintf(buf + out, len - out, "\n");
+ out += scnprintf(buf + out, len - out, "\n");
- out += snprintf(buf + out, len - out, "Vote=");
+ out += scnprintf(buf + out, len - out, "Vote=");
out += stringify_nodemap(mle->vote_map, O2NM_MAX_NODES,
buf + out, len - out);
- out += snprintf(buf + out, len - out, "\n");
+ out += scnprintf(buf + out, len - out, "\n");
- out += snprintf(buf + out, len - out, "Response=");
+ out += scnprintf(buf + out, len - out, "Response=");
out += stringify_nodemap(mle->response_map, O2NM_MAX_NODES,
buf + out, len - out);
- out += snprintf(buf + out, len - out, "\n");
+ out += scnprintf(buf + out, len - out, "\n");
- out += snprintf(buf + out, len - out, "Node=");
+ out += scnprintf(buf + out, len - out, "Node=");
out += stringify_nodemap(mle->node_map, O2NM_MAX_NODES,
buf + out, len - out);
- out += snprintf(buf + out, len - out, "\n");
+ out += scnprintf(buf + out, len - out, "\n");
- out += snprintf(buf + out, len - out, "\n");
+ out += scnprintf(buf + out, len - out, "\n");
return out;
}
@@ -333,7 +260,7 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle)
{
char *buf;
- buf = (char *) get_zeroed_page(GFP_NOFS);
+ buf = (char *) get_zeroed_page(GFP_ATOMIC);
if (buf) {
dump_mle(mle, buf, PAGE_SIZE - 1);
free_page((unsigned long)buf);
@@ -342,7 +269,7 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle)
#ifdef CONFIG_DEBUG_FS
-static struct dentry *dlm_debugfs_root = NULL;
+static struct dentry *dlm_debugfs_root;
#define DLM_DEBUGFS_DIR "o2dlm"
#define DLM_DEBUGFS_DLM_STATE "dlm_state"
@@ -351,26 +278,6 @@ static struct dentry *dlm_debugfs_root = NULL;
#define DLM_DEBUGFS_PURGE_LIST "purge_list"
/* begin - utils funcs */
-static void dlm_debug_free(struct kref *kref)
-{
- struct dlm_debug_ctxt *dc;
-
- dc = container_of(kref, struct dlm_debug_ctxt, debug_refcnt);
-
- kfree(dc);
-}
-
-static void dlm_debug_put(struct dlm_debug_ctxt *dc)
-{
- if (dc)
- kref_put(&dc->debug_refcnt, dlm_debug_free);
-}
-
-static void dlm_debug_get(struct dlm_debug_ctxt *dc)
-{
- kref_get(&dc->debug_refcnt);
-}
-
static int debug_release(struct inode *inode, struct file *file)
{
free_page((unsigned long)file->private_data);
@@ -392,7 +299,7 @@ static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len)
int out = 0;
unsigned long total = 0;
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"Dumping Purgelist for Domain: %s\n", dlm->name);
spin_lock(&dlm->spinlock);
@@ -404,13 +311,13 @@ static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len)
out += stringify_lockname(res->lockname.name,
res->lockname.len,
buf + out, len - out);
- out += snprintf(buf + out, len - out, "\t%ld\n",
+ out += scnprintf(buf + out, len - out, "\t%ld\n",
(jiffies - res->last_used)/HZ);
spin_unlock(&res->spinlock);
}
spin_unlock(&dlm->spinlock);
- out += snprintf(buf + out, len - out, "Total on list: %ld\n", total);
+ out += scnprintf(buf + out, len - out, "Total on list: %lu\n", total);
return out;
}
@@ -446,19 +353,16 @@ static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len)
{
struct dlm_master_list_entry *mle;
struct hlist_head *bucket;
- struct hlist_node *list;
int i, out = 0;
unsigned long total = 0, longest = 0, bucket_count = 0;
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"Dumping MLEs for Domain: %s\n", dlm->name);
spin_lock(&dlm->master_lock);
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
bucket = dlm_master_hash(dlm, i);
- hlist_for_each(list, bucket) {
- mle = hlist_entry(list, struct dlm_master_list_entry,
- master_hash_node);
+ hlist_for_each_entry(mle, bucket, master_hash_node) {
++total;
++bucket_count;
if (len - out < 200)
@@ -470,8 +374,8 @@ static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len)
}
spin_unlock(&dlm->master_lock);
- out += snprintf(buf + out, len - out,
- "Total: %ld, Longest: %ld\n", total, longest);
+ out += scnprintf(buf + out, len - out,
+ "Total: %lu, Longest: %lu\n", total, longest);
return out;
}
@@ -509,7 +413,7 @@ static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len)
#define DEBUG_LOCK_VERSION 1
spin_lock(&lock->spinlock);
- out = snprintf(buf, len, "LOCK:%d,%d,%d,%d,%d,%d:%lld,%d,%d,%d,%d,%d,"
+ out = scnprintf(buf, len, "LOCK:%d,%d,%d,%d,%d,%d:%lld,%d,%d,%d,%d,%d,"
"%d,%d,%d,%d\n",
DEBUG_LOCK_VERSION,
list_type, lock->ml.type, lock->ml.convert_type,
@@ -521,7 +425,7 @@ static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len)
lock->ast_pending, lock->bast_pending,
lock->convert_pending, lock->lock_pending,
lock->cancel_pending, lock->unlock_pending,
- atomic_read(&lock->lock_refs.refcount));
+ kref_read(&lock->lock_refs));
spin_unlock(&lock->spinlock);
return out;
@@ -533,13 +437,13 @@ static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len)
int i;
int out = 0;
- out += snprintf(buf + out, len - out, "NAME:");
+ out += scnprintf(buf + out, len - out, "NAME:");
out += stringify_lockname(res->lockname.name, res->lockname.len,
buf + out, len - out);
- out += snprintf(buf + out, len - out, "\n");
+ out += scnprintf(buf + out, len - out, "\n");
#define DEBUG_LRES_VERSION 1
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"LRES:%d,%d,%d,%ld,%d,%d,%d,%d,%d,%d,%d\n",
DEBUG_LRES_VERSION,
res->owner, res->state, res->last_used,
@@ -548,20 +452,20 @@ static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len)
!list_empty(&res->recovering),
res->inflight_locks, res->migration_pending,
atomic_read(&res->asts_reserved),
- atomic_read(&res->refs.refcount));
+ kref_read(&res->refs));
/* refmap */
- out += snprintf(buf + out, len - out, "RMAP:");
+ out += scnprintf(buf + out, len - out, "RMAP:");
out += stringify_nodemap(res->refmap, O2NM_MAX_NODES,
buf + out, len - out);
- out += snprintf(buf + out, len - out, "\n");
+ out += scnprintf(buf + out, len - out, "\n");
/* lvb */
- out += snprintf(buf + out, len - out, "LVBX:");
+ out += scnprintf(buf + out, len - out, "LVBX:");
for (i = 0; i < DLM_LVB_LEN; i++)
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"%02x", (unsigned char)res->lvb[i]);
- out += snprintf(buf + out, len - out, "\n");
+ out += scnprintf(buf + out, len - out, "\n");
/* granted */
list_for_each_entry(lock, &res->granted, list)
@@ -575,7 +479,7 @@ static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len)
list_for_each_entry(lock, &res->blocked, list)
out += dump_lock(lock, 2, buf + out, len - out);
- out += snprintf(buf + out, len - out, "\n");
+ out += scnprintf(buf + out, len - out, "\n");
return out;
}
@@ -585,7 +489,7 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
struct debug_lockres *dl = m->private;
struct dlm_ctxt *dlm = dl->dl_ctxt;
struct dlm_lock_resource *oldres = dl->dl_res;
- struct dlm_lock_resource *res = NULL;
+ struct dlm_lock_resource *res = NULL, *iter;
struct list_head *track_list;
spin_lock(&dlm->track_lock);
@@ -600,11 +504,11 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
}
}
- list_for_each_entry(res, track_list, tracking) {
- if (&res->tracking == &dlm->tracking_list)
- res = NULL;
- else
- dlm_lockres_get(res);
+ list_for_each_entry(iter, track_list, tracking) {
+ if (&iter->tracking != &dlm->tracking_list) {
+ dlm_lockres_get(iter);
+ res = iter;
+ }
break;
}
spin_unlock(&dlm->track_lock);
@@ -654,41 +558,30 @@ static const struct seq_operations debug_lockres_ops = {
static int debug_lockres_open(struct inode *inode, struct file *file)
{
struct dlm_ctxt *dlm = inode->i_private;
- int ret = -ENOMEM;
- struct seq_file *seq;
- struct debug_lockres *dl = NULL;
+ struct debug_lockres *dl;
+ void *buf;
- dl = kzalloc(sizeof(struct debug_lockres), GFP_KERNEL);
- if (!dl) {
- mlog_errno(ret);
- goto bail;
- }
-
- dl->dl_len = PAGE_SIZE;
- dl->dl_buf = kmalloc(dl->dl_len, GFP_KERNEL);
- if (!dl->dl_buf) {
- mlog_errno(ret);
+ buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!buf)
goto bail;
- }
- ret = seq_open(file, &debug_lockres_ops);
- if (ret) {
- mlog_errno(ret);
- goto bail;
- }
+ dl = __seq_open_private(file, &debug_lockres_ops, sizeof(*dl));
+ if (!dl)
+ goto bailfree;
- seq = file->private_data;
- seq->private = dl;
+ dl->dl_len = PAGE_SIZE;
+ dl->dl_buf = buf;
dlm_grab(dlm);
dl->dl_ctxt = dlm;
return 0;
+
+bailfree:
+ kfree(buf);
bail:
- if (dl)
- kfree(dl->dl_buf);
- kfree(dl);
- return ret;
+ mlog_errno(-ENOMEM);
+ return -ENOMEM;
}
static int debug_lockres_release(struct inode *inode, struct file *file)
@@ -736,41 +629,41 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
}
/* Domain: xxxxxxxxxx Key: 0xdfbac769 */
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"Domain: %s Key: 0x%08x Protocol: %d.%d\n",
dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major,
dlm->dlm_locking_proto.pv_minor);
/* Thread Pid: xxx Node: xxx State: xxxxx */
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"Thread Pid: %d Node: %d State: %s\n",
task_pid_nr(dlm->dlm_thread_task), dlm->node_num, state);
/* Number of Joins: xxx Joining Node: xxx */
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"Number of Joins: %d Joining Node: %d\n",
dlm->num_joins, dlm->joining_node);
/* Domain Map: xx xx xx */
- out += snprintf(buf + out, len - out, "Domain Map: ");
+ out += scnprintf(buf + out, len - out, "Domain Map: ");
out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES,
buf + out, len - out);
- out += snprintf(buf + out, len - out, "\n");
+ out += scnprintf(buf + out, len - out, "\n");
/* Exit Domain Map: xx xx xx */
- out += snprintf(buf + out, len - out, "Exit Domain Map: ");
+ out += scnprintf(buf + out, len - out, "Exit Domain Map: ");
out += stringify_nodemap(dlm->exit_domain_map, O2NM_MAX_NODES,
buf + out, len - out);
- out += snprintf(buf + out, len - out, "\n");
+ out += scnprintf(buf + out, len - out, "\n");
/* Live Map: xx xx xx */
- out += snprintf(buf + out, len - out, "Live Map: ");
+ out += scnprintf(buf + out, len - out, "Live Map: ");
out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES,
buf + out, len - out);
- out += snprintf(buf + out, len - out, "\n");
+ out += scnprintf(buf + out, len - out, "\n");
/* Lock Resources: xxx (xxx) */
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"Lock Resources: %d (%d)\n",
atomic_read(&dlm->res_cur_count),
atomic_read(&dlm->res_tot_count));
@@ -782,29 +675,29 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
cur_mles += atomic_read(&dlm->mle_cur_count[i]);
/* MLEs: xxx (xxx) */
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"MLEs: %d (%d)\n", cur_mles, tot_mles);
/* Blocking: xxx (xxx) */
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
" Blocking: %d (%d)\n",
atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]),
atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK]));
/* Mastery: xxx (xxx) */
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
" Mastery: %d (%d)\n",
atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]),
atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER]));
/* Migration: xxx (xxx) */
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
" Migration: %d (%d)\n",
atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]),
atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION]));
/* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"Lists: Dirty=%s Purge=%s PendingASTs=%s "
"PendingBASTs=%s\n",
(list_empty(&dlm->dirty_list) ? "Empty" : "InUse"),
@@ -813,12 +706,12 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
(list_empty(&dlm->pending_basts) ? "Empty" : "InUse"));
/* Purge Count: xxx Refs: xxx */
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"Purge Count: %d Refs: %d\n", dlm->purge_count,
- atomic_read(&dlm->dlm_refs.refcount));
+ kref_read(&dlm->dlm_refs));
/* Dead Node: xxx */
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"Dead Node: %d\n", dlm->reco.dead_node);
/* What about DLM_RECO_STATE_FINALIZE? */
@@ -828,19 +721,19 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
state = "INACTIVE";
/* Recovery Pid: xxxx Master: xxx State: xxxx */
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"Recovery Pid: %d Master: %d State: %s\n",
task_pid_nr(dlm->dlm_reco_thread_task),
dlm->reco.new_master, state);
/* Recovery Map: xx xx */
- out += snprintf(buf + out, len - out, "Recovery Map: ");
+ out += scnprintf(buf + out, len - out, "Recovery Map: ");
out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES,
buf + out, len - out);
- out += snprintf(buf + out, len - out, "\n");
+ out += scnprintf(buf + out, len - out, "\n");
/* Recovery Node State: */
- out += snprintf(buf + out, len - out, "Recovery Node State:\n");
+ out += scnprintf(buf + out, len - out, "Recovery Node State:\n");
list_for_each_entry(node, &dlm->reco.node_data, list) {
switch (node->state) {
case DLM_RECO_NODE_DATA_INIT:
@@ -868,7 +761,7 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
state = "BAD";
break;
}
- out += snprintf(buf + out, len - out, "\t%u - %s\n",
+ out += scnprintf(buf + out, len - out, "\t%u - %s\n",
node->node_num, state);
}
@@ -904,111 +797,42 @@ static const struct file_operations debug_state_fops = {
/* end - debug state funcs */
/* files in subroot */
-int dlm_debug_init(struct dlm_ctxt *dlm)
+void dlm_debug_init(struct dlm_ctxt *dlm)
{
- struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
-
/* for dumping dlm_ctxt */
- dc->debug_state_dentry = debugfs_create_file(DLM_DEBUGFS_DLM_STATE,
- S_IFREG|S_IRUSR,
- dlm->dlm_debugfs_subroot,
- dlm, &debug_state_fops);
- if (!dc->debug_state_dentry) {
- mlog_errno(-ENOMEM);
- goto bail;
- }
+ debugfs_create_file(DLM_DEBUGFS_DLM_STATE, S_IFREG|S_IRUSR,
+ dlm->dlm_debugfs_subroot, dlm, &debug_state_fops);
/* for dumping lockres */
- dc->debug_lockres_dentry =
- debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE,
- S_IFREG|S_IRUSR,
- dlm->dlm_debugfs_subroot,
- dlm, &debug_lockres_fops);
- if (!dc->debug_lockres_dentry) {
- mlog_errno(-ENOMEM);
- goto bail;
- }
+ debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE, S_IFREG|S_IRUSR,
+ dlm->dlm_debugfs_subroot, dlm, &debug_lockres_fops);
/* for dumping mles */
- dc->debug_mle_dentry = debugfs_create_file(DLM_DEBUGFS_MLE_STATE,
- S_IFREG|S_IRUSR,
- dlm->dlm_debugfs_subroot,
- dlm, &debug_mle_fops);
- if (!dc->debug_mle_dentry) {
- mlog_errno(-ENOMEM);
- goto bail;
- }
+ debugfs_create_file(DLM_DEBUGFS_MLE_STATE, S_IFREG|S_IRUSR,
+ dlm->dlm_debugfs_subroot, dlm, &debug_mle_fops);
/* for dumping lockres on the purge list */
- dc->debug_purgelist_dentry =
- debugfs_create_file(DLM_DEBUGFS_PURGE_LIST,
- S_IFREG|S_IRUSR,
- dlm->dlm_debugfs_subroot,
- dlm, &debug_purgelist_fops);
- if (!dc->debug_purgelist_dentry) {
- mlog_errno(-ENOMEM);
- goto bail;
- }
-
- dlm_debug_get(dc);
- return 0;
-
-bail:
- dlm_debug_shutdown(dlm);
- return -ENOMEM;
-}
-
-void dlm_debug_shutdown(struct dlm_ctxt *dlm)
-{
- struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
-
- if (dc) {
- debugfs_remove(dc->debug_purgelist_dentry);
- debugfs_remove(dc->debug_mle_dentry);
- debugfs_remove(dc->debug_lockres_dentry);
- debugfs_remove(dc->debug_state_dentry);
- dlm_debug_put(dc);
- }
+ debugfs_create_file(DLM_DEBUGFS_PURGE_LIST, S_IFREG|S_IRUSR,
+ dlm->dlm_debugfs_subroot, dlm,
+ &debug_purgelist_fops);
}
/* subroot - domain dir */
-int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
+void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
{
dlm->dlm_debugfs_subroot = debugfs_create_dir(dlm->name,
dlm_debugfs_root);
- if (!dlm->dlm_debugfs_subroot) {
- mlog_errno(-ENOMEM);
- goto bail;
- }
-
- dlm->dlm_debug_ctxt = kzalloc(sizeof(struct dlm_debug_ctxt),
- GFP_KERNEL);
- if (!dlm->dlm_debug_ctxt) {
- mlog_errno(-ENOMEM);
- goto bail;
- }
- kref_init(&dlm->dlm_debug_ctxt->debug_refcnt);
-
- return 0;
-bail:
- dlm_destroy_debugfs_subroot(dlm);
- return -ENOMEM;
}
void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
{
- debugfs_remove(dlm->dlm_debugfs_subroot);
+ debugfs_remove_recursive(dlm->dlm_debugfs_subroot);
}
/* debugfs root */
-int dlm_create_debugfs_root(void)
+void dlm_create_debugfs_root(void)
{
dlm_debugfs_root = debugfs_create_dir(DLM_DEBUGFS_DIR, NULL);
- if (!dlm_debugfs_root) {
- mlog_errno(-ENOMEM);
- return -ENOMEM;
- }
- return 0;
}
void dlm_destroy_debugfs_root(void)
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
index 1f27c4812d1a..e08f7357e7ec 100644
--- a/fs/ocfs2/dlm/dlmdebug.h
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -1,25 +1,8 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* dlmdebug.h
*
* Copyright (C) 2008 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
*/
#ifndef DLMDEBUG_H
@@ -29,14 +12,6 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle);
#ifdef CONFIG_DEBUG_FS
-struct dlm_debug_ctxt {
- struct kref debug_refcnt;
- struct dentry *debug_state_dentry;
- struct dentry *debug_lockres_dentry;
- struct dentry *debug_mle_dentry;
- struct dentry *debug_purgelist_dentry;
-};
-
struct debug_lockres {
int dl_len;
char *dl_buf;
@@ -44,34 +19,27 @@ struct debug_lockres {
struct dlm_lock_resource *dl_res;
};
-int dlm_debug_init(struct dlm_ctxt *dlm);
-void dlm_debug_shutdown(struct dlm_ctxt *dlm);
+void dlm_debug_init(struct dlm_ctxt *dlm);
-int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm);
+void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm);
void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm);
-int dlm_create_debugfs_root(void);
+void dlm_create_debugfs_root(void);
void dlm_destroy_debugfs_root(void);
#else
-static inline int dlm_debug_init(struct dlm_ctxt *dlm)
-{
- return 0;
-}
-static inline void dlm_debug_shutdown(struct dlm_ctxt *dlm)
+static inline void dlm_debug_init(struct dlm_ctxt *dlm)
{
}
-static inline int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
+static inline void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
{
- return 0;
}
static inline void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
{
}
-static inline int dlm_create_debugfs_root(void)
+static inline void dlm_create_debugfs_root(void)
{
- return 0;
}
static inline void dlm_destroy_debugfs_root(void)
{
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index dbb17c07656a..2347a50f079b 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1,27 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* dlmdomain.c
*
* defines domain join / leave apis
*
* Copyright (C) 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
*/
#include <linux/module.h>
@@ -33,20 +16,19 @@
#include <linux/delay.h>
#include <linux/err.h>
#include <linux/debugfs.h>
+#include <linux/sched/signal.h>
-#include "cluster/heartbeat.h"
-#include "cluster/nodemanager.h"
-#include "cluster/tcp.h"
+#include "../cluster/heartbeat.h"
+#include "../cluster/nodemanager.h"
+#include "../cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
#include "dlmdomain.h"
#include "dlmdebug.h"
-#include "dlmver.h"
-
#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN)
-#include "cluster/masklog.h"
+#include "../cluster/masklog.h"
/*
* ocfs2 node maps are array of long int, which limits to send them freely
@@ -87,7 +69,7 @@ static void dlm_free_pagevec(void **vec, int pages)
static void **dlm_alloc_pagevec(int pages)
{
- void **vec = kmalloc(pages * sizeof(void *), GFP_KERNEL);
+ void **vec = kmalloc_array(pages, sizeof(void *), GFP_KERNEL);
int i;
if (!vec)
@@ -134,10 +116,13 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
* - Message DLM_QUERY_NODEINFO added to allow online node removes
* New in version 1.2:
* - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain
+ * New in version 1.3:
+ * - Message DLM_DEREF_LOCKRES_DONE added to inform non-master that the
+ * refmap is cleared
*/
static const struct dlm_protocol_version dlm_protocol = {
.pv_major = 1,
- .pv_minor = 2,
+ .pv_minor = 3,
};
#define DLM_DOMAIN_BACKOFF_MS 200
@@ -171,12 +156,10 @@ void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
struct hlist_head *bucket;
- struct qstr *q;
assert_spin_locked(&dlm->spinlock);
- q = &res->lockname;
- bucket = dlm_lockres_hash(dlm, q->hash);
+ bucket = dlm_lockres_hash(dlm, res->lockname.hash);
/* get a reference for our hashtable */
dlm_lockres_get(res);
@@ -193,7 +176,7 @@ struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
unsigned int hash)
{
struct hlist_head *bucket;
- struct hlist_node *list;
+ struct dlm_lock_resource *res;
mlog(0, "%.*s\n", len, name);
@@ -201,9 +184,7 @@ struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
bucket = dlm_lockres_hash(dlm, hash);
- hlist_for_each(list, bucket) {
- struct dlm_lock_resource *res = hlist_entry(list,
- struct dlm_lock_resource, hash_node);
+ hlist_for_each_entry(res, bucket, hash_node) {
if (res->lockname.name[0] != name[0])
continue;
if (unlikely(res->lockname.len != len))
@@ -262,22 +243,19 @@ struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
static struct dlm_ctxt * __dlm_lookup_domain_full(const char *domain, int len)
{
- struct dlm_ctxt *tmp = NULL;
- struct list_head *iter;
+ struct dlm_ctxt *tmp;
assert_spin_locked(&dlm_domain_lock);
/* tmp->name here is always NULL terminated,
* but domain may not be! */
- list_for_each(iter, &dlm_domains) {
- tmp = list_entry (iter, struct dlm_ctxt, list);
+ list_for_each_entry(tmp, &dlm_domains, list) {
if (strlen(tmp->name) == len &&
memcmp(tmp->name, domain, len)==0)
- break;
- tmp = NULL;
+ return tmp;
}
- return tmp;
+ return NULL;
}
/* For null terminated domain strings ONLY */
@@ -366,25 +344,22 @@ static void __dlm_get(struct dlm_ctxt *dlm)
* you shouldn't trust your pointer. */
struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm)
{
- struct list_head *iter;
- struct dlm_ctxt *target = NULL;
+ struct dlm_ctxt *target;
+ struct dlm_ctxt *ret = NULL;
spin_lock(&dlm_domain_lock);
- list_for_each(iter, &dlm_domains) {
- target = list_entry (iter, struct dlm_ctxt, list);
-
+ list_for_each_entry(target, &dlm_domains, list) {
if (target == dlm) {
__dlm_get(target);
+ ret = target;
break;
}
-
- target = NULL;
}
spin_unlock(&dlm_domain_lock);
- return target;
+ return ret;
}
int dlm_domain_fully_joined(struct dlm_ctxt *dlm)
@@ -402,7 +377,6 @@ int dlm_domain_fully_joined(struct dlm_ctxt *dlm)
static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm)
{
if (dlm->dlm_worker) {
- flush_workqueue(dlm->dlm_worker);
destroy_workqueue(dlm->dlm_worker);
dlm->dlm_worker = NULL;
}
@@ -411,7 +385,6 @@ static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm)
static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm)
{
dlm_unregister_domain_handlers(dlm);
- dlm_debug_shutdown(dlm);
dlm_complete_thread(dlm);
dlm_complete_recovery_thread(dlm);
dlm_destroy_dlm_worker(dlm);
@@ -470,6 +443,19 @@ redo_bucket:
cond_resched_lock(&dlm->spinlock);
num += n;
}
+
+ if (!num) {
+ if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) {
+ mlog(0, "%s: perhaps there are more lock resources "
+ "need to be migrated after dlm recovery\n", dlm->name);
+ ret = -EAGAIN;
+ } else {
+ mlog(0, "%s: we won't do dlm recovery after migrating "
+ "all lock resources\n", dlm->name);
+ dlm->migrate_done = 1;
+ }
+ }
+
spin_unlock(&dlm->spinlock);
wake_up(&dlm->dlm_thread_wq);
@@ -684,34 +670,6 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm)
spin_unlock(&dlm->spinlock);
}
-int dlm_joined(struct dlm_ctxt *dlm)
-{
- int ret = 0;
-
- spin_lock(&dlm_domain_lock);
-
- if (dlm->dlm_state == DLM_CTXT_JOINED)
- ret = 1;
-
- spin_unlock(&dlm_domain_lock);
-
- return ret;
-}
-
-int dlm_shutting_down(struct dlm_ctxt *dlm)
-{
- int ret = 0;
-
- spin_lock(&dlm_domain_lock);
-
- if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN)
- ret = 1;
-
- spin_unlock(&dlm_domain_lock);
-
- return ret;
-}
-
void dlm_unregister_domain(struct dlm_ctxt *dlm)
{
int leave = 0;
@@ -849,7 +807,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
* to back off and try again. This gives heartbeat a chance
* to catch up.
*/
- if (!o2hb_check_node_heartbeating(query->node_idx)) {
+ if (!o2hb_check_node_heartbeating_no_sem(query->node_idx)) {
mlog(0, "node %u is not in our live map yet\n",
query->node_idx);
@@ -887,7 +845,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
* to be put in someone's domain map.
* Also, explicitly disallow joining at certain troublesome
* times (ie. during recovery). */
- if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) {
+ if (dlm->dlm_state != DLM_CTXT_LEAVING) {
int bit = query->node_idx;
spin_lock(&dlm->spinlock);
@@ -969,6 +927,14 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
* domain. Set him in the map and clean up our
* leftover join state. */
BUG_ON(dlm->joining_node != assert->node_idx);
+
+ if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) {
+ mlog(0, "dlm recovery is ongoing, disallow join\n");
+ spin_unlock(&dlm->spinlock);
+ spin_unlock(&dlm_domain_lock);
+ return -EAGAIN;
+ }
+
set_bit(assert->node_idx, dlm->domain_map);
clear_bit(assert->node_idx, dlm->exit_domain_map);
__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
@@ -1079,7 +1045,7 @@ static int dlm_send_regions(struct dlm_ctxt *dlm, unsigned long *node_map)
int status, ret = 0, i;
char *p;
- if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES)
+ if (find_first_bit(node_map, O2NM_MAX_NODES) >= O2NM_MAX_NODES)
goto bail;
qr = kzalloc(sizeof(struct dlm_query_region), GFP_KERNEL);
@@ -1133,7 +1099,6 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
struct dlm_ctxt *dlm = NULL;
char *local = NULL;
int status = 0;
- int locked = 0;
qr = (struct dlm_query_region *) msg->buf;
@@ -1142,10 +1107,8 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
/* buffer used in dlm_mast_regions() */
local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL);
- if (!local) {
- status = -ENOMEM;
- goto bail;
- }
+ if (!local)
+ return -ENOMEM;
status = -EINVAL;
@@ -1154,16 +1117,15 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
if (!dlm) {
mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
"before join domain\n", qr->qr_node, qr->qr_domain);
- goto bail;
+ goto out_domain_lock;
}
spin_lock(&dlm->spinlock);
- locked = 1;
if (dlm->joining_node != qr->qr_node) {
mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
"but joining node is %d\n", qr->qr_node, qr->qr_domain,
dlm->joining_node);
- goto bail;
+ goto out_dlm_lock;
}
/* Support for global heartbeat was added in 1.1 */
@@ -1173,14 +1135,15 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
"but active dlm protocol is %d.%d\n", qr->qr_node,
qr->qr_domain, dlm->dlm_locking_proto.pv_major,
dlm->dlm_locking_proto.pv_minor);
- goto bail;
+ goto out_dlm_lock;
}
status = dlm_match_regions(dlm, qr, local, sizeof(qr->qr_regions));
-bail:
- if (locked)
- spin_unlock(&dlm->spinlock);
+out_dlm_lock:
+ spin_unlock(&dlm->spinlock);
+
+out_domain_lock:
spin_unlock(&dlm_domain_lock);
kfree(local);
@@ -1254,7 +1217,7 @@ static int dlm_send_nodeinfo(struct dlm_ctxt *dlm, unsigned long *node_map)
struct o2nm_node *node;
int ret = 0, status, count, i;
- if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES)
+ if (find_first_bit(node_map, O2NM_MAX_NODES) >= O2NM_MAX_NODES)
goto bail;
qn = kzalloc(sizeof(struct dlm_query_nodeinfo), GFP_KERNEL);
@@ -1311,7 +1274,7 @@ static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len,
{
struct dlm_query_nodeinfo *qn;
struct dlm_ctxt *dlm = NULL;
- int locked = 0, status = -EINVAL;
+ int status = -EINVAL;
qn = (struct dlm_query_nodeinfo *) msg->buf;
@@ -1327,12 +1290,11 @@ static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len,
}
spin_lock(&dlm->spinlock);
- locked = 1;
if (dlm->joining_node != qn->qn_nodenum) {
mlog(ML_ERROR, "Node %d queried nodes on domain %s but "
"joining node is %d\n", qn->qn_nodenum, qn->qn_domain,
dlm->joining_node);
- goto bail;
+ goto unlock;
}
/* Support for node query was added in 1.1 */
@@ -1342,14 +1304,14 @@ static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len,
"but active dlm protocol is %d.%d\n", qn->qn_nodenum,
qn->qn_domain, dlm->dlm_locking_proto.pv_major,
dlm->dlm_locking_proto.pv_minor);
- goto bail;
+ goto unlock;
}
status = dlm_match_nodes(dlm, qn);
+unlock:
+ spin_unlock(&dlm->spinlock);
bail:
- if (locked)
- spin_unlock(&dlm->spinlock);
spin_unlock(&dlm_domain_lock);
return status;
@@ -1415,7 +1377,7 @@ static int dlm_send_join_cancels(struct dlm_ctxt *dlm,
unsigned int map_size)
{
int status, tmpstat;
- unsigned int node;
+ int node;
if (map_size != (BITS_TO_LONGS(O2NM_MAX_NODES) *
sizeof(unsigned long))) {
@@ -1484,39 +1446,46 @@ static int dlm_request_join(struct dlm_ctxt *dlm,
if (status == -ENOPROTOOPT) {
status = 0;
*response = JOIN_OK_NO_MAP;
- } else if (packet.code == JOIN_DISALLOW ||
- packet.code == JOIN_OK_NO_MAP) {
- *response = packet.code;
- } else if (packet.code == JOIN_PROTOCOL_MISMATCH) {
- mlog(ML_NOTICE,
- "This node requested DLM locking protocol %u.%u and "
- "filesystem locking protocol %u.%u. At least one of "
- "the protocol versions on node %d is not compatible, "
- "disconnecting\n",
- dlm->dlm_locking_proto.pv_major,
- dlm->dlm_locking_proto.pv_minor,
- dlm->fs_locking_proto.pv_major,
- dlm->fs_locking_proto.pv_minor,
- node);
- status = -EPROTO;
- *response = packet.code;
- } else if (packet.code == JOIN_OK) {
- *response = packet.code;
- /* Use the same locking protocol as the remote node */
- dlm->dlm_locking_proto.pv_minor = packet.dlm_minor;
- dlm->fs_locking_proto.pv_minor = packet.fs_minor;
- mlog(0,
- "Node %d responds JOIN_OK with DLM locking protocol "
- "%u.%u and fs locking protocol %u.%u\n",
- node,
- dlm->dlm_locking_proto.pv_major,
- dlm->dlm_locking_proto.pv_minor,
- dlm->fs_locking_proto.pv_major,
- dlm->fs_locking_proto.pv_minor);
} else {
- status = -EINVAL;
- mlog(ML_ERROR, "invalid response %d from node %u\n",
- packet.code, node);
+ *response = packet.code;
+ switch (packet.code) {
+ case JOIN_DISALLOW:
+ case JOIN_OK_NO_MAP:
+ break;
+ case JOIN_PROTOCOL_MISMATCH:
+ mlog(ML_NOTICE,
+ "This node requested DLM locking protocol %u.%u and "
+ "filesystem locking protocol %u.%u. At least one of "
+ "the protocol versions on node %d is not compatible, "
+ "disconnecting\n",
+ dlm->dlm_locking_proto.pv_major,
+ dlm->dlm_locking_proto.pv_minor,
+ dlm->fs_locking_proto.pv_major,
+ dlm->fs_locking_proto.pv_minor,
+ node);
+ status = -EPROTO;
+ break;
+ case JOIN_OK:
+ /* Use the same locking protocol as the remote node */
+ dlm->dlm_locking_proto.pv_minor = packet.dlm_minor;
+ dlm->fs_locking_proto.pv_minor = packet.fs_minor;
+ mlog(0,
+ "Node %d responds JOIN_OK with DLM locking protocol "
+ "%u.%u and fs locking protocol %u.%u\n",
+ node,
+ dlm->dlm_locking_proto.pv_major,
+ dlm->dlm_locking_proto.pv_minor,
+ dlm->fs_locking_proto.pv_major,
+ dlm->fs_locking_proto.pv_minor);
+ break;
+ default:
+ status = -EINVAL;
+ mlog(ML_ERROR, "invalid response %d from node %u\n",
+ packet.code, node);
+ /* Reset response to JOIN_DISALLOW */
+ *response = JOIN_DISALLOW;
+ break;
+ }
}
mlog(0, "status %d, node %d response is %d\n", status, node,
@@ -1530,6 +1499,7 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm,
unsigned int node)
{
int status;
+ int ret;
struct dlm_assert_joined assert_msg;
mlog(0, "Sending join assert to node %u\n", node);
@@ -1541,11 +1511,13 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm,
status = o2net_send_message(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
&assert_msg, sizeof(assert_msg), node,
- NULL);
+ &ret);
if (status < 0)
mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
"node %u\n", status, DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
node);
+ else
+ status = ret;
return status;
}
@@ -1555,7 +1527,6 @@ static void dlm_send_join_asserts(struct dlm_ctxt *dlm,
{
int status, node, live;
- status = 0;
node = -1;
while ((node = find_next_bit(node_map, O2NM_MAX_NODES,
node + 1)) < O2NM_MAX_NODES) {
@@ -1603,8 +1574,8 @@ static int dlm_should_restart_join(struct dlm_ctxt *dlm,
spin_lock(&dlm->spinlock);
/* For now, we restart the process if the node maps have
* changed at all */
- ret = memcmp(ctxt->live_map, dlm->live_nodes_map,
- sizeof(dlm->live_nodes_map));
+ ret = !bitmap_equal(ctxt->live_map, dlm->live_nodes_map,
+ O2NM_MAX_NODES);
spin_unlock(&dlm->spinlock);
if (ret)
@@ -1631,13 +1602,11 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
/* group sem locking should work for us here -- we're already
* registered for heartbeat events so filling this should be
* atomic wrt getting those handlers called. */
- o2hb_fill_node_map(dlm->live_nodes_map, sizeof(dlm->live_nodes_map));
+ o2hb_fill_node_map(dlm->live_nodes_map, O2NM_MAX_NODES);
spin_lock(&dlm->spinlock);
- memcpy(ctxt->live_map, dlm->live_nodes_map, sizeof(ctxt->live_map));
-
+ bitmap_copy(ctxt->live_map, dlm->live_nodes_map, O2NM_MAX_NODES);
__dlm_set_joining_node(dlm, dlm->node_num);
-
spin_unlock(&dlm->spinlock);
node = -1;
@@ -1670,8 +1639,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
* yes_resp_map. Copy that into our domain map and send a join
* assert message to clean up everyone elses state. */
spin_lock(&dlm->spinlock);
- memcpy(dlm->domain_map, ctxt->yes_resp_map,
- sizeof(ctxt->yes_resp_map));
+ bitmap_copy(dlm->domain_map, ctxt->yes_resp_map, O2NM_MAX_NODES);
set_bit(dlm->node_num, dlm->domain_map);
spin_unlock(&dlm->spinlock);
@@ -1741,12 +1709,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
+ o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
+ dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
+
status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down);
if (status)
goto bail;
- o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
- dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_up);
if (status)
goto bail;
@@ -1864,6 +1833,10 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
if (status)
goto bail;
+ status = o2net_register_handler(DLM_DEREF_LOCKRES_DONE, dlm->key,
+ sizeof(struct dlm_deref_lockres_done),
+ dlm_deref_lockres_done_handler,
+ dlm, NULL, &dlm->dlm_domain_handlers);
bail:
if (status)
dlm_unregister_domain_handlers(dlm);
@@ -1876,6 +1849,7 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
int status;
unsigned int backoff;
unsigned int total_backoff = 0;
+ char wq_name[O2NM_MAX_NAME_LEN];
BUG_ON(!dlm);
@@ -1887,12 +1861,6 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
goto bail;
}
- status = dlm_debug_init(dlm);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
status = dlm_launch_thread(dlm);
if (status < 0) {
mlog_errno(status);
@@ -1905,7 +1873,11 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
goto bail;
}
- dlm->dlm_worker = create_singlethread_workqueue("dlm_wq");
+ dlm_debug_init(dlm);
+
+ snprintf(wq_name, O2NM_MAX_NAME_LEN, "dlm_wq-%s", dlm->name);
+ dlm->dlm_worker = alloc_workqueue(wq_name, WQ_MEM_RECLAIM | WQ_PERCPU,
+ 0);
if (!dlm->dlm_worker) {
status = -ENOMEM;
mlog_errno(status);
@@ -1925,12 +1897,11 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
goto bail;
}
- if (total_backoff >
- msecs_to_jiffies(DLM_JOIN_TIMEOUT_MSECS)) {
+ if (total_backoff > DLM_JOIN_TIMEOUT_MSECS) {
status = -ERESTARTSYS;
mlog(ML_NOTICE, "Timed out joining dlm domain "
"%s after %u msecs\n", dlm->name,
- jiffies_to_msecs(total_backoff));
+ total_backoff);
goto bail;
}
@@ -1960,7 +1931,6 @@ bail:
if (status) {
dlm_unregister_domain_handlers(dlm);
- dlm_debug_shutdown(dlm);
dlm_complete_thread(dlm);
dlm_complete_recovery_thread(dlm);
dlm_destroy_dlm_worker(dlm);
@@ -1978,24 +1948,22 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
dlm = kzalloc(sizeof(*dlm), GFP_KERNEL);
if (!dlm) {
- mlog_errno(-ENOMEM);
+ ret = -ENOMEM;
+ mlog_errno(ret);
goto leave;
}
dlm->name = kstrdup(domain, GFP_KERNEL);
if (dlm->name == NULL) {
- mlog_errno(-ENOMEM);
- kfree(dlm);
- dlm = NULL;
+ ret = -ENOMEM;
+ mlog_errno(ret);
goto leave;
}
dlm->lockres_hash = (struct hlist_head **)dlm_alloc_pagevec(DLM_HASH_PAGES);
if (!dlm->lockres_hash) {
- mlog_errno(-ENOMEM);
- kfree(dlm->name);
- kfree(dlm);
- dlm = NULL;
+ ret = -ENOMEM;
+ mlog_errno(ret);
goto leave;
}
@@ -2005,11 +1973,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
dlm->master_hash = (struct hlist_head **)
dlm_alloc_pagevec(DLM_HASH_PAGES);
if (!dlm->master_hash) {
- mlog_errno(-ENOMEM);
- dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
- kfree(dlm->name);
- kfree(dlm);
- dlm = NULL;
+ ret = -ENOMEM;
+ mlog_errno(ret);
goto leave;
}
@@ -2019,15 +1984,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
dlm->key = key;
dlm->node_num = o2nm_this_node();
- ret = dlm_create_debugfs_subroot(dlm);
- if (ret < 0) {
- dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
- dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
- kfree(dlm->name);
- kfree(dlm);
- dlm = NULL;
- goto leave;
- }
+ dlm_create_debugfs_subroot(dlm);
spin_lock_init(&dlm->spinlock);
spin_lock_init(&dlm->master_lock);
@@ -2036,7 +1993,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
INIT_LIST_HEAD(&dlm->list);
INIT_LIST_HEAD(&dlm->dirty_list);
INIT_LIST_HEAD(&dlm->reco.resources);
- INIT_LIST_HEAD(&dlm->reco.received);
INIT_LIST_HEAD(&dlm->reco.node_data);
INIT_LIST_HEAD(&dlm->purge_list);
INIT_LIST_HEAD(&dlm->dlm_domain_handlers);
@@ -2049,9 +2005,9 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
mlog(0, "dlm->recovery_map=%p, &(dlm->recovery_map[0])=%p\n",
dlm->recovery_map, &(dlm->recovery_map[0]));
- memset(dlm->recovery_map, 0, sizeof(dlm->recovery_map));
- memset(dlm->live_nodes_map, 0, sizeof(dlm->live_nodes_map));
- memset(dlm->domain_map, 0, sizeof(dlm->domain_map));
+ bitmap_zero(dlm->recovery_map, O2NM_MAX_NODES);
+ bitmap_zero(dlm->live_nodes_map, O2NM_MAX_NODES);
+ bitmap_zero(dlm->domain_map, O2NM_MAX_NODES);
dlm->dlm_thread_task = NULL;
dlm->dlm_reco_thread_task = NULL;
@@ -2066,6 +2022,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN;
init_waitqueue_head(&dlm->dlm_join_events);
+ dlm->migrate_done = 0;
+
dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
@@ -2086,9 +2044,23 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
INIT_LIST_HEAD(&dlm->dlm_eviction_callbacks);
mlog(0, "context init: refcount %u\n",
- atomic_read(&dlm->dlm_refs.refcount));
+ kref_read(&dlm->dlm_refs));
+ ret = 0;
leave:
+ if (ret < 0 && dlm) {
+ if (dlm->master_hash)
+ dlm_free_pagevec((void **)dlm->master_hash,
+ DLM_HASH_PAGES);
+
+ if (dlm->lockres_hash)
+ dlm_free_pagevec((void **)dlm->lockres_hash,
+ DLM_HASH_PAGES);
+
+ kfree(dlm->name);
+ kfree(dlm);
+ dlm = NULL;
+ }
return dlm;
}
@@ -2296,13 +2268,10 @@ static DECLARE_RWSEM(dlm_callback_sem);
void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm,
int node_num)
{
- struct list_head *iter;
struct dlm_eviction_cb *cb;
down_read(&dlm_callback_sem);
- list_for_each(iter, &dlm->dlm_eviction_callbacks) {
- cb = list_entry(iter, struct dlm_eviction_cb, ec_item);
-
+ list_for_each_entry(cb, &dlm->dlm_eviction_callbacks, ec_item) {
cb->ec_func(node_num, cb->ec_data);
}
up_read(&dlm_callback_sem);
@@ -2339,8 +2308,6 @@ static int __init dlm_init(void)
{
int status;
- dlm_print_version();
-
status = dlm_init_mle_cache();
if (status) {
mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n");
@@ -2366,9 +2333,7 @@ static int __init dlm_init(void)
goto error;
}
- status = dlm_create_debugfs_root();
- if (status)
- goto error;
+ dlm_create_debugfs_root();
return 0;
error:
@@ -2390,6 +2355,7 @@ static void __exit dlm_exit (void)
MODULE_AUTHOR("Oracle");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("OCFS2 Distributed Lock Management");
module_init(dlm_init);
module_exit(dlm_exit);
diff --git a/fs/ocfs2/dlm/dlmdomain.h b/fs/ocfs2/dlm/dlmdomain.h
index 2f7f60bfeb3b..815abe30ad09 100644
--- a/fs/ocfs2/dlm/dlmdomain.h
+++ b/fs/ocfs2/dlm/dlmdomain.h
@@ -1,25 +1,8 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* dlmdomain.h
*
* Copyright (C) 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
*/
#ifndef DLMDOMAIN_H
@@ -28,8 +11,30 @@
extern spinlock_t dlm_domain_lock;
extern struct list_head dlm_domains;
-int dlm_joined(struct dlm_ctxt *dlm);
-int dlm_shutting_down(struct dlm_ctxt *dlm);
+static inline int dlm_joined(struct dlm_ctxt *dlm)
+{
+ int ret = 0;
+
+ spin_lock(&dlm_domain_lock);
+ if (dlm->dlm_state == DLM_CTXT_JOINED)
+ ret = 1;
+ spin_unlock(&dlm_domain_lock);
+
+ return ret;
+}
+
+static inline int dlm_shutting_down(struct dlm_ctxt *dlm)
+{
+ int ret = 0;
+
+ spin_lock(&dlm_domain_lock);
+ if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN)
+ ret = 1;
+ spin_unlock(&dlm_domain_lock);
+
+ return ret;
+}
+
void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm,
int node_num);
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 47e67c2d228f..041fd1791ae7 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -1,27 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* dlmlock.c
*
* underlying calls for lock creation
*
* Copyright (C) 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
*/
@@ -40,9 +23,9 @@
#include <linux/delay.h>
-#include "cluster/heartbeat.h"
-#include "cluster/nodemanager.h"
-#include "cluster/tcp.h"
+#include "../cluster/heartbeat.h"
+#include "../cluster/nodemanager.h"
+#include "../cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
@@ -50,9 +33,9 @@
#include "dlmconvert.h"
#define MLOG_MASK_PREFIX ML_DLM
-#include "cluster/masklog.h"
+#include "../cluster/masklog.h"
-static struct kmem_cache *dlm_lock_cache = NULL;
+static struct kmem_cache *dlm_lock_cache;
static DEFINE_SPINLOCK(dlm_cookie_lock);
static u64 dlm_next_cookie = 1;
@@ -77,8 +60,7 @@ int dlm_init_lock_cache(void)
void dlm_destroy_lock_cache(void)
{
- if (dlm_lock_cache)
- kmem_cache_destroy(dlm_lock_cache);
+ kmem_cache_destroy(dlm_lock_cache);
}
/* Tell us whether we can grant a new lock request.
@@ -91,19 +73,14 @@ void dlm_destroy_lock_cache(void)
static int dlm_can_grant_new_lock(struct dlm_lock_resource *res,
struct dlm_lock *lock)
{
- struct list_head *iter;
struct dlm_lock *tmplock;
- list_for_each(iter, &res->granted) {
- tmplock = list_entry(iter, struct dlm_lock, list);
-
+ list_for_each_entry(tmplock, &res->granted, list) {
if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
return 0;
}
- list_for_each(iter, &res->converting) {
- tmplock = list_entry(iter, struct dlm_lock, list);
-
+ list_for_each_entry(tmplock, &res->converting, list) {
if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
return 0;
if (!dlm_lock_compatible(tmplock->ml.convert_type,
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 33ecbe0e6734..4145e06d2c08 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1,27 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* dlmmod.c
*
* standalone DLM module
*
* Copyright (C) 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
*/
@@ -38,11 +21,11 @@
#include <linux/inet.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
+#include <linux/string_choices.h>
-
-#include "cluster/heartbeat.h"
-#include "cluster/nodemanager.h"
-#include "cluster/tcp.h"
+#include "../cluster/heartbeat.h"
+#include "../cluster/nodemanager.h"
+#include "../cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
@@ -50,7 +33,7 @@
#include "dlmdebug.h"
#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
-#include "cluster/masklog.h"
+#include "../cluster/masklog.h"
static void dlm_mle_node_down(struct dlm_ctxt *dlm,
struct dlm_master_list_entry *mle,
@@ -82,9 +65,9 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
return 1;
}
-static struct kmem_cache *dlm_lockres_cache = NULL;
-static struct kmem_cache *dlm_lockname_cache = NULL;
-static struct kmem_cache *dlm_mle_cache = NULL;
+static struct kmem_cache *dlm_lockres_cache;
+static struct kmem_cache *dlm_lockname_cache;
+static struct kmem_cache *dlm_mle_cache;
static void dlm_mle_release(struct kref *kref);
static void dlm_init_mle(struct dlm_master_list_entry *mle,
@@ -233,7 +216,7 @@ static void __dlm_put_mle(struct dlm_master_list_entry *mle)
assert_spin_locked(&dlm->spinlock);
assert_spin_locked(&dlm->master_lock);
- if (!atomic_read(&mle->mle_refs.refcount)) {
+ if (!kref_read(&mle->mle_refs)) {
/* this may or may not crash, but who cares.
* it's a BUG. */
mlog(ML_ERROR, "bad mle: %p\n", mle);
@@ -275,12 +258,12 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
mle->type = type;
INIT_HLIST_NODE(&mle->master_hash_node);
INIT_LIST_HEAD(&mle->hb_events);
- memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
+ bitmap_zero(mle->maybe_map, O2NM_MAX_NODES);
spin_lock_init(&mle->spinlock);
init_waitqueue_head(&mle->wq);
atomic_set(&mle->woken, 0);
kref_init(&mle->mle_refs);
- memset(mle->response_map, 0, sizeof(mle->response_map));
+ bitmap_zero(mle->response_map, O2NM_MAX_NODES);
mle->master = O2NM_MAX_NODES;
mle->new_master = O2NM_MAX_NODES;
mle->inuse = 0;
@@ -307,8 +290,8 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
atomic_inc(&dlm->mle_cur_count[mle->type]);
/* copy off the node_map and register hb callbacks on our copy */
- memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map));
- memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map));
+ bitmap_copy(mle->node_map, dlm->domain_map, O2NM_MAX_NODES);
+ bitmap_copy(mle->vote_map, dlm->domain_map, O2NM_MAX_NODES);
clear_bit(dlm->node_num, mle->vote_map);
clear_bit(dlm->node_num, mle->node_map);
@@ -342,16 +325,13 @@ static int dlm_find_mle(struct dlm_ctxt *dlm,
{
struct dlm_master_list_entry *tmpmle;
struct hlist_head *bucket;
- struct hlist_node *list;
unsigned int hash;
assert_spin_locked(&dlm->master_lock);
hash = dlm_lockid_hash(name, namelen);
bucket = dlm_master_hash(dlm, hash);
- hlist_for_each(list, bucket) {
- tmpmle = hlist_entry(list, struct dlm_master_list_entry,
- master_hash_node);
+ hlist_for_each_entry(tmpmle, bucket, master_hash_node) {
if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
continue;
dlm_get_mle(tmpmle);
@@ -417,8 +397,7 @@ int dlm_init_mle_cache(void)
void dlm_destroy_mle_cache(void)
{
- if (dlm_mle_cache)
- kmem_cache_destroy(dlm_mle_cache);
+ kmem_cache_destroy(dlm_mle_cache);
}
static void dlm_mle_release(struct kref *kref)
@@ -475,11 +454,11 @@ bail:
void dlm_destroy_master_caches(void)
{
- if (dlm_lockname_cache)
- kmem_cache_destroy(dlm_lockname_cache);
+ kmem_cache_destroy(dlm_lockname_cache);
+ dlm_lockname_cache = NULL;
- if (dlm_lockres_cache)
- kmem_cache_destroy(dlm_lockres_cache);
+ kmem_cache_destroy(dlm_lockres_cache);
+ dlm_lockres_cache = NULL;
}
static void dlm_lockres_release(struct kref *kref)
@@ -497,16 +476,6 @@ static void dlm_lockres_release(struct kref *kref)
mlog(0, "destroying lockres %.*s\n", res->lockname.len,
res->lockname.name);
- spin_lock(&dlm->track_lock);
- if (!list_empty(&res->tracking))
- list_del_init(&res->tracking);
- else {
- mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n",
- res->lockname.len, res->lockname.name);
- dlm_print_one_lock_resource(res);
- }
- spin_unlock(&dlm->track_lock);
-
atomic_dec(&dlm->res_cur_count);
if (!hlist_unhashed(&res->hash_node) ||
@@ -580,6 +549,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
atomic_set(&res->asts_reserved, 0);
res->migration_pending = 0;
res->inflight_locks = 0;
+ res->inflight_assert_workers = 0;
res->dlm = dlm;
@@ -597,12 +567,12 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
res->last_used = 0;
- spin_lock(&dlm->spinlock);
+ spin_lock(&dlm->track_lock);
list_add_tail(&res->tracking, &dlm->tracking_list);
- spin_unlock(&dlm->spinlock);
+ spin_unlock(&dlm->track_lock);
memset(res->lvb, 0, DLM_LVB_LEN);
- memset(res->refmap, 0, sizeof(res->refmap));
+ bitmap_zero(res->refmap, O2NM_MAX_NODES);
}
struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
@@ -623,9 +593,6 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
return res;
error:
- if (res && res->lockname.name)
- kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);
-
if (res)
kmem_cache_free(dlm_lockres_cache, res);
return NULL;
@@ -653,12 +620,9 @@ void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
clear_bit(bit, res->refmap);
}
-
-void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
+static void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
- assert_spin_locked(&res->spinlock);
-
res->inflight_locks++;
mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name,
@@ -666,6 +630,13 @@ void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
__builtin_return_address(0));
}
+void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ assert_spin_locked(&res->spinlock);
+ __dlm_lockres_grab_inflight_ref(dlm, res);
+}
+
void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
@@ -682,6 +653,35 @@ void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
wake_up(&res->wq);
}
+void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ assert_spin_locked(&res->spinlock);
+ res->inflight_assert_workers++;
+ mlog(0, "%s:%.*s: inflight assert worker++: now %u\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ res->inflight_assert_workers);
+}
+
+static void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ assert_spin_locked(&res->spinlock);
+ BUG_ON(res->inflight_assert_workers == 0);
+ res->inflight_assert_workers--;
+ mlog(0, "%s:%.*s: inflight assert worker--: now %u\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ res->inflight_assert_workers);
+}
+
+static void dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ spin_lock(&res->spinlock);
+ __dlm_lockres_drop_inflight_worker(dlm, res);
+ spin_unlock(&res->spinlock);
+}
+
/*
* lookup a lock resource by name.
* may already exist in the hashtable.
@@ -725,6 +725,19 @@ lookup:
if (tmpres) {
spin_unlock(&dlm->spinlock);
spin_lock(&tmpres->spinlock);
+
+ /*
+ * Right after dlm spinlock was released, dlm_thread could have
+ * purged the lockres. Check if lockres got unhashed. If so
+ * start over.
+ */
+ if (hlist_unhashed(&tmpres->hash_node)) {
+ spin_unlock(&tmpres->spinlock);
+ dlm_lockres_put(tmpres);
+ tmpres = NULL;
+ goto lookup;
+ }
+
/* Wait on the thread that is mastering the resource */
if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
__dlm_wait_on_lockres(tmpres);
@@ -750,8 +763,18 @@ lookup:
dlm_lockres_grab_inflight_ref(dlm, tmpres);
spin_unlock(&tmpres->spinlock);
- if (res)
+ if (res) {
+ spin_lock(&dlm->track_lock);
+ if (!list_empty(&res->tracking))
+ list_del_init(&res->tracking);
+ else
+ mlog(ML_ERROR, "Resource %.*s not "
+ "on the Tracking list\n",
+ res->lockname.len,
+ res->lockname.name);
+ spin_unlock(&dlm->track_lock);
dlm_lockres_put(res);
+ }
res = tmpres;
goto leave;
}
@@ -838,7 +861,7 @@ lookup:
* to see if there are any nodes that still need to be
* considered. these will not appear in the mle nodemap
* but they might own this lockres. wait on them. */
- bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
+ bit = find_first_bit(dlm->recovery_map, O2NM_MAX_NODES);
if (bit < O2NM_MAX_NODES) {
mlog(0, "%s: res %.*s, At least one node (%d) "
"to recover before lock mastery can begin\n",
@@ -855,10 +878,8 @@ lookup:
/* finally add the lockres to its hash bucket */
__dlm_insert_lockres(dlm, res);
- /* Grab inflight ref to pin the resource */
- spin_lock(&res->spinlock);
- dlm_lockres_grab_inflight_ref(dlm, res);
- spin_unlock(&res->spinlock);
+ /* since this lockres is new it doesn't not require the spinlock */
+ __dlm_lockres_grab_inflight_ref(dlm, res);
/* get an extra ref on the mle in case this is a BLOCK
* if so, the creator of the BLOCK may try to put the last
@@ -891,7 +912,7 @@ redo_request:
dlm_wait_for_recovery(dlm);
spin_lock(&dlm->spinlock);
- bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
+ bit = find_first_bit(dlm->recovery_map, O2NM_MAX_NODES);
if (bit < O2NM_MAX_NODES) {
mlog(0, "%s: res %.*s, At least one node (%d) "
"to recover before lock mastery can begin\n",
@@ -1015,10 +1036,10 @@ recheck:
spin_lock(&mle->spinlock);
m = mle->master;
- map_changed = (memcmp(mle->vote_map, mle->node_map,
- sizeof(mle->vote_map)) != 0);
- voting_done = (memcmp(mle->vote_map, mle->response_map,
- sizeof(mle->vote_map)) == 0);
+ map_changed = !bitmap_equal(mle->vote_map, mle->node_map,
+ O2NM_MAX_NODES);
+ voting_done = bitmap_equal(mle->vote_map, mle->response_map,
+ O2NM_MAX_NODES);
/* restart if we hit any errors */
if (map_changed) {
@@ -1058,7 +1079,7 @@ recheck:
sleep = 1;
/* have all nodes responded? */
if (voting_done && !*blocked) {
- bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
+ bit = find_first_bit(mle->maybe_map, O2NM_MAX_NODES);
if (dlm->node_num <= bit) {
/* my node number is lowest.
* now tell other nodes that I am
@@ -1079,13 +1100,6 @@ recheck:
/* sleep if we haven't finished voting yet */
if (sleep) {
unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS);
-
- /*
- if (atomic_read(&mle->mle_refs.refcount) < 2)
- mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle,
- atomic_read(&mle->mle_refs.refcount),
- res->lockname.len, res->lockname.name);
- */
atomic_set(&mle->woken, 0);
(void)wait_event_timeout(mle->wq,
(atomic_read(&mle->woken) == 1),
@@ -1220,8 +1234,8 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
} else {
mlog(ML_ERROR, "node down! %d\n", node);
if (blocked) {
- int lowest = find_next_bit(mle->maybe_map,
- O2NM_MAX_NODES, 0);
+ int lowest = find_first_bit(mle->maybe_map,
+ O2NM_MAX_NODES);
/* act like it was never there */
clear_bit(node, mle->maybe_map);
@@ -1263,11 +1277,11 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
/* now blank out everything, as if we had never
* contacted anyone */
- memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
- memset(mle->response_map, 0, sizeof(mle->response_map));
+ bitmap_zero(mle->maybe_map, O2NM_MAX_NODES);
+ bitmap_zero(mle->response_map, O2NM_MAX_NODES);
/* reset the vote_map to the current node_map */
- memcpy(mle->vote_map, mle->node_map,
- sizeof(mle->node_map));
+ bitmap_copy(mle->vote_map, mle->node_map,
+ O2NM_MAX_NODES);
/* put myself into the maybe map */
if (mle->type != DLM_MLE_BLOCK)
set_bit(dlm->node_num, mle->maybe_map);
@@ -1396,6 +1410,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data,
int found, ret;
int set_maybe;
int dispatch_assert = 0;
+ int dispatched = 0;
if (!dlm_grab(dlm))
return DLM_MASTER_RESP_NO;
@@ -1422,6 +1437,18 @@ way_up_top:
/* take care of the easy cases up front */
spin_lock(&res->spinlock);
+
+ /*
+ * Right after dlm spinlock was released, dlm_thread could have
+ * purged the lockres. Check if lockres got unhashed. If so
+ * start over.
+ */
+ if (hlist_unhashed(&res->hash_node)) {
+ spin_unlock(&res->spinlock);
+ dlm_lockres_put(res);
+ goto way_up_top;
+ }
+
if (res->state & (DLM_LOCK_RES_RECOVERING|
DLM_LOCK_RES_MIGRATING)) {
spin_unlock(&res->spinlock);
@@ -1450,7 +1477,6 @@ way_up_top:
goto send_response;
} else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
spin_unlock(&res->spinlock);
- // mlog(0, "node %u is the master\n", res->owner);
response = DLM_MASTER_RESP_NO;
if (mle)
kmem_cache_free(dlm_mle_cache, mle);
@@ -1466,7 +1492,6 @@ way_up_top:
BUG();
}
- // mlog(0, "lockres is in progress...\n");
spin_lock(&dlm->master_lock);
found = dlm_find_mle(dlm, &tmpmle, name, namelen);
if (!found) {
@@ -1476,8 +1501,6 @@ way_up_top:
set_maybe = 1;
spin_lock(&tmpmle->spinlock);
if (tmpmle->type == DLM_MLE_BLOCK) {
- // mlog(0, "this node is waiting for "
- // "lockres to be mastered\n");
response = DLM_MASTER_RESP_NO;
} else if (tmpmle->type == DLM_MLE_MIGRATION) {
mlog(0, "node %u is master, but trying to migrate to "
@@ -1504,8 +1527,6 @@ way_up_top:
} else
response = DLM_MASTER_RESP_NO;
} else {
- // mlog(0, "this node is attempting to "
- // "master lockres\n");
response = DLM_MASTER_RESP_MAYBE;
}
if (set_maybe)
@@ -1532,7 +1553,6 @@ way_up_top:
found = dlm_find_mle(dlm, &tmpmle, name, namelen);
if (!found) {
/* this lockid has never been seen on this node yet */
- // mlog(0, "no mle found\n");
if (!mle) {
spin_unlock(&dlm->master_lock);
spin_unlock(&dlm->spinlock);
@@ -1546,15 +1566,11 @@ way_up_top:
goto way_up_top;
}
- // mlog(0, "this is second time thru, already allocated, "
- // "add the block.\n");
dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen);
set_bit(request->node_idx, mle->maybe_map);
__dlm_insert_mle(dlm, mle);
response = DLM_MASTER_RESP_NO;
} else {
- // mlog(0, "mle was found\n");
- set_maybe = 1;
spin_lock(&tmpmle->spinlock);
if (tmpmle->master == dlm->node_num) {
mlog(ML_ERROR, "no lockres, but an mle with this node as master!\n");
@@ -1569,8 +1585,7 @@ way_up_top:
response = DLM_MASTER_RESP_NO;
} else
response = DLM_MASTER_RESP_MAYBE;
- if (set_maybe)
- set_bit(request->node_idx, tmpmle->maybe_map);
+ set_bit(request->node_idx, tmpmle->maybe_map);
spin_unlock(&tmpmle->spinlock);
}
spin_unlock(&dlm->master_lock);
@@ -1588,27 +1603,28 @@ send_response:
* dlm_assert_master_worker() isn't called, we drop it here.
*/
if (dispatch_assert) {
- if (response != DLM_MASTER_RESP_YES)
- mlog(ML_ERROR, "invalid response %d\n", response);
- if (!res) {
- mlog(ML_ERROR, "bad lockres while trying to assert!\n");
- BUG();
- }
mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
dlm->node_num, res->lockname.len, res->lockname.name);
+ spin_lock(&res->spinlock);
ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx,
DLM_ASSERT_MASTER_MLE_CLEANUP);
if (ret < 0) {
mlog(ML_ERROR, "failed to dispatch assert master work\n");
response = DLM_MASTER_RESP_ERROR;
+ spin_unlock(&res->spinlock);
dlm_lockres_put(res);
+ } else {
+ dispatched = 1;
+ __dlm_lockres_grab_inflight_worker(dlm, res);
+ spin_unlock(&res->spinlock);
}
} else {
if (res)
dlm_lockres_put(res);
}
- dlm_put(dlm);
+ if (!dispatched)
+ dlm_put(dlm);
return response;
}
@@ -1770,7 +1786,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
"MLE for it! (%.*s)\n", assert->node_idx,
namelen, name);
} else {
- int bit = find_next_bit (mle->maybe_map, O2NM_MAX_NODES, 0);
+ int bit = find_first_bit(mle->maybe_map, O2NM_MAX_NODES);
if (bit >= O2NM_MAX_NODES) {
/* not necessarily an error, though less likely.
* could be master just re-asserting. */
@@ -1872,8 +1888,6 @@ ok:
spin_unlock(&res->spinlock);
}
- // mlog(0, "woo! got an assert_master from node %u!\n",
- // assert->node_idx);
if (mle) {
int extra_ref = 0;
int nn = -1;
@@ -1888,8 +1902,10 @@ ok:
* up nodes that this node contacted */
while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES,
nn+1)) < O2NM_MAX_NODES) {
- if (nn != dlm->node_num && nn != assert->node_idx)
+ if (nn != dlm->node_num && nn != assert->node_idx) {
master_request = 1;
+ break;
+ }
}
}
mle->master = assert->node_idx;
@@ -1923,7 +1939,7 @@ ok:
* on this mle. */
spin_lock(&dlm->master_lock);
- rr = atomic_read(&mle->mle_refs.refcount);
+ rr = kref_read(&mle->mle_refs);
if (mle->inuse > 0) {
if (extra_ref && rr < 3)
err = 1;
@@ -1995,6 +2011,10 @@ kill:
"and killing the other node now! This node is OK and can continue.\n");
__dlm_print_one_lock_resource(res);
spin_unlock(&res->spinlock);
+ spin_lock(&dlm->master_lock);
+ if (mle)
+ __dlm_put_mle(mle);
+ spin_unlock(&dlm->master_lock);
spin_unlock(&dlm->spinlock);
*ret_data = (void *)res;
dlm_put(dlm);
@@ -2026,7 +2046,6 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
/* queue up work for dlm_assert_master_worker */
- dlm_grab(dlm); /* get an extra ref for the work item */
dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL);
item->u.am.lockres = res; /* already have a ref */
/* can optionally ignore node numbers higher than this node */
@@ -2064,7 +2083,7 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
flags = item->u.am.flags;
spin_lock(&dlm->spinlock);
- memcpy(nodemap, dlm->domain_map, sizeof(nodemap));
+ bitmap_copy(nodemap, dlm->domain_map, O2NM_MAX_NODES);
spin_unlock(&dlm->spinlock);
clear_bit(dlm->node_num, nodemap);
@@ -2115,6 +2134,8 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
dlm_lockres_release_ast(dlm, res);
put:
+ dlm_lockres_drop_inflight_worker(dlm, res);
+
dlm_lockres_put(res);
mlog(0, "finished with dlm_assert_master_worker\n");
@@ -2127,7 +2148,7 @@ put:
* think that $RECOVERY is currently mastered by a dead node. If so,
* we wait a short time to allow that node to get notified by its own
* heartbeat stack, then check again. All $RECOVERY lock resources
- * mastered by dead nodes are purged when the hearbeat callback is
+ * mastered by dead nodes are purged when the heartbeat callback is
* fired, so we can know for sure that it is safe to continue once
* the node returns a live node or no node. */
static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
@@ -2206,8 +2227,11 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n",
dlm->name, namelen, lockname, res->owner, r);
dlm_print_one_lock_resource(res);
- BUG();
- }
+ if (r == -ENOMEM)
+ BUG();
+ } else
+ ret = r;
+
return ret;
}
@@ -2275,7 +2299,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
res->lockname.len, res->lockname.name, node);
dlm_print_one_lock_resource(res);
}
- ret = 0;
+ ret = DLM_DEREF_RESPONSE_DONE;
goto done;
}
@@ -2295,7 +2319,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
spin_unlock(&dlm->work_lock);
queue_work(dlm->dlm_worker, &dlm->dispatched_work);
- return 0;
+ return DLM_DEREF_RESPONSE_INPROG;
done:
if (res)
@@ -2305,6 +2329,102 @@ done:
return ret;
}
+int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
+{
+ struct dlm_ctxt *dlm = data;
+ struct dlm_deref_lockres_done *deref
+ = (struct dlm_deref_lockres_done *)msg->buf;
+ struct dlm_lock_resource *res = NULL;
+ char *name;
+ unsigned int namelen;
+ int ret = -EINVAL;
+ u8 node;
+ unsigned int hash;
+
+ if (!dlm_grab(dlm))
+ return 0;
+
+ name = deref->name;
+ namelen = deref->namelen;
+ node = deref->node_idx;
+
+ if (namelen > DLM_LOCKID_NAME_MAX) {
+ mlog(ML_ERROR, "Invalid name length!");
+ goto done;
+ }
+ if (deref->node_idx >= O2NM_MAX_NODES) {
+ mlog(ML_ERROR, "Invalid node number: %u\n", node);
+ goto done;
+ }
+
+ hash = dlm_lockid_hash(name, namelen);
+
+ spin_lock(&dlm->spinlock);
+ res = __dlm_lookup_lockres_full(dlm, name, namelen, hash);
+ if (!res) {
+ spin_unlock(&dlm->spinlock);
+ mlog(ML_ERROR, "%s:%.*s: bad lockres name\n",
+ dlm->name, namelen, name);
+ goto done;
+ }
+
+ spin_lock(&res->spinlock);
+ if (!(res->state & DLM_LOCK_RES_DROPPING_REF)) {
+ spin_unlock(&res->spinlock);
+ spin_unlock(&dlm->spinlock);
+ mlog(ML_NOTICE, "%s:%.*s: node %u sends deref done "
+ "but it is already derefed!\n", dlm->name,
+ res->lockname.len, res->lockname.name, node);
+ ret = 0;
+ goto done;
+ }
+
+ __dlm_do_purge_lockres(dlm, res);
+ spin_unlock(&res->spinlock);
+ wake_up(&res->wq);
+
+ spin_unlock(&dlm->spinlock);
+
+ ret = 0;
+done:
+ if (res)
+ dlm_lockres_put(res);
+ dlm_put(dlm);
+ return ret;
+}
+
+static void dlm_drop_lockres_ref_done(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res, u8 node)
+{
+ struct dlm_deref_lockres_done deref;
+ int ret = 0, r;
+ const char *lockname;
+ unsigned int namelen;
+
+ lockname = res->lockname.name;
+ namelen = res->lockname.len;
+ BUG_ON(namelen > O2NM_MAX_NAME_LEN);
+
+ memset(&deref, 0, sizeof(deref));
+ deref.node_idx = dlm->node_num;
+ deref.namelen = namelen;
+ memcpy(deref.name, lockname, namelen);
+
+ ret = o2net_send_message(DLM_DEREF_LOCKRES_DONE, dlm->key,
+ &deref, sizeof(deref), node, &r);
+ if (ret < 0) {
+ mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF DONE "
+ " to node %u\n", dlm->name, namelen,
+ lockname, ret, node);
+ } else if (r < 0) {
+ /* ignore the error */
+ mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n",
+ dlm->name, namelen, lockname, node, r);
+ dlm_print_one_lock_resource(res);
+ }
+}
+
static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
{
struct dlm_ctxt *dlm;
@@ -2318,13 +2438,15 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
spin_lock(&res->spinlock);
BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
+ __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
if (test_bit(node, res->refmap)) {
- __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
dlm_lockres_clear_refmap_bit(dlm, res, node);
cleared = 1;
}
spin_unlock(&res->spinlock);
+ dlm_drop_lockres_ref_done(dlm, res, node);
+
if (cleared) {
mlog(0, "%s:%.*s node %u ref dropped in dispatch\n",
dlm->name, res->lockname.len, res->lockname.name, node);
@@ -2340,13 +2462,13 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
}
/*
- * A migrateable resource is one that is :
+ * A migratable resource is one that is :
* 1. locally mastered, and,
* 2. zero local locks, and,
* 3. one or more non-local locks, or, one or more references
* Returns 1 if yes, 0 if not.
*/
-static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
+static int dlm_is_lockres_migratable(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
enum dlm_lockres_list idx;
@@ -2357,6 +2479,15 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
assert_spin_locked(&res->spinlock);
+ /* delay migration when the lockres is in MIGRATING state */
+ if (res->state & DLM_LOCK_RES_MIGRATING)
+ return 0;
+
+ /* delay migration when the lockres is in RECOCERING state */
+ if (res->state & (DLM_LOCK_RES_RECOVERING|
+ DLM_LOCK_RES_RECOVERY_WAITING))
+ return 0;
+
if (res->owner != dlm->node_num)
return 0;
@@ -2368,7 +2499,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
continue;
}
cookie = be64_to_cpu(lock->ml.cookie);
- mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on "
+ mlog(0, "%s: Not migratable res %.*s, lock %u:%llu on "
"%s list\n", dlm->name, res->lockname.len,
res->lockname.name,
dlm_get_lock_cookie_node(cookie),
@@ -2379,12 +2510,12 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
}
if (!nonlocal) {
- node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
+ node_ref = find_first_bit(res->refmap, O2NM_MAX_NODES);
if (node_ref >= O2NM_MAX_NODES)
return 0;
}
- mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len,
+ mlog(0, "%s: res %.*s, Migratable\n", dlm->name, res->lockname.len,
res->lockname.name);
return 1;
@@ -2410,8 +2541,6 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
if (!dlm_grab(dlm))
return -EINVAL;
- BUG_ON(target == O2NM_MAX_NODES);
-
name = res->lockname.name;
namelen = res->lockname.len;
@@ -2441,6 +2570,13 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
spin_lock(&dlm->master_lock);
ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
namelen, target, dlm->node_num);
+ /* get an extra reference on the mle.
+ * otherwise the assert_master from the new
+ * master will destroy this.
+ */
+ if (ret != -EEXIST)
+ dlm_get_mle_inuse(mle);
+
spin_unlock(&dlm->master_lock);
spin_unlock(&dlm->spinlock);
@@ -2466,7 +2602,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
}
fail:
- if (oldmle) {
+ if (ret != -EEXIST && oldmle) {
/* master is known, detach if not already detached */
dlm_mle_detach_hb_events(dlm, oldmle);
dlm_put_mle(oldmle);
@@ -2476,6 +2612,7 @@ fail:
if (mle_added) {
dlm_mle_detach_hb_events(dlm, mle);
dlm_put_mle(mle);
+ dlm_put_mle_inuse(mle);
} else if (mle) {
kmem_cache_free(dlm_mle_cache, mle);
mle = NULL;
@@ -2493,17 +2630,6 @@ fail:
* ensure that all assert_master work is flushed. */
flush_workqueue(dlm->dlm_worker);
- /* get an extra reference on the mle.
- * otherwise the assert_master from the new
- * master will destroy this.
- * also, make sure that all callers of dlm_get_mle
- * take both dlm->spinlock and dlm->master_lock */
- spin_lock(&dlm->spinlock);
- spin_lock(&dlm->master_lock);
- dlm_get_mle_inuse(mle);
- spin_unlock(&dlm->master_lock);
- spin_unlock(&dlm->spinlock);
-
/* notify new node and send all lock state */
/* call send_one_lockres with migration flag.
* this serves as notice to the target node that a
@@ -2610,8 +2736,6 @@ leave:
return ret;
}
-#define DLM_MIGRATION_RETRY_MS 100
-
/*
* Should be called only after beginning the domain leave process.
* There should not be any remaining locks on nonlocal lock resources,
@@ -2623,6 +2747,7 @@ leave:
* Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped
*/
int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
+ __must_hold(&dlm->spinlock)
{
int ret;
int lock_dropped = 0;
@@ -2631,7 +2756,7 @@ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
assert_spin_locked(&dlm->spinlock);
spin_lock(&res->spinlock);
- if (dlm_is_lockres_migrateable(dlm, res))
+ if (dlm_is_lockres_migratable(dlm, res))
target = dlm_pick_migration_target(dlm, res);
spin_unlock(&res->spinlock);
@@ -2723,7 +2848,7 @@ static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
dlm_lockres_release_ast(dlm, res);
mlog(0, "about to wait on migration_wq, dirty=%s\n",
- res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no");
+ str_yes_no(res->state & DLM_LOCK_RES_DIRTY));
/* if the extra ref we just put was the final one, this
* will pass thru immediately. otherwise, we need to wait
* for the last ast to finish. */
@@ -2733,12 +2858,12 @@ again:
msecs_to_jiffies(1000));
if (ret < 0) {
mlog(0, "woken again: migrating? %s, dead? %s\n",
- res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
- test_bit(target, dlm->domain_map) ? "no":"yes");
+ str_yes_no(res->state & DLM_LOCK_RES_MIGRATING),
+ str_no_yes(test_bit(target, dlm->domain_map)));
} else {
mlog(0, "all is well: migrating? %s, dead? %s\n",
- res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
- test_bit(target, dlm->domain_map) ? "no":"yes");
+ str_yes_no(res->state & DLM_LOCK_RES_MIGRATING),
+ str_no_yes(test_bit(target, dlm->domain_map)));
}
if (!dlm_migration_can_proceed(dlm, res, target)) {
mlog(0, "trying again...\n");
@@ -2758,13 +2883,15 @@ again:
/*
* if target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for
* another try; otherwise, we are sure the MIGRATING state is there,
- * drop the unneded state which blocked threads trying to DIRTY
+ * drop the unneeded state which blocked threads trying to DIRTY
*/
spin_lock(&res->spinlock);
BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
if (!ret)
BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
+ else
+ res->migration_pending = 0;
spin_unlock(&res->spinlock);
/*
@@ -2839,7 +2966,7 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
enum dlm_lockres_list idx;
- struct list_head *queue = &res->granted;
+ struct list_head *queue;
struct dlm_lock *lock;
int noderef;
u8 nodenum = O2NM_MAX_NODES;
@@ -2970,7 +3097,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
int ret = 0;
if (!dlm_grab(dlm))
- return -EINVAL;
+ return 0;
name = migrate->name;
namelen = migrate->namelen;
@@ -3011,6 +3138,9 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
migrate->new_master,
migrate->master);
+ if (ret < 0)
+ kmem_cache_free(dlm_mle_cache, mle);
+
spin_unlock(&dlm->master_lock);
unlock:
spin_unlock(&dlm->spinlock);
@@ -3061,7 +3191,8 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
mlog(0, "tried to migrate %.*s, but some "
"process beat me to it\n",
namelen, name);
- ret = -EEXIST;
+ spin_unlock(&tmp->spinlock);
+ return -EEXIST;
} else {
/* bad. 2 NODES are trying to migrate! */
mlog(ML_ERROR, "migration error mle: "
@@ -3081,11 +3212,15 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
/* remove it so that only one mle will be found */
__dlm_unlink_mle(dlm, tmp);
__dlm_mle_detach_hb_events(dlm, tmp);
- ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
- mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
- "telling master to get ref for cleared out mle "
- "during migration\n", dlm->name, namelen, name,
- master, new_master);
+ if (tmp->type == DLM_MLE_MASTER) {
+ ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
+ mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
+ "telling master to get ref "
+ "for cleared out mle during "
+ "migration\n", dlm->name,
+ namelen, name, master,
+ new_master);
+ }
}
spin_unlock(&tmp->spinlock);
}
@@ -3157,7 +3292,7 @@ static void dlm_clean_block_mle(struct dlm_ctxt *dlm,
BUG_ON(mle->type != DLM_MLE_BLOCK);
spin_lock(&mle->spinlock);
- bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
+ bit = find_first_bit(mle->maybe_map, O2NM_MAX_NODES);
if (bit != dead_node) {
mlog(0, "mle found, but dead node %u would not have been "
"master\n", dead_node);
@@ -3183,7 +3318,7 @@ void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
struct dlm_master_list_entry *mle;
struct dlm_lock_resource *res;
struct hlist_head *bucket;
- struct hlist_node *list;
+ struct hlist_node *tmp;
unsigned int i;
mlog(0, "dlm=%s, dead node=%u\n", dlm->name, dead_node);
@@ -3194,10 +3329,7 @@ top:
spin_lock(&dlm->master_lock);
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
bucket = dlm_master_hash(dlm, i);
- hlist_for_each(list, bucket) {
- mle = hlist_entry(list, struct dlm_master_list_entry,
- master_hash_node);
-
+ hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) {
BUG_ON(mle->type != DLM_MLE_BLOCK &&
mle->type != DLM_MLE_MASTER &&
mle->type != DLM_MLE_MIGRATION);
@@ -3231,6 +3363,15 @@ top:
mle->new_master != dead_node)
continue;
+ if (mle->new_master == dead_node && mle->inuse) {
+ mlog(ML_NOTICE, "%s: target %u died during "
+ "migration from %u, the MLE is "
+ "still keep used, ignore it!\n",
+ dlm->name, dead_node,
+ mle->master);
+ continue;
+ }
+
/* If we have reached this point, this mle needs to be
* removed from the list and freed. */
dlm_clean_migration_mle(dlm, mle);
@@ -3295,7 +3436,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
ret = 0;
}
- memset(iter.node_map, 0, sizeof(iter.node_map));
+ bitmap_zero(iter.node_map, O2NM_MAX_NODES);
set_bit(old_master, iter.node_map);
mlog(0, "doing assert master of %.*s back to %u\n",
res->lockname.len, res->lockname.name, old_master);
@@ -3378,7 +3519,7 @@ void dlm_force_free_mles(struct dlm_ctxt *dlm)
int i;
struct hlist_head *bucket;
struct dlm_master_list_entry *mle;
- struct hlist_node *tmp, *list;
+ struct hlist_node *tmp;
/*
* We notified all other nodes that we are exiting the domain and
@@ -3390,13 +3531,11 @@ void dlm_force_free_mles(struct dlm_ctxt *dlm)
spin_lock(&dlm->master_lock);
BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING);
- BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES));
+ BUG_ON((find_first_bit(dlm->domain_map, O2NM_MAX_NODES) < O2NM_MAX_NODES));
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
bucket = dlm_master_hash(dlm, i);
- hlist_for_each_safe(list, tmp, bucket) {
- mle = hlist_entry(list, struct dlm_master_list_entry,
- master_hash_node);
+ hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) {
if (mle->type != DLM_MLE_BLOCK) {
mlog(ML_ERROR, "bad mle: %p\n", mle);
dlm_print_one_mle(mle);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 773bd32bfd8c..843ee02bd85f 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1,27 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* dlmrecovery.c
*
* recovery stuff
*
* Copyright (C) 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
*/
@@ -39,18 +22,18 @@
#include <linux/timer.h>
#include <linux/kthread.h>
#include <linux/delay.h>
+#include <linux/string_choices.h>
-
-#include "cluster/heartbeat.h"
-#include "cluster/nodemanager.h"
-#include "cluster/tcp.h"
+#include "../cluster/heartbeat.h"
+#include "../cluster/nodemanager.h"
+#include "../cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
#include "dlmdomain.h"
#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_RECOVERY)
-#include "cluster/masklog.h"
+#include "../cluster/masklog.h"
static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node);
@@ -62,7 +45,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node);
static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node);
static int dlm_request_all_locks(struct dlm_ctxt *dlm,
u8 request_from, u8 dead_node);
-static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node);
+static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm);
static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res);
static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
@@ -141,13 +124,6 @@ static inline void __dlm_reset_recovery(struct dlm_ctxt *dlm)
dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM);
}
-static inline void dlm_reset_recovery(struct dlm_ctxt *dlm)
-{
- spin_lock(&dlm->spinlock);
- __dlm_reset_recovery(dlm);
- spin_unlock(&dlm->spinlock);
-}
-
/* Worker function used during recovery. */
void dlm_dispatch_work(struct work_struct *work)
{
@@ -205,7 +181,7 @@ int dlm_launch_recovery_thread(struct dlm_ctxt *dlm)
mlog(0, "starting dlm recovery thread...\n");
dlm->dlm_reco_thread_task = kthread_run(dlm_recovery_thread, dlm,
- "dlm_reco_thread");
+ "dlm_reco-%s", dlm->name);
if (IS_ERR(dlm->dlm_reco_thread_task)) {
mlog_errno(PTR_ERR(dlm->dlm_reco_thread_task));
dlm->dlm_reco_thread_task = NULL;
@@ -231,7 +207,7 @@ void dlm_complete_recovery_thread(struct dlm_ctxt *dlm)
* 1) all recovery threads cluster wide will work on recovering
* ONE node at a time
* 2) negotiate who will take over all the locks for the dead node.
- * thats right... ALL the locks.
+ * that's right... ALL the locks.
* 3) once a new master is chosen, everyone scans all locks
* and moves aside those mastered by the dead guy
* 4) each of these locks should be locked until recovery is done
@@ -423,12 +399,11 @@ void dlm_wait_for_recovery(struct dlm_ctxt *dlm)
static void dlm_begin_recovery(struct dlm_ctxt *dlm)
{
- spin_lock(&dlm->spinlock);
+ assert_spin_locked(&dlm->spinlock);
BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n",
dlm->name, dlm->reco.dead_node);
dlm->reco.state |= DLM_RECO_STATE_ACTIVE;
- spin_unlock(&dlm->spinlock);
}
static void dlm_end_recovery(struct dlm_ctxt *dlm)
@@ -456,6 +431,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
spin_lock(&dlm->spinlock);
+ if (dlm->migrate_done) {
+ mlog(0, "%s: no need do recovery after migrating all "
+ "lock resources\n", dlm->name);
+ spin_unlock(&dlm->spinlock);
+ return 0;
+ }
+
/* check to see if the new master has died */
if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM &&
test_bit(dlm->reco.new_master, dlm->recovery_map)) {
@@ -469,7 +451,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
int bit;
- bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES, 0);
+ bit = find_first_bit(dlm->recovery_map, O2NM_MAX_NODES);
if (bit >= O2NM_MAX_NODES || bit < 0)
dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
else
@@ -482,7 +464,6 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
}
if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
- // mlog(0, "nothing to recover! sleeping now!\n");
spin_unlock(&dlm->spinlock);
/* return to main thread loop and sleep. */
return 0;
@@ -490,12 +471,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n",
dlm->name, task_pid_nr(dlm->dlm_reco_thread_task),
dlm->reco.dead_node);
- spin_unlock(&dlm->spinlock);
/* take write barrier */
/* (stops the list reshuffling thread, proxy ast handling) */
dlm_begin_recovery(dlm);
+ spin_unlock(&dlm->spinlock);
+
if (dlm->reco.new_master == dlm->node_num)
goto master_here;
@@ -537,7 +519,10 @@ master_here:
/* success! see if any other nodes need recovery */
mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n",
dlm->name, dlm->reco.dead_node, dlm->node_num);
- dlm_reset_recovery(dlm);
+ spin_lock(&dlm->spinlock);
+ __dlm_reset_recovery(dlm);
+ dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
+ spin_unlock(&dlm->spinlock);
}
dlm_end_recovery(dlm);
@@ -595,8 +580,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
msecs_to_jiffies(1000));
mlog(0, "waited 1 sec for %u, "
"dead? %s\n", ndata->node_num,
- dlm_is_node_dead(dlm, ndata->node_num) ?
- "yes" : "no");
+ str_yes_no(dlm_is_node_dead(dlm, ndata->node_num)));
} else {
/* -ENOMEM on the other node */
mlog(0, "%s: node %u returned "
@@ -691,10 +675,18 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
spin_unlock(&dlm_reco_state_lock);
mlog(0, "pass #%d, all_nodes_done?: %s\n", ++pass,
- all_nodes_done?"yes":"no");
+ str_yes_no(all_nodes_done));
if (all_nodes_done) {
int ret;
+ /* Set this flag on recovery master to avoid
+ * a new recovery for another dead node start
+ * before the recovery is not done. That may
+ * cause recovery hung.*/
+ spin_lock(&dlm->spinlock);
+ dlm->reco.state |= DLM_RECO_STATE_FINALIZE;
+ spin_unlock(&dlm->spinlock);
+
/* all nodes are now in DLM_RECO_NODE_DATA_DONE state
* just send a finalize message to everyone and
* clean up */
@@ -728,7 +720,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
}
if (destroy)
- dlm_destroy_recovery_area(dlm, dead_node);
+ dlm_destroy_recovery_area(dlm);
return status;
}
@@ -739,7 +731,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
struct dlm_reco_node_data *ndata;
spin_lock(&dlm->spinlock);
- memcpy(dlm->reco.node_map, dlm->domain_map, sizeof(dlm->domain_map));
+ bitmap_copy(dlm->reco.node_map, dlm->domain_map, O2NM_MAX_NODES);
/* nodes can only be removed (by dying) after dropping
* this lock, and death will be trapped later, so this should do */
spin_unlock(&dlm->spinlock);
@@ -753,7 +745,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
ndata = kzalloc(sizeof(*ndata), GFP_NOFS);
if (!ndata) {
- dlm_destroy_recovery_area(dlm, dead_node);
+ dlm_destroy_recovery_area(dlm);
return -ENOMEM;
}
ndata->node_num = num;
@@ -767,7 +759,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
return 0;
}
-static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
+static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm)
{
struct dlm_reco_node_data *ndata, *next;
LIST_HEAD(tmplist);
@@ -787,6 +779,7 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
{
struct dlm_lock_request lr;
int ret;
+ int status;
mlog(0, "\n");
@@ -800,13 +793,15 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
// send message
ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key,
- &lr, sizeof(lr), request_from, NULL);
+ &lr, sizeof(lr), request_from, &status);
/* negative status is handled by caller */
if (ret < 0)
mlog(ML_ERROR, "%s: Error %d send LOCK_REQUEST to node %u "
"to recover dead node %u\n", dlm->name, ret,
request_from, dead_node);
+ else
+ ret = status;
// return from here, then
// sleep until all received or error
return ret;
@@ -1056,6 +1051,9 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
dead_node, dlm->name);
list_del_init(&lock->list);
dlm_lock_put(lock);
+ /* Can't schedule DLM_UNLOCK_FREE_LOCK
+ * - do manually */
+ dlm_lock_put(lock);
break;
}
}
@@ -1100,7 +1098,7 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
{
u64 mig_cookie = be64_to_cpu(mres->mig_cookie);
int mres_total_locks = be32_to_cpu(mres->total_locks);
- int sz, ret = 0, status = 0;
+ int ret = 0, status = 0;
u8 orig_flags = mres->flags,
orig_master = mres->master;
@@ -1108,9 +1106,6 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
if (!mres->num_locks)
return 0;
- sz = sizeof(struct dlm_migratable_lockres) +
- (mres->num_locks * sizeof(struct dlm_migratable_lock));
-
/* add an all-done flag if we reached the last lock */
orig_flags = mres->flags;
BUG_ON(total_locks > mres_total_locks);
@@ -1124,7 +1119,8 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
/* send it */
ret = o2net_send_message(DLM_MIG_LOCKRES_MSG, dlm->key, mres,
- sz, send_to, &status);
+ struct_size(mres, ml, mres->num_locks),
+ send_to, &status);
if (ret < 0) {
/* XXX: negative status is not handled.
* this will end up killing this node. */
@@ -1356,10 +1352,20 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
char *buf = NULL;
struct dlm_work_item *item = NULL;
struct dlm_lock_resource *res = NULL;
+ unsigned int hash;
if (!dlm_grab(dlm))
return -EINVAL;
+ if (!dlm_joined(dlm)) {
+ mlog(ML_ERROR, "Domain %s not joined! "
+ "lockres %.*s, master %u\n",
+ dlm->name, mres->lockname_len,
+ mres->lockname, mres->master);
+ dlm_put(dlm);
+ return -EINVAL;
+ }
+
BUG_ON(!(mres->flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION)));
real_master = mres->master;
@@ -1383,11 +1389,26 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
/* lookup the lock to see if we have a secondary queue for this
* already... just add the locks in and this will have its owner
* and RECOVERY flag changed when it completes. */
- res = dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len);
+ hash = dlm_lockid_hash(mres->lockname, mres->lockname_len);
+ spin_lock(&dlm->spinlock);
+ res = __dlm_lookup_lockres_full(dlm, mres->lockname, mres->lockname_len,
+ hash);
if (res) {
/* this will get a ref on res */
/* mark it as recovering/migrating and hash it */
spin_lock(&res->spinlock);
+ if (res->state & DLM_LOCK_RES_DROPPING_REF) {
+ mlog(0, "%s: node is attempting to migrate "
+ "lockres %.*s, but marked as dropping "
+ " ref!\n", dlm->name,
+ mres->lockname_len, mres->lockname);
+ ret = -EINVAL;
+ spin_unlock(&res->spinlock);
+ spin_unlock(&dlm->spinlock);
+ dlm_lockres_put(res);
+ goto leave;
+ }
+
if (mres->flags & DLM_MRES_RECOVERY) {
res->state |= DLM_LOCK_RES_RECOVERING;
} else {
@@ -1404,13 +1425,16 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
mres->lockname_len, mres->lockname);
ret = -EFAULT;
spin_unlock(&res->spinlock);
+ spin_unlock(&dlm->spinlock);
dlm_lockres_put(res);
goto leave;
}
res->state |= DLM_LOCK_RES_MIGRATING;
}
spin_unlock(&res->spinlock);
+ spin_unlock(&dlm->spinlock);
} else {
+ spin_unlock(&dlm->spinlock);
/* need to allocate, just like if it was
* mastered here normally */
res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len);
@@ -1443,7 +1467,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
* The first one is handled at the end of this function. The
* other two are handled in the worker thread after locks have
* been attached. Yes, we don't wait for purge time to match
- * kref_init. The lockres will still have atleast one ref
+ * kref_init. The lockres will still have at least one ref
* added because it is in the hash __dlm_insert_lockres() */
extra_refs++;
@@ -1633,7 +1657,7 @@ static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
u8 nodenum, u8 *real_master)
{
- int ret = -EINVAL;
+ int ret;
struct dlm_master_requery req;
int status = DLM_LOCK_RES_OWNER_UNKNOWN;
@@ -1642,14 +1666,18 @@ int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
req.namelen = res->lockname.len;
memcpy(req.name, res->lockname.name, res->lockname.len);
+resend:
ret = o2net_send_message(DLM_MASTER_REQUERY_MSG, dlm->key,
&req, sizeof(req), nodenum, &status);
- /* XXX: negative status not handled properly here. */
if (ret < 0)
mlog(ML_ERROR, "Error %d when sending message %u (key "
"0x%x) to node %u\n", ret, DLM_MASTER_REQUERY_MSG,
dlm->key, nodenum);
- else {
+ else if (status == -ENOMEM) {
+ mlog_errno(status);
+ msleep(50);
+ goto resend;
+ } else {
BUG_ON(status < 0);
BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN);
*real_master = (u8) (status & 0xff);
@@ -1673,6 +1701,7 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
unsigned int hash;
int master = DLM_LOCK_RES_OWNER_UNKNOWN;
u32 flags = DLM_ASSERT_MASTER_REQUERY;
+ int dispatched = 0;
if (!dlm_grab(dlm)) {
/* since the domain has gone away on this
@@ -1691,17 +1720,28 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
int ret = dlm_dispatch_assert_master(dlm, res,
0, 0, flags);
if (ret < 0) {
- mlog_errno(-ENOMEM);
- /* retry!? */
- BUG();
+ mlog_errno(ret);
+ spin_unlock(&res->spinlock);
+ dlm_lockres_put(res);
+ spin_unlock(&dlm->spinlock);
+ dlm_put(dlm);
+ /* sender will take care of this and retry */
+ return ret;
+ } else {
+ dispatched = 1;
+ __dlm_lockres_grab_inflight_worker(dlm, res);
+ spin_unlock(&res->spinlock);
}
- } else /* put.. incase we are not the master */
+ } else {
+ /* put.. in case we are not the master */
+ spin_unlock(&res->spinlock);
dlm_lockres_put(res);
- spin_unlock(&res->spinlock);
+ }
}
spin_unlock(&dlm->spinlock);
- dlm_put(dlm);
+ if (!dispatched)
+ dlm_put(dlm);
return master;
}
@@ -1747,15 +1787,14 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
struct dlm_migratable_lockres *mres)
{
struct dlm_migratable_lock *ml;
- struct list_head *queue;
+ struct list_head *queue, *iter;
struct list_head *tmpq = NULL;
struct dlm_lock *newlock = NULL;
struct dlm_lockstatus *lksb = NULL;
int ret = 0;
int i, j, bad;
- struct dlm_lock *lock = NULL;
+ struct dlm_lock *lock;
u8 from = O2NM_MAX_NODES;
- unsigned int added = 0;
__be64 c;
mlog(0, "running %d locks for this lockres\n", mres->num_locks);
@@ -1771,7 +1810,6 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
spin_lock(&res->spinlock);
dlm_lockres_set_refmap_bit(dlm, res, from);
spin_unlock(&res->spinlock);
- added++;
break;
}
BUG_ON(ml->highest_blocked != LKM_IVMODE);
@@ -1788,14 +1826,16 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
/* MIGRATION ONLY! */
BUG_ON(!(mres->flags & DLM_MRES_MIGRATION));
+ lock = NULL;
spin_lock(&res->spinlock);
for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) {
tmpq = dlm_list_idx_to_ptr(res, j);
- list_for_each_entry(lock, tmpq, list) {
- if (lock->ml.cookie != ml->cookie)
- lock = NULL;
- else
+ list_for_each(iter, tmpq) {
+ lock = list_entry(iter,
+ struct dlm_lock, list);
+ if (lock->ml.cookie == ml->cookie)
break;
+ lock = NULL;
}
if (lock)
break;
@@ -1857,7 +1897,6 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
/* do not alter lock refcount. switching lists. */
list_move_tail(&lock->list, queue);
spin_unlock(&res->spinlock);
- added++;
mlog(0, "just reordered a local lock!\n");
continue;
@@ -1883,6 +1922,13 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
if (ml->type == LKM_NLMODE)
goto skip_lvb;
+ /*
+ * If the lock is in the blocked list it can't have a valid lvb,
+ * so skip it
+ */
+ if (ml->list == DLM_BLOCKED_LIST)
+ goto skip_lvb;
+
if (!dlm_lvb_is_empty(mres->lvb)) {
if (lksb->flags & DLM_LKSB_PUT_LVB) {
/* other node was trying to update
@@ -1963,12 +2009,19 @@ skip_lvb:
}
if (!bad) {
dlm_lock_get(newlock);
- list_add_tail(&newlock->list, queue);
+ if (mres->flags & DLM_MRES_RECOVERY &&
+ ml->list == DLM_CONVERTING_LIST &&
+ newlock->ml.type >
+ newlock->ml.convert_type) {
+ /* newlock is doing downconvert, add it to the
+ * head of converting list */
+ list_add(&newlock->list, queue);
+ } else
+ list_add_tail(&newlock->list, queue);
mlog(0, "%s:%.*s: added lock for node %u, "
"setting refmap bit\n", dlm->name,
res->lockname.len, res->lockname.name, ml->node);
dlm_lockres_set_refmap_bit(dlm, res, ml->node);
- added++;
}
spin_unlock(&res->spinlock);
}
@@ -1980,11 +2033,8 @@ leave:
dlm_lockres_drop_inflight_ref(dlm, res);
spin_unlock(&res->spinlock);
- if (ret < 0) {
+ if (ret < 0)
mlog_errno(ret);
- if (newlock)
- dlm_lock_put(newlock);
- }
return ret;
}
@@ -2017,7 +2067,6 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
dlm_lock_get(lock);
if (lock->convert_pending) {
/* move converting lock back to granted */
- BUG_ON(i != DLM_CONVERTING_LIST);
mlog(0, "node died with convert pending "
"on %.*s. move back to granted list.\n",
res->lockname.len, res->lockname.name);
@@ -2109,6 +2158,13 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
bucket = dlm_lockres_hash(dlm, i);
hlist_for_each_entry(res, bucket, hash_node) {
+ if (res->state & DLM_LOCK_RES_RECOVERY_WAITING) {
+ spin_lock(&res->spinlock);
+ res->state &= ~DLM_LOCK_RES_RECOVERY_WAITING;
+ spin_unlock(&res->spinlock);
+ wake_up(&res->wq);
+ }
+
if (!(res->state & DLM_LOCK_RES_RECOVERING))
continue;
@@ -2246,6 +2302,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
res->lockname.len, res->lockname.name, freed, dead_node);
__dlm_print_one_lock_resource(res);
}
+ res->state |= DLM_LOCK_RES_RECOVERY_WAITING;
dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
} else if (test_bit(dead_node, res->refmap)) {
mlog(0, "%s:%.*s: dead node %u had a ref, but had "
@@ -2258,18 +2315,12 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
__dlm_dirty_lockres(dlm, res);
}
-/* if this node is the recovery master, and there are no
- * locks for a given lockres owned by this node that are in
- * either PR or EX mode, zero out the lvb before requesting.
- *
- */
-
-
static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
{
struct dlm_lock_resource *res;
int i;
struct hlist_head *bucket;
+ struct hlist_node *tmp;
struct dlm_lock *lock;
@@ -2292,7 +2343,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
*/
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
bucket = dlm_lockres_hash(dlm, i);
- hlist_for_each_entry(res, bucket, hash_node) {
+ hlist_for_each_entry_safe(res, tmp, bucket, hash_node) {
/* always prune any $RECOVERY entries for dead nodes,
* otherwise hangs can occur during later recovery */
if (dlm_is_recovery_lock(res->lockname.name,
@@ -2306,9 +2357,24 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
dead_node, dlm->name);
list_del_init(&lock->list);
dlm_lock_put(lock);
+ /* Can't schedule
+ * DLM_UNLOCK_FREE_LOCK
+ * - do manually */
+ dlm_lock_put(lock);
break;
}
}
+
+ if ((res->owner == dead_node) &&
+ (res->state & DLM_LOCK_RES_DROPPING_REF)) {
+ dlm_lockres_get(res);
+ __dlm_do_purge_lockres(dlm, res);
+ spin_unlock(&res->spinlock);
+ wake_up(&res->wq);
+ dlm_lockres_put(res);
+ continue;
+ } else if (res->owner == dlm->node_num)
+ dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
spin_unlock(&res->spinlock);
continue;
}
@@ -2317,17 +2383,31 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
dlm_revalidate_lvb(dlm, res, dead_node);
if (res->owner == dead_node) {
if (res->state & DLM_LOCK_RES_DROPPING_REF) {
- mlog(ML_NOTICE, "%s: res %.*s, Skip "
- "recovery as it is being freed\n",
- dlm->name, res->lockname.len,
- res->lockname.name);
- } else
- dlm_move_lockres_to_recovery_list(dlm,
- res);
-
+ mlog(0, "%s:%.*s: owned by "
+ "dead node %u, this node was "
+ "dropping its ref when master died. "
+ "continue, purging the lockres.\n",
+ dlm->name, res->lockname.len,
+ res->lockname.name, dead_node);
+ dlm_lockres_get(res);
+ __dlm_do_purge_lockres(dlm, res);
+ spin_unlock(&res->spinlock);
+ wake_up(&res->wq);
+ dlm_lockres_put(res);
+ continue;
+ }
+ dlm_move_lockres_to_recovery_list(dlm, res);
} else if (res->owner == dlm->node_num) {
dlm_free_dead_locks(dlm, res, dead_node);
__dlm_lockres_calc_usage(dlm, res);
+ } else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
+ if (test_bit(dead_node, res->refmap)) {
+ mlog(0, "%s:%.*s: dead node %u had a ref, but had "
+ "no locks and had not purged before dying\n",
+ dlm->name, res->lockname.len,
+ res->lockname.name, dead_node);
+ dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
+ }
}
spin_unlock(&res->spinlock);
}
@@ -2391,11 +2471,7 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
* perhaps later we can genericize this for other waiters. */
wake_up(&dlm->migration_wq);
- if (test_bit(idx, dlm->recovery_map))
- mlog(0, "domain %s, node %u already added "
- "to recovery map!\n", dlm->name, idx);
- else
- set_bit(idx, dlm->recovery_map);
+ set_bit(idx, dlm->recovery_map);
}
void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data)
@@ -2555,7 +2631,7 @@ again:
dlm_reco_master_ready(dlm),
msecs_to_jiffies(1000));
if (!dlm_reco_master_ready(dlm)) {
- mlog(0, "%s: reco master taking awhile\n",
+ mlog(0, "%s: reco master taking a while\n",
dlm->name);
goto again;
}
@@ -2620,7 +2696,6 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
continue;
}
retry:
- ret = -EINVAL;
mlog(0, "attempting to send begin reco msg to %d\n",
nodenum);
ret = o2net_send_message(DLM_BEGIN_RECO_MSG, dlm->key,
@@ -2864,12 +2939,10 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
BUG();
}
dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
+ __dlm_reset_recovery(dlm);
spin_unlock(&dlm->spinlock);
- dlm_reset_recovery(dlm);
dlm_kick_recovery_thread(dlm);
break;
- default:
- BUG();
}
mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n",
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index e73c833fc2a1..eedf07ca23ca 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -1,27 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* dlmthread.c
*
* standalone DLM module
*
* Copyright (C) 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
*/
@@ -40,22 +23,20 @@
#include <linux/delay.h>
-#include "cluster/heartbeat.h"
-#include "cluster/nodemanager.h"
-#include "cluster/tcp.h"
+#include "../cluster/heartbeat.h"
+#include "../cluster/nodemanager.h"
+#include "../cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
#include "dlmdomain.h"
#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_THREAD)
-#include "cluster/masklog.h"
+#include "../cluster/masklog.h"
static int dlm_thread(void *data);
static void dlm_flush_asts(struct dlm_ctxt *dlm);
-#define dlm_lock_is_remote(dlm, lock) ((lock)->ml.node != (dlm)->node_num)
-
/* will exit holding res->spinlock, but may drop in function */
/* waits until flags are cleared on res->state */
void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags)
@@ -106,11 +87,12 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res)
if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
return 0;
- if (res->state & DLM_LOCK_RES_RECOVERING)
+ if (res->state & (DLM_LOCK_RES_RECOVERING|
+ DLM_LOCK_RES_RECOVERY_WAITING))
return 0;
/* Another node has this resource with this node as the master */
- bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
+ bit = find_first_bit(res->refmap, O2NM_MAX_NODES);
if (bit < O2NM_MAX_NODES)
return 0;
@@ -159,6 +141,52 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
spin_unlock(&dlm->spinlock);
}
+/*
+ * Do the real purge work:
+ * unhash the lockres, and
+ * clear flag DLM_LOCK_RES_DROPPING_REF.
+ * It requires dlm and lockres spinlock to be taken.
+ */
+void __dlm_do_purge_lockres(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ assert_spin_locked(&dlm->spinlock);
+ assert_spin_locked(&res->spinlock);
+
+ if (!list_empty(&res->purge)) {
+ mlog(0, "%s: Removing res %.*s from purgelist\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ list_del_init(&res->purge);
+ dlm_lockres_put(res);
+ dlm->purge_count--;
+ }
+
+ if (!__dlm_lockres_unused(res)) {
+ mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ __dlm_print_one_lock_resource(res);
+ BUG();
+ }
+
+ __dlm_unhash_lockres(dlm, res);
+
+ spin_lock(&dlm->track_lock);
+ if (!list_empty(&res->tracking))
+ list_del_init(&res->tracking);
+ else {
+ mlog(ML_ERROR, "%s: Resource %.*s not on the Tracking list\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ __dlm_print_one_lock_resource(res);
+ }
+ spin_unlock(&dlm->track_lock);
+
+ /*
+ * lockres is not in the hash now. drop the flag and wake up
+ * any processes waiting in dlm_get_lock_resource.
+ */
+ res->state &= ~DLM_LOCK_RES_DROPPING_REF;
+}
+
static void dlm_purge_lockres(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
@@ -174,6 +202,13 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
res->lockname.len, res->lockname.name, master);
if (!master) {
+ if (res->state & DLM_LOCK_RES_DROPPING_REF) {
+ mlog(ML_NOTICE, "%s: res %.*s already in DLM_LOCK_RES_DROPPING_REF state\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ spin_unlock(&res->spinlock);
+ return;
+ }
+
res->state |= DLM_LOCK_RES_DROPPING_REF;
/* drop spinlock... retake below */
spin_unlock(&res->spinlock);
@@ -202,6 +237,13 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
dlm->purge_count--;
}
+ if (!master && ret == DLM_DEREF_RESPONSE_INPROG) {
+ mlog(0, "%s: deref %.*s in progress\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ spin_unlock(&res->spinlock);
+ return;
+ }
+
if (!__dlm_lockres_unused(res)) {
mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
dlm->name, res->lockname.len, res->lockname.name);
@@ -211,6 +253,16 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
__dlm_unhash_lockres(dlm, res);
+ spin_lock(&dlm->track_lock);
+ if (!list_empty(&res->tracking))
+ list_del_init(&res->tracking);
+ else {
+ mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n",
+ res->lockname.len, res->lockname.name);
+ __dlm_print_one_lock_resource(res);
+ }
+ spin_unlock(&dlm->track_lock);
+
/* lockres is not in the hash now. drop the flag and wake up
* any processes waiting in dlm_get_lock_resource. */
if (!master) {
@@ -259,12 +311,15 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
* refs on it. */
unused = __dlm_lockres_unused(lockres);
if (!unused ||
- (lockres->state & DLM_LOCK_RES_MIGRATING)) {
+ (lockres->state & DLM_LOCK_RES_MIGRATING) ||
+ (lockres->inflight_assert_workers != 0)) {
mlog(0, "%s: res %.*s is in use or being remastered, "
- "used %d, state %d\n", dlm->name,
- lockres->lockname.len, lockres->lockname.name,
- !unused, lockres->state);
- list_move_tail(&dlm->purge_list, &lockres->purge);
+ "used %d, state %d, assert master workers %u\n",
+ dlm->name, lockres->lockname.len,
+ lockres->lockname.name,
+ !unused, lockres->state,
+ lockres->inflight_assert_workers);
+ list_move_tail(&lockres->purge, &dlm->purge_list);
spin_unlock(&lockres->spinlock);
continue;
}
@@ -286,8 +341,6 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
struct dlm_lock *lock, *target;
- struct list_head *iter;
- struct list_head *head;
int can_grant = 1;
/*
@@ -314,9 +367,7 @@ converting:
dlm->name, res->lockname.len, res->lockname.name);
BUG();
}
- head = &res->granted;
- list_for_each(iter, head) {
- lock = list_entry(iter, struct dlm_lock, list);
+ list_for_each_entry(lock, &res->granted, list) {
if (lock==target)
continue;
if (!dlm_lock_compatible(lock->ml.type,
@@ -333,9 +384,8 @@ converting:
target->ml.convert_type;
}
}
- head = &res->converting;
- list_for_each(iter, head) {
- lock = list_entry(iter, struct dlm_lock, list);
+
+ list_for_each_entry(lock, &res->converting, list) {
if (lock==target)
continue;
if (!dlm_lock_compatible(lock->ml.type,
@@ -384,9 +434,7 @@ blocked:
goto leave;
target = list_entry(res->blocked.next, struct dlm_lock, list);
- head = &res->granted;
- list_for_each(iter, head) {
- lock = list_entry(iter, struct dlm_lock, list);
+ list_for_each_entry(lock, &res->granted, list) {
if (lock==target)
continue;
if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) {
@@ -400,9 +448,7 @@ blocked:
}
}
- head = &res->converting;
- list_for_each(iter, head) {
- lock = list_entry(iter, struct dlm_lock, list);
+ list_for_each_entry(lock, &res->converting, list) {
if (lock==target)
continue;
if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) {
@@ -466,7 +512,7 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
assert_spin_locked(&res->spinlock);
/* don't shuffle secondary queues */
- if ((res->owner == dlm->node_num)) {
+ if (res->owner == dlm->node_num) {
if (res->state & (DLM_LOCK_RES_MIGRATING |
DLM_LOCK_RES_BLOCK_DIRTY))
return;
@@ -489,7 +535,8 @@ int dlm_launch_thread(struct dlm_ctxt *dlm)
{
mlog(0, "Starting dlm_thread...\n");
- dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread");
+ dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm-%s",
+ dlm->name);
if (IS_ERR(dlm->dlm_thread_task)) {
mlog_errno(PTR_ERR(dlm->dlm_thread_task));
dlm->dlm_thread_task = NULL;
@@ -629,7 +676,6 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
#define DLM_THREAD_TIMEOUT_MS (4 * 1000)
#define DLM_THREAD_MAX_DIRTY 100
-#define DLM_THREAD_MAX_ASTS 10
static int dlm_thread(void *data)
{
@@ -695,7 +741,8 @@ static int dlm_thread(void *data)
* dirty for a short while. */
BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
if (res->state & (DLM_LOCK_RES_IN_PROGRESS |
- DLM_LOCK_RES_RECOVERING)) {
+ DLM_LOCK_RES_RECOVERING |
+ DLM_LOCK_RES_RECOVERY_WAITING)) {
/* move it to the tail and keep going */
res->state &= ~DLM_LOCK_RES_DIRTY;
spin_unlock(&res->spinlock);
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 850aa7e87537..7318e4794ef9 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -1,27 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* dlmunlock.c
*
* underlying calls for unlocking locks
*
* Copyright (C) 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
*/
@@ -38,15 +21,15 @@
#include <linux/spinlock.h>
#include <linux/delay.h>
-#include "cluster/heartbeat.h"
-#include "cluster/nodemanager.h"
-#include "cluster/tcp.h"
+#include "../cluster/heartbeat.h"
+#include "../cluster/nodemanager.h"
+#include "../cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
#define MLOG_MASK_PREFIX ML_DLM
-#include "cluster/masklog.h"
+#include "../cluster/masklog.h"
#define DLM_UNLOCK_FREE_LOCK 0x00000001
#define DLM_UNLOCK_CALL_AST 0x00000002
@@ -105,7 +88,8 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
enum dlm_status status;
int actions = 0;
int in_use;
- u8 owner;
+ u8 owner;
+ int recovery_wait = 0;
mlog(0, "master_node = %d, valblk = %d\n", master_node,
flags & LKM_VALBLK);
@@ -191,7 +175,9 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
DLM_UNLOCK_CLEAR_CONVERT_TYPE);
} else if (status == DLM_RECOVERING ||
status == DLM_MIGRATING ||
- status == DLM_FORWARD) {
+ status == DLM_FORWARD ||
+ status == DLM_NOLOCKMGR
+ ) {
/* must clear the actions because this unlock
* is about to be retried. cannot free or do
* any list manipulation. */
@@ -200,14 +186,18 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
res->lockname.name,
status==DLM_RECOVERING?"recovering":
(status==DLM_MIGRATING?"migrating":
- "forward"));
+ (status == DLM_FORWARD ? "forward" :
+ "nolockmanager")));
actions = 0;
}
if (flags & LKM_CANCEL)
lock->cancel_pending = 0;
- else
- lock->unlock_pending = 0;
-
+ else {
+ if (!lock->unlock_pending)
+ recovery_wait = 1;
+ else
+ lock->unlock_pending = 0;
+ }
}
/* get an extra ref on lock. if we are just switching
@@ -241,6 +231,17 @@ leave:
spin_unlock(&res->spinlock);
wake_up(&res->wq);
+ if (recovery_wait) {
+ spin_lock(&res->spinlock);
+ /* Unlock request will directly succeed after owner dies,
+ * and the lock is already removed from grant list. We have to
+ * wait for RECOVERING done or we miss the chance to purge it
+ * since the removement is much faster than RECOVERING proc.
+ */
+ __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_RECOVERING);
+ spin_unlock(&res->spinlock);
+ }
+
/* let the caller's final dlm_lock_put handle the actual kfree */
if (actions & DLM_UNLOCK_FREE_LOCK) {
/* this should always be coupled with list removal */
@@ -248,7 +249,7 @@ leave:
mlog(0, "lock %u:%llu should be gone now! refs=%d\n",
dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
- atomic_read(&lock->lock_refs.refcount)-1);
+ kref_read(&lock->lock_refs)-1);
dlm_lock_put(lock);
}
if (actions & DLM_UNLOCK_CALL_AST)
@@ -364,7 +365,10 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
* updated state to the recovery master. this thread
* just needs to finish out the operation and call
* the unlockast. */
- ret = DLM_NORMAL;
+ if (dlm_is_node_dead(dlm, owner))
+ ret = DLM_NORMAL;
+ else
+ ret = DLM_NOLOCKMGR;
} else {
/* something bad. this will BUG in ocfs2 */
ret = dlm_err_to_dlm_status(tmpret);
@@ -388,10 +392,9 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
struct dlm_ctxt *dlm = data;
struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf;
struct dlm_lock_resource *res = NULL;
- struct list_head *iter;
- struct dlm_lock *lock = NULL;
+ struct dlm_lock *lock = NULL, *iter;
enum dlm_status status = DLM_NORMAL;
- int found = 0, i;
+ int i;
struct dlm_lockstatus *lksb = NULL;
int ignore;
u32 flags;
@@ -416,7 +419,7 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
}
if (!dlm_grab(dlm))
- return DLM_REJECTED;
+ return DLM_FORWARD;
mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
"Domain %s not fully joined!\n", dlm->name);
@@ -434,7 +437,6 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
}
queue=&res->granted;
- found = 0;
spin_lock(&res->spinlock);
if (res->state & DLM_LOCK_RES_RECOVERING) {
spin_unlock(&res->spinlock);
@@ -458,22 +460,21 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
}
for (i=0; i<3; i++) {
- list_for_each(iter, queue) {
- lock = list_entry(iter, struct dlm_lock, list);
- if (lock->ml.cookie == unlock->cookie &&
- lock->ml.node == unlock->node_idx) {
- dlm_lock_get(lock);
- found = 1;
+ list_for_each_entry(iter, queue, list) {
+ if (iter->ml.cookie == unlock->cookie &&
+ iter->ml.node == unlock->node_idx) {
+ dlm_lock_get(iter);
+ lock = iter;
break;
}
}
- if (found)
+ if (lock)
break;
/* scan granted -> converting -> blocked queues */
queue++;
}
spin_unlock(&res->spinlock);
- if (!found) {
+ if (!lock) {
status = DLM_IVLOCKID;
goto not_found;
}
@@ -503,7 +504,7 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
dlm_kick_thread(dlm, res);
not_found:
- if (!found)
+ if (!lock)
mlog(ML_ERROR, "failed to find lock to unlock! "
"cookie=%u:%llu\n",
dlm_get_lock_cookie_node(be64_to_cpu(unlock->cookie)),
@@ -640,7 +641,9 @@ retry:
if (status == DLM_RECOVERING ||
status == DLM_MIGRATING ||
- status == DLM_FORWARD) {
+ status == DLM_FORWARD ||
+ status == DLM_NOLOCKMGR) {
+
/* We want to go away for a tiny bit to allow recovery
* / migration to complete on this resource. I don't
* know of any wait queue we could sleep on as this
@@ -652,7 +655,7 @@ retry:
msleep(50);
mlog(0, "retrying unlock due to pending recovery/"
- "migration/in-progress\n");
+ "migration/in-progress/reconnect\n");
goto retry;
}
diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c
deleted file mode 100644
index dfc0da4d158d..000000000000
--- a/fs/ocfs2/dlm/dlmver.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmver.c
- *
- * version string
- *
- * Copyright (C) 2002, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include "dlmver.h"
-
-#define DLM_BUILD_VERSION "1.5.0"
-
-#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION
-
-void dlm_print_version(void)
-{
- printk(KERN_INFO "%s\n", VERSION_STR);
-}
-
-MODULE_DESCRIPTION(VERSION_STR);
-
-MODULE_VERSION(DLM_BUILD_VERSION);
diff --git a/fs/ocfs2/dlm/dlmver.h b/fs/ocfs2/dlm/dlmver.h
deleted file mode 100644
index f674aee77a16..000000000000
--- a/fs/ocfs2/dlm/dlmver.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmfsver.h
- *
- * Function prototypes
- *
- * Copyright (C) 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef DLM_VER_H
-#define DLM_VER_H
-
-void dlm_print_version(void);
-
-#endif /* DLM_VER_H */
diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile
index f14be89a6701..c7895f65be0e 100644
--- a/fs/ocfs2/dlmfs/Makefile
+++ b/fs/ocfs2/dlmfs/Makefile
@@ -1,5 +1,4 @@
-ccflags-y := -Ifs/ocfs2
-
+# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o
-ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o
+ocfs2_dlmfs-objs := userdlm.o dlmfs.o
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 12bafb7265ce..339f0b11cdc8 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -1,6 +1,5 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* dlmfs.c
*
* Code which implements the kernel side of a minimal userspace
@@ -9,21 +8,6 @@
* which was a template for the fs side of this module.
*
* Copyright (C) 2003, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
/* Simple VFS hooks based on: */
@@ -36,6 +20,7 @@
#include <linux/module.h>
#include <linux/fs.h>
+#include <linux/fs_context.h>
#include <linux/pagemap.h>
#include <linux/types.h>
#include <linux/slab.h>
@@ -45,14 +30,13 @@
#include <linux/backing-dev.h>
#include <linux/poll.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
-#include "stackglue.h"
+#include "../stackglue.h"
#include "userdlm.h"
-#include "dlmfsver.h"
#define MLOG_MASK_PREFIX ML_DLMFS
-#include "cluster/masklog.h"
+#include "../cluster/masklog.h"
static const struct super_operations dlmfs_ops;
@@ -72,7 +56,7 @@ struct workqueue_struct *user_dlm_worker;
* Over time, dlmfs has added some features that were not part of the
* initial ABI. Unfortunately, some of these features are not detectable
* via standard usage. For example, Linux's default poll always returns
- * POLLIN, so there is no way for a caller of poll(2) to know when dlmfs
+ * EPOLLIN, so there is no way for a caller of poll(2) to know when dlmfs
* added poll support. Instead, we provide this list of new capabilities.
*
* Capabilities is a read-only attribute. We do it as a module parameter
@@ -84,21 +68,20 @@ struct workqueue_struct *user_dlm_worker;
* interaction.
*
* Capabilities:
- * - bast : POLLIN against the file descriptor of a held lock
+ * - bast : EPOLLIN against the file descriptor of a held lock
* signifies a bast fired on the lock.
*/
#define DLMFS_CAPABILITIES "bast stackglue"
static int param_set_dlmfs_capabilities(const char *val,
- struct kernel_param *kp)
+ const struct kernel_param *kp)
{
printk(KERN_ERR "%s: readonly parameter\n", kp->name);
return -EINVAL;
}
static int param_get_dlmfs_capabilities(char *buffer,
- struct kernel_param *kp)
+ const struct kernel_param *kp)
{
- return strlcpy(buffer, DLMFS_CAPABILITIES,
- strlen(DLMFS_CAPABILITIES) + 1);
+ return sysfs_emit(buffer, DLMFS_CAPABILITIES);
}
module_param_call(capabilities, param_set_dlmfs_capabilities,
param_get_dlmfs_capabilities, NULL, 0444);
@@ -180,7 +163,7 @@ bail:
static int dlmfs_file_release(struct inode *inode,
struct file *file)
{
- int level, status;
+ int level;
struct dlmfs_inode_private *ip = DLMFS_I(inode);
struct dlmfs_filp_private *fp = file->private_data;
@@ -189,7 +172,6 @@ static int dlmfs_file_release(struct inode *inode,
mlog(0, "close called on inode %lu\n", inode->i_ino);
- status = 0;
if (fp) {
level = fp->fp_lock_level;
if (level != DLM_LOCK_IV)
@@ -206,24 +188,25 @@ static int dlmfs_file_release(struct inode *inode,
* We do ->setattr() just to override size changes. Our size is the size
* of the LVB and nothing else.
*/
-static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr)
+static int dlmfs_file_setattr(struct mnt_idmap *idmap,
+ struct dentry *dentry, struct iattr *attr)
{
int error;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
attr->ia_valid &= ~ATTR_SIZE;
- error = inode_change_ok(inode, attr);
+ error = setattr_prepare(&nop_mnt_idmap, dentry, attr);
if (error)
return error;
- setattr_copy(inode, attr);
+ setattr_copy(&nop_mnt_idmap, inode, attr);
mark_inode_dirty(inode);
return 0;
}
-static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait)
+static __poll_t dlmfs_file_poll(struct file *file, poll_table *wait)
{
- int event = 0;
+ __poll_t event = 0;
struct inode *inode = file_inode(file);
struct dlmfs_inode_private *ip = DLMFS_I(inode);
@@ -231,58 +214,23 @@ static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait)
spin_lock(&ip->ip_lockres.l_lock);
if (ip->ip_lockres.l_flags & USER_LOCK_BLOCKED)
- event = POLLIN | POLLRDNORM;
+ event = EPOLLIN | EPOLLRDNORM;
spin_unlock(&ip->ip_lockres.l_lock);
return event;
}
-static ssize_t dlmfs_file_read(struct file *filp,
+static ssize_t dlmfs_file_read(struct file *file,
char __user *buf,
size_t count,
loff_t *ppos)
{
- int bytes_left;
- ssize_t readlen, got;
- char *lvb_buf;
- struct inode *inode = file_inode(filp);
-
- mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
- inode->i_ino, count, *ppos);
+ char lvb[DLM_LVB_LEN];
- if (*ppos >= i_size_read(inode))
+ if (!user_dlm_read_lvb(file_inode(file), lvb))
return 0;
- if (!count)
- return 0;
-
- if (!access_ok(VERIFY_WRITE, buf, count))
- return -EFAULT;
-
- /* don't read past the lvb */
- if ((count + *ppos) > i_size_read(inode))
- readlen = i_size_read(inode) - *ppos;
- else
- readlen = count;
-
- lvb_buf = kmalloc(readlen, GFP_NOFS);
- if (!lvb_buf)
- return -ENOMEM;
-
- got = user_dlm_read_lvb(inode, lvb_buf, readlen);
- if (got) {
- BUG_ON(got != readlen);
- bytes_left = __copy_to_user(buf, lvb_buf, readlen);
- readlen -= bytes_left;
- } else
- readlen = 0;
-
- kfree(lvb_buf);
-
- *ppos = *ppos + readlen;
-
- mlog(0, "read %zd bytes\n", readlen);
- return readlen;
+ return simple_read_from_buffer(buf, count, ppos, lvb, sizeof(lvb));
}
static ssize_t dlmfs_file_write(struct file *filp,
@@ -290,43 +238,31 @@ static ssize_t dlmfs_file_write(struct file *filp,
size_t count,
loff_t *ppos)
{
+ char lvb_buf[DLM_LVB_LEN];
int bytes_left;
- ssize_t writelen;
- char *lvb_buf;
struct inode *inode = file_inode(filp);
mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
inode->i_ino, count, *ppos);
- if (*ppos >= i_size_read(inode))
+ if (*ppos >= DLM_LVB_LEN)
return -ENOSPC;
- if (!count)
- return 0;
-
- if (!access_ok(VERIFY_READ, buf, count))
- return -EFAULT;
-
/* don't write past the lvb */
- if ((count + *ppos) > i_size_read(inode))
- writelen = i_size_read(inode) - *ppos;
- else
- writelen = count - *ppos;
+ if (count > DLM_LVB_LEN - *ppos)
+ count = DLM_LVB_LEN - *ppos;
- lvb_buf = kmalloc(writelen, GFP_NOFS);
- if (!lvb_buf)
- return -ENOMEM;
-
- bytes_left = copy_from_user(lvb_buf, buf, writelen);
- writelen -= bytes_left;
- if (writelen)
- user_dlm_write_lvb(inode, lvb_buf, writelen);
+ if (!count)
+ return 0;
- kfree(lvb_buf);
+ bytes_left = copy_from_user(lvb_buf, buf, count);
+ count -= bytes_left;
+ if (count)
+ user_dlm_write_lvb(inode, lvb_buf, count);
- *ppos = *ppos + writelen;
- mlog(0, "wrote %zd bytes\n", writelen);
- return writelen;
+ *ppos = *ppos + count;
+ mlog(0, "wrote %zu bytes\n", count);
+ return count;
}
static void dlmfs_init_once(void *foo)
@@ -344,39 +280,41 @@ static struct inode *dlmfs_alloc_inode(struct super_block *sb)
{
struct dlmfs_inode_private *ip;
- ip = kmem_cache_alloc(dlmfs_inode_cache, GFP_NOFS);
+ ip = alloc_inode_sb(sb, dlmfs_inode_cache, GFP_NOFS);
if (!ip)
return NULL;
return &ip->ip_vfs_inode;
}
-static void dlmfs_i_callback(struct rcu_head *head)
+static void dlmfs_free_inode(struct inode *inode)
{
- struct inode *inode = container_of(head, struct inode, i_rcu);
kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode));
}
-static void dlmfs_destroy_inode(struct inode *inode)
-{
- call_rcu(&inode->i_rcu, dlmfs_i_callback);
-}
-
static void dlmfs_evict_inode(struct inode *inode)
{
int status;
struct dlmfs_inode_private *ip;
+ struct user_lock_res *lockres;
+ int teardown;
clear_inode(inode);
mlog(0, "inode %lu\n", inode->i_ino);
ip = DLMFS_I(inode);
+ lockres = &ip->ip_lockres;
if (S_ISREG(inode->i_mode)) {
- status = user_dlm_destroy_lock(&ip->ip_lockres);
- if (status < 0)
- mlog_errno(status);
+ spin_lock(&lockres->l_lock);
+ teardown = !!(lockres->l_flags & USER_LOCK_IN_TEARDOWN);
+ spin_unlock(&lockres->l_lock);
+ if (!teardown) {
+ status = user_dlm_destroy_lock(lockres);
+ if (status < 0)
+ mlog_errno(status);
+ }
iput(ip->ip_parent);
goto clear_fields;
}
@@ -391,25 +329,15 @@ clear_fields:
ip->ip_conn = NULL;
}
-static struct backing_dev_info dlmfs_backing_dev_info = {
- .name = "ocfs2-dlmfs",
- .ra_pages = 0, /* No readahead */
- .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
-};
-
static struct inode *dlmfs_get_root_inode(struct super_block *sb)
{
struct inode *inode = new_inode(sb);
umode_t mode = S_IFDIR | 0755;
- struct dlmfs_inode_private *ip;
if (inode) {
- ip = DLMFS_I(inode);
-
inode->i_ino = get_next_ino();
- inode_init_owner(inode, NULL, mode);
- inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode_init_owner(&nop_mnt_idmap, inode, NULL, mode);
+ simple_inode_init_ts(inode);
inc_nlink(inode);
inode->i_fop = &simple_dir_operations;
@@ -431,9 +359,8 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
return NULL;
inode->i_ino = get_next_ino();
- inode_init_owner(inode, parent, mode);
- inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode_init_owner(&nop_mnt_idmap, inode, parent, mode);
+ simple_inode_init_ts(inode);
ip = DLMFS_I(inode);
ip->ip_conn = DLMFS_I(parent)->ip_conn;
@@ -475,13 +402,14 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
* File creation. Allocate an inode, and we're done..
*/
/* SMP-safe */
-static int dlmfs_mkdir(struct inode * dir,
- struct dentry * dentry,
- umode_t mode)
+static struct dentry *dlmfs_mkdir(struct mnt_idmap * idmap,
+ struct inode * dir,
+ struct dentry * dentry,
+ umode_t mode)
{
int status;
struct inode *inode = NULL;
- struct qstr *domain = &dentry->d_name;
+ const struct qstr *domain = &dentry->d_name;
struct dlmfs_inode_private *ip;
struct ocfs2_cluster_connection *conn;
@@ -513,24 +441,24 @@ static int dlmfs_mkdir(struct inode * dir,
ip->ip_conn = conn;
inc_nlink(dir);
- d_instantiate(dentry, inode);
- dget(dentry); /* Extra count - pin the dentry in core */
+ d_make_persistent(dentry, inode);
status = 0;
bail:
if (status < 0)
iput(inode);
- return status;
+ return ERR_PTR(status);
}
-static int dlmfs_create(struct inode *dir,
+static int dlmfs_create(struct mnt_idmap *idmap,
+ struct inode *dir,
struct dentry *dentry,
umode_t mode,
bool excl)
{
int status = 0;
struct inode *inode;
- struct qstr *name = &dentry->d_name;
+ const struct qstr *name = &dentry->d_name;
mlog(0, "create %.*s\n", name->len, name->name);
@@ -551,8 +479,7 @@ static int dlmfs_create(struct inode *dir,
goto bail;
}
- d_instantiate(dentry, inode);
- dget(dentry); /* Extra count - pin the dentry in core */
+ d_make_persistent(dentry, inode);
bail:
return status;
}
@@ -561,7 +488,7 @@ static int dlmfs_unlink(struct inode *dir,
struct dentry *dentry)
{
int status;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
mlog(0, "unlink inode %lu\n", inode->i_ino);
@@ -569,8 +496,8 @@ static int dlmfs_unlink(struct inode *dir,
* to acquire a lock, this basically destroys our lockres. */
status = user_dlm_destroy_lock(&DLMFS_I(inode)->ip_lockres);
if (status < 0) {
- mlog(ML_ERROR, "unlink %.*s, error %d from destroy\n",
- dentry->d_name.len, dentry->d_name.name, status);
+ mlog(ML_ERROR, "unlink %pd, error %d from destroy\n",
+ dentry, status);
goto bail;
}
status = simple_unlink(dir, dentry);
@@ -578,13 +505,11 @@ bail:
return status;
}
-static int dlmfs_fill_super(struct super_block * sb,
- void * data,
- int silent)
+static int dlmfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
sb->s_maxbytes = MAX_LFS_FILESIZE;
- sb->s_blocksize = PAGE_CACHE_SIZE;
- sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ sb->s_blocksize = PAGE_SIZE;
+ sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = DLMFS_MAGIC;
sb->s_op = &dlmfs_ops;
sb->s_root = d_make_root(dlmfs_get_root_inode(sb));
@@ -618,9 +543,9 @@ static const struct inode_operations dlmfs_root_inode_operations = {
static const struct super_operations dlmfs_ops = {
.statfs = simple_statfs,
.alloc_inode = dlmfs_alloc_inode,
- .destroy_inode = dlmfs_destroy_inode,
+ .free_inode = dlmfs_free_inode,
.evict_inode = dlmfs_evict_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
};
static const struct inode_operations dlmfs_file_inode_operations = {
@@ -628,17 +553,27 @@ static const struct inode_operations dlmfs_file_inode_operations = {
.setattr = dlmfs_file_setattr,
};
-static struct dentry *dlmfs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int dlmfs_get_tree(struct fs_context *fc)
+{
+ return get_tree_nodev(fc, dlmfs_fill_super);
+}
+
+static const struct fs_context_operations dlmfs_context_ops = {
+ .get_tree = dlmfs_get_tree,
+};
+
+static int dlmfs_init_fs_context(struct fs_context *fc)
{
- return mount_nodev(fs_type, flags, data, dlmfs_fill_super);
+ fc->ops = &dlmfs_context_ops;
+
+ return 0;
}
static struct file_system_type dlmfs_fs_type = {
.owner = THIS_MODULE,
.name = "ocfs2_dlmfs",
- .mount = dlmfs_mount,
- .kill_sb = kill_litter_super,
+ .kill_sb = kill_anon_super,
+ .init_fs_context = dlmfs_init_fs_context,
};
MODULE_ALIAS_FS("ocfs2_dlmfs");
@@ -647,16 +582,10 @@ static int __init init_dlmfs_fs(void)
int status;
int cleanup_inode = 0, cleanup_worker = 0;
- dlmfs_print_version();
-
- status = bdi_init(&dlmfs_backing_dev_info);
- if (status)
- return status;
-
dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
sizeof(struct dlmfs_inode_private),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_ACCOUNT),
dlmfs_init_once);
if (!dlmfs_inode_cache) {
status = -ENOMEM;
@@ -664,7 +593,8 @@ static int __init init_dlmfs_fs(void)
}
cleanup_inode = 1;
- user_dlm_worker = create_singlethread_workqueue("user_dlm");
+ user_dlm_worker = alloc_workqueue("user_dlm",
+ WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (!user_dlm_worker) {
status = -ENOMEM;
goto bail;
@@ -679,7 +609,6 @@ bail:
kmem_cache_destroy(dlmfs_inode_cache);
if (cleanup_worker)
destroy_workqueue(user_dlm_worker);
- bdi_destroy(&dlmfs_backing_dev_info);
} else
printk("OCFS2 User DLM kernel interface loaded\n");
return status;
@@ -689,7 +618,6 @@ static void __exit exit_dlmfs_fs(void)
{
unregister_filesystem(&dlmfs_fs_type);
- flush_workqueue(user_dlm_worker);
destroy_workqueue(user_dlm_worker);
/*
@@ -699,11 +627,11 @@ static void __exit exit_dlmfs_fs(void)
rcu_barrier();
kmem_cache_destroy(dlmfs_inode_cache);
- bdi_destroy(&dlmfs_backing_dev_info);
}
MODULE_AUTHOR("Oracle");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("OCFS2 DLM-Filesystem");
module_init(init_dlmfs_fs)
module_exit(exit_dlmfs_fs)
diff --git a/fs/ocfs2/dlmfs/dlmfsver.c b/fs/ocfs2/dlmfs/dlmfsver.c
deleted file mode 100644
index a733b3321f83..000000000000
--- a/fs/ocfs2/dlmfs/dlmfsver.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmfsver.c
- *
- * version string
- *
- * Copyright (C) 2002, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include "dlmfsver.h"
-
-#define DLM_BUILD_VERSION "1.5.0"
-
-#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION
-
-void dlmfs_print_version(void)
-{
- printk(KERN_INFO "%s\n", VERSION_STR);
-}
-
-MODULE_DESCRIPTION(VERSION_STR);
-
-MODULE_VERSION(DLM_BUILD_VERSION);
diff --git a/fs/ocfs2/dlmfs/dlmfsver.h b/fs/ocfs2/dlmfs/dlmfsver.h
deleted file mode 100644
index f35eadbed25c..000000000000
--- a/fs/ocfs2/dlmfs/dlmfsver.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmver.h
- *
- * Function prototypes
- *
- * Copyright (C) 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef DLMFS_VER_H
-#define DLMFS_VER_H
-
-void dlmfs_print_version(void);
-
-#endif /* DLMFS_VER_H */
diff --git a/fs/ocfs2/dlmfs/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c
index 0499e3fb7bdb..617c92e7b925 100644
--- a/fs/ocfs2/dlmfs/userdlm.c
+++ b/fs/ocfs2/dlmfs/userdlm.c
@@ -1,6 +1,5 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* userdlm.c
*
* Code which implements the kernel side of a minimal userspace
@@ -10,36 +9,22 @@
* functions.
*
* Copyright (C) 2003, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/signal.h>
+#include <linux/sched/signal.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/crc32.h>
-#include "ocfs2_lockingver.h"
-#include "stackglue.h"
+#include "../ocfs2_lockingver.h"
+#include "../stackglue.h"
#include "userdlm.h"
#define MLOG_MASK_PREFIX ML_DLMFS
-#include "cluster/masklog.h"
+#include "../cluster/masklog.h"
static inline struct user_lock_res *user_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb)
@@ -448,6 +433,11 @@ again:
}
spin_lock(&lockres->l_lock);
+ if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
+ spin_unlock(&lockres->l_lock);
+ status = -EAGAIN;
+ goto bail;
+ }
/* We only compare against the currently granted level
* here. If the lock is blocked waiting on a downconvert,
@@ -560,24 +550,20 @@ void user_dlm_write_lvb(struct inode *inode,
spin_unlock(&lockres->l_lock);
}
-ssize_t user_dlm_read_lvb(struct inode *inode,
- char *val,
- unsigned int len)
+bool user_dlm_read_lvb(struct inode *inode, char *val)
{
struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
char *lvb;
- ssize_t ret = len;
-
- BUG_ON(len > DLM_LVB_LEN);
+ bool ret = true;
spin_lock(&lockres->l_lock);
BUG_ON(lockres->l_level < DLM_LOCK_PR);
if (ocfs2_dlm_lvb_valid(&lockres->l_lksb)) {
lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
- memcpy(val, lvb, len);
+ memcpy(val, lvb, DLM_LVB_LEN);
} else
- ret = 0;
+ ret = false;
spin_unlock(&lockres->l_lock);
return ret;
@@ -614,7 +600,7 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres)
spin_lock(&lockres->l_lock);
if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
spin_unlock(&lockres->l_lock);
- return 0;
+ goto bail;
}
lockres->l_flags |= USER_LOCK_IN_TEARDOWN;
@@ -628,22 +614,30 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres)
}
if (lockres->l_ro_holders || lockres->l_ex_holders) {
+ lockres->l_flags &= ~USER_LOCK_IN_TEARDOWN;
spin_unlock(&lockres->l_lock);
goto bail;
}
status = 0;
if (!(lockres->l_flags & USER_LOCK_ATTACHED)) {
+ /*
+ * lock is never requested, leave USER_LOCK_IN_TEARDOWN set
+ * to avoid new lock request coming in.
+ */
spin_unlock(&lockres->l_lock);
goto bail;
}
- lockres->l_flags &= ~USER_LOCK_ATTACHED;
lockres->l_flags |= USER_LOCK_BUSY;
spin_unlock(&lockres->l_lock);
status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_VALBLK);
if (status) {
+ spin_lock(&lockres->l_lock);
+ lockres->l_flags &= ~USER_LOCK_IN_TEARDOWN;
+ lockres->l_flags &= ~USER_LOCK_BUSY;
+ spin_unlock(&lockres->l_lock);
user_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
goto bail;
}
@@ -667,7 +661,7 @@ void user_dlm_set_locking_protocol(void)
ocfs2_stack_glue_set_max_proto_version(&user_dlm_lproto.lp_max_version);
}
-struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name)
+struct ocfs2_cluster_connection *user_dlm_register(const struct qstr *name)
{
int rc;
struct ocfs2_cluster_connection *conn;
diff --git a/fs/ocfs2/dlmfs/userdlm.h b/fs/ocfs2/dlmfs/userdlm.h
index 3b42d79531d7..47ba18eac423 100644
--- a/fs/ocfs2/dlmfs/userdlm.h
+++ b/fs/ocfs2/dlmfs/userdlm.h
@@ -1,26 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* userdlm.h
*
* Userspace dlm defines
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
@@ -80,10 +64,8 @@ void user_dlm_cluster_unlock(struct user_lock_res *lockres,
void user_dlm_write_lvb(struct inode *inode,
const char *val,
unsigned int len);
-ssize_t user_dlm_read_lvb(struct inode *inode,
- char *val,
- unsigned int len);
-struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name);
+bool user_dlm_read_lvb(struct inode *inode, char *val);
+struct ocfs2_cluster_connection *user_dlm_register(const struct qstr *name);
void user_dlm_unregister(struct ocfs2_cluster_connection *conn);
void user_dlm_set_locking_protocol(void);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 3a44a648dae7..619ff03b15d6 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1,26 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* dlmglue.c
*
* Code which implements an OCFS2 specific interface to our DLM.
*
* Copyright (C) 2003, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/types.h>
@@ -32,7 +16,10 @@
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/time.h>
+#include <linux/delay.h>
#include <linux/quotaops.h>
+#include <linux/sched/signal.h>
+#include <linux/string_choices.h>
#define MLOG_MASK_PREFIX ML_DLM_GLUE
#include <cluster/masklog.h>
@@ -54,6 +41,7 @@
#include "uptodate.h"
#include "quota.h"
#include "refcounttree.h"
+#include "acl.h"
#include "buffer_head_io.h"
@@ -94,7 +82,9 @@ struct ocfs2_unblock_ctl {
};
/* Lockdep class keys */
-struct lock_class_key lockdep_keys[OCFS2_NUM_LOCK_TYPES];
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static struct lock_class_key lockdep_keys[OCFS2_NUM_LOCK_TYPES];
+#endif
static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
int new_level);
@@ -232,12 +222,12 @@ struct ocfs2_lock_res_ops {
*/
#define LOCK_TYPE_USES_LVB 0x2
-static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
+static const struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
.get_osb = ocfs2_get_inode_osb,
.flags = 0,
};
-static struct ocfs2_lock_res_ops ocfs2_inode_inode_lops = {
+static const struct ocfs2_lock_res_ops ocfs2_inode_inode_lops = {
.get_osb = ocfs2_get_inode_osb,
.check_downconvert = ocfs2_check_meta_downconvert,
.set_lvb = ocfs2_set_meta_lvb,
@@ -245,46 +235,50 @@ static struct ocfs2_lock_res_ops ocfs2_inode_inode_lops = {
.flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
};
-static struct ocfs2_lock_res_ops ocfs2_super_lops = {
+static const struct ocfs2_lock_res_ops ocfs2_super_lops = {
.flags = LOCK_TYPE_REQUIRES_REFRESH,
};
-static struct ocfs2_lock_res_ops ocfs2_rename_lops = {
+static const struct ocfs2_lock_res_ops ocfs2_rename_lops = {
.flags = 0,
};
-static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
+static const struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
.flags = 0,
};
-static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = {
+static const struct ocfs2_lock_res_ops ocfs2_trim_fs_lops = {
+ .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
+};
+
+static const struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = {
.flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
};
-static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
+static const struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
.get_osb = ocfs2_get_dentry_osb,
.post_unlock = ocfs2_dentry_post_unlock,
.downconvert_worker = ocfs2_dentry_convert_worker,
.flags = 0,
};
-static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
+static const struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
.get_osb = ocfs2_get_inode_osb,
.flags = 0,
};
-static struct ocfs2_lock_res_ops ocfs2_flock_lops = {
+static const struct ocfs2_lock_res_ops ocfs2_flock_lops = {
.get_osb = ocfs2_get_file_osb,
.flags = 0,
};
-static struct ocfs2_lock_res_ops ocfs2_qinfo_lops = {
+static const struct ocfs2_lock_res_ops ocfs2_qinfo_lops = {
.set_lvb = ocfs2_set_qinfo_lvb,
.get_osb = ocfs2_get_qinfo_osb,
.flags = LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB,
};
-static struct ocfs2_lock_res_ops ocfs2_refcount_block_lops = {
+static const struct ocfs2_lock_res_ops ocfs2_refcount_block_lops = {
.check_downconvert = ocfs2_check_refcount_downconvert,
.downconvert_worker = ocfs2_refcount_convert_worker,
.flags = 0,
@@ -432,6 +426,7 @@ static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res)
static void ocfs2_init_lock_stats(struct ocfs2_lock_res *res)
{
res->l_lock_refresh = 0;
+ res->l_lock_wait = 0;
memset(&res->l_lock_prmode, 0, sizeof(struct ocfs2_lock_stats));
memset(&res->l_lock_exmode, 0, sizeof(struct ocfs2_lock_stats));
}
@@ -466,6 +461,8 @@ static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int level,
if (ret)
stats->ls_fail++;
+
+ stats->ls_last = ktime_to_us(ktime_get_real());
}
static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
@@ -473,6 +470,21 @@ static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
lockres->l_lock_refresh++;
}
+static inline void ocfs2_track_lock_wait(struct ocfs2_lock_res *lockres)
+{
+ struct ocfs2_mask_waiter *mw;
+
+ if (list_empty(&lockres->l_mask_waiters)) {
+ lockres->l_lock_wait = 0;
+ return;
+ }
+
+ mw = list_first_entry(&lockres->l_mask_waiters,
+ struct ocfs2_mask_waiter, mw_item);
+ lockres->l_lock_wait =
+ ktime_to_us(ktime_mono_to_real(mw->mw_lock_start));
+}
+
static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw)
{
mw->mw_lock_start = ktime_get();
@@ -488,6 +500,9 @@ static inline void ocfs2_update_lock_stats(struct ocfs2_lock_res *res,
static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
{
}
+static inline void ocfs2_track_lock_wait(struct ocfs2_lock_res *lockres)
+{
+}
static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw)
{
}
@@ -496,7 +511,7 @@ static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw)
static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
struct ocfs2_lock_res *res,
enum ocfs2_lock_type type,
- struct ocfs2_lock_res_ops *ops,
+ const struct ocfs2_lock_res_ops *ops,
void *priv)
{
res->l_type = type;
@@ -531,6 +546,7 @@ void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res)
init_waitqueue_head(&res->l_event);
INIT_LIST_HEAD(&res->l_blocked_list);
INIT_LIST_HEAD(&res->l_mask_waiters);
+ INIT_LIST_HEAD(&res->l_holders);
}
void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
@@ -538,7 +554,7 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
unsigned int generation,
struct inode *inode)
{
- struct ocfs2_lock_res_ops *ops;
+ const struct ocfs2_lock_res_ops *ops;
switch(type) {
case OCFS2_LOCK_TYPE_RW:
@@ -554,7 +570,7 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
mlog_bug_on_msg(1, "type: %d\n", type);
ops = NULL; /* thanks, gcc */
break;
- };
+ }
ocfs2_build_lock_name(type, OCFS2_I(inode)->ip_blkno,
generation, res->l_name);
@@ -673,6 +689,35 @@ static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
&ocfs2_nfs_sync_lops, osb);
}
+static void ocfs2_nfs_sync_lock_init(struct ocfs2_super *osb)
+{
+ ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
+ init_rwsem(&osb->nfs_sync_rwlock);
+}
+
+void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb)
+{
+ struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
+
+ /* Only one trimfs thread are allowed to work at the same time. */
+ mutex_lock(&osb->obs_trim_fs_mutex);
+
+ ocfs2_lock_res_init_once(lockres);
+ ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name);
+ ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS,
+ &ocfs2_trim_fs_lops, osb);
+}
+
+void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb)
+{
+ struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
+
+ ocfs2_simple_drop_lockres(osb, lockres);
+ ocfs2_lock_res_free(lockres);
+
+ mutex_unlock(&osb->obs_trim_fs_mutex);
+}
+
static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
struct ocfs2_super *osb)
{
@@ -748,6 +793,49 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
res->l_flags = 0UL;
}
+/*
+ * Keep a list of processes who have interest in a lockres.
+ * Note: this is now only used for check recursive cluster locking.
+ */
+static inline void ocfs2_add_holder(struct ocfs2_lock_res *lockres,
+ struct ocfs2_lock_holder *oh)
+{
+ INIT_LIST_HEAD(&oh->oh_list);
+ oh->oh_owner_pid = get_pid(task_pid(current));
+
+ spin_lock(&lockres->l_lock);
+ list_add_tail(&oh->oh_list, &lockres->l_holders);
+ spin_unlock(&lockres->l_lock);
+}
+
+static struct ocfs2_lock_holder *
+ocfs2_pid_holder(struct ocfs2_lock_res *lockres,
+ struct pid *pid)
+{
+ struct ocfs2_lock_holder *oh;
+
+ spin_lock(&lockres->l_lock);
+ list_for_each_entry(oh, &lockres->l_holders, oh_list) {
+ if (oh->oh_owner_pid == pid) {
+ spin_unlock(&lockres->l_lock);
+ return oh;
+ }
+ }
+ spin_unlock(&lockres->l_lock);
+ return NULL;
+}
+
+static inline void ocfs2_remove_holder(struct ocfs2_lock_res *lockres,
+ struct ocfs2_lock_holder *oh)
+{
+ spin_lock(&lockres->l_lock);
+ list_del(&oh->oh_list);
+ spin_unlock(&lockres->l_lock);
+
+ put_pid(oh->oh_owner_pid);
+}
+
+
static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
int level)
{
@@ -814,6 +902,7 @@ static void lockres_set_flags(struct ocfs2_lock_res *lockres,
list_del_init(&mw->mw_item);
mw->mw_status = 0;
complete(&mw->mw_complete);
+ ocfs2_track_lock_wait(lockres);
}
}
static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or)
@@ -861,8 +950,13 @@ static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lo
* We set the OCFS2_LOCK_UPCONVERT_FINISHING flag before clearing
* the OCFS2_LOCK_BUSY flag to prevent the dc thread from
* downconverting the lock before the upconvert has fully completed.
+ * Do not prevent the dc thread from downconverting if NONBLOCK lock
+ * had already returned.
*/
- lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
+ if (!(lockres->l_flags & OCFS2_LOCK_NONBLOCK_FINISHED))
+ lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
+ else
+ lockres_clear_flags(lockres, OCFS2_LOCK_NONBLOCK_FINISHED);
lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
}
@@ -1304,7 +1398,7 @@ static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw)
{
wait_for_completion(&mw->mw_complete);
/* Re-arm the completion in case we want to wait on it again */
- INIT_COMPLETION(mw->mw_complete);
+ reinit_completion(&mw->mw_complete);
return mw->mw_status;
}
@@ -1320,24 +1414,37 @@ static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres,
list_add_tail(&mw->mw_item, &lockres->l_mask_waiters);
mw->mw_mask = mask;
mw->mw_goal = goal;
+ ocfs2_track_lock_wait(lockres);
}
/* returns 0 if the mw that was removed was already satisfied, -EBUSY
* if the mask still hadn't reached its goal */
-static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
+static int __lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
struct ocfs2_mask_waiter *mw)
{
- unsigned long flags;
int ret = 0;
- spin_lock_irqsave(&lockres->l_lock, flags);
+ assert_spin_locked(&lockres->l_lock);
if (!list_empty(&mw->mw_item)) {
if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
ret = -EBUSY;
list_del_init(&mw->mw_item);
init_completion(&mw->mw_complete);
+ ocfs2_track_lock_wait(lockres);
}
+
+ return ret;
+}
+
+static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
+ struct ocfs2_mask_waiter *mw)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ spin_lock_irqsave(&lockres->l_lock, flags);
+ ret = __lockres_remove_mask_waiter(lockres, mw);
spin_unlock_irqrestore(&lockres->l_lock, flags);
return ret;
@@ -1355,7 +1462,7 @@ static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw,
else
ret = mw->mw_status;
/* Re-arm the completion in case we want to wait on it again */
- INIT_COMPLETION(mw->mw_complete);
+ reinit_completion(&mw->mw_complete);
return ret;
}
@@ -1373,6 +1480,13 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb,
unsigned long flags;
unsigned int gen;
int noqueue_attempted = 0;
+ int dlm_locked = 0;
+ int kick_dc = 0;
+
+ if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) {
+ mlog_errno(-EINVAL);
+ return -EINVAL;
+ }
ocfs2_init_mask_waiter(&mw);
@@ -1481,6 +1595,7 @@ again:
ocfs2_recover_from_dlm_error(lockres, 1);
goto out;
}
+ dlm_locked = 1;
mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n",
lockres->l_name);
@@ -1501,7 +1616,12 @@ update_holders:
unlock:
lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
+ /* ocfs2_unblock_lock request on seeing OCFS2_LOCK_UPCONVERT_FINISHING */
+ kick_dc = (lockres->l_flags & OCFS2_LOCK_BLOCKED);
+
spin_unlock_irqrestore(&lockres->l_lock, flags);
+ if (kick_dc)
+ ocfs2_wake_downconvert_thread(osb);
out:
/*
* This is helping work around a lock inversion between the page lock
@@ -1514,10 +1634,17 @@ out:
if (wait && arg_flags & OCFS2_LOCK_NONBLOCK &&
mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) {
wait = 0;
- if (lockres_remove_mask_waiter(lockres, &mw))
+ spin_lock_irqsave(&lockres->l_lock, flags);
+ if (__lockres_remove_mask_waiter(lockres, &mw)) {
+ if (dlm_locked)
+ lockres_or_flags(lockres,
+ OCFS2_LOCK_NONBLOCK_FINISHED);
+ spin_unlock_irqrestore(&lockres->l_lock, flags);
ret = -EAGAIN;
- else
+ } else {
+ spin_unlock_irqrestore(&lockres->l_lock, flags);
goto again;
+ }
}
if (wait) {
ret = ocfs2_wait_for_mask(&mw);
@@ -1566,7 +1693,7 @@ static void __ocfs2_cluster_unlock(struct ocfs2_super *osb,
spin_unlock_irqrestore(&lockres->l_lock, flags);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
if (lockres->l_lockdep_map.key != NULL)
- rwsem_release(&lockres->l_lockdep_map, 1, caller_ip);
+ rwsem_release(&lockres->l_lockdep_map, caller_ip);
#endif
}
@@ -1598,7 +1725,6 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
int ret;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- BUG_ON(!inode);
BUG_ON(!ocfs2_inode_is_new(inode));
mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -1628,10 +1754,8 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
}
ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0);
- if (ret) {
+ if (ret)
mlog_errno(ret);
- goto bail;
- }
bail:
return ret;
@@ -1643,8 +1767,6 @@ int ocfs2_rw_lock(struct inode *inode, int write)
struct ocfs2_lock_res *lockres;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- BUG_ON(!inode);
-
mlog(0, "inode %llu take %s RW lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
write ? "EXMODE" : "PRMODE");
@@ -1656,14 +1778,34 @@ int ocfs2_rw_lock(struct inode *inode, int write)
level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
- status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0,
- 0);
+ status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
if (status < 0)
mlog_errno(status);
return status;
}
+int ocfs2_try_rw_lock(struct inode *inode, int write)
+{
+ int status, level;
+ struct ocfs2_lock_res *lockres;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ mlog(0, "inode %llu try to take %s RW lock\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ write ? "EXMODE" : "PRMODE");
+
+ if (ocfs2_mount_local(osb))
+ return 0;
+
+ lockres = &OCFS2_I(inode)->ip_rw_lockres;
+
+ level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
+
+ status = ocfs2_cluster_lock(osb, lockres, level, DLM_LKF_NOQUEUE, 0);
+ return status;
+}
+
void ocfs2_rw_unlock(struct inode *inode, int write)
{
int level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
@@ -1675,7 +1817,7 @@ void ocfs2_rw_unlock(struct inode *inode, int write)
write ? "EXMODE" : "PRMODE");
if (!ocfs2_mount_local(osb))
- ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
+ ocfs2_cluster_unlock(osb, lockres, level);
}
/*
@@ -1687,8 +1829,6 @@ int ocfs2_open_lock(struct inode *inode)
struct ocfs2_lock_res *lockres;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- BUG_ON(!inode);
-
mlog(0, "inode %llu take PRMODE open lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -1697,8 +1837,7 @@ int ocfs2_open_lock(struct inode *inode)
lockres = &OCFS2_I(inode)->ip_open_lockres;
- status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
- DLM_LOCK_PR, 0, 0);
+ status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_PR, 0, 0);
if (status < 0)
mlog_errno(status);
@@ -1712,8 +1851,6 @@ int ocfs2_try_open_lock(struct inode *inode, int write)
struct ocfs2_lock_res *lockres;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- BUG_ON(!inode);
-
mlog(0, "inode %llu try to take %s open lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
write ? "EXMODE" : "PRMODE");
@@ -1737,8 +1874,7 @@ int ocfs2_try_open_lock(struct inode *inode, int write)
* other nodes and the -EAGAIN will indicate to the caller that
* this inode is still in use.
*/
- status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
- level, DLM_LKF_NOQUEUE, 0);
+ status = ocfs2_cluster_lock(osb, lockres, level, DLM_LKF_NOQUEUE, 0);
out:
return status;
@@ -1759,11 +1895,9 @@ void ocfs2_open_unlock(struct inode *inode)
goto out;
if(lockres->l_ro_holders)
- ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
- DLM_LOCK_PR);
+ ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_PR);
if(lockres->l_ex_holders)
- ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
- DLM_LOCK_EX);
+ ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
out:
return;
@@ -2005,15 +2139,15 @@ static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
}
#define OCFS2_SEC_BITS 34
-#define OCFS2_SEC_SHIFT (64 - 34)
+#define OCFS2_SEC_SHIFT (64 - OCFS2_SEC_BITS)
#define OCFS2_NSEC_MASK ((1ULL << OCFS2_SEC_SHIFT) - 1)
/* LVB only has room for 64 bits of time here so we pack it for
* now. */
-static u64 ocfs2_pack_timespec(struct timespec *spec)
+static u64 ocfs2_pack_timespec(struct timespec64 *spec)
{
u64 res;
- u64 sec = spec->tv_sec;
+ u64 sec = clamp_t(time64_t, spec->tv_sec, 0, 0x3ffffffffull);
u32 nsec = spec->tv_nsec;
res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK);
@@ -2029,6 +2163,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
struct ocfs2_meta_lvb *lvb;
+ struct timespec64 ts;
lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
@@ -2049,12 +2184,12 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
lvb->lvb_igid = cpu_to_be32(i_gid_read(inode));
lvb->lvb_imode = cpu_to_be16(inode->i_mode);
lvb->lvb_inlink = cpu_to_be16(inode->i_nlink);
- lvb->lvb_iatime_packed =
- cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime));
- lvb->lvb_ictime_packed =
- cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime));
- lvb->lvb_imtime_packed =
- cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));
+ ts = inode_get_atime(inode);
+ lvb->lvb_iatime_packed = cpu_to_be64(ocfs2_pack_timespec(&ts));
+ ts = inode_get_ctime(inode);
+ lvb->lvb_ictime_packed = cpu_to_be64(ocfs2_pack_timespec(&ts));
+ ts = inode_get_mtime(inode);
+ lvb->lvb_imtime_packed = cpu_to_be64(ocfs2_pack_timespec(&ts));
lvb->lvb_iattr = cpu_to_be32(oi->ip_attr);
lvb->lvb_idynfeatures = cpu_to_be16(oi->ip_dyn_features);
lvb->lvb_igeneration = cpu_to_be32(inode->i_generation);
@@ -2063,22 +2198,25 @@ out:
mlog_meta_lvb(0, lockres);
}
-static void ocfs2_unpack_timespec(struct timespec *spec,
+static void ocfs2_unpack_timespec(struct timespec64 *spec,
u64 packed_time)
{
spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT;
spec->tv_nsec = packed_time & OCFS2_NSEC_MASK;
}
-static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
+static int ocfs2_refresh_inode_from_lvb(struct inode *inode)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
struct ocfs2_meta_lvb *lvb;
+ struct timespec64 ts;
mlog_meta_lvb(0, lockres);
lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+ if (inode_wrong_type(inode, be16_to_cpu(lvb->lvb_imode)))
+ return -ESTALE;
/* We're safe here without the lockres lock... */
spin_lock(&oi->ip_lock);
@@ -2099,13 +2237,14 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
i_gid_write(inode, be32_to_cpu(lvb->lvb_igid));
inode->i_mode = be16_to_cpu(lvb->lvb_imode);
set_nlink(inode, be16_to_cpu(lvb->lvb_inlink));
- ocfs2_unpack_timespec(&inode->i_atime,
- be64_to_cpu(lvb->lvb_iatime_packed));
- ocfs2_unpack_timespec(&inode->i_mtime,
- be64_to_cpu(lvb->lvb_imtime_packed));
- ocfs2_unpack_timespec(&inode->i_ctime,
- be64_to_cpu(lvb->lvb_ictime_packed));
+ ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_iatime_packed));
+ inode_set_atime_to_ts(inode, ts);
+ ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_imtime_packed));
+ inode_set_mtime_to_ts(inode, ts);
+ ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_ictime_packed));
+ inode_set_ctime_to_ts(inode, ts);
spin_unlock(&oi->ip_lock);
+ return 0;
}
static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
@@ -2208,7 +2347,8 @@ static int ocfs2_inode_lock_update(struct inode *inode,
if (ocfs2_meta_lvb_is_trustable(inode, lockres)) {
mlog(0, "Trusting LVB on inode %llu\n",
(unsigned long long)oi->ip_blkno);
- ocfs2_refresh_inode_from_lvb(inode);
+ status = ocfs2_refresh_inode_from_lvb(inode);
+ goto bail_refresh;
} else {
/* Boo, we have to go to disk. */
/* read bh, cast, ocfs2_refresh_inode */
@@ -2218,6 +2358,10 @@ static int ocfs2_inode_lock_update(struct inode *inode,
goto bail_refresh;
}
fe = (struct ocfs2_dinode *) (*bh)->b_data;
+ if (inode_wrong_type(inode, le16_to_cpu(fe->i_mode))) {
+ status = -ESTALE;
+ goto bail_refresh;
+ }
/* This is a good chance to make sure we're not
* locking an invalid object. ocfs2_read_inode_block()
@@ -2291,8 +2435,6 @@ int ocfs2_inode_lock_full_nested(struct inode *inode,
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct buffer_head *local_bh = NULL;
- BUG_ON(!inode);
-
mlog(0, "inode %llu, take %s META lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
ex ? "EXMODE" : "PRMODE");
@@ -2307,8 +2449,9 @@ int ocfs2_inode_lock_full_nested(struct inode *inode,
goto getbh;
}
- if (ocfs2_mount_local(osb))
- goto local;
+ if ((arg_flags & OCFS2_META_LOCK_GETBH) ||
+ ocfs2_mount_local(osb))
+ goto update;
if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
ocfs2_wait_for_recovery(osb);
@@ -2337,14 +2480,14 @@ int ocfs2_inode_lock_full_nested(struct inode *inode,
if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
ocfs2_wait_for_recovery(osb);
-local:
+update:
/*
* We only see this flag if we're being called from
* ocfs2_read_locked_inode(). It means we're locking an inode
* which hasn't been populated yet, so clear the refresh flag
* and let the caller handle it.
*/
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
status = 0;
if (lockres)
ocfs2_complete_lock_res_refresh(lockres, 0);
@@ -2381,44 +2524,41 @@ bail:
ocfs2_inode_unlock(inode, ex);
}
- if (local_bh)
- brelse(local_bh);
-
+ brelse(local_bh);
return status;
}
/*
* This is working around a lock inversion between tasks acquiring DLM
- * locks while holding a page lock and the downconvert thread which
- * blocks dlm lock acquiry while acquiring page locks.
+ * locks while holding a folio lock and the downconvert thread which
+ * blocks dlm lock acquiry while acquiring folio locks.
*
- * ** These _with_page variantes are only intended to be called from aop
- * methods that hold page locks and return a very specific *positive* error
+ * ** These _with_folio variants are only intended to be called from aop
+ * methods that hold folio locks and return a very specific *positive* error
* code that aop methods pass up to the VFS -- test for errors with != 0. **
*
* The DLM is called such that it returns -EAGAIN if it would have
* blocked waiting for the downconvert thread. In that case we unlock
- * our page so the downconvert thread can make progress. Once we've
+ * our folio so the downconvert thread can make progress. Once we've
* done this we have to return AOP_TRUNCATED_PAGE so the aop method
* that called us can bubble that back up into the VFS who will then
* immediately retry the aop call.
- *
- * We do a blocking lock and immediate unlock before returning, though, so that
- * the lock has a great chance of being cached on this node by the time the VFS
- * calls back to retry the aop. This has a potential to livelock as nodes
- * ping locks back and forth, but that's a risk we're willing to take to avoid
- * the lock inversion simply.
*/
-int ocfs2_inode_lock_with_page(struct inode *inode,
- struct buffer_head **ret_bh,
- int ex,
- struct page *page)
+int ocfs2_inode_lock_with_folio(struct inode *inode,
+ struct buffer_head **ret_bh, int ex, struct folio *folio)
{
int ret;
ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
if (ret == -EAGAIN) {
- unlock_page(page);
+ folio_unlock(folio);
+ /*
+ * If we can't get inode lock immediately, we should not return
+ * directly here, since this will lead to a softlockup problem.
+ * The method is to get a blocking lock and immediately unlock
+ * before returning, this can avoid CPU resource waste due to
+ * lots of retries, and benefits fairness in getting lock.
+ */
if (ocfs2_inode_lock(inode, ret_bh, ex) == 0)
ocfs2_inode_unlock(inode, ex);
ret = AOP_TRUNCATED_PAGE;
@@ -2429,13 +2569,18 @@ int ocfs2_inode_lock_with_page(struct inode *inode,
int ocfs2_inode_lock_atime(struct inode *inode,
struct vfsmount *vfsmnt,
- int *level)
+ int *level, int wait)
{
int ret;
- ret = ocfs2_inode_lock(inode, NULL, 0);
+ if (wait)
+ ret = ocfs2_inode_lock(inode, NULL, 0);
+ else
+ ret = ocfs2_try_inode_lock(inode, NULL, 0);
+
if (ret < 0) {
- mlog_errno(ret);
+ if (ret != -EAGAIN)
+ mlog_errno(ret);
return ret;
}
@@ -2447,16 +2592,20 @@ int ocfs2_inode_lock_atime(struct inode *inode,
struct buffer_head *bh = NULL;
ocfs2_inode_unlock(inode, 0);
- ret = ocfs2_inode_lock(inode, &bh, 1);
+ if (wait)
+ ret = ocfs2_inode_lock(inode, &bh, 1);
+ else
+ ret = ocfs2_try_inode_lock(inode, &bh, 1);
+
if (ret < 0) {
- mlog_errno(ret);
+ if (ret != -EAGAIN)
+ mlog_errno(ret);
return ret;
}
*level = 1;
if (ocfs2_should_update_atime(inode, vfsmnt))
ocfs2_update_inode_atime(inode, bh);
- if (bh)
- brelse(bh);
+ brelse(bh);
} else
*level = 0;
@@ -2474,9 +2623,126 @@ void ocfs2_inode_unlock(struct inode *inode,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
ex ? "EXMODE" : "PRMODE");
- if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) &&
+ if (!ocfs2_is_hard_readonly(osb) &&
!ocfs2_mount_local(osb))
- ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
+ ocfs2_cluster_unlock(osb, lockres, level);
+}
+
+/*
+ * This _tracker variants are introduced to deal with the recursive cluster
+ * locking issue. The idea is to keep track of a lock holder on the stack of
+ * the current process. If there's a lock holder on the stack, we know the
+ * task context is already protected by cluster locking. Currently, they're
+ * used in some VFS entry routines.
+ *
+ * return < 0 on error, return == 0 if there's no lock holder on the stack
+ * before this call, return == 1 if this call would be a recursive locking.
+ * return == -1 if this lock attempt will cause an upgrade which is forbidden.
+ *
+ * When taking lock levels into account,we face some different situations.
+ *
+ * 1. no lock is held
+ * In this case, just lock the inode as requested and return 0
+ *
+ * 2. We are holding a lock
+ * For this situation, things diverges into several cases
+ *
+ * wanted holding what to do
+ * ex ex see 2.1 below
+ * ex pr see 2.2 below
+ * pr ex see 2.1 below
+ * pr pr see 2.1 below
+ *
+ * 2.1 lock level that is been held is compatible
+ * with the wanted level, so no lock action will be tacken.
+ *
+ * 2.2 Otherwise, an upgrade is needed, but it is forbidden.
+ *
+ * Reason why upgrade within a process is forbidden is that
+ * lock upgrade may cause dead lock. The following illustrates
+ * how it happens.
+ *
+ * thread on node1 thread on node2
+ * ocfs2_inode_lock_tracker(ex=0)
+ *
+ * <====== ocfs2_inode_lock_tracker(ex=1)
+ *
+ * ocfs2_inode_lock_tracker(ex=1)
+ */
+int ocfs2_inode_lock_tracker(struct inode *inode,
+ struct buffer_head **ret_bh,
+ int ex,
+ struct ocfs2_lock_holder *oh)
+{
+ int status = 0;
+ struct ocfs2_lock_res *lockres;
+ struct ocfs2_lock_holder *tmp_oh;
+ struct pid *pid = task_pid(current);
+
+
+ lockres = &OCFS2_I(inode)->ip_inode_lockres;
+ tmp_oh = ocfs2_pid_holder(lockres, pid);
+
+ if (!tmp_oh) {
+ /*
+ * This corresponds to the case 1.
+ * We haven't got any lock before.
+ */
+ status = ocfs2_inode_lock_full(inode, ret_bh, ex, 0);
+ if (status < 0) {
+ if (status != -ENOENT)
+ mlog_errno(status);
+ return status;
+ }
+
+ oh->oh_ex = ex;
+ ocfs2_add_holder(lockres, oh);
+ return 0;
+ }
+
+ if (unlikely(ex && !tmp_oh->oh_ex)) {
+ /*
+ * case 2.2 upgrade may cause dead lock, forbid it.
+ */
+ mlog(ML_ERROR, "Recursive locking is not permitted to "
+ "upgrade to EX level from PR level.\n");
+ dump_stack();
+ return -EINVAL;
+ }
+
+ /*
+ * case 2.1 OCFS2_META_LOCK_GETBH flag make ocfs2_inode_lock_full.
+ * ignore the lock level and just update it.
+ */
+ if (ret_bh) {
+ status = ocfs2_inode_lock_full(inode, ret_bh, ex,
+ OCFS2_META_LOCK_GETBH);
+ if (status < 0) {
+ if (status != -ENOENT)
+ mlog_errno(status);
+ return status;
+ }
+ }
+ return 1;
+}
+
+void ocfs2_inode_unlock_tracker(struct inode *inode,
+ int ex,
+ struct ocfs2_lock_holder *oh,
+ int had_lock)
+{
+ struct ocfs2_lock_res *lockres;
+
+ lockres = &OCFS2_I(inode)->ip_inode_lockres;
+ /* had_lock means that the current process already takes the cluster
+ * lock previously.
+ * If had_lock is 1, we have nothing to do here.
+ * If had_lock is 0, we will release the lock.
+ */
+ if (!had_lock) {
+ ocfs2_inode_unlock(inode, oh->oh_ex);
+ ocfs2_remove_holder(lockres, oh);
+ }
}
int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno)
@@ -2544,11 +2810,6 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
* refreshed, so we do it here. Of course, making sense of
* everything is up to the caller :) */
status = ocfs2_should_refresh_lock_res(lockres);
- if (status < 0) {
- ocfs2_cluster_unlock(osb, lockres, level);
- mlog_errno(status);
- goto bail;
- }
if (status) {
status = ocfs2_refresh_slot_info(osb);
@@ -2608,14 +2869,25 @@ int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex)
if (ocfs2_is_hard_readonly(osb))
return -EROFS;
+ if (ex)
+ down_write(&osb->nfs_sync_rwlock);
+ else
+ down_read(&osb->nfs_sync_rwlock);
+
if (ocfs2_mount_local(osb))
return 0;
status = ocfs2_cluster_lock(osb, lockres, ex ? LKM_EXMODE : LKM_PRMODE,
0, 0);
- if (status < 0)
+ if (status < 0) {
mlog(ML_ERROR, "lock on nfs sync lock failed %d\n", status);
+ if (ex)
+ up_write(&osb->nfs_sync_rwlock);
+ else
+ up_read(&osb->nfs_sync_rwlock);
+ }
+
return status;
}
@@ -2626,6 +2898,74 @@ void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex)
if (!ocfs2_mount_local(osb))
ocfs2_cluster_unlock(osb, lockres,
ex ? LKM_EXMODE : LKM_PRMODE);
+ if (ex)
+ up_write(&osb->nfs_sync_rwlock);
+ else
+ up_read(&osb->nfs_sync_rwlock);
+}
+
+int ocfs2_trim_fs_lock(struct ocfs2_super *osb,
+ struct ocfs2_trim_fs_info *info, int trylock)
+{
+ int status;
+ struct ocfs2_trim_fs_lvb *lvb;
+ struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
+
+ if (info)
+ info->tf_valid = 0;
+
+ if (ocfs2_is_hard_readonly(osb))
+ return -EROFS;
+
+ if (ocfs2_mount_local(osb))
+ return 0;
+
+ status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX,
+ trylock ? DLM_LKF_NOQUEUE : 0, 0);
+ if (status < 0) {
+ if (status != -EAGAIN)
+ mlog_errno(status);
+ return status;
+ }
+
+ if (info) {
+ lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+ if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) &&
+ lvb->lvb_version == OCFS2_TRIMFS_LVB_VERSION) {
+ info->tf_valid = 1;
+ info->tf_success = lvb->lvb_success;
+ info->tf_nodenum = be32_to_cpu(lvb->lvb_nodenum);
+ info->tf_start = be64_to_cpu(lvb->lvb_start);
+ info->tf_len = be64_to_cpu(lvb->lvb_len);
+ info->tf_minlen = be64_to_cpu(lvb->lvb_minlen);
+ info->tf_trimlen = be64_to_cpu(lvb->lvb_trimlen);
+ }
+ }
+
+ return status;
+}
+
+void ocfs2_trim_fs_unlock(struct ocfs2_super *osb,
+ struct ocfs2_trim_fs_info *info)
+{
+ struct ocfs2_trim_fs_lvb *lvb;
+ struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
+
+ if (ocfs2_mount_local(osb))
+ return;
+
+ if (info) {
+ lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+ lvb->lvb_version = OCFS2_TRIMFS_LVB_VERSION;
+ lvb->lvb_success = info->tf_success;
+ lvb->lvb_nodenum = cpu_to_be32(info->tf_nodenum);
+ lvb->lvb_start = cpu_to_be64(info->tf_start);
+ lvb->lvb_len = cpu_to_be64(info->tf_len);
+ lvb->lvb_minlen = cpu_to_be64(info->tf_minlen);
+ lvb->lvb_trimlen = cpu_to_be64(info->tf_trimlen);
+ }
+
+ ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
}
int ocfs2_dentry_lock(struct dentry *dentry, int ex)
@@ -2698,7 +3038,7 @@ struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void)
kref_init(&dlm_debug->d_refcnt);
INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking);
- dlm_debug->d_locking_state = NULL;
+ dlm_debug->d_filter_secs = 0;
out:
return dlm_debug;
}
@@ -2769,6 +3109,7 @@ static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos)
struct ocfs2_lock_res *iter = v;
struct ocfs2_lock_res *dummy = &priv->p_iter_res;
+ (*pos)++;
spin_lock(&ocfs2_dlm_tracking_lock);
iter = ocfs2_dlm_next_res(iter, priv);
list_del_init(&dummy->l_debug_list);
@@ -2789,17 +3130,40 @@ static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos)
* - Lock stats printed
* New in version 3
* - Max time in lock stats is in usecs (instead of nsecs)
+ * New in version 4
+ * - Add last pr/ex unlock times and first lock wait time in usecs
*/
-#define OCFS2_DLM_DEBUG_STR_VERSION 3
+#define OCFS2_DLM_DEBUG_STR_VERSION 4
static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
{
int i;
char *lvb;
struct ocfs2_lock_res *lockres = v;
+#ifdef CONFIG_OCFS2_FS_STATS
+ u64 now, last;
+ struct ocfs2_dlm_debug *dlm_debug =
+ ((struct ocfs2_dlm_seq_priv *)m->private)->p_dlm_debug;
+#endif
if (!lockres)
return -EINVAL;
+#ifdef CONFIG_OCFS2_FS_STATS
+ if (!lockres->l_lock_wait && dlm_debug->d_filter_secs) {
+ now = ktime_to_us(ktime_get_real());
+ last = max(lockres->l_lock_prmode.ls_last,
+ lockres->l_lock_exmode.ls_last);
+ /*
+ * Use d_filter_secs field to filter lock resources dump,
+ * the default d_filter_secs(0) value filters nothing,
+ * otherwise, only dump the last N seconds active lock
+ * resources.
+ */
+ if (div_u64(now - last, 1000000) > dlm_debug->d_filter_secs)
+ return 0;
+ }
+#endif
+
seq_printf(m, "0x%x\t", OCFS2_DLM_DEBUG_STR_VERSION);
if (lockres->l_type == OCFS2_LOCK_TYPE_DENTRY)
@@ -2841,6 +3205,9 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
# define lock_max_prmode(_l) ((_l)->l_lock_prmode.ls_max)
# define lock_max_exmode(_l) ((_l)->l_lock_exmode.ls_max)
# define lock_refresh(_l) ((_l)->l_lock_refresh)
+# define lock_last_prmode(_l) ((_l)->l_lock_prmode.ls_last)
+# define lock_last_exmode(_l) ((_l)->l_lock_exmode.ls_last)
+# define lock_wait(_l) ((_l)->l_lock_wait)
#else
# define lock_num_prmode(_l) (0)
# define lock_num_exmode(_l) (0)
@@ -2851,6 +3218,9 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
# define lock_max_prmode(_l) (0)
# define lock_max_exmode(_l) (0)
# define lock_refresh(_l) (0)
+# define lock_last_prmode(_l) (0ULL)
+# define lock_last_exmode(_l) (0ULL)
+# define lock_wait(_l) (0ULL)
#endif
/* The following seq_print was added in version 2 of this output */
seq_printf(m, "%u\t"
@@ -2861,7 +3231,10 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
"%llu\t"
"%u\t"
"%u\t"
- "%u\t",
+ "%u\t"
+ "%llu\t"
+ "%llu\t"
+ "%llu\t",
lock_num_prmode(lockres),
lock_num_exmode(lockres),
lock_num_prmode_failed(lockres),
@@ -2870,7 +3243,10 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
lock_total_exmode(lockres),
lock_max_prmode(lockres),
lock_max_exmode(lockres),
- lock_refresh(lockres));
+ lock_refresh(lockres),
+ lock_last_prmode(lockres),
+ lock_last_exmode(lockres),
+ lock_wait(lockres));
/* End the line */
seq_printf(m, "\n");
@@ -2897,37 +3273,24 @@ static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file)
static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
{
- int ret;
struct ocfs2_dlm_seq_priv *priv;
- struct seq_file *seq;
struct ocfs2_super *osb;
- priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL);
+ priv = __seq_open_private(file, &ocfs2_dlm_seq_ops, sizeof(*priv));
if (!priv) {
- ret = -ENOMEM;
- mlog_errno(ret);
- goto out;
+ mlog_errno(-ENOMEM);
+ return -ENOMEM;
}
+
osb = inode->i_private;
ocfs2_get_dlm_debug(osb->osb_dlm_debug);
priv->p_dlm_debug = osb->osb_dlm_debug;
INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list);
- ret = seq_open(file, &ocfs2_dlm_seq_ops);
- if (ret) {
- kfree(priv);
- mlog_errno(ret);
- goto out;
- }
-
- seq = file->private_data;
- seq->private = priv;
-
ocfs2_add_lockres_tracking(&priv->p_iter_res,
priv->p_dlm_debug);
-out:
- return ret;
+ return 0;
}
static const struct file_operations ocfs2_dlm_debug_fops = {
@@ -2937,36 +3300,24 @@ static const struct file_operations ocfs2_dlm_debug_fops = {
.llseek = seq_lseek,
};
-static int ocfs2_dlm_init_debug(struct ocfs2_super *osb)
+static void ocfs2_dlm_init_debug(struct ocfs2_super *osb)
{
- int ret = 0;
struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
- dlm_debug->d_locking_state = debugfs_create_file("locking_state",
- S_IFREG|S_IRUSR,
- osb->osb_debug_root,
- osb,
- &ocfs2_dlm_debug_fops);
- if (!dlm_debug->d_locking_state) {
- ret = -EINVAL;
- mlog(ML_ERROR,
- "Unable to create locking state debugfs file.\n");
- goto out;
- }
+ debugfs_create_file("locking_state", S_IFREG|S_IRUSR,
+ osb->osb_debug_root, osb, &ocfs2_dlm_debug_fops);
+ debugfs_create_u32("locking_filter", 0600, osb->osb_debug_root,
+ &dlm_debug->d_filter_secs);
ocfs2_get_dlm_debug(dlm_debug);
-out:
- return ret;
}
static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
{
struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
- if (dlm_debug) {
- debugfs_remove(dlm_debug->d_locking_state);
+ if (dlm_debug)
ocfs2_put_dlm_debug(dlm_debug);
- }
}
int ocfs2_dlm_init(struct ocfs2_super *osb)
@@ -2979,14 +3330,11 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
goto local;
}
- status = ocfs2_dlm_init_debug(osb);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_dlm_init_debug(osb);
/* launch downconvert thread */
- osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc");
+ osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc-%s",
+ osb->uuid_str);
if (IS_ERR(osb->dc_task)) {
status = PTR_ERR(osb->dc_task);
osb->dc_task = NULL;
@@ -2996,6 +3344,8 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
/* for now, uuid == domain */
status = ocfs2_cluster_connect(osb->osb_cluster_stack,
+ osb->osb_cluster_name,
+ strlen(osb->osb_cluster_name),
osb->uuid_str,
strlen(osb->uuid_str),
&lproto, ocfs2_do_node_down, osb,
@@ -3005,7 +3355,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
goto bail;
}
- status = ocfs2_cluster_this_node(&osb->node_num);
+ status = ocfs2_cluster_this_node(conn, &osb->node_num);
if (status < 0) {
mlog_errno(status);
mlog(ML_ERROR,
@@ -3017,12 +3367,10 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
local:
ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
- ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
+ ocfs2_nfs_sync_lock_init(osb);
ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb);
osb->cconn = conn;
-
- status = 0;
bail:
if (status < 0) {
ocfs2_dlm_shutdown_debug(osb);
@@ -3054,10 +3402,12 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
ocfs2_lock_res_free(&osb->osb_orphan_scan.os_lockres);
- ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
- osb->cconn = NULL;
+ if (osb->cconn) {
+ ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
+ osb->cconn = NULL;
- ocfs2_dlm_shutdown_debug(osb);
+ ocfs2_dlm_shutdown_debug(osb);
+ }
}
static int ocfs2_drop_lock(struct ocfs2_super *osb,
@@ -3142,22 +3492,60 @@ out:
return 0;
}
+static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres);
+
/* Mark the lockres as being dropped. It will no longer be
* queued if blocking, but we still may have to wait on it
* being dequeued from the downconvert thread before we can consider
* it safe to drop.
*
* You can *not* attempt to call cluster_lock on this lockres anymore. */
-void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
+void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres)
{
int status;
struct ocfs2_mask_waiter mw;
- unsigned long flags;
+ unsigned long flags, flags2;
ocfs2_init_mask_waiter(&mw);
spin_lock_irqsave(&lockres->l_lock, flags);
lockres->l_flags |= OCFS2_LOCK_FREEING;
+ if (lockres->l_flags & OCFS2_LOCK_QUEUED && current == osb->dc_task) {
+ /*
+ * We know the downconvert is queued but not in progress
+ * because we are the downconvert thread and processing
+ * different lock. So we can just remove the lock from the
+ * queue. This is not only an optimization but also a way
+ * to avoid the following deadlock:
+ * ocfs2_dentry_post_unlock()
+ * ocfs2_dentry_lock_put()
+ * ocfs2_drop_dentry_lock()
+ * iput()
+ * ocfs2_evict_inode()
+ * ocfs2_clear_inode()
+ * ocfs2_mark_lockres_freeing()
+ * ... blocks waiting for OCFS2_LOCK_QUEUED
+ * since we are the downconvert thread which
+ * should clear the flag.
+ */
+ spin_unlock_irqrestore(&lockres->l_lock, flags);
+ spin_lock_irqsave(&osb->dc_task_lock, flags2);
+ list_del_init(&lockres->l_blocked_list);
+ osb->blocked_lock_count--;
+ spin_unlock_irqrestore(&osb->dc_task_lock, flags2);
+ /*
+ * Warn if we recurse into another post_unlock call. Strictly
+ * speaking it isn't a problem but we need to be careful if
+ * that happens (stack overflow, deadlocks, ...) so warn if
+ * ocfs2 grows a path for which this can happen.
+ */
+ WARN_ON_ONCE(lockres->l_ops->post_unlock);
+ /* Since the lock is freeing we don't do much in the fn below */
+ ocfs2_process_blocked_lock(osb, lockres);
+ return;
+ }
while (lockres->l_flags & OCFS2_LOCK_QUEUED) {
lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0);
spin_unlock_irqrestore(&lockres->l_lock, flags);
@@ -3178,7 +3566,7 @@ void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
{
int ret;
- ocfs2_mark_lockres_freeing(lockres);
+ ocfs2_mark_lockres_freeing(osb, lockres);
ret = ocfs2_drop_lock(osb, lockres);
if (ret)
mlog_errno(ret);
@@ -3264,6 +3652,16 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name,
lockres->l_level, new_level);
+ /*
+ * On DLM_LKF_VALBLK, fsdlm behaves differently with o2cb. It always
+ * expects DLM_LKF_VALBLK being set if the LKB has LVB, so that
+ * we can recover correctly from node failure. Otherwise, we may get
+ * invalid LVB in LKB, but without DLM_SBF_VALNOTVALID being set.
+ */
+ if (ocfs2_userspace_stack(osb) &&
+ lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
+ lvb = 1;
+
if (lvb)
dlm_flags |= DLM_LKF_VALBLK;
@@ -3403,9 +3801,9 @@ recheck:
* set when the ast is received for an upconvert just before the
* OCFS2_LOCK_BUSY flag is cleared. Now if the fs received a bast
* on the heels of the ast, we want to delay the downconvert just
- * enough to allow the up requestor to do its task. Because this
+ * enough to allow the up requester to do its task. Because this
* lock is in the blocked queue, the lock will be downconverted
- * as soon as the requestor is done with the lock.
+ * as soon as the requester is done with the lock.
*/
if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING)
goto leave_requeue;
@@ -3516,6 +3914,17 @@ downconvert:
spin_unlock_irqrestore(&lockres->l_lock, flags);
ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb,
gen);
+ /* The dlm lock convert is being cancelled in background,
+ * ocfs2_cancel_convert() is asynchronous in fs/dlm,
+ * requeue it, try again later.
+ */
+ if (ret == -EBUSY) {
+ ctl->requeue = 1;
+ mlog(ML_BASTS, "lockres %s, ReQ: Downconvert busy\n",
+ lockres->l_name);
+ ret = 0;
+ msleep(20);
+ }
leave:
if (ret)
@@ -3543,7 +3952,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
oi = OCFS2_I(inode);
oi->ip_dir_lock_gen++;
mlog(0, "generation: %u\n", oi->ip_dir_lock_gen);
- goto out;
+ goto out_forget;
}
if (!S_ISREG(inode->i_mode))
@@ -3574,6 +3983,9 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
filemap_fdatawait(mapping);
}
+out_forget:
+ forget_all_cached_acls(inode);
+
out:
return UNBLOCK_CONTINUE;
}
@@ -3703,8 +4115,10 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
break;
spin_unlock(&dentry_attach_lock);
- mlog(0, "d_delete(%.*s);\n", dentry->d_name.len,
- dentry->d_name.name);
+ if (S_ISDIR(dl->dl_inode->i_mode))
+ shrink_dcache_parent(dentry);
+
+ mlog(0, "d_delete(%pd);\n", dentry);
/*
* The following dcache calls may do an
@@ -3924,7 +4338,7 @@ unqueue:
ocfs2_schedule_blocked_lock(osb, lockres);
mlog(ML_BASTS, "lockres %s, requeue = %s.\n", lockres->l_name,
- ctl.requeue ? "yes" : "no");
+ str_yes_no(ctl.requeue));
spin_unlock_irqrestore(&lockres->l_lock, flags);
if (ctl.unblock_action != UNBLOCK_CONTINUE
@@ -3971,9 +4385,13 @@ static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
osb->dc_work_sequence = osb->dc_wake_sequence;
processed = osb->blocked_lock_count;
- while (processed) {
- BUG_ON(list_empty(&osb->blocked_lock_list));
-
+ /*
+ * blocked lock processing in this loop might call iput which can
+ * remove items off osb->blocked_lock_list. Downconvert up to
+ * 'processed' number of locks, but stop short if we had some
+ * removed in ocfs2_mark_lockres_freeing when downconverting.
+ */
+ while (processed && !list_empty(&osb->blocked_lock_list)) {
lockres = list_entry(osb->blocked_lock_list.next,
struct ocfs2_lock_res, l_blocked_list);
list_del_init(&lockres->l_blocked_list);
@@ -4018,7 +4436,6 @@ static int ocfs2_downconvert_thread_should_wake(struct ocfs2_super *osb)
static int ocfs2_downconvert_thread(void *arg)
{
- int status = 0;
struct ocfs2_super *osb = arg;
/* only quit once we've been asked to stop and there is no more
@@ -4036,7 +4453,7 @@ static int ocfs2_downconvert_thread(void *arg)
}
osb->dc_task = NULL;
- return status;
+ return 0;
}
void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb)
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 1d596d8c4a4a..a3ebd7303ea2 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -1,26 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* dlmglue.h
*
* description here
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
@@ -70,6 +54,35 @@ struct ocfs2_orphan_scan_lvb {
__be32 lvb_os_seqno;
};
+#define OCFS2_TRIMFS_LVB_VERSION 1
+
+struct ocfs2_trim_fs_lvb {
+ __u8 lvb_version;
+ __u8 lvb_success;
+ __u8 lvb_reserved[2];
+ __be32 lvb_nodenum;
+ __be64 lvb_start;
+ __be64 lvb_len;
+ __be64 lvb_minlen;
+ __be64 lvb_trimlen;
+};
+
+struct ocfs2_trim_fs_info {
+ u8 tf_valid; /* lvb is valid, or not */
+ u8 tf_success; /* trim is successful, or not */
+ u32 tf_nodenum; /* osb node number */
+ u64 tf_start; /* trim start offset in clusters */
+ u64 tf_len; /* trim end offset in clusters */
+ u64 tf_minlen; /* trim minimum contiguous free clusters */
+ u64 tf_trimlen; /* trimmed length in bytes */
+};
+
+struct ocfs2_lock_holder {
+ struct list_head oh_list;
+ struct pid *oh_owner_pid;
+ int oh_ex;
+};
+
/* ocfs2_inode_lock_full() 'arg_flags' flags */
/* don't wait on recovery. */
#define OCFS2_META_LOCK_RECOVERY (0x01)
@@ -77,6 +90,8 @@ struct ocfs2_orphan_scan_lvb {
#define OCFS2_META_LOCK_NOQUEUE (0x02)
/* don't block waiting for the downconvert thread, instead return -EAGAIN */
#define OCFS2_LOCK_NONBLOCK (0x04)
+/* just get back disk inode bh if we've got cluster lock. */
+#define OCFS2_META_LOCK_GETBH (0x08)
/* Locking subclasses of inode cluster lock */
enum {
@@ -109,22 +124,21 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
int ocfs2_create_new_inode_locks(struct inode *inode);
int ocfs2_drop_inode_locks(struct inode *inode);
int ocfs2_rw_lock(struct inode *inode, int write);
+int ocfs2_try_rw_lock(struct inode *inode, int write);
void ocfs2_rw_unlock(struct inode *inode, int write);
int ocfs2_open_lock(struct inode *inode);
int ocfs2_try_open_lock(struct inode *inode, int write);
void ocfs2_open_unlock(struct inode *inode);
int ocfs2_inode_lock_atime(struct inode *inode,
struct vfsmount *vfsmnt,
- int *level);
+ int *level, int wait);
int ocfs2_inode_lock_full_nested(struct inode *inode,
struct buffer_head **ret_bh,
int ex,
int arg_flags,
int subclass);
-int ocfs2_inode_lock_with_page(struct inode *inode,
- struct buffer_head **ret_bh,
- int ex,
- struct page *page);
+int ocfs2_inode_lock_with_folio(struct inode *inode,
+ struct buffer_head **ret_bh, int ex, struct folio *folio);
/* Variants without special locking class or flags */
#define ocfs2_inode_lock_full(i, r, e, f)\
ocfs2_inode_lock_full_nested(i, r, e, f, OI_LS_NORMAL)
@@ -133,6 +147,9 @@ int ocfs2_inode_lock_with_page(struct inode *inode,
/* 99% of the time we don't want to supply any additional flags --
* those are for very specific cases only. */
#define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full_nested(i, b, e, 0, OI_LS_NORMAL)
+#define ocfs2_try_inode_lock(i, b, e)\
+ ocfs2_inode_lock_full_nested(i, b, e, OCFS2_META_LOCK_NOQUEUE,\
+ OI_LS_NORMAL)
void ocfs2_inode_unlock(struct inode *inode,
int ex);
int ocfs2_super_lock(struct ocfs2_super *osb,
@@ -146,6 +163,12 @@ int ocfs2_rename_lock(struct ocfs2_super *osb);
void ocfs2_rename_unlock(struct ocfs2_super *osb);
int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex);
+void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb);
+void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb);
+int ocfs2_trim_fs_lock(struct ocfs2_super *osb,
+ struct ocfs2_trim_fs_info *info, int trylock);
+void ocfs2_trim_fs_unlock(struct ocfs2_super *osb,
+ struct ocfs2_trim_fs_info *info);
int ocfs2_dentry_lock(struct dentry *dentry, int ex);
void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
int ocfs2_file_lock(struct file *file, int ex, int trylock);
@@ -157,7 +180,8 @@ int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex);
void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex);
-void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
+void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres);
void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres);
@@ -169,4 +193,15 @@ void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
/* To set the locking protocol on module initialization */
void ocfs2_set_locking_protocol(void);
+
+/* The _tracker pair is used to avoid cluster recursive locking */
+int ocfs2_inode_lock_tracker(struct inode *inode,
+ struct buffer_head **ret_bh,
+ int ex,
+ struct ocfs2_lock_holder *oh);
+void ocfs2_inode_unlock_tracker(struct inode *inode,
+ int ex,
+ struct ocfs2_lock_holder *oh,
+ int had_lock);
+
#endif /* DLMGLUE_H */
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 29651167190d..b95724b767e1 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -1,26 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* export.c
*
* Functions to facilitate NFS exporting
*
* Copyright (C) 2002, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
@@ -82,7 +66,6 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
}
status = ocfs2_test_inode_bit(osb, blkno, &set);
- trace_ocfs2_get_dentry_test_bit(status, set);
if (status < 0) {
if (status == -EINVAL) {
/*
@@ -96,6 +79,7 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
goto unlock_nfs_sync;
}
+ trace_ocfs2_get_dentry_test_bit(status, set);
/* If the inode allocator bit is clear, this inode must be stale */
if (!set) {
status = -ESTALE;
@@ -119,16 +103,16 @@ check_err:
if (IS_ERR(inode)) {
mlog_errno(PTR_ERR(inode));
- result = (void *)inode;
+ result = ERR_CAST(inode);
goto bail;
}
check_gen:
if (handle->ih_generation != inode->i_generation) {
- iput(inode);
trace_ocfs2_get_dentry_generation((unsigned long long)blkno,
handle->ih_generation,
inode->i_generation);
+ iput(inode);
result = ERR_PTR(-ESTALE);
goto bail;
}
@@ -147,17 +131,25 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
int status;
u64 blkno;
struct dentry *parent;
- struct inode *dir = child->d_inode;
+ struct inode *dir = d_inode(child);
+ int set;
trace_ocfs2_get_parent(child, child->d_name.len, child->d_name.name,
(unsigned long long)OCFS2_I(dir)->ip_blkno);
+ status = ocfs2_nfs_sync_lock(OCFS2_SB(dir->i_sb), 1);
+ if (status < 0) {
+ mlog(ML_ERROR, "getting nfs sync lock(EX) failed %d\n", status);
+ parent = ERR_PTR(status);
+ goto bail;
+ }
+
status = ocfs2_inode_lock(dir, NULL, 0);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
parent = ERR_PTR(status);
- goto bail;
+ goto unlock_nfs_sync;
}
status = ocfs2_lookup_ino_from_name(dir, "..", 2, &blkno);
@@ -166,11 +158,31 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
goto bail_unlock;
}
+ status = ocfs2_test_inode_bit(OCFS2_SB(dir->i_sb), blkno, &set);
+ if (status < 0) {
+ if (status == -EINVAL) {
+ status = -ESTALE;
+ } else
+ mlog(ML_ERROR, "test inode bit failed %d\n", status);
+ parent = ERR_PTR(status);
+ goto bail_unlock;
+ }
+
+ trace_ocfs2_get_dentry_test_bit(status, set);
+ if (!set) {
+ status = -ESTALE;
+ parent = ERR_PTR(status);
+ goto bail_unlock;
+ }
+
parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0));
bail_unlock:
ocfs2_inode_unlock(dir, 0);
+unlock_nfs_sync:
+ ocfs2_nfs_sync_unlock(OCFS2_SB(dir->i_sb), 1);
+
bail:
trace_ocfs2_get_parent_end(parent);
@@ -243,9 +255,9 @@ static struct dentry *ocfs2_fh_to_dentry(struct super_block *sb,
if (fh_len < 3 || fh_type > 2)
return NULL;
- handle.ih_blkno = (u64)le32_to_cpu(fid->raw[0]) << 32;
- handle.ih_blkno |= (u64)le32_to_cpu(fid->raw[1]);
- handle.ih_generation = le32_to_cpu(fid->raw[2]);
+ handle.ih_blkno = (u64)le32_to_cpu((__force __le32)fid->raw[0]) << 32;
+ handle.ih_blkno |= (u64)le32_to_cpu((__force __le32)fid->raw[1]);
+ handle.ih_generation = le32_to_cpu((__force __le32)fid->raw[2]);
return ocfs2_get_dentry(sb, &handle);
}
@@ -257,9 +269,9 @@ static struct dentry *ocfs2_fh_to_parent(struct super_block *sb,
if (fh_type != 2 || fh_len < 6)
return NULL;
- parent.ih_blkno = (u64)le32_to_cpu(fid->raw[3]) << 32;
- parent.ih_blkno |= (u64)le32_to_cpu(fid->raw[4]);
- parent.ih_generation = le32_to_cpu(fid->raw[5]);
+ parent.ih_blkno = (u64)le32_to_cpu((__force __le32)fid->raw[3]) << 32;
+ parent.ih_blkno |= (u64)le32_to_cpu((__force __le32)fid->raw[4]);
+ parent.ih_generation = le32_to_cpu((__force __le32)fid->raw[5]);
return ocfs2_get_dentry(sb, &parent);
}
diff --git a/fs/ocfs2/export.h b/fs/ocfs2/export.h
index 41a738678c37..636357400505 100644
--- a/fs/ocfs2/export.h
+++ b/fs/ocfs2/export.h
@@ -1,26 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* export.h
*
* Function prototypes
*
* Copyright (C) 2002, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_EXPORT_H
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 2487116d0d33..ef147e8b3271 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -1,25 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-only
+/*
* extent_map.c
*
* Block/Cluster mapping functions
*
* Copyright (C) 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License, version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
@@ -38,6 +23,7 @@
#include "inode.h"
#include "super.h"
#include "symlink.h"
+#include "aops.h"
#include "ocfs2_trace.h"
#include "buffer_head_io.h"
@@ -305,8 +291,8 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
if (el->l_tree_depth) {
ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "leaf block %llu\n", inode->i_ino,
+ "Inode %lu has non zero tree depth in leaf block %llu\n",
+ inode->i_ino,
(unsigned long long)eb_bh->b_blocknr);
ret = -EROFS;
goto out;
@@ -415,7 +401,7 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
{
int i, ret, tree_height, len;
struct ocfs2_dinode *di;
- struct ocfs2_extent_block *uninitialized_var(eb);
+ struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
struct ocfs2_extent_rec *rec;
struct buffer_head *eb_bh = NULL;
@@ -441,14 +427,24 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
if (el->l_tree_depth) {
ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "leaf block %llu\n", inode->i_ino,
+ "Inode %lu has non zero tree depth in leaf block %llu\n",
+ inode->i_ino,
(unsigned long long)eb_bh->b_blocknr);
ret = -EROFS;
goto out;
}
}
+ if (le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)) {
+ ocfs2_error(inode->i_sb,
+ "Inode %lu has an invalid extent (next_free_rec %u, count %u)\n",
+ inode->i_ino,
+ le16_to_cpu(el->l_next_free_rec),
+ le16_to_cpu(el->l_count));
+ ret = -EROFS;
+ goto out;
+ }
+
i = ocfs2_search_extent_list(el, v_cluster);
if (i == -1) {
/*
@@ -475,8 +471,9 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
if (!rec->e_blkno) {
- ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
- "record (%u, %u, 0)", inode->i_ino,
+ ocfs2_error(inode->i_sb,
+ "Inode %lu has bad extent record (%u, %u, 0)\n",
+ inode->i_ino,
le32_to_cpu(rec->e_cpos),
ocfs2_rec_clusters(el, rec));
ret = -EROFS;
@@ -564,8 +561,8 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
if (el->l_tree_depth) {
ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "xattr leaf block %llu\n", inode->i_ino,
+ "Inode %lu has non zero tree depth in xattr leaf block %llu\n",
+ inode->i_ino,
(unsigned long long)eb_bh->b_blocknr);
ret = -EROFS;
goto out;
@@ -582,8 +579,9 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
if (!rec->e_blkno) {
- ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
- "record (%u, %u, 0) in xattr", inode->i_ino,
+ ocfs2_error(inode->i_sb,
+ "Inode %lu has bad extent record (%u, %u, 0) in xattr\n",
+ inode->i_ino,
le32_to_cpu(rec->e_cpos),
ocfs2_rec_clusters(el, rec));
ret = -EROFS;
@@ -600,8 +598,7 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
*extent_flags = rec->e_flags;
}
out:
- if (eb_bh)
- brelse(eb_bh);
+ brelse(eb_bh);
return ret;
}
@@ -610,7 +607,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
unsigned int *extent_flags)
{
int ret;
- unsigned int uninitialized_var(hole_len), flags = 0;
+ unsigned int hole_len, flags = 0;
struct buffer_head *di_bh = NULL;
struct ocfs2_extent_rec rec;
@@ -709,6 +706,8 @@ out:
* it not only handles the fiemap for inlined files, but also deals
* with the fast symlink, cause they have no difference for extent
* mapping per se.
+ *
+ * Must be called with ip_alloc_sem semaphore held.
*/
static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
struct fiemap_extent_info *fieinfo,
@@ -720,6 +719,7 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
u64 phys;
u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ lockdep_assert_held_read(&oi->ip_alloc_sem);
di = (struct ocfs2_dinode *)di_bh->b_data;
if (ocfs2_inode_is_fast_symlink(inode))
@@ -735,8 +735,11 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
phys += offsetof(struct ocfs2_dinode,
id2.i_data.id_data);
+ /* Release the ip_alloc_sem to prevent deadlock on page fault */
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count,
flags);
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
if (ret < 0)
return ret;
}
@@ -744,8 +747,6 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
return 0;
}
-#define OCFS2_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
-
int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 map_start, u64 map_len)
{
@@ -757,7 +758,7 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
struct buffer_head *di_bh = NULL;
struct ocfs2_extent_rec rec;
- ret = fiemap_check_flags(fieinfo, OCFS2_FIEMAP_FLAGS);
+ ret = fiemap_prep(inode, fieinfo, map_start, &map_len, 0);
if (ret)
return ret;
@@ -781,7 +782,6 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
cpos = map_start >> osb->s_clustersize_bits;
mapping_end = ocfs2_clusters_for_bytes(inode->i_sb,
map_start + map_len);
- mapping_end -= cpos;
is_last = 0;
while (cpos < mapping_end && !is_last) {
u32 fe_flags;
@@ -808,9 +808,11 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits;
phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits;
virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits;
-
+ /* Release the ip_alloc_sem to prevent deadlock on page fault */
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes,
len_bytes, fe_flags);
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
if (ret)
break;
@@ -831,6 +833,50 @@ out:
return ret;
}
+/* Is IO overwriting allocated blocks? */
+int ocfs2_overwrite_io(struct inode *inode, struct buffer_head *di_bh,
+ u64 map_start, u64 map_len)
+{
+ int ret = 0, is_last;
+ u32 mapping_end, cpos;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_extent_rec rec;
+
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ if (ocfs2_size_fits_inline_data(di_bh, map_start + map_len))
+ return ret;
+ else
+ return -EAGAIN;
+ }
+
+ cpos = map_start >> osb->s_clustersize_bits;
+ mapping_end = ocfs2_clusters_for_bytes(inode->i_sb,
+ map_start + map_len);
+ is_last = 0;
+ while (cpos < mapping_end && !is_last) {
+ ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos,
+ NULL, &rec, &is_last);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (rec.e_blkno == 0ULL)
+ break;
+
+ if (rec.e_flags & OCFS2_EXT_REFCOUNTED)
+ break;
+
+ cpos = le32_to_cpu(rec.e_cpos) +
+ le16_to_cpu(rec.e_leaf_clusters);
+ }
+
+ if (cpos < mapping_end)
+ ret = -EAGAIN;
+out:
+ return ret;
+}
+
int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence)
{
struct inode *inode = file->f_mapping->host;
@@ -852,20 +898,20 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence)
down_read(&OCFS2_I(inode)->ip_alloc_sem);
- if (*offset >= inode->i_size) {
+ if (*offset >= i_size_read(inode)) {
ret = -ENXIO;
goto out_unlock;
}
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
if (whence == SEEK_HOLE)
- *offset = inode->i_size;
+ *offset = i_size_read(inode);
goto out_unlock;
}
clen = 0;
cpos = *offset >> cs_bits;
- cend = ocfs2_clusters_for_bytes(inode->i_sb, inode->i_size);
+ cend = ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
while (cpos < cend && !is_last) {
ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, &hole_size,
@@ -904,8 +950,8 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence)
extlen = clen;
extlen <<= cs_bits;
- if ((extoff + extlen) > inode->i_size)
- extlen = inode->i_size - extoff;
+ if ((extoff + extlen) > i_size_read(inode))
+ extlen = i_size_read(inode) - extoff;
extoff += extlen;
if (extoff > *offset)
*offset = extoff;
@@ -945,7 +991,13 @@ int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
}
while (done < nr) {
- down_read(&OCFS2_I(inode)->ip_alloc_sem);
+ if (!down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem)) {
+ rc = -EAGAIN;
+ mlog(ML_ERROR,
+ "Inode #%llu ip_alloc_sem is temporarily unavailable\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ break;
+ }
rc = ocfs2_extent_map_get_blocks(inode, v_block + done,
&p_block, &p_count, NULL);
up_read(&OCFS2_I(inode)->ip_alloc_sem);
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index 67ea57d2fd59..bc4ed59fb925 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -1,25 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
* extent_map.h
*
* In-memory file extent mappings for OCFS2.
*
* Copyright (C) 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License, version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#ifndef _EXTENT_MAP_H
@@ -53,6 +38,9 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 map_start, u64 map_len);
+int ocfs2_overwrite_io(struct inode *inode, struct buffer_head *di_bh,
+ u64 map_start, u64 map_len);
+
int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin);
int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 41000f223ca4..21d797ccccd0 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1,26 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* file.c
*
* File open, close, extend, truncate
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/capability.h>
@@ -37,6 +21,7 @@
#include <linux/falloc.h>
#include <linux/quotaops.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <cluster/masklog.h>
@@ -100,19 +85,22 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
struct ocfs2_inode_info *oi = OCFS2_I(inode);
trace_ocfs2_file_open(inode, file, file->f_path.dentry,
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)oi->ip_blkno,
file->f_path.dentry->d_name.len,
file->f_path.dentry->d_name.name, mode);
- if (file->f_mode & FMODE_WRITE)
- dquot_initialize(inode);
+ if (file->f_mode & FMODE_WRITE) {
+ status = dquot_initialize(inode);
+ if (status)
+ goto leave;
+ }
spin_lock(&oi->ip_lock);
/* Check that the inode hasn't been wiped from disk by another
* node. If it hasn't then we're safe as long as we hold the
* spin lock until our increment of open count. */
- if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
+ if (oi->ip_flags & OCFS2_INODE_DELETED) {
spin_unlock(&oi->ip_lock);
status = -ENOENT;
@@ -136,6 +124,8 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
spin_unlock(&oi->ip_lock);
}
+ file->f_mode |= FMODE_NOWAIT;
+
leave:
return status;
}
@@ -175,43 +165,40 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
int datasync)
{
int err = 0;
- journal_t *journal;
struct inode *inode = file->f_mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ journal_t *journal = osb->journal->j_journal;
+ int ret;
+ tid_t commit_tid;
+ bool needs_barrier = false;
trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
- OCFS2_I(inode)->ip_blkno,
+ oi->ip_blkno,
file->f_path.dentry->d_name.len,
file->f_path.dentry->d_name.name,
(unsigned long long)datasync);
- err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+ return -EROFS;
+
+ err = file_write_and_wait_range(file, start, end);
if (err)
return err;
- /*
- * Probably don't need the i_mutex at all in here, just putting it here
- * to be consistent with how fsync used to be called, someone more
- * familiar with the fs could possibly remove it.
- */
- mutex_lock(&inode->i_mutex);
- if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
- /*
- * We still have to flush drive's caches to get data to the
- * platter
- */
- if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
- blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
- goto bail;
+ commit_tid = datasync ? oi->i_datasync_tid : oi->i_sync_tid;
+ if (journal->j_flags & JBD2_BARRIER &&
+ !jbd2_trans_will_send_data_barrier(journal, commit_tid))
+ needs_barrier = true;
+ err = jbd2_complete_transaction(journal, commit_tid);
+ if (needs_barrier) {
+ ret = blkdev_issue_flush(inode->i_sb->s_bdev);
+ if (!err)
+ err = ret;
}
- journal = osb->journal->j_journal;
- err = jbd2_journal_force_commit(journal);
-
-bail:
if (err)
mlog_errno(err);
- mutex_unlock(&inode->i_mutex);
return (err < 0) ? -EIO : 0;
}
@@ -219,14 +206,14 @@ bail:
int ocfs2_should_update_atime(struct inode *inode,
struct vfsmount *vfsmnt)
{
- struct timespec now;
+ struct timespec64 now;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
return 0;
if ((inode->i_flags & S_NOATIME) ||
- ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)))
+ ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode)))
return 0;
/*
@@ -245,15 +232,19 @@ int ocfs2_should_update_atime(struct inode *inode,
return 0;
if (vfsmnt->mnt_flags & MNT_RELATIME) {
- if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) ||
- (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0))
+ struct timespec64 ctime = inode_get_ctime(inode);
+ struct timespec64 atime = inode_get_atime(inode);
+ struct timespec64 mtime = inode_get_mtime(inode);
+
+ if ((timespec64_compare(&atime, &mtime) <= 0) ||
+ (timespec64_compare(&atime, &ctime) <= 0))
return 1;
return 0;
}
- now = CURRENT_TIME;
- if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum))
+ now = current_time(inode);
+ if ((now.tv_sec - inode_get_atime_sec(inode) <= osb->s_atime_quantum))
return 0;
else
return 1;
@@ -283,21 +274,22 @@ int ocfs2_update_inode_atime(struct inode *inode,
/*
* Don't use ocfs2_mark_inode_dirty() here as we don't always
- * have i_mutex to guard against concurrent changes to other
+ * have i_rwsem to guard against concurrent changes to other
* inode fields.
*/
- inode->i_atime = CURRENT_TIME;
- di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
- di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
+ inode_set_atime_to_ts(inode, current_time(inode));
+ di->i_atime = cpu_to_le64(inode_get_atime_sec(inode));
+ di->i_atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode));
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_journal_dirty(handle, bh);
out_commit:
- ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+ ocfs2_commit_trans(osb, handle);
out:
return ret;
}
-static int ocfs2_set_inode_size(handle_t *handle,
+int ocfs2_set_inode_size(handle_t *handle,
struct inode *inode,
struct buffer_head *fe_bh,
u64 new_i_size)
@@ -306,7 +298,7 @@ static int ocfs2_set_inode_size(handle_t *handle,
i_size_write(inode, new_i_size);
inode->i_blocks = ocfs2_inode_sector_count(inode);
- inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
if (status < 0) {
@@ -338,6 +330,7 @@ int ocfs2_simple_size_update(struct inode *inode,
if (ret < 0)
mlog_errno(ret);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_commit_trans(osb, handle);
out:
return ret;
@@ -370,7 +363,7 @@ static int ocfs2_cow_file_pos(struct inode *inode,
if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
goto out;
- return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1);
+ return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1);
out:
return status;
@@ -426,12 +419,13 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
}
i_size_write(inode, new_i_size);
- inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
di = (struct ocfs2_dinode *) fe_bh->b_data;
di->i_size = cpu_to_le64(new_i_size);
- di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
- di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(inode));
+ di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_journal_dirty(handle, fe_bh);
@@ -441,7 +435,7 @@ out:
return status;
}
-static int ocfs2_truncate_file(struct inode *inode,
+int ocfs2_truncate_file(struct inode *inode,
struct buffer_head *di_bh,
u64 new_i_size)
{
@@ -474,11 +468,6 @@ static int ocfs2_truncate_file(struct inode *inode,
goto bail;
}
- /* lets handle the simple truncate cases before doing any more
- * cluster locking. */
- if (new_i_size == le64_to_cpu(fe->i_size))
- goto bail;
-
down_write(&OCFS2_I(inode)->ip_alloc_sem);
ocfs2_resv_discard(&osb->osb_la_resmap,
@@ -491,10 +480,11 @@ static int ocfs2_truncate_file(struct inode *inode,
* greater than page size, so we have to truncate them
* anyway.
*/
- unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
- truncate_inode_pages(inode->i_mapping, new_i_size);
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ unmap_mapping_range(inode->i_mapping,
+ new_i_size + PAGE_SIZE - 1, 0, 1);
+ truncate_inode_pages(inode->i_mapping, new_i_size);
status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
i_size_read(inode), 1);
if (status)
@@ -513,6 +503,9 @@ static int ocfs2_truncate_file(struct inode *inode,
goto bail_unlock_sem;
}
+ unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
+ truncate_inode_pages(inode->i_mapping, new_i_size);
+
status = ocfs2_commit_truncate(osb, inode, di_bh);
if (status < 0) {
mlog_errno(status);
@@ -551,19 +544,16 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
struct ocfs2_alloc_context *meta_ac,
enum ocfs2_alloc_restarted *reason_ret)
{
- int ret;
struct ocfs2_extent_tree et;
ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh);
- ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset,
- clusters_to_add, mark_unwritten,
- data_ac, meta_ac, reason_ret);
-
- return ret;
+ return ocfs2_add_clusters_in_btree(handle, &et, logical_offset,
+ clusters_to_add, mark_unwritten,
+ data_ac, meta_ac, reason_ret);
}
-static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
- u32 clusters_to_add, int mark_unwritten)
+static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
+ u32 clusters_to_add, int mark_unwritten)
{
int status = 0;
int restart_func = 0;
@@ -574,13 +564,13 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
handle_t *handle = NULL;
struct ocfs2_alloc_context *data_ac = NULL;
struct ocfs2_alloc_context *meta_ac = NULL;
- enum ocfs2_alloc_restarted why;
+ enum ocfs2_alloc_restarted why = RESTART_NONE;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_extent_tree et;
int did_quota = 0;
/*
- * This function only exists for file systems which don't
+ * Unwritten extent only exists for file systems which
* support holes.
*/
BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
@@ -603,8 +593,7 @@ restart_all:
goto leave;
}
- credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list,
- clusters_to_add);
+ credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list);
handle = ocfs2_start_trans(osb, credits);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
@@ -653,7 +642,7 @@ restarted_transaction:
mlog_errno(status);
goto leave;
}
-
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_journal_dirty(handle, bh);
spin_lock(&OCFS2_I(inode)->ip_lock);
@@ -671,11 +660,7 @@ restarted_transaction:
} else {
BUG_ON(why != RESTART_TRANS);
- /* TODO: This can be more intelligent. */
- credits = ocfs2_calc_extend_credits(osb->sb,
- &fe->id2.i_list,
- clusters_to_add);
- status = ocfs2_extend_trans(handle, credits);
+ status = ocfs2_allocate_extend_trans(handle, 1);
if (status < 0) {
/* handle still has to be committed at
* this point. */
@@ -723,7 +708,10 @@ leave:
* While a write will already be ordering the data, a truncate will not.
* Thus, we need to explicitly order the zeroed pages.
*/
-static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)
+static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode,
+ struct buffer_head *di_bh,
+ loff_t start_byte,
+ loff_t length)
{
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
handle_t *handle = NULL;
@@ -739,9 +727,17 @@ static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)
goto out;
}
- ret = ocfs2_jbd2_file_inode(handle, inode);
- if (ret < 0)
+ ret = ocfs2_jbd2_inode_add_write(handle, inode, start_byte, length);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret)
mlog_errno(ret);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
out:
if (ret) {
@@ -756,31 +752,41 @@ out:
* to be too fragile to do exactly what we need without us having to
* worry about recursive locking in ->write_begin() and ->write_end(). */
static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
- u64 abs_to)
+ u64 abs_to, struct buffer_head *di_bh)
{
struct address_space *mapping = inode->i_mapping;
- struct page *page;
- unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
- handle_t *handle = NULL;
+ struct folio *folio;
+ unsigned long index = abs_from >> PAGE_SHIFT;
+ handle_t *handle;
int ret = 0;
unsigned zero_from, zero_to, block_start, block_end;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
BUG_ON(abs_from >= abs_to);
- BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
+ BUG_ON(abs_to > (((u64)index + 1) << PAGE_SHIFT));
BUG_ON(abs_from & (inode->i_blkbits - 1));
- page = find_or_create_page(mapping, index, GFP_NOFS);
- if (!page) {
- ret = -ENOMEM;
- mlog_errno(ret);
+ handle = ocfs2_zero_start_ordered_transaction(inode, di_bh,
+ abs_from,
+ abs_to - abs_from);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
goto out;
}
- /* Get the offsets within the page that we want to zero */
- zero_from = abs_from & (PAGE_CACHE_SIZE - 1);
- zero_to = abs_to & (PAGE_CACHE_SIZE - 1);
+ folio = __filemap_get_folio(mapping, index,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_NOFS);
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
+ mlog_errno(ret);
+ goto out_commit_trans;
+ }
+
+ /* Get the offsets within the folio that we want to zero */
+ zero_from = offset_in_folio(folio, abs_from);
+ zero_to = offset_in_folio(folio, abs_to);
if (!zero_to)
- zero_to = PAGE_CACHE_SIZE;
+ zero_to = folio_size(folio);
trace_ocfs2_write_zero_page(
(unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -791,44 +797,48 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
/* We know that zero_from is block aligned */
for (block_start = zero_from; block_start < zero_to;
block_start = block_end) {
- block_end = block_start + (1 << inode->i_blkbits);
+ block_end = block_start + i_blocksize(inode);
/*
* block_start is block-aligned. Bump it by one to force
* __block_write_begin and block_commit_write to zero the
* whole block.
*/
- ret = __block_write_begin(page, block_start + 1, 0,
+ ret = __block_write_begin(folio, block_start + 1, 0,
ocfs2_get_block);
if (ret < 0) {
mlog_errno(ret);
goto out_unlock;
}
- if (!handle) {
- handle = ocfs2_zero_start_ordered_transaction(inode);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- handle = NULL;
- break;
- }
- }
/* must not update i_size! */
- ret = block_commit_write(page, block_start + 1,
- block_start + 1);
- if (ret < 0)
- mlog_errno(ret);
- else
- ret = 0;
+ block_commit_write(folio, block_start + 1, block_start + 1);
}
- if (handle)
- ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+ /*
+ * fs-writeback will release the dirty pages without page lock
+ * whose offset are over inode size, the release happens at
+ * block_write_full_folio().
+ */
+ i_size_write(inode, abs_to);
+ inode->i_blocks = ocfs2_inode_sector_count(inode);
+ di->i_size = cpu_to_le64((u64)i_size_read(inode));
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+ di->i_mtime = di->i_ctime = cpu_to_le64(inode_get_mtime_sec(inode));
+ di->i_ctime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode));
+ di->i_mtime_nsec = di->i_ctime_nsec;
+ if (handle) {
+ ocfs2_journal_dirty(handle, di_bh);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
+ }
out_unlock:
- unlock_page(page);
- page_cache_release(page);
+ folio_unlock(folio);
+ folio_put(folio);
+out_commit_trans:
+ if (handle)
+ ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
out:
return ret;
}
@@ -899,7 +909,7 @@ static int ocfs2_zero_extend_get_range(struct inode *inode,
zero_clusters = last_cpos - zero_cpos;
if (needs_cow) {
- rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos,
+ rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos,
zero_clusters, UINT_MAX);
if (rc) {
mlog_errno(rc);
@@ -920,7 +930,7 @@ out:
* has made sure that the entire range needs zeroing.
*/
static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
- u64 range_end)
+ u64 range_end, struct buffer_head *di_bh)
{
int rc = 0;
u64 next_pos;
@@ -933,10 +943,10 @@ static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
BUG_ON(range_start >= range_end);
while (zero_pos < range_end) {
- next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
+ next_pos = (zero_pos & PAGE_MASK) + PAGE_SIZE;
if (next_pos > range_end)
next_pos = range_end;
- rc = ocfs2_write_zero_page(inode, zero_pos, next_pos);
+ rc = ocfs2_write_zero_page(inode, zero_pos, next_pos, di_bh);
if (rc < 0) {
mlog_errno(rc);
break;
@@ -982,7 +992,7 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
range_end = zero_to_size;
ret = ocfs2_zero_extend_range(inode, range_start,
- range_end);
+ range_end, di_bh);
if (ret) {
mlog_errno(ret);
break;
@@ -1004,7 +1014,7 @@ int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
* Only quota files call this without a bh, and they can't be
* refcounted.
*/
- BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+ BUG_ON(!di_bh && ocfs2_is_refcount_inode(inode));
BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));
clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
@@ -1014,8 +1024,8 @@ int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
clusters_to_add -= oi->ip_clusters;
if (clusters_to_add) {
- ret = __ocfs2_extend_allocation(inode, oi->ip_clusters,
- clusters_to_add, 0);
+ ret = ocfs2_extend_allocation(inode, oi->ip_clusters,
+ clusters_to_add, 0);
if (ret) {
mlog_errno(ret);
goto out;
@@ -1055,7 +1065,7 @@ static int ocfs2_extend_file(struct inode *inode,
/*
* The alloc sem blocks people in read/write from reading our
* allocation until we're done changing it. We depend on
- * i_mutex to block other extend/truncate calls while we're
+ * i_rwsem to block other extend/truncate calls while we're
* here. We even have to hold it for sparse files because there
* might be some tail zeroing.
*/
@@ -1101,23 +1111,30 @@ out:
return ret;
}
-int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
+int ocfs2_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr)
{
int status = 0, size_change;
- struct inode *inode = dentry->d_inode;
+ int inode_locked = 0;
+ struct inode *inode = d_inode(dentry);
struct super_block *sb = inode->i_sb;
struct ocfs2_super *osb = OCFS2_SB(sb);
struct buffer_head *bh = NULL;
handle_t *handle = NULL;
struct dquot *transfer_to[MAXQUOTAS] = { };
int qtype;
+ int had_lock;
+ struct ocfs2_lock_holder oh;
trace_ocfs2_setattr(inode, dentry,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
dentry->d_name.len, dentry->d_name.name,
- attr->ia_valid, attr->ia_mode,
- from_kuid(&init_user_ns, attr->ia_uid),
- from_kgid(&init_user_ns, attr->ia_gid));
+ attr->ia_valid,
+ attr->ia_valid & ATTR_MODE ? attr->ia_mode : 0,
+ attr->ia_valid & ATTR_UID ?
+ from_kuid(&init_user_ns, attr->ia_uid) : 0,
+ attr->ia_valid & ATTR_GID ?
+ from_kgid(&init_user_ns, attr->ia_gid) : 0);
/* ensuring we don't even attempt to truncate a symlink */
if (S_ISLNK(inode->i_mode))
@@ -1128,14 +1145,24 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
if (!(attr->ia_valid & OCFS2_VALID_ATTRS))
return 0;
- status = inode_change_ok(inode, attr);
+ status = setattr_prepare(&nop_mnt_idmap, dentry, attr);
if (status)
return status;
- if (is_quota_modification(inode, attr))
- dquot_initialize(inode);
+ if (is_quota_modification(&nop_mnt_idmap, inode, attr)) {
+ status = dquot_initialize(inode);
+ if (status)
+ return status;
+ }
size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
if (size_change) {
+ /*
+ * Here we should wait dio to finish before inode lock
+ * to avoid a deadlock between ocfs2_setattr() and
+ * ocfs2_dio_end_io_write()
+ */
+ inode_dio_wait(inode);
+
status = ocfs2_rw_lock(inode, 1);
if (status < 0) {
mlog_errno(status);
@@ -1143,21 +1170,39 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
}
}
- status = ocfs2_inode_lock(inode, &bh, 1);
- if (status < 0) {
- if (status != -ENOENT)
- mlog_errno(status);
+ had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh);
+ if (had_lock < 0) {
+ status = had_lock;
goto bail_unlock_rw;
+ } else if (had_lock) {
+ /*
+ * As far as we know, ocfs2_setattr() could only be the first
+ * VFS entry point in the call chain of recursive cluster
+ * locking issue.
+ *
+ * For instance:
+ * chmod_common()
+ * notify_change()
+ * ocfs2_setattr()
+ * posix_acl_chmod()
+ * ocfs2_iop_get_acl()
+ *
+ * But, we're not 100% sure if it's always true, because the
+ * ordering of the VFS entry points in the call chain is out
+ * of our control. So, we'd better dump the stack here to
+ * catch the other cases of recursive locking.
+ */
+ mlog(ML_ERROR, "Another case of recursive locking:\n");
+ dump_stack();
}
+ inode_locked = 1;
- if (size_change && attr->ia_size != i_size_read(inode)) {
+ if (size_change) {
status = inode_newsize_ok(inode, attr->ia_size);
if (status)
goto bail_unlock;
- inode_dio_wait(inode);
-
- if (i_size_read(inode) > attr->ia_size) {
+ if (i_size_read(inode) >= attr->ia_size) {
if (ocfs2_should_order_data(inode)) {
status = ocfs2_begin_ordered_truncate(inode,
attr->ia_size);
@@ -1186,8 +1231,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
&& OCFS2_HAS_RO_COMPAT_FEATURE(sb,
OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid));
- if (!transfer_to[USRQUOTA]) {
- status = -ESRCH;
+ if (IS_ERR(transfer_to[USRQUOTA])) {
+ status = PTR_ERR(transfer_to[USRQUOTA]);
+ transfer_to[USRQUOTA] = NULL;
goto bail_unlock;
}
}
@@ -1195,31 +1241,34 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
&& OCFS2_HAS_RO_COMPAT_FEATURE(sb,
OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid));
- if (!transfer_to[GRPQUOTA]) {
- status = -ESRCH;
+ if (IS_ERR(transfer_to[GRPQUOTA])) {
+ status = PTR_ERR(transfer_to[GRPQUOTA]);
+ transfer_to[GRPQUOTA] = NULL;
goto bail_unlock;
}
}
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS +
2 * ocfs2_quota_trans_credits(sb));
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
- goto bail_unlock;
+ goto bail_unlock_alloc;
}
status = __dquot_transfer(inode, transfer_to);
if (status < 0)
goto bail_commit;
} else {
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
- goto bail_unlock;
+ goto bail_unlock_alloc;
}
}
- setattr_copy(inode, attr);
+ setattr_copy(&nop_mnt_idmap, inode, attr);
mark_inode_dirty(inode);
status = ocfs2_mark_inode_dirty(handle, inode, bh);
@@ -1228,44 +1277,58 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
bail_commit:
ocfs2_commit_trans(osb, handle);
+bail_unlock_alloc:
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
bail_unlock:
- ocfs2_inode_unlock(inode, 1);
+ if (status && inode_locked) {
+ ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
+ inode_locked = 0;
+ }
bail_unlock_rw:
if (size_change)
ocfs2_rw_unlock(inode, 1);
bail:
- brelse(bh);
/* Release quota pointers in case we acquired them */
- for (qtype = 0; qtype < MAXQUOTAS; qtype++)
+ for (qtype = 0; qtype < OCFS2_MAXQUOTAS; qtype++)
dqput(transfer_to[qtype]);
if (!status && attr->ia_valid & ATTR_MODE) {
- status = ocfs2_acl_chmod(inode);
+ status = ocfs2_acl_chmod(inode, bh);
if (status < 0)
mlog_errno(status);
}
+ if (inode_locked)
+ ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
+ brelse(bh);
return status;
}
-int ocfs2_getattr(struct vfsmount *mnt,
- struct dentry *dentry,
- struct kstat *stat)
+int ocfs2_getattr(struct mnt_idmap *idmap, const struct path *path,
+ struct kstat *stat, u32 request_mask, unsigned int flags)
{
- struct inode *inode = dentry->d_inode;
- struct super_block *sb = dentry->d_inode->i_sb;
+ struct inode *inode = d_inode(path->dentry);
+ struct super_block *sb = path->dentry->d_sb;
struct ocfs2_super *osb = sb->s_fs_info;
int err;
- err = ocfs2_inode_revalidate(dentry);
+ err = ocfs2_inode_revalidate(path->dentry);
if (err) {
if (err != -ENOENT)
mlog_errno(err);
goto bail;
}
- generic_fillattr(inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
+ /*
+ * If there is inline data in the inode, the inode will normally not
+ * have data blocks allocated (it may have an external xattr block).
+ * Report at least one sector for such files, so tools like tar, rsync,
+ * others don't incorrectly think the file is completely sparse.
+ */
+ if (unlikely(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
+ stat->blocks += (stat->size + 511)>>9;
/* We set the blksize from the cluster size for performance */
stat->blksize = osb->s_clustersize;
@@ -1274,23 +1337,35 @@ bail:
return err;
}
-int ocfs2_permission(struct inode *inode, int mask)
+int ocfs2_permission(struct mnt_idmap *idmap, struct inode *inode,
+ int mask)
{
- int ret;
+ int ret, had_lock;
+ struct ocfs2_lock_holder oh;
if (mask & MAY_NOT_BLOCK)
return -ECHILD;
- ret = ocfs2_inode_lock(inode, NULL, 0);
- if (ret) {
- if (ret != -ENOENT)
- mlog_errno(ret);
+ had_lock = ocfs2_inode_lock_tracker(inode, NULL, 0, &oh);
+ if (had_lock < 0) {
+ ret = had_lock;
goto out;
+ } else if (had_lock) {
+ /* See comments in ocfs2_setattr() for details.
+ * The call chain of this case could be:
+ * do_sys_open()
+ * may_open()
+ * inode_permission()
+ * ocfs2_permission()
+ * ocfs2_iop_get_acl()
+ */
+ mlog(ML_ERROR, "Another case of recursive locking:\n");
+ dump_stack();
}
- ret = generic_permission(inode, mask);
+ ret = generic_permission(&nop_mnt_idmap, inode, mask);
- ocfs2_inode_unlock(inode, 0);
+ ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock);
out:
return ret;
}
@@ -1327,6 +1402,7 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
di = (struct ocfs2_dinode *) bh->b_data;
di->i_mode = cpu_to_le16(inode->i_mode);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_journal_dirty(handle, bh);
@@ -1336,44 +1412,6 @@ out:
return ret;
}
-/*
- * Will look for holes and unwritten extents in the range starting at
- * pos for count bytes (inclusive).
- */
-static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
- size_t count)
-{
- int ret = 0;
- unsigned int extent_flags;
- u32 cpos, clusters, extent_len, phys_cpos;
- struct super_block *sb = inode->i_sb;
-
- cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
- clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
-
- while (clusters) {
- ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
- &extent_flags);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
-
- if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
- ret = 1;
- break;
- }
-
- if (extent_len > clusters)
- extent_len = clusters;
-
- clusters -= extent_len;
- cpos += extent_len;
- }
-out:
- return ret;
-}
-
static int ocfs2_write_remove_suid(struct inode *inode)
{
int ret;
@@ -1455,7 +1493,7 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode,
goto next;
}
- ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
+ ret = ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
if (ret) {
if (ret != -ENOSPC)
mlog_errno(ret);
@@ -1495,14 +1533,55 @@ static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
}
}
+/*
+ * zero out partial blocks of one cluster.
+ *
+ * start: file offset where zero starts, will be made upper block aligned.
+ * len: it will be trimmed to the end of current cluster if "start + len"
+ * is bigger than it.
+ */
+static int ocfs2_zeroout_partial_cluster(struct inode *inode,
+ u64 start, u64 len)
+{
+ int ret;
+ u64 start_block, end_block, nr_blocks;
+ u64 p_block, offset;
+ u32 cluster, p_cluster, nr_clusters;
+ struct super_block *sb = inode->i_sb;
+ u64 end = ocfs2_align_bytes_to_clusters(sb, start);
+
+ if (start + len < end)
+ end = start + len;
+
+ start_block = ocfs2_blocks_for_bytes(sb, start);
+ end_block = ocfs2_blocks_for_bytes(sb, end);
+ nr_blocks = end_block - start_block;
+ if (!nr_blocks)
+ return 0;
+
+ cluster = ocfs2_bytes_to_clusters(sb, start);
+ ret = ocfs2_get_clusters(inode, cluster, &p_cluster,
+ &nr_clusters, NULL);
+ if (ret)
+ return ret;
+ if (!p_cluster)
+ return 0;
+
+ offset = start_block - ocfs2_clusters_to_blocks(sb, cluster);
+ p_block = ocfs2_clusters_to_blocks(sb, p_cluster) + offset;
+ return sb_issue_zeroout(sb, p_block, nr_blocks, GFP_NOFS);
+}
+
static int ocfs2_zero_partial_clusters(struct inode *inode,
u64 start, u64 len)
{
int ret = 0;
- u64 tmpend, end = start + len;
+ u64 tmpend = 0;
+ u64 end = start + len;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
unsigned int csize = osb->s_clustersize;
handle_t *handle;
+ loff_t isize = i_size_read(inode);
/*
* The "start" and "end" values are NOT necessarily part of
@@ -1523,6 +1602,26 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
goto out;
+ /* No page cache for EOF blocks, issue zero out to disk. */
+ if (end > isize) {
+ /*
+ * zeroout eof blocks in last cluster starting from
+ * "isize" even "start" > "isize" because it is
+ * complicated to zeroout just at "start" as "start"
+ * may be not aligned with block size, buffer write
+ * would be required to do that, but out of eof buffer
+ * write is not supported.
+ */
+ ret = ocfs2_zeroout_partial_cluster(inode, isize,
+ end - isize);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ if (start >= isize)
+ goto out;
+ end = isize;
+ }
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
@@ -1531,18 +1630,31 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
}
/*
- * We want to get the byte offset of the end of the 1st cluster.
+ * If start is on a cluster boundary and end is somewhere in another
+ * cluster, we have not COWed the cluster starting at start, unless
+ * end is also within the same cluster. So, in this case, we skip this
+ * first call to ocfs2_zero_range_for_truncate() truncate and move on
+ * to the next one.
*/
- tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1));
- if (tmpend > end)
- tmpend = end;
+ if ((start & (csize - 1)) != 0) {
+ /*
+ * We want to get the byte offset of the end of the 1st
+ * cluster.
+ */
+ tmpend = (u64)osb->s_clustersize +
+ (start & ~(osb->s_clustersize - 1));
+ if (tmpend > end)
+ tmpend = end;
- trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start,
- (unsigned long long)tmpend);
+ trace_ocfs2_zero_partial_clusters_range1(
+ (unsigned long long)start,
+ (unsigned long long)tmpend);
- ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
- if (ret)
- mlog_errno(ret);
+ ret = ocfs2_zero_range_for_truncate(inode, handle, start,
+ tmpend);
+ if (ret)
+ mlog_errno(ret);
+ }
if (tmpend < end) {
/*
@@ -1559,6 +1671,7 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
if (ret)
mlog_errno(ret);
}
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_commit_trans(osb, handle);
out:
@@ -1645,9 +1758,9 @@ static void ocfs2_calc_trunc_pos(struct inode *inode,
*done = ret;
}
-static int ocfs2_remove_inode_range(struct inode *inode,
- struct buffer_head *di_bh, u64 byte_start,
- u64 byte_len)
+int ocfs2_remove_inode_range(struct inode *inode,
+ struct buffer_head *di_bh, u64 byte_start,
+ u64 byte_len)
{
int ret = 0, flags = 0, done = 0, i;
u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
@@ -1674,6 +1787,14 @@ static int ocfs2_remove_inode_range(struct inode *inode,
return 0;
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ int id_count = ocfs2_max_inline_data_with_xattr(inode->i_sb, di);
+
+ if (byte_start > id_count || byte_start + byte_len > id_count) {
+ ret = -EINVAL;
+ mlog_errno(ret);
+ goto out;
+ }
+
ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
byte_start + byte_len, 0);
if (ret) {
@@ -1697,8 +1818,7 @@ static int ocfs2_remove_inode_range(struct inode *inode,
* within one cluster(means is not exactly aligned to clustersize).
*/
- if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {
-
+ if (ocfs2_is_refcount_inode(inode)) {
ret = ocfs2_cow_file_pos(inode, di_bh, byte_start);
if (ret) {
mlog_errno(ret);
@@ -1786,7 +1906,7 @@ static int ocfs2_remove_inode_range(struct inode *inode,
ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
phys_cpos, trunc_len, flags,
- &dealloc, refcount_loc);
+ &dealloc, refcount_loc, false);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -1800,6 +1920,7 @@ static int ocfs2_remove_inode_range(struct inode *inode,
ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
out:
+ ocfs2_free_path(path);
ocfs2_schedule_truncate_log_flush(osb, 1);
ocfs2_run_deallocs(osb, &dealloc);
@@ -1816,7 +1937,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
{
int ret;
s64 llen;
- loff_t size;
+ loff_t size, orig_isize;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct buffer_head *di_bh = NULL;
handle_t *handle;
@@ -1825,8 +1946,10 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
return -EROFS;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
+ /* Wait all existing dio workers, newcomers will block on i_rwsem */
+ inode_dio_wait(inode);
/*
* This prevents concurrent writes on other nodes
*/
@@ -1873,14 +1996,15 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
}
size = sr->l_start + sr->l_len;
- if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
+ if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64 ||
+ cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) {
if (sr->l_len <= 0) {
ret = -EINVAL;
goto out_inode_unlock;
}
}
- if (file && should_remove_suid(file->f_path.dentry)) {
+ if (file && setattr_should_drop_suidgid(&nop_mnt_idmap, file_inode(file))) {
ret = __ocfs2_write_remove_suid(inode, di_bh);
if (ret) {
mlog_errno(ret);
@@ -1907,6 +2031,15 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
default:
ret = -EINVAL;
}
+
+ orig_isize = i_size_read(inode);
+ /* zeroout eof blocks in the cluster. */
+ if (!ret && change_size && orig_isize < size) {
+ ret = ocfs2_zeroout_partial_cluster(inode, orig_isize,
+ size - orig_isize);
+ if (!ret)
+ i_size_write(inode, size);
+ }
up_write(&OCFS2_I(inode)->ip_alloc_sem);
if (ret) {
mlog_errno(ret);
@@ -1923,10 +2056,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
goto out_inode_unlock;
}
- if (change_size && i_size_read(inode) < size)
- i_size_write(inode, size);
-
- inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
if (ret < 0)
mlog_errno(ret);
@@ -1943,7 +2073,7 @@ out_rw_unlock:
ocfs2_rw_unlock(inode, 1);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
@@ -1983,14 +2113,20 @@ static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
struct ocfs2_space_resv sr;
int change_size = 1;
int cmd = OCFS2_IOC_RESVSP64;
+ int ret = 0;
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
if (!ocfs2_writes_unwritten_extents(osb))
return -EOPNOTSUPP;
- if (mode & FALLOC_FL_KEEP_SIZE)
+ if (mode & FALLOC_FL_KEEP_SIZE) {
change_size = 0;
+ } else {
+ ret = inode_newsize_ok(inode, offset + len);
+ if (ret)
+ return ret;
+ }
if (mode & FALLOC_FL_PUNCH_HOLE)
cmd = OCFS2_IOC_UNRESVSP64;
@@ -2012,7 +2148,7 @@ int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
struct super_block *sb = inode->i_sb;
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) ||
- !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) ||
+ !ocfs2_is_refcount_inode(inode) ||
OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
return 0;
@@ -2042,13 +2178,6 @@ out:
return ret;
}
-static void ocfs2_aiodio_wait(struct inode *inode)
-{
- wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
-
- wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0));
-}
-
static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
{
int blockmask = inode->i_sb->s_blocksize - 1;
@@ -2059,57 +2188,107 @@ static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
return 0;
}
-static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
- struct file *file,
- loff_t pos, size_t count,
- int *meta_level)
+static int ocfs2_inode_lock_for_extent_tree(struct inode *inode,
+ struct buffer_head **di_bh,
+ int meta_level,
+ int write_sem,
+ int wait)
{
- int ret;
- struct buffer_head *di_bh = NULL;
- u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
- u32 clusters =
- ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
+ int ret = 0;
- ret = ocfs2_inode_lock(inode, &di_bh, 1);
- if (ret) {
- mlog_errno(ret);
+ if (wait)
+ ret = ocfs2_inode_lock(inode, di_bh, meta_level);
+ else
+ ret = ocfs2_try_inode_lock(inode, di_bh, meta_level);
+ if (ret < 0)
goto out;
+
+ if (wait) {
+ if (write_sem)
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
+ else
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
+ } else {
+ if (write_sem)
+ ret = down_write_trylock(&OCFS2_I(inode)->ip_alloc_sem);
+ else
+ ret = down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem);
+
+ if (!ret) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
}
- *meta_level = 1;
+ return ret;
- ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX);
- if (ret)
- mlog_errno(ret);
+out_unlock:
+ brelse(*di_bh);
+ *di_bh = NULL;
+ ocfs2_inode_unlock(inode, meta_level);
out:
- brelse(di_bh);
return ret;
}
+static void ocfs2_inode_unlock_for_extent_tree(struct inode *inode,
+ struct buffer_head **di_bh,
+ int meta_level,
+ int write_sem)
+{
+ if (write_sem)
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
+ else
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+ brelse(*di_bh);
+ *di_bh = NULL;
+
+ if (meta_level >= 0)
+ ocfs2_inode_unlock(inode, meta_level);
+}
+
static int ocfs2_prepare_inode_for_write(struct file *file,
- loff_t *ppos,
- size_t count,
- int appending,
- int *direct_io,
- int *has_refcount)
+ loff_t pos, size_t count, int wait)
{
- int ret = 0, meta_level = 0;
+ int ret = 0, meta_level = 0, overwrite_io = 0;
+ int write_sem = 0;
struct dentry *dentry = file->f_path.dentry;
- struct inode *inode = dentry->d_inode;
- loff_t saved_pos = 0, end;
+ struct inode *inode = d_inode(dentry);
+ struct buffer_head *di_bh = NULL;
+ u32 cpos;
+ u32 clusters;
/*
* We start with a read level meta lock and only jump to an ex
* if we need to make modifications here.
*/
for(;;) {
- ret = ocfs2_inode_lock(inode, NULL, meta_level);
+ ret = ocfs2_inode_lock_for_extent_tree(inode,
+ &di_bh,
+ meta_level,
+ write_sem,
+ wait);
if (ret < 0) {
- meta_level = -1;
- mlog_errno(ret);
+ if (ret != -EAGAIN)
+ mlog_errno(ret);
goto out;
}
+ /*
+ * Check if IO will overwrite allocated blocks in case
+ * IOCB_NOWAIT flag is set.
+ */
+ if (!wait && !overwrite_io) {
+ overwrite_io = 1;
+
+ ret = ocfs2_overwrite_io(inode, di_bh, pos, count);
+ if (ret < 0) {
+ if (ret != -EAGAIN)
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+ }
+
/* Clear suid / sgid if necessary. We do this here
* instead of later in the write path because
* remove_suid() calls ->setattr without any hint that
@@ -2119,9 +2298,12 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
* inode. There's also the dinode i_size state which
* can be lost via setattr during extending writes (we
* set inode->i_size at the end of a write. */
- if (should_remove_suid(dentry)) {
+ if (setattr_should_drop_suidgid(&nop_mnt_idmap, inode)) {
if (meta_level == 0) {
- ocfs2_inode_unlock(inode, meta_level);
+ ocfs2_inode_unlock_for_extent_tree(inode,
+ &di_bh,
+ meta_level,
+ write_sem);
meta_level = 1;
continue;
}
@@ -2133,146 +2315,106 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
}
}
- /* work on a copy of ppos until we're sure that we won't have
- * to recalculate it due to relocking. */
- if (appending)
- saved_pos = i_size_read(inode);
- else
- saved_pos = *ppos;
-
- end = saved_pos + count;
-
- ret = ocfs2_check_range_for_refcount(inode, saved_pos, count);
+ ret = ocfs2_check_range_for_refcount(inode, pos, count);
if (ret == 1) {
- ocfs2_inode_unlock(inode, meta_level);
- meta_level = -1;
-
- ret = ocfs2_prepare_inode_for_refcount(inode,
- file,
- saved_pos,
- count,
- &meta_level);
- if (has_refcount)
- *has_refcount = 1;
- if (direct_io)
- *direct_io = 0;
+ ocfs2_inode_unlock_for_extent_tree(inode,
+ &di_bh,
+ meta_level,
+ write_sem);
+ meta_level = 1;
+ write_sem = 1;
+ ret = ocfs2_inode_lock_for_extent_tree(inode,
+ &di_bh,
+ meta_level,
+ write_sem,
+ wait);
+ if (ret < 0) {
+ if (ret != -EAGAIN)
+ mlog_errno(ret);
+ goto out;
+ }
+
+ cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+ clusters =
+ ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
+ ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
}
if (ret < 0) {
- mlog_errno(ret);
+ if (ret != -EAGAIN)
+ mlog_errno(ret);
goto out_unlock;
}
- /*
- * Skip the O_DIRECT checks if we don't need
- * them.
- */
- if (!direct_io || !(*direct_io))
- break;
-
- /*
- * There's no sane way to do direct writes to an inode
- * with inline data.
- */
- if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
- *direct_io = 0;
- break;
- }
-
- /*
- * Allowing concurrent direct writes means
- * i_size changes wouldn't be synchronized, so
- * one node could wind up truncating another
- * nodes writes.
- */
- if (end > i_size_read(inode)) {
- *direct_io = 0;
- break;
- }
-
- /*
- * We don't fill holes during direct io, so
- * check for them here. If any are found, the
- * caller will have to retake some cluster
- * locks and initiate the io as buffered.
- */
- ret = ocfs2_check_range_for_holes(inode, saved_pos, count);
- if (ret == 1) {
- *direct_io = 0;
- ret = 0;
- } else if (ret < 0)
- mlog_errno(ret);
break;
}
- if (appending)
- *ppos = saved_pos;
-
out_unlock:
trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
- saved_pos, appending, count,
- direct_io, has_refcount);
+ pos, count, wait);
- if (meta_level >= 0)
- ocfs2_inode_unlock(inode, meta_level);
+ ocfs2_inode_unlock_for_extent_tree(inode,
+ &di_bh,
+ meta_level,
+ write_sem);
out:
return ret;
}
-static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
- const struct iovec *iov,
- unsigned long nr_segs,
- loff_t pos)
+static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
+ struct iov_iter *from)
{
- int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
- int can_do_direct, has_refcount = 0;
+ int rw_level;
ssize_t written = 0;
- size_t ocount; /* original count */
- size_t count; /* after file limit checks */
- loff_t old_size, *ppos = &iocb->ki_pos;
- u32 old_clusters;
+ ssize_t ret;
+ size_t count = iov_iter_count(from);
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
int full_coherency = !(osb->s_mount_opt &
OCFS2_MOUNT_COHERENCY_BUFFERED);
- int unaligned_dio = 0;
+ void *saved_ki_complete = NULL;
+ int append_write = ((iocb->ki_pos + count) >=
+ i_size_read(inode) ? 1 : 0);
+ int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
+ int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0;
- trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
+ trace_ocfs2_file_write_iter(inode, file, file->f_path.dentry,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
file->f_path.dentry->d_name.len,
file->f_path.dentry->d_name.name,
- (unsigned int)nr_segs);
+ (unsigned int)from->nr_segs); /* GRRRRR */
- if (iocb->ki_left == 0)
- return 0;
-
- appending = file->f_flags & O_APPEND ? 1 : 0;
- direct_io = file->f_flags & O_DIRECT ? 1 : 0;
+ if (!direct_io && nowait)
+ return -EOPNOTSUPP;
- mutex_lock(&inode->i_mutex);
+ if (count == 0)
+ return 0;
- ocfs2_iocb_clear_sem_locked(iocb);
+ if (nowait) {
+ if (!inode_trylock(inode))
+ return -EAGAIN;
+ } else
+ inode_lock(inode);
-relock:
- /* to match setattr's i_mutex -> rw_lock ordering */
- if (direct_io) {
- have_alloc_sem = 1;
- /* communicate with ocfs2_dio_end_io */
- ocfs2_iocb_set_sem_locked(iocb);
- }
+ ocfs2_iocb_init_rw_locked(iocb);
/*
* Concurrent O_DIRECT writes are allowed with
* mount_option "coherency=buffered".
+ * For append write, we must take rw EX.
*/
- rw_level = (!direct_io || full_coherency);
+ rw_level = (!direct_io || full_coherency || append_write);
- ret = ocfs2_rw_lock(inode, rw_level);
+ if (nowait)
+ ret = ocfs2_try_rw_lock(inode, rw_level);
+ else
+ ret = ocfs2_rw_lock(inode, rw_level);
if (ret < 0) {
- mlog_errno(ret);
- goto out_sems;
+ if (ret != -EAGAIN)
+ mlog_errno(ret);
+ goto out_mutex;
}
/*
@@ -2285,112 +2427,48 @@ relock:
* other nodes to drop their caches. Buffered I/O
* already does this in write_begin().
*/
- ret = ocfs2_inode_lock(inode, NULL, 1);
+ if (nowait)
+ ret = ocfs2_try_inode_lock(inode, NULL, 1);
+ else
+ ret = ocfs2_inode_lock(inode, NULL, 1);
if (ret < 0) {
- mlog_errno(ret);
+ if (ret != -EAGAIN)
+ mlog_errno(ret);
goto out;
}
ocfs2_inode_unlock(inode, 1);
}
- can_do_direct = direct_io;
- ret = ocfs2_prepare_inode_for_write(file, ppos,
- iocb->ki_left, appending,
- &can_do_direct, &has_refcount);
- if (ret < 0) {
- mlog_errno(ret);
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0) {
+ if (ret)
+ mlog_errno(ret);
goto out;
}
+ count = ret;
- if (direct_io && !is_sync_kiocb(iocb))
- unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left,
- *ppos);
-
- /*
- * We can't complete the direct I/O as requested, fall back to
- * buffered I/O.
- */
- if (direct_io && !can_do_direct) {
- ocfs2_rw_unlock(inode, rw_level);
-
- have_alloc_sem = 0;
- rw_level = -1;
-
- direct_io = 0;
- goto relock;
+ ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, !nowait);
+ if (ret < 0) {
+ if (ret != -EAGAIN)
+ mlog_errno(ret);
+ goto out;
}
- if (unaligned_dio) {
+ if (direct_io && !is_sync_kiocb(iocb) &&
+ ocfs2_is_io_unaligned(inode, count, iocb->ki_pos)) {
/*
- * Wait on previous unaligned aio to complete before
- * proceeding.
+ * Make it a sync io if it's an unaligned aio.
*/
- ocfs2_aiodio_wait(inode);
-
- /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */
- atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio);
- ocfs2_iocb_set_unaligned_aio(iocb);
+ saved_ki_complete = xchg(&iocb->ki_complete, NULL);
}
- /*
- * To later detect whether a journal commit for sync writes is
- * necessary, we sample i_size, and cluster count here.
- */
- old_size = i_size_read(inode);
- old_clusters = OCFS2_I(inode)->ip_clusters;
-
/* communicate with ocfs2_dio_end_io */
ocfs2_iocb_set_rw_locked(iocb, rw_level);
- ret = generic_segment_checks(iov, &nr_segs, &ocount,
- VERIFY_READ);
- if (ret)
- goto out_dio;
-
- count = ocount;
- ret = generic_write_checks(file, ppos, &count,
- S_ISBLK(inode->i_mode));
- if (ret)
- goto out_dio;
-
- if (direct_io) {
- written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
- ppos, count, ocount);
- if (written < 0) {
- ret = written;
- goto out_dio;
- }
- } else {
- current->backing_dev_info = file->f_mapping->backing_dev_info;
- written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos,
- ppos, count, 0);
- current->backing_dev_info = NULL;
- }
-
-out_dio:
+ written = __generic_file_write_iter(iocb, from);
/* buffered aio wouldn't have proper lock coverage today */
- BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
-
- if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
- ((file->f_flags & O_DIRECT) && !direct_io)) {
- ret = filemap_fdatawrite_range(file->f_mapping, pos,
- pos + count - 1);
- if (ret < 0)
- written = ret;
-
- if (!ret && ((old_size != i_size_read(inode)) ||
- (old_clusters != OCFS2_I(inode)->ip_clusters) ||
- has_refcount)) {
- ret = jbd2_journal_force_commit(osb->journal->j_journal);
- if (ret < 0)
- written = ret;
- }
-
- if (!ret)
- ret = filemap_fdatawait_range(file->f_mapping, pos,
- pos + count - 1);
- }
+ BUG_ON(written == -EIOCBQUEUED && !direct_io);
/*
* deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
@@ -2401,153 +2479,62 @@ out_dio:
* async dio is going to do it in the future or an end_io after an
* error has already done it.
*/
- if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
+ if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
rw_level = -1;
- have_alloc_sem = 0;
- unaligned_dio = 0;
- }
-
- if (unaligned_dio) {
- ocfs2_iocb_clear_unaligned_aio(iocb);
- atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio);
- }
-
-out:
- if (rw_level != -1)
- ocfs2_rw_unlock(inode, rw_level);
-
-out_sems:
- if (have_alloc_sem)
- ocfs2_iocb_clear_sem_locked(iocb);
-
- mutex_unlock(&inode->i_mutex);
-
- if (written)
- ret = written;
- return ret;
-}
-
-static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
- struct file *out,
- struct splice_desc *sd)
-{
- int ret;
-
- ret = ocfs2_prepare_inode_for_write(out, &sd->pos,
- sd->total_len, 0, NULL, NULL);
- if (ret < 0) {
- mlog_errno(ret);
- return ret;
}
- return splice_from_pipe_feed(pipe, sd, pipe_to_file);
-}
-
-static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
- struct file *out,
- loff_t *ppos,
- size_t len,
- unsigned int flags)
-{
- int ret;
- struct address_space *mapping = out->f_mapping;
- struct inode *inode = mapping->host;
- struct splice_desc sd = {
- .total_len = len,
- .flags = flags,
- .pos = *ppos,
- .u.file = out,
- };
-
-
- trace_ocfs2_file_splice_write(inode, out, out->f_path.dentry,
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- out->f_path.dentry->d_name.len,
- out->f_path.dentry->d_name.name, len);
-
- pipe_lock(pipe);
-
- splice_from_pipe_begin(&sd);
- do {
- ret = splice_from_pipe_next(pipe, &sd);
- if (ret <= 0)
- break;
+ if (unlikely(written <= 0))
+ goto out;
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
- ret = ocfs2_rw_lock(inode, 1);
+ if (((file->f_flags & O_DSYNC) && !direct_io) ||
+ IS_SYNC(inode)) {
+ ret = filemap_fdatawrite_range(file->f_mapping,
+ iocb->ki_pos - written,
+ iocb->ki_pos - 1);
if (ret < 0)
- mlog_errno(ret);
- else {
- ret = ocfs2_splice_to_file(pipe, out, &sd);
- ocfs2_rw_unlock(inode, 1);
- }
- mutex_unlock(&inode->i_mutex);
- } while (ret > 0);
- splice_from_pipe_end(pipe, &sd);
-
- pipe_unlock(pipe);
-
- if (sd.num_spliced)
- ret = sd.num_spliced;
-
- if (ret > 0) {
- int err;
+ written = ret;
- err = generic_write_sync(out, *ppos, ret);
- if (err)
- ret = err;
- else
- *ppos += ret;
+ if (!ret) {
+ ret = jbd2_journal_force_commit(osb->journal->j_journal);
+ if (ret < 0)
+ written = ret;
+ }
- balance_dirty_pages_ratelimited(mapping);
+ if (!ret)
+ ret = filemap_fdatawait_range(file->f_mapping,
+ iocb->ki_pos - written,
+ iocb->ki_pos - 1);
}
- return ret;
-}
-
-static ssize_t ocfs2_file_splice_read(struct file *in,
- loff_t *ppos,
- struct pipe_inode_info *pipe,
- size_t len,
- unsigned int flags)
-{
- int ret = 0, lock_level = 0;
- struct inode *inode = file_inode(in);
-
- trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry,
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- in->f_path.dentry->d_name.len,
- in->f_path.dentry->d_name.name, len);
+out:
+ if (saved_ki_complete)
+ xchg(&iocb->ki_complete, saved_ki_complete);
- /*
- * See the comment in ocfs2_file_aio_read()
- */
- ret = ocfs2_inode_lock_atime(inode, in->f_path.mnt, &lock_level);
- if (ret < 0) {
- mlog_errno(ret);
- goto bail;
- }
- ocfs2_inode_unlock(inode, lock_level);
+ if (rw_level != -1)
+ ocfs2_rw_unlock(inode, rw_level);
- ret = generic_file_splice_read(in, ppos, pipe, len, flags);
+out_mutex:
+ inode_unlock(inode);
-bail:
+ if (written)
+ ret = written;
return ret;
}
-static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
- const struct iovec *iov,
- unsigned long nr_segs,
- loff_t pos)
+static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
+ struct iov_iter *to)
{
- int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
+ int ret = 0, rw_level = -1, lock_level = 0;
struct file *filp = iocb->ki_filp;
struct inode *inode = file_inode(filp);
+ int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
+ int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0;
- trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry,
+ trace_ocfs2_file_read_iter(inode, filp, filp->f_path.dentry,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
filp->f_path.dentry->d_name.len,
- filp->f_path.dentry->d_name.name, nr_segs);
+ filp->f_path.dentry->d_name.name,
+ to->nr_segs); /* GRRRRR */
if (!inode) {
@@ -2556,19 +2543,24 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
goto bail;
}
- ocfs2_iocb_clear_sem_locked(iocb);
+ if (!direct_io && nowait)
+ return -EOPNOTSUPP;
+
+ ocfs2_iocb_init_rw_locked(iocb);
/*
- * buffered reads protect themselves in ->readpage(). O_DIRECT reads
+ * buffered reads protect themselves in ->read_folio(). O_DIRECT reads
* need locks to protect pending reads from racing with truncate.
*/
- if (filp->f_flags & O_DIRECT) {
- have_alloc_sem = 1;
- ocfs2_iocb_set_sem_locked(iocb);
+ if (direct_io) {
+ if (nowait)
+ ret = ocfs2_try_rw_lock(inode, 0);
+ else
+ ret = ocfs2_rw_lock(inode, 0);
- ret = ocfs2_rw_lock(inode, 0);
if (ret < 0) {
- mlog_errno(ret);
+ if (ret != -EAGAIN)
+ mlog_errno(ret);
goto bail;
}
rw_level = 0;
@@ -2583,50 +2575,94 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
*
* Take and drop the meta data lock to update inode fields
* like i_size. This allows the checks down below
- * generic_file_aio_read() a chance of actually working.
+ * copy_splice_read() a chance of actually working.
*/
- ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level);
+ ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level,
+ !nowait);
if (ret < 0) {
- mlog_errno(ret);
+ if (ret != -EAGAIN)
+ mlog_errno(ret);
goto bail;
}
ocfs2_inode_unlock(inode, lock_level);
- ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
- trace_generic_file_aio_read_ret(ret);
+ ret = generic_file_read_iter(iocb, to);
+ trace_generic_file_read_iter_ret(ret);
/* buffered aio wouldn't have proper lock coverage today */
- BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
+ BUG_ON(ret == -EIOCBQUEUED && !direct_io);
- /* see ocfs2_file_aio_write */
+ /* see ocfs2_file_write_iter */
if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
rw_level = -1;
- have_alloc_sem = 0;
}
bail:
- if (have_alloc_sem)
- ocfs2_iocb_clear_sem_locked(iocb);
-
if (rw_level != -1)
ocfs2_rw_unlock(inode, rw_level);
return ret;
}
+static ssize_t ocfs2_file_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len, unsigned int flags)
+{
+ struct inode *inode = file_inode(in);
+ ssize_t ret = 0;
+ int lock_level = 0;
+
+ trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ in->f_path.dentry->d_name.len,
+ in->f_path.dentry->d_name.name,
+ flags);
+
+ /*
+ * We're fine letting folks race truncates and extending writes with
+ * read across the cluster, just like they can locally. Hence no
+ * rw_lock during read.
+ *
+ * Take and drop the meta data lock to update inode fields like i_size.
+ * This allows the checks down below filemap_splice_read() a chance of
+ * actually working.
+ */
+ ret = ocfs2_inode_lock_atime(inode, in->f_path.mnt, &lock_level, 1);
+ if (ret < 0) {
+ if (ret != -EAGAIN)
+ mlog_errno(ret);
+ goto bail;
+ }
+ ocfs2_inode_unlock(inode, lock_level);
+
+ ret = filemap_splice_read(in, ppos, pipe, len, flags);
+ trace_filemap_splice_read_ret(ret);
+bail:
+ return ret;
+}
+
/* Refer generic_file_llseek_unlocked() */
static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
int ret = 0;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
switch (whence) {
case SEEK_SET:
break;
case SEEK_END:
- offset += inode->i_size;
+ /* SEEK_END requires the OCFS2 inode lock for the file
+ * because it references the file's size.
+ */
+ ret = ocfs2_inode_lock(inode, NULL, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ offset += i_size_read(inode);
+ ocfs2_inode_unlock(inode, 0);
break;
case SEEK_CUR:
if (offset == 0) {
@@ -2649,29 +2685,113 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (ret)
return ret;
return offset;
}
+static loff_t ocfs2_remap_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t len, unsigned int remap_flags)
+{
+ struct inode *inode_in = file_inode(file_in);
+ struct inode *inode_out = file_inode(file_out);
+ struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
+ struct buffer_head *in_bh = NULL, *out_bh = NULL;
+ bool same_inode = (inode_in == inode_out);
+ loff_t remapped = 0;
+ ssize_t ret;
+
+ if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
+ return -EINVAL;
+ if (!ocfs2_refcount_tree(osb))
+ return -EOPNOTSUPP;
+ if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+ return -EROFS;
+
+ /* Lock both files against IO */
+ ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh);
+ if (ret)
+ return ret;
+
+ /* Check file eligibility and prepare for block sharing. */
+ ret = -EINVAL;
+ if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) ||
+ (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
+ goto out_unlock;
+
+ ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
+ &len, remap_flags);
+ if (ret < 0 || len == 0)
+ goto out_unlock;
+
+ /* Lock out changes to the allocation maps and remap. */
+ down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
+ if (!same_inode)
+ down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
+ SINGLE_DEPTH_NESTING);
+
+ /* Zap any page cache for the destination file's range. */
+ truncate_inode_pages_range(&inode_out->i_data,
+ round_down(pos_out, PAGE_SIZE),
+ round_up(pos_out + len, PAGE_SIZE) - 1);
+
+ remapped = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in,
+ inode_out, out_bh, pos_out, len);
+ up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
+ if (!same_inode)
+ up_write(&OCFS2_I(inode_out)->ip_alloc_sem);
+ if (remapped < 0) {
+ ret = remapped;
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ /*
+ * Empty the extent map so that we may get the right extent
+ * record from the disk.
+ */
+ ocfs2_extent_map_trunc(inode_in, 0);
+ ocfs2_extent_map_trunc(inode_out, 0);
+
+ ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+out_unlock:
+ ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
+ return remapped > 0 ? remapped : ret;
+}
+
+static loff_t ocfs2_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct ocfs2_file_private *fp = file->private_data;
+
+ return generic_llseek_cookie(file, offset, whence, &fp->cookie);
+}
+
const struct inode_operations ocfs2_file_iops = {
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
.permission = ocfs2_permission,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = ocfs2_listxattr,
- .removexattr = generic_removexattr,
.fiemap = ocfs2_fiemap,
- .get_acl = ocfs2_iop_get_acl,
+ .get_inode_acl = ocfs2_iop_get_acl,
+ .set_acl = ocfs2_iop_set_acl,
+ .fileattr_get = ocfs2_fileattr_get,
+ .fileattr_set = ocfs2_fileattr_set,
};
const struct inode_operations ocfs2_special_file_iops = {
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
+ .listxattr = ocfs2_listxattr,
.permission = ocfs2_permission,
- .get_acl = ocfs2_iop_get_acl,
+ .get_inode_acl = ocfs2_iop_get_acl,
+ .set_acl = ocfs2_iop_set_acl,
};
/*
@@ -2680,14 +2800,12 @@ const struct inode_operations ocfs2_special_file_iops = {
*/
const struct file_operations ocfs2_fops = {
.llseek = ocfs2_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .mmap = ocfs2_mmap,
+ .mmap_prepare = ocfs2_mmap_prepare,
.fsync = ocfs2_sync_file,
.release = ocfs2_file_release,
.open = ocfs2_file_open,
- .aio_read = ocfs2_file_aio_read,
- .aio_write = ocfs2_file_aio_write,
+ .read_iter = ocfs2_file_read_iter,
+ .write_iter = ocfs2_file_write_iter,
.unlocked_ioctl = ocfs2_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ocfs2_compat_ioctl,
@@ -2695,14 +2813,17 @@ const struct file_operations ocfs2_fops = {
.lock = ocfs2_lock,
.flock = ocfs2_flock,
.splice_read = ocfs2_file_splice_read,
- .splice_write = ocfs2_file_splice_write,
+ .splice_write = iter_file_splice_write,
.fallocate = ocfs2_fallocate,
+ .remap_file_range = ocfs2_remap_file_range,
+ .fop_flags = FOP_ASYNC_LOCK,
};
+WRAP_DIR_ITER(ocfs2_readdir) // FIXME!
const struct file_operations ocfs2_dops = {
- .llseek = generic_file_llseek,
+ .llseek = ocfs2_dir_llseek,
.read = generic_read_dir,
- .iterate = ocfs2_readdir,
+ .iterate_shared = shared_ocfs2_readdir,
.fsync = ocfs2_sync_file,
.release = ocfs2_dir_release,
.open = ocfs2_dir_open,
@@ -2712,6 +2833,7 @@ const struct file_operations ocfs2_dops = {
#endif
.lock = ocfs2_lock,
.flock = ocfs2_flock,
+ .fop_flags = FOP_ASYNC_LOCK,
};
/*
@@ -2728,28 +2850,27 @@ const struct file_operations ocfs2_dops = {
*/
const struct file_operations ocfs2_fops_no_plocks = {
.llseek = ocfs2_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .mmap = ocfs2_mmap,
+ .mmap_prepare = ocfs2_mmap_prepare,
.fsync = ocfs2_sync_file,
.release = ocfs2_file_release,
.open = ocfs2_file_open,
- .aio_read = ocfs2_file_aio_read,
- .aio_write = ocfs2_file_aio_write,
+ .read_iter = ocfs2_file_read_iter,
+ .write_iter = ocfs2_file_write_iter,
.unlocked_ioctl = ocfs2_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ocfs2_compat_ioctl,
#endif
.flock = ocfs2_flock,
- .splice_read = ocfs2_file_splice_read,
- .splice_write = ocfs2_file_splice_write,
+ .splice_read = filemap_splice_read,
+ .splice_write = iter_file_splice_write,
.fallocate = ocfs2_fallocate,
+ .remap_file_range = ocfs2_remap_file_range,
};
const struct file_operations ocfs2_dops_no_plocks = {
- .llseek = generic_file_llseek,
+ .llseek = ocfs2_dir_llseek,
.read = generic_read_dir,
- .iterate = ocfs2_readdir,
+ .iterate_shared = shared_ocfs2_readdir,
.fsync = ocfs2_sync_file,
.release = ocfs2_dir_release,
.open = ocfs2_dir_open,
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 97bf761c9e7c..41e65e45a9f3 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -1,26 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* file.h
*
* Function prototypes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_FILE_H
@@ -36,6 +20,7 @@ struct ocfs2_alloc_context;
enum ocfs2_alloc_restarted;
struct ocfs2_file_private {
+ u64 cookie;
struct file *fp_file;
struct mutex fp_mutex;
struct ocfs2_lock_res fp_flock;
@@ -51,17 +36,27 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_alloc_context *meta_ac,
enum ocfs2_alloc_restarted *reason_ret);
+int ocfs2_set_inode_size(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *fe_bh,
+ u64 new_i_size);
int ocfs2_simple_size_update(struct inode *inode,
struct buffer_head *di_bh,
u64 new_i_size);
+int ocfs2_truncate_file(struct inode *inode,
+ struct buffer_head *di_bh,
+ u64 new_i_size);
int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
u64 new_i_size, u64 zero_to);
int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
loff_t zero_to);
-int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
-int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
- struct kstat *stat);
-int ocfs2_permission(struct inode *inode, int mask);
+int ocfs2_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr);
+int ocfs2_getattr(struct mnt_idmap *idmap, const struct path *path,
+ struct kstat *stat, u32 request_mask, unsigned int flags);
+int ocfs2_permission(struct mnt_idmap *idmap,
+ struct inode *inode,
+ int mask);
int ocfs2_should_update_atime(struct inode *inode,
struct vfsmount *vfsmnt);
@@ -73,4 +68,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
size_t count);
+int ocfs2_remove_inode_range(struct inode *inode,
+ struct buffer_head *di_bh, u64 byte_start,
+ u64 byte_len);
#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c
new file mode 100644
index 000000000000..3ad7baf67658
--- /dev/null
+++ b/fs/ocfs2/filecheck.c
@@ -0,0 +1,509 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * filecheck.c
+ *
+ * Code which implements online file check.
+ *
+ * Copyright (C) 2016 SuSE. All rights reserved.
+ */
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/kmod.h>
+#include <linux/fs.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/sysctl.h>
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "ocfs2_fs.h"
+#include "stackglue.h"
+#include "inode.h"
+
+#include "filecheck.h"
+
+
+/* File check error strings,
+ * must correspond with error number in header file.
+ */
+static const char * const ocfs2_filecheck_errs[] = {
+ "SUCCESS",
+ "FAILED",
+ "INPROGRESS",
+ "READONLY",
+ "INJBD",
+ "INVALIDINO",
+ "BLOCKECC",
+ "BLOCKNO",
+ "VALIDFLAG",
+ "GENERATION",
+ "UNSUPPORTED"
+};
+
+struct ocfs2_filecheck_entry {
+ struct list_head fe_list;
+ unsigned long fe_ino;
+ unsigned int fe_type;
+ unsigned int fe_done:1;
+ unsigned int fe_status:31;
+};
+
+struct ocfs2_filecheck_args {
+ unsigned int fa_type;
+ union {
+ unsigned long fa_ino;
+ unsigned int fa_len;
+ };
+};
+
+static const char *
+ocfs2_filecheck_error(int errno)
+{
+ if (!errno)
+ return ocfs2_filecheck_errs[errno];
+
+ BUG_ON(errno < OCFS2_FILECHECK_ERR_START ||
+ errno > OCFS2_FILECHECK_ERR_END);
+ return ocfs2_filecheck_errs[errno - OCFS2_FILECHECK_ERR_START + 1];
+}
+
+static ssize_t ocfs2_filecheck_attr_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf);
+static ssize_t ocfs2_filecheck_attr_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count);
+static struct kobj_attribute ocfs2_filecheck_attr_chk =
+ __ATTR(check, S_IRUSR | S_IWUSR,
+ ocfs2_filecheck_attr_show,
+ ocfs2_filecheck_attr_store);
+static struct kobj_attribute ocfs2_filecheck_attr_fix =
+ __ATTR(fix, S_IRUSR | S_IWUSR,
+ ocfs2_filecheck_attr_show,
+ ocfs2_filecheck_attr_store);
+static struct kobj_attribute ocfs2_filecheck_attr_set =
+ __ATTR(set, S_IRUSR | S_IWUSR,
+ ocfs2_filecheck_attr_show,
+ ocfs2_filecheck_attr_store);
+static struct attribute *ocfs2_filecheck_attrs[] = {
+ &ocfs2_filecheck_attr_chk.attr,
+ &ocfs2_filecheck_attr_fix.attr,
+ &ocfs2_filecheck_attr_set.attr,
+ NULL
+};
+ATTRIBUTE_GROUPS(ocfs2_filecheck);
+
+static void ocfs2_filecheck_release(struct kobject *kobj)
+{
+ struct ocfs2_filecheck_sysfs_entry *entry = container_of(kobj,
+ struct ocfs2_filecheck_sysfs_entry, fs_kobj);
+
+ complete(&entry->fs_kobj_unregister);
+}
+
+static ssize_t
+ocfs2_filecheck_show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+ ssize_t ret = -EIO;
+ struct kobj_attribute *kattr = container_of(attr,
+ struct kobj_attribute, attr);
+
+ kobject_get(kobj);
+ if (kattr->show)
+ ret = kattr->show(kobj, kattr, buf);
+ kobject_put(kobj);
+ return ret;
+}
+
+static ssize_t
+ocfs2_filecheck_store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t count)
+{
+ ssize_t ret = -EIO;
+ struct kobj_attribute *kattr = container_of(attr,
+ struct kobj_attribute, attr);
+
+ kobject_get(kobj);
+ if (kattr->store)
+ ret = kattr->store(kobj, kattr, buf, count);
+ kobject_put(kobj);
+ return ret;
+}
+
+static const struct sysfs_ops ocfs2_filecheck_ops = {
+ .show = ocfs2_filecheck_show,
+ .store = ocfs2_filecheck_store,
+};
+
+static struct kobj_type ocfs2_ktype_filecheck = {
+ .default_groups = ocfs2_filecheck_groups,
+ .sysfs_ops = &ocfs2_filecheck_ops,
+ .release = ocfs2_filecheck_release,
+};
+
+static void
+ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry)
+{
+ struct ocfs2_filecheck_entry *p;
+
+ spin_lock(&entry->fs_fcheck->fc_lock);
+ while (!list_empty(&entry->fs_fcheck->fc_head)) {
+ p = list_first_entry(&entry->fs_fcheck->fc_head,
+ struct ocfs2_filecheck_entry, fe_list);
+ list_del(&p->fe_list);
+ BUG_ON(!p->fe_done); /* To free a undone file check entry */
+ kfree(p);
+ }
+ spin_unlock(&entry->fs_fcheck->fc_lock);
+
+ kfree(entry->fs_fcheck);
+ entry->fs_fcheck = NULL;
+}
+
+int ocfs2_filecheck_create_sysfs(struct ocfs2_super *osb)
+{
+ int ret;
+ struct ocfs2_filecheck *fcheck;
+ struct ocfs2_filecheck_sysfs_entry *entry = &osb->osb_fc_ent;
+
+ fcheck = kmalloc(sizeof(struct ocfs2_filecheck), GFP_NOFS);
+ if (!fcheck)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&fcheck->fc_head);
+ spin_lock_init(&fcheck->fc_lock);
+ fcheck->fc_max = OCFS2_FILECHECK_MINSIZE;
+ fcheck->fc_size = 0;
+ fcheck->fc_done = 0;
+
+ entry->fs_kobj.kset = osb->osb_dev_kset;
+ init_completion(&entry->fs_kobj_unregister);
+ ret = kobject_init_and_add(&entry->fs_kobj, &ocfs2_ktype_filecheck,
+ NULL, "filecheck");
+ if (ret) {
+ kobject_put(&entry->fs_kobj);
+ kfree(fcheck);
+ return ret;
+ }
+
+ entry->fs_fcheck = fcheck;
+ return 0;
+}
+
+void ocfs2_filecheck_remove_sysfs(struct ocfs2_super *osb)
+{
+ if (!osb->osb_fc_ent.fs_fcheck)
+ return;
+
+ kobject_del(&osb->osb_fc_ent.fs_kobj);
+ kobject_put(&osb->osb_fc_ent.fs_kobj);
+ wait_for_completion(&osb->osb_fc_ent.fs_kobj_unregister);
+ ocfs2_filecheck_sysfs_free(&osb->osb_fc_ent);
+}
+
+static int
+ocfs2_filecheck_erase_entries(struct ocfs2_filecheck_sysfs_entry *ent,
+ unsigned int count);
+static int
+ocfs2_filecheck_adjust_max(struct ocfs2_filecheck_sysfs_entry *ent,
+ unsigned int len)
+{
+ int ret;
+
+ if ((len < OCFS2_FILECHECK_MINSIZE) || (len > OCFS2_FILECHECK_MAXSIZE))
+ return -EINVAL;
+
+ spin_lock(&ent->fs_fcheck->fc_lock);
+ if (len < (ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done)) {
+ mlog(ML_NOTICE,
+ "Cannot set online file check maximum entry number "
+ "to %u due to too many pending entries(%u)\n",
+ len, ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done);
+ ret = -EBUSY;
+ } else {
+ if (len < ent->fs_fcheck->fc_size)
+ BUG_ON(!ocfs2_filecheck_erase_entries(ent,
+ ent->fs_fcheck->fc_size - len));
+
+ ent->fs_fcheck->fc_max = len;
+ ret = 0;
+ }
+ spin_unlock(&ent->fs_fcheck->fc_lock);
+
+ return ret;
+}
+
+#define OCFS2_FILECHECK_ARGS_LEN 24
+static int
+ocfs2_filecheck_args_get_long(const char *buf, size_t count,
+ unsigned long *val)
+{
+ char buffer[OCFS2_FILECHECK_ARGS_LEN];
+
+ memcpy(buffer, buf, count);
+ buffer[count] = '\0';
+
+ if (kstrtoul(buffer, 0, val))
+ return 1;
+
+ return 0;
+}
+
+static int
+ocfs2_filecheck_type_parse(const char *name, unsigned int *type)
+{
+ if (!strncmp(name, "fix", 4))
+ *type = OCFS2_FILECHECK_TYPE_FIX;
+ else if (!strncmp(name, "check", 6))
+ *type = OCFS2_FILECHECK_TYPE_CHK;
+ else if (!strncmp(name, "set", 4))
+ *type = OCFS2_FILECHECK_TYPE_SET;
+ else
+ return 1;
+
+ return 0;
+}
+
+static int
+ocfs2_filecheck_args_parse(const char *name, const char *buf, size_t count,
+ struct ocfs2_filecheck_args *args)
+{
+ unsigned long val = 0;
+ unsigned int type;
+
+ /* too short/long args length */
+ if ((count < 1) || (count >= OCFS2_FILECHECK_ARGS_LEN))
+ return 1;
+
+ if (ocfs2_filecheck_type_parse(name, &type))
+ return 1;
+ if (ocfs2_filecheck_args_get_long(buf, count, &val))
+ return 1;
+
+ if (val <= 0)
+ return 1;
+
+ args->fa_type = type;
+ if (type == OCFS2_FILECHECK_TYPE_SET)
+ args->fa_len = (unsigned int)val;
+ else
+ args->fa_ino = val;
+
+ return 0;
+}
+
+static ssize_t ocfs2_filecheck_attr_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+
+ ssize_t ret = 0, total = 0, remain = PAGE_SIZE;
+ unsigned int type;
+ struct ocfs2_filecheck_entry *p;
+ struct ocfs2_filecheck_sysfs_entry *ent = container_of(kobj,
+ struct ocfs2_filecheck_sysfs_entry, fs_kobj);
+
+ if (ocfs2_filecheck_type_parse(attr->attr.name, &type))
+ return -EINVAL;
+
+ if (type == OCFS2_FILECHECK_TYPE_SET) {
+ spin_lock(&ent->fs_fcheck->fc_lock);
+ total = snprintf(buf, remain, "%u\n", ent->fs_fcheck->fc_max);
+ spin_unlock(&ent->fs_fcheck->fc_lock);
+ goto exit;
+ }
+
+ ret = snprintf(buf, remain, "INO\t\tDONE\tERROR\n");
+ total += ret;
+ remain -= ret;
+ spin_lock(&ent->fs_fcheck->fc_lock);
+ list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) {
+ if (p->fe_type != type)
+ continue;
+
+ ret = snprintf(buf + total, remain, "%lu\t\t%u\t%s\n",
+ p->fe_ino, p->fe_done,
+ ocfs2_filecheck_error(p->fe_status));
+ if (ret >= remain) {
+ /* snprintf() didn't fit */
+ total = -E2BIG;
+ break;
+ }
+ total += ret;
+ remain -= ret;
+ }
+ spin_unlock(&ent->fs_fcheck->fc_lock);
+
+exit:
+ return total;
+}
+
+static inline int
+ocfs2_filecheck_is_dup_entry(struct ocfs2_filecheck_sysfs_entry *ent,
+ unsigned long ino)
+{
+ struct ocfs2_filecheck_entry *p;
+
+ list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) {
+ if (!p->fe_done) {
+ if (p->fe_ino == ino)
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static inline int
+ocfs2_filecheck_erase_entry(struct ocfs2_filecheck_sysfs_entry *ent)
+{
+ struct ocfs2_filecheck_entry *p;
+
+ list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) {
+ if (p->fe_done) {
+ list_del(&p->fe_list);
+ kfree(p);
+ ent->fs_fcheck->fc_size--;
+ ent->fs_fcheck->fc_done--;
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static int
+ocfs2_filecheck_erase_entries(struct ocfs2_filecheck_sysfs_entry *ent,
+ unsigned int count)
+{
+ unsigned int i = 0;
+ unsigned int ret = 0;
+
+ while (i++ < count) {
+ if (ocfs2_filecheck_erase_entry(ent))
+ ret++;
+ else
+ break;
+ }
+
+ return (ret == count ? 1 : 0);
+}
+
+static void
+ocfs2_filecheck_done_entry(struct ocfs2_filecheck_sysfs_entry *ent,
+ struct ocfs2_filecheck_entry *entry)
+{
+ spin_lock(&ent->fs_fcheck->fc_lock);
+ entry->fe_done = 1;
+ ent->fs_fcheck->fc_done++;
+ spin_unlock(&ent->fs_fcheck->fc_lock);
+}
+
+static unsigned int
+ocfs2_filecheck_handle(struct ocfs2_super *osb,
+ unsigned long ino, unsigned int flags)
+{
+ unsigned int ret = OCFS2_FILECHECK_ERR_SUCCESS;
+ struct inode *inode = NULL;
+ int rc;
+
+ inode = ocfs2_iget(osb, ino, flags, 0);
+ if (IS_ERR(inode)) {
+ rc = (int)(-(long)inode);
+ if (rc >= OCFS2_FILECHECK_ERR_START &&
+ rc < OCFS2_FILECHECK_ERR_END)
+ ret = rc;
+ else
+ ret = OCFS2_FILECHECK_ERR_FAILED;
+ } else
+ iput(inode);
+
+ return ret;
+}
+
+static void
+ocfs2_filecheck_handle_entry(struct ocfs2_filecheck_sysfs_entry *ent,
+ struct ocfs2_filecheck_entry *entry)
+{
+ struct ocfs2_super *osb = container_of(ent, struct ocfs2_super,
+ osb_fc_ent);
+
+ if (entry->fe_type == OCFS2_FILECHECK_TYPE_CHK)
+ entry->fe_status = ocfs2_filecheck_handle(osb,
+ entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_CHK);
+ else if (entry->fe_type == OCFS2_FILECHECK_TYPE_FIX)
+ entry->fe_status = ocfs2_filecheck_handle(osb,
+ entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_FIX);
+ else
+ entry->fe_status = OCFS2_FILECHECK_ERR_UNSUPPORTED;
+
+ ocfs2_filecheck_done_entry(ent, entry);
+}
+
+static ssize_t ocfs2_filecheck_attr_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ ssize_t ret = 0;
+ struct ocfs2_filecheck_args args;
+ struct ocfs2_filecheck_entry *entry;
+ struct ocfs2_filecheck_sysfs_entry *ent = container_of(kobj,
+ struct ocfs2_filecheck_sysfs_entry, fs_kobj);
+
+ if (count == 0)
+ return count;
+
+ if (ocfs2_filecheck_args_parse(attr->attr.name, buf, count, &args))
+ return -EINVAL;
+
+ if (args.fa_type == OCFS2_FILECHECK_TYPE_SET) {
+ ret = ocfs2_filecheck_adjust_max(ent, args.fa_len);
+ goto exit;
+ }
+
+ entry = kmalloc(sizeof(struct ocfs2_filecheck_entry), GFP_NOFS);
+ if (!entry) {
+ ret = -ENOMEM;
+ goto exit;
+ }
+
+ spin_lock(&ent->fs_fcheck->fc_lock);
+ if (ocfs2_filecheck_is_dup_entry(ent, args.fa_ino)) {
+ ret = -EEXIST;
+ kfree(entry);
+ } else if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) &&
+ (ent->fs_fcheck->fc_done == 0)) {
+ mlog(ML_NOTICE,
+ "Cannot do more file check "
+ "since file check queue(%u) is full now\n",
+ ent->fs_fcheck->fc_max);
+ ret = -EAGAIN;
+ kfree(entry);
+ } else {
+ if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) &&
+ (ent->fs_fcheck->fc_done > 0)) {
+ /* Delete the oldest entry which was done,
+ * make sure the entry size in list does
+ * not exceed maximum value
+ */
+ BUG_ON(!ocfs2_filecheck_erase_entry(ent));
+ }
+
+ entry->fe_ino = args.fa_ino;
+ entry->fe_type = args.fa_type;
+ entry->fe_done = 0;
+ entry->fe_status = OCFS2_FILECHECK_ERR_INPROGRESS;
+ list_add_tail(&entry->fe_list, &ent->fs_fcheck->fc_head);
+ ent->fs_fcheck->fc_size++;
+ }
+ spin_unlock(&ent->fs_fcheck->fc_lock);
+
+ if (!ret)
+ ocfs2_filecheck_handle_entry(ent, entry);
+
+exit:
+ return ret ?: count;
+}
diff --git a/fs/ocfs2/filecheck.h b/fs/ocfs2/filecheck.h
new file mode 100644
index 000000000000..d3bcb8bcfeb0
--- /dev/null
+++ b/fs/ocfs2/filecheck.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * filecheck.h
+ *
+ * Online file check.
+ *
+ * Copyright (C) 2016 SuSE. All rights reserved.
+ */
+
+
+#ifndef FILECHECK_H
+#define FILECHECK_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+
+
+/* File check errno */
+enum {
+ OCFS2_FILECHECK_ERR_SUCCESS = 0, /* Success */
+ OCFS2_FILECHECK_ERR_FAILED = 1000, /* Other failure */
+ OCFS2_FILECHECK_ERR_INPROGRESS, /* In progress */
+ OCFS2_FILECHECK_ERR_READONLY, /* Read only */
+ OCFS2_FILECHECK_ERR_INJBD, /* Buffer in jbd */
+ OCFS2_FILECHECK_ERR_INVALIDINO, /* Invalid ino */
+ OCFS2_FILECHECK_ERR_BLOCKECC, /* Block ecc */
+ OCFS2_FILECHECK_ERR_BLOCKNO, /* Block number */
+ OCFS2_FILECHECK_ERR_VALIDFLAG, /* Inode valid flag */
+ OCFS2_FILECHECK_ERR_GENERATION, /* Inode generation */
+ OCFS2_FILECHECK_ERR_UNSUPPORTED /* Unsupported */
+};
+
+#define OCFS2_FILECHECK_ERR_START OCFS2_FILECHECK_ERR_FAILED
+#define OCFS2_FILECHECK_ERR_END OCFS2_FILECHECK_ERR_UNSUPPORTED
+
+struct ocfs2_filecheck {
+ struct list_head fc_head; /* File check entry list head */
+ spinlock_t fc_lock;
+ unsigned int fc_max; /* Maximum number of entry in list */
+ unsigned int fc_size; /* Current entry count in list */
+ unsigned int fc_done; /* Finished entry count in list */
+};
+
+#define OCFS2_FILECHECK_MAXSIZE 100
+#define OCFS2_FILECHECK_MINSIZE 10
+
+/* File check operation type */
+enum {
+ OCFS2_FILECHECK_TYPE_CHK = 0, /* Check a file(inode) */
+ OCFS2_FILECHECK_TYPE_FIX, /* Fix a file(inode) */
+ OCFS2_FILECHECK_TYPE_SET = 100 /* Set entry list maximum size */
+};
+
+struct ocfs2_filecheck_sysfs_entry { /* sysfs entry per partition */
+ struct kobject fs_kobj;
+ struct completion fs_kobj_unregister;
+ struct ocfs2_filecheck *fs_fcheck;
+};
+
+
+int ocfs2_filecheck_create_sysfs(struct ocfs2_super *osb);
+void ocfs2_filecheck_remove_sysfs(struct ocfs2_super *osb);
+
+#endif /* FILECHECK_H */
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index d8208b20dc53..22da768e65b7 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -1,29 +1,14 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* heartbeat.c
*
- * Register ourselves with the heartbaet service, keep our node maps
+ * Register ourselves with the heartbeat service, keep our node maps
* up to date, and fire off recovery when needed.
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
+#include <linux/bitmap.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/highmem.h>
@@ -40,18 +25,12 @@
#include "buffer_head_io.h"
-static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
- int bit);
-static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
- int bit);
-
/* special case -1 for now
* TODO: should *really* make sure the calling func never passes -1!! */
static void ocfs2_node_map_init(struct ocfs2_node_map *map)
{
map->num_nodes = OCFS2_NODE_MAP_MAX_NODES;
- memset(map->map, 0, BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES) *
- sizeof(unsigned long));
+ bitmap_zero(map->map, OCFS2_NODE_MAP_MAX_NODES);
}
void ocfs2_init_node_maps(struct ocfs2_super *osb)
@@ -81,12 +60,6 @@ void ocfs2_do_node_down(int node_num, void *data)
ocfs2_recovery_thread(osb, node_num);
}
-static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
- int bit)
-{
- set_bit(bit, map->map);
-}
-
void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
struct ocfs2_node_map *map,
int bit)
@@ -95,16 +68,10 @@ void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
return;
BUG_ON(bit >= map->num_nodes);
spin_lock(&osb->node_map_lock);
- __ocfs2_node_map_set_bit(map, bit);
+ set_bit(bit, map->map);
spin_unlock(&osb->node_map_lock);
}
-static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
- int bit)
-{
- clear_bit(bit, map->map);
-}
-
void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
struct ocfs2_node_map *map,
int bit)
@@ -113,7 +80,7 @@ void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
return;
BUG_ON(bit >= map->num_nodes);
spin_lock(&osb->node_map_lock);
- __ocfs2_node_map_clear_bit(map, bit);
+ clear_bit(bit, map->map);
spin_unlock(&osb->node_map_lock);
}
diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h
index 74b9c5dda28d..f1f8b1802fe4 100644
--- a/fs/ocfs2/heartbeat.h
+++ b/fs/ocfs2/heartbeat.h
@@ -1,26 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* heartbeat.h
*
* Function prototypes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_HEARTBEAT_H
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index f87f9bd1edff..8340525e5589 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -1,26 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* inode.c
*
* vfs' aops, fops, dops and iops
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
@@ -28,6 +12,7 @@
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
+#include <linux/iversion.h>
#include <asm/byteorder.h>
@@ -53,6 +38,7 @@
#include "xattr.h"
#include "refcounttree.h"
#include "ocfs2_trace.h"
+#include "filecheck.h"
#include "buffer_head_io.h"
@@ -64,8 +50,6 @@ struct ocfs2_find_inode_args
unsigned int fi_sysfile_type;
};
-static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES];
-
static int ocfs2_read_locked_inode(struct inode *inode,
struct ocfs2_find_inode_args *args);
static int ocfs2_init_locked_inode(struct inode *inode, void *opaque);
@@ -74,6 +58,14 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
struct inode *inode,
struct buffer_head *fe_bh);
+static int ocfs2_filecheck_read_inode_block_full(struct inode *inode,
+ struct buffer_head **bh,
+ int flags, int type);
+static int ocfs2_filecheck_validate_inode_block(struct super_block *sb,
+ struct buffer_head *bh);
+static int ocfs2_filecheck_repair_inode_block(struct super_block *sb,
+ struct buffer_head *bh);
+
void ocfs2_set_inode_flags(struct inode *inode)
{
unsigned int flags = OCFS2_I(inode)->ip_attr;
@@ -127,9 +119,11 @@ struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno)
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
int sysfile_type)
{
+ int rc = -ESTALE;
struct inode *inode = NULL;
struct super_block *sb = osb->sb;
struct ocfs2_find_inode_args args;
+ journal_t *journal = osb->journal->j_journal;
trace_ocfs2_iget_begin((unsigned long long)blkno, flags,
sysfile_type);
@@ -158,17 +152,43 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
mlog_errno(PTR_ERR(inode));
goto bail;
}
- trace_ocfs2_iget5_locked(inode->i_state);
- if (inode->i_state & I_NEW) {
- ocfs2_read_locked_inode(inode, &args);
+ trace_ocfs2_iget5_locked(inode_state_read_once(inode));
+ if (inode_state_read_once(inode) & I_NEW) {
+ rc = ocfs2_read_locked_inode(inode, &args);
unlock_new_inode(inode);
}
if (is_bad_inode(inode)) {
iput(inode);
- inode = ERR_PTR(-ESTALE);
+ inode = ERR_PTR(rc);
goto bail;
}
+ /*
+ * Set transaction id's of transactions that have to be committed
+ * to finish f[data]sync. We set them to currently running transaction
+ * as we cannot be sure that the inode or some of its metadata isn't
+ * part of the transaction - the inode could have been reclaimed and
+ * now it is reread from disk.
+ */
+ if (journal) {
+ transaction_t *transaction;
+ tid_t tid;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+ read_lock(&journal->j_state_lock);
+ if (journal->j_running_transaction)
+ transaction = journal->j_running_transaction;
+ else
+ transaction = journal->j_committing_transaction;
+ if (transaction)
+ tid = transaction->t_tid;
+ else
+ tid = journal->j_commit_sequence;
+ read_unlock(&journal->j_state_lock);
+ oi->i_sync_tid = tid;
+ oi->i_datasync_tid = tid;
+ }
+
bail:
if (!IS_ERR(inode)) {
trace_ocfs2_iget_end(inode,
@@ -178,6 +198,22 @@ bail:
return inode;
}
+static int ocfs2_dinode_has_extents(struct ocfs2_dinode *di)
+{
+ /* inodes flagged with other stuff in id2 */
+ if (le32_to_cpu(di->i_flags) &
+ (OCFS2_SUPER_BLOCK_FL | OCFS2_LOCAL_ALLOC_FL | OCFS2_CHAIN_FL |
+ OCFS2_DEALLOC_FL))
+ return 0;
+ /* i_flags doesn't indicate when id2 is a fast symlink */
+ if (S_ISLNK(le16_to_cpu(di->i_mode)) && le64_to_cpu(di->i_size) &&
+ !le32_to_cpu(di->i_clusters))
+ return 0;
+ if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)
+ return 0;
+
+ return 1;
+}
/*
* here's how inodes get read from disk:
@@ -214,14 +250,77 @@ bail:
static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
{
struct ocfs2_find_inode_args *args = opaque;
+#ifdef CONFIG_LOCKDEP
+ static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES];
static struct lock_class_key ocfs2_quota_ip_alloc_sem_key,
ocfs2_file_ip_alloc_sem_key;
+#endif
inode->i_ino = args->fi_ino;
OCFS2_I(inode)->ip_blkno = args->fi_blkno;
- if (args->fi_sysfile_type != 0)
- lockdep_set_class(&inode->i_mutex,
- &ocfs2_sysfile_lock_key[args->fi_sysfile_type]);
+#ifdef CONFIG_LOCKDEP
+ switch (args->fi_sysfile_type) {
+ case BAD_BLOCK_SYSTEM_INODE:
+ break;
+ case GLOBAL_INODE_ALLOC_SYSTEM_INODE:
+ lockdep_set_class(&inode->i_rwsem,
+ &ocfs2_sysfile_lock_key[GLOBAL_INODE_ALLOC_SYSTEM_INODE]);
+ break;
+ case SLOT_MAP_SYSTEM_INODE:
+ lockdep_set_class(&inode->i_rwsem,
+ &ocfs2_sysfile_lock_key[SLOT_MAP_SYSTEM_INODE]);
+ break;
+ case HEARTBEAT_SYSTEM_INODE:
+ lockdep_set_class(&inode->i_rwsem,
+ &ocfs2_sysfile_lock_key[HEARTBEAT_SYSTEM_INODE]);
+ break;
+ case GLOBAL_BITMAP_SYSTEM_INODE:
+ lockdep_set_class(&inode->i_rwsem,
+ &ocfs2_sysfile_lock_key[GLOBAL_BITMAP_SYSTEM_INODE]);
+ break;
+ case USER_QUOTA_SYSTEM_INODE:
+ lockdep_set_class(&inode->i_rwsem,
+ &ocfs2_sysfile_lock_key[USER_QUOTA_SYSTEM_INODE]);
+ break;
+ case GROUP_QUOTA_SYSTEM_INODE:
+ lockdep_set_class(&inode->i_rwsem,
+ &ocfs2_sysfile_lock_key[GROUP_QUOTA_SYSTEM_INODE]);
+ break;
+ case ORPHAN_DIR_SYSTEM_INODE:
+ lockdep_set_class(&inode->i_rwsem,
+ &ocfs2_sysfile_lock_key[ORPHAN_DIR_SYSTEM_INODE]);
+ break;
+ case EXTENT_ALLOC_SYSTEM_INODE:
+ lockdep_set_class(&inode->i_rwsem,
+ &ocfs2_sysfile_lock_key[EXTENT_ALLOC_SYSTEM_INODE]);
+ break;
+ case INODE_ALLOC_SYSTEM_INODE:
+ lockdep_set_class(&inode->i_rwsem,
+ &ocfs2_sysfile_lock_key[INODE_ALLOC_SYSTEM_INODE]);
+ break;
+ case JOURNAL_SYSTEM_INODE:
+ lockdep_set_class(&inode->i_rwsem,
+ &ocfs2_sysfile_lock_key[JOURNAL_SYSTEM_INODE]);
+ break;
+ case LOCAL_ALLOC_SYSTEM_INODE:
+ lockdep_set_class(&inode->i_rwsem,
+ &ocfs2_sysfile_lock_key[LOCAL_ALLOC_SYSTEM_INODE]);
+ break;
+ case TRUNCATE_LOG_SYSTEM_INODE:
+ lockdep_set_class(&inode->i_rwsem,
+ &ocfs2_sysfile_lock_key[TRUNCATE_LOG_SYSTEM_INODE]);
+ break;
+ case LOCAL_USER_QUOTA_SYSTEM_INODE:
+ lockdep_set_class(&inode->i_rwsem,
+ &ocfs2_sysfile_lock_key[LOCAL_USER_QUOTA_SYSTEM_INODE]);
+ break;
+ case LOCAL_GROUP_QUOTA_SYSTEM_INODE:
+ lockdep_set_class(&inode->i_rwsem,
+ &ocfs2_sysfile_lock_key[LOCAL_GROUP_QUOTA_SYSTEM_INODE]);
+ break;
+ default:
+ WARN_ONCE(1, "Unknown sysfile type %d\n", args->fi_sysfile_type);
+ }
if (args->fi_sysfile_type == USER_QUOTA_SYSTEM_INODE ||
args->fi_sysfile_type == GROUP_QUOTA_SYSTEM_INODE ||
args->fi_sysfile_type == LOCAL_USER_QUOTA_SYSTEM_INODE ||
@@ -231,6 +330,7 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
else
lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem,
&ocfs2_file_ip_alloc_sem_key);
+#endif
return 0;
}
@@ -265,7 +365,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features);
- inode->i_version = 1;
+ inode_set_iversion(inode, 1);
inode->i_generation = le32_to_cpu(fe->i_generation);
inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
inode->i_mode = le16_to_cpu(fe->i_mode);
@@ -280,12 +380,12 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
inode->i_blocks = ocfs2_inode_sector_count(inode);
inode->i_mapping->a_ops = &ocfs2_aops;
}
- inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
- inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
- inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
- inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec);
- inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime);
- inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec);
+ inode_set_atime(inode, le64_to_cpu(fe->i_atime),
+ le32_to_cpu(fe->i_atime_nsec));
+ inode_set_mtime(inode, le64_to_cpu(fe->i_mtime),
+ le32_to_cpu(fe->i_mtime_nsec));
+ inode_set_ctime(inode, le64_to_cpu(fe->i_ctime),
+ le32_to_cpu(fe->i_ctime_nsec));
if (OCFS2_I(inode)->ip_blkno != le64_to_cpu(fe->i_blkno))
mlog(ML_ERROR,
@@ -334,6 +434,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
break;
case S_IFLNK:
inode->i_op = &ocfs2_symlink_inode_operations;
+ inode_nohighmem(inode);
i_size_write(inode, le64_to_cpu(fe->i_size));
break;
default:
@@ -382,23 +483,13 @@ static int ocfs2_read_locked_inode(struct inode *inode,
struct ocfs2_super *osb;
struct ocfs2_dinode *fe;
struct buffer_head *bh = NULL;
- int status, can_lock;
+ int status, can_lock, lock_level = 0;
u32 generation = 0;
status = -EINVAL;
- if (inode == NULL || inode->i_sb == NULL) {
- mlog(ML_ERROR, "bad inode\n");
- return status;
- }
sb = inode->i_sb;
osb = OCFS2_SB(sb);
- if (!args) {
- mlog(ML_ERROR, "bad inode args\n");
- make_bad_inode(inode);
- return status;
- }
-
/*
* To improve performance of cold-cache inode stats, we take
* the cluster lock here if possible.
@@ -460,7 +551,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
mlog_errno(status);
return status;
}
- status = ocfs2_inode_lock(inode, NULL, 0);
+ status = ocfs2_inode_lock(inode, NULL, lock_level);
if (status) {
make_bad_inode(inode);
mlog_errno(status);
@@ -477,16 +568,32 @@ static int ocfs2_read_locked_inode(struct inode *inode,
}
if (can_lock) {
- status = ocfs2_read_inode_block_full(inode, &bh,
- OCFS2_BH_IGNORE_CACHE);
+ if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK)
+ status = ocfs2_filecheck_read_inode_block_full(inode,
+ &bh, OCFS2_BH_IGNORE_CACHE, 0);
+ else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX)
+ status = ocfs2_filecheck_read_inode_block_full(inode,
+ &bh, OCFS2_BH_IGNORE_CACHE, 1);
+ else
+ status = ocfs2_read_inode_block_full(inode,
+ &bh, OCFS2_BH_IGNORE_CACHE);
} else {
status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
/*
* If buffer is in jbd, then its checksum may not have been
* computed as yet.
*/
- if (!status && !buffer_jbd(bh))
- status = ocfs2_validate_inode_block(osb->sb, bh);
+ if (!status && !buffer_jbd(bh)) {
+ if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK)
+ status = ocfs2_filecheck_validate_inode_block(
+ osb->sb, bh);
+ else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX)
+ status = ocfs2_filecheck_repair_inode_block(
+ osb->sb, bh);
+ else
+ status = ocfs2_validate_inode_block(
+ osb->sb, bh);
+ }
}
if (status < 0) {
mlog_errno(status);
@@ -503,7 +610,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
*/
mlog_bug_on_msg(!!(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) !=
!!(args->fi_flags & OCFS2_FI_FLAG_SYSFILE),
- "Inode %llu: system file state is ambigous\n",
+ "Inode %llu: system file state is ambiguous\n",
(unsigned long long)args->fi_blkno);
if (S_ISCHR(le16_to_cpu(fe->i_mode)) ||
@@ -514,17 +621,29 @@ static int ocfs2_read_locked_inode(struct inode *inode,
BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));
+ if (buffer_dirty(bh) && !buffer_jbd(bh)) {
+ if (can_lock) {
+ ocfs2_inode_unlock(inode, lock_level);
+ lock_level = 1;
+ ocfs2_inode_lock(inode, NULL, lock_level);
+ }
+ status = ocfs2_write_block(osb, bh, INODE_CACHE(inode));
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
status = 0;
bail:
if (can_lock)
- ocfs2_inode_unlock(inode, 0);
+ ocfs2_inode_unlock(inode, lock_level);
if (status < 0)
make_bad_inode(inode);
- if (args && bh)
- brelse(bh);
+ brelse(bh);
return status;
}
@@ -580,10 +699,8 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
handle = NULL;
status = ocfs2_commit_truncate(osb, inode, fe_bh);
- if (status < 0) {
+ if (status < 0)
mlog_errno(status);
- goto out;
- }
}
out:
@@ -608,15 +725,15 @@ static int ocfs2_remove_inode(struct inode *inode,
ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
le16_to_cpu(di->i_suballoc_slot));
if (!inode_alloc_inode) {
- status = -EEXIST;
+ status = -ENOENT;
mlog_errno(status);
goto bail;
}
- mutex_lock(&inode_alloc_inode->i_mutex);
+ inode_lock(inode_alloc_inode);
status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1);
if (status < 0) {
- mutex_unlock(&inode_alloc_inode->i_mutex);
+ inode_unlock(inode_alloc_inode);
mlog_errno(status);
goto bail;
@@ -632,7 +749,7 @@ static int ocfs2_remove_inode(struct inode *inode,
if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
- orphan_dir_bh);
+ orphan_dir_bh, false);
if (status < 0) {
mlog_errno(status);
goto bail_commit;
@@ -647,7 +764,7 @@ static int ocfs2_remove_inode(struct inode *inode,
goto bail_commit;
}
- di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
+ di->i_dtime = cpu_to_le64(ktime_get_real_seconds());
di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
ocfs2_journal_dirty(handle, di_bh);
@@ -663,7 +780,7 @@ bail_commit:
ocfs2_commit_trans(osb, handle);
bail_unlock:
ocfs2_inode_unlock(inode_alloc_inode, 1);
- mutex_unlock(&inode_alloc_inode->i_mutex);
+ inode_unlock(inode_alloc_inode);
brelse(inode_alloc_bh);
bail:
iput(inode_alloc_inode);
@@ -674,7 +791,7 @@ bail:
/*
* Serialize with orphan dir recovery. If the process doing
* recovery on this orphan dir does an iget() with the dir
- * i_mutex held, we'll deadlock here. Instead we detect this
+ * i_rwsem held, we'll deadlock here. Instead we detect this
* and exit early - recovery will wipe this inode for us.
*/
static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb,
@@ -726,7 +843,7 @@ static int ocfs2_wipe_inode(struct inode *inode,
ORPHAN_DIR_SYSTEM_INODE,
orphaned_slot);
if (!orphan_dir_inode) {
- status = -EEXIST;
+ status = -ENOENT;
mlog_errno(status);
goto bail;
}
@@ -734,10 +851,10 @@ static int ocfs2_wipe_inode(struct inode *inode,
/* Lock the orphan dir. The lock will be held for the entire
* delete_inode operation. We do this now to avoid races with
* recovery completion on other nodes. */
- mutex_lock(&orphan_dir_inode->i_mutex);
+ inode_lock(orphan_dir_inode);
status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
if (status < 0) {
- mutex_unlock(&orphan_dir_inode->i_mutex);
+ inode_unlock(orphan_dir_inode);
mlog_errno(status);
goto bail;
@@ -786,7 +903,7 @@ bail_unlock_dir:
return status;
ocfs2_inode_unlock(orphan_dir_inode, 1);
- mutex_unlock(&orphan_dir_inode->i_mutex);
+ inode_unlock(orphan_dir_inode);
brelse(orphan_dir_bh);
bail:
iput(orphan_dir_inode);
@@ -814,11 +931,13 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
goto bail;
}
- /* If we're coming from downconvert_thread we can't go into our own
- * voting [hello, deadlock city!], so unforuntately we just
- * have to skip deleting this guy. That's OK though because
- * the node who's doing the actual deleting should handle it
- * anyway. */
+ /*
+ * If we're coming from downconvert_thread we can't go into our own
+ * voting [hello, deadlock city!] so we cannot delete the inode. But
+ * since we dropped last inode ref when downconverting dentry lock,
+ * we cannot have the file open and thus the node doing unlink will
+ * take care of deleting the inode.
+ */
if (current == osb->dc_task)
goto bail;
@@ -832,12 +951,6 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
goto bail_unlock;
}
- /* If we have allowd wipe of this inode for another node, it
- * will be marked here so we can safely skip it. Recovery will
- * cleanup any inodes we might inadvertently skip here. */
- if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE)
- goto bail_unlock;
-
ret = 1;
bail_unlock:
spin_unlock(&oi->ip_lock);
@@ -951,7 +1064,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode,
(unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data);
if (sync_data)
filemap_write_and_wait(inode->i_mapping);
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
}
static void ocfs2_delete_inode(struct inode *inode)
@@ -959,6 +1072,7 @@ static void ocfs2_delete_inode(struct inode *inode)
int wipe, status;
sigset_t oldset;
struct buffer_head *di_bh = NULL;
+ struct ocfs2_dinode *di = NULL;
trace_ocfs2_delete_inode(inode->i_ino,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -970,8 +1084,6 @@ static void ocfs2_delete_inode(struct inode *inode)
if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno)
goto bail;
- dquot_initialize(inode);
-
if (!ocfs2_inode_is_valid_to_delete(inode)) {
/* It's probably not necessary to truncate_inode_pages
* here but we do it for safety anyway (it will most
@@ -980,6 +1092,8 @@ static void ocfs2_delete_inode(struct inode *inode)
goto bail;
}
+ dquot_initialize(inode);
+
/* We want to block signals in delete_inode as the lock and
* messaging paths may return us -ERESTARTSYS. Which would
* cause us to exit early, resulting in inodes being orphaned
@@ -1013,6 +1127,14 @@ static void ocfs2_delete_inode(struct inode *inode)
goto bail_unlock_nfs_sync;
}
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+ /* Skip inode deletion and wait for dio orphan entry recovered
+ * first */
+ if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
+ ocfs2_cleanup_delete_inode(inode, 0);
+ goto bail_unlock_inode;
+ }
+
/* Query the cluster. This will be the final decision made
* before we go ahead and wipe the inode. */
status = ocfs2_query_inode_wipe(inode, di_bh, &wipe);
@@ -1067,27 +1189,28 @@ static void ocfs2_clear_inode(struct inode *inode)
{
int status;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
clear_inode(inode);
trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno,
inode->i_nlink);
- mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
+ mlog_bug_on_msg(osb == NULL,
"Inode=%lu\n", inode->i_ino);
dquot_drop(inode);
- /* To preven remote deletes we hold open lock before, now it
+ /* To prevent remote deletes we hold open lock before, now it
* is time to unlock PR and EX open locks. */
ocfs2_open_unlock(inode);
/* Do these before all the other work so that we don't bounce
* the downconvert thread while waiting to destroy the locks. */
- ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
- ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres);
- ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
+ ocfs2_mark_lockres_freeing(osb, &oi->ip_rw_lockres);
+ ocfs2_mark_lockres_freeing(osb, &oi->ip_inode_lockres);
+ ocfs2_mark_lockres_freeing(osb, &oi->ip_open_lockres);
- ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap,
+ ocfs2_resv_discard(&osb->osb_la_resmap,
&oi->ip_la_data_resv);
ocfs2_resv_init_once(&oi->ip_la_data_resv);
@@ -1097,12 +1220,15 @@ static void ocfs2_clear_inode(struct inode *inode)
* exception here are successfully wiped inodes - their
* metadata can now be considered to be part of the system
* inodes from which it came. */
- if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED))
+ if (!(oi->ip_flags & OCFS2_INODE_DELETED))
ocfs2_checkpoint_inode(inode);
mlog_bug_on_msg(!list_empty(&oi->ip_io_markers),
"Clear inode of %llu, inode has io markers\n",
(unsigned long long)oi->ip_blkno);
+ mlog_bug_on_msg(!list_empty(&oi->ip_unwritten_list),
+ "Clear inode of %llu, inode has unwritten extents\n",
+ (unsigned long long)oi->ip_blkno);
ocfs2_extent_map_trunc(inode, 0);
@@ -1157,46 +1283,32 @@ static void ocfs2_clear_inode(struct inode *inode)
* the journal is flushed before journal shutdown. Thus it is safe to
* have inodes get cleaned up after journal shutdown.
*/
- jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal,
+ if (!osb->journal)
+ return;
+
+ jbd2_journal_release_jbd_inode(osb->journal->j_journal,
&oi->ip_jinode);
}
void ocfs2_evict_inode(struct inode *inode)
{
+ write_inode_now(inode, 1);
+
if (!inode->i_nlink ||
(OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) {
ocfs2_delete_inode(inode);
} else {
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
}
ocfs2_clear_inode(inode);
}
-/* Called under inode_lock, with no more references on the
- * struct inode, so it's safe here to check the flags field
- * and to manipulate i_nlink without any other locks. */
-int ocfs2_drop_inode(struct inode *inode)
-{
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
- int res;
-
- trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno,
- inode->i_nlink, oi->ip_flags);
-
- if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
- res = 1;
- else
- res = generic_drop_inode(inode);
-
- return res;
-}
-
/*
* This is called from our getattr.
*/
int ocfs2_inode_revalidate(struct dentry *dentry)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
int status = 0;
trace_ocfs2_inode_revalidate(inode,
@@ -1262,14 +1374,15 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
fe->i_uid = cpu_to_le32(i_uid_read(inode));
fe->i_gid = cpu_to_le32(i_gid_read(inode));
fe->i_mode = cpu_to_le16(inode->i_mode);
- fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
- fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
- fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
- fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
- fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
- fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ fe->i_atime = cpu_to_le64(inode_get_atime_sec(inode));
+ fe->i_atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode));
+ fe->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+ fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
+ fe->i_mtime = cpu_to_le64(inode_get_mtime_sec(inode));
+ fe->i_mtime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode));
ocfs2_journal_dirty(handle, bh);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
leave:
return status;
}
@@ -1297,12 +1410,12 @@ void ocfs2_refresh_inode(struct inode *inode,
inode->i_blocks = 0;
else
inode->i_blocks = ocfs2_inode_sector_count(inode);
- inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
- inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
- inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
- inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec);
- inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime);
- inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec);
+ inode_set_atime(inode, le64_to_cpu(fe->i_atime),
+ le32_to_cpu(fe->i_atime_nsec));
+ inode_set_mtime(inode, le64_to_cpu(fe->i_mtime),
+ le32_to_cpu(fe->i_mtime_nsec));
+ inode_set_ctime(inode, le64_to_cpu(fe->i_ctime),
+ le32_to_cpu(fe->i_ctime_nsec));
spin_unlock(&OCFS2_I(inode)->ip_lock);
}
@@ -1336,41 +1449,256 @@ int ocfs2_validate_inode_block(struct super_block *sb,
rc = -EINVAL;
if (!OCFS2_IS_VALID_DINODE(di)) {
- ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n",
- (unsigned long long)bh->b_blocknr, 7,
- di->i_signature);
+ rc = ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7,
+ di->i_signature);
goto bail;
}
if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
- ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)le64_to_cpu(di->i_blkno));
+ rc = ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(di->i_blkno));
goto bail;
}
- if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
- ocfs2_error(sb,
- "Invalid dinode #%llu: OCFS2_VALID_FL not set\n",
- (unsigned long long)bh->b_blocknr);
+ if (!(le32_to_cpu(di->i_flags) & OCFS2_VALID_FL)) {
+ rc = ocfs2_error(sb,
+ "Invalid dinode #%llu: OCFS2_VALID_FL not set\n",
+ (unsigned long long)bh->b_blocknr);
goto bail;
}
if (le32_to_cpu(di->i_fs_generation) !=
OCFS2_SB(sb)->fs_generation) {
- ocfs2_error(sb,
- "Invalid dinode #%llu: fs_generation is %u\n",
- (unsigned long long)bh->b_blocknr,
- le32_to_cpu(di->i_fs_generation));
+ rc = ocfs2_error(sb,
+ "Invalid dinode #%llu: fs_generation is %u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(di->i_fs_generation));
goto bail;
}
+ if (le16_to_cpu(di->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT &&
+ (u32)le16_to_cpu(di->i_suballoc_slot) > OCFS2_SB(sb)->max_slots - 1) {
+ rc = ocfs2_error(sb, "Invalid dinode %llu: suballoc slot %u\n",
+ (unsigned long long)bh->b_blocknr,
+ le16_to_cpu(di->i_suballoc_slot));
+ goto bail;
+ }
+
+ if ((le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) &&
+ le32_to_cpu(di->i_clusters)) {
+ rc = ocfs2_error(sb, "Invalid dinode %llu: %u clusters\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(di->i_clusters));
+ goto bail;
+ }
+
+ if (le32_to_cpu(di->i_flags) & OCFS2_CHAIN_FL) {
+ struct ocfs2_chain_list *cl = &di->id2.i_chain;
+ u16 bpc = 1 << (OCFS2_SB(sb)->s_clustersize_bits -
+ sb->s_blocksize_bits);
+
+ if (le16_to_cpu(cl->cl_count) != ocfs2_chain_recs_per_inode(sb)) {
+ rc = ocfs2_error(sb, "Invalid dinode %llu: chain list count %u\n",
+ (unsigned long long)bh->b_blocknr,
+ le16_to_cpu(cl->cl_count));
+ goto bail;
+ }
+ if (le16_to_cpu(cl->cl_next_free_rec) > le16_to_cpu(cl->cl_count)) {
+ rc = ocfs2_error(sb, "Invalid dinode %llu: chain list index %u\n",
+ (unsigned long long)bh->b_blocknr,
+ le16_to_cpu(cl->cl_next_free_rec));
+ goto bail;
+ }
+ if (OCFS2_SB(sb)->bitmap_blkno &&
+ OCFS2_SB(sb)->bitmap_blkno != le64_to_cpu(di->i_blkno) &&
+ le16_to_cpu(cl->cl_bpc) != bpc) {
+ rc = ocfs2_error(sb, "Invalid dinode %llu: bits per cluster %u\n",
+ (unsigned long long)bh->b_blocknr,
+ le16_to_cpu(cl->cl_bpc));
+ goto bail;
+ }
+ }
+
rc = 0;
bail:
return rc;
}
+static int ocfs2_filecheck_validate_inode_block(struct super_block *sb,
+ struct buffer_head *bh)
+{
+ int rc = 0;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
+
+ trace_ocfs2_filecheck_validate_inode_block(
+ (unsigned long long)bh->b_blocknr);
+
+ BUG_ON(!buffer_uptodate(bh));
+
+ /*
+ * Call ocfs2_validate_meta_ecc() first since it has ecc repair
+ * function, but we should not return error immediately when ecc
+ * validation fails, because the reason is quite likely the invalid
+ * inode number inputted.
+ */
+ rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check);
+ if (rc) {
+ mlog(ML_ERROR,
+ "Filecheck: checksum failed for dinode %llu\n",
+ (unsigned long long)bh->b_blocknr);
+ rc = -OCFS2_FILECHECK_ERR_BLOCKECC;
+ }
+
+ if (!OCFS2_IS_VALID_DINODE(di)) {
+ mlog(ML_ERROR,
+ "Filecheck: invalid dinode #%llu: signature = %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7, di->i_signature);
+ rc = -OCFS2_FILECHECK_ERR_INVALIDINO;
+ goto bail;
+ } else if (rc)
+ goto bail;
+
+ if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
+ mlog(ML_ERROR,
+ "Filecheck: invalid dinode #%llu: i_blkno is %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(di->i_blkno));
+ rc = -OCFS2_FILECHECK_ERR_BLOCKNO;
+ goto bail;
+ }
+
+ if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
+ mlog(ML_ERROR,
+ "Filecheck: invalid dinode #%llu: OCFS2_VALID_FL "
+ "not set\n",
+ (unsigned long long)bh->b_blocknr);
+ rc = -OCFS2_FILECHECK_ERR_VALIDFLAG;
+ goto bail;
+ }
+
+ if (le32_to_cpu(di->i_fs_generation) !=
+ OCFS2_SB(sb)->fs_generation) {
+ mlog(ML_ERROR,
+ "Filecheck: invalid dinode #%llu: fs_generation is %u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(di->i_fs_generation));
+ rc = -OCFS2_FILECHECK_ERR_GENERATION;
+ }
+
+bail:
+ return rc;
+}
+
+static int ocfs2_filecheck_repair_inode_block(struct super_block *sb,
+ struct buffer_head *bh)
+{
+ int changed = 0;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
+
+ if (!ocfs2_filecheck_validate_inode_block(sb, bh))
+ return 0;
+
+ trace_ocfs2_filecheck_repair_inode_block(
+ (unsigned long long)bh->b_blocknr);
+
+ if (ocfs2_is_hard_readonly(OCFS2_SB(sb)) ||
+ ocfs2_is_soft_readonly(OCFS2_SB(sb))) {
+ mlog(ML_ERROR,
+ "Filecheck: cannot repair dinode #%llu "
+ "on readonly filesystem\n",
+ (unsigned long long)bh->b_blocknr);
+ return -OCFS2_FILECHECK_ERR_READONLY;
+ }
+
+ if (buffer_jbd(bh)) {
+ mlog(ML_ERROR,
+ "Filecheck: cannot repair dinode #%llu, "
+ "its buffer is in jbd\n",
+ (unsigned long long)bh->b_blocknr);
+ return -OCFS2_FILECHECK_ERR_INJBD;
+ }
+
+ if (!OCFS2_IS_VALID_DINODE(di)) {
+ /* Cannot fix invalid inode block */
+ return -OCFS2_FILECHECK_ERR_INVALIDINO;
+ }
+
+ if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
+ /* Cannot just add VALID_FL flag back as a fix,
+ * need more things to check here.
+ */
+ return -OCFS2_FILECHECK_ERR_VALIDFLAG;
+ }
+
+ if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
+ di->i_blkno = cpu_to_le64(bh->b_blocknr);
+ changed = 1;
+ mlog(ML_ERROR,
+ "Filecheck: reset dinode #%llu: i_blkno to %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(di->i_blkno));
+ }
+
+ if (le32_to_cpu(di->i_fs_generation) !=
+ OCFS2_SB(sb)->fs_generation) {
+ di->i_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
+ changed = 1;
+ mlog(ML_ERROR,
+ "Filecheck: reset dinode #%llu: fs_generation to %u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(di->i_fs_generation));
+ }
+
+ if (ocfs2_dinode_has_extents(di) &&
+ le16_to_cpu(di->id2.i_list.l_next_free_rec) > le16_to_cpu(di->id2.i_list.l_count)) {
+ di->id2.i_list.l_next_free_rec = di->id2.i_list.l_count;
+ changed = 1;
+ mlog(ML_ERROR,
+ "Filecheck: reset dinode #%llu: l_next_free_rec to %u\n",
+ (unsigned long long)bh->b_blocknr,
+ le16_to_cpu(di->id2.i_list.l_next_free_rec));
+ }
+
+ if (changed || ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check)) {
+ ocfs2_compute_meta_ecc(sb, bh->b_data, &di->i_check);
+ mark_buffer_dirty(bh);
+ mlog(ML_ERROR,
+ "Filecheck: reset dinode #%llu: compute meta ecc\n",
+ (unsigned long long)bh->b_blocknr);
+ }
+
+ return 0;
+}
+
+static int
+ocfs2_filecheck_read_inode_block_full(struct inode *inode,
+ struct buffer_head **bh,
+ int flags, int type)
+{
+ int rc;
+ struct buffer_head *tmp = *bh;
+
+ if (!type) /* Check inode block */
+ rc = ocfs2_read_blocks(INODE_CACHE(inode),
+ OCFS2_I(inode)->ip_blkno,
+ 1, &tmp, flags,
+ ocfs2_filecheck_validate_inode_block);
+ else /* Repair inode block */
+ rc = ocfs2_read_blocks(INODE_CACHE(inode),
+ OCFS2_I(inode)->ip_blkno,
+ 1, &tmp, flags,
+ ocfs2_filecheck_repair_inode_block);
+
+ /* If ocfs2_read_blocks() got us a new bh, pass it up. */
+ if (!rc && !*bh)
+ *bh = tmp;
+
+ return rc;
+}
+
int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
int flags)
{
@@ -1380,6 +1708,8 @@ int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
rc = ocfs2_read_blocks(INODE_CACHE(inode), OCFS2_I(inode)->ip_blkno,
1, &tmp, flags, ocfs2_validate_inode_block);
+ if (rc < 0)
+ make_bad_inode(inode);
/* If ocfs2_read_blocks() got us a new bh, pass it up. */
if (!rc && !*bh)
*bh = tmp;
@@ -1408,6 +1738,7 @@ static struct super_block *ocfs2_inode_cache_get_super(struct ocfs2_caching_info
}
static void ocfs2_inode_cache_lock(struct ocfs2_caching_info *ci)
+__acquires(&oi->ip_lock)
{
struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
@@ -1415,6 +1746,7 @@ static void ocfs2_inode_cache_lock(struct ocfs2_caching_info *ci)
}
static void ocfs2_inode_cache_unlock(struct ocfs2_caching_info *ci)
+__releases(&oi->ip_lock)
{
struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 621fc73bf23d..07bd838e7843 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -1,26 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* inode.h
*
* Function prototypes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_INODE_H
@@ -43,9 +27,6 @@ struct ocfs2_inode_info
/* protects extended attribute changes on this inode */
struct rw_semaphore ip_xattr_sem;
- /* Number of outstanding AIO's which are not page aligned */
- atomic_t ip_unaligned_aio;
-
/* These fields are protected by ip_lock */
spinlock_t ip_lock;
u32 ip_open_count;
@@ -57,6 +38,9 @@ struct ocfs2_inode_info
u32 ip_flags; /* see below */
u32 ip_attr; /* inode attributes */
+ /* Record unwritten extents during direct io. */
+ struct list_head ip_unwritten_list;
+
/* protected by recovery_lock. */
struct inode *ip_next_orphan;
@@ -73,6 +57,15 @@ struct ocfs2_inode_info
u32 ip_dir_lock_gen;
struct ocfs2_alloc_reservation ip_la_data_resv;
+
+ /*
+ * Transactions that contain inode's metadata needed to complete
+ * fsync and fdatasync, respectively.
+ */
+ tid_t i_sync_tid;
+ tid_t i_datasync_tid;
+
+ struct dquot __rcu *i_dquot[MAXQUOTAS];
};
/*
@@ -84,8 +77,6 @@ struct ocfs2_inode_info
#define OCFS2_INODE_BITMAP 0x00000004
/* This inode has been wiped from disk */
#define OCFS2_INODE_DELETED 0x00000008
-/* Another node is deleting, so our delete is a nop */
-#define OCFS2_INODE_SKIP_DELETE 0x00000010
/* Has the inode been orphaned on another node?
*
* This hints to ocfs2_drop_inode that it should clear i_nlink before
@@ -100,11 +91,13 @@ struct ocfs2_inode_info
* rely on ocfs2_delete_inode to sort things out under the proper
* cluster locks.
*/
-#define OCFS2_INODE_MAYBE_ORPHANED 0x00000020
+#define OCFS2_INODE_MAYBE_ORPHANED 0x00000010
/* Does someone have the file open O_DIRECT */
-#define OCFS2_INODE_OPEN_DIRECT 0x00000040
+#define OCFS2_INODE_OPEN_DIRECT 0x00000020
/* Tell the inode wipe code it's not in orphan dir */
-#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000080
+#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000040
+/* Entry in orphan dir with 'dio-' prefix */
+#define OCFS2_INODE_DIO_ORPHAN_ENTRY 0x00000080
static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
{
@@ -114,8 +107,6 @@ static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
#define INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags & OCFS2_INODE_JOURNAL)
#define SET_INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags |= OCFS2_INODE_JOURNAL)
-extern struct kmem_cache *ocfs2_inode_cache;
-
extern const struct address_space_operations ocfs2_aops;
extern const struct ocfs2_caching_operations ocfs2_inode_caching_ops;
@@ -125,30 +116,25 @@ static inline struct ocfs2_caching_info *INODE_CACHE(struct inode *inode)
}
void ocfs2_evict_inode(struct inode *inode);
-int ocfs2_drop_inode(struct inode *inode);
/* Flags for ocfs2_iget() */
#define OCFS2_FI_FLAG_SYSFILE 0x1
#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2
+#define OCFS2_FI_FLAG_FILECHECK_CHK 0x4
+#define OCFS2_FI_FLAG_FILECHECK_FIX 0x8
+
struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff);
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
int sysfile_type);
-int ocfs2_inode_init_private(struct inode *inode);
int ocfs2_inode_revalidate(struct dentry *dentry);
void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
int create_ino);
-void ocfs2_read_inode(struct inode *inode);
-void ocfs2_read_inode2(struct inode *inode, void *opaque);
-ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf,
- size_t size, loff_t *offp);
void ocfs2_sync_blockdev(struct super_block *sb);
void ocfs2_refresh_inode(struct inode *inode,
struct ocfs2_dinode *fe);
int ocfs2_mark_inode_dirty(handle_t *handle,
struct inode *inode,
struct buffer_head *bh);
-struct buffer_head *ocfs2_bread(struct inode *inode,
- int block, int *err, int reada);
void ocfs2_set_inode_flags(struct inode *inode);
void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi);
@@ -157,7 +143,7 @@ static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode)
{
int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9;
- return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits);
+ return (blkcnt_t)OCFS2_I(inode)->ip_clusters << c_to_s_bits;
}
/* Validate that a bh contains a valid inode */
@@ -178,4 +164,10 @@ static inline struct ocfs2_inode_info *cache_info_to_inode(struct ocfs2_caching_
return container_of(ci, struct ocfs2_inode_info, ip_metadata_cache);
}
+/* Does this inode have the reflink flag set? */
+static inline bool ocfs2_is_refcount_inode(struct inode *inode)
+{
+ return (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
+}
+
#endif /* OCFS2_INODE_H */
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 0c60ef2d8056..b6864602814c 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ocfs2/ioctl.c
*
@@ -7,7 +8,9 @@
#include <linux/fs.h>
#include <linux/mount.h>
+#include <linux/blkdev.h>
#include <linux/compat.h>
+#include <linux/fileattr.h>
#include <cluster/masklog.h>
@@ -34,9 +37,8 @@
copy_to_user((typeof(a) __user *)b, &(a), sizeof(a))
/*
- * This call is void because we are already reporting an error that may
- * be -EFAULT. The error will be returned from the ioctl(2) call. It's
- * just a best-effort to tell userspace that this request caused the error.
+ * This is just a best-effort to tell userspace that this request
+ * caused the error.
*/
static inline void o2info_set_request_error(struct ocfs2_info_request *kreq,
struct ocfs2_info_request __user *req)
@@ -60,8 +62,10 @@ static inline int o2info_coherent(struct ocfs2_info_request *req)
return (!(req->ir_flags & OCFS2_INFO_FL_NON_COHERENT));
}
-static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
+int ocfs2_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
+ struct inode *inode = d_inode(dentry);
+ unsigned int flags;
int status;
status = ocfs2_inode_lock(inode, NULL, 0);
@@ -70,15 +74,19 @@ static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
return status;
}
ocfs2_get_inode_flags(OCFS2_I(inode));
- *flags = OCFS2_I(inode)->ip_attr;
+ flags = OCFS2_I(inode)->ip_attr;
ocfs2_inode_unlock(inode, 0);
+ fileattr_fill_flags(fa, flags & OCFS2_FL_VISIBLE);
+
return status;
}
-static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
- unsigned mask)
+int ocfs2_fileattr_set(struct mnt_idmap *idmap,
+ struct dentry *dentry, struct file_kattr *fa)
{
+ struct inode *inode = d_inode(dentry);
+ unsigned int flags = fa->flags;
struct ocfs2_inode_info *ocfs2_inode = OCFS2_I(inode);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
handle_t *handle = NULL;
@@ -86,7 +94,8 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
unsigned oldflags;
int status;
- mutex_lock(&inode->i_mutex);
+ if (fileattr_has_fsx(fa))
+ return -EOPNOTSUPP;
status = ocfs2_inode_lock(inode, &bh, 1);
if (status < 0) {
@@ -94,27 +103,18 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
goto bail;
}
- status = -EACCES;
- if (!inode_owner_or_capable(inode))
- goto bail_unlock;
-
if (!S_ISDIR(inode->i_mode))
flags &= ~OCFS2_DIRSYNC_FL;
oldflags = ocfs2_inode->ip_attr;
- flags = flags & mask;
- flags |= oldflags & ~mask;
+ flags = flags & OCFS2_FL_MODIFIABLE;
+ flags |= oldflags & ~OCFS2_FL_MODIFIABLE;
- /*
- * The IMMUTABLE and APPEND_ONLY flags can only be changed by
- * the relevant capability.
- */
+ /* Check already done by VFS, but repeat with ocfs lock */
status = -EPERM;
- if ((oldflags & OCFS2_IMMUTABLE_FL) || ((flags ^ oldflags) &
- (OCFS2_APPEND_FL | OCFS2_IMMUTABLE_FL))) {
- if (!capable(CAP_LINUX_IMMUTABLE))
- goto bail_unlock;
- }
+ if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL) &&
+ !capable(CAP_LINUX_IMMUTABLE))
+ goto bail_unlock;
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
if (IS_ERR(handle)) {
@@ -125,6 +125,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
ocfs2_inode->ip_attr = flags;
ocfs2_set_inode_flags(inode);
+ inode_set_ctime_current(inode);
status = ocfs2_mark_inode_dirty(handle, inode, bh);
if (status < 0)
@@ -135,146 +136,113 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
bail_unlock:
ocfs2_inode_unlock(inode, 1);
bail:
- mutex_unlock(&inode->i_mutex);
-
brelse(bh);
return status;
}
-int ocfs2_info_handle_blocksize(struct inode *inode,
- struct ocfs2_info_request __user *req)
+static int ocfs2_info_handle_blocksize(struct inode *inode,
+ struct ocfs2_info_request __user *req)
{
- int status = -EFAULT;
struct ocfs2_info_blocksize oib;
if (o2info_from_user(oib, req))
- goto bail;
+ return -EFAULT;
oib.ib_blocksize = inode->i_sb->s_blocksize;
o2info_set_request_filled(&oib.ib_req);
if (o2info_to_user(oib, req))
- goto bail;
-
- status = 0;
-bail:
- if (status)
- o2info_set_request_error(&oib.ib_req, req);
+ return -EFAULT;
- return status;
+ return 0;
}
-int ocfs2_info_handle_clustersize(struct inode *inode,
- struct ocfs2_info_request __user *req)
+static int ocfs2_info_handle_clustersize(struct inode *inode,
+ struct ocfs2_info_request __user *req)
{
- int status = -EFAULT;
struct ocfs2_info_clustersize oic;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (o2info_from_user(oic, req))
- goto bail;
+ return -EFAULT;
oic.ic_clustersize = osb->s_clustersize;
o2info_set_request_filled(&oic.ic_req);
if (o2info_to_user(oic, req))
- goto bail;
+ return -EFAULT;
- status = 0;
-bail:
- if (status)
- o2info_set_request_error(&oic.ic_req, req);
-
- return status;
+ return 0;
}
-int ocfs2_info_handle_maxslots(struct inode *inode,
- struct ocfs2_info_request __user *req)
+static int ocfs2_info_handle_maxslots(struct inode *inode,
+ struct ocfs2_info_request __user *req)
{
- int status = -EFAULT;
struct ocfs2_info_maxslots oim;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (o2info_from_user(oim, req))
- goto bail;
+ return -EFAULT;
oim.im_max_slots = osb->max_slots;
o2info_set_request_filled(&oim.im_req);
if (o2info_to_user(oim, req))
- goto bail;
-
- status = 0;
-bail:
- if (status)
- o2info_set_request_error(&oim.im_req, req);
+ return -EFAULT;
- return status;
+ return 0;
}
-int ocfs2_info_handle_label(struct inode *inode,
- struct ocfs2_info_request __user *req)
+static int ocfs2_info_handle_label(struct inode *inode,
+ struct ocfs2_info_request __user *req)
{
- int status = -EFAULT;
struct ocfs2_info_label oil;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (o2info_from_user(oil, req))
- goto bail;
+ return -EFAULT;
memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
o2info_set_request_filled(&oil.il_req);
if (o2info_to_user(oil, req))
- goto bail;
+ return -EFAULT;
- status = 0;
-bail:
- if (status)
- o2info_set_request_error(&oil.il_req, req);
-
- return status;
+ return 0;
}
-int ocfs2_info_handle_uuid(struct inode *inode,
- struct ocfs2_info_request __user *req)
+static int ocfs2_info_handle_uuid(struct inode *inode,
+ struct ocfs2_info_request __user *req)
{
- int status = -EFAULT;
struct ocfs2_info_uuid oiu;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (o2info_from_user(oiu, req))
- goto bail;
+ return -EFAULT;
memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
o2info_set_request_filled(&oiu.iu_req);
if (o2info_to_user(oiu, req))
- goto bail;
-
- status = 0;
-bail:
- if (status)
- o2info_set_request_error(&oiu.iu_req, req);
+ return -EFAULT;
- return status;
+ return 0;
}
-int ocfs2_info_handle_fs_features(struct inode *inode,
- struct ocfs2_info_request __user *req)
+static int ocfs2_info_handle_fs_features(struct inode *inode,
+ struct ocfs2_info_request __user *req)
{
- int status = -EFAULT;
struct ocfs2_info_fs_features oif;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (o2info_from_user(oif, req))
- goto bail;
+ return -EFAULT;
oif.if_compat_features = osb->s_feature_compat;
oif.if_incompat_features = osb->s_feature_incompat;
@@ -283,44 +251,34 @@ int ocfs2_info_handle_fs_features(struct inode *inode,
o2info_set_request_filled(&oif.if_req);
if (o2info_to_user(oif, req))
- goto bail;
-
- status = 0;
-bail:
- if (status)
- o2info_set_request_error(&oif.if_req, req);
+ return -EFAULT;
- return status;
+ return 0;
}
-int ocfs2_info_handle_journal_size(struct inode *inode,
- struct ocfs2_info_request __user *req)
+static int ocfs2_info_handle_journal_size(struct inode *inode,
+ struct ocfs2_info_request __user *req)
{
- int status = -EFAULT;
struct ocfs2_info_journal_size oij;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (o2info_from_user(oij, req))
- goto bail;
+ return -EFAULT;
- oij.ij_journal_size = osb->journal->j_inode->i_size;
+ oij.ij_journal_size = i_size_read(osb->journal->j_inode);
o2info_set_request_filled(&oij.ij_req);
if (o2info_to_user(oij, req))
- goto bail;
+ return -EFAULT;
- status = 0;
-bail:
- if (status)
- o2info_set_request_error(&oij.ij_req, req);
-
- return status;
+ return 0;
}
-int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
- struct inode *inode_alloc, u64 blkno,
- struct ocfs2_info_freeinode *fi, u32 slot)
+static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
+ struct inode *inode_alloc, u64 blkno,
+ struct ocfs2_info_freeinode *fi,
+ u32 slot)
{
int status = 0, unlock = 0;
@@ -328,9 +286,9 @@ int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
struct ocfs2_dinode *dinode_alloc = NULL;
if (inode_alloc)
- mutex_lock(&inode_alloc->i_mutex);
+ inode_lock(inode_alloc);
- if (o2info_coherent(&fi->ifi_req)) {
+ if (inode_alloc && o2info_coherent(&fi->ifi_req)) {
status = ocfs2_inode_lock(inode_alloc, &bh, 0);
if (status < 0) {
mlog_errno(status);
@@ -358,20 +316,20 @@ bail:
ocfs2_inode_unlock(inode_alloc, 0);
if (inode_alloc)
- mutex_unlock(&inode_alloc->i_mutex);
+ inode_unlock(inode_alloc);
brelse(bh);
return status;
}
-int ocfs2_info_handle_freeinode(struct inode *inode,
- struct ocfs2_info_request __user *req)
+static int ocfs2_info_handle_freeinode(struct inode *inode,
+ struct ocfs2_info_request __user *req)
{
u32 i;
u64 blkno = -1;
char namebuf[40];
- int status = -EFAULT, type = INODE_ALLOC_SYSTEM_INODE;
+ int status, type = INODE_ALLOC_SYSTEM_INODE;
struct ocfs2_info_freeinode *oifi = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct inode *inode_alloc = NULL;
@@ -383,8 +341,10 @@ int ocfs2_info_handle_freeinode(struct inode *inode,
goto out_err;
}
- if (o2info_from_user(*oifi, req))
- goto bail;
+ if (o2info_from_user(*oifi, req)) {
+ status = -EFAULT;
+ goto out_free;
+ }
oifi->ifi_slotnum = osb->max_slots;
@@ -398,13 +358,11 @@ int ocfs2_info_handle_freeinode(struct inode *inode,
goto bail;
}
} else {
- ocfs2_sprintf_system_inode_name(namebuf,
- sizeof(namebuf),
- type, i);
+ int len = ocfs2_sprintf_system_inode_name(namebuf,
+ sizeof(namebuf),
+ type, i);
status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
- namebuf,
- strlen(namebuf),
- &blkno);
+ namebuf, len, &blkno);
if (status < 0) {
status = -ENOENT;
goto bail;
@@ -412,23 +370,26 @@ int ocfs2_info_handle_freeinode(struct inode *inode,
}
status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i);
- if (status < 0)
- goto bail;
iput(inode_alloc);
inode_alloc = NULL;
+
+ if (status < 0)
+ goto bail;
}
o2info_set_request_filled(&oifi->ifi_req);
- if (o2info_to_user(*oifi, req))
- goto bail;
+ if (o2info_to_user(*oifi, req)) {
+ status = -EFAULT;
+ goto out_free;
+ }
status = 0;
bail:
if (status)
o2info_set_request_error(&oifi->ifi_req, req);
-
+out_free:
kfree(oifi);
out_err:
return status;
@@ -437,7 +398,7 @@ out_err:
static void o2ffg_update_histogram(struct ocfs2_info_free_chunk_list *hist,
unsigned int chunksize)
{
- int index;
+ u32 index;
index = __ilog2_u32(chunksize);
if (index >= OCFS2_INFO_MAX_HIST)
@@ -460,19 +421,19 @@ static void o2ffg_update_stats(struct ocfs2_info_freefrag_stats *stats,
stats->ffs_free_chunks_real++;
}
-void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg,
- unsigned int chunksize)
+static void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg,
+ unsigned int chunksize)
{
o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize);
o2ffg_update_stats(&(ffg->iff_ffs), chunksize);
}
-int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb,
- struct inode *gb_inode,
- struct ocfs2_dinode *gb_dinode,
- struct ocfs2_chain_rec *rec,
- struct ocfs2_info_freefrag *ffg,
- u32 chunks_in_group)
+static int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb,
+ struct inode *gb_inode,
+ struct ocfs2_dinode *gb_dinode,
+ struct ocfs2_chain_rec *rec,
+ struct ocfs2_info_freefrag *ffg,
+ u32 chunks_in_group)
{
int status = 0, used;
u64 blkno;
@@ -570,9 +531,9 @@ bail:
return status;
}
-int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb,
- struct inode *gb_inode, u64 blkno,
- struct ocfs2_info_freefrag *ffg)
+static int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb,
+ struct inode *gb_inode, u64 blkno,
+ struct ocfs2_info_freefrag *ffg)
{
u32 chunks_in_group;
int status = 0, unlock = 0, i;
@@ -583,7 +544,7 @@ int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb,
struct ocfs2_dinode *gb_dinode = NULL;
if (gb_inode)
- mutex_lock(&gb_inode->i_mutex);
+ inode_lock(gb_inode);
if (o2info_coherent(&ffg->iff_req)) {
status = ocfs2_inode_lock(gb_inode, &bh, 0);
@@ -640,22 +601,20 @@ bail:
ocfs2_inode_unlock(gb_inode, 0);
if (gb_inode)
- mutex_unlock(&gb_inode->i_mutex);
-
- if (gb_inode)
- iput(gb_inode);
+ inode_unlock(gb_inode);
+ iput(gb_inode);
brelse(bh);
return status;
}
-int ocfs2_info_handle_freefrag(struct inode *inode,
- struct ocfs2_info_request __user *req)
+static int ocfs2_info_handle_freefrag(struct inode *inode,
+ struct ocfs2_info_request __user *req)
{
u64 blkno = -1;
char namebuf[40];
- int status = -EFAULT, type = GLOBAL_BITMAP_SYSTEM_INODE;
+ int status, type = GLOBAL_BITMAP_SYSTEM_INODE;
struct ocfs2_info_freefrag *oiff;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -668,8 +627,10 @@ int ocfs2_info_handle_freefrag(struct inode *inode,
goto out_err;
}
- if (o2info_from_user(*oiff, req))
- goto bail;
+ if (o2info_from_user(*oiff, req)) {
+ status = -EFAULT;
+ goto out_free;
+ }
/*
* chunksize from userspace should be power of 2.
*/
@@ -688,12 +649,10 @@ int ocfs2_info_handle_freefrag(struct inode *inode,
goto bail;
}
} else {
- ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type,
- OCFS2_INVALID_SLOT);
+ int len = ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf),
+ type, OCFS2_INVALID_SLOT);
status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
- namebuf,
- strlen(namebuf),
- &blkno);
+ namebuf, len, &blkno);
if (status < 0) {
status = -ENOENT;
goto bail;
@@ -708,39 +667,33 @@ int ocfs2_info_handle_freefrag(struct inode *inode,
if (o2info_to_user(*oiff, req)) {
status = -EFAULT;
- goto bail;
+ goto out_free;
}
status = 0;
bail:
if (status)
o2info_set_request_error(&oiff->iff_req, req);
-
+out_free:
kfree(oiff);
out_err:
return status;
}
-int ocfs2_info_handle_unknown(struct inode *inode,
- struct ocfs2_info_request __user *req)
+static int ocfs2_info_handle_unknown(struct inode *inode,
+ struct ocfs2_info_request __user *req)
{
- int status = -EFAULT;
struct ocfs2_info_request oir;
if (o2info_from_user(oir, req))
- goto bail;
+ return -EFAULT;
o2info_clear_request_filled(&oir);
if (o2info_to_user(oir, req))
- goto bail;
+ return -EFAULT;
- status = 0;
-bail:
- if (status)
- o2info_set_request_error(&oir, req);
-
- return status;
+ return 0;
}
/*
@@ -750,8 +703,8 @@ bail:
* - distinguish different requests.
* - validate size of different requests.
*/
-int ocfs2_info_handle_request(struct inode *inode,
- struct ocfs2_info_request __user *req)
+static int ocfs2_info_handle_request(struct inode *inode,
+ struct ocfs2_info_request __user *req)
{
int status = -EFAULT;
struct ocfs2_info_request oir;
@@ -809,8 +762,8 @@ bail:
return status;
}
-int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx,
- u64 *req_addr, int compat_flag)
+static int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx,
+ u64 *req_addr, int compat_flag)
{
int status = -EFAULT;
u64 __user *bp = NULL;
@@ -839,7 +792,7 @@ bail:
/*
* OCFS2_IOC_INFO handles an array of requests passed from userspace.
*
- * ocfs2_info_handle() recevies a large info aggregation, grab and
+ * ocfs2_info_handle() receives a large info aggregation, grab and
* validate the request count from header, then break it into small
* pieces, later specific handlers can handle them one by one.
*
@@ -847,8 +800,8 @@ bail:
* a better backward&forward compatibility, since a small piece of
* request will be less likely to be broken if disk layout get changed.
*/
-int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info,
- int compat_flag)
+static noinline_for_stack int
+ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info, int compat_flag)
{
int i, status = 0;
u64 req_addr;
@@ -884,46 +837,26 @@ bail:
long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
- unsigned int flags;
- int new_clusters;
- int status;
- struct ocfs2_space_resv sr;
- struct ocfs2_new_group_input input;
- struct reflink_arguments args;
- const char __user *old_path;
- const char __user *new_path;
- bool preserve;
- struct ocfs2_info info;
void __user *argp = (void __user *)arg;
+ int status;
switch (cmd) {
- case OCFS2_IOC_GETFLAGS:
- status = ocfs2_get_inode_attr(inode, &flags);
- if (status < 0)
- return status;
-
- flags &= OCFS2_FL_VISIBLE;
- return put_user(flags, (int __user *) arg);
- case OCFS2_IOC_SETFLAGS:
- if (get_user(flags, (int __user *) arg))
- return -EFAULT;
-
- status = mnt_want_write_file(filp);
- if (status)
- return status;
- status = ocfs2_set_inode_attr(inode, flags,
- OCFS2_FL_MODIFIABLE);
- mnt_drop_write_file(filp);
- return status;
case OCFS2_IOC_RESVSP:
case OCFS2_IOC_RESVSP64:
case OCFS2_IOC_UNRESVSP:
case OCFS2_IOC_UNRESVSP64:
+ {
+ struct ocfs2_space_resv sr;
+
if (copy_from_user(&sr, (int __user *) arg, sizeof(sr)))
return -EFAULT;
return ocfs2_change_file_space(filp, cmd, &sr);
+ }
case OCFS2_IOC_GROUP_EXTEND:
+ {
+ int new_clusters;
+
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
@@ -936,8 +869,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
status = ocfs2_group_extend(inode, new_clusters);
mnt_drop_write_file(filp);
return status;
+ }
case OCFS2_IOC_GROUP_ADD:
case OCFS2_IOC_GROUP_ADD64:
+ {
+ struct ocfs2_new_group_input input;
+
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
@@ -950,7 +887,14 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
status = ocfs2_group_add(inode, &input);
mnt_drop_write_file(filp);
return status;
+ }
case OCFS2_IOC_REFLINK:
+ {
+ struct reflink_arguments args;
+ const char __user *old_path;
+ const char __user *new_path;
+ bool preserve;
+
if (copy_from_user(&args, argp, sizeof(args)))
return -EFAULT;
old_path = (const char __user *)(unsigned long)args.old_path;
@@ -958,11 +902,16 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
preserve = (args.preserve != 0);
return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve);
+ }
case OCFS2_IOC_INFO:
+ {
+ struct ocfs2_info info;
+
if (copy_from_user(&info, argp, sizeof(struct ocfs2_info)))
return -EFAULT;
return ocfs2_info_handle(inode, &info, 0);
+ }
case FITRIM:
{
struct super_block *sb = inode->i_sb;
@@ -972,9 +921,14 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (!bdev_max_discard_sectors(sb->s_bdev))
+ return -EOPNOTSUPP;
+
if (copy_from_user(&range, argp, sizeof(range)))
return -EFAULT;
+ range.minlen = max_t(u64, bdev_discard_granularity(sb->s_bdev),
+ range.minlen);
ret = ocfs2_trim_fs(sb, &range);
if (ret < 0)
return ret;
@@ -1001,12 +955,6 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
void __user *argp = (void __user *)arg;
switch (cmd) {
- case OCFS2_IOC32_GETFLAGS:
- cmd = OCFS2_IOC_GETFLAGS;
- break;
- case OCFS2_IOC32_SETFLAGS:
- cmd = OCFS2_IOC_SETFLAGS;
- break;
case OCFS2_IOC_RESVSP:
case OCFS2_IOC_RESVSP64:
case OCFS2_IOC_UNRESVSP:
@@ -1014,7 +962,6 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
case OCFS2_IOC_GROUP_EXTEND:
case OCFS2_IOC_GROUP_ADD:
case OCFS2_IOC_GROUP_ADD64:
- case FITRIM:
break;
case OCFS2_IOC_REFLINK:
if (copy_from_user(&args, argp, sizeof(args)))
@@ -1028,6 +975,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
return -EFAULT;
return ocfs2_info_handle(inode, &info, 1);
+ case FITRIM:
case OCFS2_IOC_MOVE_EXT:
break;
default:
diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h
index 0cd5323bd3f0..4a1c2313b429 100644
--- a/fs/ocfs2/ioctl.h
+++ b/fs/ocfs2/ioctl.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* ioctl.h
*
@@ -10,6 +11,9 @@
#ifndef OCFS2_IOCTL_PROTO_H
#define OCFS2_IOCTL_PROTO_H
+int ocfs2_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
+int ocfs2_fileattr_set(struct mnt_idmap *idmap,
+ struct dentry *dentry, struct file_kattr *fa);
long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 242170d83971..85239807dec7 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1,26 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* journal.c
*
* Defines functions of journalling api
*
* Copyright (C) 2003, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
@@ -30,6 +14,8 @@
#include <linux/kthread.h>
#include <linux/time.h>
#include <linux/random.h>
+#include <linux/delay.h>
+#include <linux/writeback.h>
#include <cluster/masklog.h>
@@ -49,6 +35,8 @@
#include "sysfile.h"
#include "uptodate.h"
#include "quota.h"
+#include "file.h"
+#include "namei.h"
#include "buffer_head_io.h"
#include "ocfs2_trace.h"
@@ -68,13 +56,15 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
static int ocfs2_trylock_journal(struct ocfs2_super *osb,
int slot_num);
static int ocfs2_recover_orphans(struct ocfs2_super *osb,
- int slot);
+ int slot,
+ enum ocfs2_orphan_reco_type orphan_reco_type);
static int ocfs2_commit_thread(void *arg);
static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
int slot_num,
struct ocfs2_dinode *la_dinode,
struct ocfs2_dinode *tl_dinode,
- struct ocfs2_quota_recovery *qrec);
+ struct ocfs2_quota_recovery *qrec,
+ enum ocfs2_orphan_reco_type orphan_reco_type);
static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb)
{
@@ -100,10 +90,10 @@ enum ocfs2_replay_state {
struct ocfs2_replay_map {
unsigned int rm_slots;
enum ocfs2_replay_state rm_state;
- unsigned char rm_replay_slots[0];
+ unsigned char rm_replay_slots[] __counted_by(rm_slots);
};
-void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state)
+static void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state)
{
if (!osb->replay_map)
return;
@@ -124,9 +114,9 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb)
if (osb->replay_map)
return 0;
- replay_map = kzalloc(sizeof(struct ocfs2_replay_map) +
- (osb->max_slots * sizeof(char)), GFP_KERNEL);
-
+ replay_map = kzalloc(struct_size(replay_map, rm_replay_slots,
+ osb->max_slots),
+ GFP_KERNEL);
if (!replay_map) {
mlog_errno(-ENOMEM);
return -ENOMEM;
@@ -148,7 +138,8 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb)
return 0;
}
-void ocfs2_queue_replay_slots(struct ocfs2_super *osb)
+static void ocfs2_queue_replay_slots(struct ocfs2_super *osb,
+ enum ocfs2_orphan_reco_type orphan_reco_type)
{
struct ocfs2_replay_map *replay_map = osb->replay_map;
int i;
@@ -162,7 +153,8 @@ void ocfs2_queue_replay_slots(struct ocfs2_super *osb)
for (i = 0; i < replay_map->rm_slots; i++)
if (replay_map->rm_replay_slots[i])
ocfs2_queue_recovery_completion(osb->journal, i, NULL,
- NULL, NULL);
+ NULL, NULL,
+ orphan_reco_type);
replay_map->rm_state = REPLAY_DONE;
}
@@ -182,49 +174,69 @@ int ocfs2_recovery_init(struct ocfs2_super *osb)
struct ocfs2_recovery_map *rm;
mutex_init(&osb->recovery_lock);
- osb->disable_recovery = 0;
+ osb->recovery_state = OCFS2_REC_ENABLED;
osb->recovery_thread_task = NULL;
init_waitqueue_head(&osb->recovery_event);
- rm = kzalloc(sizeof(struct ocfs2_recovery_map) +
- osb->max_slots * sizeof(unsigned int),
+ rm = kzalloc(struct_size(rm, rm_entries, osb->max_slots),
GFP_KERNEL);
if (!rm) {
mlog_errno(-ENOMEM);
return -ENOMEM;
}
- rm->rm_entries = (unsigned int *)((char *)rm +
- sizeof(struct ocfs2_recovery_map));
osb->recovery_map = rm;
return 0;
}
-/* we can't grab the goofy sem lock from inside wait_event, so we use
- * memory barriers to make sure that we'll see the null task before
- * being woken up */
static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
{
- mb();
return osb->recovery_thread_task != NULL;
}
+static void ocfs2_recovery_disable(struct ocfs2_super *osb,
+ enum ocfs2_recovery_state state)
+{
+ mutex_lock(&osb->recovery_lock);
+ /*
+ * If recovery thread is not running, we can directly transition to
+ * final state.
+ */
+ if (!ocfs2_recovery_thread_running(osb)) {
+ osb->recovery_state = state + 1;
+ goto out_lock;
+ }
+ osb->recovery_state = state;
+ /* Wait for recovery thread to acknowledge state transition */
+ wait_event_cmd(osb->recovery_event,
+ !ocfs2_recovery_thread_running(osb) ||
+ osb->recovery_state >= state + 1,
+ mutex_unlock(&osb->recovery_lock),
+ mutex_lock(&osb->recovery_lock));
+out_lock:
+ mutex_unlock(&osb->recovery_lock);
+
+ /*
+ * At this point we know that no more recovery work can be queued so
+ * wait for any recovery completion work to complete.
+ */
+ if (osb->ocfs2_wq)
+ flush_workqueue(osb->ocfs2_wq);
+}
+
+void ocfs2_recovery_disable_quota(struct ocfs2_super *osb)
+{
+ ocfs2_recovery_disable(osb, OCFS2_REC_QUOTA_WANT_DISABLE);
+}
+
void ocfs2_recovery_exit(struct ocfs2_super *osb)
{
struct ocfs2_recovery_map *rm;
/* disable any new recovery threads and wait for any currently
* running ones to exit. Do this before setting the vol_state. */
- mutex_lock(&osb->recovery_lock);
- osb->disable_recovery = 1;
- mutex_unlock(&osb->recovery_lock);
- wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
-
- /* At this point, we know that no more recovery threads can be
- * launched, so wait for any recovery completion work to
- * complete. */
- flush_workqueue(ocfs2_wq);
+ ocfs2_recovery_disable(osb, OCFS2_REC_WANT_DISABLE);
/*
* Now that recovery is shut down, and the osb is about to be
@@ -316,7 +328,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
}
jbd2_journal_lock_updates(journal->j_journal);
- status = jbd2_journal_flush(journal->j_journal);
+ status = jbd2_journal_flush(journal->j_journal, 0);
jbd2_journal_unlock_updates(journal->j_journal);
if (status < 0) {
up_write(&journal->j_trans_barrier);
@@ -367,7 +379,7 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
mlog_errno(PTR_ERR(handle));
if (is_journal_aborted(journal)) {
- ocfs2_abort(osb->sb, "Detected aborted journal");
+ ocfs2_abort(osb->sb, "Detected aborted journal\n");
handle = ERR_PTR(-EROFS);
}
} else {
@@ -426,14 +438,14 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
if (!nblocks)
return 0;
- old_nblocks = handle->h_buffer_credits;
+ old_nblocks = jbd2_handle_buffer_credits(handle);
trace_ocfs2_extend_trans(old_nblocks, nblocks);
#ifdef CONFIG_OCFS2_DEBUG_FS
status = 1;
#else
- status = jbd2_journal_extend(handle, nblocks);
+ status = jbd2_journal_extend(handle, nblocks, 0);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -455,10 +467,56 @@ bail:
return status;
}
-struct ocfs2_triggers {
- struct jbd2_buffer_trigger_type ot_triggers;
- int ot_offset;
-};
+/*
+ * Make sure handle has at least 'nblocks' credits available. If it does not
+ * have that many credits available, we will try to extend the handle to have
+ * enough credits. If that fails, we will restart transaction to have enough
+ * credits. Similar notes regarding data consistency and locking implications
+ * as for ocfs2_extend_trans() apply here.
+ */
+int ocfs2_assure_trans_credits(handle_t *handle, int nblocks)
+{
+ int old_nblks = jbd2_handle_buffer_credits(handle);
+
+ trace_ocfs2_assure_trans_credits(old_nblks);
+ if (old_nblks >= nblocks)
+ return 0;
+ return ocfs2_extend_trans(handle, nblocks - old_nblks);
+}
+
+/*
+ * If we have fewer than thresh credits, extend by OCFS2_MAX_TRANS_DATA.
+ * If that fails, restart the transaction & regain write access for the
+ * buffer head which is used for metadata modifications.
+ * Taken from Ext4: extend_or_restart_transaction()
+ */
+int ocfs2_allocate_extend_trans(handle_t *handle, int thresh)
+{
+ int status, old_nblks;
+
+ BUG_ON(!handle);
+
+ old_nblks = jbd2_handle_buffer_credits(handle);
+ trace_ocfs2_allocate_extend_trans(old_nblks, thresh);
+
+ if (old_nblks < thresh)
+ return 0;
+
+ status = jbd2_journal_extend(handle, OCFS2_MAX_TRANS_DATA, 0);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ if (status > 0) {
+ status = jbd2_journal_restart(handle, OCFS2_MAX_TRANS_DATA);
+ if (status < 0)
+ mlog_errno(status);
+ }
+
+bail:
+ return status;
+}
static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger_type *triggers)
{
@@ -523,87 +581,76 @@ static void ocfs2_db_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
struct buffer_head *bh)
{
+ struct ocfs2_triggers *ot = to_ocfs2_trigger(triggers);
+
mlog(ML_ERROR,
"ocfs2_abort_trigger called by JBD2. bh = 0x%lx, "
"bh->b_blocknr = %llu\n",
(unsigned long)bh,
(unsigned long long)bh->b_blocknr);
- /* We aren't guaranteed to have the superblock here - but if we
- * don't, it'll just crash. */
- ocfs2_error(bh->b_assoc_map->host->i_sb,
+ ocfs2_error(ot->sb,
"JBD2 has aborted our journal, ocfs2 cannot continue\n");
}
-static struct ocfs2_triggers di_triggers = {
- .ot_triggers = {
- .t_frozen = ocfs2_frozen_trigger,
- .t_abort = ocfs2_abort_trigger,
- },
- .ot_offset = offsetof(struct ocfs2_dinode, i_check),
-};
-
-static struct ocfs2_triggers eb_triggers = {
- .ot_triggers = {
- .t_frozen = ocfs2_frozen_trigger,
- .t_abort = ocfs2_abort_trigger,
- },
- .ot_offset = offsetof(struct ocfs2_extent_block, h_check),
-};
-
-static struct ocfs2_triggers rb_triggers = {
- .ot_triggers = {
- .t_frozen = ocfs2_frozen_trigger,
- .t_abort = ocfs2_abort_trigger,
- },
- .ot_offset = offsetof(struct ocfs2_refcount_block, rf_check),
-};
-
-static struct ocfs2_triggers gd_triggers = {
- .ot_triggers = {
- .t_frozen = ocfs2_frozen_trigger,
- .t_abort = ocfs2_abort_trigger,
- },
- .ot_offset = offsetof(struct ocfs2_group_desc, bg_check),
-};
-
-static struct ocfs2_triggers db_triggers = {
- .ot_triggers = {
- .t_frozen = ocfs2_db_frozen_trigger,
- .t_abort = ocfs2_abort_trigger,
- },
-};
+static void ocfs2_setup_csum_triggers(struct super_block *sb,
+ enum ocfs2_journal_trigger_type type,
+ struct ocfs2_triggers *ot)
+{
+ BUG_ON(type >= OCFS2_JOURNAL_TRIGGER_COUNT);
-static struct ocfs2_triggers xb_triggers = {
- .ot_triggers = {
- .t_frozen = ocfs2_frozen_trigger,
- .t_abort = ocfs2_abort_trigger,
- },
- .ot_offset = offsetof(struct ocfs2_xattr_block, xb_check),
-};
+ switch (type) {
+ case OCFS2_JTR_DI:
+ ot->ot_triggers.t_frozen = ocfs2_frozen_trigger;
+ ot->ot_offset = offsetof(struct ocfs2_dinode, i_check);
+ break;
+ case OCFS2_JTR_EB:
+ ot->ot_triggers.t_frozen = ocfs2_frozen_trigger;
+ ot->ot_offset = offsetof(struct ocfs2_extent_block, h_check);
+ break;
+ case OCFS2_JTR_RB:
+ ot->ot_triggers.t_frozen = ocfs2_frozen_trigger;
+ ot->ot_offset = offsetof(struct ocfs2_refcount_block, rf_check);
+ break;
+ case OCFS2_JTR_GD:
+ ot->ot_triggers.t_frozen = ocfs2_frozen_trigger;
+ ot->ot_offset = offsetof(struct ocfs2_group_desc, bg_check);
+ break;
+ case OCFS2_JTR_DB:
+ ot->ot_triggers.t_frozen = ocfs2_db_frozen_trigger;
+ break;
+ case OCFS2_JTR_XB:
+ ot->ot_triggers.t_frozen = ocfs2_frozen_trigger;
+ ot->ot_offset = offsetof(struct ocfs2_xattr_block, xb_check);
+ break;
+ case OCFS2_JTR_DQ:
+ ot->ot_triggers.t_frozen = ocfs2_dq_frozen_trigger;
+ break;
+ case OCFS2_JTR_DR:
+ ot->ot_triggers.t_frozen = ocfs2_frozen_trigger;
+ ot->ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check);
+ break;
+ case OCFS2_JTR_DL:
+ ot->ot_triggers.t_frozen = ocfs2_frozen_trigger;
+ ot->ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check);
+ break;
+ case OCFS2_JTR_NONE:
+ /* To make compiler happy... */
+ return;
+ }
-static struct ocfs2_triggers dq_triggers = {
- .ot_triggers = {
- .t_frozen = ocfs2_dq_frozen_trigger,
- .t_abort = ocfs2_abort_trigger,
- },
-};
+ ot->ot_triggers.t_abort = ocfs2_abort_trigger;
+ ot->sb = sb;
+}
-static struct ocfs2_triggers dr_triggers = {
- .ot_triggers = {
- .t_frozen = ocfs2_frozen_trigger,
- .t_abort = ocfs2_abort_trigger,
- },
- .ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check),
-};
+void ocfs2_initialize_journal_triggers(struct super_block *sb,
+ struct ocfs2_triggers triggers[])
+{
+ enum ocfs2_journal_trigger_type type;
-static struct ocfs2_triggers dl_triggers = {
- .ot_triggers = {
- .t_frozen = ocfs2_frozen_trigger,
- .t_abort = ocfs2_abort_trigger,
- },
- .ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check),
-};
+ for (type = OCFS2_JTR_DI; type < OCFS2_JOURNAL_TRIGGER_COUNT; type++)
+ ocfs2_setup_csum_triggers(sb, type, &triggers[type]);
+}
static int __ocfs2_journal_access(handle_t *handle,
struct ocfs2_caching_info *ci,
@@ -626,9 +673,26 @@ static int __ocfs2_journal_access(handle_t *handle,
/* we can safely remove this assertion after testing. */
if (!buffer_uptodate(bh)) {
mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n");
- mlog(ML_ERROR, "b_blocknr=%llu\n",
- (unsigned long long)bh->b_blocknr);
- BUG();
+ mlog(ML_ERROR, "b_blocknr=%llu, b_state=0x%lx\n",
+ (unsigned long long)bh->b_blocknr, bh->b_state);
+
+ lock_buffer(bh);
+ /*
+ * A previous transaction with a couple of buffer heads fail
+ * to checkpoint, so all the bhs are marked as BH_Write_EIO.
+ * For current transaction, the bh is just among those error
+ * bhs which previous transaction handle. We can't just clear
+ * its BH_Write_EIO and reuse directly, since other bhs are
+ * not written to disk yet and that will cause metadata
+ * inconsistency. So we should set fs read-only to avoid
+ * further damage.
+ */
+ if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ return ocfs2_error(osb->sb, "A previous attempt to "
+ "write this buffer head failed\n");
+ }
+ unlock_buffer(bh);
}
/* Set the current transaction information on the ci so
@@ -668,56 +732,91 @@ static int __ocfs2_journal_access(handle_t *handle,
int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, ci, bh, &di_triggers, type);
+ struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci));
+
+ return __ocfs2_journal_access(handle, ci, bh,
+ &osb->s_journal_triggers[OCFS2_JTR_DI],
+ type);
}
int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, ci, bh, &eb_triggers, type);
+ struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci));
+
+ return __ocfs2_journal_access(handle, ci, bh,
+ &osb->s_journal_triggers[OCFS2_JTR_EB],
+ type);
}
int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, ci, bh, &rb_triggers,
+ struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci));
+
+ return __ocfs2_journal_access(handle, ci, bh,
+ &osb->s_journal_triggers[OCFS2_JTR_RB],
type);
}
int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, ci, bh, &gd_triggers, type);
+ struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci));
+
+ return __ocfs2_journal_access(handle, ci, bh,
+ &osb->s_journal_triggers[OCFS2_JTR_GD],
+ type);
}
int ocfs2_journal_access_db(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, ci, bh, &db_triggers, type);
+ struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci));
+
+ return __ocfs2_journal_access(handle, ci, bh,
+ &osb->s_journal_triggers[OCFS2_JTR_DB],
+ type);
}
int ocfs2_journal_access_xb(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, ci, bh, &xb_triggers, type);
+ struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci));
+
+ return __ocfs2_journal_access(handle, ci, bh,
+ &osb->s_journal_triggers[OCFS2_JTR_XB],
+ type);
}
int ocfs2_journal_access_dq(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, ci, bh, &dq_triggers, type);
+ struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci));
+
+ return __ocfs2_journal_access(handle, ci, bh,
+ &osb->s_journal_triggers[OCFS2_JTR_DQ],
+ type);
}
int ocfs2_journal_access_dr(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, ci, bh, &dr_triggers, type);
+ struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci));
+
+ return __ocfs2_journal_access(handle, ci, bh,
+ &osb->s_journal_triggers[OCFS2_JTR_DR],
+ type);
}
int ocfs2_journal_access_dl(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, ci, bh, &dl_triggers, type);
+ struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(ci));
+
+ return __ocfs2_journal_access(handle, ci, bh,
+ &osb->s_journal_triggers[OCFS2_JTR_DL],
+ type);
}
int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
@@ -733,7 +832,22 @@ void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh)
trace_ocfs2_journal_dirty((unsigned long long)bh->b_blocknr);
status = jbd2_journal_dirty_metadata(handle, bh);
- BUG_ON(status);
+ if (status) {
+ mlog_errno(status);
+ if (!is_handle_aborted(handle)) {
+ journal_t *journal = handle->h_transaction->t_journal;
+
+ mlog(ML_ERROR, "jbd2_journal_dirty_metadata failed: "
+ "handle type %u started at line %u, credits %u/%u "
+ "errcode %d. Aborting transaction and journal.\n",
+ handle->h_type, handle->h_line_no,
+ handle->h_requested_credits,
+ jbd2_handle_buffer_credits(handle), status);
+ handle->h_err = status;
+ jbd2_journal_abort_handle(handle);
+ jbd2_journal_abort(journal, status);
+ }
+ }
}
#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
@@ -755,20 +869,54 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb)
write_unlock(&journal->j_state_lock);
}
-int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
+/*
+ * alloc & initialize skeleton for journal structure.
+ * ocfs2_journal_init() will make fs have journal ability.
+ */
+int ocfs2_journal_alloc(struct ocfs2_super *osb)
+{
+ int status = 0;
+ struct ocfs2_journal *journal;
+
+ journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL);
+ if (!journal) {
+ mlog(ML_ERROR, "unable to alloc journal\n");
+ status = -ENOMEM;
+ goto bail;
+ }
+ osb->journal = journal;
+ journal->j_osb = osb;
+
+ atomic_set(&journal->j_num_trans, 0);
+ init_rwsem(&journal->j_trans_barrier);
+ init_waitqueue_head(&journal->j_checkpointed);
+ spin_lock_init(&journal->j_lock);
+ journal->j_trans_id = 1UL;
+ INIT_LIST_HEAD(&journal->j_la_cleanups);
+ INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
+ journal->j_state = OCFS2_JOURNAL_FREE;
+
+bail:
+ return status;
+}
+
+static int ocfs2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
+{
+ return filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping,
+ jinode->i_dirty_start, jinode->i_dirty_end);
+}
+
+int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty)
{
int status = -1;
struct inode *inode = NULL; /* the journal inode */
journal_t *j_journal = NULL;
+ struct ocfs2_journal *journal = osb->journal;
struct ocfs2_dinode *di = NULL;
struct buffer_head *bh = NULL;
- struct ocfs2_super *osb;
int inode_lock = 0;
BUG_ON(!journal);
-
- osb = journal->j_osb;
-
/* already have the inode for our journal */
inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
osb->slot_num);
@@ -801,31 +949,35 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
inode_lock = 1;
di = (struct ocfs2_dinode *)bh->b_data;
- if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) {
+ if (i_size_read(inode) < OCFS2_MIN_JOURNAL_SIZE) {
mlog(ML_ERROR, "Journal file size (%lld) is too small!\n",
- inode->i_size);
+ i_size_read(inode));
status = -EINVAL;
goto done;
}
- trace_ocfs2_journal_init(inode->i_size,
+ trace_ocfs2_journal_init(i_size_read(inode),
(unsigned long long)inode->i_blocks,
OCFS2_I(inode)->ip_clusters);
/* call the kernels journal init function now */
j_journal = jbd2_journal_init_inode(inode);
- if (j_journal == NULL) {
+ if (IS_ERR(j_journal)) {
mlog(ML_ERROR, "Linux journal layer error\n");
- status = -EINVAL;
+ status = PTR_ERR(j_journal);
goto done;
}
- trace_ocfs2_journal_init_maxlen(j_journal->j_maxlen);
+ trace_ocfs2_journal_init_maxlen(j_journal->j_total_len);
*dirty = (le32_to_cpu(di->id1.journal1.ij_flags) &
OCFS2_JOURNAL_DIRTY_FL);
journal->j_journal = j_journal;
+ journal->j_journal->j_submit_inode_data_buffers =
+ ocfs2_journal_submit_inode_data_buffers;
+ journal->j_journal->j_finish_inode_data_buffers =
+ jbd2_journal_finish_inode_data_buffers;
journal->j_inode = inode;
journal->j_bh = bh;
@@ -918,7 +1070,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
if (!igrab(inode))
BUG();
- num_running_trans = atomic_read(&(osb->journal->j_num_trans));
+ num_running_trans = atomic_read(&(journal->j_num_trans));
trace_ocfs2_journal_shutdown(num_running_trans);
/* Do a commit_cache here. It will flush our journal, *and*
@@ -937,17 +1089,19 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
osb->commit_task = NULL;
}
- BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0);
+ BUG_ON(atomic_read(&(journal->j_num_trans)) != 0);
- if (ocfs2_mount_local(osb)) {
+ if (ocfs2_mount_local(osb) &&
+ (journal->j_journal->j_flags & JBD2_LOADED)) {
jbd2_journal_lock_updates(journal->j_journal);
- status = jbd2_journal_flush(journal->j_journal);
+ status = jbd2_journal_flush(journal->j_journal, 0);
jbd2_journal_unlock_updates(journal->j_journal);
if (status < 0)
mlog_errno(status);
}
- if (status == 0) {
+ /* Shutdown the kernel journal system */
+ if (!jbd2_journal_destroy(journal->j_journal) && !status) {
/*
* Do not toggle if flush was unsuccessful otherwise
* will leave dirty metadata in a "clean" journal
@@ -956,9 +1110,6 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
if (status < 0)
mlog_errno(status);
}
-
- /* Shutdown the kernel journal system */
- jbd2_journal_destroy(journal->j_journal);
journal->j_journal = NULL;
OCFS2_I(inode)->ip_open_count--;
@@ -971,10 +1122,10 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
journal->j_state = OCFS2_JOURNAL_FREE;
-// up_write(&journal->j_trans_barrier);
done:
- if (inode)
- iput(inode);
+ iput(inode);
+ kfree(journal);
+ osb->journal = NULL;
}
static void ocfs2_clear_journal_error(struct super_block *sb,
@@ -1012,6 +1163,14 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num);
+ if (replayed) {
+ jbd2_journal_lock_updates(journal->j_journal);
+ status = jbd2_journal_flush(journal->j_journal, 0);
+ jbd2_journal_unlock_updates(journal->j_journal);
+ if (status < 0)
+ mlog_errno(status);
+ }
+
status = ocfs2_journal_toggle_dirty(osb, 1, replayed);
if (status < 0) {
mlog_errno(status);
@@ -1021,7 +1180,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
/* Launch the commit thread */
if (!local) {
osb->commit_task = kthread_run(ocfs2_commit_thread, osb,
- "ocfs2cmt");
+ "ocfs2cmt-%s", osb->uuid_str);
if (IS_ERR(osb->commit_task)) {
status = PTR_ERR(osb->commit_task);
osb->commit_task = NULL;
@@ -1091,12 +1250,10 @@ static int ocfs2_force_read_journal(struct inode *inode)
int status = 0;
int i;
u64 v_blkno, p_blkno, p_blocks, num_blocks;
-#define CONCURRENT_JOURNAL_FILL 32ULL
- struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
-
- memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
+ struct buffer_head *bh = NULL;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, inode->i_size);
+ num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
v_blkno = 0;
while (v_blkno < num_blocks) {
status = ocfs2_extent_map_get_blocks(inode, v_blkno,
@@ -1106,29 +1263,32 @@ static int ocfs2_force_read_journal(struct inode *inode)
goto bail;
}
- if (p_blocks > CONCURRENT_JOURNAL_FILL)
- p_blocks = CONCURRENT_JOURNAL_FILL;
-
- /* We are reading journal data which should not
- * be put in the uptodate cache */
- status = ocfs2_read_blocks_sync(OCFS2_SB(inode->i_sb),
- p_blkno, p_blocks, bhs);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
- for(i = 0; i < p_blocks; i++) {
- brelse(bhs[i]);
- bhs[i] = NULL;
+ for (i = 0; i < p_blocks; i++, p_blkno++) {
+ bh = __find_get_block_nonatomic(osb->sb->s_bdev, p_blkno,
+ osb->sb->s_blocksize);
+ /* block not cached. */
+ if (!bh)
+ continue;
+
+ brelse(bh);
+ bh = NULL;
+ /* We are reading journal data which should not
+ * be put in the uptodate cache.
+ */
+ status = ocfs2_read_blocks_sync(osb, p_blkno, 1, &bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ brelse(bh);
+ bh = NULL;
}
v_blkno += p_blocks;
}
bail:
- for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
- brelse(bhs[i]);
return status;
}
@@ -1138,6 +1298,7 @@ struct ocfs2_la_recovery_item {
struct ocfs2_dinode *lri_la_dinode;
struct ocfs2_dinode *lri_tl_dinode;
struct ocfs2_quota_recovery *lri_qrec;
+ enum ocfs2_orphan_reco_type lri_orphan_reco_type;
};
/* Does the second half of the recovery process. By this point, the
@@ -1159,6 +1320,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
struct ocfs2_dinode *la_dinode, *tl_dinode;
struct ocfs2_la_recovery_item *item, *n;
struct ocfs2_quota_recovery *qrec;
+ enum ocfs2_orphan_reco_type orphan_reco_type;
LIST_HEAD(tmp_la_list);
trace_ocfs2_complete_recovery(
@@ -1176,6 +1338,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
la_dinode = item->lri_la_dinode;
tl_dinode = item->lri_tl_dinode;
qrec = item->lri_qrec;
+ orphan_reco_type = item->lri_orphan_reco_type;
trace_ocfs2_complete_recovery_slot(item->lri_slot,
la_dinode ? le64_to_cpu(la_dinode->i_blkno) : 0,
@@ -1200,7 +1363,8 @@ void ocfs2_complete_recovery(struct work_struct *work)
kfree(tl_dinode);
}
- ret = ocfs2_recover_orphans(osb, item->lri_slot);
+ ret = ocfs2_recover_orphans(osb, item->lri_slot,
+ orphan_reco_type);
if (ret < 0)
mlog_errno(ret);
@@ -1225,7 +1389,8 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
int slot_num,
struct ocfs2_dinode *la_dinode,
struct ocfs2_dinode *tl_dinode,
- struct ocfs2_quota_recovery *qrec)
+ struct ocfs2_quota_recovery *qrec,
+ enum ocfs2_orphan_reco_type orphan_reco_type)
{
struct ocfs2_la_recovery_item *item;
@@ -1249,10 +1414,11 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
item->lri_slot = slot_num;
item->lri_tl_dinode = tl_dinode;
item->lri_qrec = qrec;
+ item->lri_orphan_reco_type = orphan_reco_type;
spin_lock(&journal->j_lock);
list_add_tail(&item->lri_list, &journal->j_la_cleanups);
- queue_work(ocfs2_wq, &journal->j_recovery_work);
+ queue_work(journal->j_osb->ocfs2_wq, &journal->j_recovery_work);
spin_unlock(&journal->j_lock);
}
@@ -1268,15 +1434,15 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
/* No need to queue up our truncate_log as regular cleanup will catch
* that */
ocfs2_queue_recovery_completion(journal, osb->slot_num,
- osb->local_alloc_copy, NULL, NULL);
+ osb->local_alloc_copy, NULL, NULL,
+ ORPHAN_NEED_TRUNCATE);
ocfs2_schedule_truncate_log_flush(osb, 0);
osb->local_alloc_copy = NULL;
- osb->dirty = 0;
/* queue to recover orphan slots for all offline slots */
ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
- ocfs2_queue_replay_slots(osb);
+ ocfs2_queue_replay_slots(osb, ORPHAN_NEED_TRUNCATE);
ocfs2_free_replay_slots(osb);
}
@@ -1287,7 +1453,8 @@ void ocfs2_complete_quota_recovery(struct ocfs2_super *osb)
osb->slot_num,
NULL,
NULL,
- osb->quota_rec);
+ osb->quota_rec,
+ ORPHAN_NEED_TRUNCATE);
osb->quota_rec = NULL;
}
}
@@ -1301,17 +1468,37 @@ static int __ocfs2_recovery_thread(void *arg)
int rm_quota_used = 0, i;
struct ocfs2_quota_recovery *qrec;
+ /* Whether the quota supported. */
+ int quota_enabled = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb,
+ OCFS2_FEATURE_RO_COMPAT_USRQUOTA)
+ || OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA);
+
status = ocfs2_wait_on_mount(osb);
if (status < 0) {
goto bail;
}
- rm_quota = kzalloc(osb->max_slots * sizeof(int), GFP_NOFS);
- if (!rm_quota) {
- status = -ENOMEM;
- goto bail;
+ if (quota_enabled) {
+ rm_quota = kcalloc(osb->max_slots, sizeof(int), GFP_NOFS);
+ if (!rm_quota) {
+ status = -ENOMEM;
+ goto bail;
+ }
}
restart:
+ if (quota_enabled) {
+ mutex_lock(&osb->recovery_lock);
+ /* Confirm that recovery thread will no longer recover quotas */
+ if (osb->recovery_state == OCFS2_REC_QUOTA_WANT_DISABLE) {
+ osb->recovery_state = OCFS2_REC_QUOTA_DISABLED;
+ wake_up(&osb->recovery_event);
+ }
+ if (osb->recovery_state >= OCFS2_REC_QUOTA_DISABLED)
+ quota_enabled = 0;
+ mutex_unlock(&osb->recovery_lock);
+ }
+
status = ocfs2_super_lock(osb, 1);
if (status < 0) {
mlog_errno(status);
@@ -1324,7 +1511,7 @@ restart:
/* queue recovery for our own slot */
ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
- NULL, NULL);
+ NULL, NULL, ORPHAN_NO_NEED_TRUNCATE);
spin_lock(&osb->osb_lock);
while (rm->rm_used) {
@@ -1345,9 +1532,14 @@ restart:
* then quota usage would be out of sync until some node takes
* the slot. So we remember which nodes need quota recovery
* and when everything else is done, we recover quotas. */
- for (i = 0; i < rm_quota_used && rm_quota[i] != slot_num; i++);
- if (i == rm_quota_used)
- rm_quota[rm_quota_used++] = slot_num;
+ if (quota_enabled) {
+ for (i = 0; i < rm_quota_used
+ && rm_quota[i] != slot_num; i++)
+ ;
+
+ if (i == rm_quota_used)
+ rm_quota[rm_quota_used++] = slot_num;
+ }
status = ocfs2_recover_node(osb, node_num, slot_num);
skip_recovery:
@@ -1375,21 +1567,25 @@ skip_recovery:
/* Now it is right time to recover quotas... We have to do this under
* superblock lock so that no one can start using the slot (and crash)
* before we recover it */
- for (i = 0; i < rm_quota_used; i++) {
- qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]);
- if (IS_ERR(qrec)) {
- status = PTR_ERR(qrec);
- mlog_errno(status);
- continue;
+ if (quota_enabled) {
+ for (i = 0; i < rm_quota_used; i++) {
+ qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]);
+ if (IS_ERR(qrec)) {
+ status = PTR_ERR(qrec);
+ mlog_errno(status);
+ continue;
+ }
+ ocfs2_queue_recovery_completion(osb->journal,
+ rm_quota[i],
+ NULL, NULL, qrec,
+ ORPHAN_NEED_TRUNCATE);
}
- ocfs2_queue_recovery_completion(osb->journal, rm_quota[i],
- NULL, NULL, qrec);
}
ocfs2_super_unlock(osb, 1);
/* queue recovery for offline slots */
- ocfs2_queue_replay_slots(osb);
+ ocfs2_queue_replay_slots(osb, ORPHAN_NEED_TRUNCATE);
bail:
mutex_lock(&osb->recovery_lock);
@@ -1400,37 +1596,36 @@ bail:
ocfs2_free_replay_slots(osb);
osb->recovery_thread_task = NULL;
- mb(); /* sync with ocfs2_recovery_thread_running */
+ if (osb->recovery_state == OCFS2_REC_WANT_DISABLE)
+ osb->recovery_state = OCFS2_REC_DISABLED;
wake_up(&osb->recovery_event);
mutex_unlock(&osb->recovery_lock);
kfree(rm_quota);
- /* no one is callint kthread_stop() for us so the kthread() api
- * requires that we call do_exit(). And it isn't exported, but
- * complete_and_exit() seems to be a minimal wrapper around it. */
- complete_and_exit(NULL, status);
return status;
}
void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
{
+ int was_set = -1;
+
mutex_lock(&osb->recovery_lock);
+ if (osb->recovery_state < OCFS2_REC_WANT_DISABLE)
+ was_set = ocfs2_recovery_map_set(osb, node_num);
trace_ocfs2_recovery_thread(node_num, osb->node_num,
- osb->disable_recovery, osb->recovery_thread_task,
- osb->disable_recovery ?
- -1 : ocfs2_recovery_map_set(osb, node_num));
+ osb->recovery_state, osb->recovery_thread_task, was_set);
- if (osb->disable_recovery)
+ if (osb->recovery_state >= OCFS2_REC_WANT_DISABLE)
goto out;
if (osb->recovery_thread_task)
goto out;
osb->recovery_thread_task = kthread_run(__ocfs2_recovery_thread, osb,
- "ocfs2rec");
+ "ocfs2rec-%s", osb->uuid_str);
if (IS_ERR(osb->recovery_thread_task)) {
mlog_errno((int)PTR_ERR(osb->recovery_thread_task));
osb->recovery_thread_task = NULL;
@@ -1558,17 +1753,16 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
}
journal = jbd2_journal_init_inode(inode);
- if (journal == NULL) {
+ if (IS_ERR(journal)) {
mlog(ML_ERROR, "Linux journal layer error\n");
- status = -EIO;
+ status = PTR_ERR(journal);
goto done;
}
status = jbd2_journal_load(journal);
if (status < 0) {
mlog_errno(status);
- if (!igrab(inode))
- BUG();
+ BUG_ON(!igrab(inode));
jbd2_journal_destroy(journal);
goto done;
}
@@ -1577,7 +1771,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
/* wipe the journal */
jbd2_journal_lock_updates(journal);
- status = jbd2_journal_flush(journal);
+ status = jbd2_journal_flush(journal, 0);
jbd2_journal_unlock_updates(journal);
if (status < 0)
mlog_errno(status);
@@ -1597,8 +1791,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
if (status < 0)
mlog_errno(status);
- if (!igrab(inode))
- BUG();
+ BUG_ON(!igrab(inode));
jbd2_journal_destroy(journal);
@@ -1610,9 +1803,7 @@ done:
if (got_lock)
ocfs2_inode_unlock(inode, 1);
- if (inode)
- iput(inode);
-
+ iput(inode);
brelse(bh);
return status;
@@ -1676,7 +1867,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
/* This will kfree the memory pointed to by la_copy and tl_copy */
ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy,
- tl_copy, NULL);
+ tl_copy, NULL, ORPHAN_NEED_TRUNCATE);
status = 0;
done:
@@ -1719,8 +1910,7 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb,
ocfs2_inode_unlock(inode, 1);
bail:
- if (inode)
- iput(inode);
+ iput(inode);
return status;
}
@@ -1795,7 +1985,7 @@ bail:
/*
* Scan timer should get fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT. Add some
- * randomness to the timeout to minimize multple nodes firing the timer at the
+ * randomness to the timeout to minimize multiple nodes firing the timer at the
* same time.
*/
static inline unsigned long ocfs2_orphan_scan_timeout(void)
@@ -1834,7 +2024,7 @@ static inline unsigned long ocfs2_orphan_scan_timeout(void)
* hasn't happened. The node queues a scan and increments the
* sequence number in the LVB.
*/
-void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
+static void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
{
struct ocfs2_orphan_scan *os;
int status, i;
@@ -1866,14 +2056,14 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
for (i = 0; i < osb->max_slots; i++)
ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL,
- NULL);
+ NULL, ORPHAN_NO_NEED_TRUNCATE);
/*
* We queued a recovery on orphan slots, increment the sequence
* number and update LVB so other node will skip the scan for a while
*/
seqno++;
os->os_count++;
- os->os_scantime = CURRENT_TIME;
+ os->os_scantime = ktime_get_seconds();
unlock:
ocfs2_orphan_scan_unlock(osb, seqno);
out:
@@ -1883,7 +2073,7 @@ out:
}
/* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT millsec */
-void ocfs2_orphan_scan_work(struct work_struct *work)
+static void ocfs2_orphan_scan_work(struct work_struct *work)
{
struct ocfs2_orphan_scan *os;
struct ocfs2_super *osb;
@@ -1895,7 +2085,7 @@ void ocfs2_orphan_scan_work(struct work_struct *work)
mutex_lock(&os->os_lock);
ocfs2_queue_orphan_scan(osb);
if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE)
- queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
+ queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work,
ocfs2_orphan_scan_timeout());
mutex_unlock(&os->os_lock);
}
@@ -1930,12 +2120,12 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
struct ocfs2_orphan_scan *os;
os = &osb->osb_orphan_scan;
- os->os_scantime = CURRENT_TIME;
+ os->os_scantime = ktime_get_seconds();
if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb))
atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
else {
atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE);
- queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
+ queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work,
ocfs2_orphan_scan_timeout());
}
}
@@ -1944,24 +2134,44 @@ struct ocfs2_orphan_filldir_priv {
struct dir_context ctx;
struct inode *head;
struct ocfs2_super *osb;
+ enum ocfs2_orphan_reco_type orphan_reco_type;
};
-static int ocfs2_orphan_filldir(void *priv, const char *name, int name_len,
- loff_t pos, u64 ino, unsigned type)
+static bool ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
+ int name_len, loff_t pos, u64 ino,
+ unsigned type)
{
- struct ocfs2_orphan_filldir_priv *p = priv;
+ struct ocfs2_orphan_filldir_priv *p =
+ container_of(ctx, struct ocfs2_orphan_filldir_priv, ctx);
struct inode *iter;
if (name_len == 1 && !strncmp(".", name, 1))
- return 0;
+ return true;
if (name_len == 2 && !strncmp("..", name, 2))
- return 0;
+ return true;
+
+ /* do not include dio entry in case of orphan scan */
+ if ((p->orphan_reco_type == ORPHAN_NO_NEED_TRUNCATE) &&
+ (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX,
+ OCFS2_DIO_ORPHAN_PREFIX_LEN)))
+ return true;
/* Skip bad inodes so that recovery can continue */
iter = ocfs2_iget(p->osb, ino,
OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0);
if (IS_ERR(iter))
- return 0;
+ return true;
+
+ if (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX,
+ OCFS2_DIO_ORPHAN_PREFIX_LEN))
+ OCFS2_I(iter)->ip_flags |= OCFS2_INODE_DIO_ORPHAN_ENTRY;
+
+ /* Skip inodes which are already added to recover list, since dio may
+ * happen concurrently with unlink/rename */
+ if (OCFS2_I(iter)->ip_next_orphan) {
+ iput(iter);
+ return true;
+ }
trace_ocfs2_orphan_filldir((unsigned long long)OCFS2_I(iter)->ip_blkno);
/* No locking is required for the next_orphan queue as there
@@ -1969,19 +2179,21 @@ static int ocfs2_orphan_filldir(void *priv, const char *name, int name_len,
OCFS2_I(iter)->ip_next_orphan = p->head;
p->head = iter;
- return 0;
+ return true;
}
static int ocfs2_queue_orphans(struct ocfs2_super *osb,
int slot,
- struct inode **head)
+ struct inode **head,
+ enum ocfs2_orphan_reco_type orphan_reco_type)
{
int status;
struct inode *orphan_dir_inode = NULL;
struct ocfs2_orphan_filldir_priv priv = {
.ctx.actor = ocfs2_orphan_filldir,
.osb = osb,
- .head = *head
+ .head = *head,
+ .orphan_reco_type = orphan_reco_type
};
orphan_dir_inode = ocfs2_get_system_file_inode(osb,
@@ -1993,7 +2205,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
return status;
}
- mutex_lock(&orphan_dir_inode->i_mutex);
+ inode_lock(orphan_dir_inode);
status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0);
if (status < 0) {
mlog_errno(status);
@@ -2011,7 +2223,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
out_cluster:
ocfs2_inode_unlock(orphan_dir_inode, 0);
out:
- mutex_unlock(&orphan_dir_inode->i_mutex);
+ inode_unlock(orphan_dir_inode);
iput(orphan_dir_inode);
return status;
}
@@ -2071,17 +2283,20 @@ static void ocfs2_clear_recovering_orphan_dir(struct ocfs2_super *osb,
* advertising our state to ocfs2_delete_inode().
*/
static int ocfs2_recover_orphans(struct ocfs2_super *osb,
- int slot)
+ int slot,
+ enum ocfs2_orphan_reco_type orphan_reco_type)
{
int ret = 0;
struct inode *inode = NULL;
struct inode *iter;
struct ocfs2_inode_info *oi;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_dinode *di = NULL;
trace_ocfs2_recover_orphans(slot);
ocfs2_mark_recovering_orphan_dir(osb, slot);
- ret = ocfs2_queue_orphans(osb, slot, &inode);
+ ret = ocfs2_queue_orphans(osb, slot, &inode, orphan_reco_type);
ocfs2_clear_recovering_orphan_dir(osb, slot);
/* Error here should be noted, but we want to continue with as
@@ -2095,21 +2310,61 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
(unsigned long long)oi->ip_blkno);
iter = oi->ip_next_orphan;
+ oi->ip_next_orphan = NULL;
- spin_lock(&oi->ip_lock);
- /* The remote delete code may have set these on the
- * assumption that the other node would wipe them
- * successfully. If they are still in the node's
- * orphan dir, we need to reset that state. */
- oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE);
-
- /* Set the proper information to get us going into
- * ocfs2_delete_inode. */
- oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
- spin_unlock(&oi->ip_lock);
+ if (oi->ip_flags & OCFS2_INODE_DIO_ORPHAN_ENTRY) {
+ inode_lock(inode);
+ ret = ocfs2_rw_lock(inode, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto unlock_mutex;
+ }
+ /*
+ * We need to take and drop the inode lock to
+ * force read inode from disk.
+ */
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto unlock_rw;
+ }
+
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+
+ if (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)) {
+ ret = ocfs2_truncate_file(inode, di_bh,
+ i_size_read(inode));
+ if (ret < 0) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto unlock_inode;
+ }
+
+ ret = ocfs2_del_inode_from_orphan(osb, inode,
+ di_bh, 0, 0);
+ if (ret)
+ mlog_errno(ret);
+ }
+unlock_inode:
+ ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
+ di_bh = NULL;
+unlock_rw:
+ ocfs2_rw_unlock(inode, 1);
+unlock_mutex:
+ inode_unlock(inode);
+
+ /* clear dio flag in ocfs2_inode_info */
+ oi->ip_flags &= ~OCFS2_INODE_DIO_ORPHAN_ENTRY;
+ } else {
+ spin_lock(&oi->ip_lock);
+ /* Set the proper information to get us going into
+ * ocfs2_delete_inode. */
+ oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
+ spin_unlock(&oi->ip_lock);
+ }
iput(inode);
-
inode = iter;
}
@@ -2156,8 +2411,20 @@ static int ocfs2_commit_thread(void *arg)
|| kthread_should_stop());
status = ocfs2_commit_cache(osb);
- if (status < 0)
- mlog_errno(status);
+ if (status < 0) {
+ static unsigned long abort_warn_time;
+
+ /* Warn about this once per minute */
+ if (printk_timed_ratelimit(&abort_warn_time, 60*HZ))
+ mlog(ML_ERROR, "status = %d, journal is "
+ "already aborted.\n", status);
+ /*
+ * After ocfs2_commit_cache() fails, j_num_trans has a
+ * non-zero value. Sleep here to avoid a busy-wait
+ * loop.
+ */
+ msleep_interruptible(1000);
+ }
if (kthread_should_stop() && atomic_read(&journal->j_num_trans)){
mlog(ML_KTHREAD,
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 96f9ac237e86..6397170f302f 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -1,26 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* journal.h
*
* Defines journalling api and structures.
*
* Copyright (C) 2003, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_JOURNAL_H
@@ -45,7 +29,7 @@ struct ocfs2_dinode;
struct ocfs2_recovery_map {
unsigned int rm_used;
- unsigned int *rm_entries;
+ unsigned int rm_entries[];
};
@@ -158,19 +142,21 @@ static inline void ocfs2_ci_set_new(struct ocfs2_super *osb,
void ocfs2_orphan_scan_init(struct ocfs2_super *osb);
void ocfs2_orphan_scan_start(struct ocfs2_super *osb);
void ocfs2_orphan_scan_stop(struct ocfs2_super *osb);
-void ocfs2_orphan_scan_exit(struct ocfs2_super *osb);
void ocfs2_complete_recovery(struct work_struct *work);
void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
int ocfs2_recovery_init(struct ocfs2_super *osb);
void ocfs2_recovery_exit(struct ocfs2_super *osb);
+void ocfs2_recovery_disable_quota(struct ocfs2_super *osb);
int ocfs2_compute_replay_slots(struct ocfs2_super *osb);
+void ocfs2_free_replay_slots(struct ocfs2_super *osb);
/*
* Journal Control:
* Initialize, Load, Shutdown, Wipe a journal.
*
+ * ocfs2_journal_alloc - Initialize skeleton for journal structure.
* ocfs2_journal_init - Initialize journal structures in the OSB.
* ocfs2_journal_load - Load the given journal off disk. Replay it if
* there's transactions still in there.
@@ -184,8 +170,8 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb);
* ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint.
*/
void ocfs2_set_journal_params(struct ocfs2_super *osb);
-int ocfs2_journal_init(struct ocfs2_journal *journal,
- int *dirty);
+int ocfs2_journal_alloc(struct ocfs2_super *osb);
+int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty);
void ocfs2_journal_shutdown(struct ocfs2_super *osb);
int ocfs2_journal_wipe(struct ocfs2_journal *journal,
int full);
@@ -246,8 +232,8 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
* ocfs2_journal_access_*() unless you intend to
* manage the checksum by hand.
* ocfs2_journal_dirty - Mark a journalled buffer as having dirty data.
- * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before
- * the current handle commits.
+ * ocfs2_jbd2_inode_add_write - Mark an inode with range so that its data goes
+ * out before the current handle commits.
*/
/* You must always start_trans with a number of buffs > 0, but it's
@@ -258,6 +244,19 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb,
int ocfs2_commit_trans(struct ocfs2_super *osb,
handle_t *handle);
int ocfs2_extend_trans(handle_t *handle, int nblocks);
+int ocfs2_assure_trans_credits(handle_t *handle,
+ int nblocks);
+int ocfs2_allocate_extend_trans(handle_t *handle,
+ int thresh);
+
+/*
+ * Define an arbitrary limit for the amount of data we will anticipate
+ * writing to any given transaction. For unbounded transactions such as
+ * fallocate(2) we can write more than this, but we always
+ * start off at the maximum transaction size and grow the transaction
+ * optimistically as we go.
+ */
+#define OCFS2_MAX_TRANS_DATA 64U
/*
* Create access is for when we get a newly created buffer and we're
@@ -444,7 +443,7 @@ static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir,
* previous dirblock update in the free list */
static inline int ocfs2_link_credits(struct super_block *sb)
{
- return 2*OCFS2_INODE_UPDATE_CREDITS + 4 +
+ return 2 * OCFS2_INODE_UPDATE_CREDITS + 4 +
ocfs2_quota_trans_credits(sb);
}
@@ -461,6 +460,11 @@ static inline int ocfs2_unlink_credits(struct super_block *sb)
* orphan dir index leaf */
#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 4)
+/* dinode + orphan dir dinode + extent tree leaf block + orphan dir entry +
+ * orphan dir index root + orphan dir index leaf */
+#define OCFS2_INODE_ADD_TO_ORPHAN_CREDITS (2 * OCFS2_INODE_UPDATE_CREDITS + 4)
+#define OCFS2_INODE_DEL_FROM_ORPHAN_CREDITS OCFS2_INODE_ADD_TO_ORPHAN_CREDITS
+
/* dinode update, old dir dinode update, new dir dinode update, old
* dir dir entry, new dir dir entry, dir entry update for renaming
* directory + target unlink + 3 x dir index leaves */
@@ -513,8 +517,7 @@ static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb)
* the result may be wrong.
*/
static inline int ocfs2_calc_extend_credits(struct super_block *sb,
- struct ocfs2_extent_list *root_el,
- u32 bits_wanted)
+ struct ocfs2_extent_list *root_el)
{
int bitmap_blocks, sysfile_bitmap_blocks, extent_blocks;
@@ -537,7 +540,7 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth);
return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks +
- ocfs2_quota_trans_credits(sb) + bits_wanted;
+ ocfs2_quota_trans_credits(sb);
}
static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
@@ -574,37 +577,12 @@ static inline int ocfs2_calc_bg_discontig_credits(struct super_block *sb)
return ocfs2_extent_recs_per_gd(sb);
}
-static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
- unsigned int clusters_to_del,
- struct ocfs2_dinode *fe,
- struct ocfs2_extent_list *last_el)
+static inline int ocfs2_jbd2_inode_add_write(handle_t *handle, struct inode *inode,
+ loff_t start_byte, loff_t length)
{
- /* for dinode + all headers in this pass + update to next leaf */
- u16 next_free = le16_to_cpu(last_el->l_next_free_rec);
- u16 tree_depth = le16_to_cpu(fe->id2.i_list.l_tree_depth);
- int credits = 1 + tree_depth + 1;
- int i;
-
- i = next_free - 1;
- BUG_ON(i < 0);
-
- /* We may be deleting metadata blocks, so metadata alloc dinode +
- one desc. block for each possible delete. */
- if (tree_depth && next_free == 1 &&
- ocfs2_rec_clusters(last_el, &last_el->l_recs[i]) == clusters_to_del)
- credits += 1 + tree_depth;
-
- /* update to the truncate log. */
- credits += OCFS2_TRUNCATE_LOG_UPDATE;
-
- credits += ocfs2_quota_trans_credits(sb);
-
- return credits;
-}
-
-static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode)
-{
- return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode);
+ return jbd2_journal_inode_ranged_write(handle,
+ &OCFS2_I(inode)->ip_jinode,
+ start_byte, length);
}
static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
@@ -616,4 +594,17 @@ static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
new_size);
}
+static inline void ocfs2_update_inode_fsync_trans(handle_t *handle,
+ struct inode *inode,
+ int datasync)
+{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+ if (!is_handle_aborted(handle)) {
+ oi->i_sync_tid = handle->h_transaction->t_tid;
+ if (datasync)
+ oi->i_datasync_tid = handle->h_transaction->t_tid;
+ }
+}
+
#endif /* OCFS2_JOURNAL_H */
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index aebeacd807c3..d1aa04a5af1b 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -1,26 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* localalloc.c
*
* Node local data allocation
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
@@ -228,14 +212,15 @@ static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,
unsigned int num_clusters)
{
- spin_lock(&osb->osb_lock);
- if (osb->local_alloc_state == OCFS2_LA_DISABLED ||
- osb->local_alloc_state == OCFS2_LA_THROTTLED)
- if (num_clusters >= osb->local_alloc_default_bits) {
+ if (num_clusters >= osb->local_alloc_default_bits) {
+ spin_lock(&osb->osb_lock);
+ if (osb->local_alloc_state == OCFS2_LA_DISABLED ||
+ osb->local_alloc_state == OCFS2_LA_THROTTLED) {
cancel_delayed_work(&osb->la_enable_wq);
osb->local_alloc_state = OCFS2_LA_ENABLED;
}
- spin_unlock(&osb->osb_lock);
+ spin_unlock(&osb->osb_lock);
+ }
}
void ocfs2_la_enable_worker(struct work_struct *work)
@@ -345,12 +330,17 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
if (num_used
|| alloc->id1.bitmap1.i_used
|| alloc->id1.bitmap1.i_total
- || la->la_bm_off)
- mlog(ML_ERROR, "Local alloc hasn't been recovered!\n"
+ || la->la_bm_off) {
+ mlog(ML_ERROR, "inconsistent detected, clean journal with"
+ " unrecovered local alloc, please run fsck.ocfs2!\n"
"found = %u, set = %u, taken = %u, off = %u\n",
num_used, le32_to_cpu(alloc->id1.bitmap1.i_used),
le32_to_cpu(alloc->id1.bitmap1.i_total),
- OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
+ le32_to_cpu(OCFS2_LOCAL_ALLOC(alloc)->la_bm_off));
+
+ status = -EINVAL;
+ goto bail;
+ }
osb->local_alloc_bh = alloc_bh;
osb->local_alloc_state = OCFS2_LA_ENABLED;
@@ -358,8 +348,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
bail:
if (status < 0)
brelse(alloc_bh);
- if (inode)
- iput(inode);
+ iput(inode);
trace_ocfs2_load_local_alloc(osb->local_alloc_bits);
@@ -387,7 +376,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
struct ocfs2_dinode *alloc = NULL;
cancel_delayed_work(&osb->la_enable_wq);
- flush_workqueue(ocfs2_wq);
+ if (osb->ocfs2_wq)
+ flush_workqueue(osb->ocfs2_wq);
if (osb->local_alloc_state == OCFS2_LA_UNUSED)
goto out;
@@ -415,7 +405,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
goto out;
}
- mutex_lock(&main_bm_inode->i_mutex);
+ inode_lock(main_bm_inode);
status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
if (status < 0) {
@@ -434,12 +424,11 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
bh = osb->local_alloc_bh;
alloc = (struct ocfs2_dinode *) bh->b_data;
- alloc_copy = kmalloc(bh->b_size, GFP_NOFS);
+ alloc_copy = kmemdup(alloc, bh->b_size, GFP_NOFS);
if (!alloc_copy) {
status = -ENOMEM;
goto out_commit;
}
- memcpy(alloc_copy, alloc, bh->b_size);
status = ocfs2_journal_access_di(handle, INODE_CACHE(local_alloc_inode),
bh, OCFS2_JOURNAL_ACCESS_WRITE);
@@ -469,12 +458,11 @@ out_unlock:
ocfs2_inode_unlock(main_bm_inode, 1);
out_mutex:
- mutex_unlock(&main_bm_inode->i_mutex);
+ inode_unlock(main_bm_inode);
iput(main_bm_inode);
out:
- if (local_alloc_inode)
- iput(local_alloc_inode);
+ iput(local_alloc_inode);
kfree(alloc_copy);
}
@@ -508,7 +496,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
goto bail;
}
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
status = ocfs2_read_inode_block_full(inode, &alloc_bh,
OCFS2_BH_IGNORE_CACHE);
@@ -541,7 +529,7 @@ bail:
brelse(alloc_bh);
if (inode) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
iput(inode);
}
@@ -573,7 +561,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
goto out;
}
- mutex_lock(&main_bm_inode->i_mutex);
+ inode_lock(main_bm_inode);
status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
if (status < 0) {
@@ -603,7 +591,7 @@ out_unlock:
ocfs2_inode_unlock(main_bm_inode, 1);
out_mutex:
- mutex_unlock(&main_bm_inode->i_mutex);
+ inode_unlock(main_bm_inode);
brelse(main_bm_bh);
@@ -619,7 +607,7 @@ out:
/*
* make sure we've got at least bits_wanted contiguous bits in the
- * local alloc. You lose them when you drop i_mutex.
+ * local alloc. You lose them when you drop i_rwsem.
*
* We will add ourselves to the transaction passed in, but may start
* our own in order to shift windows.
@@ -645,11 +633,11 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
goto bail;
}
- mutex_lock(&local_alloc_inode->i_mutex);
+ inode_lock(local_alloc_inode);
/*
* We must double check state and allocator bits because
- * another process may have changed them while holding i_mutex.
+ * another process may have changed them while holding i_rwsem.
*/
spin_lock(&osb->osb_lock);
if (!ocfs2_la_state_enabled(osb) ||
@@ -665,12 +653,10 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
#ifdef CONFIG_OCFS2_DEBUG_FS
if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
ocfs2_local_alloc_count_bits(alloc)) {
- ocfs2_error(osb->sb, "local alloc inode %llu says it has "
- "%u free bits, but a count shows %u",
- (unsigned long long)le64_to_cpu(alloc->i_blkno),
- le32_to_cpu(alloc->id1.bitmap1.i_used),
- ocfs2_local_alloc_count_bits(alloc));
- status = -EIO;
+ status = ocfs2_error(osb->sb, "local alloc inode %llu says it has %u used bits, but a count shows %u\n",
+ (unsigned long long)le64_to_cpu(alloc->i_blkno),
+ le32_to_cpu(alloc->id1.bitmap1.i_used),
+ ocfs2_local_alloc_count_bits(alloc));
goto bail;
}
#endif
@@ -690,7 +676,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
/*
* Under certain conditions, the window slide code
* might have reduced the number of bits available or
- * disabled the the local alloc entirely. Re-check
+ * disabled the local alloc entirely. Re-check
* here and return -ENOSPC if necessary.
*/
status = -ENOSPC;
@@ -712,7 +698,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
status = 0;
bail:
if (status < 0 && local_alloc_inode) {
- mutex_unlock(&local_alloc_inode->i_mutex);
+ inode_unlock(local_alloc_inode);
iput(local_alloc_inode);
}
@@ -781,6 +767,48 @@ bail:
return status;
}
+int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb,
+ handle_t *handle,
+ struct ocfs2_alloc_context *ac,
+ u32 bit_off,
+ u32 num_bits)
+{
+ int status, start;
+ u32 clear_bits;
+ struct inode *local_alloc_inode;
+ void *bitmap;
+ struct ocfs2_dinode *alloc;
+ struct ocfs2_local_alloc *la;
+
+ BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL);
+
+ local_alloc_inode = ac->ac_inode;
+ alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+ la = OCFS2_LOCAL_ALLOC(alloc);
+
+ bitmap = la->la_bitmap;
+ start = bit_off - le32_to_cpu(la->la_bm_off);
+ clear_bits = num_bits;
+
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(local_alloc_inode),
+ osb->local_alloc_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ while (clear_bits--)
+ ocfs2_clear_bit(start++, bitmap);
+
+ le32_add_cpu(&alloc->id1.bitmap1.i_used, -num_bits);
+ ocfs2_journal_dirty(handle, osb->local_alloc_bh);
+
+bail:
+ return status;
+}
+
static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
{
u32 count;
@@ -797,7 +825,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
u32 *numbits,
struct ocfs2_alloc_reservation *resv)
{
- int numfound, bitoff, left, startoff, lastzero;
+ int numfound = 0, bitoff, left, startoff;
int local_resv = 0;
struct ocfs2_alloc_reservation r;
void *bitmap = NULL;
@@ -835,16 +863,9 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap;
numfound = bitoff = startoff = 0;
- lastzero = -1;
left = le32_to_cpu(alloc->id1.bitmap1.i_total);
- while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) != -1) {
- if (bitoff == left) {
- /* mlog(0, "bitoff (%d) == left", bitoff); */
- break;
- }
- /* mlog(0, "Found a zero: bitoff = %d, startoff = %d, "
- "numfound = %d\n", bitoff, startoff, numfound);*/
-
+ while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) <
+ left) {
/* Ok, we found a zero bit... is it contig. or do we
* start over?*/
if (bitoff == startoff) {
@@ -947,11 +968,11 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
la_start_blk = ocfs2_clusters_to_blocks(osb->sb,
le32_to_cpu(la->la_bm_off));
bitmap = la->la_bitmap;
- start = count = bit_off = 0;
+ start = count = 0;
left = le32_to_cpu(alloc->id1.bitmap1.i_total);
- while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start))
- != -1) {
+ while (1) {
+ bit_off = ocfs2_find_next_zero_bit(bitmap, left, start);
if ((bit_off < left) && (bit_off == start)) {
count++;
start++;
@@ -976,6 +997,7 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
goto bail;
}
}
+
if (bit_off >= left)
break;
count = 1;
@@ -1003,7 +1025,7 @@ enum ocfs2_la_event {
/*
* Given an event, calculate the size of our next local alloc window.
*
- * This should always be called under i_mutex of the local alloc inode
+ * This should always be called under i_rwsem of the local alloc inode
* so that local alloc disabling doesn't race with processes trying to
* use the allocator.
*
@@ -1046,7 +1068,7 @@ static int ocfs2_recalc_la_window(struct ocfs2_super *osb,
} else {
osb->local_alloc_state = OCFS2_LA_DISABLED;
}
- queue_delayed_work(ocfs2_wq, &osb->la_enable_wq,
+ queue_delayed_work(osb->ocfs2_wq, &osb->la_enable_wq,
OCFS2_LA_ENABLE_INTERVAL);
goto out_unlock;
}
@@ -1082,7 +1104,7 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
}
retry_enospc:
- (*ac)->ac_bits_wanted = osb->local_alloc_default_bits;
+ (*ac)->ac_bits_wanted = osb->local_alloc_bits;
status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
if (status == -ENOSPC) {
if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) ==
@@ -1154,7 +1176,7 @@ retry_enospc:
OCFS2_LA_DISABLED)
goto bail;
- ac->ac_bits_wanted = osb->local_alloc_default_bits;
+ ac->ac_bits_wanted = osb->local_alloc_bits;
status = ocfs2_claim_clusters(handle, ac,
osb->local_alloc_bits,
&cluster_off,
@@ -1194,7 +1216,7 @@ retry_enospc:
OCFS2_LOCAL_ALLOC(alloc)->la_bitmap);
trace_ocfs2_local_alloc_new_window_result(
- OCFS2_LOCAL_ALLOC(alloc)->la_bm_off,
+ le32_to_cpu(OCFS2_LOCAL_ALLOC(alloc)->la_bm_off),
le32_to_cpu(alloc->id1.bitmap1.i_total));
bail:
@@ -1244,13 +1266,12 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
* local alloc shutdown won't try to double free main bitmap
* bits. Make a copy so the sync function knows which bits to
* free. */
- alloc_copy = kmalloc(osb->local_alloc_bh->b_size, GFP_NOFS);
+ alloc_copy = kmemdup(alloc, osb->local_alloc_bh->b_size, GFP_NOFS);
if (!alloc_copy) {
status = -ENOMEM;
mlog_errno(status);
goto bail;
}
- memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size);
status = ocfs2_journal_access_di(handle,
INODE_CACHE(local_alloc_inode),
@@ -1286,9 +1307,7 @@ bail:
brelse(main_bm_bh);
- if (main_bm_inode)
- iput(main_bm_inode);
-
+ iput(main_bm_inode);
kfree(alloc_copy);
if (ac)
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
index 1be9b5864460..08f925b7ec6d 100644
--- a/fs/ocfs2/localalloc.h
+++ b/fs/ocfs2/localalloc.h
@@ -1,26 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* localalloc.h
*
* Function prototypes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_LOCALALLOC_H
@@ -55,6 +39,12 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
u32 *bit_off,
u32 *num_bits);
+int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb,
+ handle_t *handle,
+ struct ocfs2_alloc_context *ac,
+ u32 bit_off,
+ u32 num_bits);
+
void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,
unsigned int num_clusters);
void ocfs2_la_enable_worker(struct work_struct *work);
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index e57c804069ea..6de944818c56 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -1,29 +1,14 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* locks.c
*
* Userspace file locking support
*
* Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
+#include <linux/filelock.h>
#include <linux/fcntl.h>
#include <cluster/masklog.h>
@@ -42,7 +27,7 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode,
struct ocfs2_file_private *fp = file->private_data;
struct ocfs2_lock_res *lockres = &fp->fp_flock;
- if (fl->fl_type == F_WRLCK)
+ if (lock_is_write(fl))
level = 1;
if (!IS_SETLKW(cmd))
trylock = 1;
@@ -52,6 +37,7 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode,
if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
lockres->l_level > LKM_NLMODE) {
int old_level = 0;
+ struct file_lock request;
if (lockres->l_level == LKM_EXMODE)
old_level = 1;
@@ -66,8 +52,10 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode,
* level.
*/
- flock_lock_file_wait(file,
- &(struct file_lock){.fl_type = F_UNLCK});
+ locks_init_lock(&request);
+ request.c.flc_type = F_UNLCK;
+ request.c.flc_flags = FL_FLOCK;
+ locks_lock_file_wait(file, &request);
ocfs2_file_unlock(file);
}
@@ -81,7 +69,9 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode,
goto out;
}
- ret = flock_lock_file_wait(file, fl);
+ ret = locks_lock_file_wait(file, fl);
+ if (ret)
+ ocfs2_file_unlock(file);
out:
mutex_unlock(&fp->fp_mutex);
@@ -96,7 +86,7 @@ static int ocfs2_do_funlock(struct file *file, int cmd, struct file_lock *fl)
mutex_lock(&fp->fp_mutex);
ocfs2_file_unlock(file);
- ret = flock_lock_file_wait(file, fl);
+ ret = locks_lock_file_wait(file, fl);
mutex_unlock(&fp->fp_mutex);
return ret;
@@ -110,16 +100,14 @@ int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl)
struct inode *inode = file->f_mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- if (!(fl->fl_flags & FL_FLOCK))
- return -ENOLCK;
- if (__mandatory_lock(inode))
+ if (!(fl->c.flc_flags & FL_FLOCK))
return -ENOLCK;
if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) ||
ocfs2_mount_local(osb))
- return flock_lock_file_wait(file, fl);
+ return locks_lock_file_wait(file, fl);
- if (fl->fl_type == F_UNLCK)
+ if (lock_is_unlock(fl))
return ocfs2_do_funlock(file, cmd, fl);
else
return ocfs2_do_flock(file, inode, cmd, fl);
@@ -130,9 +118,7 @@ int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
struct inode *inode = file->f_mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- if (!(fl->fl_flags & FL_POSIX))
- return -ENOLCK;
- if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
+ if (!(fl->c.flc_flags & FL_POSIX))
return -ENOLCK;
return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
diff --git a/fs/ocfs2/locks.h b/fs/ocfs2/locks.h
index 496d488b271f..b52de3947d5f 100644
--- a/fs/ocfs2/locks.h
+++ b/fs/ocfs2/locks.h
@@ -1,26 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* locks.h
*
* Function prototypes for Userspace file locking support
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_LOCKS_H
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 10d66c75cecb..50e2faf64c19 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -1,26 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* mmap.c
*
* Code to deal with the mess that is clustered mmap.
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
@@ -44,37 +28,39 @@
#include "ocfs2_trace.h"
-static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
+static vm_fault_t ocfs2_fault(struct vm_fault *vmf)
{
+ struct vm_area_struct *vma = vmf->vma;
sigset_t oldset;
- int ret;
+ vm_fault_t ret;
ocfs2_block_signals(&oldset);
- ret = filemap_fault(area, vmf);
+ ret = filemap_fault(vmf);
ocfs2_unblock_signals(&oldset);
- trace_ocfs2_fault(OCFS2_I(area->vm_file->f_mapping->host)->ip_blkno,
- area, vmf->page, vmf->pgoff);
+ trace_ocfs2_fault(OCFS2_I(vma->vm_file->f_mapping->host)->ip_blkno,
+ vma, vmf->page, vmf->pgoff);
return ret;
}
-static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
- struct page *page)
+static vm_fault_t __ocfs2_page_mkwrite(struct file *file,
+ struct buffer_head *di_bh, struct folio *folio)
{
- int ret = VM_FAULT_NOPAGE;
+ int err;
+ vm_fault_t ret = VM_FAULT_NOPAGE;
struct inode *inode = file_inode(file);
struct address_space *mapping = inode->i_mapping;
- loff_t pos = page_offset(page);
- unsigned int len = PAGE_CACHE_SIZE;
+ loff_t pos = folio_pos(folio);
+ unsigned int len = PAGE_SIZE;
pgoff_t last_index;
- struct page *locked_page = NULL;
+ struct folio *locked_folio = NULL;
void *fsdata;
loff_t size = i_size_read(inode);
- last_index = (size - 1) >> PAGE_CACHE_SHIFT;
+ last_index = (size - 1) >> PAGE_SHIFT;
/*
- * There are cases that lead to the page no longer bebongs to the
+ * There are cases that lead to the page no longer belonging to the
* mapping.
* 1) pagecache truncates locally due to memory pressure.
* 2) pagecache truncates when another is taking EX lock against
@@ -86,9 +72,9 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
*
* Let VM retry with these cases.
*/
- if ((page->mapping != inode->i_mapping) ||
- (!PageUptodate(page)) ||
- (page_offset(page) >= size))
+ if ((folio->mapping != inode->i_mapping) ||
+ !folio_test_uptodate(folio) ||
+ (pos >= size))
goto out;
/*
@@ -101,40 +87,37 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
* worry about ocfs2_write_begin() skipping some buffer reads
* because the "write" would invalidate their data.
*/
- if (page->index == last_index)
- len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
-
- ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page,
- &fsdata, di_bh, page);
- if (ret) {
- if (ret != -ENOSPC)
- mlog_errno(ret);
- if (ret == -ENOMEM)
- ret = VM_FAULT_OOM;
- else
- ret = VM_FAULT_SIGBUS;
+ if (folio->index == last_index)
+ len = ((size - 1) & ~PAGE_MASK) + 1;
+
+ err = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP,
+ &locked_folio, &fsdata, di_bh, folio);
+ if (err) {
+ if (err != -ENOSPC)
+ mlog_errno(err);
+ ret = vmf_error(err);
goto out;
}
- if (!locked_page) {
+ if (!locked_folio) {
ret = VM_FAULT_NOPAGE;
goto out;
}
- ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page,
- fsdata);
- BUG_ON(ret != len);
+ err = ocfs2_write_end_nolock(mapping, pos, len, len, fsdata);
+ BUG_ON(err != len);
ret = VM_FAULT_LOCKED;
out:
return ret;
}
-static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+static vm_fault_t ocfs2_page_mkwrite(struct vm_fault *vmf)
{
- struct page *page = vmf->page;
- struct inode *inode = file_inode(vma->vm_file);
+ struct folio *folio = page_folio(vmf->page);
+ struct inode *inode = file_inode(vmf->vma->vm_file);
struct buffer_head *di_bh = NULL;
sigset_t oldset;
- int ret;
+ int err;
+ vm_fault_t ret;
sb_start_pagefault(inode->i_sb);
ocfs2_block_signals(&oldset);
@@ -144,9 +127,10 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
* node. Taking the data lock will also ensure that we don't
* attempt page truncation as part of a downconvert.
*/
- ret = ocfs2_inode_lock(inode, &di_bh, 1);
- if (ret < 0) {
- mlog_errno(ret);
+ err = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (err < 0) {
+ mlog_errno(err);
+ ret = vmf_error(err);
goto out;
}
@@ -157,7 +141,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
*/
down_write(&OCFS2_I(inode)->ip_alloc_sem);
- ret = __ocfs2_page_mkwrite(vma->vm_file, di_bh, page);
+ ret = __ocfs2_page_mkwrite(vmf->vma->vm_file, di_bh, folio);
up_write(&OCFS2_I(inode)->ip_alloc_sem);
@@ -173,22 +157,22 @@ out:
static const struct vm_operations_struct ocfs2_file_vm_ops = {
.fault = ocfs2_fault,
.page_mkwrite = ocfs2_page_mkwrite,
- .remap_pages = generic_file_remap_pages,
};
-int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
+int ocfs2_mmap_prepare(struct vm_area_desc *desc)
{
+ struct file *file = desc->file;
int ret = 0, lock_level = 0;
ret = ocfs2_inode_lock_atime(file_inode(file),
- file->f_path.mnt, &lock_level);
+ file->f_path.mnt, &lock_level, 1);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
ocfs2_inode_unlock(file_inode(file), lock_level);
out:
- vma->vm_ops = &ocfs2_file_vm_ops;
+ desc->vm_ops = &ocfs2_file_vm_ops;
return 0;
}
diff --git a/fs/ocfs2/mmap.h b/fs/ocfs2/mmap.h
index 1274ee0f1fe2..d21c30de6b8c 100644
--- a/fs/ocfs2/mmap.h
+++ b/fs/ocfs2/mmap.h
@@ -1,6 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef OCFS2_MMAP_H
#define OCFS2_MMAP_H
-int ocfs2_mmap(struct file *file, struct vm_area_struct *vma);
+int ocfs2_mmap_prepare(struct vm_area_desc *desc);
#endif /* OCFS2_MMAP_H */
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index f1fc172175b6..ce978a2497d9 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -1,18 +1,8 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-only
+/*
* move_extents.c
*
* Copyright (C) 2011 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#include <linux/fs.h>
#include <linux/types.h>
@@ -25,6 +15,7 @@
#include "ocfs2_ioctl.h"
#include "alloc.h"
+#include "localalloc.h"
#include "aops.h"
#include "dlmglue.h"
#include "extent_map.h"
@@ -69,7 +60,7 @@ static int __ocfs2_move_extent(handle_t *handle,
u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci);
u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos);
- ret = ocfs2_duplicate_clusters_by_page(handle, context->file, cpos,
+ ret = ocfs2_duplicate_clusters_by_page(handle, inode, cpos,
p_cpos, new_p_cpos, len);
if (ret) {
mlog_errno(ret);
@@ -98,32 +89,28 @@ static int __ocfs2_move_extent(handle_t *handle,
el = path_leaf_el(path);
index = ocfs2_search_extent_list(el, cpos);
- if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
- ocfs2_error(inode->i_sb,
- "Inode %llu has an extent at cpos %u which can no "
- "longer be found.\n",
- (unsigned long long)ino, cpos);
- ret = -EROFS;
+ if (index == -1) {
+ ret = ocfs2_error(inode->i_sb,
+ "Inode %llu has an extent at cpos %u which can no longer be found\n",
+ (unsigned long long)ino, cpos);
goto out;
}
rec = &el->l_recs[index];
- BUG_ON(ext_flags != rec->e_flags);
+ if (ext_flags != rec->e_flags) {
+ ret = ocfs2_error(inode->i_sb,
+ "Inode %llu has corrupted extent %d with flags 0x%x at cpos %u\n",
+ (unsigned long long)ino, index, rec->e_flags, cpos);
+ goto out;
+ }
+
/*
* after moving/defraging to new location, the extent is not going
* to be refcounted anymore.
*/
replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED;
- ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
- context->et.et_root_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
-
ret = ocfs2_split_extent(handle, &context->et, path, index,
&replace_rec, context->meta_ac,
&context->dealloc);
@@ -132,8 +119,6 @@ static int __ocfs2_move_extent(handle_t *handle,
goto out;
}
- ocfs2_journal_dirty(handle, context->et.et_root_bh);
-
context->new_phys_cpos = new_p_cpos;
/*
@@ -151,23 +136,21 @@ static int __ocfs2_move_extent(handle_t *handle,
old_blkno, len);
}
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
out:
+ ocfs2_free_path(path);
return ret;
}
/*
- * lock allocators, and reserving appropriate number of bits for
- * meta blocks and data clusters.
- *
- * in some cases, we don't need to reserve clusters, just let data_ac
- * be NULL.
+ * lock allocator, and reserve appropriate number of bits for
+ * meta blocks.
*/
-static int ocfs2_lock_allocators_move_extents(struct inode *inode,
+static int ocfs2_lock_meta_allocator_move_extents(struct inode *inode,
struct ocfs2_extent_tree *et,
u32 clusters_to_move,
u32 extents_to_split,
struct ocfs2_alloc_context **meta_ac,
- struct ocfs2_alloc_context **data_ac,
int extra_blocks,
int *credits)
{
@@ -175,7 +158,7 @@ static int ocfs2_lock_allocators_move_extents(struct inode *inode,
unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- num_free_extents = ocfs2_num_free_extents(osb, et);
+ num_free_extents = ocfs2_num_free_extents(et);
if (num_free_extents < 0) {
ret = num_free_extents;
mlog_errno(ret);
@@ -192,16 +175,8 @@ static int ocfs2_lock_allocators_move_extents(struct inode *inode,
goto out;
}
- if (data_ac) {
- ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
- }
- *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el,
- clusters_to_move + 2);
+ *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el);
mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n",
extra_blocks, clusters_to_move, *credits);
@@ -234,12 +209,10 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
struct ocfs2_refcount_tree *ref_tree = NULL;
u32 new_phys_cpos, new_len;
u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+ int need_free = 0;
if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) {
-
- BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
- OCFS2_HAS_REFCOUNT_FL));
-
+ BUG_ON(!ocfs2_is_refcount_inode(inode));
BUG_ON(!context->refcount_loc);
ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
@@ -261,10 +234,10 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
}
}
- ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1,
- &context->meta_ac,
- &context->data_ac,
- extra_blocks, &credits);
+ ret = ocfs2_lock_meta_allocator_move_extents(inode, &context->et,
+ *len, 1,
+ &context->meta_ac,
+ extra_blocks, &credits);
if (ret) {
mlog_errno(ret);
goto out;
@@ -277,7 +250,7 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
* context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
*/
- mutex_lock(&tl_inode->i_mutex);
+ inode_lock(tl_inode);
if (ocfs2_truncate_log_needs_flush(osb)) {
ret = __ocfs2_flush_truncate_log(osb);
@@ -287,6 +260,21 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
}
}
+ /*
+ * Make sure ocfs2_reserve_cluster is called after
+ * __ocfs2_flush_truncate_log, otherwise, dead lock may happen.
+ *
+ * If ocfs2_reserve_cluster is called
+ * before __ocfs2_flush_truncate_log, dead lock on global bitmap
+ * may happen.
+ *
+ */
+ ret = ocfs2_reserve_clusters(osb, *len, &context->data_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock_mutex;
+ }
+
handle = ocfs2_start_trans(osb, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
@@ -312,6 +300,7 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
if (!partial) {
context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE;
ret = -ENOSPC;
+ need_free = 1;
goto out_commit;
}
}
@@ -336,10 +325,24 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
mlog_errno(ret);
out_commit:
+ if (need_free && context->data_ac) {
+ struct ocfs2_alloc_context *data_ac = context->data_ac;
+
+ if (context->data_ac->ac_which == OCFS2_AC_USE_LOCAL)
+ ocfs2_free_local_alloc_bits(osb, handle, data_ac,
+ new_phys_cpos, new_len);
+ else
+ ocfs2_free_clusters(handle,
+ data_ac->ac_inode,
+ data_ac->ac_bh,
+ ocfs2_clusters_to_blocks(osb->sb, new_phys_cpos),
+ new_len);
+ }
+
ocfs2_commit_trans(osb, handle);
out_unlock_mutex:
- mutex_unlock(&tl_inode->i_mutex);
+ inode_unlock(tl_inode);
if (context->data_ac) {
ocfs2_free_alloc_context(context->data_ac);
@@ -367,7 +370,7 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode,
int *vict_bit,
struct buffer_head **ret_bh)
{
- int ret, i, bits_per_unit = 0;
+ int ret, i, len, bits_per_unit = 0;
u64 blkno;
char namebuf[40];
@@ -378,9 +381,9 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode,
struct ocfs2_dinode *ac_dinode;
struct ocfs2_group_desc *bg;
- ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot);
- ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
- strlen(namebuf), &blkno);
+ len = ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot);
+ ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, len, &blkno);
+
if (ret) {
ret = -ENOENT;
goto out;
@@ -403,7 +406,7 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode,
* 'vict_blkno' was out of the valid range.
*/
if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
- (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) <<
+ (vict_blkno >= ((u64)le32_to_cpu(ac_dinode->id1.bitmap1.i_total) <<
bits_per_unit))) {
ret = -EINVAL;
goto out;
@@ -437,7 +440,7 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode,
bg = (struct ocfs2_group_desc *)gd_bh->b_data;
if (vict_blkno < (le64_to_cpu(bg->bg_blkno) +
- le16_to_cpu(bg->bg_bits))) {
+ (le16_to_cpu(bg->bg_bits) << bits_per_unit))) {
*ret_bh = gd_bh;
*vict_bit = (vict_blkno - blkno) >>
@@ -495,7 +498,7 @@ static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
bg = (struct ocfs2_group_desc *)gd_bh->b_data;
/*
- * moving goal is not allowd to start with a group desc blok(#0 blk)
+ * moving goal is not allowed to start with a group desc blok(#0 blk)
* let's compromise to the latter cluster.
*/
if (range->me_goal == le64_to_cpu(bg->bg_blkno))
@@ -552,6 +555,7 @@ static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
last_free_bits++;
if (last_free_bits == move_len) {
+ i -= move_len;
*goal_bit = i;
*phys_cpos = base_cpos + i;
break;
@@ -561,83 +565,6 @@ static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos);
}
-static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
- handle_t *handle,
- struct buffer_head *di_bh,
- u32 num_bits,
- u16 chain)
-{
- int ret;
- u32 tmp_used;
- struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
- struct ocfs2_chain_list *cl =
- (struct ocfs2_chain_list *) &di->id2.i_chain;
-
- ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
-
- tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
- di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
- le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
- ocfs2_journal_dirty(handle, di_bh);
-
-out:
- return ret;
-}
-
-static inline int ocfs2_block_group_set_bits(handle_t *handle,
- struct inode *alloc_inode,
- struct ocfs2_group_desc *bg,
- struct buffer_head *group_bh,
- unsigned int bit_off,
- unsigned int num_bits)
-{
- int status;
- void *bitmap = bg->bg_bitmap;
- int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
-
- /* All callers get the descriptor via
- * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
- BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
- BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
-
- mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
- num_bits);
-
- if (ocfs2_is_cluster_bitmap(alloc_inode))
- journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
-
- status = ocfs2_journal_access_gd(handle,
- INODE_CACHE(alloc_inode),
- group_bh,
- journal_type);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
- le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
- if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
- ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
- " count %u but claims %u are freed. num_bits %d",
- (unsigned long long)le64_to_cpu(bg->bg_blkno),
- le16_to_cpu(bg->bg_bits),
- le16_to_cpu(bg->bg_free_bits_count), num_bits);
- return -EROFS;
- }
- while (num_bits--)
- ocfs2_set_bit(bit_off++, bitmap);
-
- ocfs2_journal_dirty(handle, group_bh);
-
-bail:
- return status;
-}
-
static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,
u32 len, int ext_flags)
@@ -659,10 +586,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) {
-
- BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
- OCFS2_HAS_REFCOUNT_FL));
-
+ BUG_ON(!ocfs2_is_refcount_inode(inode));
BUG_ON(!context->refcount_loc);
ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
@@ -684,9 +608,10 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
}
}
- ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1,
- &context->meta_ac,
- NULL, extra_blocks, &credits);
+ ret = ocfs2_lock_meta_allocator_move_extents(inode, &context->et,
+ len, 1,
+ &context->meta_ac,
+ extra_blocks, &credits);
if (ret) {
mlog_errno(ret);
goto out;
@@ -698,6 +623,8 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
*/
credits += OCFS2_INODE_UPDATE_CREDITS + 1;
+ inode_lock(tl_inode);
+
/*
* ocfs2_move_extent() didn't reserve any clusters in lock_allocators()
* logic, while we still need to lock the global_bitmap.
@@ -707,24 +634,22 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
if (!gb_inode) {
mlog(ML_ERROR, "unable to get global_bitmap inode\n");
ret = -EIO;
- goto out;
+ goto out_unlock_tl_inode;
}
- mutex_lock(&gb_inode->i_mutex);
+ inode_lock(gb_inode);
ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1);
if (ret) {
mlog_errno(ret);
- goto out_unlock_gb_mutex;
+ goto out_unlock_gb_inode;
}
- mutex_lock(&tl_inode->i_mutex);
-
handle = ocfs2_start_trans(osb, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
mlog_errno(ret);
- goto out_unlock_tl_inode;
+ goto out_unlock;
}
new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos);
@@ -739,7 +664,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
/*
* probe the victim cluster group to find a proper
- * region to fit wanted movement, it even will perfrom
+ * region to fit wanted movement, it even will perform
* a best-effort attempt by compromising to a threshold
* around the goal.
*/
@@ -766,9 +691,12 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
}
ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,
- goal_bit, len);
- if (ret)
+ goal_bit, len, 0, 0);
+ if (ret) {
+ ocfs2_rollback_alloc_dinode_counts(gb_inode, gb_bh, len,
+ le16_to_cpu(gd->bg_chain));
mlog_errno(ret);
+ }
/*
* Here we should write the new page out first if we are
@@ -781,15 +709,14 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
out_commit:
ocfs2_commit_trans(osb, handle);
brelse(gd_bh);
-
-out_unlock_tl_inode:
- mutex_unlock(&tl_inode->i_mutex);
-
+out_unlock:
ocfs2_inode_unlock(gb_inode, 1);
-out_unlock_gb_mutex:
- mutex_unlock(&gb_inode->i_mutex);
+out_unlock_gb_inode:
+ inode_unlock(gb_inode);
brelse(gb_bh);
iput(gb_inode);
+out_unlock_tl_inode:
+ inode_unlock(tl_inode);
out:
if (context->meta_ac) {
@@ -845,7 +772,7 @@ static int __ocfs2_move_extents_range(struct buffer_head *di_bh,
struct ocfs2_move_extents *range = context->range;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- if ((inode->i_size == 0) || (range->me_len == 0))
+ if ((i_size_read(inode) == 0) || (range->me_len == 0))
return 0;
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
@@ -946,6 +873,11 @@ static int __ocfs2_move_extents_range(struct buffer_head *di_bh,
mlog_errno(ret);
goto out;
}
+ /*
+ * Invalidate extent cache after moving/defragging to prevent
+ * stale cached data with outdated extent flags.
+ */
+ ocfs2_extent_map_trunc(inode, cpos);
context->clusters_moved += alloc_size;
next:
@@ -977,13 +909,10 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
struct buffer_head *di_bh = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- if (!inode)
- return -ENOENT;
-
if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
return -EROFS;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/*
* This prevents concurrent writes from other nodes
@@ -1001,7 +930,7 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
}
/*
- * rememer ip_xattr_sem also needs to be held if necessary
+ * remember ip_xattr_sem also needs to be held if necessary
*/
down_write(&OCFS2_I(inode)->ip_alloc_sem);
@@ -1031,9 +960,10 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
}
di = (struct ocfs2_dinode *)di_bh->b_data;
- inode->i_ctime = CURRENT_TIME;
- di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
- di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ inode_set_ctime_current(inode);
+ di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+ di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_journal_dirty(handle, di_bh);
@@ -1046,7 +976,7 @@ out_inode_unlock:
out_rw_unlock:
ocfs2_rw_unlock(inode, 1);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return status;
}
@@ -1066,8 +996,10 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
if (status)
return status;
- if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE))
+ if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) {
+ status = -EPERM;
goto out_drop;
+ }
if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
status = -EPERM;
@@ -1089,26 +1021,35 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
goto out_free;
}
- if (range.me_start > i_size_read(inode))
+ if (range.me_start > i_size_read(inode)) {
+ status = -EINVAL;
goto out_free;
+ }
if (range.me_start + range.me_len > i_size_read(inode))
range.me_len = i_size_read(inode) - range.me_start;
context->range = &range;
+ /*
+ * ok, the default threshold for the defragmentation
+ * is 1M, since our maximum clustersize was 1M also.
+ * any thought?
+ */
+ if (!range.me_threshold)
+ range.me_threshold = 1024 * 1024;
+
+ if (range.me_threshold > i_size_read(inode))
+ range.me_threshold = i_size_read(inode);
+
+ if (range.me_flags & ~(OCFS2_MOVE_EXT_FL_AUTO_DEFRAG |
+ OCFS2_MOVE_EXT_FL_PART_DEFRAG)) {
+ status = -EINVAL;
+ goto out_free;
+ }
+
if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) {
context->auto_defrag = 1;
- /*
- * ok, the default theshold for the defragmentation
- * is 1M, since our maximum clustersize was 1M also.
- * any thought?
- */
- if (!range.me_threshold)
- range.me_threshold = 1024 * 1024;
-
- if (range.me_threshold > i_size_read(inode))
- range.me_threshold = i_size_read(inode);
if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG)
context->partial = 1;
diff --git a/fs/ocfs2/move_extents.h b/fs/ocfs2/move_extents.h
index 4e143e811441..987f9e559f30 100644
--- a/fs/ocfs2/move_extents.h
+++ b/fs/ocfs2/move_extents.h
@@ -1,18 +1,8 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
* move_extents.h
*
* Copyright (C) 2011 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#ifndef OCFS2_MOVE_EXTENTS_H
#define OCFS2_MOVE_EXTENTS_H
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index be3f8676a438..c90b254da75e 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1,6 +1,5 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* namei.c
*
* Create and rename file, directory, symlinks
@@ -19,21 +18,6 @@
* linux/fs/minix/dir.c
*
* Copyright (C) 1991, 1992 Linux Torvalds
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
@@ -41,6 +25,7 @@
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/quotaops.h>
+#include <linux/iversion.h>
#include <cluster/masklog.h>
@@ -63,6 +48,7 @@
#include "xattr.h"
#include "acl.h"
#include "ocfs2_trace.h"
+#include "ioctl.h"
#include "buffer_head_io.h"
@@ -79,7 +65,8 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
struct inode **ret_orphan_dir,
u64 blkno,
char *name,
- struct ocfs2_dir_lookup_result *lookup);
+ struct ocfs2_dir_lookup_result *lookup,
+ bool dio);
static int ocfs2_orphan_add(struct ocfs2_super *osb,
handle_t *handle,
@@ -87,13 +74,22 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
struct buffer_head *fe_bh,
char *name,
struct ocfs2_dir_lookup_result *lookup,
- struct inode *orphan_dir_inode);
+ struct inode *orphan_dir_inode,
+ bool dio);
static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
handle_t *handle,
struct inode *inode,
const char *symname);
+static int ocfs2_double_lock(struct ocfs2_super *osb,
+ struct buffer_head **bh1,
+ struct inode *inode1,
+ struct buffer_head **bh2,
+ struct inode *inode2,
+ int rename);
+
+static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2);
/* An orphan dir name is an 8 byte value, printed as a hex string */
#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
@@ -146,6 +142,8 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
bail_add:
ret = d_splice_alias(inode, dentry);
+ if (IS_ERR(ret))
+ goto bail_unlock;
if (inode) {
/*
@@ -165,8 +163,9 @@ bail_add:
OCFS2_I(dir)->ip_blkno);
if (status) {
mlog_errno(status);
+ if (ret)
+ dput(ret);
ret = ERR_PTR(status);
- goto bail_unlock;
}
} else
ocfs2_dentry_attach_gen(dentry);
@@ -188,11 +187,12 @@ bail:
static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode)
{
struct inode *inode;
+ int status;
inode = new_inode(dir->i_sb);
if (!inode) {
mlog(ML_ERROR, "new_inode failed!\n");
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
/* populate as many fields early on as possible - many of
@@ -200,12 +200,34 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode)
* callers. */
if (S_ISDIR(mode))
set_nlink(inode, 2);
- inode_init_owner(inode, dir, mode);
- dquot_initialize(inode);
+ mode = mode_strip_sgid(&nop_mnt_idmap, dir, mode);
+ inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
+ status = dquot_initialize(inode);
+ if (status) {
+ iput(inode);
+ return ERR_PTR(status);
+ }
+
return inode;
}
-static int ocfs2_mknod(struct inode *dir,
+static void ocfs2_cleanup_add_entry_failure(struct ocfs2_super *osb,
+ struct dentry *dentry, struct inode *inode)
+{
+ struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
+
+ ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
+ ocfs2_lock_res_free(&dl->dl_lockres);
+ BUG_ON(dl->dl_count != 1);
+ spin_lock(&dentry_attach_lock);
+ dentry->d_fsdata = NULL;
+ spin_unlock(&dentry_attach_lock);
+ kfree(dl);
+ iput(inode);
+}
+
+static int ocfs2_mknod(struct mnt_idmap *idmap,
+ struct inode *dir,
struct dentry *dentry,
umode_t mode,
dev_t dev)
@@ -215,6 +237,7 @@ static int ocfs2_mknod(struct inode *dir,
handle_t *handle = NULL;
struct ocfs2_super *osb;
struct ocfs2_dinode *dirfe;
+ struct ocfs2_dinode *fe = NULL;
struct buffer_head *new_fe_bh = NULL;
struct inode *inode = NULL;
struct ocfs2_alloc_context *inode_ac = NULL;
@@ -224,18 +247,24 @@ static int ocfs2_mknod(struct inode *dir,
int want_meta = 0;
int xattr_credits = 0;
struct ocfs2_security_xattr_info si = {
+ .name = NULL,
.enable = 1,
};
int did_quota_inode = 0;
struct ocfs2_dir_lookup_result lookup = { NULL, };
sigset_t oldset;
int did_block_signals = 0;
+ struct ocfs2_dentry_lock *dl = NULL;
trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name,
(unsigned long long)OCFS2_I(dir)->ip_blkno,
(unsigned long)dev, mode);
- dquot_initialize(dir);
+ status = dquot_initialize(dir);
+ if (status) {
+ mlog_errno(status);
+ return status;
+ }
/* get our super block */
osb = OCFS2_SB(dir->i_sb);
@@ -282,8 +311,9 @@ static int ocfs2_mknod(struct inode *dir,
}
inode = ocfs2_get_init_inode(dir, mode);
- if (!inode) {
- status = -ENOMEM;
+ if (IS_ERR(inode)) {
+ status = PTR_ERR(inode);
+ inode = NULL;
mlog_errno(status);
goto leave;
}
@@ -359,6 +389,7 @@ static int ocfs2_mknod(struct inode *dir,
goto leave;
}
+ fe = (struct ocfs2_dinode *) new_fe_bh->b_data;
if (S_ISDIR(mode)) {
status = ocfs2_fill_new_dir(osb, handle, dir, inode,
new_fe_bh, data_ac, meta_ac);
@@ -380,10 +411,11 @@ static int ocfs2_mknod(struct inode *dir,
}
status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh,
- meta_ac, data_ac);
+ meta_ac, data_ac);
+
if (status < 0) {
mlog_errno(status);
- goto leave;
+ goto roll_back;
}
if (si.enable) {
@@ -391,7 +423,7 @@ static int ocfs2_mknod(struct inode *dir,
meta_ac, data_ac);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ goto roll_back;
}
}
@@ -404,25 +436,37 @@ static int ocfs2_mknod(struct inode *dir,
OCFS2_I(dir)->ip_blkno);
if (status) {
mlog_errno(status);
- goto leave;
+ goto roll_back;
}
+ dl = dentry->d_fsdata;
+
status = ocfs2_add_entry(handle, dentry, inode,
OCFS2_I(inode)->ip_blkno, parent_fe_bh,
&lookup);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ goto roll_back;
}
insert_inode_hash(inode);
d_instantiate(dentry, inode);
status = 0;
+
+roll_back:
+ if (status < 0 && S_ISDIR(mode)) {
+ ocfs2_add_links_count(dirfe, -1);
+ drop_nlink(dir);
+ }
+
leave:
if (status < 0 && did_quota_inode)
dquot_free_inode(inode);
- if (handle)
+ if (handle) {
+ if (status < 0 && fe)
+ ocfs2_set_links_count(fe, 0);
ocfs2_commit_trans(osb, handle);
+ }
ocfs2_inode_unlock(dir, 1);
if (did_block_signals)
@@ -430,7 +474,6 @@ leave:
brelse(new_fe_bh);
brelse(parent_fe_bh);
- kfree(si.name);
kfree(si.value);
ocfs2_free_dir_lookup_result(&lookup);
@@ -445,11 +488,14 @@ leave:
ocfs2_free_alloc_context(meta_ac);
/*
- * We should call iput after the i_mutex of the bitmap been
+ * We should call iput after the i_rwsem of the bitmap been
* unlocked in ocfs2_free_alloc_context, or the
* ocfs2_delete_inode will mutex_lock again.
*/
if ((status < 0) && inode) {
+ if (dl)
+ ocfs2_cleanup_add_entry_failure(osb, dentry, inode);
+
OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
clear_nlink(inode);
iput(inode);
@@ -465,7 +511,6 @@ static int __ocfs2_mknod_locked(struct inode *dir,
struct inode *inode,
dev_t dev,
struct buffer_head **new_fe_bh,
- struct buffer_head *parent_fe_bh,
handle_t *handle,
struct ocfs2_alloc_context *inode_ac,
u64 fe_blkno, u64 suballoc_loc, u16 suballoc_bit)
@@ -475,6 +520,8 @@ static int __ocfs2_mknod_locked(struct inode *dir,
struct ocfs2_dinode *fe = NULL;
struct ocfs2_extent_list *fel;
u16 feat;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct timespec64 ts;
*new_fe_bh = NULL;
@@ -482,14 +529,14 @@ static int __ocfs2_mknod_locked(struct inode *dir,
* these are used by the support functions here and in
* callers. */
inode->i_ino = ino_from_blkno(osb->sb, fe_blkno);
- OCFS2_I(inode)->ip_blkno = fe_blkno;
+ oi->ip_blkno = fe_blkno;
spin_lock(&osb->osb_lock);
inode->i_generation = osb->s_next_generation++;
spin_unlock(&osb->osb_lock);
*new_fe_bh = sb_getblk(osb->sb, fe_blkno);
if (!*new_fe_bh) {
- status = -EIO;
+ status = -ENOMEM;
mlog_errno(status);
goto leave;
}
@@ -523,10 +570,11 @@ static int __ocfs2_mknod_locked(struct inode *dir,
fe->i_last_eb_blk = 0;
strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
fe->i_flags |= cpu_to_le32(OCFS2_VALID_FL);
+ ktime_get_coarse_real_ts64(&ts);
fe->i_atime = fe->i_ctime = fe->i_mtime =
- cpu_to_le64(CURRENT_TIME.tv_sec);
+ cpu_to_le64(ts.tv_sec);
fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec =
- cpu_to_le32(CURRENT_TIME.tv_nsec);
+ cpu_to_le32(ts.tv_nsec);
fe->i_dtime = 0;
/*
@@ -556,8 +604,7 @@ static int __ocfs2_mknod_locked(struct inode *dir,
mlog_errno(status);
}
- status = 0; /* error in ocfs2_create_new_inode_locks is not
- * critical */
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
leave:
if (status < 0) {
@@ -596,26 +643,28 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
}
return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
- parent_fe_bh, handle, inode_ac,
- fe_blkno, suballoc_loc, suballoc_bit);
+ handle, inode_ac, fe_blkno,
+ suballoc_loc, suballoc_bit);
}
-static int ocfs2_mkdir(struct inode *dir,
- struct dentry *dentry,
- umode_t mode)
+static struct dentry *ocfs2_mkdir(struct mnt_idmap *idmap,
+ struct inode *dir,
+ struct dentry *dentry,
+ umode_t mode)
{
int ret;
trace_ocfs2_mkdir(dir, dentry, dentry->d_name.len, dentry->d_name.name,
OCFS2_I(dir)->ip_blkno, mode);
- ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0);
+ ret = ocfs2_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0);
if (ret)
mlog_errno(ret);
- return ret;
+ return ERR_PTR(ret);
}
-static int ocfs2_create(struct inode *dir,
+static int ocfs2_create(struct mnt_idmap *idmap,
+ struct inode *dir,
struct dentry *dentry,
umode_t mode,
bool excl)
@@ -624,7 +673,7 @@ static int ocfs2_create(struct inode *dir,
trace_ocfs2_create(dir, dentry, dentry->d_name.len, dentry->d_name.name,
(unsigned long long)OCFS2_I(dir)->ip_blkno, mode);
- ret = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0);
+ ret = ocfs2_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFREG, 0);
if (ret)
mlog_errno(ret);
@@ -636,14 +685,17 @@ static int ocfs2_link(struct dentry *old_dentry,
struct dentry *dentry)
{
handle_t *handle;
- struct inode *inode = old_dentry->d_inode;
+ struct inode *inode = d_inode(old_dentry);
+ struct inode *old_dir = d_inode(old_dentry->d_parent);
int err;
struct buffer_head *fe_bh = NULL;
+ struct buffer_head *old_dir_bh = NULL;
struct buffer_head *parent_fe_bh = NULL;
struct ocfs2_dinode *fe = NULL;
struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
struct ocfs2_dir_lookup_result lookup = { NULL, };
sigset_t oldset;
+ u64 old_de_ino;
trace_ocfs2_link((unsigned long long)OCFS2_I(inode)->ip_blkno,
old_dentry->d_name.len, old_dentry->d_name.name,
@@ -652,20 +704,54 @@ static int ocfs2_link(struct dentry *old_dentry,
if (S_ISDIR(inode->i_mode))
return -EPERM;
- dquot_initialize(dir);
+ err = dquot_initialize(dir);
+ if (err) {
+ mlog_errno(err);
+ return err;
+ }
- err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT);
+ err = ocfs2_double_lock(osb, &old_dir_bh, old_dir,
+ &parent_fe_bh, dir, 0);
if (err < 0) {
if (err != -ENOENT)
mlog_errno(err);
return err;
}
+ /* make sure both dirs have bhs
+ * get an extra ref on old_dir_bh if old==new */
+ if (!parent_fe_bh) {
+ if (old_dir_bh) {
+ parent_fe_bh = old_dir_bh;
+ get_bh(parent_fe_bh);
+ } else {
+ mlog(ML_ERROR, "%s: no old_dir_bh!\n", osb->uuid_str);
+ err = -EIO;
+ goto out;
+ }
+ }
+
if (!dir->i_nlink) {
err = -ENOENT;
goto out;
}
+ err = ocfs2_lookup_ino_from_name(old_dir, old_dentry->d_name.name,
+ old_dentry->d_name.len, &old_de_ino);
+ if (err) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ /*
+ * Check whether another node removed the source inode while we
+ * were in the vfs.
+ */
+ if (old_de_ino != OCFS2_I(inode)->ip_blkno) {
+ err = -ENOENT;
+ goto out;
+ }
+
err = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
dentry->d_name.len);
if (err)
@@ -711,10 +797,11 @@ static int ocfs2_link(struct dentry *old_dentry,
}
inc_nlink(inode);
- inode->i_ctime = CURRENT_TIME;
+ inode_set_ctime_current(inode);
ocfs2_set_links_count(fe, inode->i_nlink);
- fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
- fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ fe->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+ fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_journal_dirty(handle, fe_bh);
err = ocfs2_add_entry(handle, dentry, inode,
@@ -743,10 +830,11 @@ out_unlock_inode:
ocfs2_inode_unlock(inode, 1);
out:
- ocfs2_inode_unlock(dir, 1);
+ ocfs2_double_unlock(old_dir, dir);
brelse(fe_bh);
brelse(parent_fe_bh);
+ brelse(old_dir_bh);
ocfs2_free_dir_lookup_result(&lookup);
@@ -792,7 +880,7 @@ static int ocfs2_unlink(struct inode *dir,
int status;
int child_locked = 0;
bool is_unlinkable = false;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct inode *orphan_dir = NULL;
struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
u64 blkno;
@@ -809,9 +897,13 @@ static int ocfs2_unlink(struct inode *dir,
(unsigned long long)OCFS2_I(dir)->ip_blkno,
(unsigned long long)OCFS2_I(inode)->ip_blkno);
- dquot_initialize(dir);
+ status = dquot_initialize(dir);
+ if (status) {
+ mlog_errno(status);
+ return status;
+ }
- BUG_ON(dentry->d_parent->d_inode != dir);
+ BUG_ON(d_inode(dentry->d_parent) != dir);
if (inode == osb->root_inode)
return -EPERM;
@@ -869,7 +961,8 @@ static int ocfs2_unlink(struct inode *dir,
if (ocfs2_inode_is_unlinkable(inode)) {
status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
OCFS2_I(inode)->ip_blkno,
- orphan_name, &orphan_insert);
+ orphan_name, &orphan_insert,
+ false);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -905,9 +998,10 @@ static int ocfs2_unlink(struct inode *dir,
drop_nlink(inode);
drop_nlink(inode);
ocfs2_set_links_count(fe, inode->i_nlink);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_journal_dirty(handle, fe_bh);
- dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
if (S_ISDIR(inode->i_mode))
drop_nlink(dir);
@@ -921,7 +1015,7 @@ static int ocfs2_unlink(struct inode *dir,
if (is_unlinkable) {
status = ocfs2_orphan_add(osb, handle, inode, fe_bh,
- orphan_name, &orphan_insert, orphan_dir);
+ orphan_name, &orphan_insert, orphan_dir, false);
if (status < 0)
mlog_errno(status);
}
@@ -930,45 +1024,104 @@ leave:
if (handle)
ocfs2_commit_trans(osb, handle);
- if (child_locked)
- ocfs2_inode_unlock(inode, 1);
-
- ocfs2_inode_unlock(dir, 1);
-
if (orphan_dir) {
/* This was locked for us in ocfs2_prepare_orphan_dir() */
ocfs2_inode_unlock(orphan_dir, 1);
- mutex_unlock(&orphan_dir->i_mutex);
+ inode_unlock(orphan_dir);
iput(orphan_dir);
}
+ if (child_locked)
+ ocfs2_inode_unlock(inode, 1);
+
+ ocfs2_inode_unlock(dir, 1);
+
brelse(fe_bh);
brelse(parent_node_bh);
ocfs2_free_dir_lookup_result(&orphan_insert);
ocfs2_free_dir_lookup_result(&lookup);
- if (status && (status != -ENOTEMPTY))
+ if (status && (status != -ENOTEMPTY) && (status != -ENOENT))
mlog_errno(status);
return status;
}
+static int ocfs2_check_if_ancestor(struct ocfs2_super *osb,
+ u64 src_inode_no, u64 dest_inode_no)
+{
+ int ret = 0, i = 0;
+ u64 parent_inode_no = 0;
+ u64 child_inode_no = src_inode_no;
+ struct inode *child_inode;
+
+#define MAX_LOOKUP_TIMES 32
+ while (1) {
+ child_inode = ocfs2_iget(osb, child_inode_no, 0, 0);
+ if (IS_ERR(child_inode)) {
+ ret = PTR_ERR(child_inode);
+ break;
+ }
+
+ ret = ocfs2_inode_lock(child_inode, NULL, 0);
+ if (ret < 0) {
+ iput(child_inode);
+ if (ret != -ENOENT)
+ mlog_errno(ret);
+ break;
+ }
+
+ ret = ocfs2_lookup_ino_from_name(child_inode, "..", 2,
+ &parent_inode_no);
+ ocfs2_inode_unlock(child_inode, 0);
+ iput(child_inode);
+ if (ret < 0) {
+ ret = -ENOENT;
+ break;
+ }
+
+ if (parent_inode_no == dest_inode_no) {
+ ret = 1;
+ break;
+ }
+
+ if (parent_inode_no == osb->root_inode->i_ino) {
+ ret = 0;
+ break;
+ }
+
+ child_inode_no = parent_inode_no;
+
+ if (++i >= MAX_LOOKUP_TIMES) {
+ mlog_ratelimited(ML_NOTICE, "max lookup times reached, "
+ "filesystem may have nested directories, "
+ "src inode: %llu, dest inode: %llu.\n",
+ (unsigned long long)src_inode_no,
+ (unsigned long long)dest_inode_no);
+ ret = 0;
+ break;
+ }
+ }
+
+ return ret;
+}
+
/*
- * The only place this should be used is rename!
+ * The only place this should be used is rename and link!
* if they have the same id, then the 1st one is the only one locked.
*/
static int ocfs2_double_lock(struct ocfs2_super *osb,
struct buffer_head **bh1,
struct inode *inode1,
struct buffer_head **bh2,
- struct inode *inode2)
+ struct inode *inode2,
+ int rename)
{
int status;
+ int inode1_is_ancestor, inode2_is_ancestor;
struct ocfs2_inode_info *oi1 = OCFS2_I(inode1);
struct ocfs2_inode_info *oi2 = OCFS2_I(inode2);
- struct buffer_head **tmpbh;
- struct inode *tmpinode;
trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
(unsigned long long)oi2->ip_blkno);
@@ -978,21 +1131,33 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
if (*bh2)
*bh2 = NULL;
- /* we always want to lock the one with the lower lockid first. */
+ /* we always want to lock the one with the lower lockid first.
+ * and if they are nested, we lock ancestor first */
if (oi1->ip_blkno != oi2->ip_blkno) {
- if (oi1->ip_blkno < oi2->ip_blkno) {
- /* switch id1 and id2 around */
- tmpbh = bh2;
- bh2 = bh1;
- bh1 = tmpbh;
+ inode1_is_ancestor = ocfs2_check_if_ancestor(osb, oi2->ip_blkno,
+ oi1->ip_blkno);
+ if (inode1_is_ancestor < 0) {
+ status = inode1_is_ancestor;
+ goto bail;
+ }
- tmpinode = inode2;
- inode2 = inode1;
- inode1 = tmpinode;
+ inode2_is_ancestor = ocfs2_check_if_ancestor(osb, oi1->ip_blkno,
+ oi2->ip_blkno);
+ if (inode2_is_ancestor < 0) {
+ status = inode2_is_ancestor;
+ goto bail;
+ }
+
+ if ((inode1_is_ancestor == 1) ||
+ (oi1->ip_blkno < oi2->ip_blkno &&
+ inode2_is_ancestor == 0)) {
+ /* switch id1 and id2 around */
+ swap(bh2, bh1);
+ swap(inode2, inode1);
}
/* lock id2 */
status = ocfs2_inode_lock_nested(inode2, bh2, 1,
- OI_LS_RENAME1);
+ rename == 1 ? OI_LS_RENAME1 : OI_LS_PARENT);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@@ -1001,7 +1166,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
}
/* lock id1 */
- status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_RENAME2);
+ status = ocfs2_inode_lock_nested(inode1, bh1, 1,
+ rename == 1 ? OI_LS_RENAME2 : OI_LS_PARENT);
if (status < 0) {
/*
* An error return must mean that no cluster locks
@@ -1018,8 +1184,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
}
trace_ocfs2_double_lock_end(
- (unsigned long long)OCFS2_I(inode1)->ip_blkno,
- (unsigned long long)OCFS2_I(inode2)->ip_blkno);
+ (unsigned long long)oi1->ip_blkno,
+ (unsigned long long)oi2->ip_blkno);
bail:
if (status)
@@ -1035,15 +1201,17 @@ static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2)
ocfs2_inode_unlock(inode2, 1);
}
-static int ocfs2_rename(struct inode *old_dir,
+static int ocfs2_rename(struct mnt_idmap *idmap,
+ struct inode *old_dir,
struct dentry *old_dentry,
struct inode *new_dir,
- struct dentry *new_dentry)
+ struct dentry *new_dentry,
+ unsigned int flags)
{
int status = 0, rename_lock = 0, parents_locked = 0, target_exists = 0;
int old_child_locked = 0, new_child_locked = 0, update_dot_dot = 0;
- struct inode *old_inode = old_dentry->d_inode;
- struct inode *new_inode = new_dentry->d_inode;
+ struct inode *old_inode = d_inode(old_dentry);
+ struct inode *new_inode = d_inode(new_dentry);
struct inode *orphan_dir = NULL;
struct ocfs2_dinode *newfe = NULL;
char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
@@ -1061,6 +1229,10 @@ static int ocfs2_rename(struct inode *old_dir,
struct ocfs2_dir_lookup_result old_entry_lookup = { NULL, };
struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
struct ocfs2_dir_lookup_result target_insert = { NULL, };
+ bool should_add_orphan = false;
+
+ if (flags)
+ return -EINVAL;
/* At some point it might be nice to break this function up a
* bit. */
@@ -1069,8 +1241,16 @@ static int ocfs2_rename(struct inode *old_dir,
old_dentry->d_name.len, old_dentry->d_name.name,
new_dentry->d_name.len, new_dentry->d_name.name);
- dquot_initialize(old_dir);
- dquot_initialize(new_dir);
+ status = dquot_initialize(old_dir);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+ status = dquot_initialize(new_dir);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
osb = OCFS2_SB(old_dir->i_sb);
@@ -1097,17 +1277,37 @@ static int ocfs2_rename(struct inode *old_dir,
goto bail;
}
rename_lock = 1;
+
+ /* here we cannot guarantee the inodes haven't just been
+ * changed, so check if they are nested again */
+ status = ocfs2_check_if_ancestor(osb, new_dir->i_ino,
+ old_inode->i_ino);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ } else if (status == 1) {
+ status = -EPERM;
+ trace_ocfs2_rename_not_permitted(
+ (unsigned long long)old_inode->i_ino,
+ (unsigned long long)new_dir->i_ino);
+ goto bail;
+ }
}
/* if old and new are the same, this'll just do one lock. */
status = ocfs2_double_lock(osb, &old_dir_bh, old_dir,
- &new_dir_bh, new_dir);
+ &new_dir_bh, new_dir, 1);
if (status < 0) {
mlog_errno(status);
goto bail;
}
parents_locked = 1;
+ if (!new_dir->i_nlink) {
+ status = -EACCES;
+ goto bail;
+ }
+
/* make sure both dirs have bhs
* get an extra ref on old_dir_bh if old==new */
if (!new_dir_bh) {
@@ -1142,7 +1342,7 @@ static int ocfs2_rename(struct inode *old_dir,
goto bail;
}
- if (S_ISDIR(old_inode->i_mode)) {
+ if (S_ISDIR(old_inode->i_mode) && new_dir != old_dir) {
u64 old_inode_parent;
update_dot_dot = 1;
@@ -1159,8 +1359,7 @@ static int ocfs2_rename(struct inode *old_dir,
goto bail;
}
- if (!new_inode && new_dir != old_dir &&
- new_dir->i_nlink >= ocfs2_link_max(osb)) {
+ if (!new_inode && new_dir->i_nlink >= ocfs2_link_max(osb)) {
status = -EMLINK;
goto bail;
}
@@ -1256,20 +1455,22 @@ static int ocfs2_rename(struct inode *old_dir,
newfe = (struct ocfs2_dinode *) newfe_bh->b_data;
trace_ocfs2_rename_over_existing(
- (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ?
- (unsigned long long)newfe_bh->b_blocknr : 0ULL);
+ (unsigned long long)newfe_blkno, newfe_bh,
+ (unsigned long long)newfe_bh->b_blocknr);
if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) {
status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
OCFS2_I(new_inode)->ip_blkno,
- orphan_name, &orphan_insert);
+ orphan_name, &orphan_insert,
+ false);
if (status < 0) {
mlog_errno(status);
goto bail;
}
+ should_add_orphan = true;
}
} else {
- BUG_ON(new_dentry->d_parent->d_inode != new_dir);
+ BUG_ON(d_inode(new_dentry->d_parent) != new_dir);
status = ocfs2_check_dir_for_entry(new_dir,
new_dentry->d_name.name,
@@ -1311,17 +1512,6 @@ static int ocfs2_rename(struct inode *old_dir,
goto bail;
}
- if (S_ISDIR(new_inode->i_mode) ||
- (ocfs2_read_links_count(newfe) == 1)) {
- status = ocfs2_orphan_add(osb, handle, new_inode,
- newfe_bh, orphan_name,
- &orphan_insert, orphan_dir);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- }
-
/* change the dirent to point to the correct inode */
status = ocfs2_update_entry(new_dir, handle, &target_lookup_res,
old_inode);
@@ -1329,21 +1519,34 @@ static int ocfs2_rename(struct inode *old_dir,
mlog_errno(status);
goto bail;
}
- new_dir->i_version++;
+ inode_inc_iversion(new_dir);
if (S_ISDIR(new_inode->i_mode))
ocfs2_set_links_count(newfe, 0);
else
ocfs2_add_links_count(newfe, -1);
ocfs2_journal_dirty(handle, newfe_bh);
+ if (should_add_orphan) {
+ status = ocfs2_orphan_add(osb, handle, new_inode,
+ newfe_bh, orphan_name,
+ &orphan_insert, orphan_dir, false);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
} else {
/* if the name was not found in new_dir, add it now */
status = ocfs2_add_entry(handle, new_dentry, old_inode,
OCFS2_I(old_inode)->ip_blkno,
new_dir_bh, &target_insert);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
}
- old_inode->i_ctime = CURRENT_TIME;
+ inode_set_ctime_current(old_inode);
mark_inode_dirty(old_inode);
status = ocfs2_journal_access_di(handle, INODE_CACHE(old_inode),
@@ -1352,8 +1555,8 @@ static int ocfs2_rename(struct inode *old_dir,
if (status >= 0) {
old_di = (struct ocfs2_dinode *) old_inode_bh->b_data;
- old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec);
- old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec);
+ old_di->i_ctime = cpu_to_le64(inode_get_ctime_sec(old_inode));
+ old_di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(old_inode));
ocfs2_journal_dirty(handle, old_inode_bh);
} else
mlog_errno(status);
@@ -1368,24 +1571,44 @@ static int ocfs2_rename(struct inode *old_dir,
status = ocfs2_find_entry(old_dentry->d_name.name,
old_dentry->d_name.len, old_dir,
&old_entry_lookup);
- if (status)
+ if (status) {
+ if (!is_journal_aborted(osb->journal->j_journal)) {
+ ocfs2_error(osb->sb, "new entry %.*s is added, but old entry %.*s "
+ "is not deleted.",
+ new_dentry->d_name.len, new_dentry->d_name.name,
+ old_dentry->d_name.len, old_dentry->d_name.name);
+ }
goto bail;
+ }
status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup);
if (status < 0) {
mlog_errno(status);
+ if (!is_journal_aborted(osb->journal->j_journal)) {
+ ocfs2_error(osb->sb, "new entry %.*s is added, but old entry %.*s "
+ "is not deleted.",
+ new_dentry->d_name.len, new_dentry->d_name.name,
+ old_dentry->d_name.len, old_dentry->d_name.name);
+ }
goto bail;
}
if (new_inode) {
drop_nlink(new_inode);
- new_inode->i_ctime = CURRENT_TIME;
+ inode_set_ctime_current(new_inode);
}
- old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
+ inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir));
if (update_dot_dot) {
status = ocfs2_update_entry(old_inode, handle,
&old_inode_dot_dot_res, new_dir);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
+ if (S_ISDIR(old_inode->i_mode)) {
drop_nlink(old_dir);
if (new_inode) {
drop_nlink(new_inode);
@@ -1403,7 +1626,8 @@ static int ocfs2_rename(struct inode *old_dir,
if (old_dir != new_dir) {
/* Keep the same times on both directories.*/
- new_dir->i_ctime = new_dir->i_mtime = old_dir->i_ctime;
+ inode_set_mtime_to_ts(new_dir,
+ inode_set_ctime_to_ts(new_dir, inode_get_ctime(old_dir)));
/*
* This will also pick up the i_nlink change from the
@@ -1424,6 +1648,10 @@ static int ocfs2_rename(struct inode *old_dir,
INODE_CACHE(old_dir),
old_dir_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
ocfs2_set_links_count(fe, old_dir->i_nlink);
ocfs2_journal_dirty(handle, old_dir_bh);
@@ -1432,33 +1660,32 @@ static int ocfs2_rename(struct inode *old_dir,
ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir);
status = 0;
bail:
- if (rename_lock)
- ocfs2_rename_unlock(osb);
-
if (handle)
ocfs2_commit_trans(osb, handle);
- if (parents_locked)
- ocfs2_double_unlock(old_dir, new_dir);
-
- if (old_child_locked)
- ocfs2_inode_unlock(old_inode, 1);
-
- if (new_child_locked)
- ocfs2_inode_unlock(new_inode, 1);
-
if (orphan_dir) {
/* This was locked for us in ocfs2_prepare_orphan_dir() */
ocfs2_inode_unlock(orphan_dir, 1);
- mutex_unlock(&orphan_dir->i_mutex);
+ inode_unlock(orphan_dir);
iput(orphan_dir);
}
+ if (new_child_locked)
+ ocfs2_inode_unlock(new_inode, 1);
+
+ if (old_child_locked)
+ ocfs2_inode_unlock(old_inode, 1);
+
+ if (parents_locked)
+ ocfs2_double_unlock(old_dir, new_dir);
+
+ if (rename_lock)
+ ocfs2_rename_unlock(osb);
+
if (new_inode)
sync_mapping_buffers(old_inode->i_mapping);
- if (new_inode)
- iput(new_inode);
+ iput(new_inode);
ocfs2_free_dir_lookup_result(&target_lookup_res);
ocfs2_free_dir_lookup_result(&old_entry_lookup);
@@ -1579,7 +1806,8 @@ bail:
return status;
}
-static int ocfs2_symlink(struct inode *dir,
+static int ocfs2_symlink(struct mnt_idmap *idmap,
+ struct inode *dir,
struct dentry *dentry,
const char *symname)
{
@@ -1599,17 +1827,23 @@ static int ocfs2_symlink(struct inode *dir,
int want_clusters = 0;
int xattr_credits = 0;
struct ocfs2_security_xattr_info si = {
+ .name = NULL,
.enable = 1,
};
int did_quota = 0, did_quota_inode = 0;
struct ocfs2_dir_lookup_result lookup = { NULL, };
sigset_t oldset;
int did_block_signals = 0;
+ struct ocfs2_dentry_lock *dl = NULL;
trace_ocfs2_symlink_begin(dir, dentry, symname,
dentry->d_name.len, dentry->d_name.name);
- dquot_initialize(dir);
+ status = dquot_initialize(dir);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
sb = dir->i_sb;
osb = OCFS2_SB(sb);
@@ -1654,8 +1888,9 @@ static int ocfs2_symlink(struct inode *dir,
}
inode = ocfs2_get_init_inode(dir, S_IFLNK | S_IRWXUGO);
- if (!inode) {
- status = -ENOMEM;
+ if (IS_ERR(inode)) {
+ status = PTR_ERR(inode);
+ inode = NULL;
mlog_errno(status);
goto bail;
}
@@ -1726,6 +1961,7 @@ static int ocfs2_symlink(struct inode *dir,
inode->i_rdev = 0;
newsize = l - 1;
inode->i_op = &ocfs2_symlink_inode_operations;
+ inode_nohighmem(inode);
if (l > ocfs2_fast_symlink_chars(sb)) {
u32 offset = 0;
@@ -1793,6 +2029,8 @@ static int ocfs2_symlink(struct inode *dir,
goto bail;
}
+ dl = dentry->d_fsdata;
+
status = ocfs2_add_entry(handle, dentry, inode,
le64_to_cpu(fe->i_blkno), parent_fe_bh,
&lookup);
@@ -1809,8 +2047,11 @@ bail:
ocfs2_clusters_to_bytes(osb->sb, 1));
if (status < 0 && did_quota_inode)
dquot_free_inode(inode);
- if (handle)
+ if (handle) {
+ if (status < 0 && fe)
+ ocfs2_set_links_count(fe, 0);
ocfs2_commit_trans(osb, handle);
+ }
ocfs2_inode_unlock(dir, 1);
if (did_block_signals)
@@ -1818,7 +2059,6 @@ bail:
brelse(new_fe_bh);
brelse(parent_fe_bh);
- kfree(si.name);
kfree(si.value);
ocfs2_free_dir_lookup_result(&lookup);
if (inode_ac)
@@ -1828,6 +2068,9 @@ bail:
if (xattr_ac)
ocfs2_free_alloc_context(xattr_ac);
if ((status < 0) && inode) {
+ if (dl)
+ ocfs2_cleanup_add_entry_failure(osb, dentry, inode);
+
OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
clear_nlink(inode);
iput(inode);
@@ -1885,11 +2128,11 @@ static int ocfs2_lookup_lock_orphan_dir(struct ocfs2_super *osb,
return ret;
}
- mutex_lock(&orphan_dir_inode->i_mutex);
+ inode_lock(orphan_dir_inode);
ret = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
if (ret < 0) {
- mutex_unlock(&orphan_dir_inode->i_mutex);
+ inode_unlock(orphan_dir_inode);
iput(orphan_dir_inode);
mlog_errno(ret);
@@ -1906,12 +2149,28 @@ static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode,
struct buffer_head *orphan_dir_bh,
u64 blkno,
char *name,
- struct ocfs2_dir_lookup_result *lookup)
+ struct ocfs2_dir_lookup_result *lookup,
+ bool dio)
{
int ret;
struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb);
+ int namelen = dio ?
+ (OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN) :
+ OCFS2_ORPHAN_NAMELEN;
+
+ if (dio) {
+ ret = snprintf(name, OCFS2_DIO_ORPHAN_PREFIX_LEN + 1, "%s",
+ OCFS2_DIO_ORPHAN_PREFIX);
+ if (ret != OCFS2_DIO_ORPHAN_PREFIX_LEN) {
+ ret = -EINVAL;
+ mlog_errno(ret);
+ return ret;
+ }
- ret = ocfs2_blkno_stringify(blkno, name);
+ ret = ocfs2_blkno_stringify(blkno,
+ name + OCFS2_DIO_ORPHAN_PREFIX_LEN);
+ } else
+ ret = ocfs2_blkno_stringify(blkno, name);
if (ret < 0) {
mlog_errno(ret);
return ret;
@@ -1919,7 +2178,7 @@ static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode,
ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
orphan_dir_bh, name,
- OCFS2_ORPHAN_NAMELEN, lookup);
+ namelen, lookup);
if (ret < 0) {
mlog_errno(ret);
return ret;
@@ -1934,8 +2193,10 @@ static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode,
* @osb: ocfs2 file system
* @ret_orphan_dir: Orphan dir inode - returned locked!
* @blkno: Actual block number of the inode to be inserted into orphan dir.
+ * @name: Buffer to store the name of the orphan.
* @lookup: dir lookup result, to be passed back into functions like
* ocfs2_orphan_add
+ * @dio: Flag indicating if direct IO is being used or not.
*
* Returns zero on success and the ret_orphan_dir, name and lookup
* fields will be populated.
@@ -1946,7 +2207,8 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
struct inode **ret_orphan_dir,
u64 blkno,
char *name,
- struct ocfs2_dir_lookup_result *lookup)
+ struct ocfs2_dir_lookup_result *lookup,
+ bool dio)
{
struct inode *orphan_dir_inode = NULL;
struct buffer_head *orphan_dir_bh = NULL;
@@ -1960,7 +2222,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
}
ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh,
- blkno, name, lookup);
+ blkno, name, lookup, dio);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -1973,7 +2235,7 @@ out:
if (ret) {
ocfs2_inode_unlock(orphan_dir_inode, 1);
- mutex_unlock(&orphan_dir_inode->i_mutex);
+ inode_unlock(orphan_dir_inode);
iput(orphan_dir_inode);
}
@@ -1988,12 +2250,16 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
struct buffer_head *fe_bh,
char *name,
struct ocfs2_dir_lookup_result *lookup,
- struct inode *orphan_dir_inode)
+ struct inode *orphan_dir_inode,
+ bool dio)
{
struct buffer_head *orphan_dir_bh = NULL;
int status = 0;
struct ocfs2_dinode *orphan_fe;
struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
+ int namelen = dio ?
+ (OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN) :
+ OCFS2_ORPHAN_NAMELEN;
trace_ocfs2_orphan_add_begin(
(unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -2037,7 +2303,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
ocfs2_journal_dirty(handle, orphan_dir_bh);
status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
- OCFS2_ORPHAN_NAMELEN, inode,
+ namelen, inode,
OCFS2_I(inode)->ip_blkno,
orphan_dir_bh, lookup);
if (status < 0) {
@@ -2045,13 +2311,21 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
goto rollback;
}
- fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL);
- OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR;
+ if (dio) {
+ /* Update flag OCFS2_DIO_ORPHANED_FL and record the orphan
+ * slot.
+ */
+ fe->i_flags |= cpu_to_le32(OCFS2_DIO_ORPHANED_FL);
+ fe->i_dio_orphaned_slot = cpu_to_le16(osb->slot_num);
+ } else {
+ fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL);
+ OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR;
- /* Record which orphan dir our inode now resides
- * in. delete_inode will use this to determine which orphan
- * dir to lock. */
- fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
+ /* Record which orphan dir our inode now resides
+ * in. delete_inode will use this to determine which orphan
+ * dir to lock. */
+ fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
+ }
ocfs2_journal_dirty(handle, fe_bh);
@@ -2076,14 +2350,27 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
handle_t *handle,
struct inode *orphan_dir_inode,
struct inode *inode,
- struct buffer_head *orphan_dir_bh)
+ struct buffer_head *orphan_dir_bh,
+ bool dio)
{
- char name[OCFS2_ORPHAN_NAMELEN + 1];
+ char name[OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN + 1];
struct ocfs2_dinode *orphan_fe;
int status = 0;
struct ocfs2_dir_lookup_result lookup = { NULL, };
- status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
+ if (dio) {
+ status = snprintf(name, OCFS2_DIO_ORPHAN_PREFIX_LEN + 1, "%s",
+ OCFS2_DIO_ORPHAN_PREFIX);
+ if (status != OCFS2_DIO_ORPHAN_PREFIX_LEN) {
+ status = -EINVAL;
+ mlog_errno(status);
+ return status;
+ }
+
+ status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno,
+ name + OCFS2_DIO_ORPHAN_PREFIX_LEN);
+ } else
+ status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -2091,10 +2378,19 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
trace_ocfs2_orphan_del(
(unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno,
- name, OCFS2_ORPHAN_NAMELEN);
+ name, strlen(name));
+
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(orphan_dir_inode),
+ orphan_dir_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
/* find it's spot in the orphan directory */
- status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode,
+ status = ocfs2_find_entry(name, strlen(name), orphan_dir_inode,
&lookup);
if (status) {
mlog_errno(status);
@@ -2108,15 +2404,6 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
goto leave;
}
- status = ocfs2_journal_access_di(handle,
- INODE_CACHE(orphan_dir_inode),
- orphan_dir_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
-
/* do the i_nlink dance! :) */
orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
if (S_ISDIR(inode->i_mode))
@@ -2194,7 +2481,8 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir,
}
ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh,
- di_blkno, orphan_name, orphan_insert);
+ di_blkno, orphan_name, orphan_insert,
+ false);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -2215,7 +2503,7 @@ out:
ocfs2_free_alloc_context(inode_ac);
/* Unroll orphan dir locking */
- mutex_unlock(&orphan_dir->i_mutex);
+ inode_unlock(orphan_dir);
ocfs2_inode_unlock(orphan_dir, 1);
iput(orphan_dir);
}
@@ -2233,14 +2521,13 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
struct inode *inode = NULL;
struct inode *orphan_dir = NULL;
struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
- struct ocfs2_dinode *di = NULL;
handle_t *handle = NULL;
char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
struct buffer_head *parent_di_bh = NULL;
struct buffer_head *new_di_bh = NULL;
struct ocfs2_alloc_context *inode_ac = NULL;
struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
- u64 uninitialized_var(di_blkno), suballoc_loc;
+ u64 di_blkno, suballoc_loc;
u16 suballoc_bit;
status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
@@ -2260,8 +2547,9 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
}
inode = ocfs2_get_init_inode(dir, mode);
- if (!inode) {
- status = -ENOMEM;
+ if (IS_ERR(inode)) {
+ status = PTR_ERR(inode);
+ inode = NULL;
mlog_errno(status);
goto leave;
}
@@ -2290,7 +2578,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
clear_nlink(inode);
/* do the real work now. */
status = __ocfs2_mknod_locked(dir, inode,
- 0, &new_di_bh, parent_di_bh, handle,
+ 0, &new_di_bh, handle,
inode_ac, di_blkno, suballoc_loc,
suballoc_bit);
if (status < 0) {
@@ -2298,9 +2586,8 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
goto leave;
}
- di = (struct ocfs2_dinode *)new_di_bh->b_data;
status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name,
- &orphan_insert, orphan_dir);
+ &orphan_insert, orphan_dir, false);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -2321,7 +2608,7 @@ leave:
if (orphan_dir) {
/* This was locked for us in ocfs2_prepare_orphan_dir() */
ocfs2_inode_unlock(orphan_dir, 1);
- mutex_unlock(&orphan_dir->i_mutex);
+ inode_unlock(orphan_dir);
iput(orphan_dir);
}
@@ -2345,6 +2632,158 @@ leave:
return status;
}
+int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
+ struct inode *inode)
+{
+ char orphan_name[OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN + 1];
+ struct inode *orphan_dir_inode = NULL;
+ struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
+ struct buffer_head *di_bh = NULL;
+ int status = 0;
+ handle_t *handle = NULL;
+ struct ocfs2_dinode *di = NULL;
+
+ status = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ di = (struct ocfs2_dinode *) di_bh->b_data;
+ /*
+ * Another append dio crashed?
+ * If so, manually recover it first.
+ */
+ if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
+ status = ocfs2_truncate_file(inode, di_bh, i_size_read(inode));
+ if (status < 0) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ goto bail_unlock_inode;
+ }
+
+ status = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail_unlock_inode;
+ }
+ }
+
+ status = ocfs2_prepare_orphan_dir(osb, &orphan_dir_inode,
+ OCFS2_I(inode)->ip_blkno,
+ orphan_name,
+ &orphan_insert,
+ true);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail_unlock_inode;
+ }
+
+ handle = ocfs2_start_trans(osb,
+ OCFS2_INODE_ADD_TO_ORPHAN_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ goto bail_unlock_orphan;
+ }
+
+ status = ocfs2_orphan_add(osb, handle, inode, di_bh, orphan_name,
+ &orphan_insert, orphan_dir_inode, true);
+ if (status)
+ mlog_errno(status);
+
+ ocfs2_commit_trans(osb, handle);
+
+bail_unlock_orphan:
+ ocfs2_inode_unlock(orphan_dir_inode, 1);
+ inode_unlock(orphan_dir_inode);
+ iput(orphan_dir_inode);
+
+ ocfs2_free_dir_lookup_result(&orphan_insert);
+
+bail_unlock_inode:
+ ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
+
+bail:
+ return status;
+}
+
+int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
+ struct inode *inode, struct buffer_head *di_bh,
+ int update_isize, loff_t end)
+{
+ struct inode *orphan_dir_inode = NULL;
+ struct buffer_head *orphan_dir_bh = NULL;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ handle_t *handle = NULL;
+ int status = 0;
+
+ orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+ ORPHAN_DIR_SYSTEM_INODE,
+ le16_to_cpu(di->i_dio_orphaned_slot));
+ if (!orphan_dir_inode) {
+ status = -ENOENT;
+ mlog_errno(status);
+ goto bail;
+ }
+
+ inode_lock(orphan_dir_inode);
+ status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
+ if (status < 0) {
+ inode_unlock(orphan_dir_inode);
+ iput(orphan_dir_inode);
+ mlog_errno(status);
+ goto bail;
+ }
+
+ handle = ocfs2_start_trans(osb,
+ OCFS2_INODE_DEL_FROM_ORPHAN_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ goto bail_unlock_orphan;
+ }
+
+ BUG_ON(!(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)));
+
+ status = ocfs2_orphan_del(osb, handle, orphan_dir_inode,
+ inode, orphan_dir_bh, true);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail_commit;
+ }
+
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(inode),
+ di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail_commit;
+ }
+
+ di->i_flags &= ~cpu_to_le32(OCFS2_DIO_ORPHANED_FL);
+ di->i_dio_orphaned_slot = 0;
+
+ if (update_isize) {
+ status = ocfs2_set_inode_size(handle, inode, di_bh, end);
+ if (status)
+ mlog_errno(status);
+ } else
+ ocfs2_journal_dirty(handle, di_bh);
+
+bail_commit:
+ ocfs2_commit_trans(osb, handle);
+
+bail_unlock_orphan:
+ ocfs2_inode_unlock(orphan_dir_inode, 1);
+ inode_unlock(orphan_dir_inode);
+ brelse(orphan_dir_bh);
+ iput(orphan_dir_inode);
+
+bail:
+ return status;
+}
+
int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
struct inode *inode,
struct dentry *dentry)
@@ -2396,17 +2835,17 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
ORPHAN_DIR_SYSTEM_INODE,
osb->slot_num);
if (!orphan_dir_inode) {
- status = -EEXIST;
+ status = -ENOENT;
mlog_errno(status);
goto leave;
}
- mutex_lock(&orphan_dir_inode->i_mutex);
+ inode_lock(orphan_dir_inode);
status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
if (status < 0) {
mlog_errno(status);
- mutex_unlock(&orphan_dir_inode->i_mutex);
+ inode_unlock(orphan_dir_inode);
iput(orphan_dir_inode);
goto leave;
}
@@ -2433,7 +2872,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
}
status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
- orphan_dir_bh);
+ orphan_dir_bh, false);
if (status < 0) {
mlog_errno(status);
goto out_commit;
@@ -2444,6 +2883,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
di->i_orphaned_slot = 0;
set_nlink(inode, 1);
ocfs2_set_links_count(di, inode->i_nlink);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_journal_dirty(handle, di_bh);
status = ocfs2_add_entry(handle, dentry, inode,
@@ -2467,7 +2907,7 @@ out_commit:
ocfs2_commit_trans(osb, handle);
orphan_unlock:
ocfs2_inode_unlock(orphan_dir_inode, 1);
- mutex_unlock(&orphan_dir_inode->i_mutex);
+ inode_unlock(orphan_dir_inode);
iput(orphan_dir_inode);
leave:
@@ -2498,10 +2938,10 @@ const struct inode_operations ocfs2_dir_iops = {
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
.permission = ocfs2_permission,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = ocfs2_listxattr,
- .removexattr = generic_removexattr,
.fiemap = ocfs2_fiemap,
- .get_acl = ocfs2_iop_get_acl,
+ .get_inode_acl = ocfs2_iop_get_acl,
+ .set_acl = ocfs2_iop_set_acl,
+ .fileattr_get = ocfs2_fileattr_get,
+ .fileattr_set = ocfs2_fileattr_set,
};
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index e5d059d4f115..9cc891eb874e 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -1,31 +1,18 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* namei.h
*
* Function prototypes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_NAMEI_H
#define OCFS2_NAMEI_H
+#define OCFS2_DIO_ORPHAN_PREFIX "dio-"
+#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4
+
extern const struct inode_operations ocfs2_dir_iops;
struct dentry *ocfs2_get_parent(struct dentry *child);
@@ -34,10 +21,16 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
handle_t *handle,
struct inode *orphan_dir_inode,
struct inode *inode,
- struct buffer_head *orphan_dir_bh);
+ struct buffer_head *orphan_dir_bh,
+ bool dio);
int ocfs2_create_inode_in_orphan(struct inode *dir,
int mode,
struct inode **new_inode);
+int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
+ struct inode *inode);
+int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
+ struct inode *inode, struct buffer_head *di_bh,
+ int update_isize, loff_t end);
int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
struct inode *new_inode,
struct dentry *new_dentry);
diff --git a/fs/ocfs2/ocfs1_fs_compat.h b/fs/ocfs2/ocfs1_fs_compat.h
index dfb313bda5dd..6dbcf3d467fb 100644
--- a/fs/ocfs2/ocfs1_fs_compat.h
+++ b/fs/ocfs2/ocfs1_fs_compat.h
@@ -1,6 +1,5 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
* ocfs1_fs_compat.h
*
* OCFS1 volume header definitions. OCFS2 creates valid but unmountable
@@ -9,20 +8,6 @@
* mount it.
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License, version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#ifndef _OCFS1_FS_COMPAT_H
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 3a903470c794..6aaa94c554c1 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -1,26 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* ocfs2.h
*
* Defines macros and structures used in OCFS2
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_H
@@ -30,6 +14,7 @@
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/list.h>
+#include <linux/llist.h>
#include <linux/rbtree.h>
#include <linux/workqueue.h>
#include <linux/kref.h>
@@ -49,6 +34,8 @@
#include "reservations.h"
+#include "filecheck.h"
+
/* Caching of metadata buffers */
/* Most user visible OCFS2 inodes will have very few pieces of
@@ -143,6 +130,12 @@ enum ocfs2_unlock_action {
* before the upconvert
* has completed */
+#define OCFS2_LOCK_NONBLOCK_FINISHED (0x00001000) /* NONBLOCK cluster
+ * lock has already
+ * returned, do not block
+ * dc thread from
+ * downconverting */
+
struct ocfs2_lock_res_ops;
typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
@@ -155,16 +148,18 @@ struct ocfs2_lock_stats {
/* Storing max wait in usecs saves 24 bytes per inode */
u32 ls_max; /* Max wait in USEC */
+ u64 ls_last; /* Last unlock time in USEC */
};
#endif
struct ocfs2_lock_res {
void *l_priv;
- struct ocfs2_lock_res_ops *l_ops;
+ const struct ocfs2_lock_res_ops *l_ops;
struct list_head l_blocked_list;
struct list_head l_mask_waiters;
+ struct list_head l_holders;
unsigned long l_flags;
char l_name[OCFS2_LOCK_ID_MAX_LEN];
@@ -195,6 +190,7 @@ struct ocfs2_lock_res {
#ifdef CONFIG_OCFS2_FS_STATS
struct ocfs2_lock_stats l_lock_prmode; /* PR mode stats */
u32 l_lock_refresh; /* Disk refreshes */
+ u64 l_lock_wait; /* First lock wait time */
struct ocfs2_lock_stats l_lock_exmode; /* EX mode stats */
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -202,6 +198,11 @@ struct ocfs2_lock_res {
#endif
};
+enum ocfs2_orphan_reco_type {
+ ORPHAN_NO_NEED_TRUNCATE = 0,
+ ORPHAN_NEED_TRUNCATE,
+};
+
enum ocfs2_orphan_scan_state {
ORPHAN_SCAN_ACTIVE,
ORPHAN_SCAN_INACTIVE
@@ -212,7 +213,7 @@ struct ocfs2_orphan_scan {
struct ocfs2_super *os_osb;
struct ocfs2_lock_res os_lockres; /* lock to synchronize scans */
struct delayed_work os_orphan_scan_work;
- struct timespec os_scantime; /* time this node ran the scan */
+ time64_t os_scantime; /* time this node ran the scan */
u32 os_count; /* tracks node specific scans */
u32 os_seqno; /* tracks cluster wide scans */
atomic_t os_state; /* ACTIVE or INACTIVE */
@@ -220,7 +221,7 @@ struct ocfs2_orphan_scan {
struct ocfs2_dlm_debug {
struct kref d_refcnt;
- struct dentry *d_locking_state;
+ u32 d_filter_secs;
struct list_head d_lockres_tracking;
};
@@ -272,21 +273,61 @@ enum ocfs2_mount_options
writes */
OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */
OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
+
+ OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */
+ OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */
+ OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */
+};
+
+#define OCFS2_OSB_SOFT_RO 0x0001
+#define OCFS2_OSB_HARD_RO 0x0002
+#define OCFS2_OSB_ERROR_FS 0x0004
+#define OCFS2_DEFAULT_ATIME_QUANTUM 60
+
+struct ocfs2_triggers {
+ struct jbd2_buffer_trigger_type ot_triggers;
+ int ot_offset;
+ struct super_block *sb;
+};
+
+enum ocfs2_journal_trigger_type {
+ OCFS2_JTR_DI,
+ OCFS2_JTR_EB,
+ OCFS2_JTR_RB,
+ OCFS2_JTR_GD,
+ OCFS2_JTR_DB,
+ OCFS2_JTR_XB,
+ OCFS2_JTR_DQ,
+ OCFS2_JTR_DR,
+ OCFS2_JTR_DL,
+ OCFS2_JTR_NONE /* This must be the last entry */
};
-#define OCFS2_OSB_SOFT_RO 0x0001
-#define OCFS2_OSB_HARD_RO 0x0002
-#define OCFS2_OSB_ERROR_FS 0x0004
-#define OCFS2_OSB_DROP_DENTRY_LOCK_IMMED 0x0008
+#define OCFS2_JOURNAL_TRIGGER_COUNT OCFS2_JTR_NONE
+
+void ocfs2_initialize_journal_triggers(struct super_block *sb,
+ struct ocfs2_triggers triggers[]);
-#define OCFS2_DEFAULT_ATIME_QUANTUM 60
+enum ocfs2_recovery_state {
+ OCFS2_REC_ENABLED = 0,
+ OCFS2_REC_QUOTA_WANT_DISABLE,
+ /*
+ * Must be OCFS2_REC_QUOTA_WANT_DISABLE + 1 for
+ * ocfs2_recovery_disable_quota() to work.
+ */
+ OCFS2_REC_QUOTA_DISABLED,
+ OCFS2_REC_WANT_DISABLE,
+ /*
+ * Must be OCFS2_REC_WANT_DISABLE + 1 for ocfs2_recovery_exit() to work
+ */
+ OCFS2_REC_DISABLED,
+};
struct ocfs2_journal;
struct ocfs2_slot_info;
struct ocfs2_recovery_map;
struct ocfs2_replay_map;
struct ocfs2_quota_recovery;
-struct ocfs2_dentry_lock;
struct ocfs2_super
{
struct task_struct *commit_task;
@@ -306,7 +347,6 @@ struct ocfs2_super
u64 system_dir_blkno;
u64 bitmap_blkno;
u32 bitmap_cpg;
- u8 *uuid;
char *uuid_str;
u32 uuid_hash;
u8 *vol_label;
@@ -323,8 +363,8 @@ struct ocfs2_super
spinlock_t osb_lock;
u32 s_next_generation;
unsigned long osb_flags;
- s16 s_inode_steal_slot;
- s16 s_meta_steal_slot;
+ u16 s_inode_steal_slot;
+ u16 s_meta_steal_slot;
atomic_t s_num_inodes_stolen;
atomic_t s_num_meta_stolen;
@@ -345,15 +385,18 @@ struct ocfs2_super
struct ocfs2_recovery_map *recovery_map;
struct ocfs2_replay_map *replay_map;
struct task_struct *recovery_thread_task;
- int disable_recovery;
+ enum ocfs2_recovery_state recovery_state;
wait_queue_head_t checkpoint_event;
struct ocfs2_journal *journal;
unsigned long osb_commit_interval;
+ /* Journal triggers for checksum */
+ struct ocfs2_triggers s_journal_triggers[OCFS2_JOURNAL_TRIGGER_COUNT];
+
struct delayed_work la_enable_wq;
/*
- * Must hold local alloc i_mutex and osb->osb_lock to change
+ * Must hold local alloc i_rwsem and osb->osb_lock to change
* local_alloc_bits. Reads can be done under either lock.
*/
unsigned int local_alloc_bits;
@@ -374,9 +417,8 @@ struct ocfs2_super
unsigned int osb_resv_level;
unsigned int osb_dir_resv_level;
- /* Next three fields are for local node slot recovery during
+ /* Next two fields are for local node slot recovery during
* mount. */
- int dirty;
struct ocfs2_dinode *local_alloc_copy;
struct ocfs2_quota_recovery *quota_rec;
@@ -387,14 +429,17 @@ struct ocfs2_super
u8 osb_stackflags;
char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
+ char osb_cluster_name[OCFS2_CLUSTER_NAME_LEN + 1];
struct ocfs2_cluster_connection *cconn;
struct ocfs2_lock_res osb_super_lockres;
struct ocfs2_lock_res osb_rename_lockres;
struct ocfs2_lock_res osb_nfs_sync_lockres;
+ struct rw_semaphore nfs_sync_rwlock;
+ struct ocfs2_lock_res osb_trim_fs_lockres;
+ struct mutex obs_trim_fs_mutex;
struct ocfs2_dlm_debug *osb_dlm_debug;
struct dentry *osb_debug_root;
- struct dentry *osb_ctxt;
wait_queue_head_t recovery_event;
@@ -413,10 +458,9 @@ struct ocfs2_super
struct list_head blocked_lock_list;
unsigned long blocked_lock_count;
- /* List of dentry locks to release. Anyone can add locks to
- * the list, ocfs2_wq processes the list */
- struct ocfs2_dentry_lock *dentry_lock_list;
- struct work_struct dentry_lock_work;
+ /* List of dquot structures to drop last reference to */
+ struct llist_head dquot_drop_list;
+ struct work_struct dquot_drop_work;
wait_queue_head_t osb_mount_event;
@@ -424,9 +468,10 @@ struct ocfs2_super
struct inode *osb_tl_inode;
struct buffer_head *osb_tl_bh;
struct delayed_work osb_truncate_log_wq;
+ atomic_t osb_tl_disable;
/*
* How many clusters in our truncate log.
- * It must be protected by osb_tl_inode->i_mutex.
+ * It must be protected by osb_tl_inode->i_rwsem.
*/
unsigned int truncated_clusters;
@@ -448,6 +493,22 @@ struct ocfs2_super
/* rb tree root for refcount lock. */
struct rb_root osb_rf_lock_tree;
struct ocfs2_refcount_tree *osb_ref_tree_lru;
+
+ struct mutex system_file_mutex;
+
+ /*
+ * OCFS2 needs to schedule several different types of work which
+ * require cluster locking, disk I/O, recovery waits, etc. Since these
+ * types of work tend to be heavy we avoid using the kernel events
+ * workqueue and schedule on our own.
+ */
+ struct workqueue_struct *ocfs2_wq;
+
+ /* sysfs directory per partition */
+ struct kset *osb_dev_kset;
+
+ /* file check related stuff */
+ struct ocfs2_filecheck_sysfs_entry osb_fc_ent;
};
#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
@@ -486,6 +547,14 @@ static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb)
return 0;
}
+static inline int ocfs2_supports_append_dio(struct ocfs2_super *osb)
+{
+ if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_APPEND_DIO)
+ return 1;
+ return 0;
+}
+
+
static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb)
{
if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INLINE_DATA)
@@ -533,8 +602,7 @@ static inline unsigned int ocfs2_read_links_count(struct ocfs2_dinode *di)
u32 nlink = le16_to_cpu(di->i_links_count);
u32 hi = le16_to_cpu(di->i_links_count_hi);
- if (di->i_dyn_features & cpu_to_le16(OCFS2_INDEXED_DIR_FL))
- nlink |= (hi << OCFS2_LINKS_HI_SHIFT);
+ nlink |= (hi << OCFS2_LINKS_HI_SHIFT);
return nlink;
}
@@ -578,18 +646,6 @@ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb,
spin_unlock(&osb->osb_lock);
}
-
-static inline unsigned long ocfs2_test_osb_flag(struct ocfs2_super *osb,
- unsigned long flag)
-{
- unsigned long ret;
-
- spin_lock(&osb->osb_lock);
- ret = osb->osb_flags & flag;
- spin_unlock(&osb->osb_lock);
- return ret;
-}
-
static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb,
int hard)
{
@@ -707,6 +763,16 @@ static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb,
return (u64)clusters << c_to_b_bits;
}
+static inline u32 ocfs2_clusters_for_blocks(struct super_block *sb,
+ u64 blocks)
+{
+ int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits -
+ sb->s_blocksize_bits;
+
+ blocks += (1 << b_to_c_bits) - 1;
+ return (u32)(blocks >> b_to_c_bits);
+}
+
static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb,
u64 blocks)
{
@@ -729,6 +795,16 @@ static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb,
return clusters;
}
+static inline unsigned int ocfs2_bytes_to_clusters(struct super_block *sb,
+ u64 bytes)
+{
+ int cl_bits = OCFS2_SB(sb)->s_clustersize_bits;
+ unsigned int clusters;
+
+ clusters = (unsigned int)(bytes >> cl_bits);
+ return clusters;
+}
+
static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb,
u64 bytes)
{
@@ -782,10 +858,10 @@ static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb,
u32 clusters = pg_index;
unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
- if (unlikely(PAGE_CACHE_SHIFT > cbits))
- clusters = pg_index << (PAGE_CACHE_SHIFT - cbits);
- else if (PAGE_CACHE_SHIFT < cbits)
- clusters = pg_index >> (cbits - PAGE_CACHE_SHIFT);
+ if (unlikely(PAGE_SHIFT > cbits))
+ clusters = pg_index << (PAGE_SHIFT - cbits);
+ else if (PAGE_SHIFT < cbits)
+ clusters = pg_index >> (cbits - PAGE_SHIFT);
return clusters;
}
@@ -799,10 +875,10 @@ static inline pgoff_t ocfs2_align_clusters_to_page_index(struct super_block *sb,
unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
pgoff_t index = clusters;
- if (PAGE_CACHE_SHIFT > cbits) {
- index = (pgoff_t)clusters >> (PAGE_CACHE_SHIFT - cbits);
- } else if (PAGE_CACHE_SHIFT < cbits) {
- index = (pgoff_t)clusters << (cbits - PAGE_CACHE_SHIFT);
+ if (PAGE_SHIFT > cbits) {
+ index = (pgoff_t)clusters >> (PAGE_SHIFT - cbits);
+ } else if (PAGE_SHIFT < cbits) {
+ index = (pgoff_t)clusters << (cbits - PAGE_SHIFT);
}
return index;
@@ -813,8 +889,8 @@ static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
unsigned int pages_per_cluster = 1;
- if (PAGE_CACHE_SHIFT < cbits)
- pages_per_cluster = 1 << (cbits - PAGE_CACHE_SHIFT);
+ if (PAGE_SHIFT < cbits)
+ pages_per_cluster = 1 << (cbits - PAGE_SHIFT);
return pages_per_cluster;
}
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 938387a10d5d..f7763da5c4a2 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -1,30 +1,17 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
* ocfs2_fs.h
*
* On-disk structures for OCFS2.
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License, version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#ifndef _OCFS2_FS_H
#define _OCFS2_FS_H
+#include <linux/magic.h>
+
/* Version */
#define OCFS2_MAJOR_REV_LEVEL 0
#define OCFS2_MINOR_REV_LEVEL 90
@@ -56,9 +43,6 @@
#define OCFS2_MIN_BLOCKSIZE 512
#define OCFS2_MAX_BLOCKSIZE OCFS2_MIN_CLUSTERSIZE
-/* Filesystem magic number */
-#define OCFS2_SUPER_MAGIC 0x7461636f
-
/* Object signatures */
#define OCFS2_SUPER_BLOCK_SIGNATURE "OCFSV2"
#define OCFS2_INODE_SIGNATURE "INODE01"
@@ -102,7 +86,8 @@
| OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
| OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \
| OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG \
- | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)
+ | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO \
+ | OCFS2_FEATURE_INCOMPAT_APPEND_DIO)
#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
| OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
| OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -147,7 +132,7 @@
* well as the name of the cluster being joined.
* mount.ocfs2 must pass in a matching stack name.
*
- * If not set, the classic stack will be used. This is compatbile with
+ * If not set, the classic stack will be used. This is compatible with
* all older versions.
*/
#define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK 0x0080
@@ -158,7 +143,7 @@
/* Support for extended attributes */
#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200
-/* Support for indexed directores */
+/* Support for indexed directories */
#define OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS 0x0400
/* Metadata checksum and error correction */
@@ -167,17 +152,22 @@
/* Refcount tree support */
#define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000
-/* Discontigous block groups */
+/* Discontiguous block groups */
#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000
/*
- * Incompat bit to indicate useable clusterinfo with stackflags for all
+ * Incompat bit to indicate usable clusterinfo with stackflags for all
* cluster stacks (userspace adnd o2cb). If this bit is set,
* INCOMPAT_USERSPACE_STACK becomes superfluous and thus should not be set.
*/
#define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO 0x4000
/*
+ * Append Direct IO support
+ */
+#define OCFS2_FEATURE_INCOMPAT_APPEND_DIO 0x8000
+
+/*
* backup superblock flag is used to indicate that this volume
* has backup superblocks.
*/
@@ -199,6 +189,7 @@
#define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002
#define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004
+
/* The byte offset of the first backup block will be 1G.
* The following will be 4G, 16G, 64G, 256G and 1T.
*/
@@ -229,6 +220,8 @@
#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */
#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */
#define OCFS2_QUOTA_FL (0x00001000) /* Quota file */
+#define OCFS2_DIO_ORPHANED_FL (0X00002000) /* On the orphan list especially
+ * for dio */
/*
* Flags on ocfs2_dinode.i_dyn_features
@@ -295,7 +288,7 @@
#define OCFS2_MAX_SLOTS 255
/* Slot map indicator for an empty slot */
-#define OCFS2_INVALID_SLOT -1
+#define OCFS2_INVALID_SLOT ((u16)-1)
#define OCFS2_VOL_UUID_LEN 16
#define OCFS2_MAX_VOL_LABEL_LEN 64
@@ -331,8 +324,8 @@ struct ocfs2_system_inode_info {
enum {
BAD_BLOCK_SYSTEM_INODE = 0,
GLOBAL_INODE_ALLOC_SYSTEM_INODE,
+#define OCFS2_FIRST_ONLINE_SYSTEM_INODE GLOBAL_INODE_ALLOC_SYSTEM_INODE
SLOT_MAP_SYSTEM_INODE,
-#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE
HEARTBEAT_SYSTEM_INODE,
GLOBAL_BITMAP_SYSTEM_INODE,
USER_QUOTA_SYSTEM_INODE,
@@ -384,21 +377,6 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
#define OCFS2_HB_GLOBAL "heartbeat=global"
/*
- * OCFS2 directory file types. Only the low 3 bits are used. The
- * other bits are reserved for now.
- */
-#define OCFS2_FT_UNKNOWN 0
-#define OCFS2_FT_REG_FILE 1
-#define OCFS2_FT_DIR 2
-#define OCFS2_FT_CHRDEV 3
-#define OCFS2_FT_BLKDEV 4
-#define OCFS2_FT_FIFO 5
-#define OCFS2_FT_SOCK 6
-#define OCFS2_FT_SYMLINK 7
-
-#define OCFS2_FT_MAX 8
-
-/*
* OCFS2_DIR_PAD defines the directory entries boundaries
*
* NOTE: It must be a multiple of 4
@@ -416,17 +394,6 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
#define OCFS2_LINKS_HI_SHIFT 16
#define OCFS2_DX_ENTRIES_MAX (0xffffffffU)
-#define S_SHIFT 12
-static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
- [S_IFREG >> S_SHIFT] = OCFS2_FT_REG_FILE,
- [S_IFDIR >> S_SHIFT] = OCFS2_FT_DIR,
- [S_IFCHR >> S_SHIFT] = OCFS2_FT_CHRDEV,
- [S_IFBLK >> S_SHIFT] = OCFS2_FT_BLKDEV,
- [S_IFIFO >> S_SHIFT] = OCFS2_FT_FIFO,
- [S_IFSOCK >> S_SHIFT] = OCFS2_FT_SOCK,
- [S_IFLNK >> S_SHIFT] = OCFS2_FT_SYMLINK,
-};
-
/*
* Convenience casts
@@ -501,7 +468,8 @@ struct ocfs2_extent_list {
__le16 l_reserved1;
__le64 l_reserved2; /* Pad to
sizeof(ocfs2_extent_rec) */
-/*10*/ struct ocfs2_extent_rec l_recs[0]; /* Extent records */
+ /* Extent records */
+/*10*/ struct ocfs2_extent_rec l_recs[] __counted_by_le(l_count);
};
/*
@@ -515,7 +483,8 @@ struct ocfs2_chain_list {
__le16 cl_count; /* Total chains in this list */
__le16 cl_next_free_rec; /* Next unused chain slot */
__le64 cl_reserved1;
-/*10*/ struct ocfs2_chain_rec cl_recs[0]; /* Chain records */
+ /* Chain records */
+/*10*/ struct ocfs2_chain_rec cl_recs[] __counted_by_le(cl_count);
};
/*
@@ -527,7 +496,8 @@ struct ocfs2_truncate_log {
/*00*/ __le16 tl_count; /* Total records in this log */
__le16 tl_used; /* Number of records in use */
__le32 tl_reserved1;
-/*08*/ struct ocfs2_truncate_rec tl_recs[0]; /* Truncate records */
+ /* Truncate records */
+/*08*/ struct ocfs2_truncate_rec tl_recs[] __counted_by_le(tl_count);
};
/*
@@ -560,7 +530,7 @@ struct ocfs2_extent_block
* value -1 (0xFFFF) is OCFS2_INVALID_SLOT. This marks a slot empty.
*/
struct ocfs2_slot_map {
-/*00*/ __le16 sm_slots[0];
+/*00*/ DECLARE_FLEX_ARRAY(__le16, sm_slots);
/*
* Actual on-disk size is one block. OCFS2_MAX_SLOTS is 255,
* 255 * sizeof(__le16) == 512B, within the 512B block minimum blocksize.
@@ -571,7 +541,7 @@ struct ocfs2_extended_slot {
/*00*/ __u8 es_valid;
__u8 es_reserved1[3];
__le32 es_node_num;
-/*10*/
+/*08*/
};
/*
@@ -581,7 +551,7 @@ struct ocfs2_extended_slot {
* i_size.
*/
struct ocfs2_slot_map_extended {
-/*00*/ struct ocfs2_extended_slot se_slots[0];
+/*00*/ DECLARE_FLEX_ARRAY(struct ocfs2_extended_slot, se_slots);
/*
* Actual size is i_size of the slot_map system file. It should
* match s_max_slots * sizeof(struct ocfs2_extended_slot)
@@ -647,7 +617,7 @@ struct ocfs2_super_block {
__le16 s_reserved0;
__le32 s_dx_seed[3]; /* seed[0-2] for dx dir hash.
* s_uuid_hash serves as seed[3]. */
-/*C0*/ __le64 s_reserved2[15]; /* Fill out superblock */
+/*C8*/ __le64 s_reserved2[15]; /* Fill out superblock */
/*140*/
/*
@@ -671,7 +641,7 @@ struct ocfs2_local_alloc
__le16 la_size; /* Size of included bitmap, in bytes */
__le16 la_reserved1;
__le64 la_reserved2;
-/*10*/ __u8 la_bitmap[0];
+/*10*/ __u8 la_bitmap[];
};
/*
@@ -684,7 +654,7 @@ struct ocfs2_inline_data
* for data, starting at id_data */
__le16 id_reserved0;
__le32 id_reserved1;
- __u8 id_data[0]; /* Start of user data */
+ __u8 id_data[]; /* Start of user data */
};
/*
@@ -729,7 +699,9 @@ struct ocfs2_dinode {
inode belongs to. Only valid
if allocated from a
discontiguous block group */
-/*A0*/ __le64 i_reserved2[3];
+/*A0*/ __le16 i_dio_orphaned_slot; /* only used for append dio write */
+ __le16 i_reserved1[3];
+ __le64 i_reserved2[2];
/*B8*/ union {
__le64 i_pad1; /* Generic way to refer to this
64bit union */
@@ -758,7 +730,7 @@ struct ocfs2_dinode {
struct ocfs2_extent_list i_list;
struct ocfs2_truncate_log i_dealloc;
struct ocfs2_inline_data i_data;
- __u8 i_symlink[0];
+ DECLARE_FLEX_ARRAY(__u8, i_symlink);
} id2;
/* Actual on-disk size is one block */
};
@@ -797,11 +769,11 @@ struct ocfs2_dir_block_trailer {
* in this block. (unused) */
/*10*/ __u8 db_signature[8]; /* Signature for verification */
__le64 db_reserved2;
- __le64 db_free_next; /* Next block in list (unused) */
-/*20*/ __le64 db_blkno; /* Offset on disk, in blocks */
- __le64 db_parent_dinode; /* dinode which owns me, in
+/*20*/ __le64 db_free_next; /* Next block in list (unused) */
+ __le64 db_blkno; /* Offset on disk, in blocks */
+/*30*/ __le64 db_parent_dinode; /* dinode which owns me, in
blocks */
-/*30*/ struct ocfs2_block_check db_check; /* Error checking */
+ struct ocfs2_block_check db_check; /* Error checking */
/*40*/
};
@@ -827,9 +799,10 @@ struct ocfs2_dx_entry_list {
* possible in de_entries */
__le16 de_num_used; /* Current number of
* de_entries entries */
- struct ocfs2_dx_entry de_entries[0]; /* Indexed dir entries
- * in a packed array of
- * length de_num_used */
+ /* Indexed dir entries in a packed
+ * array of length de_num_used.
+ */
+ struct ocfs2_dx_entry de_entries[] __counted_by_le(de_count);
};
#define OCFS2_DX_FLAG_INLINE 0x01
@@ -914,7 +887,8 @@ struct ocfs2_group_desc
__le16 bg_free_bits_count; /* Free bits count */
__le16 bg_chain; /* What chain I am in. */
/*10*/ __le32 bg_generation;
- __le32 bg_reserved1;
+ __le16 bg_contig_free_bits; /* max contig free bits length */
+ __le16 bg_reserved1;
__le64 bg_next_group; /* Next group in my list, in
blocks */
/*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in
@@ -923,12 +897,12 @@ struct ocfs2_group_desc
/*30*/ struct ocfs2_block_check bg_check; /* Error checking */
__le64 bg_reserved2;
/*40*/ union {
- __u8 bg_bitmap[0];
+ DECLARE_FLEX_ARRAY(__u8, bg_bitmap);
struct {
/*
* Block groups may be discontiguous when
* OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG is set.
- * The extents of a discontigous block group are
+ * The extents of a discontiguous block group are
* stored in bg_list. It is a flat list.
* l_tree_depth must always be zero. A
* discontiguous group is signified by a non-zero
@@ -964,7 +938,8 @@ struct ocfs2_refcount_list {
__le16 rl_used; /* Current number of used records */
__le32 rl_reserved2;
__le64 rl_reserved1; /* Pad to sizeof(ocfs2_refcount_record) */
-/*10*/ struct ocfs2_refcount_rec rl_recs[0]; /* Refcount records */
+ /* Refcount records */
+/*10*/ struct ocfs2_refcount_rec rl_recs[] __counted_by_le(rl_count);
};
@@ -1050,7 +1025,8 @@ struct ocfs2_xattr_header {
buckets. A block uses
xb_check and sets
this field to zero.) */
- struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */
+ /* xattr entry list. */
+ struct ocfs2_xattr_entry xh_entries[] __counted_by_le(xh_count);
};
/*
@@ -1113,7 +1089,7 @@ struct ocfs2_xattr_block {
struct ocfs2_xattr_header xb_header; /* xattr header if this
block contains xattr */
struct ocfs2_xattr_tree_root xb_root;/* xattr tree root if this
- block cotains xattr
+ block contains xattr
tree. */
} xb_attrs;
};
@@ -1236,7 +1212,7 @@ struct ocfs2_local_disk_dqinfo {
/* Header of one chunk of a quota file */
struct ocfs2_local_disk_chunk {
__le32 dqc_free; /* Number of free entries in the bitmap */
- __u8 dqc_bitmap[0]; /* Bitmap of entries in the corresponding
+ __u8 dqc_bitmap[]; /* Bitmap of entries in the corresponding
* chunk of quota file */
};
@@ -1619,7 +1595,7 @@ static inline int ocfs2_sprintf_system_inode_name(char *buf, int len,
static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de,
umode_t mode)
{
- de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+ de->file_type = fs_umode_to_ftype(mode);
}
static inline int ocfs2_gd_is_discontig(struct ocfs2_group_desc *gd)
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
index 5b27ff1fa577..2de2f8733283 100644
--- a/fs/ocfs2/ocfs2_ioctl.h
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -1,34 +1,16 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
* ocfs2_ioctl.h
*
* Defines OCFS2 ioctls.
*
* Copyright (C) 2010 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License, version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#ifndef OCFS2_IOCTL_H
#define OCFS2_IOCTL_H
/*
- * ioctl commands
- */
-#define OCFS2_IOC_GETFLAGS FS_IOC_GETFLAGS
-#define OCFS2_IOC_SETFLAGS FS_IOC_SETFLAGS
-#define OCFS2_IOC32_GETFLAGS FS_IOC32_GETFLAGS
-#define OCFS2_IOC32_SETFLAGS FS_IOC32_SETFLAGS
-
-/*
* Space reservation / allocation / free ioctls and argument structure
* are designed to be compatible with XFS.
*
@@ -233,7 +215,7 @@ struct ocfs2_move_extents {
movement less likely
to fail, may make fs
even more fragmented */
-#define OCFS2_MOVE_EXT_FL_COMPLETE (0x00000004) /* Move or defragmenation
+#define OCFS2_MOVE_EXT_FL_COMPLETE (0x00000004) /* Move or defragmentation
completely gets done.
*/
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index d277aabf5dfb..9b234c03d693 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -1,26 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* ocfs2_lockid.h
*
* Defines OCFS2 lockid bits.
*
* Copyright (C) 2002, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_LOCKID_H
@@ -50,6 +34,7 @@ enum ocfs2_lock_type {
OCFS2_LOCK_TYPE_NFS_SYNC,
OCFS2_LOCK_TYPE_ORPHAN_SCAN,
OCFS2_LOCK_TYPE_REFCOUNT,
+ OCFS2_LOCK_TYPE_TRIM_FS,
OCFS2_NUM_LOCK_TYPES
};
@@ -93,6 +78,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
case OCFS2_LOCK_TYPE_REFCOUNT:
c = 'T';
break;
+ case OCFS2_LOCK_TYPE_TRIM_FS:
+ c = 'I';
+ break;
default:
c = '\0';
}
@@ -105,7 +93,7 @@ static char *ocfs2_lock_type_strings[] = {
[OCFS2_LOCK_TYPE_DATA] = "Data",
[OCFS2_LOCK_TYPE_SUPER] = "Super",
[OCFS2_LOCK_TYPE_RENAME] = "Rename",
- /* Need to differntiate from [R]ename.. serializing writes is the
+ /* Need to differentiate from [R]ename.. serializing writes is the
* important job it does, anyway. */
[OCFS2_LOCK_TYPE_RW] = "Write/Read",
[OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
@@ -115,6 +103,7 @@ static char *ocfs2_lock_type_strings[] = {
[OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync",
[OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan",
[OCFS2_LOCK_TYPE_REFCOUNT] = "Refcount",
+ [OCFS2_LOCK_TYPE_TRIM_FS] = "TrimFs",
};
static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/ocfs2_lockingver.h b/fs/ocfs2/ocfs2_lockingver.h
index 2e45c8d2ea7e..31a5e1619e7f 100644
--- a/fs/ocfs2/ocfs2_lockingver.h
+++ b/fs/ocfs2/ocfs2_lockingver.h
@@ -1,20 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
* ocfs2_lockingver.h
*
* Defines OCFS2 Locking version values.
*
* Copyright (C) 2008 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License, version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#ifndef OCFS2_LOCKINGVER_H
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 3b481f490633..4b32fb5658ad 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM ocfs2
@@ -81,7 +82,7 @@ DECLARE_EVENT_CLASS(ocfs2__string,
__string(name,name)
),
TP_fast_assign(
- __assign_str(name, name);
+ __assign_str(name);
),
TP_printk("%s", __get_str(name))
);
@@ -711,6 +712,8 @@ TRACE_EVENT(ocfs2_trim_extent,
DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_mainbm);
+
DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
/* End of trace events for fs/ocfs2/alloc.c. */
@@ -1154,8 +1157,6 @@ DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_get_block_end);
DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_readpage);
-DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_writepage);
-
DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_bmap);
TRACE_EVENT(ocfs2_try_to_write_inline_data,
@@ -1288,7 +1289,7 @@ DECLARE_EVENT_CLASS(ocfs2__file_ops,
__entry->dentry = dentry;
__entry->ino = ino;
__entry->d_len = d_len;
- __assign_str(d_name, d_name);
+ __assign_str(d_name);
__entry->para = para;
),
TP_printk("%p %p %p %llu %llu %.*s", __entry->inode, __entry->file,
@@ -1310,14 +1311,12 @@ DEFINE_OCFS2_FILE_OPS(ocfs2_file_release);
DEFINE_OCFS2_FILE_OPS(ocfs2_sync_file);
-DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_write);
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_write_iter);
-DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_write);
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_read_iter);
DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_read);
-DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_read);
-
DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_truncate_file);
DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_truncate_file_error);
@@ -1426,7 +1425,7 @@ TRACE_EVENT(ocfs2_setattr,
__entry->dentry = dentry;
__entry->ino = ino;
__entry->d_len = d_len;
- __assign_str(d_name, d_name);
+ __assign_str(d_name);
__entry->ia_valid = ia_valid;
__entry->ia_mode = ia_mode;
__entry->ia_uid = ia_uid;
@@ -1450,31 +1449,26 @@ DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range);
TRACE_EVENT(ocfs2_prepare_inode_for_write,
TP_PROTO(unsigned long long ino, unsigned long long saved_pos,
- int appending, unsigned long count,
- int *direct_io, int *has_refcount),
- TP_ARGS(ino, saved_pos, appending, count, direct_io, has_refcount),
+ unsigned long count, int wait),
+ TP_ARGS(ino, saved_pos, count, wait),
TP_STRUCT__entry(
__field(unsigned long long, ino)
__field(unsigned long long, saved_pos)
- __field(int, appending)
__field(unsigned long, count)
- __field(int, direct_io)
- __field(int, has_refcount)
+ __field(int, wait)
),
TP_fast_assign(
__entry->ino = ino;
__entry->saved_pos = saved_pos;
- __entry->appending = appending;
__entry->count = count;
- __entry->direct_io = direct_io ? *direct_io : -1;
- __entry->has_refcount = has_refcount ? *has_refcount : -1;
+ __entry->wait = wait;
),
- TP_printk("%llu %llu %d %lu %d %d", __entry->ino,
- __entry->saved_pos, __entry->appending, __entry->count,
- __entry->direct_io, __entry->has_refcount)
+ TP_printk("%llu %llu %lu %d", __entry->ino,
+ __entry->saved_pos, __entry->count, __entry->wait)
);
-DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret);
+DEFINE_OCFS2_INT_EVENT(generic_file_read_iter_ret);
+DEFINE_OCFS2_INT_EVENT(filemap_splice_read_ret);
/* End of trace events for fs/ocfs2/file.c. */
@@ -1540,6 +1534,8 @@ DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_read_locked_inode);
DEFINE_OCFS2_INT_INT_EVENT(ocfs2_check_orphan_recovery_state);
DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_inode_block);
+DEFINE_OCFS2_ULL_EVENT(ocfs2_filecheck_validate_inode_block);
+DEFINE_OCFS2_ULL_EVENT(ocfs2_filecheck_repair_inode_block);
TRACE_EVENT(ocfs2_inode_is_valid_to_delete,
TP_PROTO(void *task, void *dc_task, unsigned long long ino,
@@ -1573,8 +1569,6 @@ DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_delete_inode);
DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_clear_inode);
-DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_drop_inode);
-
TRACE_EVENT(ocfs2_inode_revalidate,
TP_PROTO(void *inode, unsigned long long ino,
unsigned int flags),
@@ -1662,34 +1656,34 @@ TRACE_EVENT(ocfs2_remount,
);
TRACE_EVENT(ocfs2_fill_super,
- TP_PROTO(void *sb, void *data, int silent),
- TP_ARGS(sb, data, silent),
+ TP_PROTO(void *sb, void *fc, int silent),
+ TP_ARGS(sb, fc, silent),
TP_STRUCT__entry(
__field(void *, sb)
- __field(void *, data)
+ __field(void *, fc)
__field(int, silent)
),
TP_fast_assign(
__entry->sb = sb;
- __entry->data = data;
+ __entry->fc = fc;
__entry->silent = silent;
),
TP_printk("%p %p %d", __entry->sb,
- __entry->data, __entry->silent)
+ __entry->fc, __entry->silent)
);
TRACE_EVENT(ocfs2_parse_options,
- TP_PROTO(int is_remount, char *options),
- TP_ARGS(is_remount, options),
+ TP_PROTO(int is_remount, const char *option),
+ TP_ARGS(is_remount, option),
TP_STRUCT__entry(
__field(int, is_remount)
- __string(options, options)
+ __string(option, option)
),
TP_fast_assign(
__entry->is_remount = is_remount;
- __assign_str(options, options);
+ __assign_str(option);
),
- TP_printk("%d %s", __entry->is_remount, __get_str(options))
+ TP_printk("%d %s", __entry->is_remount, __get_str(option))
);
DEFINE_OCFS2_POINTER_EVENT(ocfs2_put_super);
@@ -1722,8 +1716,8 @@ TRACE_EVENT(ocfs2_initialize_super,
__field(int, cluster_bits)
),
TP_fast_assign(
- __assign_str(label, label);
- __assign_str(uuid_str, uuid_str);
+ __assign_str(label);
+ __assign_str(uuid_str);
__entry->root_dir = root_dir;
__entry->system_dir = system_dir;
__entry->cluster_bits = cluster_bits;
@@ -1750,7 +1744,7 @@ TRACE_EVENT(ocfs2_init_xattr_set_ctxt,
__field(int, credits)
),
TP_fast_assign(
- __assign_str(name, name);
+ __assign_str(name);
__entry->meta = meta;
__entry->clusters = clusters;
__entry->credits = credits;
@@ -1774,7 +1768,7 @@ DECLARE_EVENT_CLASS(ocfs2__xattr_find,
),
TP_fast_assign(
__entry->ino = ino;
- __assign_str(name, name);
+ __assign_str(name);
__entry->name_index = name_index;
__entry->hash = hash;
__entry->location = location;
@@ -2023,7 +2017,7 @@ TRACE_EVENT(ocfs2_sync_dquot_helper,
__entry->dq_id = dq_id;
__entry->dq_type = dq_type;
__entry->type = type;
- __assign_str(s_id, s_id);
+ __assign_str(s_id);
),
TP_printk("%u %u %lu %s", __entry->dq_id, __entry->dq_type,
__entry->type, __get_str(s_id))
@@ -2035,6 +2029,8 @@ DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_release_dquot);
DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_acquire_dquot);
+DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_get_next_id);
+
DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_mark_dquot_dirty);
/* End of trace events for fs/ocfs2/quota_global.c. */
@@ -2062,7 +2058,7 @@ TRACE_EVENT(ocfs2_dx_dir_search,
TP_fast_assign(
__entry->ino = ino;
__entry->namelen = namelen;
- __assign_str(name, name);
+ __assign_str(name);
__entry->major_hash = major_hash;
__entry->minor_hash = minor_hash;
__entry->blkno = blkno;
@@ -2090,7 +2086,7 @@ TRACE_EVENT(ocfs2_find_files_on_disk,
),
TP_fast_assign(
__entry->namelen = namelen;
- __assign_str(name, name);
+ __assign_str(name);
__entry->blkno = blkno;
__entry->dir = dir;
),
@@ -2109,7 +2105,7 @@ TRACE_EVENT(ocfs2_check_dir_for_entry,
TP_fast_assign(
__entry->dir = dir;
__entry->namelen = namelen;
- __assign_str(name, name);
+ __assign_str(name);
),
TP_printk("%llu %.*s", __entry->dir,
__entry->namelen, __get_str(name))
@@ -2137,7 +2133,7 @@ TRACE_EVENT(ocfs2_dx_dir_index_root_block,
__entry->major_hash = major_hash;
__entry->minor_hash = minor_hash;
__entry->namelen = namelen;
- __assign_str(name, name);
+ __assign_str(name);
__entry->num_used = num_used;
),
TP_printk("%llu %x %x %.*s %u", __entry->dir,
@@ -2173,7 +2169,7 @@ DECLARE_EVENT_CLASS(ocfs2__dentry_ops,
__entry->dir = dir;
__entry->dentry = dentry;
__entry->name_len = name_len;
- __assign_str(name, name);
+ __assign_str(name);
__entry->dir_blkno = dir_blkno;
__entry->extra = extra;
),
@@ -2219,7 +2215,7 @@ TRACE_EVENT(ocfs2_mknod,
__entry->dir = dir;
__entry->dentry = dentry;
__entry->name_len = name_len;
- __assign_str(name, name);
+ __assign_str(name);
__entry->dir_blkno = dir_blkno;
__entry->dev = dev;
__entry->mode = mode;
@@ -2243,9 +2239,9 @@ TRACE_EVENT(ocfs2_link,
TP_fast_assign(
__entry->ino = ino;
__entry->old_len = old_len;
- __assign_str(old_name, old_name);
+ __assign_str(old_name);
__entry->name_len = name_len;
- __assign_str(name, name);
+ __assign_str(name);
),
TP_printk("%llu %.*s %.*s", __entry->ino,
__entry->old_len, __get_str(old_name),
@@ -2281,9 +2277,9 @@ TRACE_EVENT(ocfs2_rename,
__entry->new_dir = new_dir;
__entry->new_dentry = new_dentry;
__entry->old_len = old_len;
- __assign_str(old_name, old_name);
+ __assign_str(old_name);
__entry->new_len = new_len;
- __assign_str(new_name, new_name);
+ __assign_str(new_name);
),
TP_printk("%p %p %p %p %.*s %.*s",
__entry->old_dir, __entry->old_dentry,
@@ -2292,6 +2288,8 @@ TRACE_EVENT(ocfs2_rename,
__entry->new_len, __get_str(new_name))
);
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_rename_not_permitted);
+
TRACE_EVENT(ocfs2_rename_target_exists,
TP_PROTO(int new_len, const char *new_name),
TP_ARGS(new_len, new_name),
@@ -2301,7 +2299,7 @@ TRACE_EVENT(ocfs2_rename_target_exists,
),
TP_fast_assign(
__entry->new_len = new_len;
- __assign_str(new_name, new_name);
+ __assign_str(new_name);
),
TP_printk("%.*s", __entry->new_len, __get_str(new_name))
);
@@ -2344,7 +2342,7 @@ TRACE_EVENT(ocfs2_symlink_begin,
__entry->dentry = dentry;
__entry->symname = symname;
__entry->len = len;
- __assign_str(name, name);
+ __assign_str(name);
),
TP_printk("%p %p %s %.*s", __entry->dir, __entry->dentry,
__entry->symname, __entry->len, __get_str(name))
@@ -2360,7 +2358,7 @@ TRACE_EVENT(ocfs2_blkno_stringify,
),
TP_fast_assign(
__entry->blkno = blkno;
- __assign_str(name, name);
+ __assign_str(name);
__entry->namelen = namelen;
),
TP_printk("%llu %s %d", __entry->blkno, __get_str(name),
@@ -2381,7 +2379,7 @@ TRACE_EVENT(ocfs2_orphan_del,
),
TP_fast_assign(
__entry->dir = dir;
- __assign_str(name, name);
+ __assign_str(name);
__entry->namelen = namelen;
),
TP_printk("%llu %s %d", __entry->dir, __get_str(name),
@@ -2403,7 +2401,7 @@ TRACE_EVENT(ocfs2_dentry_revalidate,
TP_fast_assign(
__entry->dentry = dentry;
__entry->len = len;
- __assign_str(name, name);
+ __assign_str(name);
),
TP_printk("%p %.*s", __entry->dentry, __entry->len, __get_str(name))
);
@@ -2420,7 +2418,7 @@ TRACE_EVENT(ocfs2_dentry_revalidate_negative,
),
TP_fast_assign(
__entry->len = len;
- __assign_str(name, name);
+ __assign_str(name);
__entry->pgen = pgen;
__entry->gen = gen;
),
@@ -2445,7 +2443,7 @@ TRACE_EVENT(ocfs2_find_local_alias,
),
TP_fast_assign(
__entry->len = len;
- __assign_str(name, name);
+ __assign_str(name);
),
TP_printk("%.*s", __entry->len, __get_str(name))
);
@@ -2462,7 +2460,7 @@ TRACE_EVENT(ocfs2_dentry_attach_lock,
),
TP_fast_assign(
__entry->len = len;
- __assign_str(name, name);
+ __assign_str(name);
__entry->parent = parent;
__entry->fsdata = fsdata;
),
@@ -2480,7 +2478,7 @@ TRACE_EVENT(ocfs2_dentry_attach_lock_found,
__field(unsigned long long, ino)
),
TP_fast_assign(
- __assign_str(name, name);
+ __assign_str(name);
__entry->parent = parent;
__entry->ino = ino;
),
@@ -2527,7 +2525,7 @@ TRACE_EVENT(ocfs2_get_parent,
TP_fast_assign(
__entry->child = child;
__entry->len = len;
- __assign_str(name, name);
+ __assign_str(name);
__entry->ino = ino;
),
TP_printk("%p %.*s %llu", __entry->child, __entry->len,
@@ -2551,7 +2549,7 @@ TRACE_EVENT(ocfs2_encode_fh_begin,
TP_fast_assign(
__entry->dentry = dentry;
__entry->name_len = name_len;
- __assign_str(name, name);
+ __assign_str(name);
__entry->fh = fh;
__entry->len = len;
__entry->connectable = connectable;
@@ -2577,8 +2575,12 @@ DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_commit_cache_end);
DEFINE_OCFS2_INT_INT_EVENT(ocfs2_extend_trans);
+DEFINE_OCFS2_INT_EVENT(ocfs2_assure_trans_credits);
+
DEFINE_OCFS2_INT_EVENT(ocfs2_extend_trans_restart);
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_allocate_extend_trans);
+
DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_journal_access);
DEFINE_OCFS2_ULL_EVENT(ocfs2_journal_dirty);
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index d5ab56cbe5c5..788a8de922a4 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* quota.h for OCFS2
*
@@ -17,6 +18,9 @@
#include "ocfs2.h"
+/* Number of quota types we support */
+#define OCFS2_MAXQUOTAS 2
+
/*
* In-memory structures
*/
@@ -28,6 +32,7 @@ struct ocfs2_dquot {
unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */
s64 dq_origspace; /* Last globally synced space usage */
s64 dq_originodes; /* Last globally synced inode usage */
+ struct llist_node list; /* Member of list of dquots to drop */
};
/* Description of one chunk to recover in memory */
@@ -38,12 +43,13 @@ struct ocfs2_recovery_chunk {
};
struct ocfs2_quota_recovery {
- struct list_head r_list[MAXQUOTAS]; /* List of chunks to recover */
+ struct list_head r_list[OCFS2_MAXQUOTAS]; /* List of chunks to recover */
};
/* In-memory structure with quota header information */
struct ocfs2_mem_dqinfo {
unsigned int dqi_type; /* Quota type this structure describes */
+ unsigned int dqi_flags; /* Flags OLQF_* */
unsigned int dqi_chunks; /* Number of chunks in local quota file */
unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */
unsigned int dqi_syncms; /* How often should we sync with other nodes */
@@ -77,7 +83,7 @@ struct ocfs2_quota_chunk {
extern struct kmem_cache *ocfs2_dquot_cachep;
extern struct kmem_cache *ocfs2_qf_chunk_cachep;
-extern struct qtree_fmt_operations ocfs2_global_ops;
+extern const struct qtree_fmt_operations ocfs2_global_ops;
struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
struct ocfs2_super *osb, int slot_num);
@@ -91,7 +97,6 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
const char *data, size_t len, loff_t off);
int ocfs2_global_read_info(struct super_block *sb, int type);
int ocfs2_global_write_info(struct super_block *sb, int type);
-int ocfs2_global_read_dquot(struct dquot *dquot);
int __ocfs2_sync_dquot(struct dquot *dquot, int freeing);
static inline int ocfs2_sync_dquot(struct dquot *dquot)
{
@@ -110,6 +115,7 @@ int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block,
int ocfs2_create_local_dquot(struct dquot *dquot);
int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot);
int ocfs2_local_write_dquot(struct dquot *dquot);
+void ocfs2_drop_dquot_refs(struct work_struct *work);
extern const struct dquot_operations ocfs2_quota_operations;
extern struct quota_format_type ocfs2_quota_format;
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 332a281f217e..e85b1ccf81be 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Implementation of operations over global quota file
*/
@@ -10,6 +11,8 @@
#include <linux/jiffies.h>
#include <linux/writeback.h>
#include <linux/workqueue.h>
+#include <linux/llist.h>
+#include <linux/iversion.h>
#include <cluster/masklog.h>
@@ -32,8 +35,8 @@
* Locking of quotas with OCFS2 is rather complex. Here are rules that
* should be obeyed by all the functions:
* - any write of quota structure (either to local or global file) is protected
- * by dqio_mutex or dquot->dq_lock.
- * - any modification of global quota file holds inode cluster lock, i_mutex,
+ * by dqio_sem or dquot->dq_lock.
+ * - any modification of global quota file holds inode cluster lock, i_rwsem,
* and ip_alloc_sem of the global quota file (achieved by
* ocfs2_lock_global_qf). It also has to hold qinfo_lock.
* - an allocation of new blocks for local quota file is protected by
@@ -41,9 +44,9 @@
*
* A rough sketch of locking dependencies (lf = local file, gf = global file):
* Normal filesystem operation:
- * start_trans -> dqio_mutex -> write to lf
+ * start_trans -> dqio_sem -> write to lf
* Syncing of local and global file:
- * ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock ->
+ * ocfs2_lock_global_qf -> start_trans -> dqio_sem -> qinfo_lock ->
* write to gf
* -> write to lf
* Acquire dquot for the first time:
@@ -59,7 +62,7 @@
* Recovery:
* inode cluster lock of recovered lf
* -> read bitmaps -> ip_alloc_sem of lf
- * -> ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock ->
+ * -> ocfs2_lock_global_qf -> start_trans -> dqio_sem -> qinfo_lock ->
* write to gf
*/
@@ -122,7 +125,7 @@ static int ocfs2_global_is_id(void *dp, struct dquot *dquot)
dquot->dq_id);
}
-struct qtree_fmt_operations ocfs2_global_ops = {
+const struct qtree_fmt_operations ocfs2_global_ops = {
.mem2disk_dqblk = ocfs2_global_mem2diskdqb,
.disk2mem_dqblk = ocfs2_global_disk2memdqb,
.is_id = ocfs2_global_is_id,
@@ -234,7 +237,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset;
}
- if (gqinode->i_size < off + len) {
+ if (i_size_read(gqinode) < off + len) {
loff_t rounded_end =
ocfs2_align_bytes_to_blocks(sb, off + len);
@@ -270,7 +273,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
if (new)
memset(bh->b_data, 0, sb->s_blocksize);
memcpy(bh->b_data + offset, data, len);
- flush_dcache_page(bh->b_page);
+ flush_dcache_folio(bh->b_folio);
set_buffer_uptodate(bh);
unlock_buffer(bh);
ocfs2_set_buffer_uptodate(INODE_CACHE(gqinode), bh);
@@ -287,7 +290,7 @@ out:
mlog_errno(err);
return err;
}
- gqinode->i_version++;
+ inode_inc_iversion(gqinode);
ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh);
return len;
}
@@ -307,7 +310,7 @@ int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
WARN_ON(bh != oinfo->dqi_gqi_bh);
spin_unlock(&dq_data_lock);
if (ex) {
- mutex_lock(&oinfo->dqi_gqinode->i_mutex);
+ inode_lock(oinfo->dqi_gqinode);
down_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
} else {
down_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
@@ -319,7 +322,7 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
{
if (ex) {
up_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
- mutex_unlock(&oinfo->dqi_gqinode->i_mutex);
+ inode_unlock(oinfo->dqi_gqinode);
} else {
up_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
}
@@ -334,24 +337,14 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
/* Read information header from global quota file */
int ocfs2_global_read_info(struct super_block *sb, int type)
{
- struct inode *gqinode = NULL;
- unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
- GROUP_QUOTA_SYSTEM_INODE };
+ unsigned int ino[OCFS2_MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
+ GROUP_QUOTA_SYSTEM_INODE };
struct ocfs2_global_disk_dqinfo dinfo;
struct mem_dqinfo *info = sb_dqinfo(sb, type);
struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
u64 pcount;
int status;
- /* Read global header */
- gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
- OCFS2_INVALID_SLOT);
- if (!gqinode) {
- mlog(ML_ERROR, "failed to get global quota inode (type=%d)\n",
- type);
- status = -EINVAL;
- goto out_err;
- }
oinfo->dqi_gi.dqi_sb = sb;
oinfo->dqi_gi.dqi_type = type;
ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo);
@@ -359,21 +352,35 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops;
oinfo->dqi_gqi_bh = NULL;
oinfo->dqi_gqi_count = 0;
- oinfo->dqi_gqinode = gqinode;
+
+ /* Read global header */
+ oinfo->dqi_gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
+ OCFS2_INVALID_SLOT);
+ if (!oinfo->dqi_gqinode) {
+ mlog(ML_ERROR, "failed to get global quota inode (type=%d)\n",
+ type);
+ status = -EINVAL;
+ goto out_err;
+ }
+
status = ocfs2_lock_global_qf(oinfo, 0);
if (status < 0) {
mlog_errno(status);
goto out_err;
}
- status = ocfs2_extent_map_get_blocks(gqinode, 0, &oinfo->dqi_giblk,
+ status = ocfs2_extent_map_get_blocks(oinfo->dqi_gqinode, 0, &oinfo->dqi_giblk,
&pcount, NULL);
- if (status < 0)
+ if (status < 0) {
+ mlog_errno(status);
goto out_unlock;
+ }
status = ocfs2_qinfo_lock(oinfo, 0);
- if (status < 0)
+ if (status < 0) {
+ mlog_errno(status);
goto out_unlock;
+ }
status = sb->s_op->quota_read(sb, type, (char *)&dinfo,
sizeof(struct ocfs2_global_disk_dqinfo),
OCFS2_GLOBAL_INFO_OFF);
@@ -401,15 +408,14 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
schedule_delayed_work(&oinfo->dqi_sync_work,
msecs_to_jiffies(oinfo->dqi_syncms));
-out_err:
- return status;
+ return 0;
out_unlock:
ocfs2_unlock_global_qf(oinfo, 0);
- mlog_errno(status);
- goto out_err;
+out_err:
+ return status;
}
-/* Write information to global quota file. Expects exlusive lock on quota
+/* Write information to global quota file. Expects exclusive lock on quota
* file inode and quota info */
static int __ocfs2_global_write_info(struct super_block *sb, int type)
{
@@ -442,13 +448,20 @@ static int __ocfs2_global_write_info(struct super_block *sb, int type)
int ocfs2_global_write_info(struct super_block *sb, int type)
{
int err;
- struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
+ struct quota_info *dqopt = sb_dqopt(sb);
+ struct ocfs2_mem_dqinfo *info = dqopt->info[type].dqi_priv;
+ unsigned int memalloc;
+ down_write(&dqopt->dqio_sem);
+ memalloc = memalloc_nofs_save();
err = ocfs2_qinfo_lock(info, 1);
if (err < 0)
- return err;
+ goto out_sem;
err = __ocfs2_global_write_info(sb, type);
ocfs2_qinfo_unlock(info, 1);
+out_sem:
+ memalloc_nofs_restore(memalloc);
+ up_write(&dqopt->dqio_sem);
return err;
}
@@ -482,7 +495,7 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
struct ocfs2_global_disk_dqblk dqblk;
s64 spacechange, inodechange;
- time_t olditime, oldbtime;
+ time64_t olditime, oldbtime;
err = sb->s_op->quota_read(sb, type, (char *)&dqblk,
sizeof(struct ocfs2_global_disk_dqblk),
@@ -499,7 +512,7 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
/* Update space and inode usage. Get also other information from
* global quota file so that we don't overwrite any changes there.
* We are */
- spin_lock(&dq_data_lock);
+ spin_lock(&dquot->dq_dqb_lock);
spacechange = dquot->dq_dqb.dqb_curspace -
OCFS2_DQUOT(dquot)->dq_origspace;
inodechange = dquot->dq_dqb.dqb_curinodes -
@@ -555,7 +568,7 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
__clear_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
- spin_unlock(&dq_data_lock);
+ spin_unlock(&dquot->dq_dqb_lock);
err = ocfs2_qinfo_lock(info, freeing);
if (err < 0) {
mlog(ML_ERROR, "Failed to lock quota info, losing quota write"
@@ -594,6 +607,7 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
struct ocfs2_super *osb = OCFS2_SB(sb);
int status = 0;
+ unsigned int memalloc;
trace_ocfs2_sync_dquot_helper(from_kqid(&init_user_ns, dquot->dq_id),
dquot->dq_id.type,
@@ -610,7 +624,8 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
mlog_errno(status);
goto out_ilock;
}
- mutex_lock(&sb_dqopt(sb)->dqio_mutex);
+ down_write(&sb_dqopt(sb)->dqio_sem);
+ memalloc = memalloc_nofs_save();
status = ocfs2_sync_dquot(dquot);
if (status < 0)
mlog_errno(status);
@@ -618,7 +633,8 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
status = ocfs2_local_write_dquot(dquot);
if (status < 0)
mlog_errno(status);
- mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
+ memalloc_nofs_restore(memalloc);
+ up_write(&sb_dqopt(sb)->dqio_sem);
ocfs2_commit_trans(osb, handle);
out_ilock:
ocfs2_unlock_global_qf(oinfo, 1);
@@ -633,7 +649,15 @@ static void qsync_work_fn(struct work_struct *work)
dqi_sync_work.work);
struct super_block *sb = oinfo->dqi_gqinode->i_sb;
- dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type);
+ /*
+ * We have to be careful here not to deadlock on s_umount as umount
+ * disabling quotas may be in progress and it waits for this work to
+ * complete. If trylock fails, we'll do the sync next time...
+ */
+ if (down_read_trylock(&sb->s_umount)) {
+ dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type);
+ up_read(&sb->s_umount);
+ }
schedule_delayed_work(&oinfo->dqi_sync_work,
msecs_to_jiffies(oinfo->dqi_syncms));
}
@@ -647,6 +671,7 @@ static int ocfs2_write_dquot(struct dquot *dquot)
handle_t *handle;
struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
int status = 0;
+ unsigned int memalloc;
trace_ocfs2_write_dquot(from_kqid(&init_user_ns, dquot->dq_id),
dquot->dq_id.type);
@@ -657,9 +682,11 @@ static int ocfs2_write_dquot(struct dquot *dquot)
mlog_errno(status);
goto out;
}
- mutex_lock(&sb_dqopt(dquot->dq_sb)->dqio_mutex);
+ down_write(&sb_dqopt(dquot->dq_sb)->dqio_sem);
+ memalloc = memalloc_nofs_save();
status = ocfs2_local_write_dquot(dquot);
- mutex_unlock(&sb_dqopt(dquot->dq_sb)->dqio_mutex);
+ memalloc_nofs_restore(memalloc);
+ up_write(&sb_dqopt(dquot->dq_sb)->dqio_sem);
ocfs2_commit_trans(osb, handle);
out:
return status;
@@ -679,6 +706,27 @@ static int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
OCFS2_INODE_UPDATE_CREDITS;
}
+void ocfs2_drop_dquot_refs(struct work_struct *work)
+{
+ struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
+ dquot_drop_work);
+ struct llist_node *list;
+ struct ocfs2_dquot *odquot, *next_odquot;
+
+ list = llist_del_all(&osb->dquot_drop_list);
+ llist_for_each_entry_safe(odquot, next_odquot, list, list) {
+ /* Drop the reference we acquired in ocfs2_dquot_release() */
+ dqput(&odquot->dq_dquot);
+ }
+}
+
+/*
+ * Called when the last reference to dquot is dropped. If we are called from
+ * downconvert thread, we cannot do all the handling here because grabbing
+ * quota lock could deadlock (the node holding the quota lock could need some
+ * other cluster lock to proceed but with blocked downconvert thread we cannot
+ * release any lock).
+ */
static int ocfs2_release_dquot(struct dquot *dquot)
{
handle_t *handle;
@@ -692,14 +740,32 @@ static int ocfs2_release_dquot(struct dquot *dquot)
mutex_lock(&dquot->dq_lock);
/* Check whether we are not racing with some other dqget() */
- if (atomic_read(&dquot->dq_count) > 1)
+ if (dquot_is_busy(dquot))
+ goto out;
+ /* Running from downconvert thread? Postpone quota processing to wq */
+ if (current == osb->dc_task) {
+ /*
+ * Grab our own reference to dquot and queue it for delayed
+ * dropping. Quota code rechecks after calling
+ * ->release_dquot() and won't free dquot structure.
+ */
+ dqgrab(dquot);
+ /* First entry on list -> queue work */
+ if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list))
+ queue_work(osb->ocfs2_wq, &osb->dquot_drop_work);
goto out;
+ }
status = ocfs2_lock_global_qf(oinfo, 1);
if (status < 0)
goto out;
handle = ocfs2_start_trans(osb,
ocfs2_calc_qdel_credits(dquot->dq_sb, dquot->dq_id.type));
if (IS_ERR(handle)) {
+ /*
+ * Mark dquot as inactive to avoid endless cycle in
+ * quota_release_workfn().
+ */
+ clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);
status = PTR_ERR(handle);
mlog_errno(status);
goto out_ilock;
@@ -717,6 +783,12 @@ static int ocfs2_release_dquot(struct dquot *dquot)
*/
if (status < 0)
mlog_errno(status);
+ /*
+ * Clear dq_off so that we search for the structure in quota file next
+ * time we acquire it. The structure might be deleted and reallocated
+ * elsewhere by another node while our dquot structure is on freelist.
+ */
+ dquot->dq_off = 0;
clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);
out_trans:
ocfs2_commit_trans(osb, handle);
@@ -756,16 +828,17 @@ static int ocfs2_acquire_dquot(struct dquot *dquot)
status = ocfs2_lock_global_qf(info, 1);
if (status < 0)
goto out;
- if (!test_bit(DQ_READ_B, &dquot->dq_flags)) {
- status = ocfs2_qinfo_lock(info, 0);
- if (status < 0)
- goto out_dq;
- status = qtree_read_dquot(&info->dqi_gi, dquot);
- ocfs2_qinfo_unlock(info, 0);
- if (status < 0)
- goto out_dq;
- }
- set_bit(DQ_READ_B, &dquot->dq_flags);
+ status = ocfs2_qinfo_lock(info, 0);
+ if (status < 0)
+ goto out_dq;
+ /*
+ * We always want to read dquot structure from disk because we don't
+ * know what happened with it while it was on freelist.
+ */
+ status = qtree_read_dquot(&info->dqi_gi, dquot);
+ ocfs2_qinfo_unlock(info, 0);
+ if (status < 0)
+ goto out_dq;
OCFS2_DQUOT(dquot)->dq_use_count++;
OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
@@ -778,8 +851,8 @@ static int ocfs2_acquire_dquot(struct dquot *dquot)
*/
WARN_ON(journal_current_handle());
status = ocfs2_extend_no_holes(gqinode, NULL,
- gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
- gqinode->i_size);
+ i_size_read(gqinode) + (need_alloc << sb->s_blocksize_bits),
+ i_size_read(gqinode));
if (status < 0)
goto out_dq;
}
@@ -818,6 +891,37 @@ out:
return status;
}
+static int ocfs2_get_next_id(struct super_block *sb, struct kqid *qid)
+{
+ int type = qid->type;
+ struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
+ int status = 0;
+
+ trace_ocfs2_get_next_id(from_kqid(&init_user_ns, *qid), type);
+ if (!sb_has_quota_active(sb, type)) {
+ status = -ESRCH;
+ goto out;
+ }
+ status = ocfs2_lock_global_qf(info, 0);
+ if (status < 0)
+ goto out;
+ status = ocfs2_qinfo_lock(info, 0);
+ if (status < 0)
+ goto out_global;
+ status = qtree_get_next_id(&info->dqi_gi, qid);
+ ocfs2_qinfo_unlock(info, 0);
+out_global:
+ ocfs2_unlock_global_qf(info, 0);
+out:
+ /*
+ * Avoid logging ENOENT since it just means there isn't next ID and
+ * ESRCH which means quota isn't enabled for the filesystem.
+ */
+ if (status && status != -ENOENT && status != -ESRCH)
+ mlog_errno(status);
+ return status;
+}
+
static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
{
unsigned long mask = (1 << (DQ_LASTSET_B + QIF_ILIMITS_B)) |
@@ -833,16 +937,17 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
handle_t *handle;
struct ocfs2_super *osb = OCFS2_SB(sb);
+ unsigned int memalloc;
trace_ocfs2_mark_dquot_dirty(from_kqid(&init_user_ns, dquot->dq_id),
type);
/* In case user set some limits, sync dquot immediately to global
* quota file so that information propagates quicker */
- spin_lock(&dq_data_lock);
+ spin_lock(&dquot->dq_dqb_lock);
if (dquot->dq_flags & mask)
sync = 1;
- spin_unlock(&dq_data_lock);
+ spin_unlock(&dquot->dq_dqb_lock);
/* This is a slight hack but we can't afford getting global quota
* lock if we already have a transaction started. */
if (!sync || journal_current_handle()) {
@@ -858,7 +963,8 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
mlog_errno(status);
goto out_ilock;
}
- mutex_lock(&sb_dqopt(sb)->dqio_mutex);
+ down_write(&sb_dqopt(sb)->dqio_sem);
+ memalloc = memalloc_nofs_save();
status = ocfs2_sync_dquot(dquot);
if (status < 0) {
mlog_errno(status);
@@ -867,7 +973,8 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
/* Now write updated local dquot structure */
status = ocfs2_local_write_dquot(dquot);
out_dlock:
- mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
+ memalloc_nofs_restore(memalloc);
+ up_write(&sb_dqopt(sb)->dqio_sem);
ocfs2_commit_trans(osb, handle);
out_ilock:
ocfs2_unlock_global_qf(oinfo, 1);
@@ -926,4 +1033,5 @@ const struct dquot_operations ocfs2_quota_operations = {
.write_info = ocfs2_write_info,
.alloc_dquot = ocfs2_alloc_dquot,
.destroy_dquot = ocfs2_destroy_dquot,
+ .get_next_id = ocfs2_get_next_id,
};
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 27fe7ee4874c..de7f12858729 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Implementation of operations over local quota file
*/
@@ -73,12 +74,6 @@ static loff_t ol_dqblk_off(struct super_block *sb, int c, int off)
ol_dqblk_block_off(sb, c, off);
}
-/* Compute block number from given offset */
-static inline unsigned int ol_dqblk_file_block(struct super_block *sb, loff_t off)
-{
- return off >> sb->s_blocksize_bits;
-}
-
static inline unsigned int ol_dqblk_block_offset(struct super_block *sb, loff_t off)
{
return off & ((1 << sb->s_blocksize_bits) - 1);
@@ -142,15 +137,13 @@ static int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
int rc = 0;
struct buffer_head *tmp = *bh;
- if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) {
- ocfs2_error(inode->i_sb,
- "Quota file %llu is probably corrupted! Requested "
- "to read block %Lu but file has size only %Lu\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- (unsigned long long)v_block,
- (unsigned long long)i_size_read(inode));
- return -EIO;
- }
+ if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block)
+ return ocfs2_error(inode->i_sb,
+ "Quota file %llu is probably corrupted! Requested to read block %Lu but file has size only %Lu\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)v_block,
+ (unsigned long long)i_size_read(inode));
+
rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
ocfs2_validate_quota_block);
if (rc)
@@ -166,12 +159,12 @@ static int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
/* Check whether we understand format of quota files */
static int ocfs2_local_check_quota_file(struct super_block *sb, int type)
{
- unsigned int lmagics[MAXQUOTAS] = OCFS2_LOCAL_QMAGICS;
- unsigned int lversions[MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS;
- unsigned int gmagics[MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS;
- unsigned int gversions[MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS;
- unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
- GROUP_QUOTA_SYSTEM_INODE };
+ unsigned int lmagics[OCFS2_MAXQUOTAS] = OCFS2_LOCAL_QMAGICS;
+ unsigned int lversions[OCFS2_MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS;
+ unsigned int gmagics[OCFS2_MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS;
+ unsigned int gversions[OCFS2_MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS;
+ unsigned int ino[OCFS2_MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
+ GROUP_QUOTA_SYSTEM_INODE };
struct buffer_head *bh = NULL;
struct inode *linode = sb_dqopt(sb)->files[type];
struct inode *ginode = NULL;
@@ -292,7 +285,7 @@ static void olq_update_info(struct buffer_head *bh, void *private)
ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
OCFS2_LOCAL_INFO_OFF);
spin_lock(&dq_data_lock);
- ldinfo->dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK);
+ ldinfo->dqi_flags = cpu_to_le32(oinfo->dqi_flags);
ldinfo->dqi_chunks = cpu_to_le32(oinfo->dqi_chunks);
ldinfo->dqi_blocks = cpu_to_le32(oinfo->dqi_blocks);
spin_unlock(&dq_data_lock);
@@ -336,7 +329,7 @@ void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec)
{
int type;
- for (type = 0; type < MAXQUOTAS; type++)
+ for (type = 0; type < OCFS2_MAXQUOTAS; type++)
free_recovery_list(&(rec->r_list[type]));
kfree(rec);
}
@@ -382,7 +375,7 @@ static struct ocfs2_quota_recovery *ocfs2_alloc_quota_recovery(void)
rec = kmalloc(sizeof(struct ocfs2_quota_recovery), GFP_NOFS);
if (!rec)
return NULL;
- for (type = 0; type < MAXQUOTAS; type++)
+ for (type = 0; type < OCFS2_MAXQUOTAS; type++)
INIT_LIST_HEAD(&(rec->r_list[type]));
return rec;
}
@@ -392,10 +385,11 @@ struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
struct ocfs2_super *osb,
int slot_num)
{
- unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
- OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
- unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
- LOCAL_GROUP_QUOTA_SYSTEM_INODE };
+ unsigned int feature[OCFS2_MAXQUOTAS] = {
+ OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+ unsigned int ino[OCFS2_MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
+ LOCAL_GROUP_QUOTA_SYSTEM_INODE };
struct super_block *sb = osb->sb;
struct ocfs2_local_disk_dqinfo *ldinfo;
struct inode *lqinode;
@@ -412,7 +406,7 @@ struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
return ERR_PTR(-ENOMEM);
/* First init... */
- for (type = 0; type < MAXQUOTAS; type++) {
+ for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
continue;
/* At this point, journal of the slot is already replayed so
@@ -459,8 +453,7 @@ out:
/* Sync changes in local quota file into global quota file and
* reinitialize local quota file.
- * The function expects local quota file to be already locked and
- * dqonoff_mutex locked. */
+ * The function expects local quota file to be already locked. */
static int ocfs2_recover_local_quota_file(struct inode *lqinode,
int type,
struct ocfs2_quota_recovery *rec)
@@ -476,6 +469,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
int bit, chunk;
struct ocfs2_recovery_chunk *rchunk, *next;
qsize_t spacechange, inodechange;
+ unsigned int memalloc;
trace_ocfs2_recover_local_quota_file((unsigned long)lqinode->i_ino, type);
@@ -504,8 +498,8 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
dquot = dqget(sb,
make_kqid(&init_user_ns, type,
le64_to_cpu(dqblk->dqb_id)));
- if (!dquot) {
- status = -EIO;
+ if (IS_ERR(dquot)) {
+ status = PTR_ERR(dquot);
mlog(ML_ERROR, "Failed to get quota structure "
"for id %u, type %d. Cannot finish quota "
"file recovery.\n",
@@ -526,8 +520,9 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
mlog_errno(status);
goto out_drop_lock;
}
- mutex_lock(&sb_dqopt(sb)->dqio_mutex);
- spin_lock(&dq_data_lock);
+ down_write(&sb_dqopt(sb)->dqio_sem);
+ memalloc = memalloc_nofs_save();
+ spin_lock(&dquot->dq_dqb_lock);
/* Add usage from quota entry into quota changes
* of our node. Auxiliary variables are important
* due to signedness */
@@ -535,7 +530,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
inodechange = le64_to_cpu(dqblk->dqb_inodemod);
dquot->dq_dqb.dqb_curspace += spacechange;
dquot->dq_dqb.dqb_curinodes += inodechange;
- spin_unlock(&dq_data_lock);
+ spin_unlock(&dquot->dq_dqb_lock);
/* We want to drop reference held by the crashed
* node. Since we have our own reference we know
* global structure actually won't be freed. */
@@ -559,7 +554,8 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
unlock_buffer(qbh);
ocfs2_journal_dirty(handle, qbh);
out_commit:
- mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
+ memalloc_nofs_restore(memalloc);
+ up_write(&sb_dqopt(sb)->dqio_sem);
ocfs2_commit_trans(OCFS2_SB(sb), handle);
out_drop_lock:
ocfs2_unlock_global_qf(oinfo, 1);
@@ -589,9 +585,8 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
struct ocfs2_quota_recovery *rec,
int slot_num)
{
- unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
- LOCAL_GROUP_QUOTA_SYSTEM_INODE };
- struct super_block *sb = osb->sb;
+ unsigned int ino[OCFS2_MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
+ LOCAL_GROUP_QUOTA_SYSTEM_INODE };
struct ocfs2_local_disk_dqinfo *ldinfo;
struct buffer_head *bh;
handle_t *handle;
@@ -603,8 +598,7 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
printk(KERN_NOTICE "ocfs2: Finishing quota recovery on device (%s) for "
"slot %u\n", osb->dev_str, slot_num);
- mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
- for (type = 0; type < MAXQUOTAS; type++) {
+ for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
if (list_empty(&(rec->r_list[type])))
continue;
trace_ocfs2_finish_quota_recovery(slot_num);
@@ -680,8 +674,7 @@ out_put:
break;
}
out:
- mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
- kfree(rec);
+ ocfs2_free_quota_recovery(rec);
return status;
}
@@ -695,17 +688,15 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
int status;
struct buffer_head *bh = NULL;
struct ocfs2_quota_recovery *rec;
- int locked = 0;
+ int locked = 0, global_read = 0;
- /* We don't need the lock and we have to acquire quota file locks
- * which will later depend on this lock */
- mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
- info->dqi_maxblimit = 0x7fffffffffffffffLL;
- info->dqi_maxilimit = 0x7fffffffffffffffLL;
+ info->dqi_max_spc_limit = 0x7fffffffffffffffLL;
+ info->dqi_max_ino_limit = 0x7fffffffffffffffLL;
oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS);
if (!oinfo) {
mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota"
" info.");
+ status = -ENOMEM;
goto out_err;
}
info->dqi_priv = oinfo;
@@ -718,6 +709,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
status = ocfs2_global_read_info(sb, type);
if (status < 0)
goto out_err;
+ global_read = 1;
status = ocfs2_inode_lock(lqinode, &oinfo->dqi_lqi_bh, 1);
if (status < 0) {
@@ -736,13 +728,13 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
}
ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
OCFS2_LOCAL_INFO_OFF);
- info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags);
+ oinfo->dqi_flags = le32_to_cpu(ldinfo->dqi_flags);
oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks);
oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks);
oinfo->dqi_libh = bh;
/* We crashed when using local quota file? */
- if (!(info->dqi_flags & OLQF_CLEAN)) {
+ if (!(oinfo->dqi_flags & OLQF_CLEAN)) {
rec = OCFS2_SB(sb)->quota_rec;
if (!rec) {
rec = ocfs2_alloc_quota_recovery();
@@ -771,14 +763,13 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
}
/* Now mark quota file as used */
- info->dqi_flags &= ~OLQF_CLEAN;
+ oinfo->dqi_flags &= ~OLQF_CLEAN;
status = ocfs2_modify_bh(lqinode, bh, olq_update_info, info);
if (status < 0) {
mlog_errno(status);
goto out_err;
}
- mutex_lock(&sb_dqopt(sb)->dqio_mutex);
return 0;
out_err:
if (oinfo) {
@@ -789,11 +780,12 @@ out_err:
if (locked)
ocfs2_inode_unlock(lqinode, 1);
ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);
+ if (global_read)
+ cancel_delayed_work_sync(&oinfo->dqi_sync_work);
kfree(oinfo);
}
brelse(bh);
- mutex_lock(&sb_dqopt(sb)->dqio_mutex);
- return -1;
+ return status;
}
/* Write local info to quota file */
@@ -822,7 +814,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
struct ocfs2_quota_chunk *chunk;
struct ocfs2_local_disk_chunk *dchunk;
int mark_clean = 1, len;
- int status;
+ int status = 0;
iput(oinfo->dqi_gqinode);
ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
@@ -846,7 +838,9 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
}
ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);
- /* dqonoff_mutex protects us against racing with recovery thread... */
+ /*
+ * ocfs2_dismount_volume() has already aborted quota recovery...
+ */
if (oinfo->dqi_rec) {
ocfs2_free_quota_recovery(oinfo->dqi_rec);
mark_clean = 0;
@@ -856,22 +850,20 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
goto out;
/* Mark local file as clean */
- info->dqi_flags |= OLQF_CLEAN;
+ oinfo->dqi_flags |= OLQF_CLEAN;
status = ocfs2_modify_bh(sb_dqopt(sb)->files[type],
oinfo->dqi_libh,
olq_update_info,
info);
- if (status < 0) {
+ if (status < 0)
mlog_errno(status);
- goto out;
- }
-
out:
ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1);
brelse(oinfo->dqi_libh);
brelse(oinfo->dqi_lqi_bh);
kfree(oinfo);
- return 0;
+ info->dqi_priv = NULL;
+ return status;
}
static void olq_set_dquot(struct buffer_head *bh, void *private)
@@ -885,12 +877,12 @@ static void olq_set_dquot(struct buffer_head *bh, void *private)
dqblk->dqb_id = cpu_to_le64(from_kqid(&init_user_ns,
od->dq_dquot.dq_id));
- spin_lock(&dq_data_lock);
+ spin_lock(&od->dq_dquot.dq_dqb_lock);
dqblk->dqb_spacemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curspace -
od->dq_origspace);
dqblk->dqb_inodemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curinodes -
od->dq_originodes);
- spin_unlock(&dq_data_lock);
+ spin_unlock(&od->dq_dquot.dq_dqb_lock);
trace_olq_set_dquot(
(unsigned long long)le64_to_cpu(dqblk->dqb_spacemod),
(unsigned long long)le64_to_cpu(dqblk->dqb_inodemod),
@@ -929,19 +921,19 @@ static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb,
{
struct mem_dqinfo *info = sb_dqinfo(sb, type);
struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
- struct ocfs2_quota_chunk *chunk;
+ struct ocfs2_quota_chunk *chunk = NULL, *iter;
struct ocfs2_local_disk_chunk *dchunk;
int found = 0, len;
- list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) {
+ list_for_each_entry(iter, &oinfo->dqi_chunk, qc_chunk) {
dchunk = (struct ocfs2_local_disk_chunk *)
- chunk->qc_headerbh->b_data;
+ iter->qc_headerbh->b_data;
if (le32_to_cpu(dchunk->dqc_free) > 0) {
- found = 1;
+ chunk = iter;
break;
}
}
- if (!found)
+ if (!chunk)
return NULL;
if (chunk->qc_num < oinfo->dqi_chunks - 1) {
@@ -982,14 +974,14 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
/* We are protected by dqio_sem so no locking needed */
status = ocfs2_extend_no_holes(lqinode, NULL,
- lqinode->i_size + 2 * sb->s_blocksize,
- lqinode->i_size);
+ i_size_read(lqinode) + 2 * sb->s_blocksize,
+ i_size_read(lqinode));
if (status < 0) {
mlog_errno(status);
goto out;
}
status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
- lqinode->i_size + 2 * sb->s_blocksize);
+ i_size_read(lqinode) + 2 * sb->s_blocksize);
if (status < 0) {
mlog_errno(status);
goto out;
@@ -1125,14 +1117,14 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
/* We are protected by dqio_sem so no locking needed */
status = ocfs2_extend_no_holes(lqinode, NULL,
- lqinode->i_size + sb->s_blocksize,
- lqinode->i_size);
+ i_size_read(lqinode) + sb->s_blocksize,
+ i_size_read(lqinode));
if (status < 0) {
mlog_errno(status);
goto out;
}
status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
- lqinode->i_size + sb->s_blocksize);
+ i_size_read(lqinode) + sb->s_blocksize);
if (status < 0) {
mlog_errno(status);
goto out;
@@ -1251,6 +1243,10 @@ int ocfs2_create_local_dquot(struct dquot *dquot)
&od->dq_local_phys_blk,
&pcount,
NULL);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
/* Initialize dquot structure on disk */
status = ocfs2_local_write_dquot(dquot);
@@ -1303,10 +1299,6 @@ int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot)
ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
out:
- /* Clear the read bit so that next time someone uses this
- * dquot he reads fresh info from disk and allocates local
- * dquot structure */
- clear_bit(DQ_READ_B, &dquot->dq_flags);
return status;
}
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 998b17eda09d..c92e0ea85bca 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -1,18 +1,8 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-only
+/*
* refcounttree.c
*
* Copyright (C) 2009 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#include <linux/sort.h>
@@ -34,6 +24,8 @@
#include "xattr.h"
#include "namei.h"
#include "ocfs2_trace.h"
+#include "file.h"
+#include "symlink.h"
#include <linux/bio.h>
#include <linux/blkdev.h>
@@ -42,14 +34,15 @@
#include <linux/pagevec.h>
#include <linux/swap.h>
#include <linux/security.h>
+#include <linux/string.h>
#include <linux/fsnotify.h>
#include <linux/quotaops.h>
#include <linux/namei.h>
#include <linux/mount.h>
+#include <linux/posix_acl.h>
struct ocfs2_cow_context {
struct inode *inode;
- struct file *file;
u32 cow_start;
u32 cow_len;
struct ocfs2_extent_tree data_et;
@@ -66,7 +59,7 @@ struct ocfs2_cow_context {
u32 *num_clusters,
unsigned int *extent_flags);
int (*cow_duplicate_clusters)(handle_t *handle,
- struct file *file,
+ struct inode *inode,
u32 cpos, u32 old_cluster,
u32 new_cluster, u32 new_len);
};
@@ -102,32 +95,30 @@ static int ocfs2_validate_refcount_block(struct super_block *sb,
if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) {
- ocfs2_error(sb,
- "Refcount block #%llu has bad signature %.*s",
- (unsigned long long)bh->b_blocknr, 7,
- rb->rf_signature);
- return -EINVAL;
+ rc = ocfs2_error(sb,
+ "Refcount block #%llu has bad signature %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7,
+ rb->rf_signature);
+ goto out;
}
if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) {
- ocfs2_error(sb,
- "Refcount block #%llu has an invalid rf_blkno "
- "of %llu",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)le64_to_cpu(rb->rf_blkno));
- return -EINVAL;
+ rc = ocfs2_error(sb,
+ "Refcount block #%llu has an invalid rf_blkno of %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(rb->rf_blkno));
+ goto out;
}
if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) {
- ocfs2_error(sb,
- "Refcount block #%llu has an invalid "
- "rf_fs_generation of #%u",
- (unsigned long long)bh->b_blocknr,
- le32_to_cpu(rb->rf_fs_generation));
- return -EINVAL;
+ rc = ocfs2_error(sb,
+ "Refcount block #%llu has an invalid rf_fs_generation of #%u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(rb->rf_fs_generation));
+ goto out;
}
-
- return 0;
+out:
+ return rc;
}
static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci,
@@ -163,6 +154,7 @@ ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci)
}
static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci)
+__acquires(&rf->rf_lock)
{
struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
@@ -170,6 +162,7 @@ static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci)
}
static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci)
+__releases(&rf->rf_lock)
{
struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
@@ -412,7 +405,7 @@ static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno)
goto out;
}
- BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+ BUG_ON(!ocfs2_is_refcount_inode(inode));
di = (struct ocfs2_dinode *)di_bh->b_data;
*ref_blkno = le64_to_cpu(di->i_refcount_loc);
@@ -480,7 +473,6 @@ again:
if (ret) {
mlog_errno(ret);
ocfs2_unlock_refcount_tree(osb, tree, rw);
- ocfs2_refcount_tree_put(tree);
goto out;
}
@@ -572,10 +564,10 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
u32 num_got;
u64 suballoc_loc, first_blkno;
- BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
+ BUG_ON(ocfs2_is_refcount_inode(inode));
trace_ocfs2_create_refcount_tree(
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ (unsigned long long)oi->ip_blkno);
ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
if (ret) {
@@ -613,6 +605,11 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
}
new_bh = sb_getblk(inode->i_sb, first_blkno);
+ if (!new_bh) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out_commit;
+ }
ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh);
ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh,
@@ -625,7 +622,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
/* Initialize ocfs2_refcount_block. */
rb = (struct ocfs2_refcount_block *)new_bh->b_data;
memset(rb, 0, inode->i_sb->s_blocksize);
- strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
+ strscpy(rb->rf_signature, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
@@ -635,7 +632,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
rb->rf_records.rl_count =
cpu_to_le16(ocfs2_refcount_recs_per_rb(osb->sb));
spin_lock(&osb->osb_lock);
- rb->rf_generation = osb->s_next_generation++;
+ rb->rf_generation = cpu_to_le32(osb->s_next_generation++);
spin_unlock(&osb->osb_lock);
ocfs2_journal_dirty(handle, new_bh);
@@ -705,7 +702,7 @@ static int ocfs2_set_refcount_tree(struct inode *inode,
struct ocfs2_refcount_block *rb;
struct ocfs2_refcount_tree *ref_tree;
- BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
+ BUG_ON(ocfs2_is_refcount_inode(inode));
ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
&ref_tree, &ref_root_bh);
@@ -772,7 +769,7 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc);
u16 bit = 0;
- if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL))
+ if (!ocfs2_is_refcount_inode(inode))
return 0;
BUG_ON(!ref_blkno);
@@ -804,7 +801,7 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
mlog_errno(ret);
goto out;
}
- mutex_lock(&alloc_inode->i_mutex);
+ inode_lock(alloc_inode);
ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1);
if (ret) {
@@ -864,7 +861,7 @@ out_unlock:
}
out_mutex:
if (alloc_inode) {
- mutex_unlock(&alloc_inode->i_mutex);
+ inode_unlock(alloc_inode);
iput(alloc_inode);
}
out:
@@ -981,7 +978,7 @@ static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci,
return 0;
}
- if (!eb || (eb && !eb->h_next_leaf_blk)) {
+ if (!eb || !eb->h_next_leaf_blk) {
/*
* We are the last extent rec, so any high cpos should
* be stored in this leaf refcount block.
@@ -1066,7 +1063,7 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
struct buffer_head **ret_bh)
{
int ret = 0, i, found;
- u32 low_cpos, uninitialized_var(cpos_end);
+ u32 low_cpos, cpos_end;
struct ocfs2_extent_list *el;
struct ocfs2_extent_rec *rec = NULL;
struct ocfs2_extent_block *eb = NULL;
@@ -1097,12 +1094,10 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
el = &eb->h_list;
if (el->l_tree_depth) {
- ocfs2_error(sb,
- "refcount tree %llu has non zero tree "
- "depth in leaf btree tree block %llu\n",
- (unsigned long long)ocfs2_metadata_cache_owner(ci),
- (unsigned long long)eb_bh->b_blocknr);
- ret = -EROFS;
+ ret = ocfs2_error(sb,
+ "refcount tree %llu has non zero tree depth in leaf btree tree block %llu\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)eb_bh->b_blocknr);
goto out;
}
}
@@ -1311,7 +1306,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
new_bh = sb_getblk(sb, blkno);
if (new_bh == NULL) {
- ret = -EIO;
+ ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
@@ -1399,16 +1394,6 @@ static int cmp_refcount_rec_by_cpos(const void *a, const void *b)
return 0;
}
-static void swap_refcount_rec(void *a, void *b, int size)
-{
- struct ocfs2_refcount_rec *l = a, *r = b, tmp;
-
- tmp = *(struct ocfs2_refcount_rec *)l;
- *(struct ocfs2_refcount_rec *)l =
- *(struct ocfs2_refcount_rec *)r;
- *(struct ocfs2_refcount_rec *)r = tmp;
-}
-
/*
* The refcount cpos are ordered by their 64bit cpos,
* But we will use the low 32 bit to be the e_cpos in the b-tree.
@@ -1484,7 +1469,7 @@ static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh,
*/
sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
sizeof(struct ocfs2_refcount_rec),
- cmp_refcount_rec_by_low_cpos, swap_refcount_rec);
+ cmp_refcount_rec_by_low_cpos, NULL);
ret = ocfs2_find_refcount_split_pos(rl, &cpos, &split_index);
if (ret) {
@@ -1509,11 +1494,11 @@ static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh,
sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
sizeof(struct ocfs2_refcount_rec),
- cmp_refcount_rec_by_cpos, swap_refcount_rec);
+ cmp_refcount_rec_by_cpos, NULL);
sort(&new_rl->rl_recs, le16_to_cpu(new_rl->rl_used),
sizeof(struct ocfs2_refcount_rec),
- cmp_refcount_rec_by_cpos, swap_refcount_rec);
+ cmp_refcount_rec_by_cpos, NULL);
*split_cpos = cpos;
return 0;
@@ -1562,7 +1547,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
new_bh = sb_getblk(sb, blkno);
if (new_bh == NULL) {
- ret = -EIO;
+ ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
@@ -1578,7 +1563,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
/* Initialize ocfs2_refcount_block. */
new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
memset(new_rb, 0, sb->s_blocksize);
- strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
+ strscpy(new_rb->rf_signature, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
@@ -2301,11 +2286,10 @@ int ocfs2_decrease_refcount(struct inode *inode,
{
int ret;
u64 ref_blkno;
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct buffer_head *ref_root_bh = NULL;
struct ocfs2_refcount_tree *tree;
- BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+ BUG_ON(!ocfs2_is_refcount_inode(inode));
ret = ocfs2_get_refcount_block(inode, &ref_blkno);
if (ret) {
@@ -2357,10 +2341,8 @@ static int ocfs2_mark_extent_refcounted(struct inode *inode,
cpos, len, phys);
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
- ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
- "tree, but the feature bit is not set in the "
- "super block.", inode->i_ino);
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
+ inode->i_ino);
goto out;
}
@@ -2424,8 +2406,6 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
get_bh(prev_bh);
}
- rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
-
trace_ocfs2_calc_refcount_meta_credits_iterate(
recs_add, (unsigned long long)cpos, clusters,
(unsigned long long)le64_to_cpu(rec.r_cpos),
@@ -2441,7 +2421,7 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
*
* If we will insert a new one, this is easy and only happens
* during adding refcounted flag to the extent, so we don't
- * have a chance of spliting. We just need one record.
+ * have a chance of splitting. We just need one record.
*
* If the refcount rec already exists, that would be a little
* complicated. we may have to:
@@ -2503,8 +2483,7 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
*meta_add += ocfs2_extend_meta_needed(et.et_root_el);
*credits += ocfs2_calc_extend_credits(sb,
- et.et_root_el,
- ref_blocks);
+ et.et_root_el);
} else {
*credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
*meta_add += 1;
@@ -2540,20 +2519,17 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
int *ref_blocks)
{
int ret;
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct buffer_head *ref_root_bh = NULL;
struct ocfs2_refcount_tree *tree;
u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno);
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
- ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
- "tree, but the feature bit is not set in the "
- "super block.", inode->i_ino);
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
+ inode->i_ino);
goto out;
}
- BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+ BUG_ON(!ocfs2_is_refcount_inode(inode));
ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb),
refcount_loc, &tree);
@@ -2633,13 +2609,13 @@ static inline unsigned int ocfs2_cow_align_length(struct super_block *sb,
}
/*
- * Calculate out the start and number of virtual clusters we need to to CoW.
+ * Calculate out the start and number of virtual clusters we need to CoW.
*
- * cpos is vitual start cluster position we want to do CoW in a
+ * cpos is virtual start cluster position we want to do CoW in a
* file and write_len is the cluster length.
* max_cpos is the place where we want to stop CoW intentionally.
*
- * Normal we will start CoW from the beginning of extent record cotaining cpos.
+ * Normal we will start CoW from the beginning of extent record containing cpos.
* We try to break up extents on boundaries of MAX_CONTIG_BYTES so that we
* get good I/O from the resulting extent tree.
*/
@@ -2673,11 +2649,10 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
el = &eb->h_list;
if (el->l_tree_depth) {
- ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "leaf block %llu\n", inode->i_ino,
- (unsigned long long)eb_bh->b_blocknr);
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb,
+ "Inode %lu has non zero tree depth in leaf block %llu\n",
+ inode->i_ino,
+ (unsigned long long)eb_bh->b_blocknr);
goto out;
}
}
@@ -2863,7 +2838,7 @@ static int ocfs2_lock_refcount_allocators(struct super_block *sb,
int *credits)
{
int ret = 0, meta_add = 0;
- int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et);
+ int num_free_extents = ocfs2_num_free_extents(et);
if (num_free_extents < 0) {
ret = num_free_extents;
@@ -2875,8 +2850,7 @@ static int ocfs2_lock_refcount_allocators(struct super_block *sb,
meta_add =
ocfs2_extend_meta_needed(et->et_root_el);
- *credits += ocfs2_calc_extend_credits(sb, et->et_root_el,
- num_clusters + 2);
+ *credits += ocfs2_calc_extend_credits(sb, et->et_root_el);
ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh,
p_cluster, num_clusters,
@@ -2922,27 +2896,21 @@ static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh)
}
int ocfs2_duplicate_clusters_by_page(handle_t *handle,
- struct file *file,
+ struct inode *inode,
u32 cpos, u32 old_cluster,
u32 new_cluster, u32 new_len)
{
int ret = 0, partial;
- struct inode *inode = file_inode(file);
- struct ocfs2_caching_info *ci = INODE_CACHE(inode);
- struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+ struct super_block *sb = inode->i_sb;
u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
- struct page *page;
pgoff_t page_index;
- unsigned int from, to, readahead_pages;
+ unsigned int from, to;
loff_t offset, end, map_end;
struct address_space *mapping = inode->i_mapping;
trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
new_cluster, new_len);
- readahead_pages =
- (ocfs2_cow_contig_clusters(sb) <<
- OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT;
offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
/*
@@ -2953,44 +2921,53 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
end = i_size_read(inode);
while (offset < end) {
- page_index = offset >> PAGE_CACHE_SHIFT;
- map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT;
+ struct folio *folio;
+ page_index = offset >> PAGE_SHIFT;
+ map_end = ((loff_t)page_index + 1) << PAGE_SHIFT;
if (map_end > end)
map_end = end;
/* from, to is the offset within the page. */
- from = offset & (PAGE_CACHE_SIZE - 1);
- to = PAGE_CACHE_SIZE;
- if (map_end & (PAGE_CACHE_SIZE - 1))
- to = map_end & (PAGE_CACHE_SIZE - 1);
-
- page = find_or_create_page(mapping, page_index, GFP_NOFS);
+ from = offset & (PAGE_SIZE - 1);
+ to = PAGE_SIZE;
+ if (map_end & (PAGE_SIZE - 1))
+ to = map_end & (PAGE_SIZE - 1);
+
+retry:
+ folio = __filemap_get_folio(mapping, page_index,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_NOFS);
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
+ mlog_errno(ret);
+ break;
+ }
/*
- * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page
- * can't be dirtied before we CoW it out.
+ * In case PAGE_SIZE <= CLUSTER_SIZE, we do not expect a dirty
+ * page, so write it back.
*/
- if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
- BUG_ON(PageDirty(page));
-
- if (PageReadahead(page)) {
- page_cache_async_readahead(mapping,
- &file->f_ra, file,
- page, page_index,
- readahead_pages);
+ if (PAGE_SIZE <= OCFS2_SB(sb)->s_clustersize) {
+ if (folio_test_dirty(folio)) {
+ folio_unlock(folio);
+ folio_put(folio);
+
+ ret = filemap_write_and_wait_range(mapping,
+ offset, map_end - 1);
+ goto retry;
+ }
}
- if (!PageUptodate(page)) {
- ret = block_read_full_page(page, ocfs2_get_block);
+ if (!folio_test_uptodate(folio)) {
+ ret = block_read_full_folio(folio, ocfs2_get_block);
if (ret) {
mlog_errno(ret);
goto unlock;
}
- lock_page(page);
+ folio_lock(folio);
}
- if (page_has_buffers(page)) {
- ret = walk_page_buffers(handle, page_buffers(page),
+ if (folio_buffers(folio)) {
+ ret = walk_page_buffers(handle, folio_buffers(folio),
from, to, &partial,
ocfs2_clear_cow_buffer);
if (ret) {
@@ -2999,13 +2976,12 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
}
}
- ocfs2_map_and_dirty_page(inode, handle, from, to,
- page, 0, &new_block);
- mark_page_accessed(page);
+ ocfs2_map_and_dirty_folio(inode, handle, from, to,
+ folio, 0, &new_block);
+ folio_mark_accessed(folio);
unlock:
- unlock_page(page);
- page_cache_release(page);
- page = NULL;
+ folio_unlock(folio);
+ folio_put(folio);
offset = map_end;
if (ret)
break;
@@ -3015,12 +2991,11 @@ unlock:
}
int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
- struct file *file,
+ struct inode *inode,
u32 cpos, u32 old_cluster,
u32 new_cluster, u32 new_len)
{
int ret = 0;
- struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
struct ocfs2_caching_info *ci = INODE_CACHE(inode);
int i, blocks = ocfs2_clusters_to_blocks(sb, new_len);
@@ -3036,7 +3011,7 @@ int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
for (i = 0; i < blocks; i++, old_block++, new_block++) {
new_bh = sb_getblk(osb->sb, new_block);
if (new_bh == NULL) {
- ret = -EIO;
+ ret = -ENOMEM;
mlog_errno(ret);
break;
}
@@ -3111,12 +3086,10 @@ static int ocfs2_clear_ext_refcount(handle_t *handle,
el = path_leaf_el(path);
index = ocfs2_search_extent_list(el, cpos);
- if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
- ocfs2_error(sb,
- "Inode %llu has an extent at cpos %u which can no "
- "longer be found.\n",
- (unsigned long long)ino, cpos);
- ret = -EROFS;
+ if (index == -1) {
+ ret = ocfs2_error(sb,
+ "Inode %llu has an extent at cpos %u which can no longer be found\n",
+ (unsigned long long)ino, cpos);
goto out;
}
@@ -3145,7 +3118,7 @@ static int ocfs2_replace_clusters(handle_t *handle,
/*If the old clusters is unwritten, no need to duplicate. */
if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
- ret = context->cow_duplicate_clusters(handle, context->file,
+ ret = context->cow_duplicate_clusters(handle, context->inode,
cpos, old, new, len);
if (ret) {
mlog_errno(ret);
@@ -3166,48 +3139,18 @@ int ocfs2_cow_sync_writeback(struct super_block *sb,
struct inode *inode,
u32 cpos, u32 num_clusters)
{
- int ret = 0;
- loff_t offset, end, map_end;
- pgoff_t page_index;
- struct page *page;
+ int ret;
+ loff_t start, end;
if (ocfs2_should_order_data(inode))
return 0;
- offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
- end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits);
+ start = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
+ end = start + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits) - 1;
- ret = filemap_fdatawrite_range(inode->i_mapping,
- offset, end - 1);
- if (ret < 0) {
+ ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (ret < 0)
mlog_errno(ret);
- return ret;
- }
-
- while (offset < end) {
- page_index = offset >> PAGE_CACHE_SHIFT;
- map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT;
- if (map_end > end)
- map_end = end;
-
- page = find_or_create_page(inode->i_mapping,
- page_index, GFP_NOFS);
- BUG_ON(!page);
-
- wait_on_page_writeback(page);
- if (PageError(page)) {
- ret = -EIO;
- mlog_errno(ret);
- } else
- mark_page_accessed(page);
-
- unlock_page(page);
- page_cache_release(page);
- page = NULL;
- offset = map_end;
- if (ret)
- break;
- }
return ret;
}
@@ -3381,11 +3324,9 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
unsigned int ext_flags;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
- ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
- "tree, but the feature bit is not set in the "
- "super block.", inode->i_ino);
- return -EROFS;
+ if (!ocfs2_refcount_tree(osb)) {
+ return ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
+ inode->i_ino);
}
ocfs2_init_dealloc_ctxt(&context->dealloc);
@@ -3423,48 +3364,24 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
return ret;
}
-static void ocfs2_readahead_for_cow(struct inode *inode,
- struct file *file,
- u32 start, u32 len)
-{
- struct address_space *mapping;
- pgoff_t index;
- unsigned long num_pages;
- int cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
-
- if (!file)
- return;
-
- mapping = file->f_mapping;
- num_pages = (len << cs_bits) >> PAGE_CACHE_SHIFT;
- if (!num_pages)
- num_pages = 1;
-
- index = ((loff_t)start << cs_bits) >> PAGE_CACHE_SHIFT;
- page_cache_sync_readahead(mapping, &file->f_ra, file,
- index, num_pages);
-}
-
/*
* Starting at cpos, try to CoW write_len clusters. Don't CoW
* past max_cpos. This will stop when it runs into a hole or an
* unrefcounted extent.
*/
static int ocfs2_refcount_cow_hunk(struct inode *inode,
- struct file *file,
struct buffer_head *di_bh,
u32 cpos, u32 write_len, u32 max_cpos)
{
int ret;
u32 cow_start = 0, cow_len = 0;
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
struct buffer_head *ref_root_bh = NULL;
struct ocfs2_refcount_tree *ref_tree;
struct ocfs2_cow_context *context = NULL;
- BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+ BUG_ON(!ocfs2_is_refcount_inode(inode));
ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list,
cpos, write_len, max_cpos,
@@ -3480,8 +3397,6 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
BUG_ON(cow_len == 0);
- ocfs2_readahead_for_cow(inode, file, cow_start, cow_len);
-
context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
if (!context) {
ret = -ENOMEM;
@@ -3503,7 +3418,6 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
context->ref_root_bh = ref_root_bh;
context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
context->get_clusters = ocfs2_di_get_clusters;
- context->file = file;
ocfs2_init_dinode_extent_tree(&context->data_et,
INODE_CACHE(inode), di_bh);
@@ -3532,7 +3446,6 @@ out:
* clusters between cpos and cpos+write_len are safe to modify.
*/
int ocfs2_refcount_cow(struct inode *inode,
- struct file *file,
struct buffer_head *di_bh,
u32 cpos, u32 write_len, u32 max_cpos)
{
@@ -3552,7 +3465,7 @@ int ocfs2_refcount_cow(struct inode *inode,
num_clusters = write_len;
if (ext_flags & OCFS2_EXT_REFCOUNTED) {
- ret = ocfs2_refcount_cow_hunk(inode, file, di_bh, cpos,
+ ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos,
num_clusters, max_cpos);
if (ret) {
mlog_errno(ret);
@@ -3657,8 +3570,7 @@ int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
ocfs2_init_refcount_extent_tree(&et, ref_ci, ref_root_bh);
*credits += ocfs2_calc_extend_credits(inode->i_sb,
- et.et_root_el,
- ref_blocks);
+ et.et_root_el);
}
out:
@@ -3679,11 +3591,10 @@ int ocfs2_refcount_cow_xattr(struct inode *inode,
{
int ret;
struct ocfs2_xattr_value_root *xv = vb->vb_xv;
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_cow_context *context = NULL;
u32 cow_start, cow_len;
- BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+ BUG_ON(!ocfs2_is_refcount_inode(inode));
ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list,
cpos, write_len, UINT_MAX,
@@ -3746,6 +3657,9 @@ int ocfs2_add_refcount_flag(struct inode *inode,
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_alloc_context *meta_ac = NULL;
+ /* We need to be able to handle at least an extent tree split. */
+ ref_blocks = ocfs2_extend_meta_needed(data_et->et_root_el);
+
ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
ref_ci, ref_root_bh,
p_cluster, num_clusters,
@@ -3758,7 +3672,7 @@ int ocfs2_add_refcount_flag(struct inode *inode,
trace_ocfs2_add_refcount_flag(ref_blocks, credits);
if (ref_blocks) {
- ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
+ ret = ocfs2_reserve_new_metadata_blocks(osb,
ref_blocks, &meta_ac);
if (ret) {
mlog_errno(ret);
@@ -3828,9 +3742,9 @@ static int ocfs2_change_ctime(struct inode *inode,
goto out_commit;
}
- inode->i_ctime = CURRENT_TIME;
- di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
- di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ inode_set_ctime_current(inode);
+ di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+ di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
ocfs2_journal_dirty(handle, di_bh);
@@ -3857,7 +3771,7 @@ static int ocfs2_attach_refcount_tree(struct inode *inode,
ocfs2_init_dealloc_ctxt(&dealloc);
- if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) {
+ if (!ocfs2_is_refcount_inode(inode)) {
ret = ocfs2_create_refcount_tree(inode, di_bh);
if (ret) {
mlog_errno(ret);
@@ -3886,7 +3800,10 @@ static int ocfs2_attach_refcount_tree(struct inode *inode,
while (cpos < clusters) {
ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
&num_clusters, &ext_flags);
-
+ if (ret) {
+ mlog_errno(ret);
+ goto unlock;
+ }
if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) {
ret = ocfs2_add_refcount_flag(inode, &di_et,
&ref_tree->rf_ci,
@@ -3981,6 +3898,13 @@ static int ocfs2_add_refcounted_extent(struct inode *inode,
ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
p_cluster, num_clusters,
meta_ac, dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = dquot_alloc_space_nodirty(inode,
+ ocfs2_clusters_to_bytes(osb->sb, num_clusters));
if (ret)
mlog_errno(ret);
@@ -4057,7 +3981,10 @@ static int ocfs2_duplicate_extent_list(struct inode *s_inode,
while (cpos < clusters) {
ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster,
&num_clusters, &ext_flags);
-
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
if (p_cluster) {
ret = ocfs2_add_refcounted_extent(t_inode, &et,
ref_ci, ref_root_bh,
@@ -4138,12 +4065,12 @@ static int ocfs2_complete_reflink(struct inode *s_inode,
* we want mtime to appear identical to the source and
* update ctime.
*/
- t_inode->i_ctime = CURRENT_TIME;
+ inode_set_ctime_current(t_inode);
- di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec);
- di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec);
+ di->i_ctime = cpu_to_le64(inode_get_ctime_sec(t_inode));
+ di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(t_inode));
- t_inode->i_mtime = s_inode->i_mtime;
+ inode_set_mtime_to_ts(t_inode, inode_get_mtime(s_inode));
di->i_mtime = s_di->i_mtime;
di->i_mtime_nsec = s_di->i_mtime_nsec;
}
@@ -4165,7 +4092,6 @@ static int ocfs2_create_reflink_node(struct inode *s_inode,
struct buffer_head *ref_root_bh = NULL;
struct ocfs2_cached_dealloc_ctxt dealloc;
struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
- struct ocfs2_refcount_block *rb;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data;
struct ocfs2_refcount_tree *ref_tree;
@@ -4192,7 +4118,6 @@ static int ocfs2_create_reflink_node(struct inode *s_inode,
mlog_errno(ret);
goto out;
}
- rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh,
&ref_tree->rf_ci, ref_root_bh,
@@ -4220,10 +4145,11 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
bool preserve)
{
int ret;
- struct inode *inode = old_dentry->d_inode;
+ struct inode *inode = d_inode(old_dentry);
struct buffer_head *new_bh = NULL;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
- if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
+ if (oi->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
ret = -EINVAL;
mlog_errno(ret);
goto out;
@@ -4241,7 +4167,7 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
goto out;
}
- mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD);
+ inode_lock_nested(new_inode, I_MUTEX_CHILD);
ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1,
OI_LS_REFLINK_TARGET);
if (ret) {
@@ -4249,6 +4175,26 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
goto out_unlock;
}
+ if ((oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) &&
+ (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
+ /*
+ * Adjust extent record count to reserve space for extended attribute.
+ * Inline data count had been adjusted in ocfs2_duplicate_inline_data().
+ */
+ struct ocfs2_inode_info *new_oi = OCFS2_I(new_inode);
+
+ if (!(new_oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) &&
+ !(ocfs2_inode_is_fast_symlink(new_inode))) {
+ struct ocfs2_dinode *new_di = (struct ocfs2_dinode *)new_bh->b_data;
+ struct ocfs2_dinode *old_di = (struct ocfs2_dinode *)old_bh->b_data;
+ struct ocfs2_extent_list *el = &new_di->id2.i_list;
+ int inline_size = le16_to_cpu(old_di->i_xattr_inline_size);
+
+ le16_add_cpu(&el->l_count, -(inline_size /
+ sizeof(struct ocfs2_extent_rec)));
+ }
+ }
+
ret = ocfs2_create_reflink_node(inode, old_bh,
new_inode, new_bh, preserve);
if (ret) {
@@ -4256,7 +4202,7 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
goto inode_unlock;
}
- if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
+ if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
ret = ocfs2_reflink_xattrs(inode, old_bh,
new_inode, new_bh,
preserve);
@@ -4275,7 +4221,7 @@ inode_unlock:
ocfs2_inode_unlock(new_inode, 1);
brelse(new_bh);
out_unlock:
- mutex_unlock(&new_inode->i_mutex);
+ inode_unlock(new_inode);
out:
if (!ret) {
ret = filemap_fdatawait(inode->i_mapping);
@@ -4288,14 +4234,16 @@ out:
static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
struct dentry *new_dentry, bool preserve)
{
- int error;
- struct inode *inode = old_dentry->d_inode;
+ int error, had_lock;
+ struct inode *inode = d_inode(old_dentry);
struct buffer_head *old_bh = NULL;
struct inode *new_orphan_inode = NULL;
+ struct ocfs2_lock_holder oh;
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
return -EOPNOTSUPP;
+
error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
&new_orphan_inode);
if (error) {
@@ -4303,9 +4251,16 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
goto out;
}
+ error = ocfs2_rw_lock(inode, 1);
+ if (error) {
+ mlog_errno(error);
+ goto out;
+ }
+
error = ocfs2_inode_lock(inode, &old_bh, 1);
if (error) {
mlog_errno(error);
+ ocfs2_rw_unlock(inode, 1);
goto out;
}
@@ -4317,6 +4272,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
up_write(&OCFS2_I(inode)->ip_xattr_sem);
ocfs2_inode_unlock(inode, 1);
+ ocfs2_rw_unlock(inode, 1);
brelse(old_bh);
if (error) {
@@ -4324,6 +4280,14 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
goto out;
}
+ had_lock = ocfs2_inode_lock_tracker(new_orphan_inode, NULL, 1,
+ &oh);
+ if (had_lock < 0) {
+ error = had_lock;
+ mlog_errno(error);
+ goto out;
+ }
+
/* If the security isn't preserved, we need to re-initialize them. */
if (!preserve) {
error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
@@ -4331,14 +4295,15 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
if (error)
mlog_errno(error);
}
-out:
if (!error) {
error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
new_dentry);
if (error)
mlog_errno(error);
}
+ ocfs2_inode_unlock_tracker(new_orphan_inode, 1, &oh, had_lock);
+out:
if (new_orphan_inode) {
/*
* We need to open_unlock the inode no matter whether we
@@ -4361,11 +4326,11 @@ out:
/* copied from may_create in VFS. */
static inline int ocfs2_may_create(struct inode *dir, struct dentry *child)
{
- if (child->d_inode)
+ if (d_really_is_positive(child))
return -EEXIST;
if (IS_DEADDIR(dir))
return -ENOENT;
- return inode_permission(dir, MAY_WRITE | MAY_EXEC);
+ return inode_permission(&nop_mnt_idmap, dir, MAY_WRITE | MAY_EXEC);
}
/**
@@ -4379,7 +4344,7 @@ static inline int ocfs2_may_create(struct inode *dir, struct dentry *child)
static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
struct dentry *new_dentry, bool preserve)
{
- struct inode *inode = old_dentry->d_inode;
+ struct inode *inode = d_inode(old_dentry);
int error;
if (!inode)
@@ -4419,15 +4384,16 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
* file.
*/
if (!preserve) {
- error = inode_permission(inode, MAY_READ);
+ error = inode_permission(&nop_mnt_idmap, inode, MAY_READ);
if (error)
return error;
}
- mutex_lock(&inode->i_mutex);
- dquot_initialize(dir);
- error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve);
- mutex_unlock(&inode->i_mutex);
+ inode_lock(inode);
+ error = dquot_initialize(dir);
+ if (!error)
+ error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve);
+ inode_unlock(inode);
if (!error)
fsnotify_create(dir, new_dentry);
return error;
@@ -4453,7 +4419,7 @@ int ocfs2_reflink_ioctl(struct inode *inode,
return error;
}
- new_dentry = user_path_create(AT_FDCWD, newname, &new_path, 0);
+ new_dentry = start_creating_user_path(AT_FDCWD, newname, &new_path, 0);
error = PTR_ERR(new_dentry);
if (IS_ERR(new_dentry)) {
mlog_errno(error);
@@ -4467,12 +4433,375 @@ int ocfs2_reflink_ioctl(struct inode *inode,
}
error = ocfs2_vfs_reflink(old_path.dentry,
- new_path.dentry->d_inode,
+ d_inode(new_path.dentry),
new_dentry, preserve);
out_dput:
- done_path_create(&new_path, new_dentry);
+ end_creating_path(&new_path, new_dentry);
out:
path_put(&old_path);
return error;
}
+
+/* Update destination inode size, if necessary. */
+int ocfs2_reflink_update_dest(struct inode *dest,
+ struct buffer_head *d_bh,
+ loff_t newlen)
+{
+ handle_t *handle;
+ int ret;
+
+ dest->i_blocks = ocfs2_inode_sector_count(dest);
+
+ if (newlen <= i_size_read(dest))
+ return 0;
+
+ handle = ocfs2_start_trans(OCFS2_SB(dest->i_sb),
+ OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ return ret;
+ }
+
+ /* Extend i_size if needed. */
+ spin_lock(&OCFS2_I(dest)->ip_lock);
+ if (newlen > i_size_read(dest))
+ i_size_write(dest, newlen);
+ spin_unlock(&OCFS2_I(dest)->ip_lock);
+ inode_set_mtime_to_ts(dest, inode_set_ctime_current(dest));
+
+ ret = ocfs2_mark_inode_dirty(handle, dest, d_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+out_commit:
+ ocfs2_commit_trans(OCFS2_SB(dest->i_sb), handle);
+ return ret;
+}
+
+/* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */
+static loff_t ocfs2_reflink_remap_extent(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ loff_t pos_in,
+ struct inode *t_inode,
+ struct buffer_head *t_bh,
+ loff_t pos_out,
+ loff_t len,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ struct ocfs2_extent_tree s_et;
+ struct ocfs2_extent_tree t_et;
+ struct ocfs2_dinode *dis;
+ struct buffer_head *ref_root_bh = NULL;
+ struct ocfs2_refcount_tree *ref_tree;
+ struct ocfs2_super *osb;
+ loff_t remapped_bytes = 0;
+ loff_t pstart, plen;
+ u32 p_cluster, num_clusters, slast, spos, tpos, remapped_clus = 0;
+ unsigned int ext_flags;
+ int ret = 0;
+
+ osb = OCFS2_SB(s_inode->i_sb);
+ dis = (struct ocfs2_dinode *)s_bh->b_data;
+ ocfs2_init_dinode_extent_tree(&s_et, INODE_CACHE(s_inode), s_bh);
+ ocfs2_init_dinode_extent_tree(&t_et, INODE_CACHE(t_inode), t_bh);
+
+ spos = ocfs2_bytes_to_clusters(s_inode->i_sb, pos_in);
+ tpos = ocfs2_bytes_to_clusters(t_inode->i_sb, pos_out);
+ slast = ocfs2_clusters_for_bytes(s_inode->i_sb, pos_in + len);
+
+ while (spos < slast) {
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ goto out;
+ }
+
+ /* Look up the extent. */
+ ret = ocfs2_get_clusters(s_inode, spos, &p_cluster,
+ &num_clusters, &ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ num_clusters = min_t(u32, num_clusters, slast - spos);
+
+ /* Punch out the dest range. */
+ pstart = ocfs2_clusters_to_bytes(t_inode->i_sb, tpos);
+ plen = ocfs2_clusters_to_bytes(t_inode->i_sb, num_clusters);
+ ret = ocfs2_remove_inode_range(t_inode, t_bh, pstart, plen);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (p_cluster == 0)
+ goto next_loop;
+
+ /* Lock the refcount btree... */
+ ret = ocfs2_lock_refcount_tree(osb,
+ le64_to_cpu(dis->i_refcount_loc),
+ 1, &ref_tree, &ref_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* Mark s_inode's extent as refcounted. */
+ if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) {
+ ret = ocfs2_add_refcount_flag(s_inode, &s_et,
+ &ref_tree->rf_ci,
+ ref_root_bh, spos,
+ p_cluster, num_clusters,
+ dealloc, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock_refcount;
+ }
+ }
+
+ /* Map in the new extent. */
+ ext_flags |= OCFS2_EXT_REFCOUNTED;
+ ret = ocfs2_add_refcounted_extent(t_inode, &t_et,
+ &ref_tree->rf_ci,
+ ref_root_bh,
+ tpos, p_cluster,
+ num_clusters,
+ ext_flags,
+ dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock_refcount;
+ }
+
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+ brelse(ref_root_bh);
+next_loop:
+ spos += num_clusters;
+ tpos += num_clusters;
+ remapped_clus += num_clusters;
+ }
+
+ goto out;
+out_unlock_refcount:
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+ brelse(ref_root_bh);
+out:
+ remapped_bytes = ocfs2_clusters_to_bytes(t_inode->i_sb, remapped_clus);
+ remapped_bytes = min_t(loff_t, len, remapped_bytes);
+
+ return remapped_bytes > 0 ? remapped_bytes : ret;
+}
+
+/* Set up refcount tree and remap s_inode to t_inode. */
+loff_t ocfs2_reflink_remap_blocks(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ loff_t pos_in,
+ struct inode *t_inode,
+ struct buffer_head *t_bh,
+ loff_t pos_out,
+ loff_t len)
+{
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+ struct ocfs2_super *osb;
+ struct ocfs2_dinode *dis;
+ struct ocfs2_dinode *dit;
+ loff_t ret;
+
+ osb = OCFS2_SB(s_inode->i_sb);
+ dis = (struct ocfs2_dinode *)s_bh->b_data;
+ dit = (struct ocfs2_dinode *)t_bh->b_data;
+ ocfs2_init_dealloc_ctxt(&dealloc);
+
+ /*
+ * If we're reflinking the entire file and the source is inline
+ * data, just copy the contents.
+ */
+ if (pos_in == pos_out && pos_in == 0 && len == i_size_read(s_inode) &&
+ i_size_read(t_inode) <= len &&
+ (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) {
+ ret = ocfs2_duplicate_inline_data(s_inode, s_bh, t_inode, t_bh);
+ if (ret)
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * If both inodes belong to two different refcount groups then
+ * forget it because we don't know how (or want) to go merging
+ * refcount trees.
+ */
+ ret = -EOPNOTSUPP;
+ if (ocfs2_is_refcount_inode(s_inode) &&
+ ocfs2_is_refcount_inode(t_inode) &&
+ le64_to_cpu(dis->i_refcount_loc) !=
+ le64_to_cpu(dit->i_refcount_loc))
+ goto out;
+
+ /* Neither inode has a refcount tree. Add one to s_inode. */
+ if (!ocfs2_is_refcount_inode(s_inode) &&
+ !ocfs2_is_refcount_inode(t_inode)) {
+ ret = ocfs2_create_refcount_tree(s_inode, s_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ /* Ensure that both inodes end up with the same refcount tree. */
+ if (!ocfs2_is_refcount_inode(s_inode)) {
+ ret = ocfs2_set_refcount_tree(s_inode, s_bh,
+ le64_to_cpu(dit->i_refcount_loc));
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+ if (!ocfs2_is_refcount_inode(t_inode)) {
+ ret = ocfs2_set_refcount_tree(t_inode, t_bh,
+ le64_to_cpu(dis->i_refcount_loc));
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ /* Turn off inline data in the dest file. */
+ if (OCFS2_I(t_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ ret = ocfs2_convert_inline_data_to_extents(t_inode, t_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ /* Actually remap extents now. */
+ ret = ocfs2_reflink_remap_extent(s_inode, s_bh, pos_in, t_inode, t_bh,
+ pos_out, len, &dealloc);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+out:
+ if (ocfs2_dealloc_has_cluster(&dealloc)) {
+ ocfs2_schedule_truncate_log_flush(osb, 1);
+ ocfs2_run_deallocs(osb, &dealloc);
+ }
+
+ return ret;
+}
+
+/* Lock an inode and grab a bh pointing to the inode. */
+int ocfs2_reflink_inodes_lock(struct inode *s_inode,
+ struct buffer_head **bh_s,
+ struct inode *t_inode,
+ struct buffer_head **bh_t)
+{
+ struct inode *inode1 = s_inode;
+ struct inode *inode2 = t_inode;
+ struct ocfs2_inode_info *oi1;
+ struct ocfs2_inode_info *oi2;
+ struct buffer_head *bh1 = NULL;
+ struct buffer_head *bh2 = NULL;
+ bool same_inode = (s_inode == t_inode);
+ bool need_swap = (inode1->i_ino > inode2->i_ino);
+ int status;
+
+ /* First grab the VFS and rw locks. */
+ lock_two_nondirectories(s_inode, t_inode);
+ if (need_swap)
+ swap(inode1, inode2);
+
+ status = ocfs2_rw_lock(inode1, 1);
+ if (status) {
+ mlog_errno(status);
+ goto out_i1;
+ }
+ if (!same_inode) {
+ status = ocfs2_rw_lock(inode2, 1);
+ if (status) {
+ mlog_errno(status);
+ goto out_i2;
+ }
+ }
+
+ /* Now go for the cluster locks */
+ oi1 = OCFS2_I(inode1);
+ oi2 = OCFS2_I(inode2);
+
+ trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
+ (unsigned long long)oi2->ip_blkno);
+
+ /* We always want to lock the one with the lower lockid first. */
+ if (oi1->ip_blkno > oi2->ip_blkno)
+ mlog_errno(-ENOLCK);
+
+ /* lock id1 */
+ status = ocfs2_inode_lock_nested(inode1, &bh1, 1,
+ OI_LS_REFLINK_TARGET);
+ if (status < 0) {
+ if (status != -ENOENT)
+ mlog_errno(status);
+ goto out_rw2;
+ }
+
+ /* lock id2 */
+ if (!same_inode) {
+ status = ocfs2_inode_lock_nested(inode2, &bh2, 1,
+ OI_LS_REFLINK_TARGET);
+ if (status < 0) {
+ if (status != -ENOENT)
+ mlog_errno(status);
+ goto out_cl1;
+ }
+ } else {
+ bh2 = bh1;
+ }
+
+ /*
+ * If we swapped inode order above, we have to swap the buffer heads
+ * before passing them back to the caller.
+ */
+ if (need_swap)
+ swap(bh1, bh2);
+ *bh_s = bh1;
+ *bh_t = bh2;
+
+ trace_ocfs2_double_lock_end(
+ (unsigned long long)oi1->ip_blkno,
+ (unsigned long long)oi2->ip_blkno);
+
+ return 0;
+
+out_cl1:
+ ocfs2_inode_unlock(inode1, 1);
+ brelse(bh1);
+out_rw2:
+ ocfs2_rw_unlock(inode2, 1);
+out_i2:
+ ocfs2_rw_unlock(inode1, 1);
+out_i1:
+ unlock_two_nondirectories(s_inode, t_inode);
+ return status;
+}
+
+/* Unlock both inodes and release buffers. */
+void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ struct inode *t_inode,
+ struct buffer_head *t_bh)
+{
+ ocfs2_inode_unlock(s_inode, 1);
+ ocfs2_rw_unlock(s_inode, 1);
+ brelse(s_bh);
+ if (s_inode != t_inode) {
+ ocfs2_inode_unlock(t_inode, 1);
+ ocfs2_rw_unlock(t_inode, 1);
+ brelse(t_bh);
+ }
+ unlock_two_nondirectories(s_inode, t_inode);
+}
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 7754608c83a4..8197a94feec0 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -1,18 +1,8 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
* refcounttree.h
*
* Copyright (C) 2009 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#ifndef OCFS2_REFCOUNTTREE_H
#define OCFS2_REFCOUNTTREE_H
@@ -53,7 +43,7 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
int *credits,
int *ref_blocks);
int ocfs2_refcount_cow(struct inode *inode,
- struct file *filep, struct buffer_head *di_bh,
+ struct buffer_head *di_bh,
u32 cpos, u32 write_len, u32 max_cpos);
typedef int (ocfs2_post_refcount_func)(struct inode *inode,
@@ -85,11 +75,11 @@ int ocfs2_refcount_cow_xattr(struct inode *inode,
u32 cpos, u32 write_len,
struct ocfs2_post_refcount *post);
int ocfs2_duplicate_clusters_by_page(handle_t *handle,
- struct file *file,
+ struct inode *inode,
u32 cpos, u32 old_cluster,
u32 new_cluster, u32 new_len);
int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
- struct file *file,
+ struct inode *inode,
u32 cpos, u32 old_cluster,
u32 new_cluster, u32 new_len);
int ocfs2_cow_sync_writeback(struct super_block *sb,
@@ -115,4 +105,23 @@ int ocfs2_reflink_ioctl(struct inode *inode,
const char __user *oldname,
const char __user *newname,
bool preserve);
+loff_t ocfs2_reflink_remap_blocks(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ loff_t pos_in,
+ struct inode *t_inode,
+ struct buffer_head *t_bh,
+ loff_t pos_out,
+ loff_t len);
+int ocfs2_reflink_inodes_lock(struct inode *s_inode,
+ struct buffer_head **bh1,
+ struct inode *t_inode,
+ struct buffer_head **bh2);
+void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ struct inode *t_inode,
+ struct buffer_head *t_bh);
+int ocfs2_reflink_update_dest(struct inode *dest,
+ struct buffer_head *d_bh,
+ loff_t newlen);
+
#endif /* OCFS2_REFCOUNTTREE_H */
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
index 41ffd36c689c..1fe61974d9f0 100644
--- a/fs/ocfs2/reservations.c
+++ b/fs/ocfs2/reservations.c
@@ -1,6 +1,5 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-only
+/*
* reservations.c
*
* Allocation reservations implementation
@@ -13,15 +12,6 @@
* Universite Pierre et Marie Curie (Paris VI)
*
* The rest is copyright (C) 2010 Novell. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#include <linux/fs.h>
@@ -39,10 +29,7 @@
#define OCFS2_CHECK_RESERVATIONS
#endif
-DEFINE_SPINLOCK(resv_lock);
-
-#define OCFS2_MIN_RESV_WINDOW_BITS 8
-#define OCFS2_MAX_RESV_WINDOW_BITS 1024
+static DEFINE_SPINLOCK(resv_lock);
int ocfs2_dir_resv_allowed(struct ocfs2_super *osb)
{
@@ -211,7 +198,7 @@ void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
resv->r_flags |= flags;
}
-int ocfs2_resmap_init(struct ocfs2_super *osb,
+void ocfs2_resmap_init(struct ocfs2_super *osb,
struct ocfs2_reservation_map *resmap)
{
memset(resmap, 0, sizeof(*resmap));
@@ -220,8 +207,6 @@ int ocfs2_resmap_init(struct ocfs2_super *osb,
resmap->m_reservations = RB_ROOT;
/* m_bitmap_len is initialized to zero by the above memset. */
INIT_LIST_HEAD(&resmap->m_lru);
-
- return 0;
}
static void ocfs2_resv_mark_lru(struct ocfs2_reservation_map *resmap,
@@ -429,7 +414,7 @@ static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap,
start = search_start;
while ((offset = ocfs2_find_next_zero_bit(bitmap, resmap->m_bitmap_len,
- start)) != -1) {
+ start)) < resmap->m_bitmap_len) {
/* Search reached end of the region */
if (offset >= (search_start + search_len))
break;
diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h
index 42c2b804f3fd..4fce17180342 100644
--- a/fs/ocfs2/reservations.h
+++ b/fs/ocfs2/reservations.h
@@ -1,20 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
* reservations.h
*
* Allocation reservations function prototypes and structures.
*
* Copyright (C) 2010 Novell. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#ifndef OCFS2_RESERVATIONS_H
@@ -41,7 +31,7 @@ struct ocfs2_alloc_reservation {
#define OCFS2_RESV_FLAG_INUSE 0x01 /* Set when r_node is part of a btree */
#define OCFS2_RESV_FLAG_TMP 0x02 /* Temporary reservation, will be
- * destroyed immedately after use */
+ * destroyed immediately after use */
#define OCFS2_RESV_FLAG_DIR 0x04 /* Reservation is for an unindexed
* directory btree */
@@ -83,15 +73,10 @@ void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
/**
* ocfs2_resmap_init() - Initialize fields of a reservations bitmap
+ * @osb: struct ocfs2_super to be saved in resmap
* @resmap: struct ocfs2_reservation_map to initialize
- * @obj: unused for now
- * @ops: unused for now
- * @max_bitmap_bytes: Maximum size of the bitmap (typically blocksize)
- *
- * Only possible return value other than '0' is -ENOMEM for failure to
- * allocation mirror bitmap.
*/
-int ocfs2_resmap_init(struct ocfs2_super *osb,
+void ocfs2_resmap_init(struct ocfs2_super *osb,
struct ocfs2_reservation_map *resmap);
/**
@@ -140,7 +125,7 @@ int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
/**
* ocfs2_resmap_claimed_bits() - Tell the reservation code that bits were used.
* @resmap: reservations bitmap
- * @resv: optional reservation to recalulate based on new bitmap
+ * @resv: optional reservation to recalculate based on new bitmap
* @cstart: start of allocation in clusters
* @clen: end of allocation in clusters.
*
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index ec55add7604a..b0733c08ed13 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -1,27 +1,11 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* resize.c
*
* volume resize.
* Inspired by ext3/resize.c.
*
* Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
@@ -53,14 +37,13 @@
*/
static u16 ocfs2_calc_new_backup_super(struct inode *inode,
struct ocfs2_group_desc *gd,
- int new_clusters,
- u32 first_new_cluster,
u16 cl_cpg,
+ u16 old_bg_clusters,
int set)
{
int i;
u16 backups = 0;
- u32 cluster;
+ u32 cluster, lgd_cluster;
u64 blkno, gd_blkno, lgd_blkno = le64_to_cpu(gd->bg_blkno);
for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
@@ -73,6 +56,12 @@ static u16 ocfs2_calc_new_backup_super(struct inode *inode,
else if (gd_blkno > lgd_blkno)
break;
+ /* check if already done backup super */
+ lgd_cluster = ocfs2_blocks_to_clusters(inode->i_sb, lgd_blkno);
+ lgd_cluster += old_bg_clusters;
+ if (lgd_cluster >= cluster)
+ continue;
+
if (set)
ocfs2_set_bit(cluster % cl_cpg,
(unsigned long *)gd->bg_bitmap);
@@ -101,6 +90,9 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
u16 chain, num_bits, backups = 0;
u16 cl_bpc = le16_to_cpu(cl->cl_bpc);
u16 cl_cpg = le16_to_cpu(cl->cl_cpg);
+ u16 old_bg_clusters;
+ u16 contig_bits;
+ __le16 old_bg_contig_free_bits;
trace_ocfs2_update_last_group_and_inode(new_clusters,
first_new_cluster);
@@ -114,6 +106,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
group = (struct ocfs2_group_desc *)group_bh->b_data;
+ old_bg_clusters = le16_to_cpu(group->bg_bits) / cl_bpc;
/* update the group first. */
num_bits = new_clusters * cl_bpc;
le16_add_cpu(&group->bg_bits, num_bits);
@@ -127,12 +120,15 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
OCFS2_FEATURE_COMPAT_BACKUP_SB)) {
backups = ocfs2_calc_new_backup_super(bm_inode,
group,
- new_clusters,
- first_new_cluster,
- cl_cpg, 1);
+ cl_cpg, old_bg_clusters, 1);
le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
}
+ contig_bits = ocfs2_find_max_contig_free_bits(group->bg_bitmap,
+ le16_to_cpu(group->bg_bits), 0);
+ old_bg_contig_free_bits = group->bg_contig_free_bits;
+ group->bg_contig_free_bits = cpu_to_le16(contig_bits);
+
ocfs2_journal_dirty(handle, group_bh);
/* update the inode accordingly. */
@@ -157,7 +153,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
spin_lock(&OCFS2_I(bm_inode)->ip_lock);
OCFS2_I(bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
- le64_add_cpu(&fe->i_size, new_clusters << osb->s_clustersize_bits);
+ le64_add_cpu(&fe->i_size, (u64)new_clusters << osb->s_clustersize_bits);
spin_unlock(&OCFS2_I(bm_inode)->ip_lock);
i_size_write(bm_inode, le64_to_cpu(fe->i_size));
@@ -167,12 +163,11 @@ out_rollback:
if (ret < 0) {
ocfs2_calc_new_backup_super(bm_inode,
group,
- new_clusters,
- first_new_cluster,
- cl_cpg, 0);
+ cl_cpg, old_bg_clusters, 0);
le16_add_cpu(&group->bg_free_bits_count, backups);
le16_add_cpu(&group->bg_bits, -1 * num_bits);
le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits);
+ group->bg_contig_free_bits = old_bg_contig_free_bits;
}
out:
if (ret)
@@ -193,7 +188,7 @@ static int update_backups(struct inode * inode, u32 clusters, char *data)
for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
- if (cluster > clusters)
+ if (cluster >= clusters)
break;
ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup);
@@ -298,7 +293,7 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
goto out;
}
- mutex_lock(&main_bm_inode->i_mutex);
+ inode_lock(main_bm_inode);
ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
if (ret < 0) {
@@ -372,7 +367,7 @@ out_unlock:
ocfs2_inode_unlock(main_bm_inode, 1);
out_mutex:
- mutex_unlock(&main_bm_inode->i_mutex);
+ inode_unlock(main_bm_inode);
iput(main_bm_inode);
out:
@@ -469,6 +464,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
struct ocfs2_chain_list *cl;
struct ocfs2_chain_rec *cr;
u16 cl_bpc;
+ u64 bg_ptr;
if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
return -EROFS;
@@ -482,7 +478,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
goto out;
}
- mutex_lock(&main_bm_inode->i_mutex);
+ inode_lock(main_bm_inode);
ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
if (ret < 0) {
@@ -513,7 +509,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
ret = ocfs2_verify_group_and_input(main_bm_inode, fe, input, group_bh);
if (ret) {
mlog_errno(ret);
- goto out_unlock;
+ goto out_free_group_bh;
}
trace_ocfs2_group_add((unsigned long long)input->group,
@@ -523,7 +519,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
if (IS_ERR(handle)) {
mlog_errno(PTR_ERR(handle));
ret = -EINVAL;
- goto out_unlock;
+ goto out_free_group_bh;
}
cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
@@ -538,12 +534,14 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
}
group = (struct ocfs2_group_desc *)group_bh->b_data;
+ bg_ptr = le64_to_cpu(group->bg_next_group);
group->bg_next_group = cr->c_blkno;
ocfs2_journal_dirty(handle, group_bh);
ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode),
main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
+ group->bg_next_group = cpu_to_le64(bg_ptr);
mlog_errno(ret);
goto out_commit;
}
@@ -566,7 +564,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
spin_lock(&OCFS2_I(main_bm_inode)->ip_lock);
OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
- le64_add_cpu(&fe->i_size, input->clusters << osb->s_clustersize_bits);
+ le64_add_cpu(&fe->i_size, (u64)input->clusters << osb->s_clustersize_bits);
spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock);
i_size_write(main_bm_inode, le64_to_cpu(fe->i_size));
@@ -574,14 +572,19 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
out_commit:
ocfs2_commit_trans(osb, handle);
-out_unlock:
+
+out_free_group_bh:
+ if (ret < 0)
+ ocfs2_remove_from_cache(INODE_CACHE(inode), group_bh);
brelse(group_bh);
+
+out_unlock:
brelse(main_bm_bh);
ocfs2_inode_unlock(main_bm_inode, 1);
out_mutex:
- mutex_unlock(&main_bm_inode->i_mutex);
+ inode_unlock(main_bm_inode);
iput(main_bm_inode);
out:
diff --git a/fs/ocfs2/resize.h b/fs/ocfs2/resize.h
index f38841abf10b..4990637219ef 100644
--- a/fs/ocfs2/resize.h
+++ b/fs/ocfs2/resize.h
@@ -1,26 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* resize.h
*
* Function prototypes
*
* Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_RESIZE_H
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 1424c151cccc..e544c704b583 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -1,26 +1,8 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* slot_map.c
*
- *
- *
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/types.h>
@@ -55,7 +37,7 @@ struct ocfs2_slot_info {
unsigned int si_blocks;
struct buffer_head **si_bh;
unsigned int si_num_slots;
- struct ocfs2_slot *si_slots;
+ struct ocfs2_slot si_slots[] __counted_by(si_num_slots);
};
@@ -306,7 +288,7 @@ int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num,
assert_spin_locked(&osb->osb_lock);
BUG_ON(slot_num < 0);
- BUG_ON(slot_num > osb->max_slots);
+ BUG_ON(slot_num >= osb->max_slots);
if (!si->si_slots[slot_num].sl_valid)
return -ENOENT;
@@ -322,8 +304,7 @@ static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si)
if (si == NULL)
return;
- if (si->si_inode)
- iput(si->si_inode);
+ iput(si->si_inode);
if (si->si_bh) {
for (i = 0; i < si->si_blocks; i++) {
if (si->si_bh[i]) {
@@ -382,7 +363,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
trace_ocfs2_map_slot_buffers(bytes, si->si_blocks);
- si->si_bh = kzalloc(sizeof(struct buffer_head *) * si->si_blocks,
+ si->si_bh = kcalloc(si->si_blocks, sizeof(struct buffer_head *),
GFP_KERNEL);
if (!si->si_bh) {
status = -ENOMEM;
@@ -421,19 +402,15 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
struct inode *inode = NULL;
struct ocfs2_slot_info *si;
- si = kzalloc(sizeof(struct ocfs2_slot_info) +
- (sizeof(struct ocfs2_slot) * osb->max_slots),
- GFP_KERNEL);
+ si = kzalloc(struct_size(si, si_slots, osb->max_slots), GFP_KERNEL);
if (!si) {
status = -ENOMEM;
mlog_errno(status);
- goto bail;
+ return status;
}
si->si_extended = ocfs2_uses_extended_slot_map(osb);
si->si_num_slots = osb->max_slots;
- si->si_slots = (struct ocfs2_slot *)((char *)si +
- sizeof(struct ocfs2_slot_info));
inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
@@ -452,7 +429,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
osb->slot_info = (struct ocfs2_slot_info *)si;
bail:
- if (status < 0 && si)
+ if (status < 0)
__ocfs2_free_slot_info(si);
return status;
@@ -503,8 +480,17 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
trace_ocfs2_find_slot(osb->slot_num);
status = ocfs2_update_disk_slot(osb, si, osb->slot_num);
- if (status < 0)
+ if (status < 0) {
mlog_errno(status);
+ /*
+ * if write block failed, invalidate slot to avoid overwrite
+ * slot during dismount in case another node rightly has mounted
+ */
+ spin_lock(&osb->osb_lock);
+ ocfs2_invalidate_slot(si, osb->slot_num);
+ osb->slot_num = OCFS2_INVALID_SLOT;
+ spin_unlock(&osb->osb_lock);
+ }
bail:
return status;
@@ -527,12 +513,8 @@ void ocfs2_put_slot(struct ocfs2_super *osb)
spin_unlock(&osb->osb_lock);
status = ocfs2_update_disk_slot(osb, si, slot_num);
- if (status < 0) {
+ if (status < 0)
mlog_errno(status);
- goto bail;
- }
-bail:
ocfs2_free_slot_info(osb);
}
-
diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h
index 601c95fd7003..a43644570b53 100644
--- a/fs/ocfs2/slot_map.h
+++ b/fs/ocfs2/slot_map.h
@@ -1,26 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* slotmap.h
*
* description here
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index bf1f8930456f..f58e891aa2da 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -1,20 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-only
+/*
* stack_o2cb.c
*
* Code which interfaces ocfs2 with the o2cb stack.
*
* Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation, version 2.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#include <linux/kernel.h>
@@ -67,31 +57,31 @@ static inline int mode_to_o2dlm(int mode)
return mode;
}
-#define map_flag(_generic, _o2dlm) \
- if (flags & (_generic)) { \
- flags &= ~(_generic); \
- o2dlm_flags |= (_o2dlm); \
- }
static int flags_to_o2dlm(u32 flags)
{
int o2dlm_flags = 0;
- map_flag(DLM_LKF_NOQUEUE, LKM_NOQUEUE);
- map_flag(DLM_LKF_CANCEL, LKM_CANCEL);
- map_flag(DLM_LKF_CONVERT, LKM_CONVERT);
- map_flag(DLM_LKF_VALBLK, LKM_VALBLK);
- map_flag(DLM_LKF_IVVALBLK, LKM_INVVALBLK);
- map_flag(DLM_LKF_ORPHAN, LKM_ORPHAN);
- map_flag(DLM_LKF_FORCEUNLOCK, LKM_FORCE);
- map_flag(DLM_LKF_TIMEOUT, LKM_TIMEOUT);
- map_flag(DLM_LKF_LOCAL, LKM_LOCAL);
-
- /* map_flag() should have cleared every flag passed in */
- BUG_ON(flags != 0);
+ if (flags & DLM_LKF_NOQUEUE)
+ o2dlm_flags |= LKM_NOQUEUE;
+ if (flags & DLM_LKF_CANCEL)
+ o2dlm_flags |= LKM_CANCEL;
+ if (flags & DLM_LKF_CONVERT)
+ o2dlm_flags |= LKM_CONVERT;
+ if (flags & DLM_LKF_VALBLK)
+ o2dlm_flags |= LKM_VALBLK;
+ if (flags & DLM_LKF_IVVALBLK)
+ o2dlm_flags |= LKM_INVVALBLK;
+ if (flags & DLM_LKF_ORPHAN)
+ o2dlm_flags |= LKM_ORPHAN;
+ if (flags & DLM_LKF_FORCEUNLOCK)
+ o2dlm_flags |= LKM_FORCE;
+ if (flags & DLM_LKF_TIMEOUT)
+ o2dlm_flags |= LKM_TIMEOUT;
+ if (flags & DLM_LKF_LOCAL)
+ o2dlm_flags |= LKM_LOCAL;
return o2dlm_flags;
}
-#undef map_flag
/*
* Map an o2dlm status to standard errno values.
@@ -237,7 +227,7 @@ static int o2cb_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
}
/*
- * o2dlm aways has a "valid" LVB. If the dlm loses track of the LVB
+ * o2dlm always has a "valid" LVB. If the dlm loses track of the LVB
* contents, it will zero out the LVB. Thus the caller can always trust
* the contents.
*/
@@ -283,19 +273,19 @@ static int o2cb_cluster_check(void)
*/
#define O2CB_MAP_STABILIZE_COUNT 60
for (i = 0; i < O2CB_MAP_STABILIZE_COUNT; ++i) {
- o2hb_fill_node_map(hbmap, sizeof(hbmap));
+ o2hb_fill_node_map(hbmap, O2NM_MAX_NODES);
if (!test_bit(node_num, hbmap)) {
printk(KERN_ERR "o2cb: %s heartbeat has not been "
"started.\n", (o2hb_global_heartbeat_active() ?
"Global" : "Local"));
return -EINVAL;
}
- o2net_fill_node_map(netmap, sizeof(netmap));
+ o2net_fill_node_map(netmap, O2NM_MAX_NODES);
/* Force set the current node to allow easy compare */
set_bit(node_num, netmap);
- if (!memcmp(hbmap, netmap, sizeof(hbmap)))
+ if (bitmap_equal(hbmap, netmap, O2NM_MAX_NODES))
return 0;
- if (i < O2CB_MAP_STABILIZE_COUNT)
+ if (i < O2CB_MAP_STABILIZE_COUNT - 1)
msleep(1000);
}
@@ -398,7 +388,8 @@ static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn)
return 0;
}
-static int o2cb_cluster_this_node(unsigned int *node)
+static int o2cb_cluster_this_node(struct ocfs2_cluster_connection *conn,
+ unsigned int *node)
{
int node_num;
@@ -413,7 +404,7 @@ static int o2cb_cluster_this_node(unsigned int *node)
return 0;
}
-static struct ocfs2_stack_operations o2cb_stack_ops = {
+static const struct ocfs2_stack_operations o2cb_stack_ops = {
.connect = o2cb_cluster_connect,
.disconnect = o2cb_cluster_disconnect,
.this_node = o2cb_cluster_this_node,
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 286edf1e231f..be0a5758bd40 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -1,29 +1,21 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-only
+/*
* stack_user.c
*
* Code which interfaces ocfs2 with fs/dlm and a userspace stack.
*
* Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation, version 2.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#include <linux/module.h>
#include <linux/fs.h>
+#include <linux/filelock.h>
#include <linux/miscdevice.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/reboot.h>
-#include <asm/uaccess.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
#include "stackglue.h"
@@ -102,6 +94,12 @@
#define OCFS2_TEXT_UUID_LEN 32
#define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2
#define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8
+#define VERSION_LOCK "version_lock"
+
+enum ocfs2_connection_type {
+ WITH_CONTROLD,
+ NO_CONTROLD
+};
/*
* ocfs2_live_connection is refcounted because the filesystem and
@@ -110,6 +108,13 @@
struct ocfs2_live_connection {
struct list_head oc_list;
struct ocfs2_cluster_connection *oc_conn;
+ enum ocfs2_connection_type oc_type;
+ atomic_t oc_this_node;
+ int oc_our_slot;
+ struct dlm_lksb oc_version_lksb;
+ char oc_lvb[DLM_LVB_LEN];
+ struct completion oc_sync_wait;
+ wait_queue_head_t oc_wait;
};
struct ocfs2_control_private {
@@ -198,20 +203,15 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name)
* mount path. Since the VFS prevents multiple calls to
* fill_super(), we can't get dupes here.
*/
-static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn,
- struct ocfs2_live_connection **c_ret)
+static int ocfs2_live_connection_attach(struct ocfs2_cluster_connection *conn,
+ struct ocfs2_live_connection *c)
{
int rc = 0;
- struct ocfs2_live_connection *c;
-
- c = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
- if (!c)
- return -ENOMEM;
mutex_lock(&ocfs2_control_lock);
c->oc_conn = conn;
- if (atomic_read(&ocfs2_control_opened))
+ if ((c->oc_type == NO_CONTROLD) || atomic_read(&ocfs2_control_opened))
list_add(&c->oc_list, &ocfs2_live_connection_list);
else {
printk(KERN_ERR
@@ -220,12 +220,6 @@ static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn,
}
mutex_unlock(&ocfs2_control_lock);
-
- if (!rc)
- *c_ret = c;
- else
- kfree(c);
-
return rc;
}
@@ -366,7 +360,6 @@ static int ocfs2_control_do_setnode_msg(struct file *file,
struct ocfs2_control_message_setn *msg)
{
long nodenum;
- char *ptr = NULL;
struct ocfs2_control_private *p = file->private_data;
if (ocfs2_control_get_handshake_state(file) !=
@@ -381,8 +374,7 @@ static int ocfs2_control_do_setnode_msg(struct file *file,
return -EINVAL;
msg->space = msg->newline = '\0';
- nodenum = simple_strtol(msg->nodestr, &ptr, 16);
- if (!ptr || *ptr)
+ if (kstrtol(msg->nodestr, 16, &nodenum))
return -EINVAL;
if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) ||
@@ -395,9 +387,8 @@ static int ocfs2_control_do_setnode_msg(struct file *file,
static int ocfs2_control_do_setversion_msg(struct file *file,
struct ocfs2_control_message_setv *msg)
- {
+{
long major, minor;
- char *ptr = NULL;
struct ocfs2_control_private *p = file->private_data;
struct ocfs2_protocol_version *max =
&ocfs2_user_plugin.sp_max_proto;
@@ -415,11 +406,9 @@ static int ocfs2_control_do_setversion_msg(struct file *file,
return -EINVAL;
msg->space1 = msg->space2 = msg->newline = '\0';
- major = simple_strtol(msg->major, &ptr, 16);
- if (!ptr || *ptr)
+ if (kstrtol(msg->major, 16, &major))
return -EINVAL;
- minor = simple_strtol(msg->minor, &ptr, 16);
- if (!ptr || *ptr)
+ if (kstrtol(msg->minor, 16, &minor))
return -EINVAL;
/*
@@ -447,7 +436,6 @@ static int ocfs2_control_do_down_msg(struct file *file,
struct ocfs2_control_message_down *msg)
{
long nodenum;
- char *p = NULL;
if (ocfs2_control_get_handshake_state(file) !=
OCFS2_CONTROL_HANDSHAKE_VALID)
@@ -462,8 +450,7 @@ static int ocfs2_control_do_down_msg(struct file *file,
return -EINVAL;
msg->space1 = msg->space2 = msg->newline = '\0';
- nodenum = simple_strtol(msg->nodestr, &p, 16);
- if (!p || *p)
+ if (kstrtol(msg->nodestr, 16, &nodenum))
return -EINVAL;
if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) ||
@@ -588,7 +575,7 @@ static int ocfs2_control_release(struct inode *inode, struct file *file)
*/
ocfs2_control_this_node = -1;
running_proto.pv_major = 0;
- running_proto.pv_major = 0;
+ running_proto.pv_minor = 0;
}
out:
@@ -652,14 +639,7 @@ static int ocfs2_control_init(void)
static void ocfs2_control_exit(void)
{
- int rc;
-
- rc = misc_deregister(&ocfs2_control_device);
- if (rc)
- printk(KERN_ERR
- "ocfs2: Unable to deregister ocfs2_control device "
- "(errno %d)\n",
- -rc);
+ misc_deregister(&ocfs2_control_device);
}
static void fsdlm_lock_ast_wrapper(void *astarg)
@@ -697,28 +677,22 @@ static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
void *name,
unsigned int namelen)
{
- int ret;
-
if (!lksb->lksb_fsdlm.sb_lvbptr)
lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb +
sizeof(struct dlm_lksb);
- ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm,
- flags|DLM_LKF_NODLCKWT, name, namelen, 0,
- fsdlm_lock_ast_wrapper, lksb,
- fsdlm_blocking_ast_wrapper);
- return ret;
+ return dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm,
+ flags|DLM_LKF_NODLCKWT, name, namelen, 0,
+ fsdlm_lock_ast_wrapper, lksb,
+ fsdlm_blocking_ast_wrapper);
}
static int user_dlm_unlock(struct ocfs2_cluster_connection *conn,
struct ocfs2_dlm_lksb *lksb,
u32 flags)
{
- int ret;
-
- ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid,
- flags, &lksb->lksb_fsdlm, lksb);
- return ret;
+ return dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid,
+ flags, &lksb->lksb_fsdlm, lksb);
}
static int user_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
@@ -757,20 +731,13 @@ static int user_plock(struct ocfs2_cluster_connection *conn,
*
* Internally, fs/dlm will pass these to a misc device, which
* a userspace daemon will read and write to.
- *
- * For now, cancel requests (which happen internally only),
- * are turned into unlocks. Most of this function taken from
- * gfs2_lock.
*/
- if (cmd == F_CANCELLK) {
- cmd = F_SETLK;
- fl->fl_type = F_UNLCK;
- }
-
- if (IS_GETLK(cmd))
+ if (cmd == F_CANCELLK)
+ return dlm_posix_cancel(conn->cc_lockspace, ino, file, fl);
+ else if (IS_GETLK(cmd))
return dlm_posix_get(conn->cc_lockspace, ino, file, fl);
- else if (fl->fl_type == F_UNLCK)
+ else if (lock_is_unlock(fl))
return dlm_posix_unlock(conn->cc_lockspace, ino, file, fl);
else
return dlm_posix_lock(conn->cc_lockspace, ino, file, cmd, fl);
@@ -799,18 +766,257 @@ static int fs_protocol_compare(struct ocfs2_protocol_version *existing,
return 0;
}
+static void lvb_to_version(char *lvb, struct ocfs2_protocol_version *ver)
+{
+ struct ocfs2_protocol_version *pv =
+ (struct ocfs2_protocol_version *)lvb;
+ /*
+ * ocfs2_protocol_version has two u8 variables, so we don't
+ * need any endian conversion.
+ */
+ ver->pv_major = pv->pv_major;
+ ver->pv_minor = pv->pv_minor;
+}
+
+static void version_to_lvb(struct ocfs2_protocol_version *ver, char *lvb)
+{
+ struct ocfs2_protocol_version *pv =
+ (struct ocfs2_protocol_version *)lvb;
+ /*
+ * ocfs2_protocol_version has two u8 variables, so we don't
+ * need any endian conversion.
+ */
+ pv->pv_major = ver->pv_major;
+ pv->pv_minor = ver->pv_minor;
+}
+
+static void sync_wait_cb(void *arg)
+{
+ struct ocfs2_cluster_connection *conn = arg;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+ complete(&lc->oc_sync_wait);
+}
+
+static int sync_unlock(struct ocfs2_cluster_connection *conn,
+ struct dlm_lksb *lksb, char *name)
+{
+ int error;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+
+ error = dlm_unlock(conn->cc_lockspace, lksb->sb_lkid, 0, lksb, conn);
+ if (error) {
+ printk(KERN_ERR "%s lkid %x error %d\n",
+ name, lksb->sb_lkid, error);
+ return error;
+ }
+
+ wait_for_completion(&lc->oc_sync_wait);
+
+ if (lksb->sb_status != -DLM_EUNLOCK) {
+ printk(KERN_ERR "%s lkid %x status %d\n",
+ name, lksb->sb_lkid, lksb->sb_status);
+ return -1;
+ }
+ return 0;
+}
+
+static int sync_lock(struct ocfs2_cluster_connection *conn,
+ int mode, uint32_t flags,
+ struct dlm_lksb *lksb, char *name)
+{
+ int error, status;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+
+ error = dlm_lock(conn->cc_lockspace, mode, lksb, flags,
+ name, strlen(name),
+ 0, sync_wait_cb, conn, NULL);
+ if (error) {
+ printk(KERN_ERR "%s lkid %x flags %x mode %d error %d\n",
+ name, lksb->sb_lkid, flags, mode, error);
+ return error;
+ }
+
+ wait_for_completion(&lc->oc_sync_wait);
+
+ status = lksb->sb_status;
+
+ if (status && status != -EAGAIN) {
+ printk(KERN_ERR "%s lkid %x flags %x mode %d status %d\n",
+ name, lksb->sb_lkid, flags, mode, status);
+ }
+
+ return status;
+}
+
+
+static int version_lock(struct ocfs2_cluster_connection *conn, int mode,
+ int flags)
+{
+ struct ocfs2_live_connection *lc = conn->cc_private;
+ return sync_lock(conn, mode, flags,
+ &lc->oc_version_lksb, VERSION_LOCK);
+}
+
+static int version_unlock(struct ocfs2_cluster_connection *conn)
+{
+ struct ocfs2_live_connection *lc = conn->cc_private;
+ return sync_unlock(conn, &lc->oc_version_lksb, VERSION_LOCK);
+}
+
+/* get_protocol_version()
+ *
+ * To exchange ocfs2 versioning, we use the LVB of the version dlm lock.
+ * The algorithm is:
+ * 1. Attempt to take the lock in EX mode (non-blocking).
+ * 2. If successful (which means it is the first mount), write the
+ * version number and downconvert to PR lock.
+ * 3. If unsuccessful (returns -EAGAIN), read the version from the LVB after
+ * taking the PR lock.
+ */
+
+static int get_protocol_version(struct ocfs2_cluster_connection *conn)
+{
+ int ret;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+ struct ocfs2_protocol_version pv;
+
+ running_proto.pv_major =
+ ocfs2_user_plugin.sp_max_proto.pv_major;
+ running_proto.pv_minor =
+ ocfs2_user_plugin.sp_max_proto.pv_minor;
+
+ lc->oc_version_lksb.sb_lvbptr = lc->oc_lvb;
+ ret = version_lock(conn, DLM_LOCK_EX,
+ DLM_LKF_VALBLK|DLM_LKF_NOQUEUE);
+ if (!ret) {
+ conn->cc_version.pv_major = running_proto.pv_major;
+ conn->cc_version.pv_minor = running_proto.pv_minor;
+ version_to_lvb(&running_proto, lc->oc_lvb);
+ version_lock(conn, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_VALBLK);
+ } else if (ret == -EAGAIN) {
+ ret = version_lock(conn, DLM_LOCK_PR, DLM_LKF_VALBLK);
+ if (ret)
+ goto out;
+ lvb_to_version(lc->oc_lvb, &pv);
+
+ if ((pv.pv_major != running_proto.pv_major) ||
+ (pv.pv_minor > running_proto.pv_minor)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ conn->cc_version.pv_major = pv.pv_major;
+ conn->cc_version.pv_minor = pv.pv_minor;
+ }
+out:
+ return ret;
+}
+
+static void user_recover_prep(void *arg)
+{
+}
+
+static void user_recover_slot(void *arg, struct dlm_slot *slot)
+{
+ struct ocfs2_cluster_connection *conn = arg;
+ printk(KERN_INFO "ocfs2: Node %d/%d down. Initiating recovery.\n",
+ slot->nodeid, slot->slot);
+ conn->cc_recovery_handler(slot->nodeid, conn->cc_recovery_data);
+
+}
+
+static void user_recover_done(void *arg, struct dlm_slot *slots,
+ int num_slots, int our_slot,
+ uint32_t generation)
+{
+ struct ocfs2_cluster_connection *conn = arg;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+ int i;
+
+ for (i = 0; i < num_slots; i++)
+ if (slots[i].slot == our_slot) {
+ atomic_set(&lc->oc_this_node, slots[i].nodeid);
+ break;
+ }
+
+ lc->oc_our_slot = our_slot;
+ wake_up(&lc->oc_wait);
+}
+
+static const struct dlm_lockspace_ops ocfs2_ls_ops = {
+ .recover_prep = user_recover_prep,
+ .recover_slot = user_recover_slot,
+ .recover_done = user_recover_done,
+};
+
+static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn)
+{
+ version_unlock(conn);
+ dlm_release_lockspace(conn->cc_lockspace, DLM_RELEASE_NORMAL);
+ conn->cc_lockspace = NULL;
+ ocfs2_live_connection_drop(conn->cc_private);
+ conn->cc_private = NULL;
+ return 0;
+}
+
static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
{
dlm_lockspace_t *fsdlm;
- struct ocfs2_live_connection *uninitialized_var(control);
- int rc = 0;
+ struct ocfs2_live_connection *lc;
+ int rc, ops_rv;
BUG_ON(conn == NULL);
- rc = ocfs2_live_connection_new(conn, &control);
+ lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
+ if (!lc)
+ return -ENOMEM;
+
+ init_waitqueue_head(&lc->oc_wait);
+ init_completion(&lc->oc_sync_wait);
+ atomic_set(&lc->oc_this_node, 0);
+ conn->cc_private = lc;
+ lc->oc_type = NO_CONTROLD;
+
+ rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name,
+ DLM_LSFL_NEWEXCL, DLM_LVB_LEN,
+ &ocfs2_ls_ops, conn, &ops_rv, &fsdlm);
+ if (rc) {
+ if (rc == -EEXIST || rc == -EPROTO)
+ printk(KERN_ERR "ocfs2: Unable to create the "
+ "lockspace %s (%d), because a ocfs2-tools "
+ "program is running on this file system "
+ "with the same name lockspace\n",
+ conn->cc_name, rc);
+ goto out;
+ }
+
+ if (ops_rv == -EOPNOTSUPP) {
+ lc->oc_type = WITH_CONTROLD;
+ printk(KERN_NOTICE "ocfs2: You seem to be using an older "
+ "version of dlm_controld and/or ocfs2-tools."
+ " Please consider upgrading.\n");
+ } else if (ops_rv) {
+ rc = ops_rv;
+ goto out;
+ }
+ conn->cc_lockspace = fsdlm;
+
+ rc = ocfs2_live_connection_attach(conn, lc);
if (rc)
goto out;
+ if (lc->oc_type == NO_CONTROLD) {
+ rc = get_protocol_version(conn);
+ if (rc) {
+ printk(KERN_ERR "ocfs2: Could not determine"
+ " locking version\n");
+ user_cluster_disconnect(conn);
+ lc = NULL;
+ goto out;
+ }
+ wait_event(lc->oc_wait, (atomic_read(&lc->oc_this_node) > 0));
+ }
+
/*
* running_proto must have been set before we allowed any mounts
* to proceed.
@@ -818,42 +1024,34 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
if (fs_protocol_compare(&running_proto, &conn->cc_version)) {
printk(KERN_ERR
"Unable to mount with fs locking protocol version "
- "%u.%u because the userspace control daemon has "
- "negotiated %u.%u\n",
+ "%u.%u because negotiated protocol is %u.%u\n",
conn->cc_version.pv_major, conn->cc_version.pv_minor,
running_proto.pv_major, running_proto.pv_minor);
rc = -EPROTO;
- ocfs2_live_connection_drop(control);
- goto out;
+ ocfs2_live_connection_drop(lc);
+ lc = NULL;
}
- rc = dlm_new_lockspace(conn->cc_name, NULL, DLM_LSFL_FS, DLM_LVB_LEN,
- NULL, NULL, NULL, &fsdlm);
- if (rc) {
- ocfs2_live_connection_drop(control);
- goto out;
- }
-
- conn->cc_private = control;
- conn->cc_lockspace = fsdlm;
out:
+ if (rc)
+ kfree(lc);
return rc;
}
-static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn)
-{
- dlm_release_lockspace(conn->cc_lockspace, 2);
- conn->cc_lockspace = NULL;
- ocfs2_live_connection_drop(conn->cc_private);
- conn->cc_private = NULL;
- return 0;
-}
-static int user_cluster_this_node(unsigned int *this_node)
+static int user_cluster_this_node(struct ocfs2_cluster_connection *conn,
+ unsigned int *this_node)
{
int rc;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+
+ if (lc->oc_type == WITH_CONTROLD)
+ rc = ocfs2_control_get_this_node();
+ else if (lc->oc_type == NO_CONTROLD)
+ rc = atomic_read(&lc->oc_this_node);
+ else
+ rc = -EINVAL;
- rc = ocfs2_control_get_this_node();
if (rc < 0)
return rc;
@@ -861,7 +1059,7 @@ static int user_cluster_this_node(unsigned int *this_node)
return 0;
}
-static struct ocfs2_stack_operations ocfs2_user_plugin_ops = {
+static const struct ocfs2_stack_operations ocfs2_user_plugin_ops = {
.connect = user_cluster_connect,
.disconnect = user_cluster_disconnect,
.this_node = user_cluster_this_node,
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 39abf89697ed..a28c127b9934 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -1,21 +1,11 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-only
+/*
* stackglue.c
*
* Code which implements an OCFS2 specific interface to underlying
* cluster stacks.
*
* Copyright (C) 2007, 2009 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation, version 2.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#include <linux/list.h>
@@ -309,6 +299,8 @@ int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
EXPORT_SYMBOL_GPL(ocfs2_plock);
int ocfs2_cluster_connect(const char *stack_name,
+ const char *cluster_name,
+ int cluster_name_len,
const char *group,
int grouplen,
struct ocfs2_locking_protocol *lproto,
@@ -342,8 +334,12 @@ int ocfs2_cluster_connect(const char *stack_name,
goto out;
}
- memcpy(new_conn->cc_name, group, grouplen);
+ strscpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1);
new_conn->cc_namelen = grouplen;
+ if (cluster_name_len)
+ strscpy(new_conn->cc_cluster_name, cluster_name,
+ CLUSTER_NAME_MAX + 1);
+ new_conn->cc_cluster_name_len = cluster_name_len;
new_conn->cc_recovery_handler = recovery_handler;
new_conn->cc_recovery_data = recovery_data;
@@ -386,8 +382,9 @@ int ocfs2_cluster_connect_agnostic(const char *group,
if (cluster_stack_name[0])
stack_name = cluster_stack_name;
- return ocfs2_cluster_connect(stack_name, group, grouplen, lproto,
- recovery_handler, recovery_data, conn);
+ return ocfs2_cluster_connect(stack_name, NULL, 0, group, grouplen,
+ lproto, recovery_handler, recovery_data,
+ conn);
}
EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic);
@@ -460,9 +457,10 @@ void ocfs2_cluster_hangup(const char *group, int grouplen)
}
EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup);
-int ocfs2_cluster_this_node(unsigned int *node)
+int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn,
+ unsigned int *node)
{
- return active_stack->sp_ops->this_node(node);
+ return active_stack->sp_ops->this_node(conn, node);
}
EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node);
@@ -488,7 +486,7 @@ static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj,
}
static struct kobj_attribute ocfs2_attr_max_locking_protocol =
- __ATTR(max_locking_protocol, S_IFREG | S_IRUGO,
+ __ATTR(max_locking_protocol, S_IRUGO,
ocfs2_max_locking_protocol_show, NULL);
static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj,
@@ -502,11 +500,7 @@ static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj,
list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
ret = snprintf(buf, remain, "%s\n",
p->sp_name);
- if (ret < 0) {
- total = ret;
- break;
- }
- if (ret == remain) {
+ if (ret >= remain) {
/* snprintf() didn't fit */
total = -E2BIG;
break;
@@ -520,7 +514,7 @@ static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj,
}
static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins =
- __ATTR(loaded_cluster_plugins, S_IFREG | S_IRUGO,
+ __ATTR(loaded_cluster_plugins, S_IRUGO,
ocfs2_loaded_cluster_plugins_show, NULL);
static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj,
@@ -533,7 +527,7 @@ static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj,
if (active_stack) {
ret = snprintf(buf, PAGE_SIZE, "%s\n",
active_stack->sp_name);
- if (ret == PAGE_SIZE)
+ if (ret >= PAGE_SIZE)
ret = -E2BIG;
}
spin_unlock(&ocfs2_stack_lock);
@@ -542,7 +536,7 @@ static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj,
}
static struct kobj_attribute ocfs2_attr_active_cluster_plugin =
- __ATTR(active_cluster_plugin, S_IFREG | S_IRUGO,
+ __ATTR(active_cluster_plugin, S_IRUGO,
ocfs2_active_cluster_plugin_show, NULL);
static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj,
@@ -591,23 +585,38 @@ static ssize_t ocfs2_cluster_stack_store(struct kobject *kobj,
static struct kobj_attribute ocfs2_attr_cluster_stack =
- __ATTR(cluster_stack, S_IFREG | S_IRUGO | S_IWUSR,
+ __ATTR(cluster_stack, S_IRUGO | S_IWUSR,
ocfs2_cluster_stack_show,
ocfs2_cluster_stack_store);
+
+
+static ssize_t ocfs2_dlm_recover_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "1\n");
+}
+
+static struct kobj_attribute ocfs2_attr_dlm_recover_support =
+ __ATTR(dlm_recover_callback_support, S_IRUGO,
+ ocfs2_dlm_recover_show, NULL);
+
static struct attribute *ocfs2_attrs[] = {
&ocfs2_attr_max_locking_protocol.attr,
&ocfs2_attr_loaded_cluster_plugins.attr,
&ocfs2_attr_active_cluster_plugin.attr,
&ocfs2_attr_cluster_stack.attr,
+ &ocfs2_attr_dlm_recover_support.attr,
NULL,
};
-static struct attribute_group ocfs2_attr_group = {
+static const struct attribute_group ocfs2_attr_group = {
.attrs = ocfs2_attrs,
};
-static struct kset *ocfs2_kset;
+struct kset *ocfs2_kset;
+EXPORT_SYMBOL_GPL(ocfs2_kset);
static void ocfs2_sysfs_exit(void)
{
@@ -641,9 +650,7 @@ error:
* and easier to preserve the name.
*/
-#define FS_OCFS2_NM 1
-
-static ctl_table ocfs2_nm_table[] = {
+static const struct ctl_table ocfs2_nm_table[] = {
{
.procname = "hb_ctl_path",
.data = ocfs2_hb_ctl_path,
@@ -651,44 +658,9 @@ static ctl_table ocfs2_nm_table[] = {
.mode = 0644,
.proc_handler = proc_dostring,
},
- { }
};
-static ctl_table ocfs2_mod_table[] = {
- {
- .procname = "nm",
- .data = NULL,
- .maxlen = 0,
- .mode = 0555,
- .child = ocfs2_nm_table
- },
- { }
-};
-
-static ctl_table ocfs2_kern_table[] = {
- {
- .procname = "ocfs2",
- .data = NULL,
- .maxlen = 0,
- .mode = 0555,
- .child = ocfs2_mod_table
- },
- { }
-};
-
-static ctl_table ocfs2_root_table[] = {
- {
- .procname = "fs",
- .data = NULL,
- .maxlen = 0,
- .mode = 0555,
- .child = ocfs2_kern_table
- },
- { }
-};
-
-static struct ctl_table_header *ocfs2_table_header = NULL;
-
+static struct ctl_table_header *ocfs2_table_header;
/*
* Initialization
@@ -696,31 +668,34 @@ static struct ctl_table_header *ocfs2_table_header = NULL;
static int __init ocfs2_stack_glue_init(void)
{
+ int ret;
+
strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB);
- ocfs2_table_header = register_sysctl_table(ocfs2_root_table);
+ ocfs2_table_header = register_sysctl("fs/ocfs2/nm", ocfs2_nm_table);
if (!ocfs2_table_header) {
printk(KERN_ERR
"ocfs2 stack glue: unable to register sysctl\n");
return -ENOMEM; /* or something. */
}
- return ocfs2_sysfs_init();
+ ret = ocfs2_sysfs_init();
+ if (ret)
+ unregister_sysctl_table(ocfs2_table_header);
+
+ return ret;
}
static void __exit ocfs2_stack_glue_exit(void)
{
memset(&locking_max_version, 0,
sizeof(struct ocfs2_protocol_version));
- locking_max_version.pv_major = 0;
- locking_max_version.pv_minor = 0;
ocfs2_sysfs_exit();
- if (ocfs2_table_header)
- unregister_sysctl_table(ocfs2_table_header);
+ unregister_sysctl_table(ocfs2_table_header);
}
MODULE_AUTHOR("Oracle");
-MODULE_DESCRIPTION("ocfs2 cluter stack glue layer");
+MODULE_DESCRIPTION("ocfs2 cluster stack glue layer");
MODULE_LICENSE("GPL");
module_init(ocfs2_stack_glue_init);
module_exit(ocfs2_stack_glue_exit);
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 1ec56fdb8d0d..5486a6dce70a 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -1,20 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
* stackglue.h
*
* Glue to the underlying cluster stack.
*
* Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation, version 2.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
@@ -45,6 +35,9 @@ struct file_lock;
*/
#define GROUP_NAME_MAX 64
+/* This shadows OCFS2_CLUSTER_NAME_LEN */
+#define CLUSTER_NAME_MAX 16
+
/*
* ocfs2_protocol_version changes when ocfs2 does something different in
@@ -97,8 +90,10 @@ struct ocfs2_locking_protocol {
* locking compatibility.
*/
struct ocfs2_cluster_connection {
- char cc_name[GROUP_NAME_MAX];
+ char cc_name[GROUP_NAME_MAX + 1];
int cc_namelen;
+ char cc_cluster_name[CLUSTER_NAME_MAX + 1];
+ int cc_cluster_name_len;
struct ocfs2_protocol_version cc_version;
struct ocfs2_locking_protocol *cc_proto;
void (*cc_recovery_handler)(int node_num, void *recovery_data);
@@ -152,7 +147,8 @@ struct ocfs2_stack_operations {
* ->this_node() returns the cluster's unique identifier for the
* local node.
*/
- int (*this_node)(unsigned int *node);
+ int (*this_node)(struct ocfs2_cluster_connection *conn,
+ unsigned int *node);
/*
* Call the underlying dlm lock function. The ->dlm_lock()
@@ -214,7 +210,7 @@ struct ocfs2_stack_operations {
struct file_lock *fl);
/*
- * This is an optoinal debugging hook. If provided, the
+ * This is an optional debugging hook. If provided, the
* stack can dump debugging information about this lock.
*/
void (*dump_lksb)(struct ocfs2_dlm_lksb *lksb);
@@ -227,7 +223,7 @@ struct ocfs2_stack_operations {
*/
struct ocfs2_stack_plugin {
char *sp_name;
- struct ocfs2_stack_operations *sp_ops;
+ const struct ocfs2_stack_operations *sp_ops;
struct module *sp_owner;
/* These are managed by the stackglue code. */
@@ -239,6 +235,8 @@ struct ocfs2_stack_plugin {
/* Used by the filesystem */
int ocfs2_cluster_connect(const char *stack_name,
+ const char *cluster_name,
+ int cluster_name_len,
const char *group,
int grouplen,
struct ocfs2_locking_protocol *lproto,
@@ -260,7 +258,8 @@ int ocfs2_cluster_connect_agnostic(const char *group,
int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
int hangup_pending);
void ocfs2_cluster_hangup(const char *group, int grouplen);
-int ocfs2_cluster_this_node(unsigned int *node);
+int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn,
+ unsigned int *node);
struct ocfs2_lock_res;
int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
@@ -289,4 +288,6 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p
int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin);
void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin);
+extern struct kset *ocfs2_kset;
+
#endif /* STACKGLUE_H */
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 5397c07ce608..6ac4dcd54588 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1,27 +1,11 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* suballoc.c
*
* metadata alloc and free
* Inspired by ext3 block groups.
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
@@ -66,6 +50,10 @@ struct ocfs2_suballoc_result {
u64 sr_blkno; /* The first allocated block */
unsigned int sr_bit_offset; /* The bit in the bg */
unsigned int sr_bits; /* How many bits we claimed */
+ unsigned int sr_max_contig_bits; /* The length for contiguous
+ * free bits, only available
+ * for cluster group
+ */
};
static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
@@ -79,8 +67,6 @@ static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset);
}
-static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
-static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
static int ocfs2_block_group_fill(handle_t *handle,
struct inode *alloc_inode,
@@ -113,12 +99,6 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
struct ocfs2_suballoc_result *res);
static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
int nr);
-static inline int ocfs2_block_group_set_bits(handle_t *handle,
- struct inode *alloc_inode,
- struct ocfs2_group_desc *bg,
- struct buffer_head *group_bh,
- unsigned int bit_off,
- unsigned int num_bits);
static int ocfs2_relink_block_group(handle_t *handle,
struct inode *alloc_inode,
struct buffer_head *fe_bh,
@@ -147,7 +127,7 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
if (ac->ac_which != OCFS2_AC_USE_LOCAL)
ocfs2_inode_unlock(inode, 1);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
iput(inode);
ac->ac_inode = NULL;
@@ -155,10 +135,8 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
brelse(ac->ac_bh);
ac->ac_bh = NULL;
ac->ac_resv = NULL;
- if (ac->ac_find_loc_priv) {
- kfree(ac->ac_find_loc_priv);
- ac->ac_find_loc_priv = NULL;
- }
+ kfree(ac->ac_find_loc_priv);
+ ac->ac_find_loc_priv = NULL;
}
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
@@ -173,12 +151,12 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
}
#define do_error(fmt, ...) \
- do{ \
- if (resize) \
- mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \
- else \
- ocfs2_error(sb, fmt, ##__VA_ARGS__); \
- } while (0)
+do { \
+ if (resize) \
+ mlog(ML_ERROR, fmt, ##__VA_ARGS__); \
+ else \
+ return ocfs2_error(sb, fmt, ##__VA_ARGS__); \
+} while (0)
static int ocfs2_validate_gd_self(struct super_block *sb,
struct buffer_head *bh,
@@ -187,44 +165,35 @@ static int ocfs2_validate_gd_self(struct super_block *sb,
struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
- do_error("Group descriptor #%llu has bad signature %.*s",
+ do_error("Group descriptor #%llu has bad signature %.*s\n",
(unsigned long long)bh->b_blocknr, 7,
gd->bg_signature);
- return -EINVAL;
}
if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
- do_error("Group descriptor #%llu has an invalid bg_blkno "
- "of %llu",
+ do_error("Group descriptor #%llu has an invalid bg_blkno of %llu\n",
(unsigned long long)bh->b_blocknr,
(unsigned long long)le64_to_cpu(gd->bg_blkno));
- return -EINVAL;
}
if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
- do_error("Group descriptor #%llu has an invalid "
- "fs_generation of #%u",
+ do_error("Group descriptor #%llu has an invalid fs_generation of #%u\n",
(unsigned long long)bh->b_blocknr,
le32_to_cpu(gd->bg_generation));
- return -EINVAL;
}
if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
- do_error("Group descriptor #%llu has bit count %u but "
- "claims that %u are free",
+ do_error("Group descriptor #%llu has bit count %u but claims that %u are free\n",
(unsigned long long)bh->b_blocknr,
le16_to_cpu(gd->bg_bits),
le16_to_cpu(gd->bg_free_bits_count));
- return -EINVAL;
}
if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
- do_error("Group descriptor #%llu has bit count %u but "
- "max bitmap bits of %u",
+ do_error("Group descriptor #%llu has bit count %u but max bitmap bits of %u\n",
(unsigned long long)bh->b_blocknr,
le16_to_cpu(gd->bg_bits),
8 * le16_to_cpu(gd->bg_size));
- return -EINVAL;
}
return 0;
@@ -239,20 +208,17 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
if (di->i_blkno != gd->bg_parent_dinode) {
- do_error("Group descriptor #%llu has bad parent "
- "pointer (%llu, expected %llu)",
+ do_error("Group descriptor #%llu has bad parent pointer (%llu, expected %llu)\n",
(unsigned long long)bh->b_blocknr,
(unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
(unsigned long long)le64_to_cpu(di->i_blkno));
- return -EINVAL;
}
max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
if (le16_to_cpu(gd->bg_bits) > max_bits) {
- do_error("Group descriptor #%llu has bit count of %u",
+ do_error("Group descriptor #%llu has bit count of %u\n",
(unsigned long long)bh->b_blocknr,
le16_to_cpu(gd->bg_bits));
- return -EINVAL;
}
/* In resize, we may meet the case bg_chain == cl_next_free_rec. */
@@ -260,10 +226,9 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
((le16_to_cpu(gd->bg_chain) ==
le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
- do_error("Group descriptor #%llu has bad chain %u",
+ do_error("Group descriptor #%llu has bad chain %u\n",
(unsigned long long)bh->b_blocknr,
le16_to_cpu(gd->bg_chain));
- return -EINVAL;
}
return 0;
@@ -390,11 +355,10 @@ static int ocfs2_block_group_fill(handle_t *handle,
struct super_block * sb = alloc_inode->i_sb;
if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
- ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
- "b_blocknr (%llu)",
- (unsigned long long)group_blkno,
- (unsigned long long) bg_bh->b_blocknr);
- status = -EIO;
+ status = ocfs2_error(alloc_inode->i_sb,
+ "group block (%llu) != b_blocknr (%llu)\n",
+ (unsigned long long)group_blkno,
+ (unsigned long long) bg_bh->b_blocknr);
goto bail;
}
@@ -409,7 +373,7 @@ static int ocfs2_block_group_fill(handle_t *handle,
memset(bg, 0, sb->s_blocksize);
strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
- bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
+ bg->bg_generation = cpu_to_le32(osb->fs_generation);
bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
osb->s_feature_incompat));
bg->bg_chain = cpu_to_le16(my_chain);
@@ -481,7 +445,7 @@ ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
bg_bh = sb_getblk(osb->sb, bg_blkno);
if (!bg_bh) {
- status = -EIO;
+ status = -ENOMEM;
mlog_errno(status);
goto bail;
}
@@ -661,7 +625,7 @@ ocfs2_block_group_alloc_discontig(handle_t *handle,
bg_bh = sb_getblk(osb->sb, bg_blkno);
if (!bg_bh) {
- status = -EIO;
+ status = -ENOMEM;
mlog_errno(status);
goto bail;
}
@@ -734,10 +698,12 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
ac, cl);
- if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC))
+ if (PTR_ERR(bg_bh) == -ENOSPC) {
+ ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG;
bg_bh = ocfs2_block_group_alloc_discontig(handle,
alloc_inode,
ac, cl);
+ }
if (IS_ERR(bg_bh)) {
status = PTR_ERR(bg_bh);
bg_bh = NULL;
@@ -777,6 +743,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
+ ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0);
status = 0;
@@ -818,11 +785,11 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
return -EINVAL;
}
- mutex_lock(&alloc_inode->i_mutex);
+ inode_lock(alloc_inode);
status = ocfs2_inode_lock(alloc_inode, &bh, 1);
if (status < 0) {
- mutex_unlock(&alloc_inode->i_mutex);
+ inode_unlock(alloc_inode);
iput(alloc_inode);
mlog_errno(status);
@@ -839,9 +806,9 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
- ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
- (unsigned long long)le64_to_cpu(fe->i_blkno));
- status = -EIO;
+ status = ocfs2_error(alloc_inode->i_sb,
+ "Invalid chain allocator %llu\n",
+ (unsigned long long)le64_to_cpu(fe->i_blkno));
goto bail;
}
@@ -916,9 +883,9 @@ static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
{
spin_lock(&osb->osb_lock);
if (type == INODE_ALLOC_SYSTEM_INODE)
- osb->s_inode_steal_slot = slot;
+ osb->s_inode_steal_slot = (u16)slot;
else if (type == EXTENT_ALLOC_SYSTEM_INODE)
- osb->s_meta_steal_slot = slot;
+ osb->s_meta_steal_slot = (u16)slot;
spin_unlock(&osb->osb_lock);
}
@@ -1168,12 +1135,9 @@ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
GLOBAL_BITMAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT, NULL,
ALLOC_NEW_GROUP);
- if (status < 0 && status != -ENOSPC) {
+ if (status < 0 && status != -ENOSPC)
mlog_errno(status);
- goto bail;
- }
-bail:
return status;
}
@@ -1185,7 +1149,8 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
int flags,
struct ocfs2_alloc_context **ac)
{
- int status;
+ int status, ret = 0;
+ int retried = 0;
*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
if (!(*ac)) {
@@ -1210,7 +1175,34 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
}
if (status == -ENOSPC) {
+retry:
status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
+ /* Retry if there is sufficient space cached in truncate log */
+ if (status == -ENOSPC && !retried) {
+ retried = 1;
+ ocfs2_inode_unlock((*ac)->ac_inode, 1);
+ inode_unlock((*ac)->ac_inode);
+
+ ret = ocfs2_try_to_free_truncate_log(osb, bits_wanted);
+ if (ret == 1) {
+ iput((*ac)->ac_inode);
+ (*ac)->ac_inode = NULL;
+ goto retry;
+ }
+
+ if (ret < 0)
+ mlog_errno(ret);
+
+ inode_lock((*ac)->ac_inode);
+ ret = ocfs2_inode_lock((*ac)->ac_inode, NULL, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ inode_unlock((*ac)->ac_inode);
+ iput((*ac)->ac_inode);
+ (*ac)->ac_inode = NULL;
+ goto bail;
+ }
+ }
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
@@ -1264,25 +1256,48 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
int nr)
{
struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+ struct journal_head *jh;
int ret;
if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
return 0;
- if (!buffer_jbd(bg_bh))
+ jh = jbd2_journal_grab_journal_head(bg_bh);
+ if (!jh)
return 1;
- jbd_lock_bh_state(bg_bh);
- bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
+ spin_lock(&jh->b_state_lock);
+ bg = (struct ocfs2_group_desc *) jh->b_committed_data;
if (bg)
ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
else
ret = 1;
- jbd_unlock_bh_state(bg_bh);
+ spin_unlock(&jh->b_state_lock);
+ jbd2_journal_put_journal_head(jh);
return ret;
}
+u16 ocfs2_find_max_contig_free_bits(void *bitmap,
+ u16 total_bits, u16 start)
+{
+ u16 offset, free_bits;
+ u16 contig_bits = 0;
+
+ while (start < total_bits) {
+ offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start);
+ if (offset == total_bits)
+ break;
+
+ start = ocfs2_find_next_bit(bitmap, total_bits, offset);
+ free_bits = start - offset;
+ if (contig_bits < free_bits)
+ contig_bits = free_bits;
+ }
+
+ return contig_bits;
+}
+
static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
struct buffer_head *bg_bh,
unsigned int bits_wanted,
@@ -1291,6 +1306,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
{
void *bitmap;
u16 best_offset, best_size;
+ u16 prev_best_size = 0;
int offset, start, found, status = 0;
struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
@@ -1301,10 +1317,8 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
found = start = best_offset = best_size = 0;
bitmap = bg->bg_bitmap;
- while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) {
- if (offset == total_bits)
- break;
-
+ while ((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) <
+ total_bits) {
if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
/* We found a zero, but we can't use it as it
* hasn't been put to disk yet! */
@@ -1319,6 +1333,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
/* got a zero after some ones */
found = 1;
start = offset + 1;
+ prev_best_size = best_size;
}
if (found > best_size) {
best_size = found;
@@ -1331,6 +1346,8 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
}
}
+ /* best_size will be allocated, we save prev_best_size */
+ res->sr_max_contig_bits = prev_best_size;
if (best_size) {
res->sr_bit_offset = best_offset;
res->sr_bits = best_size;
@@ -1343,16 +1360,21 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
return status;
}
-static inline int ocfs2_block_group_set_bits(handle_t *handle,
+int ocfs2_block_group_set_bits(handle_t *handle,
struct inode *alloc_inode,
struct ocfs2_group_desc *bg,
struct buffer_head *group_bh,
unsigned int bit_off,
- unsigned int num_bits)
+ unsigned int num_bits,
+ unsigned int max_contig_bits,
+ int fastpath)
{
int status;
void *bitmap = bg->bg_bitmap;
int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
+ unsigned int start = bit_off + num_bits;
+ u16 contig_bits;
+ struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
/* All callers get the descriptor via
* ocfs2_read_group_descriptor(). Any corruption is a code bug. */
@@ -1375,21 +1397,41 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
- ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
- " count %u but claims %u are freed. num_bits %d",
- (unsigned long long)le64_to_cpu(bg->bg_blkno),
- le16_to_cpu(bg->bg_bits),
- le16_to_cpu(bg->bg_free_bits_count), num_bits);
- return -EROFS;
+ return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
+ (unsigned long long)le64_to_cpu(bg->bg_blkno),
+ le16_to_cpu(bg->bg_bits),
+ le16_to_cpu(bg->bg_free_bits_count),
+ num_bits);
}
while(num_bits--)
ocfs2_set_bit(bit_off++, bitmap);
+ /*
+ * this is optimize path, caller set old contig value
+ * in max_contig_bits to bypass finding action.
+ */
+ if (fastpath) {
+ bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits);
+ } else if (ocfs2_is_cluster_bitmap(alloc_inode)) {
+ /*
+ * Usually, the block group bitmap allocates only 1 bit
+ * at a time, while the cluster group allocates n bits
+ * each time. Therefore, we only save the contig bits for
+ * the cluster group.
+ */
+ contig_bits = ocfs2_find_max_contig_free_bits(bitmap,
+ le16_to_cpu(bg->bg_bits), start);
+ if (contig_bits > max_contig_bits)
+ max_contig_bits = contig_bits;
+ bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits);
+ ocfs2_local_alloc_seen_free_bits(osb, max_contig_bits);
+ } else {
+ bg->bg_contig_free_bits = 0;
+ }
+
ocfs2_journal_dirty(handle, group_bh);
bail:
- if (status)
- mlog_errno(status);
return status;
}
@@ -1500,7 +1542,12 @@ static int ocfs2_cluster_group_search(struct inode *inode,
BUG_ON(!ocfs2_is_cluster_bitmap(inode));
- if (gd->bg_free_bits_count) {
+ if (le16_to_cpu(gd->bg_contig_free_bits) &&
+ le16_to_cpu(gd->bg_contig_free_bits) < bits_wanted)
+ return -ENOSPC;
+
+ /* ->bg_contig_free_bits may un-initialized, so compare again */
+ if (le16_to_cpu(gd->bg_free_bits_count) >= bits_wanted) {
max_bits = le16_to_cpu(gd->bg_bits);
/* Tail groups in cluster bitmaps which aren't cpg
@@ -1520,7 +1567,7 @@ static int ocfs2_cluster_group_search(struct inode *inode,
OCFS2_I(inode)->ip_clusters, max_bits);
}
- ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
+ ret = ocfs2_block_group_find_clear_bits(osb,
group_bh, bits_wanted,
max_bits, res);
if (ret)
@@ -1544,13 +1591,6 @@ static int ocfs2_cluster_group_search(struct inode *inode,
* of bits. */
if (min_bits <= res->sr_bits)
search = 0; /* success */
- else if (res->sr_bits) {
- /*
- * Don't show bits which we'll be returning
- * for allocation to the local alloc bitmap.
- */
- ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits);
- }
}
return search;
@@ -1569,7 +1609,7 @@ static int ocfs2_block_group_search(struct inode *inode,
BUG_ON(min_bits != 1);
BUG_ON(ocfs2_is_cluster_bitmap(inode));
- if (bg->bg_free_bits_count) {
+ if (le16_to_cpu(bg->bg_free_bits_count) >= bits_wanted) {
ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
group_bh, bits_wanted,
le16_to_cpu(bg->bg_bits),
@@ -1588,7 +1628,7 @@ static int ocfs2_block_group_search(struct inode *inode,
return ret;
}
-static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
+int ocfs2_alloc_dinode_update_counts(struct inode *inode,
handle_t *handle,
struct buffer_head *di_bh,
u32 num_bits,
@@ -1615,6 +1655,21 @@ out:
return ret;
}
+void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
+ struct buffer_head *di_bh,
+ u32 num_bits,
+ u16 chain)
+{
+ u32 tmp_used;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
+ struct ocfs2_chain_list *cl;
+
+ cl = (struct ocfs2_chain_list *)&di->id2.i_chain;
+ tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
+ di->id1.bitmap1.i_used = cpu_to_le32(tmp_used - num_bits);
+ le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits);
+}
+
static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
struct ocfs2_extent_rec *rec,
struct ocfs2_chain_list *cl)
@@ -1714,9 +1769,14 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
}
ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
- res->sr_bit_offset, res->sr_bits);
- if (ret < 0)
+ res->sr_bit_offset, res->sr_bits,
+ res->sr_max_contig_bits, 0);
+ if (ret < 0) {
+ ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh,
+ res->sr_bits,
+ le16_to_cpu(gd->bg_chain));
mlog_errno(ret);
+ }
out_loc_only:
*bits_left = le16_to_cpu(gd->bg_free_bits_count);
@@ -1736,6 +1796,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
{
int status;
u16 chain;
+ u32 contig_bits;
u64 next_group;
struct inode *alloc_inode = ac->ac_inode;
struct buffer_head *group_bh = NULL;
@@ -1761,10 +1822,21 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
status = -ENOSPC;
/* for now, the chain search is a bit simplistic. We just use
* the 1st group with any empty bits. */
- while ((status = ac->ac_group_search(alloc_inode, group_bh,
- bits_wanted, min_bits,
- ac->ac_max_block,
- res)) == -ENOSPC) {
+ while (1) {
+ if (ac->ac_which == OCFS2_AC_USE_MAIN_DISCONTIG) {
+ contig_bits = le16_to_cpu(bg->bg_contig_free_bits);
+ if (!contig_bits)
+ contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap,
+ le16_to_cpu(bg->bg_bits), 0);
+ if (bits_wanted > contig_bits && contig_bits >= min_bits)
+ bits_wanted = contig_bits;
+ }
+
+ status = ac->ac_group_search(alloc_inode, group_bh,
+ bits_wanted, min_bits,
+ ac->ac_max_block, res);
+ if (status != -ENOSPC)
+ break;
if (!bg->bg_next_group)
break;
@@ -1844,8 +1916,12 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
bg,
group_bh,
res->sr_bit_offset,
- res->sr_bits);
+ res->sr_bits,
+ res->sr_max_contig_bits,
+ 0);
if (status < 0) {
+ ocfs2_rollback_alloc_dinode_counts(alloc_inode,
+ ac->ac_bh, res->sr_bits, chain);
mlog_errno(status);
goto bail;
}
@@ -1891,13 +1967,11 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
le32_to_cpu(fe->id1.bitmap1.i_total)) {
- ocfs2_error(ac->ac_inode->i_sb,
- "Chain allocator dinode %llu has %u used "
- "bits but only %u total.",
- (unsigned long long)le64_to_cpu(fe->i_blkno),
- le32_to_cpu(fe->id1.bitmap1.i_used),
- le32_to_cpu(fe->id1.bitmap1.i_total));
- status = -EIO;
+ status = ocfs2_error(ac->ac_inode->i_sb,
+ "Chain allocator dinode %llu has %u used bits but only %u total\n",
+ (unsigned long long)le64_to_cpu(fe->i_blkno),
+ le32_to_cpu(fe->id1.bitmap1.i_used),
+ le32_to_cpu(fe->id1.bitmap1.i_total));
goto bail;
}
@@ -1922,10 +1996,14 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
victim = ocfs2_find_victim_chain(cl);
ac->ac_chain = victim;
+search:
status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
res, &bits_left);
if (!status) {
- hint = ocfs2_group_from_res(res);
+ if (ocfs2_is_cluster_bitmap(ac->ac_inode))
+ hint = res->sr_bg_blkno;
+ else
+ hint = ocfs2_group_from_res(res);
goto set_hint;
}
if (status < 0 && status != -ENOSPC) {
@@ -1943,7 +2021,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
if (i == victim)
continue;
- if (!cl->cl_recs[i].c_free)
+ if (le32_to_cpu(cl->cl_recs[i].c_free) < bits_wanted)
continue;
ac->ac_chain = i;
@@ -1959,6 +2037,16 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
}
}
+ /* Chains can't supply the bits_wanted contiguous space.
+ * We should switch to using every single bit when allocating
+ * from the global bitmap. */
+ if (i == le16_to_cpu(cl->cl_next_free_rec) &&
+ status == -ENOSPC && ac->ac_which == OCFS2_AC_USE_MAIN) {
+ ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG;
+ ac->ac_chain = victim;
+ goto search;
+ }
+
set_hint:
if (status != -ENOSPC) {
/* If the next search of this group is not likely to
@@ -2099,7 +2187,7 @@ int ocfs2_find_new_inode_loc(struct inode *dir,
ac->ac_find_loc_priv = res;
*fe_blkno = res->sr_blkno;
-
+ ocfs2_update_inode_fsync_trans(handle, dir, 0);
out:
if (handle)
ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle);
@@ -2155,8 +2243,12 @@ int ocfs2_claim_new_inode_at_loc(handle_t *handle,
bg,
bg_bh,
res->sr_bit_offset,
- res->sr_bits);
+ res->sr_bits,
+ res->sr_max_contig_bits,
+ 0);
if (ret < 0) {
+ ocfs2_rollback_alloc_dinode_counts(ac->ac_inode,
+ ac->ac_bh, res->sr_bits, chain);
mlog_errno(ret);
goto out;
}
@@ -2298,7 +2390,8 @@ int __ocfs2_claim_clusters(handle_t *handle,
BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
- && ac->ac_which != OCFS2_AC_USE_MAIN);
+ && ac->ac_which != OCFS2_AC_USE_MAIN
+ && ac->ac_which != OCFS2_AC_USE_MAIN_DISCONTIG);
if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
WARN_ON(min_clusters > 1);
@@ -2372,12 +2465,15 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
struct buffer_head *group_bh,
unsigned int bit_off,
unsigned int num_bits,
+ unsigned int max_contig_bits,
void (*undo_fn)(unsigned int bit,
unsigned long *bmap))
{
int status;
unsigned int tmp;
+ u16 contig_bits;
struct ocfs2_group_desc *undo_bg = NULL;
+ struct journal_head *jh;
/* The caller got this descriptor from
* ocfs2_read_group_descriptor(). Any corruption is a code bug. */
@@ -2396,10 +2492,10 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
goto bail;
}
+ jh = bh2jh(group_bh);
if (undo_fn) {
- jbd_lock_bh_state(group_bh);
- undo_bg = (struct ocfs2_group_desc *)
- bh2jh(group_bh)->b_committed_data;
+ spin_lock(&jh->b_state_lock);
+ undo_bg = (struct ocfs2_group_desc *) jh->b_committed_data;
BUG_ON(!undo_bg);
}
@@ -2413,16 +2509,31 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
}
le16_add_cpu(&bg->bg_free_bits_count, num_bits);
if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
- ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
- " count %u but claims %u are freed. num_bits %d",
- (unsigned long long)le64_to_cpu(bg->bg_blkno),
- le16_to_cpu(bg->bg_bits),
- le16_to_cpu(bg->bg_free_bits_count), num_bits);
- return -EROFS;
+ if (undo_fn)
+ spin_unlock(&jh->b_state_lock);
+ return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
+ (unsigned long long)le64_to_cpu(bg->bg_blkno),
+ le16_to_cpu(bg->bg_bits),
+ le16_to_cpu(bg->bg_free_bits_count),
+ num_bits);
+ }
+
+ /*
+ * TODO: even 'num_bits == 1' (the worst case, release 1 cluster),
+ * we still need to rescan whole bitmap.
+ */
+ if (ocfs2_is_cluster_bitmap(alloc_inode)) {
+ contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap,
+ le16_to_cpu(bg->bg_bits), 0);
+ if (contig_bits > max_contig_bits)
+ max_contig_bits = contig_bits;
+ bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits);
+ } else {
+ bg->bg_contig_free_bits = 0;
}
if (undo_fn)
- jbd_unlock_bh_state(group_bh);
+ spin_unlock(&jh->b_state_lock);
ocfs2_journal_dirty(handle, group_bh);
bail:
@@ -2447,6 +2558,7 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle,
struct ocfs2_chain_list *cl = &fe->id2.i_chain;
struct buffer_head *group_bh = NULL;
struct ocfs2_group_desc *group;
+ __le16 old_bg_contig_free_bits = 0;
/* The alloc_bh comes from ocfs2_free_dinode() or
* ocfs2_free_clusters(). The callers have all locked the
@@ -2471,9 +2583,11 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle,
BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
+ if (ocfs2_is_cluster_bitmap(alloc_inode))
+ old_bg_contig_free_bits = group->bg_contig_free_bits;
status = ocfs2_block_group_clear_bits(handle, alloc_inode,
group, group_bh,
- start_bit, count, undo_fn);
+ start_bit, count, 0, undo_fn);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -2483,6 +2597,9 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle,
alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
+ ocfs2_block_group_set_bits(handle, alloc_inode, group, group_bh,
+ start_bit, count,
+ le16_to_cpu(old_bg_contig_free_bits), 1);
goto bail;
}
@@ -2494,9 +2611,6 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle,
bail:
brelse(group_bh);
-
- if (status)
- mlog_errno(status);
return status;
}
@@ -2537,16 +2651,16 @@ static int _ocfs2_free_clusters(handle_t *handle,
int status;
u16 bg_start_bit;
u64 bg_blkno;
- struct ocfs2_dinode *fe;
/* You can't ever have a contiguous set of clusters
* bigger than a block group bitmap so we never have to worry
* about looping on them.
* This is expensive. We can safely remove once this stuff has
* gotten tested really well. */
- BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
+ BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb,
+ ocfs2_blocks_to_clusters(bitmap_inode->i_sb,
+ start_blk)));
- fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
&bg_start_bit);
@@ -2567,8 +2681,6 @@ static int _ocfs2_free_clusters(handle_t *handle,
num_clusters);
out:
- if (status)
- mlog_errno(status);
return status;
}
@@ -2598,53 +2710,6 @@ int ocfs2_release_clusters(handle_t *handle,
_ocfs2_clear_bit);
}
-static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
-{
- printk("Block Group:\n");
- printk("bg_signature: %s\n", bg->bg_signature);
- printk("bg_size: %u\n", bg->bg_size);
- printk("bg_bits: %u\n", bg->bg_bits);
- printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
- printk("bg_chain: %u\n", bg->bg_chain);
- printk("bg_generation: %u\n", le32_to_cpu(bg->bg_generation));
- printk("bg_next_group: %llu\n",
- (unsigned long long)bg->bg_next_group);
- printk("bg_parent_dinode: %llu\n",
- (unsigned long long)bg->bg_parent_dinode);
- printk("bg_blkno: %llu\n",
- (unsigned long long)bg->bg_blkno);
-}
-
-static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
-{
- int i;
-
- printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno);
- printk("i_signature: %s\n", fe->i_signature);
- printk("i_size: %llu\n",
- (unsigned long long)fe->i_size);
- printk("i_clusters: %u\n", fe->i_clusters);
- printk("i_generation: %u\n",
- le32_to_cpu(fe->i_generation));
- printk("id1.bitmap1.i_used: %u\n",
- le32_to_cpu(fe->id1.bitmap1.i_used));
- printk("id1.bitmap1.i_total: %u\n",
- le32_to_cpu(fe->id1.bitmap1.i_total));
- printk("id2.i_chain.cl_cpg: %u\n", fe->id2.i_chain.cl_cpg);
- printk("id2.i_chain.cl_bpc: %u\n", fe->id2.i_chain.cl_bpc);
- printk("id2.i_chain.cl_count: %u\n", fe->id2.i_chain.cl_count);
- printk("id2.i_chain.cl_next_free_rec: %u\n",
- fe->id2.i_chain.cl_next_free_rec);
- for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) {
- printk("fe->id2.i_chain.cl_recs[%d].c_free: %u\n", i,
- fe->id2.i_chain.cl_recs[i].c_free);
- printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
- fe->id2.i_chain.cl_recs[i].c_total);
- printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i,
- (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
- }
-}
-
/*
* For a given allocation, determine which allocators will need to be
* accessed, and lock them, reserving the appropriate number of bits.
@@ -2671,7 +2736,7 @@ int ocfs2_lock_allocators(struct inode *inode,
BUG_ON(clusters_to_add != 0 && data_ac == NULL);
- num_free_extents = ocfs2_num_free_extents(osb, et);
+ num_free_extents = ocfs2_num_free_extents(et);
if (num_free_extents < 0) {
ret = num_free_extents;
mlog_errno(ret);
@@ -2862,9 +2927,12 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
goto bail;
}
- inode_alloc_inode =
- ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
- suballoc_slot);
+ if (suballoc_slot == (u16)OCFS2_INVALID_SLOT)
+ inode_alloc_inode = ocfs2_get_system_file_inode(osb,
+ GLOBAL_INODE_ALLOC_SYSTEM_INODE, suballoc_slot);
+ else
+ inode_alloc_inode = ocfs2_get_system_file_inode(osb,
+ INODE_ALLOC_SYSTEM_INODE, suballoc_slot);
if (!inode_alloc_inode) {
/* the error code could be inaccurate, but we are not able to
* get the correct one. */
@@ -2874,10 +2942,11 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
goto bail;
}
- mutex_lock(&inode_alloc_inode->i_mutex);
+ inode_lock(inode_alloc_inode);
status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
if (status < 0) {
- mutex_unlock(&inode_alloc_inode->i_mutex);
+ inode_unlock(inode_alloc_inode);
+ iput(inode_alloc_inode);
mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
(u32)suballoc_slot, status);
goto bail;
@@ -2889,7 +2958,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
ocfs2_inode_unlock(inode_alloc_inode, 0);
- mutex_unlock(&inode_alloc_inode->i_mutex);
+ inode_unlock(inode_alloc_inode);
iput(inode_alloc_inode);
brelse(alloc_bh);
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index a36d0aa50911..bcf2ed4a8631 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -1,26 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* suballoc.h
*
* Defines sub allocator api
*
* Copyright (C) 2003, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#ifndef _CHAINALLOC_H_
@@ -45,6 +29,7 @@ struct ocfs2_alloc_context {
#define OCFS2_AC_USE_MAIN 2
#define OCFS2_AC_USE_INODE 3
#define OCFS2_AC_USE_META 4
+#define OCFS2_AC_USE_MAIN_DISCONTIG 5
u32 ac_which;
/* these are used by the chain search */
@@ -54,7 +39,7 @@ struct ocfs2_alloc_context {
u64 ac_last_group;
u64 ac_max_block; /* Highest block number to allocate. 0 is
- is the same as ~0 - unlimited */
+ the same as ~0 - unlimited */
int ac_find_loc_only; /* hack for reflink operation ordering */
struct ocfs2_suballoc_result *ac_find_loc_priv; /* */
@@ -86,6 +71,26 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
u32 bits_wanted,
struct ocfs2_alloc_context **ac);
+int ocfs2_alloc_dinode_update_counts(struct inode *inode,
+ handle_t *handle,
+ struct buffer_head *di_bh,
+ u32 num_bits,
+ u16 chain);
+void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
+ struct buffer_head *di_bh,
+ u32 num_bits,
+ u16 chain);
+u16 ocfs2_find_max_contig_free_bits(void *bitmap,
+ u16 total_bits, u16 start);
+int ocfs2_block_group_set_bits(handle_t *handle,
+ struct inode *alloc_inode,
+ struct ocfs2_group_desc *bg,
+ struct buffer_head *group_bh,
+ unsigned int bit_off,
+ unsigned int num_bits,
+ unsigned int max_contig_bits,
+ int fastpath);
+
int ocfs2_claim_metadata(handle_t *handle,
struct ocfs2_alloc_context *ac,
u32 bits_wanted,
@@ -106,7 +111,7 @@ int ocfs2_claim_clusters(handle_t *handle,
u32 *cluster_start,
u32 *num_clusters);
/*
- * Use this variant of ocfs2_claim_clusters to specify a maxiumum
+ * Use this variant of ocfs2_claim_clusters to specify a maximum
* number of clusters smaller than the allocation reserved.
*/
int __ocfs2_claim_clusters(handle_t *handle,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 854d80955bf8..2c7ba1480f7a 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1,26 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* super.c
*
* load/unload driver, mount/dismount volumes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/module.h>
@@ -35,13 +19,13 @@
#include <linux/blkdev.h>
#include <linux/socket.h>
#include <linux/inet.h>
-#include <linux/parser.h>
+#include <linux/fs_parser.h>
+#include <linux/fs_context.h>
#include <linux/crc32.h>
#include <linux/debugfs.h>
-#include <linux/mount.h>
#include <linux/seq_file.h>
#include <linux/quotaops.h>
-#include <linux/cleancache.h>
+#include <linux/signal.h>
#define CREATE_TRACE_POINTS
#include "ocfs2_trace.h"
@@ -68,50 +52,43 @@
#include "super.h"
#include "sysfile.h"
#include "uptodate.h"
-#include "ver.h"
#include "xattr.h"
#include "quota.h"
#include "refcounttree.h"
#include "suballoc.h"
#include "buffer_head_io.h"
+#include "filecheck.h"
-static struct kmem_cache *ocfs2_inode_cachep = NULL;
+static struct kmem_cache *ocfs2_inode_cachep;
struct kmem_cache *ocfs2_dquot_cachep;
struct kmem_cache *ocfs2_qf_chunk_cachep;
-/* OCFS2 needs to schedule several different types of work which
- * require cluster locking, disk I/O, recovery waits, etc. Since these
- * types of work tend to be heavy we avoid using the kernel events
- * workqueue and schedule on our own. */
-struct workqueue_struct *ocfs2_wq = NULL;
-
-static struct dentry *ocfs2_debugfs_root = NULL;
+static struct dentry *ocfs2_debugfs_root;
MODULE_AUTHOR("Oracle");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("OCFS2 cluster file system");
struct mount_options
{
unsigned long commit_interval;
unsigned long mount_opt;
unsigned int atime_quantum;
- signed short slot;
+ unsigned short slot;
int localalloc_opt;
unsigned int resv_level;
int dir_resv_level;
char cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
+ bool user_stack;
};
-static int ocfs2_parse_options(struct super_block *sb, char *options,
- struct mount_options *mopt,
- int is_remount);
+static int ocfs2_parse_param(struct fs_context *fc, struct fs_parameter *param);
static int ocfs2_check_set_options(struct super_block *sb,
struct mount_options *options);
static int ocfs2_show_options(struct seq_file *s, struct dentry *root);
static void ocfs2_put_super(struct super_block *sb);
static int ocfs2_mount_volume(struct super_block *sb);
-static int ocfs2_remount(struct super_block *sb, int *flags, char *data);
static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err);
static int ocfs2_initialize_mem_caches(void);
static void ocfs2_free_mem_caches(void);
@@ -138,36 +115,36 @@ static int ocfs2_get_sector(struct super_block *sb,
int block,
int sect_size);
static struct inode *ocfs2_alloc_inode(struct super_block *sb);
-static void ocfs2_destroy_inode(struct inode *inode);
+static void ocfs2_free_inode(struct inode *inode);
static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend);
static int ocfs2_enable_quotas(struct ocfs2_super *osb);
static void ocfs2_disable_quotas(struct ocfs2_super *osb);
+static struct dquot __rcu **ocfs2_get_dquots(struct inode *inode)
+{
+ return OCFS2_I(inode)->i_dquot;
+}
+
static const struct super_operations ocfs2_sops = {
.statfs = ocfs2_statfs,
.alloc_inode = ocfs2_alloc_inode,
- .destroy_inode = ocfs2_destroy_inode,
- .drop_inode = ocfs2_drop_inode,
+ .free_inode = ocfs2_free_inode,
+ .drop_inode = inode_just_drop,
.evict_inode = ocfs2_evict_inode,
.sync_fs = ocfs2_sync_fs,
.put_super = ocfs2_put_super,
- .remount_fs = ocfs2_remount,
.show_options = ocfs2_show_options,
.quota_read = ocfs2_quota_read,
.quota_write = ocfs2_quota_write,
+ .get_dquots = ocfs2_get_dquots,
};
enum {
Opt_barrier,
- Opt_err_panic,
- Opt_err_ro,
+ Opt_errors,
Opt_intr,
- Opt_nointr,
- Opt_hb_none,
- Opt_hb_local,
- Opt_hb_global,
- Opt_data_ordered,
- Opt_data_writeback,
+ Opt_heartbeat,
+ Opt_data,
Opt_atime_quantum,
Opt_slot,
Opt_commit,
@@ -175,48 +152,64 @@ enum {
Opt_localflocks,
Opt_stack,
Opt_user_xattr,
- Opt_nouser_xattr,
Opt_inode64,
Opt_acl,
- Opt_noacl,
Opt_usrquota,
Opt_grpquota,
- Opt_coherency_buffered,
- Opt_coherency_full,
+ Opt_coherency,
Opt_resv_level,
Opt_dir_resv_level,
- Opt_err,
+ Opt_journal_async_commit,
+};
+
+static const struct constant_table ocfs2_param_errors[] = {
+ {"panic", OCFS2_MOUNT_ERRORS_PANIC},
+ {"remount-ro", OCFS2_MOUNT_ERRORS_ROFS},
+ {"continue", OCFS2_MOUNT_ERRORS_CONT},
+ {}
+};
+
+static const struct constant_table ocfs2_param_heartbeat[] = {
+ {"local", OCFS2_MOUNT_HB_LOCAL},
+ {"none", OCFS2_MOUNT_HB_NONE},
+ {"global", OCFS2_MOUNT_HB_GLOBAL},
+ {}
+};
+
+static const struct constant_table ocfs2_param_data[] = {
+ {"writeback", OCFS2_MOUNT_DATA_WRITEBACK},
+ {"ordered", 0},
+ {}
};
-static const match_table_t tokens = {
- {Opt_barrier, "barrier=%u"},
- {Opt_err_panic, "errors=panic"},
- {Opt_err_ro, "errors=remount-ro"},
- {Opt_intr, "intr"},
- {Opt_nointr, "nointr"},
- {Opt_hb_none, OCFS2_HB_NONE},
- {Opt_hb_local, OCFS2_HB_LOCAL},
- {Opt_hb_global, OCFS2_HB_GLOBAL},
- {Opt_data_ordered, "data=ordered"},
- {Opt_data_writeback, "data=writeback"},
- {Opt_atime_quantum, "atime_quantum=%u"},
- {Opt_slot, "preferred_slot=%u"},
- {Opt_commit, "commit=%u"},
- {Opt_localalloc, "localalloc=%d"},
- {Opt_localflocks, "localflocks"},
- {Opt_stack, "cluster_stack=%s"},
- {Opt_user_xattr, "user_xattr"},
- {Opt_nouser_xattr, "nouser_xattr"},
- {Opt_inode64, "inode64"},
- {Opt_acl, "acl"},
- {Opt_noacl, "noacl"},
- {Opt_usrquota, "usrquota"},
- {Opt_grpquota, "grpquota"},
- {Opt_coherency_buffered, "coherency=buffered"},
- {Opt_coherency_full, "coherency=full"},
- {Opt_resv_level, "resv_level=%u"},
- {Opt_dir_resv_level, "dir_resv_level=%u"},
- {Opt_err, NULL}
+static const struct constant_table ocfs2_param_coherency[] = {
+ {"buffered", OCFS2_MOUNT_COHERENCY_BUFFERED},
+ {"full", 0},
+ {}
+};
+
+static const struct fs_parameter_spec ocfs2_param_spec[] = {
+ fsparam_u32 ("barrier", Opt_barrier),
+ fsparam_enum ("errors", Opt_errors, ocfs2_param_errors),
+ fsparam_flag_no ("intr", Opt_intr),
+ fsparam_enum ("heartbeat", Opt_heartbeat, ocfs2_param_heartbeat),
+ fsparam_enum ("data", Opt_data, ocfs2_param_data),
+ fsparam_u32 ("atime_quantum", Opt_atime_quantum),
+ fsparam_u32 ("preferred_slot", Opt_slot),
+ fsparam_u32 ("commit", Opt_commit),
+ fsparam_s32 ("localalloc", Opt_localalloc),
+ fsparam_flag ("localflocks", Opt_localflocks),
+ fsparam_string ("cluster_stack", Opt_stack),
+ fsparam_flag_no ("user_xattr", Opt_user_xattr),
+ fsparam_flag ("inode64", Opt_inode64),
+ fsparam_flag_no ("acl", Opt_acl),
+ fsparam_flag ("usrquota", Opt_usrquota),
+ fsparam_flag ("grpquota", Opt_grpquota),
+ fsparam_enum ("coherency", Opt_coherency, ocfs2_param_coherency),
+ fsparam_u32 ("resv_level", Opt_resv_level),
+ fsparam_u32 ("dir_resv_level", Opt_dir_resv_level),
+ fsparam_flag ("journal_async_commit", Opt_journal_async_commit),
+ {}
};
#ifdef CONFIG_DEBUG_FS
@@ -226,32 +219,33 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
struct ocfs2_recovery_map *rm = osb->recovery_map;
struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan;
int i, out = 0;
+ unsigned long flags;
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n",
"Device", osb->dev_str, osb->uuid_str,
osb->fs_generation, osb->vol_label);
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"%10s => State: %d Flags: 0x%lX\n", "Volume",
atomic_read(&osb->vol_state), osb->osb_flags);
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"%10s => Block: %lu Cluster: %d\n", "Sizes",
osb->sb->s_blocksize, osb->s_clustersize);
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"%10s => Compat: 0x%X Incompat: 0x%X "
"ROcompat: 0x%X\n",
"Features", osb->s_feature_compat,
osb->s_feature_incompat, osb->s_feature_ro_compat);
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"%10s => Opts: 0x%lX AtimeQuanta: %u\n", "Mount",
osb->s_mount_opt, osb->s_atime_quantum);
if (cconn) {
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"%10s => Stack: %s Name: %*s "
"Version: %d.%d\n", "Cluster",
(*osb->osb_cluster_stack == '\0' ?
@@ -261,42 +255,42 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
cconn->cc_version.pv_minor);
}
- spin_lock(&osb->dc_task_lock);
- out += snprintf(buf + out, len - out,
+ spin_lock_irqsave(&osb->dc_task_lock, flags);
+ out += scnprintf(buf + out, len - out,
"%10s => Pid: %d Count: %lu WakeSeq: %lu "
"WorkSeq: %lu\n", "DownCnvt",
(osb->dc_task ? task_pid_nr(osb->dc_task) : -1),
osb->blocked_lock_count, osb->dc_wake_sequence,
osb->dc_work_sequence);
- spin_unlock(&osb->dc_task_lock);
+ spin_unlock_irqrestore(&osb->dc_task_lock, flags);
spin_lock(&osb->osb_lock);
- out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:",
+ out += scnprintf(buf + out, len - out, "%10s => Pid: %d Nodes:",
"Recovery",
(osb->recovery_thread_task ?
task_pid_nr(osb->recovery_thread_task) : -1));
if (rm->rm_used == 0)
- out += snprintf(buf + out, len - out, " None\n");
+ out += scnprintf(buf + out, len - out, " None\n");
else {
for (i = 0; i < rm->rm_used; i++)
- out += snprintf(buf + out, len - out, " %d",
+ out += scnprintf(buf + out, len - out, " %d",
rm->rm_entries[i]);
- out += snprintf(buf + out, len - out, "\n");
+ out += scnprintf(buf + out, len - out, "\n");
}
spin_unlock(&osb->osb_lock);
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"%10s => Pid: %d Interval: %lu\n", "Commit",
(osb->commit_task ? task_pid_nr(osb->commit_task) : -1),
osb->osb_commit_interval);
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"%10s => State: %d TxnId: %lu NumTxns: %d\n",
"Journal", osb->journal->j_state,
osb->journal->j_trans_id,
atomic_read(&osb->journal->j_num_trans));
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"%10s => GlobalAllocs: %d LocalAllocs: %d "
"SubAllocs: %d LAWinMoves: %d SAExtends: %d\n",
"Stats",
@@ -306,7 +300,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
atomic_read(&osb->alloc_stats.moves),
atomic_read(&osb->alloc_stats.bg_extends));
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"%10s => State: %u Descriptor: %llu Size: %u bits "
"Default: %u bits\n",
"LocalAlloc", osb->local_alloc_state,
@@ -314,7 +308,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
osb->local_alloc_bits, osb->local_alloc_default_bits);
spin_lock(&osb->osb_lock);
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"%10s => InodeSlot: %d StolenInodes: %d, "
"MetaSlot: %d StolenMeta: %d\n", "Steal",
osb->s_inode_steal_slot,
@@ -323,20 +317,20 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
atomic_read(&osb->s_num_meta_stolen));
spin_unlock(&osb->osb_lock);
- out += snprintf(buf + out, len - out, "OrphanScan => ");
- out += snprintf(buf + out, len - out, "Local: %u Global: %u ",
+ out += scnprintf(buf + out, len - out, "OrphanScan => ");
+ out += scnprintf(buf + out, len - out, "Local: %u Global: %u ",
os->os_count, os->os_seqno);
- out += snprintf(buf + out, len - out, " Last Scan: ");
+ out += scnprintf(buf + out, len - out, " Last Scan: ");
if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
- out += snprintf(buf + out, len - out, "Disabled\n");
+ out += scnprintf(buf + out, len - out, "Disabled\n");
else
- out += snprintf(buf + out, len - out, "%lu seconds ago\n",
- (get_seconds() - os->os_scantime.tv_sec));
+ out += scnprintf(buf + out, len - out, "%lu seconds ago\n",
+ (unsigned long)(ktime_get_seconds() - os->os_scantime));
- out += snprintf(buf + out, len - out, "%10s => %3s %10s\n",
+ out += scnprintf(buf + out, len - out, "%10s => %3s %10s\n",
"Slots", "Num", "RecoGen");
for (i = 0; i < osb->max_slots; ++i) {
- out += snprintf(buf + out, len - out,
+ out += scnprintf(buf + out, len - out,
"%10s %c %3d %10d\n",
" ",
(i == osb->slot_num ? '*' : ' '),
@@ -416,10 +410,10 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait)
ocfs2_schedule_truncate_log_flush(osb, 0);
}
- if (jbd2_journal_start_commit(OCFS2_SB(sb)->journal->j_journal,
+ if (jbd2_journal_start_commit(osb->journal->j_journal,
&target)) {
if (wait)
- jbd2_log_wait_commit(OCFS2_SB(sb)->journal->j_journal,
+ jbd2_log_wait_commit(osb->journal->j_journal,
target);
}
return 0;
@@ -467,9 +461,8 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
if (!new) {
ocfs2_release_system_inodes(osb);
- status = -EINVAL;
+ status = ocfs2_is_soft_readonly(osb) ? -EROFS : -EINVAL;
mlog_errno(status);
- /* FIXME: Should ERROR_RO_FS */
mlog(ML_ERROR, "Unable to load system inode %d, "
"possibly corrupt fs?", i);
goto bail;
@@ -498,7 +491,7 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
if (!new) {
ocfs2_release_system_inodes(osb);
- status = -EINVAL;
+ status = ocfs2_is_soft_readonly(osb) ? -EROFS : -EINVAL;
mlog(ML_ERROR, "status=%d, sysfile=%d, slot=%d\n",
status, i, osb->slot_num);
goto bail;
@@ -557,25 +550,23 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb)
{
struct ocfs2_inode_info *oi;
- oi = kmem_cache_alloc(ocfs2_inode_cachep, GFP_NOFS);
+ oi = alloc_inode_sb(sb, ocfs2_inode_cachep, GFP_NOFS);
if (!oi)
return NULL;
+ oi->i_sync_tid = 0;
+ oi->i_datasync_tid = 0;
+ memset(&oi->i_dquot, 0, sizeof(oi->i_dquot));
+
jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode);
return &oi->vfs_inode;
}
-static void ocfs2_i_callback(struct rcu_head *head)
+static void ocfs2_free_inode(struct inode *inode)
{
- struct inode *inode = container_of(head, struct inode, i_rcu);
kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode));
}
-static void ocfs2_destroy_inode(struct inode *inode)
-{
- call_rcu(&inode->i_rcu, ocfs2_i_callback);
-}
-
static unsigned long long ocfs2_max_file_offset(unsigned int bbits,
unsigned int cbits)
{
@@ -590,13 +581,12 @@ static unsigned long long ocfs2_max_file_offset(unsigned int bbits,
*/
#if BITS_PER_LONG == 32
-# if defined(CONFIG_LBDAF)
BUILD_BUG_ON(sizeof(sector_t) != 8);
/*
* We might be limited by page cache size.
*/
- if (bytes > PAGE_CACHE_SIZE) {
- bytes = PAGE_CACHE_SIZE;
+ if (bytes > PAGE_SIZE) {
+ bytes = PAGE_SIZE;
trim = 1;
/*
* Shift by 31 here so that we don't get larger than
@@ -604,15 +594,6 @@ static unsigned long long ocfs2_max_file_offset(unsigned int bbits,
*/
bitshift = 31;
}
-# else
- /*
- * We are limited by the size of sector_t. Use block size, as
- * that's what we expose to the VFS.
- */
- bytes = 1 << bbits;
- trim = 1;
- bitshift = 31;
-# endif
#endif
/*
@@ -623,30 +604,32 @@ static unsigned long long ocfs2_max_file_offset(unsigned int bbits,
return (((unsigned long long)bytes) << bitshift) - trim;
}
-static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
+static int ocfs2_reconfigure(struct fs_context *fc)
{
int incompat_features;
int ret = 0;
- struct mount_options parsed_options;
+ struct mount_options *parsed_options = fc->fs_private;
+ struct super_block *sb = fc->root->d_sb;
struct ocfs2_super *osb = OCFS2_SB(sb);
u32 tmp;
- if (!ocfs2_parse_options(sb, data, &parsed_options, 1) ||
- !ocfs2_check_set_options(sb, &parsed_options)) {
+ sync_filesystem(sb);
+
+ if (!ocfs2_check_set_options(sb, parsed_options)) {
ret = -EINVAL;
goto out;
}
tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
OCFS2_MOUNT_HB_NONE;
- if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) {
+ if ((osb->s_mount_opt & tmp) != (parsed_options->mount_opt & tmp)) {
ret = -EINVAL;
mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n");
goto out;
}
if ((osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) !=
- (parsed_options.mount_opt & OCFS2_MOUNT_DATA_WRITEBACK)) {
+ (parsed_options->mount_opt & OCFS2_MOUNT_DATA_WRITEBACK)) {
ret = -EINVAL;
mlog(ML_ERROR, "Cannot change data mode on remount\n");
goto out;
@@ -655,16 +638,16 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
/* Probably don't want this on remount; it might
* mess with other nodes */
if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64) &&
- (parsed_options.mount_opt & OCFS2_MOUNT_INODE64)) {
+ (parsed_options->mount_opt & OCFS2_MOUNT_INODE64)) {
ret = -EINVAL;
mlog(ML_ERROR, "Cannot enable inode64 on remount\n");
goto out;
}
/* We're going to/from readonly mode. */
- if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
+ if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) {
/* Disable quota accounting before remounting RO */
- if (*flags & MS_RDONLY) {
+ if (fc->sb_flags & SB_RDONLY) {
ret = ocfs2_susp_quotas(osb, 0);
if (ret < 0)
goto out;
@@ -678,8 +661,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
goto unlock_osb;
}
- if (*flags & MS_RDONLY) {
- sb->s_flags |= MS_RDONLY;
+ if (fc->sb_flags & SB_RDONLY) {
+ sb->s_flags |= SB_RDONLY;
osb->osb_flags |= OCFS2_OSB_SOFT_RO;
} else {
if (osb->osb_flags & OCFS2_OSB_ERROR_FS) {
@@ -696,14 +679,14 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
ret = -EINVAL;
goto unlock_osb;
}
- sb->s_flags &= ~MS_RDONLY;
+ sb->s_flags &= ~SB_RDONLY;
osb->osb_flags &= ~OCFS2_OSB_SOFT_RO;
}
- trace_ocfs2_remount(sb->s_flags, osb->osb_flags, *flags);
+ trace_ocfs2_remount(sb->s_flags, osb->osb_flags, fc->sb_flags);
unlock_osb:
spin_unlock(&osb->osb_lock);
/* Enable quota accounting after remounting RW */
- if (!ret && !(*flags & MS_RDONLY)) {
+ if (!ret && !(fc->sb_flags & SB_RDONLY)) {
if (sb_any_quota_suspended(sb))
ret = ocfs2_susp_quotas(osb, 1);
else
@@ -711,7 +694,7 @@ unlock_osb:
if (ret < 0) {
/* Return back changes... */
spin_lock(&osb->osb_lock);
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
osb->osb_flags |= OCFS2_OSB_SOFT_RO;
spin_unlock(&osb->osb_lock);
goto out;
@@ -722,18 +705,18 @@ unlock_osb:
if (!ret) {
/* Only save off the new mount options in case of a successful
* remount. */
- osb->s_mount_opt = parsed_options.mount_opt;
- osb->s_atime_quantum = parsed_options.atime_quantum;
- osb->preferred_slot = parsed_options.slot;
- if (parsed_options.commit_interval)
- osb->osb_commit_interval = parsed_options.commit_interval;
+ osb->s_mount_opt = parsed_options->mount_opt;
+ osb->s_atime_quantum = parsed_options->atime_quantum;
+ osb->preferred_slot = parsed_options->slot;
+ if (parsed_options->commit_interval)
+ osb->osb_commit_interval = parsed_options->commit_interval;
if (!ocfs2_is_hard_readonly(osb))
ocfs2_set_journal_params(osb);
- sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+ sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ?
- MS_POSIXACL : 0);
+ SB_POSIXACL : 0);
}
out:
return ret;
@@ -894,11 +877,12 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
{
int type;
struct super_block *sb = osb->sb;
- unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
- OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+ unsigned int feature[OCFS2_MAXQUOTAS] = {
+ OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
int status = 0;
- for (type = 0; type < MAXQUOTAS; type++) {
+ for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
continue;
if (unsuspend)
@@ -922,17 +906,19 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
static int ocfs2_enable_quotas(struct ocfs2_super *osb)
{
- struct inode *inode[MAXQUOTAS] = { NULL, NULL };
+ struct inode *inode[OCFS2_MAXQUOTAS] = { NULL, NULL };
struct super_block *sb = osb->sb;
- unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
- OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
- unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
+ unsigned int feature[OCFS2_MAXQUOTAS] = {
+ OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+ unsigned int ino[OCFS2_MAXQUOTAS] = {
+ LOCAL_USER_QUOTA_SYSTEM_INODE,
LOCAL_GROUP_QUOTA_SYSTEM_INODE };
int status;
int type;
sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE;
- for (type = 0; type < MAXQUOTAS; type++) {
+ for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
continue;
inode[type] = ocfs2_get_system_file_inode(osb, ino[type],
@@ -941,18 +927,18 @@ static int ocfs2_enable_quotas(struct ocfs2_super *osb)
status = -ENOENT;
goto out_quota_off;
}
- status = dquot_enable(inode[type], type, QFMT_OCFS2,
- DQUOT_USAGE_ENABLED);
+ status = dquot_load_quota_inode(inode[type], type, QFMT_OCFS2,
+ DQUOT_USAGE_ENABLED);
if (status < 0)
goto out_quota_off;
}
- for (type = 0; type < MAXQUOTAS; type++)
+ for (type = 0; type < OCFS2_MAXQUOTAS; type++)
iput(inode[type]);
return 0;
out_quota_off:
ocfs2_disable_quotas(osb);
- for (type = 0; type < MAXQUOTAS; type++)
+ for (type = 0; type < OCFS2_MAXQUOTAS; type++)
iput(inode[type]);
mlog_errno(status);
return status;
@@ -967,121 +953,85 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
/* We mostly ignore errors in this function because there's not much
* we can do when we see them */
- for (type = 0; type < MAXQUOTAS; type++) {
+ for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
if (!sb_has_quota_loaded(sb, type))
continue;
- /* Cancel periodic syncing before we grab dqonoff_mutex */
- oinfo = sb_dqinfo(sb, type)->dqi_priv;
- cancel_delayed_work_sync(&oinfo->dqi_sync_work);
+ if (!sb_has_quota_suspended(sb, type)) {
+ oinfo = sb_dqinfo(sb, type)->dqi_priv;
+ cancel_delayed_work_sync(&oinfo->dqi_sync_work);
+ }
inode = igrab(sb->s_dquot.files[type]);
/* Turn off quotas. This will remove all dquot structures from
* memory and so they will be automatically synced to global
* quota files */
dquot_disable(sb, type, DQUOT_USAGE_ENABLED |
DQUOT_LIMITS_ENABLED);
- if (!inode)
- continue;
iput(inode);
}
}
-/* Handle quota on quotactl */
-static int ocfs2_quota_on(struct super_block *sb, int type, int format_id)
-{
- unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
- OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
-
- if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
- return -EINVAL;
-
- return dquot_enable(sb_dqopt(sb)->files[type], type,
- format_id, DQUOT_LIMITS_ENABLED);
-}
-
-/* Handle quota off quotactl */
-static int ocfs2_quota_off(struct super_block *sb, int type)
-{
- return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
-}
-
-static const struct quotactl_ops ocfs2_quotactl_ops = {
- .quota_on_meta = ocfs2_quota_on,
- .quota_off = ocfs2_quota_off,
- .quota_sync = dquot_quota_sync,
- .get_info = dquot_get_dqinfo,
- .set_info = dquot_set_dqinfo,
- .get_dqblk = dquot_get_dqblk,
- .set_dqblk = dquot_set_dqblk,
-};
-
-static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
+static int ocfs2_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct dentry *root;
int status, sector_size;
- struct mount_options parsed_options;
+ struct mount_options *parsed_options = fc->fs_private;
struct inode *inode = NULL;
struct ocfs2_super *osb = NULL;
struct buffer_head *bh = NULL;
- char nodestr[8];
+ char nodestr[12];
struct ocfs2_blockcheck_stats stats;
- trace_ocfs2_fill_super(sb, data, silent);
-
- if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) {
- status = -EINVAL;
- goto read_super_error;
- }
+ trace_ocfs2_fill_super(sb, fc, fc->sb_flags & SB_SILENT);
/* probe for superblock */
status = ocfs2_sb_probe(sb, &bh, &sector_size, &stats);
if (status < 0) {
mlog(ML_ERROR, "superblock probe failed!\n");
- goto read_super_error;
+ goto out;
}
status = ocfs2_initialize_super(sb, bh, sector_size, &stats);
- osb = OCFS2_SB(sb);
- if (status < 0) {
- mlog_errno(status);
- goto read_super_error;
- }
brelse(bh);
bh = NULL;
+ if (status < 0)
+ goto out;
+
+ osb = OCFS2_SB(sb);
- if (!ocfs2_check_set_options(sb, &parsed_options)) {
+ if (!ocfs2_check_set_options(sb, parsed_options)) {
status = -EINVAL;
- goto read_super_error;
- }
- osb->s_mount_opt = parsed_options.mount_opt;
- osb->s_atime_quantum = parsed_options.atime_quantum;
- osb->preferred_slot = parsed_options.slot;
- osb->osb_commit_interval = parsed_options.commit_interval;
-
- ocfs2_la_set_sizes(osb, parsed_options.localalloc_opt);
- osb->osb_resv_level = parsed_options.resv_level;
- osb->osb_dir_resv_level = parsed_options.resv_level;
- if (parsed_options.dir_resv_level == -1)
- osb->osb_dir_resv_level = parsed_options.resv_level;
+ goto out_super;
+ }
+ osb->s_mount_opt = parsed_options->mount_opt;
+ osb->s_atime_quantum = parsed_options->atime_quantum;
+ osb->preferred_slot = parsed_options->slot;
+ osb->osb_commit_interval = parsed_options->commit_interval;
+
+ ocfs2_la_set_sizes(osb, parsed_options->localalloc_opt);
+ osb->osb_resv_level = parsed_options->resv_level;
+ osb->osb_dir_resv_level = parsed_options->resv_level;
+ if (parsed_options->dir_resv_level == -1)
+ osb->osb_dir_resv_level = parsed_options->resv_level;
else
- osb->osb_dir_resv_level = parsed_options.dir_resv_level;
+ osb->osb_dir_resv_level = parsed_options->dir_resv_level;
- status = ocfs2_verify_userspace_stack(osb, &parsed_options);
+ status = ocfs2_verify_userspace_stack(osb, parsed_options);
if (status)
- goto read_super_error;
+ goto out_super;
sb->s_magic = OCFS2_SUPER_MAGIC;
- sb->s_flags = (sb->s_flags & ~(MS_POSIXACL | MS_NOSEC)) |
- ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+ sb->s_flags = (sb->s_flags & ~(SB_POSIXACL | SB_NOSEC)) |
+ ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? SB_POSIXACL : 0);
- /* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
+ /* Hard readonly mode only if: bdev_read_only, SB_RDONLY,
* heartbeat=none */
if (bdev_read_only(sb->s_bdev)) {
- if (!(sb->s_flags & MS_RDONLY)) {
+ if (!sb_rdonly(sb)) {
status = -EACCES;
mlog(ML_ERROR, "Readonly device detected but readonly "
"mount was not specified.\n");
- goto read_super_error;
+ goto out_super;
}
/* You should not be able to start a local heartbeat
@@ -1090,7 +1040,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
status = -EROFS;
mlog(ML_ERROR, "Local heartbeat specified on readonly "
"device.\n");
- goto read_super_error;
+ goto out_super;
}
status = ocfs2_check_journals_nolocks(osb);
@@ -1099,9 +1049,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
mlog(ML_ERROR, "Recovery required on readonly "
"file system, but write access is "
"unavailable.\n");
- else
- mlog_errno(status);
- goto read_super_error;
+ goto out_super;
}
ocfs2_set_ro_flag(osb, 1);
@@ -1112,64 +1060,59 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
}
if (!ocfs2_is_hard_readonly(osb)) {
- if (sb->s_flags & MS_RDONLY)
+ if (sb_rdonly(sb))
ocfs2_set_ro_flag(osb, 0);
}
status = ocfs2_verify_heartbeat(osb);
- if (status < 0) {
- mlog_errno(status);
- goto read_super_error;
- }
+ if (status < 0)
+ goto out_super;
osb->osb_debug_root = debugfs_create_dir(osb->uuid_str,
ocfs2_debugfs_root);
- if (!osb->osb_debug_root) {
- status = -EINVAL;
- mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n");
- goto read_super_error;
- }
- osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR,
- osb->osb_debug_root,
- osb,
- &ocfs2_osb_debug_fops);
- if (!osb->osb_ctxt) {
- status = -EINVAL;
- mlog_errno(status);
- goto read_super_error;
- }
+ debugfs_create_file("fs_state", S_IFREG|S_IRUSR, osb->osb_debug_root,
+ osb, &ocfs2_osb_debug_fops);
if (ocfs2_meta_ecc(osb)) {
- status = ocfs2_blockcheck_stats_debugfs_install(
- &osb->osb_ecc_stats,
- osb->osb_debug_root);
- if (status) {
- mlog(ML_ERROR,
- "Unable to create blockcheck statistics "
- "files\n");
- goto read_super_error;
- }
+ ocfs2_initialize_journal_triggers(sb, osb->s_journal_triggers);
+ ocfs2_blockcheck_stats_debugfs_install( &osb->osb_ecc_stats,
+ osb->osb_debug_root);
}
status = ocfs2_mount_volume(sb);
if (status < 0)
- goto read_super_error;
+ goto out_debugfs;
if (osb->root_inode)
inode = igrab(osb->root_inode);
if (!inode) {
status = -EIO;
- mlog_errno(status);
- goto read_super_error;
+ goto out_dismount;
+ }
+
+ osb->osb_dev_kset = kset_create_and_add(sb->s_id, NULL,
+ &ocfs2_kset->kobj);
+ if (!osb->osb_dev_kset) {
+ status = -ENOMEM;
+ mlog(ML_ERROR, "Unable to create device kset %s.\n", sb->s_id);
+ goto out_dismount;
+ }
+
+ /* Create filecheck sysfs related directories/files at
+ * /sys/fs/ocfs2/<devname>/filecheck */
+ if (ocfs2_filecheck_create_sysfs(osb)) {
+ status = -ENOMEM;
+ mlog(ML_ERROR, "Unable to create filecheck sysfs directory at "
+ "/sys/fs/ocfs2/%s/filecheck.\n", sb->s_id);
+ goto out_dismount;
}
root = d_make_root(inode);
if (!root) {
status = -ENOMEM;
- mlog_errno(status);
- goto read_super_error;
+ goto out_dismount;
}
sb->s_root = root;
@@ -1193,7 +1136,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
/* Now we can initialize quotas because we can afford to wait
* for cluster locks recovery now. That also means that truncation
* log recovery can happen but that waits for proper quota setup */
- if (!(sb->s_flags & MS_RDONLY)) {
+ if (!sb_rdonly(sb)) {
status = ocfs2_enable_quotas(osb);
if (status < 0) {
/* We have to err-out specially here because
@@ -1216,60 +1159,92 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
return status;
-read_super_error:
- brelse(bh);
+out_dismount:
+ atomic_set(&osb->vol_state, VOLUME_DISABLED);
+ wake_up(&osb->osb_mount_event);
+ ocfs2_free_replay_slots(osb);
+ ocfs2_dismount_volume(sb, 1);
+ goto out;
- if (osb) {
- atomic_set(&osb->vol_state, VOLUME_DISABLED);
- wake_up(&osb->osb_mount_event);
- ocfs2_dismount_volume(sb, 1);
- }
+out_debugfs:
+ debugfs_remove_recursive(osb->osb_debug_root);
+out_super:
+ ocfs2_release_system_inodes(osb);
+ kfree(osb->recovery_map);
+ ocfs2_delete_osb(osb);
+ kfree(osb);
+out:
+ mlog_errno(status);
- if (status)
- mlog_errno(status);
return status;
}
-static struct dentry *ocfs2_mount(struct file_system_type *fs_type,
- int flags,
- const char *dev_name,
- void *data)
+static int ocfs2_get_tree(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super);
+ return get_tree_bdev(fc, ocfs2_fill_super);
}
-static void ocfs2_kill_sb(struct super_block *sb)
+static void ocfs2_free_fc(struct fs_context *fc)
{
- struct ocfs2_super *osb = OCFS2_SB(sb);
+ kfree(fc->fs_private);
+}
- /* Failed mount? */
- if (!osb || atomic_read(&osb->vol_state) == VOLUME_DISABLED)
- goto out;
+static const struct fs_context_operations ocfs2_context_ops = {
+ .parse_param = ocfs2_parse_param,
+ .get_tree = ocfs2_get_tree,
+ .reconfigure = ocfs2_reconfigure,
+ .free = ocfs2_free_fc,
+};
- /* Prevent further queueing of inode drop events */
- spin_lock(&dentry_list_lock);
- ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED);
- spin_unlock(&dentry_list_lock);
- /* Wait for work to finish and/or remove it */
- cancel_work_sync(&osb->dentry_lock_work);
-out:
- kill_block_super(sb);
+static int ocfs2_init_fs_context(struct fs_context *fc)
+{
+ struct mount_options *mopt;
+
+ mopt = kzalloc(sizeof(struct mount_options), GFP_KERNEL);
+ if (!mopt)
+ return -EINVAL;
+
+ mopt->commit_interval = 0;
+ mopt->mount_opt = OCFS2_MOUNT_NOINTR;
+ mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
+ mopt->slot = OCFS2_INVALID_SLOT;
+ mopt->localalloc_opt = -1;
+ mopt->cluster_stack[0] = '\0';
+ mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL;
+ mopt->dir_resv_level = -1;
+
+ fc->fs_private = mopt;
+ fc->ops = &ocfs2_context_ops;
+
+ return 0;
}
static struct file_system_type ocfs2_fs_type = {
.owner = THIS_MODULE,
.name = "ocfs2",
- .mount = ocfs2_mount,
- .kill_sb = ocfs2_kill_sb,
-
+ .kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE,
- .next = NULL
+ .next = NULL,
+ .init_fs_context = ocfs2_init_fs_context,
+ .parameters = ocfs2_param_spec,
};
MODULE_ALIAS_FS("ocfs2");
static int ocfs2_check_set_options(struct super_block *sb,
struct mount_options *options)
{
+ if (options->user_stack == 0) {
+ u32 tmp;
+
+ /* Ensure only one heartbeat mode */
+ tmp = options->mount_opt & (OCFS2_MOUNT_HB_LOCAL |
+ OCFS2_MOUNT_HB_GLOBAL |
+ OCFS2_MOUNT_HB_NONE);
+ if (hweight32(tmp) != 1) {
+ mlog(ML_ERROR, "Invalid heartbeat mount options\n");
+ return 0;
+ }
+ }
if (options->mount_opt & OCFS2_MOUNT_USRQUOTA &&
!OCFS2_HAS_RO_COMPAT_FEATURE(sb,
OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
@@ -1301,233 +1276,142 @@ static int ocfs2_check_set_options(struct super_block *sb,
return 1;
}
-static int ocfs2_parse_options(struct super_block *sb,
- char *options,
- struct mount_options *mopt,
- int is_remount)
+static int ocfs2_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- int status, user_stack = 0;
- char *p;
- u32 tmp;
-
- trace_ocfs2_parse_options(is_remount, options ? options : "(none)");
-
- mopt->commit_interval = 0;
- mopt->mount_opt = OCFS2_MOUNT_NOINTR;
- mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
- mopt->slot = OCFS2_INVALID_SLOT;
- mopt->localalloc_opt = -1;
- mopt->cluster_stack[0] = '\0';
- mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL;
- mopt->dir_resv_level = -1;
-
- if (!options) {
- status = 1;
- goto bail;
- }
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token, option;
- substring_t args[MAX_OPT_ARGS];
-
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_hb_local:
- mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL;
- break;
- case Opt_hb_none:
- mopt->mount_opt |= OCFS2_MOUNT_HB_NONE;
- break;
- case Opt_hb_global:
- mopt->mount_opt |= OCFS2_MOUNT_HB_GLOBAL;
- break;
- case Opt_barrier:
- if (match_int(&args[0], &option)) {
- status = 0;
- goto bail;
- }
- if (option)
- mopt->mount_opt |= OCFS2_MOUNT_BARRIER;
- else
- mopt->mount_opt &= ~OCFS2_MOUNT_BARRIER;
- break;
- case Opt_intr:
- mopt->mount_opt &= ~OCFS2_MOUNT_NOINTR;
- break;
- case Opt_nointr:
+ struct fs_parse_result result;
+ int opt;
+ struct mount_options *mopt = fc->fs_private;
+ bool is_remount = (fc->purpose & FS_CONTEXT_FOR_RECONFIGURE);
+
+ trace_ocfs2_parse_options(is_remount, param->key);
+
+ opt = fs_parse(fc, ocfs2_param_spec, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_heartbeat:
+ mopt->mount_opt |= result.uint_32;
+ break;
+ case Opt_barrier:
+ if (result.uint_32)
+ mopt->mount_opt |= OCFS2_MOUNT_BARRIER;
+ else
+ mopt->mount_opt &= ~OCFS2_MOUNT_BARRIER;
+ break;
+ case Opt_intr:
+ if (result.negated)
mopt->mount_opt |= OCFS2_MOUNT_NOINTR;
- break;
- case Opt_err_panic:
- mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
- break;
- case Opt_err_ro:
- mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC;
- break;
- case Opt_data_ordered:
- mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK;
- break;
- case Opt_data_writeback:
- mopt->mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK;
- break;
- case Opt_user_xattr:
- mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR;
- break;
- case Opt_nouser_xattr:
+ else
+ mopt->mount_opt &= ~OCFS2_MOUNT_NOINTR;
+ break;
+ case Opt_errors:
+ mopt->mount_opt &= ~(OCFS2_MOUNT_ERRORS_CONT |
+ OCFS2_MOUNT_ERRORS_ROFS |
+ OCFS2_MOUNT_ERRORS_PANIC);
+ mopt->mount_opt |= result.uint_32;
+ break;
+ case Opt_data:
+ mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK;
+ mopt->mount_opt |= result.uint_32;
+ break;
+ case Opt_user_xattr:
+ if (result.negated)
mopt->mount_opt |= OCFS2_MOUNT_NOUSERXATTR;
- break;
- case Opt_atime_quantum:
- if (match_int(&args[0], &option)) {
- status = 0;
- goto bail;
- }
- if (option >= 0)
- mopt->atime_quantum = option;
- break;
- case Opt_slot:
- option = 0;
- if (match_int(&args[0], &option)) {
- status = 0;
- goto bail;
- }
- if (option)
- mopt->slot = (s16)option;
- break;
- case Opt_commit:
- option = 0;
- if (match_int(&args[0], &option)) {
- status = 0;
- goto bail;
- }
- if (option < 0)
- return 0;
- if (option == 0)
- option = JBD2_DEFAULT_MAX_COMMIT_AGE;
- mopt->commit_interval = HZ * option;
- break;
- case Opt_localalloc:
- option = 0;
- if (match_int(&args[0], &option)) {
- status = 0;
- goto bail;
- }
- if (option >= 0)
- mopt->localalloc_opt = option;
- break;
- case Opt_localflocks:
- /*
- * Changing this during remount could race
- * flock() requests, or "unbalance" existing
- * ones (e.g., a lock is taken in one mode but
- * dropped in the other). If users care enough
- * to flip locking modes during remount, we
- * could add a "local" flag to individual
- * flock structures for proper tracking of
- * state.
- */
- if (!is_remount)
- mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS;
- break;
- case Opt_stack:
- /* Check both that the option we were passed
- * is of the right length and that it is a proper
- * string of the right length.
- */
- if (((args[0].to - args[0].from) !=
- OCFS2_STACK_LABEL_LEN) ||
- (strnlen(args[0].from,
- OCFS2_STACK_LABEL_LEN) !=
- OCFS2_STACK_LABEL_LEN)) {
- mlog(ML_ERROR,
- "Invalid cluster_stack option\n");
- status = 0;
- goto bail;
- }
- memcpy(mopt->cluster_stack, args[0].from,
- OCFS2_STACK_LABEL_LEN);
- mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
- /*
- * Open code the memcmp here as we don't have
- * an osb to pass to
- * ocfs2_userspace_stack().
- */
- if (memcmp(mopt->cluster_stack,
- OCFS2_CLASSIC_CLUSTER_STACK,
- OCFS2_STACK_LABEL_LEN))
- user_stack = 1;
- break;
- case Opt_inode64:
- mopt->mount_opt |= OCFS2_MOUNT_INODE64;
- break;
- case Opt_usrquota:
- mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA;
- break;
- case Opt_grpquota:
- mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
- break;
- case Opt_coherency_buffered:
- mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED;
- break;
- case Opt_coherency_full:
- mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED;
- break;
- case Opt_acl:
- mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
- mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL;
- break;
- case Opt_noacl:
+ else
+ mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR;
+ break;
+ case Opt_atime_quantum:
+ mopt->atime_quantum = result.uint_32;
+ break;
+ case Opt_slot:
+ if (result.uint_32)
+ mopt->slot = (u16)result.uint_32;
+ break;
+ case Opt_commit:
+ if (result.uint_32 == 0)
+ mopt->commit_interval = HZ * JBD2_DEFAULT_MAX_COMMIT_AGE;
+ else
+ mopt->commit_interval = HZ * result.uint_32;
+ break;
+ case Opt_localalloc:
+ if (result.int_32 >= 0)
+ mopt->localalloc_opt = result.int_32;
+ break;
+ case Opt_localflocks:
+ /*
+ * Changing this during remount could race flock() requests, or
+ * "unbalance" existing ones (e.g., a lock is taken in one mode
+ * but dropped in the other). If users care enough to flip
+ * locking modes during remount, we could add a "local" flag to
+ * individual flock structures for proper tracking of state.
+ */
+ if (!is_remount)
+ mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS;
+ break;
+ case Opt_stack:
+ /* Check both that the option we were passed is of the right
+ * length and that it is a proper string of the right length.
+ */
+ if (strlen(param->string) != OCFS2_STACK_LABEL_LEN) {
+ mlog(ML_ERROR, "Invalid cluster_stack option\n");
+ return -EINVAL;
+ }
+ memcpy(mopt->cluster_stack, param->string, OCFS2_STACK_LABEL_LEN);
+ mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
+ /*
+ * Open code the memcmp here as we don't have an osb to pass
+ * to ocfs2_userspace_stack().
+ */
+ if (memcmp(mopt->cluster_stack,
+ OCFS2_CLASSIC_CLUSTER_STACK,
+ OCFS2_STACK_LABEL_LEN))
+ mopt->user_stack = 1;
+ break;
+ case Opt_inode64:
+ mopt->mount_opt |= OCFS2_MOUNT_INODE64;
+ break;
+ case Opt_usrquota:
+ mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA;
+ break;
+ case Opt_grpquota:
+ mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
+ break;
+ case Opt_coherency:
+ mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED;
+ mopt->mount_opt |= result.uint_32;
+ break;
+ case Opt_acl:
+ if (result.negated) {
mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL;
mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
+ } else {
+ mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
+ mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL;
+ }
+ break;
+ case Opt_resv_level:
+ if (is_remount)
break;
- case Opt_resv_level:
- if (is_remount)
- break;
- if (match_int(&args[0], &option)) {
- status = 0;
- goto bail;
- }
- if (option >= OCFS2_MIN_RESV_LEVEL &&
- option < OCFS2_MAX_RESV_LEVEL)
- mopt->resv_level = option;
- break;
- case Opt_dir_resv_level:
- if (is_remount)
- break;
- if (match_int(&args[0], &option)) {
- status = 0;
- goto bail;
- }
- if (option >= OCFS2_MIN_RESV_LEVEL &&
- option < OCFS2_MAX_RESV_LEVEL)
- mopt->dir_resv_level = option;
+ if (result.uint_32 >= OCFS2_MIN_RESV_LEVEL &&
+ result.uint_32 < OCFS2_MAX_RESV_LEVEL)
+ mopt->resv_level = result.uint_32;
+ break;
+ case Opt_dir_resv_level:
+ if (is_remount)
break;
- default:
- mlog(ML_ERROR,
- "Unrecognized mount option \"%s\" "
- "or missing value\n", p);
- status = 0;
- goto bail;
- }
- }
-
- if (user_stack == 0) {
- /* Ensure only one heartbeat mode */
- tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL |
- OCFS2_MOUNT_HB_GLOBAL |
- OCFS2_MOUNT_HB_NONE);
- if (hweight32(tmp) != 1) {
- mlog(ML_ERROR, "Invalid heartbeat mount options\n");
- status = 0;
- goto bail;
- }
+ if (result.uint_32 >= OCFS2_MIN_RESV_LEVEL &&
+ result.uint_32 < OCFS2_MAX_RESV_LEVEL)
+ mopt->dir_resv_level = result.uint_32;
+ break;
+ case Opt_journal_async_commit:
+ mopt->mount_opt |= OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT;
+ break;
+ default:
+ return -EINVAL;
}
- status = 1;
-
-bail:
- return status;
+ return 0;
}
static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
@@ -1558,6 +1442,8 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
if (opts & OCFS2_MOUNT_ERRORS_PANIC)
seq_printf(s, ",errors=panic");
+ else if (opts & OCFS2_MOUNT_ERRORS_CONT)
+ seq_printf(s, ",errors=continue");
else
seq_printf(s, ",errors=remount-ro");
@@ -1578,8 +1464,7 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",localflocks,");
if (osb->osb_cluster_stack[0])
- seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
- osb->osb_cluster_stack);
+ seq_show_option(s, "cluster_stack", osb->osb_cluster_stack);
if (opts & OCFS2_MOUNT_USRQUOTA)
seq_printf(s, ",usrquota");
if (opts & OCFS2_MOUNT_GRPQUOTA)
@@ -1609,19 +1494,15 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
if (osb->osb_dir_resv_level != osb->osb_resv_level)
seq_printf(s, ",dir_resv_level=%d", osb->osb_resv_level);
+ if (opts & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT)
+ seq_printf(s, ",journal_async_commit");
+
return 0;
}
-wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
-
static int __init ocfs2_init(void)
{
- int status, i;
-
- ocfs2_print_version();
-
- for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++)
- init_waitqueue_head(&ocfs2__ioend_wq[i]);
+ int status;
status = init_ocfs2_uptodate_cache();
if (status < 0)
@@ -1631,32 +1512,18 @@ static int __init ocfs2_init(void)
if (status < 0)
goto out2;
- ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
- if (!ocfs2_wq) {
- status = -ENOMEM;
- goto out3;
- }
-
ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
- if (!ocfs2_debugfs_root) {
- status = -EFAULT;
- mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
- }
ocfs2_set_locking_protocol();
- status = register_quota_format(&ocfs2_quota_format);
- if (status < 0)
- goto out4;
+ register_quota_format(&ocfs2_quota_format);
+
status = register_filesystem(&ocfs2_fs_type);
if (!status)
return 0;
unregister_quota_format(&ocfs2_quota_format);
-out4:
- destroy_workqueue(ocfs2_wq);
debugfs_remove(ocfs2_debugfs_root);
-out3:
ocfs2_free_mem_caches();
out2:
exit_ocfs2_uptodate_cache();
@@ -1667,11 +1534,6 @@ out1:
static void __exit ocfs2_exit(void)
{
- if (ocfs2_wq) {
- flush_workqueue(ocfs2_wq);
- destroy_workqueue(ocfs2_wq);
- }
-
unregister_quota_format(&ocfs2_quota_format);
debugfs_remove(ocfs2_debugfs_root);
@@ -1744,8 +1606,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
ocfs2_inode_unlock(inode, 0);
status = 0;
bail:
- if (inode)
- iput(inode);
+ iput(inode);
if (status)
mlog_errno(status);
@@ -1762,14 +1623,15 @@ static void ocfs2_inode_init_once(void *data)
spin_lock_init(&oi->ip_lock);
ocfs2_extent_map_init(&oi->vfs_inode);
INIT_LIST_HEAD(&oi->ip_io_markers);
+ INIT_LIST_HEAD(&oi->ip_unwritten_list);
oi->ip_dir_start_lookup = 0;
- atomic_set(&oi->ip_unaligned_aio, 0);
init_rwsem(&oi->ip_alloc_sem);
init_rwsem(&oi->ip_xattr_sem);
mutex_init(&oi->ip_io_mutex);
oi->ip_blkno = 0ULL;
oi->ip_clusters = 0;
+ oi->ip_next_orphan = NULL;
ocfs2_resv_init_once(&oi->ip_la_data_resv);
@@ -1789,27 +1651,23 @@ static int ocfs2_initialize_mem_caches(void)
sizeof(struct ocfs2_inode_info),
0,
(SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_ACCOUNT),
ocfs2_inode_init_once);
ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache",
sizeof(struct ocfs2_dquot),
0,
- (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT,
NULL);
ocfs2_qf_chunk_cachep = kmem_cache_create("ocfs2_qf_chunk_cache",
sizeof(struct ocfs2_quota_chunk),
0,
- (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
+ SLAB_RECLAIM_ACCOUNT,
NULL);
if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep ||
!ocfs2_qf_chunk_cachep) {
- if (ocfs2_inode_cachep)
- kmem_cache_destroy(ocfs2_inode_cachep);
- if (ocfs2_dquot_cachep)
- kmem_cache_destroy(ocfs2_dquot_cachep);
- if (ocfs2_qf_chunk_cachep)
- kmem_cache_destroy(ocfs2_qf_chunk_cachep);
+ kmem_cache_destroy(ocfs2_inode_cachep);
+ kmem_cache_destroy(ocfs2_dquot_cachep);
+ kmem_cache_destroy(ocfs2_qf_chunk_cachep);
return -ENOMEM;
}
@@ -1823,16 +1681,13 @@ static void ocfs2_free_mem_caches(void)
* destroy cache.
*/
rcu_barrier();
- if (ocfs2_inode_cachep)
- kmem_cache_destroy(ocfs2_inode_cachep);
+ kmem_cache_destroy(ocfs2_inode_cachep);
ocfs2_inode_cachep = NULL;
- if (ocfs2_dquot_cachep)
- kmem_cache_destroy(ocfs2_dquot_cachep);
+ kmem_cache_destroy(ocfs2_dquot_cachep);
ocfs2_dquot_cachep = NULL;
- if (ocfs2_qf_chunk_cachep)
- kmem_cache_destroy(ocfs2_qf_chunk_cachep);
+ kmem_cache_destroy(ocfs2_qf_chunk_cachep);
ocfs2_qf_chunk_cachep = NULL;
}
@@ -1848,16 +1703,14 @@ static int ocfs2_get_sector(struct super_block *sb,
*bh = sb_getblk(sb, block);
if (!*bh) {
- mlog_errno(-EIO);
- return -EIO;
+ mlog_errno(-ENOMEM);
+ return -ENOMEM;
}
lock_buffer(*bh);
if (!buffer_dirty(*bh))
clear_buffer_uptodate(*bh);
unlock_buffer(*bh);
- ll_rw_block(READ, 1, bh);
- wait_on_buffer(*bh);
- if (!buffer_uptodate(*bh)) {
+ if (bh_read(*bh, 0) < 0) {
mlog_errno(-EIO);
brelse(*bh);
*bh = NULL;
@@ -1870,53 +1723,71 @@ static int ocfs2_get_sector(struct super_block *sb,
static int ocfs2_mount_volume(struct super_block *sb)
{
int status = 0;
- int unlock_super = 0;
struct ocfs2_super *osb = OCFS2_SB(sb);
if (ocfs2_is_hard_readonly(osb))
- goto leave;
+ goto out;
+
+ mutex_init(&osb->obs_trim_fs_mutex);
status = ocfs2_dlm_init(osb);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ if (status == -EBADR && ocfs2_userspace_stack(osb))
+ mlog(ML_ERROR, "couldn't mount because cluster name on"
+ " disk does not match the running cluster name.\n");
+ goto out;
}
status = ocfs2_super_lock(osb, 1);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ goto out_dlm;
}
- unlock_super = 1;
/* This will load up the node map and add ourselves to it. */
status = ocfs2_find_slot(osb);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ goto out_super_lock;
}
/* load all node-local system inodes */
status = ocfs2_init_local_system_inodes(osb);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ goto out_super_lock;
}
status = ocfs2_check_volume(osb);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ goto out_system_inodes;
}
status = ocfs2_truncate_log_init(osb);
- if (status < 0)
+ if (status < 0) {
mlog_errno(status);
+ goto out_check_volume;
+ }
-leave:
- if (unlock_super)
- ocfs2_super_unlock(osb, 1);
+ ocfs2_super_unlock(osb, 1);
+ return 0;
+out_check_volume:
+ ocfs2_free_replay_slots(osb);
+out_system_inodes:
+ if (osb->local_alloc_state == OCFS2_LA_ENABLED)
+ ocfs2_shutdown_local_alloc(osb);
+ ocfs2_release_system_inodes(osb);
+ /* before journal shutdown, we should release slot_info */
+ ocfs2_free_slot_info(osb);
+ ocfs2_journal_shutdown(osb);
+out_super_lock:
+ ocfs2_super_unlock(osb, 1);
+out_dlm:
+ ocfs2_dlm_shutdown(osb, 0);
+out:
return status;
}
@@ -1924,7 +1795,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
{
int tmp, hangup_needed = 0;
struct ocfs2_super *osb = NULL;
- char nodestr[8];
+ char nodestr[12];
trace_ocfs2_dismount_volume(sb);
@@ -1932,19 +1803,25 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
osb = OCFS2_SB(sb);
BUG_ON(!osb);
- debugfs_remove(osb->osb_ctxt);
+ /* Remove file check sysfs related directories/files,
+ * and wait for the pending file check operations */
+ ocfs2_filecheck_remove_sysfs(osb);
- /*
- * Flush inode dropping work queue so that deletes are
- * performed while the filesystem is still working
- */
- ocfs2_drop_all_dl_inodes(osb);
+ kset_unregister(osb->osb_dev_kset);
/* Orphan scan should be stopped as early as possible */
ocfs2_orphan_scan_stop(osb);
+ /* Stop quota recovery so that we can disable quotas */
+ ocfs2_recovery_disable_quota(osb);
+
ocfs2_disable_quotas(osb);
+ /* All dquots should be freed by now */
+ WARN_ON(!llist_empty(&osb->dquot_drop_list));
+ /* Wait for worker to be done with the work structure in osb */
+ cancel_work_sync(&osb->dquot_drop_work);
+
ocfs2_shutdown_local_alloc(osb);
ocfs2_truncate_log_shutdown(osb);
@@ -1952,8 +1829,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
/* This will disable recovery and flush any recovery work. */
ocfs2_recovery_exit(osb);
- ocfs2_journal_shutdown(osb);
-
ocfs2_sync_blockdev(sb);
ocfs2_purge_refcount_trees(osb);
@@ -1976,6 +1851,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
ocfs2_release_system_inodes(osb);
+ ocfs2_journal_shutdown(osb);
+
/*
* If we're dismounting due to mount error, mount.ocfs2 will clean
* up heartbeat. If we're a local mount, there is no heartbeat.
@@ -1986,11 +1863,10 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
!ocfs2_is_hard_readonly(osb))
hangup_needed = 1;
- if (osb->cconn)
- ocfs2_dlm_shutdown(osb, hangup_needed);
+ ocfs2_dlm_shutdown(osb, hangup_needed);
ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats);
- debugfs_remove(osb->osb_debug_root);
+ debugfs_remove_recursive(osb->osb_debug_root);
if (hangup_needed)
ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str));
@@ -2074,8 +1950,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
int i, cbits, bbits;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
struct inode *inode = NULL;
- struct ocfs2_journal *journal;
- __le32 uuid_net_key;
struct ocfs2_super *osb;
u64 total_blocks;
@@ -2083,22 +1957,25 @@ static int ocfs2_initialize_super(struct super_block *sb,
if (!osb) {
status = -ENOMEM;
mlog_errno(status);
- goto bail;
+ goto out;
}
sb->s_fs_info = osb;
sb->s_op = &ocfs2_sops;
- sb->s_d_op = &ocfs2_dentry_ops;
+ set_default_d_op(sb, &ocfs2_dentry_ops);
sb->s_export_op = &ocfs2_export_ops;
- sb->s_qcop = &ocfs2_quotactl_ops;
+ sb->s_qcop = &dquot_quotactl_sysfile_ops;
sb->dq_op = &ocfs2_quota_operations;
+ sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
sb->s_xattr = ocfs2_xattr_handlers;
sb->s_time_gran = 1;
- sb->s_flags |= MS_NOATIME;
+ sb->s_flags |= SB_NOATIME;
/* this is needed to support O_LARGEFILE */
cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits);
bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits);
sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits);
+ super_set_uuid(sb, di->id2.i_super.s_uuid,
+ sizeof(di->id2.i_super.s_uuid));
osb->osb_dx_mask = (1 << (cbits - bbits)) - 1;
@@ -2107,7 +1984,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
osb->osb_dx_seed[3] = le32_to_cpu(di->id2.i_super.s_uuid_hash);
osb->sb = sb;
- /* Save off for ocfs2_rw_direct */
osb->s_sectsize_bits = blksize_bits(sector_size);
BUG_ON(!osb->s_sectsize_bits);
@@ -2121,6 +1997,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
spin_lock_init(&osb->osb_xattr_lock);
ocfs2_init_steal_slots(osb);
+ mutex_init(&osb->system_file_mutex);
+
atomic_set(&osb->alloc_stats.moves, 0);
atomic_set(&osb->alloc_stats.local_data, 0);
atomic_set(&osb->alloc_stats.bitmap_data, 0);
@@ -2140,7 +2018,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
osb->max_slots);
status = -EINVAL;
- goto bail;
+ goto out;
}
ocfs2_orphan_scan_init(osb);
@@ -2149,7 +2027,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
if (status) {
mlog(ML_ERROR, "Unable to initialize recovery state\n");
mlog_errno(status);
- goto bail;
+ goto out;
}
init_waitqueue_head(&osb->checkpoint_event);
@@ -2167,17 +2045,13 @@ static int ocfs2_initialize_super(struct super_block *sb,
init_waitqueue_head(&osb->osb_mount_event);
- status = ocfs2_resmap_init(osb, &osb->osb_la_resmap);
- if (status) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_resmap_init(osb, &osb->osb_la_resmap);
osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
if (!osb->vol_label) {
mlog(ML_ERROR, "unable to alloc vol label\n");
status = -ENOMEM;
- goto bail;
+ goto out_recovery_map;
}
osb->slot_recovery_generations =
@@ -2186,7 +2060,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
if (!osb->slot_recovery_generations) {
status = -ENOMEM;
mlog_errno(status);
- goto bail;
+ goto out_vol_label;
}
init_waitqueue_head(&osb->osb_wipe_event);
@@ -2196,7 +2070,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
if (!osb->osb_orphan_wipes) {
status = -ENOMEM;
mlog_errno(status);
- goto bail;
+ goto out_slot_recovery_gen;
}
osb->osb_rf_lock_tree = RB_ROOT;
@@ -2212,31 +2086,38 @@ static int ocfs2_initialize_super(struct super_block *sb,
mlog(ML_ERROR, "couldn't mount because of unsupported "
"optional features (%x).\n", i);
status = -EINVAL;
- goto bail;
+ goto out_orphan_wipes;
}
- if (!(osb->sb->s_flags & MS_RDONLY) &&
- (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) {
+ if (!sb_rdonly(osb->sb) && (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) {
mlog(ML_ERROR, "couldn't mount RDWR because of "
"unsupported optional features (%x).\n", i);
status = -EINVAL;
- goto bail;
+ goto out_orphan_wipes;
}
if (ocfs2_clusterinfo_valid(osb)) {
+ /*
+ * ci_stack and ci_cluster in ocfs2_cluster_info may not be null
+ * terminated, so make sure no overflow happens here by using
+ * memcpy. Destination strings will always be null terminated
+ * because osb is allocated using kzalloc.
+ */
osb->osb_stackflags =
OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags;
memcpy(osb->osb_cluster_stack,
OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
OCFS2_STACK_LABEL_LEN);
- osb->osb_cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) {
mlog(ML_ERROR,
"couldn't mount because of an invalid "
"cluster stack label (%s) \n",
osb->osb_cluster_stack);
status = -EINVAL;
- goto bail;
+ goto out_orphan_wipes;
}
+ memcpy(osb->osb_cluster_name,
+ OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster,
+ OCFS2_CLUSTER_NAME_LEN);
} else {
/* The empty string is identical with classic tools that
* don't know about s_cluster_info. */
@@ -2245,35 +2126,17 @@ static int ocfs2_initialize_super(struct super_block *sb,
get_random_bytes(&osb->s_next_generation, sizeof(u32));
- /* FIXME
- * This should be done in ocfs2_journal_init(), but unknown
- * ordering issues will cause the filesystem to crash.
- * If anyone wants to figure out what part of the code
- * refers to osb->journal before ocfs2_journal_init() is run,
- * be my guest.
+ /*
+ * FIXME
+ * This should be done in ocfs2_journal_init(), but any inode
+ * writes back operation will cause the filesystem to crash.
*/
- /* initialize our journal structure */
-
- journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL);
- if (!journal) {
- mlog(ML_ERROR, "unable to alloc journal\n");
- status = -ENOMEM;
- goto bail;
- }
- osb->journal = journal;
- journal->j_osb = osb;
-
- atomic_set(&journal->j_num_trans, 0);
- init_rwsem(&journal->j_trans_barrier);
- init_waitqueue_head(&journal->j_checkpointed);
- spin_lock_init(&journal->j_lock);
- journal->j_trans_id = (unsigned long) 1;
- INIT_LIST_HEAD(&journal->j_la_cleanups);
- INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
- journal->j_state = OCFS2_JOURNAL_FREE;
+ status = ocfs2_journal_alloc(osb);
+ if (status < 0)
+ goto out_orphan_wipes;
- INIT_WORK(&osb->dentry_lock_work, ocfs2_drop_dl_inodes);
- osb->dentry_lock_list = NULL;
+ INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs);
+ init_llist_head(&osb->dquot_drop_list);
/* get some pseudo constants for clustersize bits */
osb->s_clustersize_bits =
@@ -2285,7 +2148,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
mlog(ML_ERROR, "Volume has invalid cluster size (%d)\n",
osb->s_clustersize);
status = -EINVAL;
- goto bail;
+ goto out_journal;
}
total_blocks = ocfs2_clusters_to_blocks(osb->sb,
@@ -2297,20 +2160,18 @@ static int ocfs2_initialize_super(struct super_block *sb,
mlog(ML_ERROR, "Volume too large "
"to mount safely on this system");
status = -EFBIG;
- goto bail;
+ goto out_journal;
}
if (ocfs2_setup_osb_uuid(osb, di->id2.i_super.s_uuid,
sizeof(di->id2.i_super.s_uuid))) {
mlog(ML_ERROR, "Out of memory trying to setup our uuid.\n");
status = -ENOMEM;
- goto bail;
+ goto out_journal;
}
- memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key));
-
- strncpy(osb->vol_label, di->id2.i_super.s_label, 63);
- osb->vol_label[63] = '\0';
+ strscpy(osb->vol_label, di->id2.i_super.s_label,
+ OCFS2_MAX_VOL_LABEL_LEN);
osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno);
osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno);
osb->first_cluster_group_blkno =
@@ -2326,7 +2187,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
if (!osb->osb_dlm_debug) {
status = -ENOMEM;
mlog_errno(status);
- goto bail;
+ goto out_uuid_str;
}
atomic_set(&osb->vol_state, VOLUME_INIT);
@@ -2335,7 +2196,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
status = ocfs2_init_global_system_inodes(osb);
if (status < 0) {
mlog_errno(status);
- goto bail;
+ goto out_dlm_out;
}
/*
@@ -2346,7 +2207,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
if (!inode) {
status = -EINVAL;
mlog_errno(status);
- goto bail;
+ goto out_system_inodes;
}
osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
@@ -2359,11 +2220,39 @@ static int ocfs2_initialize_super(struct super_block *sb,
status = ocfs2_init_slot_info(osb);
if (status < 0) {
mlog_errno(status);
- goto bail;
+ goto out_system_inodes;
}
- cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb);
-bail:
+ osb->ocfs2_wq = alloc_ordered_workqueue("ocfs2_wq", WQ_MEM_RECLAIM);
+ if (!osb->ocfs2_wq) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto out_slot_info;
+ }
+
+ return status;
+
+out_slot_info:
+ ocfs2_free_slot_info(osb);
+out_system_inodes:
+ ocfs2_release_system_inodes(osb);
+out_dlm_out:
+ ocfs2_put_dlm_debug(osb->osb_dlm_debug);
+out_uuid_str:
+ kfree(osb->uuid_str);
+out_journal:
+ kfree(osb->journal);
+out_orphan_wipes:
+ kfree(osb->osb_orphan_wipes);
+out_slot_recovery_gen:
+ kfree(osb->slot_recovery_generations);
+out_vol_label:
+ kfree(osb->vol_label);
+out_recovery_map:
+ kfree(osb->recovery_map);
+out:
+ kfree(osb);
+ sb->s_fs_info = NULL;
return status;
}
@@ -2378,6 +2267,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
struct ocfs2_blockcheck_stats *stats)
{
int status = -EAGAIN;
+ u32 blksz_bits;
if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) {
@@ -2392,11 +2282,15 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
goto out;
}
status = -EINVAL;
- if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) {
+ /* Acceptable block sizes are 512 bytes, 1K, 2K and 4K. */
+ blksz_bits = le32_to_cpu(di->id2.i_super.s_blocksize_bits);
+ if (blksz_bits < 9 || blksz_bits > 12) {
mlog(ML_ERROR, "found superblock with incorrect block "
- "size: found %u, should be %u\n",
- 1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits),
- blksz);
+ "size bits: found %u, should be 9, 10, 11, or 12\n",
+ blksz_bits);
+ } else if ((1 << blksz_bits) != blksz) {
+ mlog(ML_ERROR, "found superblock with incorrect block "
+ "size: found %u, should be %u\n", 1 << blksz_bits, blksz);
} else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) !=
OCFS2_MAJOR_REV_LEVEL ||
le16_to_cpu(di->id2.i_super.s_minor_rev_level) !=
@@ -2414,8 +2308,8 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
(unsigned long long)bh->b_blocknr);
} else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 ||
le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) {
- mlog(ML_ERROR, "bad cluster size found: %u\n",
- 1 << le32_to_cpu(di->id2.i_super.s_clustersize_bits));
+ mlog(ML_ERROR, "bad cluster size bit found: %u\n",
+ le32_to_cpu(di->id2.i_super.s_clustersize_bits));
} else if (!le64_to_cpu(di->id2.i_super.s_root_blkno)) {
mlog(ML_ERROR, "bad root_blkno: 0\n");
} else if (!le64_to_cpu(di->id2.i_super.s_system_dir_blkno)) {
@@ -2448,7 +2342,7 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
* ourselves. */
/* Init our journal object. */
- status = ocfs2_journal_init(osb->journal, &dirty);
+ status = ocfs2_journal_init(osb, &dirty);
if (status < 0) {
mlog(ML_ERROR, "Could not initialize journal!\n");
goto finally;
@@ -2483,6 +2377,15 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
goto finally;
}
+ if (osb->s_mount_opt & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT)
+ jbd2_journal_set_features(osb->journal->j_journal,
+ JBD2_FEATURE_COMPAT_CHECKSUM, 0,
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
+ else
+ jbd2_journal_clear_features(osb->journal->j_journal,
+ JBD2_FEATURE_COMPAT_CHECKSUM, 0,
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
+
if (dirty) {
/* recover my local alloc if we didn't unmount cleanly. */
status = ocfs2_begin_local_alloc_recovery(osb,
@@ -2505,7 +2408,6 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
if (dirty) {
/* Recovery will be completed after we've mounted the
* rest of the volume. */
- osb->dirty = 1;
osb->local_alloc_copy = local_alloc;
local_alloc = NULL;
}
@@ -2541,81 +2443,99 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
{
/* This function assumes that the caller has the main osb resource */
+ /* ocfs2_initializer_super have already created this workqueue */
+ if (osb->ocfs2_wq)
+ destroy_workqueue(osb->ocfs2_wq);
+
ocfs2_free_slot_info(osb);
kfree(osb->osb_orphan_wipes);
kfree(osb->slot_recovery_generations);
/* FIXME
* This belongs in journal shutdown, but because we have to
- * allocate osb->journal at the start of ocfs2_initialize_osb(),
+ * allocate osb->journal at the middle of ocfs2_initialize_super(),
* we free it here.
*/
kfree(osb->journal);
kfree(osb->local_alloc_copy);
kfree(osb->uuid_str);
+ kfree(osb->vol_label);
ocfs2_put_dlm_debug(osb->osb_dlm_debug);
memset(osb, 0, sizeof(struct ocfs2_super));
}
-/* Put OCFS2 into a readonly state, or (if the user specifies it),
- * panic(). We do not support continue-on-error operation. */
-static void ocfs2_handle_error(struct super_block *sb)
+/* Depending on the mount option passed, perform one of the following:
+ * Put OCFS2 into a readonly state (default)
+ * Return EIO so that only the process errs
+ * Fix the error as if fsck.ocfs2 -y
+ * panic
+ */
+static int ocfs2_handle_error(struct super_block *sb)
{
struct ocfs2_super *osb = OCFS2_SB(sb);
+ int rv = 0;
+
+ ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS);
+ pr_crit("On-disk corruption discovered. "
+ "Please run fsck.ocfs2 once the filesystem is unmounted.\n");
- if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC)
+ if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) {
panic("OCFS2: (device %s): panic forced after error\n",
sb->s_id);
+ } else if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_CONT) {
+ pr_crit("OCFS2: Returning error to the calling process.\n");
+ rv = -EIO;
+ } else { /* default option */
+ rv = -EROFS;
+ if (sb_rdonly(sb) && (ocfs2_is_soft_readonly(osb) || ocfs2_is_hard_readonly(osb)))
+ return rv;
- ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS);
-
- if (sb->s_flags & MS_RDONLY &&
- (ocfs2_is_soft_readonly(osb) ||
- ocfs2_is_hard_readonly(osb)))
- return;
+ pr_crit("OCFS2: File system is now read-only.\n");
+ sb->s_flags |= SB_RDONLY;
+ ocfs2_set_ro_flag(osb, 0);
+ }
- printk(KERN_CRIT "File system is now read-only due to the potential "
- "of on-disk corruption. Please run fsck.ocfs2 once the file "
- "system is unmounted.\n");
- sb->s_flags |= MS_RDONLY;
- ocfs2_set_ro_flag(osb, 0);
+ return rv;
}
-static char error_buf[1024];
-
-void __ocfs2_error(struct super_block *sb,
- const char *function,
- const char *fmt, ...)
+int __ocfs2_error(struct super_block *sb, const char *function,
+ const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
va_start(args, fmt);
- vsnprintf(error_buf, sizeof(error_buf), fmt, args);
- va_end(args);
+ vaf.fmt = fmt;
+ vaf.va = &args;
/* Not using mlog here because we want to show the actual
* function the error came from. */
- printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %s\n",
- sb->s_id, function, error_buf);
+ printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV",
+ sb->s_id, function, &vaf);
- ocfs2_handle_error(sb);
+ va_end(args);
+
+ return ocfs2_handle_error(sb);
}
/* Handle critical errors. This is intentionally more drastic than
* ocfs2_handle_error, so we only use for things like journal errors,
* etc. */
-void __ocfs2_abort(struct super_block* sb,
- const char *function,
+void __ocfs2_abort(struct super_block *sb, const char *function,
const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
va_start(args, fmt);
- vsnprintf(error_buf, sizeof(error_buf), fmt, args);
- va_end(args);
- printk(KERN_CRIT "OCFS2: abort (device %s): %s: %s\n",
- sb->s_id, function, error_buf);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV",
+ sb->s_id, function, &vaf);
+
+ va_end(args);
/* We don't have the cluster support yet to go straight to
* hard readonly in here. Until then, we want to keep
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 74ff74cf78fe..8312651135b9 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -1,47 +1,28 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* super.h
*
* Function prototypes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_SUPER_H
#define OCFS2_SUPER_H
-extern struct workqueue_struct *ocfs2_wq;
-
-int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
- int node_num);
-
__printf(3, 4)
-void __ocfs2_error(struct super_block *sb, const char *function,
+int __ocfs2_error(struct super_block *sb, const char *function,
const char *fmt, ...);
-#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args)
+#define ocfs2_error(sb, fmt, ...) \
+ __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##__VA_ARGS__)
__printf(3, 4)
void __ocfs2_abort(struct super_block *sb, const char *function,
const char *fmt, ...);
-#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
+#define ocfs2_abort(sb, fmt, ...) \
+ __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##__VA_ARGS__)
/*
* Void signal blockers, because in-kernel sigprocmask() only fails
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 66edce7ecfd7..ad8be3300b49 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -1,6 +1,4 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
* linux/cluster/ssi/cfs/symlink.c
*
* This program is free software; you can redistribute it and/or
@@ -54,47 +52,39 @@
#include "buffer_head_io.h"
-static int ocfs2_fast_symlink_readpage(struct file *unused, struct page *page)
+static int ocfs2_fast_symlink_read_folio(struct file *f, struct folio *folio)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct buffer_head *bh = NULL;
int status = ocfs2_read_inode_block(inode, &bh);
struct ocfs2_dinode *fe;
const char *link;
- void *kaddr;
size_t len;
if (status < 0) {
mlog_errno(status);
- return status;
+ goto out;
}
fe = (struct ocfs2_dinode *) bh->b_data;
link = (char *) fe->id2.i_symlink;
/* will be less than a page size */
len = strnlen(link, ocfs2_fast_symlink_chars(inode->i_sb));
- kaddr = kmap_atomic(page);
- memcpy(kaddr, link, len + 1);
- kunmap_atomic(kaddr);
- SetPageUptodate(page);
- unlock_page(page);
+ memcpy_to_folio(folio, 0, link, len + 1);
+out:
+ folio_end_read(folio, status == 0);
brelse(bh);
- return 0;
+ return status;
}
const struct address_space_operations ocfs2_fast_symlink_aops = {
- .readpage = ocfs2_fast_symlink_readpage,
+ .read_folio = ocfs2_fast_symlink_read_folio,
};
const struct inode_operations ocfs2_symlink_inode_operations = {
- .readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.getattr = ocfs2_getattr,
.setattr = ocfs2_setattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
.listxattr = ocfs2_listxattr,
- .removexattr = generic_removexattr,
.fiemap = ocfs2_fiemap,
};
diff --git a/fs/ocfs2/symlink.h b/fs/ocfs2/symlink.h
index 71ee4245e919..ffcf0210545c 100644
--- a/fs/ocfs2/symlink.h
+++ b/fs/ocfs2/symlink.h
@@ -1,26 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* symlink.h
*
* Function prototypes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_SYMLINK_H
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index f053688d22a3..d53a6cc866be 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -1,26 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* sysfile.c
*
* Initialize, read, write, etc. system files.
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
@@ -69,10 +53,11 @@ static struct inode **get_local_system_inode(struct ocfs2_super *osb,
spin_unlock(&osb->osb_lock);
if (unlikely(!local_system_inodes)) {
- local_system_inodes = kzalloc(sizeof(struct inode *) *
- NUM_LOCAL_SYSTEM_INODES *
- osb->max_slots,
- GFP_NOFS);
+ local_system_inodes =
+ kzalloc(array3_size(sizeof(struct inode *),
+ NUM_LOCAL_SYSTEM_INODES,
+ osb->max_slots),
+ GFP_NOFS);
if (!local_system_inodes) {
mlog_errno(-ENOMEM);
/*
@@ -113,9 +98,11 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
} else
arr = get_local_system_inode(osb, type, slot);
+ mutex_lock(&osb->system_file_mutex);
if (arr && ((inode = *arr) != NULL)) {
/* get a ref in addition to the array ref */
inode = igrab(inode);
+ mutex_unlock(&osb->system_file_mutex);
BUG_ON(!inode);
return inode;
@@ -129,6 +116,7 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
*arr = igrab(inode);
BUG_ON(!*arr);
}
+ mutex_unlock(&osb->system_file_mutex);
return inode;
}
@@ -139,14 +127,14 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
char namebuf[40];
struct inode *inode = NULL;
u64 blkno;
- int status = 0;
+ int len, status = 0;
- ocfs2_sprintf_system_inode_name(namebuf,
- sizeof(namebuf),
- type, slot);
+ len = ocfs2_sprintf_system_inode_name(namebuf,
+ sizeof(namebuf),
+ type, slot);
- status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
- strlen(namebuf), &blkno);
+ status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
+ namebuf, len, &blkno);
if (status < 0) {
goto bail;
}
diff --git a/fs/ocfs2/sysfile.h b/fs/ocfs2/sysfile.h
index cc9ea661ffc1..2b38c75990fd 100644
--- a/fs/ocfs2/sysfile.h
+++ b/fs/ocfs2/sysfile.h
@@ -1,26 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* sysfile.h
*
* Function prototypes
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_SYSFILE_H
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index 52eaf33d346f..09854925fa5c 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -1,6 +1,5 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* uptodate.c
*
* Tracking the up-to-date-ness of a local buffer_head with respect to
@@ -8,21 +7,6 @@
*
* Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
* Standard buffer head caching flags (uptodate, etc) are insufficient
* in a clustered environment - a buffer may be marked up to date on
* our local node but could have been modified by another cluster
@@ -67,7 +51,7 @@ struct ocfs2_meta_cache_item {
sector_t c_block;
};
-static struct kmem_cache *ocfs2_uptodate_cachep = NULL;
+static struct kmem_cache *ocfs2_uptodate_cachep;
u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci)
{
@@ -633,6 +617,5 @@ int __init init_ocfs2_uptodate_cache(void)
void exit_ocfs2_uptodate_cache(void)
{
- if (ocfs2_uptodate_cachep)
- kmem_cache_destroy(ocfs2_uptodate_cachep);
+ kmem_cache_destroy(ocfs2_uptodate_cachep);
}
diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h
index 0d826fe2da0d..85d94134001b 100644
--- a/fs/ocfs2/uptodate.h
+++ b/fs/ocfs2/uptodate.h
@@ -1,26 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
* uptodate.h
*
* Cluster uptodate tracking
*
* Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_UPTODATE_H
diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c
deleted file mode 100644
index e2488f4128a2..000000000000
--- a/fs/ocfs2/ver.c
+++ /dev/null
@@ -1,43 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ver.c
- *
- * version string
- *
- * Copyright (C) 2002, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-
-#include "ver.h"
-
-#define OCFS2_BUILD_VERSION "1.5.0"
-
-#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION
-
-void ocfs2_print_version(void)
-{
- printk(KERN_INFO "%s\n", VERSION_STR);
-}
-
-MODULE_DESCRIPTION(VERSION_STR);
-
-MODULE_VERSION(OCFS2_BUILD_VERSION);
diff --git a/fs/ocfs2/ver.h b/fs/ocfs2/ver.h
deleted file mode 100644
index d7395cb91d2f..000000000000
--- a/fs/ocfs2/ver.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ver.h
- *
- * Function prototypes
- *
- * Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef OCFS2_VER_H
-#define OCFS2_VER_H
-
-void ocfs2_print_version(void);
-
-#endif /* OCFS2_VER_H */
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 317ef0abccbb..dc1761e84814 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1,6 +1,5 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-only
+/*
* xattr.c
*
* Copyright (C) 2004, 2008 Oracle. All rights reserved.
@@ -8,15 +7,6 @@
* CREDITS:
* Lots of code in this file is copy from linux/fs/ext3/xattr.c.
* Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#include <linux/capability.h>
@@ -97,23 +87,19 @@ static struct ocfs2_xattr_def_value_root def_xv = {
.xv.xr_list.l_count = cpu_to_le16(1),
};
-const struct xattr_handler *ocfs2_xattr_handlers[] = {
+const struct xattr_handler * const ocfs2_xattr_handlers[] = {
&ocfs2_xattr_user_handler,
- &ocfs2_xattr_acl_access_handler,
- &ocfs2_xattr_acl_default_handler,
&ocfs2_xattr_trusted_handler,
&ocfs2_xattr_security_handler,
NULL
};
-static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
- [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler,
- [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]
- = &ocfs2_xattr_acl_access_handler,
- [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT]
- = &ocfs2_xattr_acl_default_handler,
- [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler,
- [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler,
+static const struct xattr_handler * const ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
+ [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler,
+ [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access,
+ [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default,
+ [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler,
+ [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler,
};
struct ocfs2_xattr_info {
@@ -369,7 +355,7 @@ static void ocfs2_xattr_bucket_free(struct ocfs2_xattr_bucket *bucket)
* them fully.
*/
static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
- u64 xb_blkno)
+ u64 xb_blkno, int new)
{
int i, rc = 0;
@@ -377,15 +363,22 @@ static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
bucket->bu_bhs[i] = sb_getblk(bucket->bu_inode->i_sb,
xb_blkno + i);
if (!bucket->bu_bhs[i]) {
- rc = -EIO;
+ rc = -ENOMEM;
mlog_errno(rc);
break;
}
if (!ocfs2_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
- bucket->bu_bhs[i]))
- ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
- bucket->bu_bhs[i]);
+ bucket->bu_bhs[i])) {
+ if (new)
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
+ bucket->bu_bhs[i]);
+ else {
+ set_buffer_uptodate(bucket->bu_bhs[i]);
+ ocfs2_set_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
+ bucket->bu_bhs[i]);
+ }
+ }
}
if (rc)
@@ -492,30 +485,24 @@ static int ocfs2_validate_xattr_block(struct super_block *sb,
*/
if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
- ocfs2_error(sb,
- "Extended attribute block #%llu has bad "
- "signature %.*s",
- (unsigned long long)bh->b_blocknr, 7,
- xb->xb_signature);
- return -EINVAL;
+ return ocfs2_error(sb,
+ "Extended attribute block #%llu has bad signature %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7,
+ xb->xb_signature);
}
if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) {
- ocfs2_error(sb,
- "Extended attribute block #%llu has an "
- "invalid xb_blkno of %llu",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)le64_to_cpu(xb->xb_blkno));
- return -EINVAL;
+ return ocfs2_error(sb,
+ "Extended attribute block #%llu has an invalid xb_blkno of %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(xb->xb_blkno));
}
if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) {
- ocfs2_error(sb,
- "Extended attribute block #%llu has an invalid "
- "xb_fs_generation of #%u",
- (unsigned long long)bh->b_blocknr,
- le32_to_cpu(xb->xb_fs_generation));
- return -EINVAL;
+ return ocfs2_error(sb,
+ "Extended attribute block #%llu has an invalid xb_fs_generation of #%u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(xb->xb_fs_generation));
}
return 0;
@@ -543,8 +530,7 @@ static inline const char *ocfs2_xattr_prefix(int name_index)
if (name_index > 0 && name_index < OCFS2_XATTR_MAX)
handler = ocfs2_xattr_handler_map[name_index];
-
- return handler ? handler->prefix : NULL;
+ return handler ? xattr_prefix(handler) : NULL;
}
static u32 ocfs2_xattr_name_hash(struct inode *inode,
@@ -638,14 +624,17 @@ int ocfs2_calc_xattr_init(struct inode *dir,
si->value_len);
if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
+ down_read(&OCFS2_I(dir)->ip_xattr_sem);
acl_len = ocfs2_xattr_get_nolock(dir, dir_bh,
OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT,
"", NULL, 0);
+ up_read(&OCFS2_I(dir)->ip_xattr_sem);
if (acl_len > 0) {
a_size = ocfs2_xattr_entry_real_size(0, acl_len);
if (S_ISDIR(mode))
a_size <<= 1;
} else if (acl_len != 0 && acl_len != -ENODATA) {
+ ret = acl_len;
mlog_errno(ret);
return ret;
}
@@ -659,7 +648,7 @@ int ocfs2_calc_xattr_init(struct inode *dir,
* 256(name) + 80(value) + 16(entry) = 352 bytes,
* The max space of acl xattr taken inline is
* 80(value) + 16(entry) * 2(if directory) = 192 bytes,
- * when blocksize = 512, may reserve one more cluser for
+ * when blocksize = 512, may reserve one more cluster for
* xattr bucket, otherwise reserve one metadata block
* for them is ok.
* If this is a new directory with inline data,
@@ -754,8 +743,7 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
BUG_ON(why == RESTART_META);
credits = ocfs2_calc_extend_credits(inode->i_sb,
- &vb->vb_xv->xr_list,
- clusters_to_add);
+ &vb->vb_xv->xr_list);
status = ocfs2_extend_trans(handle, credits);
if (status < 0) {
status = -ENOMEM;
@@ -884,14 +872,39 @@ static int ocfs2_xattr_value_truncate(struct inode *inode,
return ret;
}
-static int ocfs2_xattr_list_entry(char *buffer, size_t size,
- size_t *result, const char *prefix,
+static int ocfs2_xattr_list_entry(struct super_block *sb,
+ char *buffer, size_t size,
+ size_t *result, int type,
const char *name, int name_len)
{
char *p = buffer + *result;
- int prefix_len = strlen(prefix);
- int total_len = prefix_len + name_len + 1;
+ const char *prefix;
+ int prefix_len;
+ int total_len;
+
+ switch(type) {
+ case OCFS2_XATTR_INDEX_USER:
+ if (OCFS2_SB(sb)->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
+ return 0;
+ break;
+ case OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS:
+ case OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT:
+ if (!(sb->s_flags & SB_POSIXACL))
+ return 0;
+ break;
+
+ case OCFS2_XATTR_INDEX_TRUSTED:
+ if (!capable(CAP_SYS_ADMIN))
+ return 0;
+ break;
+ }
+
+ prefix = ocfs2_xattr_prefix(type);
+ if (!prefix)
+ return 0;
+ prefix_len = strlen(prefix);
+ total_len = prefix_len + name_len + 1;
*result += total_len;
/* we are just looking for how big our buffer needs to be */
@@ -914,23 +927,20 @@ static int ocfs2_xattr_list_entries(struct inode *inode,
{
size_t result = 0;
int i, type, ret;
- const char *prefix, *name;
+ const char *name;
for (i = 0 ; i < le16_to_cpu(header->xh_count); i++) {
struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
type = ocfs2_xattr_get_type(entry);
- prefix = ocfs2_xattr_prefix(type);
-
- if (prefix) {
- name = (const char *)header +
- le16_to_cpu(entry->xe_name_offset);
+ name = (const char *)header +
+ le16_to_cpu(entry->xe_name_offset);
- ret = ocfs2_xattr_list_entry(buffer, buffer_size,
- &result, prefix, name,
- entry->xe_name_len);
- if (ret)
- return ret;
- }
+ ret = ocfs2_xattr_list_entry(inode->i_sb,
+ buffer, buffer_size,
+ &result, type, name,
+ entry->xe_name_len);
+ if (ret)
+ return ret;
}
return result;
@@ -1014,7 +1024,7 @@ ssize_t ocfs2_listxattr(struct dentry *dentry,
int ret = 0, i_ret = 0, b_ret = 0;
struct buffer_head *di_bh = NULL;
struct ocfs2_dinode *di = NULL;
- struct ocfs2_inode_info *oi = OCFS2_I(dentry->d_inode);
+ struct ocfs2_inode_info *oi = OCFS2_I(d_inode(dentry));
if (!ocfs2_supports_xattr(OCFS2_SB(dentry->d_sb)))
return -EOPNOTSUPP;
@@ -1022,7 +1032,7 @@ ssize_t ocfs2_listxattr(struct dentry *dentry,
if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
return ret;
- ret = ocfs2_inode_lock(dentry->d_inode, &di_bh, 0);
+ ret = ocfs2_inode_lock(d_inode(dentry), &di_bh, 0);
if (ret < 0) {
mlog_errno(ret);
return ret;
@@ -1031,7 +1041,7 @@ ssize_t ocfs2_listxattr(struct dentry *dentry,
di = (struct ocfs2_dinode *)di_bh->b_data;
down_read(&oi->ip_xattr_sem);
- i_ret = ocfs2_xattr_ibody_list(dentry->d_inode, di, buffer, size);
+ i_ret = ocfs2_xattr_ibody_list(d_inode(dentry), di, buffer, size);
if (i_ret < 0)
b_ret = 0;
else {
@@ -1039,26 +1049,26 @@ ssize_t ocfs2_listxattr(struct dentry *dentry,
buffer += i_ret;
size -= i_ret;
}
- b_ret = ocfs2_xattr_block_list(dentry->d_inode, di,
+ b_ret = ocfs2_xattr_block_list(d_inode(dentry), di,
buffer, size);
if (b_ret < 0)
i_ret = 0;
}
up_read(&oi->ip_xattr_sem);
- ocfs2_inode_unlock(dentry->d_inode, 0);
+ ocfs2_inode_unlock(d_inode(dentry), 0);
brelse(di_bh);
return i_ret + b_ret;
}
-static int ocfs2_xattr_find_entry(int name_index,
+static int ocfs2_xattr_find_entry(struct inode *inode, int name_index,
const char *name,
struct ocfs2_xattr_search *xs)
{
struct ocfs2_xattr_entry *entry;
size_t name_len;
- int i, cmp = 1;
+ int i, name_offset, cmp = 1;
if (name == NULL)
return -EINVAL;
@@ -1066,13 +1076,22 @@ static int ocfs2_xattr_find_entry(int name_index,
name_len = strlen(name);
entry = xs->here;
for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) {
+ if ((void *)entry >= xs->end) {
+ ocfs2_error(inode->i_sb, "corrupted xattr entries");
+ return -EFSCORRUPTED;
+ }
cmp = name_index - ocfs2_xattr_get_type(entry);
if (!cmp)
cmp = name_len - entry->xe_name_len;
- if (!cmp)
- cmp = memcmp(name, (xs->base +
- le16_to_cpu(entry->xe_name_offset)),
- name_len);
+ if (!cmp) {
+ name_offset = le16_to_cpu(entry->xe_name_offset);
+ if ((xs->base + name_offset + name_len) > xs->end) {
+ ocfs2_error(inode->i_sb,
+ "corrupted xattr entries");
+ return -EFSCORRUPTED;
+ }
+ cmp = memcmp(name, (xs->base + name_offset), name_len);
+ }
if (cmp == 0)
break;
entry += 1;
@@ -1156,7 +1175,7 @@ static int ocfs2_xattr_ibody_get(struct inode *inode,
xs->base = (void *)xs->header;
xs->here = xs->header->xh_entries;
- ret = ocfs2_xattr_find_entry(name_index, name, xs);
+ ret = ocfs2_xattr_find_entry(inode, name_index, name, xs);
if (ret)
return ret;
size = le64_to_cpu(xs->here->xe_value_size);
@@ -1195,7 +1214,7 @@ static int ocfs2_xattr_block_get(struct inode *inode,
struct ocfs2_xattr_value_root *xv;
size_t size;
int ret = -ENODATA, name_offset, name_len, i;
- int uninitialized_var(block_off);
+ int block_off;
xs->bucket = ocfs2_xattr_bucket_new(inode);
if (!xs->bucket) {
@@ -1232,6 +1251,10 @@ static int ocfs2_xattr_block_get(struct inode *inode,
i,
&block_off,
&name_offset);
+ if (ret) {
+ mlog_errno(ret);
+ goto cleanup;
+ }
xs->base = bucket_block(xs->bucket, block_off);
}
if (ocfs2_xattr_is_local(xs->here)) {
@@ -1278,7 +1301,7 @@ int ocfs2_xattr_get_nolock(struct inode *inode,
return -EOPNOTSUPP;
if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
- ret = -ENODATA;
+ return -ENODATA;
xis.inode_bh = xbs.inode_bh = di_bh;
di = (struct ocfs2_dinode *)di_bh->b_data;
@@ -1303,20 +1326,21 @@ static int ocfs2_xattr_get(struct inode *inode,
void *buffer,
size_t buffer_size)
{
- int ret;
+ int ret, had_lock;
struct buffer_head *di_bh = NULL;
+ struct ocfs2_lock_holder oh;
- ret = ocfs2_inode_lock(inode, &di_bh, 0);
- if (ret < 0) {
- mlog_errno(ret);
- return ret;
+ had_lock = ocfs2_inode_lock_tracker(inode, &di_bh, 0, &oh);
+ if (had_lock < 0) {
+ mlog_errno(had_lock);
+ return had_lock;
}
down_read(&OCFS2_I(inode)->ip_xattr_sem);
ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
name, buffer, buffer_size);
up_read(&OCFS2_I(inode)->ip_xattr_sem);
- ocfs2_inode_unlock(inode, 0);
+ ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock);
brelse(di_bh);
@@ -2012,8 +2036,7 @@ static int ocfs2_xa_remove(struct ocfs2_xa_loc *loc,
rc = 0;
ocfs2_xa_cleanup_value_truncate(loc, "removing",
orig_clusters);
- if (rc)
- goto out;
+ goto out;
}
}
@@ -2499,7 +2522,7 @@ static int ocfs2_xattr_free_block(struct inode *inode,
mlog_errno(ret);
goto out;
}
- mutex_lock(&xb_alloc_inode->i_mutex);
+ inode_lock(xb_alloc_inode);
ret = ocfs2_inode_lock(xb_alloc_inode, &xb_alloc_bh, 1);
if (ret < 0) {
@@ -2524,7 +2547,7 @@ out_unlock:
ocfs2_inode_unlock(xb_alloc_inode, 1);
brelse(xb_alloc_bh);
out_mutex:
- mutex_unlock(&xb_alloc_inode->i_mutex);
+ inode_unlock(xb_alloc_inode);
iput(xb_alloc_inode);
out:
brelse(blk_bh);
@@ -2552,7 +2575,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
return 0;
- if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {
+ if (ocfs2_is_refcount_inode(inode)) {
ret = ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb),
le64_to_cpu(di->i_refcount_loc),
1, &ref_tree, &ref_root_bh);
@@ -2603,6 +2626,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL);
di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
spin_unlock(&oi->ip_lock);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_journal_dirty(handle, di_bh);
out_commit:
@@ -2682,7 +2706,7 @@ static int ocfs2_xattr_ibody_find(struct inode *inode,
/* Find the named attribute. */
if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
- ret = ocfs2_xattr_find_entry(name_index, name, xs);
+ ret = ocfs2_xattr_find_entry(inode, name_index, name, xs);
if (ret && ret != -ENODATA)
return ret;
xs->not_found = ret;
@@ -2817,7 +2841,7 @@ static int ocfs2_xattr_block_find(struct inode *inode,
xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size;
xs->here = xs->header->xh_entries;
- ret = ocfs2_xattr_find_entry(name_index, name, xs);
+ ret = ocfs2_xattr_find_entry(inode, name_index, name, xs);
} else
ret = ocfs2_xattr_index_block_find(inode, blk_bh,
name_index,
@@ -2865,6 +2889,12 @@ static int ocfs2_create_xattr_block(struct inode *inode,
}
new_bh = sb_getblk(inode->i_sb, first_blkno);
+ if (!new_bh) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto end;
+ }
+
ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);
ret = ocfs2_journal_access_xb(ctxt->handle, INODE_CACHE(inode),
@@ -2878,7 +2908,7 @@ static int ocfs2_create_xattr_block(struct inode *inode,
/* Initialize ocfs2_xattr_block */
xblk = (struct ocfs2_xattr_block *)new_bh->b_data;
memset(xblk, 0, inode->i_sb->s_blocksize);
- strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
+ strscpy(xblk->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE);
xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot);
xblk->xb_suballoc_loc = cpu_to_le64(suballoc_loc);
xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
@@ -3040,8 +3070,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
clusters_add += new_clusters;
credits += ocfs2_calc_extend_credits(inode->i_sb,
- &def_xv.xv.xr_list,
- new_clusters);
+ &def_xv.xv.xr_list);
}
goto meta_guess;
@@ -3106,8 +3135,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
if (!ocfs2_xattr_is_local(xe))
credits += ocfs2_calc_extend_credits(
inode->i_sb,
- &def_xv.xv.xr_list,
- new_clusters);
+ &def_xv.xv.xr_list);
goto out;
}
}
@@ -3132,9 +3160,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
meta_add += ocfs2_extend_meta_needed(&xv->xr_list);
clusters_add += new_clusters - old_clusters;
credits += ocfs2_calc_extend_credits(inode->i_sb,
- &xv->xr_list,
- new_clusters -
- old_clusters);
+ &xv->xr_list);
if (value_size >= OCFS2_XATTR_ROOT_SIZE)
goto out;
}
@@ -3180,7 +3206,7 @@ meta_guess:
&xb->xb_attrs.xb_root.xt_list;
meta_add += ocfs2_extend_meta_needed(el);
credits += ocfs2_calc_extend_credits(inode->i_sb,
- el, 1);
+ el);
} else
credits += OCFS2_SUBALLOC_ALLOC + 1;
@@ -3199,8 +3225,15 @@ meta_guess:
clusters_add += 1;
}
} else {
- meta_add += 1;
credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
+ if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
+ struct ocfs2_extent_list *el = &def_xv.xv.xr_list;
+ meta_add += ocfs2_extend_meta_needed(el);
+ credits += ocfs2_calc_extend_credits(inode->i_sb,
+ el);
+ } else {
+ meta_add += 1;
+ }
}
out:
if (clusters_need)
@@ -3396,9 +3429,9 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
goto out;
}
- inode->i_ctime = CURRENT_TIME;
- di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
- di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ inode_set_ctime_current(inode);
+ di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+ di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
ocfs2_journal_dirty(ctxt->handle, xis->inode_bh);
}
out:
@@ -3502,11 +3535,12 @@ int ocfs2_xattr_set(struct inode *inode,
{
struct buffer_head *di_bh = NULL;
struct ocfs2_dinode *di;
- int ret, credits, ref_meta = 0, ref_credits = 0;
+ int ret, credits, had_lock, ref_meta = 0, ref_credits = 0;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct inode *tl_inode = osb->osb_tl_inode;
- struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
+ struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, NULL, };
struct ocfs2_refcount_tree *ref_tree = NULL;
+ struct ocfs2_lock_holder oh;
struct ocfs2_xattr_info xi = {
.xi_name_index = name_index,
@@ -3524,7 +3558,7 @@ int ocfs2_xattr_set(struct inode *inode,
.not_found = -ENODATA,
};
- if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
+ if (!ocfs2_supports_xattr(osb))
return -EOPNOTSUPP;
/*
@@ -3537,8 +3571,9 @@ int ocfs2_xattr_set(struct inode *inode,
return -ENOMEM;
}
- ret = ocfs2_inode_lock(inode, &di_bh, 1);
- if (ret < 0) {
+ had_lock = ocfs2_inode_lock_tracker(inode, &di_bh, 1, &oh);
+ if (had_lock < 0) {
+ ret = had_lock;
mlog_errno(ret);
goto cleanup_nolock;
}
@@ -3573,7 +3608,7 @@ int ocfs2_xattr_set(struct inode *inode,
}
/* Check whether the value is refcounted and do some preparation. */
- if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL &&
+ if (ocfs2_is_refcount_inode(inode) &&
(!xis.not_found || !xbs.not_found)) {
ret = ocfs2_prepare_refcount_xattr(inode, di, &xi,
&xis, &xbs, &ref_tree,
@@ -3584,17 +3619,17 @@ int ocfs2_xattr_set(struct inode *inode,
}
}
- mutex_lock(&tl_inode->i_mutex);
+ inode_lock(tl_inode);
if (ocfs2_truncate_log_needs_flush(osb)) {
ret = __ocfs2_flush_truncate_log(osb);
if (ret < 0) {
- mutex_unlock(&tl_inode->i_mutex);
+ inode_unlock(tl_inode);
mlog_errno(ret);
goto cleanup;
}
}
- mutex_unlock(&tl_inode->i_mutex);
+ inode_unlock(tl_inode);
ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis,
&xbs, &ctxt, ref_meta, &credits);
@@ -3609,13 +3644,15 @@ int ocfs2_xattr_set(struct inode *inode,
if (IS_ERR(ctxt.handle)) {
ret = PTR_ERR(ctxt.handle);
mlog_errno(ret);
- goto cleanup;
+ goto out_free_ac;
}
ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt);
+ ocfs2_update_inode_fsync_trans(ctxt.handle, inode, 0);
ocfs2_commit_trans(osb, ctxt.handle);
+out_free_ac:
if (ctxt.data_ac)
ocfs2_free_alloc_context(ctxt.data_ac);
if (ctxt.meta_ac)
@@ -3633,7 +3670,7 @@ cleanup:
if (ret)
mlog_errno(ret);
}
- ocfs2_inode_unlock(inode, 1);
+ ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
cleanup_nolock:
brelse(di_bh);
brelse(xbs.xattr_bh);
@@ -3672,11 +3709,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode,
el = &eb->h_list;
if (el->l_tree_depth) {
- ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "xattr tree block %llu\n", inode->i_ino,
- (unsigned long long)eb_bh->b_blocknr);
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb,
+ "Inode %lu has non zero tree depth in xattr tree block %llu\n",
+ inode->i_ino,
+ (unsigned long long)eb_bh->b_blocknr);
goto out;
}
}
@@ -3691,11 +3727,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode,
}
if (!e_blkno) {
- ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
- "record (%u, %u, 0) in xattr", inode->i_ino,
- le32_to_cpu(rec->e_cpos),
- ocfs2_rec_clusters(el, rec));
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb, "Inode %lu has bad extent record (%u, %u, 0) in xattr\n",
+ inode->i_ino,
+ le32_to_cpu(rec->e_cpos),
+ ocfs2_rec_clusters(el, rec));
goto out;
}
@@ -3792,7 +3827,6 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
int low_bucket = 0, bucket, high_bucket;
struct ocfs2_xattr_bucket *search;
- u32 last_hash;
u64 blkno, lower_blkno = 0;
search = ocfs2_xattr_bucket_new(inode);
@@ -3836,8 +3870,6 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
if (xh->xh_count)
xe = &xh->xh_entries[le16_to_cpu(xh->xh_count) - 1];
- last_hash = le32_to_cpu(xe->xe_name_hash);
-
/* record lower_blkno which may be the insert place. */
lower_blkno = blkno;
@@ -4019,32 +4051,30 @@ static int ocfs2_list_xattr_bucket(struct inode *inode,
int ret = 0, type;
struct ocfs2_xattr_tree_list *xl = (struct ocfs2_xattr_tree_list *)para;
int i, block_off, new_offset;
- const char *prefix, *name;
+ const char *name;
for (i = 0 ; i < le16_to_cpu(bucket_xh(bucket)->xh_count); i++) {
struct ocfs2_xattr_entry *entry = &bucket_xh(bucket)->xh_entries[i];
type = ocfs2_xattr_get_type(entry);
- prefix = ocfs2_xattr_prefix(type);
- if (prefix) {
- ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
- bucket_xh(bucket),
- i,
- &block_off,
- &new_offset);
- if (ret)
- break;
+ ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
+ bucket_xh(bucket),
+ i,
+ &block_off,
+ &new_offset);
+ if (ret)
+ break;
- name = (const char *)bucket_block(bucket, block_off) +
- new_offset;
- ret = ocfs2_xattr_list_entry(xl->buffer,
- xl->buffer_size,
- &xl->result,
- prefix, name,
- entry->xe_name_len);
- if (ret)
- break;
- }
+ name = (const char *)bucket_block(bucket, block_off) +
+ new_offset;
+ ret = ocfs2_xattr_list_entry(inode->i_sb,
+ xl->buffer,
+ xl->buffer_size,
+ &xl->result,
+ type, name,
+ entry->xe_name_len);
+ if (ret)
+ break;
}
return ret;
@@ -4136,15 +4166,6 @@ static int cmp_xe(const void *a, const void *b)
return 0;
}
-static void swap_xe(void *a, void *b, int size)
-{
- struct ocfs2_xattr_entry *l = a, *r = b, tmp;
-
- tmp = *l;
- memcpy(l, r, sizeof(struct ocfs2_xattr_entry));
- memcpy(r, &tmp, sizeof(struct ocfs2_xattr_entry));
-}
-
/*
* When the ocfs2_xattr_block is filled up, new bucket will be created
* and all the xattr entries will be moved to the new bucket.
@@ -4210,7 +4231,7 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
trace_ocfs2_cp_xattr_block_to_bucket_end(offset, size, off_change);
sort(target + offset, count, sizeof(struct ocfs2_xattr_entry),
- cmp_xe, swap_xe);
+ cmp_xe, NULL);
}
/*
@@ -4292,7 +4313,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
trace_ocfs2_xattr_create_index_block((unsigned long long)blkno);
- ret = ocfs2_init_xattr_bucket(xs->bucket, blkno);
+ ret = ocfs2_init_xattr_bucket(xs->bucket, blkno, 1);
if (ret) {
mlog_errno(ret);
goto out;
@@ -4350,7 +4371,7 @@ static int cmp_xe_offset(const void *a, const void *b)
/*
* defrag a xattr bucket if we find that the bucket has some
- * holes beteen name/value pairs.
+ * holes between name/value pairs.
* We will move all the name/value pairs to the end of the bucket
* so that we can spare some space for insertion.
*/
@@ -4405,7 +4426,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
*/
sort(entries, le16_to_cpu(xh->xh_count),
sizeof(struct ocfs2_xattr_entry),
- cmp_xe_offset, swap_xe);
+ cmp_xe_offset, NULL);
/* Move all name/values to the end of the bucket. */
xe = xh->xh_entries;
@@ -4447,7 +4468,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
/* sort the entries by their name_hash. */
sort(entries, le16_to_cpu(xh->xh_count),
sizeof(struct ocfs2_xattr_entry),
- cmp_xe, swap_xe);
+ cmp_xe, NULL);
buf = bucket_buf;
for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize)
@@ -4636,7 +4657,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
* Even if !new_bucket_head, we're overwriting t_bucket. Thus,
* there's no need to read it.
*/
- ret = ocfs2_init_xattr_bucket(t_bucket, new_blk);
+ ret = ocfs2_init_xattr_bucket(t_bucket, new_blk, new_bucket_head);
if (ret) {
mlog_errno(ret);
goto out;
@@ -4802,7 +4823,7 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
* Even if !t_is_new, we're overwriting t_bucket. Thus,
* there's no need to read it.
*/
- ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno);
+ ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno, t_is_new);
if (ret)
goto out;
@@ -4990,7 +5011,7 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode,
* 2. If cluster_size == bucket_size:
* a) If the previous extent rec has more than one cluster and the insert
* place isn't in the last cluster, copy the entire last cluster to the
- * new one. This time, we don't need to upate the first_bh and header_bh
+ * new one. This time, we don't need to update the first_bh and header_bh
* since they will not be moved into the new cluster.
* b) Otherwise, move the bottom half of the xattrs in the last cluster into
* the new one. And we set the extend flag to zero if the insert place is
@@ -5316,16 +5337,6 @@ out:
return ret;
}
-static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
- struct ocfs2_xattr_bucket *bucket,
- int offs)
-{
- int block_off = offs >> inode->i_sb->s_blocksize_bits;
-
- offs = offs % inode->i_sb->s_blocksize;
- return bucket_block(bucket, block_off) + offs;
-}
-
/*
* Truncate the specified xe_off entry in xattr bucket.
* bucket is indicated by header_bh and len is the new length.
@@ -5437,7 +5448,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
return ret;
}
- mutex_lock(&tl_inode->i_mutex);
+ inode_lock(tl_inode);
if (ocfs2_truncate_log_needs_flush(osb)) {
ret = __ocfs2_flush_truncate_log(osb);
@@ -5474,13 +5485,14 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
ret = ocfs2_truncate_log_append(osb, handle, blkno, len);
if (ret)
mlog_errno(ret);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
out_commit:
ocfs2_commit_trans(osb, handle);
out:
ocfs2_schedule_truncate_log_flush(osb, 1);
- mutex_unlock(&tl_inode->i_mutex);
+ inode_unlock(tl_inode);
if (meta_ac)
ocfs2_free_alloc_context(meta_ac);
@@ -5656,6 +5668,10 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket,
i, &xv, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
ret = ocfs2_lock_xattr_remove_allocators(inode, xv,
args->ref_ci,
@@ -5881,6 +5897,10 @@ static int ocfs2_xattr_value_attach_refcount(struct inode *inode,
while (cpos < clusters) {
ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
&num_clusters, el, &ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
cpos += num_clusters;
if ((ext_flags & OCFS2_EXT_REFCOUNTED))
@@ -6169,7 +6189,7 @@ struct ocfs2_xattr_reflink {
/*
* Given a xattr header and xe offset,
* return the proper xv and the corresponding bh.
- * xattr in inode, block and xattr tree have different implementaions.
+ * xattr in inode, block and xattr tree have different implementations.
*/
typedef int (get_xattr_value_root)(struct super_block *sb,
struct buffer_head *bh,
@@ -6211,8 +6231,7 @@ static int ocfs2_value_metas_in_xattr_header(struct super_block *sb,
le16_to_cpu(xv->xr_list.l_next_free_rec);
*credits += ocfs2_calc_extend_credits(sb,
- &def_xv.xv.xr_list,
- le32_to_cpu(xv->xr_clusters));
+ &def_xv.xv.xr_list);
/*
* If the value is a tree with depth > 1, We don't go deep
@@ -6250,7 +6269,7 @@ static int ocfs2_get_xattr_value_root(struct super_block *sb,
}
/*
- * Lock the meta_ac and caculate how much credits we need for reflink xattrs.
+ * Lock the meta_ac and calculate how much credits we need for reflink xattrs.
* It is only used for inline xattr and xattr block.
*/
static int ocfs2_reflink_lock_xattr_allocators(struct ocfs2_super *osb,
@@ -6332,7 +6351,7 @@ static int ocfs2_reflink_xattr_header(handle_t *handle,
trace_ocfs2_reflink_xattr_header((unsigned long long)old_bh->b_blocknr,
le16_to_cpu(xh->xh_count));
- last = &new_xh->xh_entries[le16_to_cpu(new_xh->xh_count)];
+ last = &new_xh->xh_entries[le16_to_cpu(new_xh->xh_count)] - 1;
for (i = 0, j = 0; i < le16_to_cpu(xh->xh_count); i++, j++) {
xe = &xh->xh_entries[i];
@@ -6381,7 +6400,7 @@ static int ocfs2_reflink_xattr_header(handle_t *handle,
* and then insert the extents one by one.
*/
if (xv->xr_list.l_tree_depth) {
- memcpy(new_xv, &def_xv, sizeof(def_xv));
+ memcpy(new_xv, &def_xv, OCFS2_XATTR_ROOT_SIZE);
vb->vb_xv = new_xv;
vb->vb_bh = value_bh;
ocfs2_init_xattr_value_extent_tree(&data_et,
@@ -6491,16 +6510,7 @@ static int ocfs2_reflink_xattr_inline(struct ocfs2_xattr_reflink *args)
}
new_oi = OCFS2_I(args->new_inode);
- /*
- * Adjust extent record count to reserve space for extended attribute.
- * Inline data count had been adjusted in ocfs2_duplicate_inline_data().
- */
- if (!(new_oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) &&
- !(ocfs2_inode_is_fast_symlink(args->new_inode))) {
- struct ocfs2_extent_list *el = &new_di->id2.i_list;
- le16_add_cpu(&el->l_count, -(inline_size /
- sizeof(struct ocfs2_extent_rec)));
- }
+
spin_lock(&new_oi->ip_lock);
new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL | OCFS2_INLINE_XATTR_FL;
new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features);
@@ -6766,7 +6776,7 @@ static int ocfs2_lock_reflink_xattr_rec_allocators(
*credits += 1;
/* count in the xattr tree change. */
- num_free_extents = ocfs2_num_free_extents(osb, xt_et);
+ num_free_extents = ocfs2_num_free_extents(xt_et);
if (num_free_extents < 0) {
ret = num_free_extents;
mlog_errno(ret);
@@ -6777,7 +6787,7 @@ static int ocfs2_lock_reflink_xattr_rec_allocators(
metas.num_metas += ocfs2_extend_meta_needed(xt_et->et_root_el);
*credits += ocfs2_calc_extend_credits(osb->sb,
- xt_et->et_root_el, len);
+ xt_et->et_root_el);
if (metas.num_metas) {
ret = ocfs2_reserve_new_metadata_blocks(osb, metas.num_metas,
@@ -6797,7 +6807,7 @@ out:
if (ret) {
if (*meta_ac) {
ocfs2_free_alloc_context(*meta_ac);
- meta_ac = NULL;
+ *meta_ac = NULL;
}
}
@@ -6825,7 +6835,7 @@ static int ocfs2_reflink_xattr_bucket(handle_t *handle,
break;
}
- ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno);
+ ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno, 1);
if (ret) {
mlog_errno(ret);
break;
@@ -7181,7 +7191,7 @@ out:
* Used for reflink a non-preserve-security file.
*
* It uses common api like ocfs2_xattr_set, so the caller
- * must not hold any lock expect i_mutex.
+ * must not hold any lock expect i_rwsem.
*/
int ocfs2_init_security_and_acl(struct inode *dir,
struct inode *inode,
@@ -7201,7 +7211,6 @@ int ocfs2_init_security_and_acl(struct inode *dir,
mlog_errno(ret);
goto leave;
}
-
ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL);
if (ret)
mlog_errno(ret);
@@ -7211,49 +7220,46 @@ int ocfs2_init_security_and_acl(struct inode *dir,
leave:
return ret;
}
+
/*
* 'security' attributes support
*/
-static size_t ocfs2_xattr_security_list(struct dentry *dentry, char *list,
- size_t list_size, const char *name,
- size_t name_len, int type)
-{
- const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
- memcpy(list + prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
-}
-
-static int ocfs2_xattr_security_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int ocfs2_xattr_security_get(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *buffer, size_t size)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
- return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_SECURITY,
+ return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_SECURITY,
name, buffer, size);
}
-static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+static int ocfs2_xattr_security_set(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap,
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
- return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_SECURITY,
+ return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY,
name, value, size, flags);
}
-int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+static int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
void *fs_info)
{
+ struct ocfs2_security_xattr_info *si = fs_info;
const struct xattr *xattr;
int err = 0;
+ if (si) {
+ si->value = kmemdup(xattr_array->value, xattr_array->value_len,
+ GFP_KERNEL);
+ if (!si->value)
+ return -ENOMEM;
+
+ si->name = xattr_array->name;
+ si->value_len = xattr_array->value_len;
+ return 0;
+ }
+
for (xattr = xattr_array; xattr->name != NULL; xattr++) {
err = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY,
xattr->name, xattr->value,
@@ -7269,13 +7275,23 @@ int ocfs2_init_security_get(struct inode *inode,
const struct qstr *qstr,
struct ocfs2_security_xattr_info *si)
{
+ int ret;
+
/* check whether ocfs2 support feature xattr */
if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb)))
return -EOPNOTSUPP;
- if (si)
- return security_old_inode_init_security(inode, dir, qstr,
- &si->name, &si->value,
- &si->value_len);
+ if (si) {
+ ret = security_inode_init_security(inode, dir, qstr,
+ &ocfs2_initxattrs, si);
+ /*
+ * security_inode_init_security() does not return -EOPNOTSUPP,
+ * we have to check the xattr ourselves.
+ */
+ if (!ret && !si->name)
+ si->enable = 0;
+
+ return ret;
+ }
return security_inode_init_security(inode, dir, qstr,
&ocfs2_initxattrs, NULL);
@@ -7296,7 +7312,6 @@ int ocfs2_init_security_set(handle_t *handle,
const struct xattr_handler ocfs2_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .list = ocfs2_xattr_security_list,
.get = ocfs2_xattr_security_get,
.set = ocfs2_xattr_security_set,
};
@@ -7304,43 +7319,26 @@ const struct xattr_handler ocfs2_xattr_security_handler = {
/*
* 'trusted' attributes support
*/
-static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list,
- size_t list_size, const char *name,
- size_t name_len, int type)
+static int ocfs2_xattr_trusted_get(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *buffer, size_t size)
{
- const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
- memcpy(list + prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
-}
-
-static int ocfs2_xattr_trusted_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
-{
- if (strcmp(name, "") == 0)
- return -EINVAL;
- return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_TRUSTED,
+ return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_TRUSTED,
name, buffer, size);
}
-static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+static int ocfs2_xattr_trusted_set(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap,
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
- return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_TRUSTED,
+ return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_TRUSTED,
name, value, size, flags);
}
const struct xattr_handler ocfs2_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
- .list = ocfs2_xattr_trusted_list,
.get = ocfs2_xattr_trusted_get,
.set = ocfs2_xattr_trusted_set,
};
@@ -7348,55 +7346,35 @@ const struct xattr_handler ocfs2_xattr_trusted_handler = {
/*
* 'user' attributes support
*/
-static size_t ocfs2_xattr_user_list(struct dentry *dentry, char *list,
- size_t list_size, const char *name,
- size_t name_len, int type)
+static int ocfs2_xattr_user_get(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *buffer, size_t size)
{
- const size_t prefix_len = XATTR_USER_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
- struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
-
- if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
- return 0;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_USER_PREFIX, prefix_len);
- memcpy(list + prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
-}
-
-static int ocfs2_xattr_user_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
-{
- struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- if (strcmp(name, "") == 0)
- return -EINVAL;
if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
return -EOPNOTSUPP;
- return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_USER, name,
+ return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_USER, name,
buffer, size);
}
-static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+static int ocfs2_xattr_user_set(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap,
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- if (strcmp(name, "") == 0)
- return -EINVAL;
if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
return -EOPNOTSUPP;
- return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_USER,
+ return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_USER,
name, value, size, flags);
}
const struct xattr_handler ocfs2_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
- .list = ocfs2_xattr_user_list,
.get = ocfs2_xattr_user_get,
.set = ocfs2_xattr_user_set,
};
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index e5c7f15465b4..65e9aa743919 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -1,18 +1,8 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
* xattr.h
*
* Copyright (C) 2004, 2008 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#ifndef OCFS2_XATTR_H
@@ -32,7 +22,7 @@ enum ocfs2_xattr_type {
struct ocfs2_security_xattr_info {
int enable;
- char *name;
+ const char *name;
void *value;
size_t value_len;
};
@@ -40,9 +30,7 @@ struct ocfs2_security_xattr_info {
extern const struct xattr_handler ocfs2_xattr_user_handler;
extern const struct xattr_handler ocfs2_xattr_trusted_handler;
extern const struct xattr_handler ocfs2_xattr_security_handler;
-extern const struct xattr_handler ocfs2_xattr_acl_access_handler;
-extern const struct xattr_handler ocfs2_xattr_acl_default_handler;
-extern const struct xattr_handler *ocfs2_xattr_handlers[];
+extern const struct xattr_handler * const ocfs2_xattr_handlers[];
ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int,