summaryrefslogtreecommitdiff
path: root/fs/ocfs2/dlmglue.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ocfs2/dlmglue.c')
-rw-r--r--fs/ocfs2/dlmglue.c793
1 files changed, 605 insertions, 188 deletions
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 3a44a648dae7..619ff03b15d6 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1,26 +1,10 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
* dlmglue.c
*
* Code which implements an OCFS2 specific interface to our DLM.
*
* Copyright (C) 2003, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/types.h>
@@ -32,7 +16,10 @@
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/time.h>
+#include <linux/delay.h>
#include <linux/quotaops.h>
+#include <linux/sched/signal.h>
+#include <linux/string_choices.h>
#define MLOG_MASK_PREFIX ML_DLM_GLUE
#include <cluster/masklog.h>
@@ -54,6 +41,7 @@
#include "uptodate.h"
#include "quota.h"
#include "refcounttree.h"
+#include "acl.h"
#include "buffer_head_io.h"
@@ -94,7 +82,9 @@ struct ocfs2_unblock_ctl {
};
/* Lockdep class keys */
-struct lock_class_key lockdep_keys[OCFS2_NUM_LOCK_TYPES];
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static struct lock_class_key lockdep_keys[OCFS2_NUM_LOCK_TYPES];
+#endif
static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
int new_level);
@@ -232,12 +222,12 @@ struct ocfs2_lock_res_ops {
*/
#define LOCK_TYPE_USES_LVB 0x2
-static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
+static const struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
.get_osb = ocfs2_get_inode_osb,
.flags = 0,
};
-static struct ocfs2_lock_res_ops ocfs2_inode_inode_lops = {
+static const struct ocfs2_lock_res_ops ocfs2_inode_inode_lops = {
.get_osb = ocfs2_get_inode_osb,
.check_downconvert = ocfs2_check_meta_downconvert,
.set_lvb = ocfs2_set_meta_lvb,
@@ -245,46 +235,50 @@ static struct ocfs2_lock_res_ops ocfs2_inode_inode_lops = {
.flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
};
-static struct ocfs2_lock_res_ops ocfs2_super_lops = {
+static const struct ocfs2_lock_res_ops ocfs2_super_lops = {
.flags = LOCK_TYPE_REQUIRES_REFRESH,
};
-static struct ocfs2_lock_res_ops ocfs2_rename_lops = {
+static const struct ocfs2_lock_res_ops ocfs2_rename_lops = {
.flags = 0,
};
-static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
+static const struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
.flags = 0,
};
-static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = {
+static const struct ocfs2_lock_res_ops ocfs2_trim_fs_lops = {
+ .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
+};
+
+static const struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = {
.flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
};
-static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
+static const struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
.get_osb = ocfs2_get_dentry_osb,
.post_unlock = ocfs2_dentry_post_unlock,
.downconvert_worker = ocfs2_dentry_convert_worker,
.flags = 0,
};
-static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
+static const struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
.get_osb = ocfs2_get_inode_osb,
.flags = 0,
};
-static struct ocfs2_lock_res_ops ocfs2_flock_lops = {
+static const struct ocfs2_lock_res_ops ocfs2_flock_lops = {
.get_osb = ocfs2_get_file_osb,
.flags = 0,
};
-static struct ocfs2_lock_res_ops ocfs2_qinfo_lops = {
+static const struct ocfs2_lock_res_ops ocfs2_qinfo_lops = {
.set_lvb = ocfs2_set_qinfo_lvb,
.get_osb = ocfs2_get_qinfo_osb,
.flags = LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB,
};
-static struct ocfs2_lock_res_ops ocfs2_refcount_block_lops = {
+static const struct ocfs2_lock_res_ops ocfs2_refcount_block_lops = {
.check_downconvert = ocfs2_check_refcount_downconvert,
.downconvert_worker = ocfs2_refcount_convert_worker,
.flags = 0,
@@ -432,6 +426,7 @@ static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res)
static void ocfs2_init_lock_stats(struct ocfs2_lock_res *res)
{
res->l_lock_refresh = 0;
+ res->l_lock_wait = 0;
memset(&res->l_lock_prmode, 0, sizeof(struct ocfs2_lock_stats));
memset(&res->l_lock_exmode, 0, sizeof(struct ocfs2_lock_stats));
}
@@ -466,6 +461,8 @@ static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int level,
if (ret)
stats->ls_fail++;
+
+ stats->ls_last = ktime_to_us(ktime_get_real());
}
static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
@@ -473,6 +470,21 @@ static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
lockres->l_lock_refresh++;
}
+static inline void ocfs2_track_lock_wait(struct ocfs2_lock_res *lockres)
+{
+ struct ocfs2_mask_waiter *mw;
+
+ if (list_empty(&lockres->l_mask_waiters)) {
+ lockres->l_lock_wait = 0;
+ return;
+ }
+
+ mw = list_first_entry(&lockres->l_mask_waiters,
+ struct ocfs2_mask_waiter, mw_item);
+ lockres->l_lock_wait =
+ ktime_to_us(ktime_mono_to_real(mw->mw_lock_start));
+}
+
static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw)
{
mw->mw_lock_start = ktime_get();
@@ -488,6 +500,9 @@ static inline void ocfs2_update_lock_stats(struct ocfs2_lock_res *res,
static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
{
}
+static inline void ocfs2_track_lock_wait(struct ocfs2_lock_res *lockres)
+{
+}
static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw)
{
}
@@ -496,7 +511,7 @@ static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw)
static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
struct ocfs2_lock_res *res,
enum ocfs2_lock_type type,
- struct ocfs2_lock_res_ops *ops,
+ const struct ocfs2_lock_res_ops *ops,
void *priv)
{
res->l_type = type;
@@ -531,6 +546,7 @@ void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res)
init_waitqueue_head(&res->l_event);
INIT_LIST_HEAD(&res->l_blocked_list);
INIT_LIST_HEAD(&res->l_mask_waiters);
+ INIT_LIST_HEAD(&res->l_holders);
}
void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
@@ -538,7 +554,7 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
unsigned int generation,
struct inode *inode)
{
- struct ocfs2_lock_res_ops *ops;
+ const struct ocfs2_lock_res_ops *ops;
switch(type) {
case OCFS2_LOCK_TYPE_RW:
@@ -554,7 +570,7 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
mlog_bug_on_msg(1, "type: %d\n", type);
ops = NULL; /* thanks, gcc */
break;
- };
+ }
ocfs2_build_lock_name(type, OCFS2_I(inode)->ip_blkno,
generation, res->l_name);
@@ -673,6 +689,35 @@ static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
&ocfs2_nfs_sync_lops, osb);
}
+static void ocfs2_nfs_sync_lock_init(struct ocfs2_super *osb)
+{
+ ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
+ init_rwsem(&osb->nfs_sync_rwlock);
+}
+
+void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb)
+{
+ struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
+
+ /* Only one trimfs thread are allowed to work at the same time. */
+ mutex_lock(&osb->obs_trim_fs_mutex);
+
+ ocfs2_lock_res_init_once(lockres);
+ ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name);
+ ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS,
+ &ocfs2_trim_fs_lops, osb);
+}
+
+void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb)
+{
+ struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
+
+ ocfs2_simple_drop_lockres(osb, lockres);
+ ocfs2_lock_res_free(lockres);
+
+ mutex_unlock(&osb->obs_trim_fs_mutex);
+}
+
static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
struct ocfs2_super *osb)
{
@@ -748,6 +793,49 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
res->l_flags = 0UL;
}
+/*
+ * Keep a list of processes who have interest in a lockres.
+ * Note: this is now only used for check recursive cluster locking.
+ */
+static inline void ocfs2_add_holder(struct ocfs2_lock_res *lockres,
+ struct ocfs2_lock_holder *oh)
+{
+ INIT_LIST_HEAD(&oh->oh_list);
+ oh->oh_owner_pid = get_pid(task_pid(current));
+
+ spin_lock(&lockres->l_lock);
+ list_add_tail(&oh->oh_list, &lockres->l_holders);
+ spin_unlock(&lockres->l_lock);
+}
+
+static struct ocfs2_lock_holder *
+ocfs2_pid_holder(struct ocfs2_lock_res *lockres,
+ struct pid *pid)
+{
+ struct ocfs2_lock_holder *oh;
+
+ spin_lock(&lockres->l_lock);
+ list_for_each_entry(oh, &lockres->l_holders, oh_list) {
+ if (oh->oh_owner_pid == pid) {
+ spin_unlock(&lockres->l_lock);
+ return oh;
+ }
+ }
+ spin_unlock(&lockres->l_lock);
+ return NULL;
+}
+
+static inline void ocfs2_remove_holder(struct ocfs2_lock_res *lockres,
+ struct ocfs2_lock_holder *oh)
+{
+ spin_lock(&lockres->l_lock);
+ list_del(&oh->oh_list);
+ spin_unlock(&lockres->l_lock);
+
+ put_pid(oh->oh_owner_pid);
+}
+
+
static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
int level)
{
@@ -814,6 +902,7 @@ static void lockres_set_flags(struct ocfs2_lock_res *lockres,
list_del_init(&mw->mw_item);
mw->mw_status = 0;
complete(&mw->mw_complete);
+ ocfs2_track_lock_wait(lockres);
}
}
static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or)
@@ -861,8 +950,13 @@ static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lo
* We set the OCFS2_LOCK_UPCONVERT_FINISHING flag before clearing
* the OCFS2_LOCK_BUSY flag to prevent the dc thread from
* downconverting the lock before the upconvert has fully completed.
+ * Do not prevent the dc thread from downconverting if NONBLOCK lock
+ * had already returned.
*/
- lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
+ if (!(lockres->l_flags & OCFS2_LOCK_NONBLOCK_FINISHED))
+ lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
+ else
+ lockres_clear_flags(lockres, OCFS2_LOCK_NONBLOCK_FINISHED);
lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
}
@@ -1304,7 +1398,7 @@ static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw)
{
wait_for_completion(&mw->mw_complete);
/* Re-arm the completion in case we want to wait on it again */
- INIT_COMPLETION(mw->mw_complete);
+ reinit_completion(&mw->mw_complete);
return mw->mw_status;
}
@@ -1320,24 +1414,37 @@ static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres,
list_add_tail(&mw->mw_item, &lockres->l_mask_waiters);
mw->mw_mask = mask;
mw->mw_goal = goal;
+ ocfs2_track_lock_wait(lockres);
}
/* returns 0 if the mw that was removed was already satisfied, -EBUSY
* if the mask still hadn't reached its goal */
-static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
+static int __lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
struct ocfs2_mask_waiter *mw)
{
- unsigned long flags;
int ret = 0;
- spin_lock_irqsave(&lockres->l_lock, flags);
+ assert_spin_locked(&lockres->l_lock);
if (!list_empty(&mw->mw_item)) {
if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
ret = -EBUSY;
list_del_init(&mw->mw_item);
init_completion(&mw->mw_complete);
+ ocfs2_track_lock_wait(lockres);
}
+
+ return ret;
+}
+
+static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
+ struct ocfs2_mask_waiter *mw)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ spin_lock_irqsave(&lockres->l_lock, flags);
+ ret = __lockres_remove_mask_waiter(lockres, mw);
spin_unlock_irqrestore(&lockres->l_lock, flags);
return ret;
@@ -1355,7 +1462,7 @@ static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw,
else
ret = mw->mw_status;
/* Re-arm the completion in case we want to wait on it again */
- INIT_COMPLETION(mw->mw_complete);
+ reinit_completion(&mw->mw_complete);
return ret;
}
@@ -1373,6 +1480,13 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb,
unsigned long flags;
unsigned int gen;
int noqueue_attempted = 0;
+ int dlm_locked = 0;
+ int kick_dc = 0;
+
+ if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) {
+ mlog_errno(-EINVAL);
+ return -EINVAL;
+ }
ocfs2_init_mask_waiter(&mw);
@@ -1481,6 +1595,7 @@ again:
ocfs2_recover_from_dlm_error(lockres, 1);
goto out;
}
+ dlm_locked = 1;
mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n",
lockres->l_name);
@@ -1501,7 +1616,12 @@ update_holders:
unlock:
lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
+ /* ocfs2_unblock_lock request on seeing OCFS2_LOCK_UPCONVERT_FINISHING */
+ kick_dc = (lockres->l_flags & OCFS2_LOCK_BLOCKED);
+
spin_unlock_irqrestore(&lockres->l_lock, flags);
+ if (kick_dc)
+ ocfs2_wake_downconvert_thread(osb);
out:
/*
* This is helping work around a lock inversion between the page lock
@@ -1514,10 +1634,17 @@ out:
if (wait && arg_flags & OCFS2_LOCK_NONBLOCK &&
mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) {
wait = 0;
- if (lockres_remove_mask_waiter(lockres, &mw))
+ spin_lock_irqsave(&lockres->l_lock, flags);
+ if (__lockres_remove_mask_waiter(lockres, &mw)) {
+ if (dlm_locked)
+ lockres_or_flags(lockres,
+ OCFS2_LOCK_NONBLOCK_FINISHED);
+ spin_unlock_irqrestore(&lockres->l_lock, flags);
ret = -EAGAIN;
- else
+ } else {
+ spin_unlock_irqrestore(&lockres->l_lock, flags);
goto again;
+ }
}
if (wait) {
ret = ocfs2_wait_for_mask(&mw);
@@ -1566,7 +1693,7 @@ static void __ocfs2_cluster_unlock(struct ocfs2_super *osb,
spin_unlock_irqrestore(&lockres->l_lock, flags);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
if (lockres->l_lockdep_map.key != NULL)
- rwsem_release(&lockres->l_lockdep_map, 1, caller_ip);
+ rwsem_release(&lockres->l_lockdep_map, caller_ip);
#endif
}
@@ -1598,7 +1725,6 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
int ret;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- BUG_ON(!inode);
BUG_ON(!ocfs2_inode_is_new(inode));
mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -1628,10 +1754,8 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
}
ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0);
- if (ret) {
+ if (ret)
mlog_errno(ret);
- goto bail;
- }
bail:
return ret;
@@ -1643,8 +1767,6 @@ int ocfs2_rw_lock(struct inode *inode, int write)
struct ocfs2_lock_res *lockres;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- BUG_ON(!inode);
-
mlog(0, "inode %llu take %s RW lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
write ? "EXMODE" : "PRMODE");
@@ -1656,14 +1778,34 @@ int ocfs2_rw_lock(struct inode *inode, int write)
level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
- status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0,
- 0);
+ status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
if (status < 0)
mlog_errno(status);
return status;
}
+int ocfs2_try_rw_lock(struct inode *inode, int write)
+{
+ int status, level;
+ struct ocfs2_lock_res *lockres;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ mlog(0, "inode %llu try to take %s RW lock\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ write ? "EXMODE" : "PRMODE");
+
+ if (ocfs2_mount_local(osb))
+ return 0;
+
+ lockres = &OCFS2_I(inode)->ip_rw_lockres;
+
+ level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
+
+ status = ocfs2_cluster_lock(osb, lockres, level, DLM_LKF_NOQUEUE, 0);
+ return status;
+}
+
void ocfs2_rw_unlock(struct inode *inode, int write)
{
int level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
@@ -1675,7 +1817,7 @@ void ocfs2_rw_unlock(struct inode *inode, int write)
write ? "EXMODE" : "PRMODE");
if (!ocfs2_mount_local(osb))
- ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
+ ocfs2_cluster_unlock(osb, lockres, level);
}
/*
@@ -1687,8 +1829,6 @@ int ocfs2_open_lock(struct inode *inode)
struct ocfs2_lock_res *lockres;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- BUG_ON(!inode);
-
mlog(0, "inode %llu take PRMODE open lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -1697,8 +1837,7 @@ int ocfs2_open_lock(struct inode *inode)
lockres = &OCFS2_I(inode)->ip_open_lockres;
- status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
- DLM_LOCK_PR, 0, 0);
+ status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_PR, 0, 0);
if (status < 0)
mlog_errno(status);
@@ -1712,8 +1851,6 @@ int ocfs2_try_open_lock(struct inode *inode, int write)
struct ocfs2_lock_res *lockres;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- BUG_ON(!inode);
-
mlog(0, "inode %llu try to take %s open lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
write ? "EXMODE" : "PRMODE");
@@ -1737,8 +1874,7 @@ int ocfs2_try_open_lock(struct inode *inode, int write)
* other nodes and the -EAGAIN will indicate to the caller that
* this inode is still in use.
*/
- status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
- level, DLM_LKF_NOQUEUE, 0);
+ status = ocfs2_cluster_lock(osb, lockres, level, DLM_LKF_NOQUEUE, 0);
out:
return status;
@@ -1759,11 +1895,9 @@ void ocfs2_open_unlock(struct inode *inode)
goto out;
if(lockres->l_ro_holders)
- ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
- DLM_LOCK_PR);
+ ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_PR);
if(lockres->l_ex_holders)
- ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
- DLM_LOCK_EX);
+ ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
out:
return;
@@ -2005,15 +2139,15 @@ static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
}
#define OCFS2_SEC_BITS 34
-#define OCFS2_SEC_SHIFT (64 - 34)
+#define OCFS2_SEC_SHIFT (64 - OCFS2_SEC_BITS)
#define OCFS2_NSEC_MASK ((1ULL << OCFS2_SEC_SHIFT) - 1)
/* LVB only has room for 64 bits of time here so we pack it for
* now. */
-static u64 ocfs2_pack_timespec(struct timespec *spec)
+static u64 ocfs2_pack_timespec(struct timespec64 *spec)
{
u64 res;
- u64 sec = spec->tv_sec;
+ u64 sec = clamp_t(time64_t, spec->tv_sec, 0, 0x3ffffffffull);
u32 nsec = spec->tv_nsec;
res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK);
@@ -2029,6 +2163,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
struct ocfs2_meta_lvb *lvb;
+ struct timespec64 ts;
lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
@@ -2049,12 +2184,12 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
lvb->lvb_igid = cpu_to_be32(i_gid_read(inode));
lvb->lvb_imode = cpu_to_be16(inode->i_mode);
lvb->lvb_inlink = cpu_to_be16(inode->i_nlink);
- lvb->lvb_iatime_packed =
- cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime));
- lvb->lvb_ictime_packed =
- cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime));
- lvb->lvb_imtime_packed =
- cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));
+ ts = inode_get_atime(inode);
+ lvb->lvb_iatime_packed = cpu_to_be64(ocfs2_pack_timespec(&ts));
+ ts = inode_get_ctime(inode);
+ lvb->lvb_ictime_packed = cpu_to_be64(ocfs2_pack_timespec(&ts));
+ ts = inode_get_mtime(inode);
+ lvb->lvb_imtime_packed = cpu_to_be64(ocfs2_pack_timespec(&ts));
lvb->lvb_iattr = cpu_to_be32(oi->ip_attr);
lvb->lvb_idynfeatures = cpu_to_be16(oi->ip_dyn_features);
lvb->lvb_igeneration = cpu_to_be32(inode->i_generation);
@@ -2063,22 +2198,25 @@ out:
mlog_meta_lvb(0, lockres);
}
-static void ocfs2_unpack_timespec(struct timespec *spec,
+static void ocfs2_unpack_timespec(struct timespec64 *spec,
u64 packed_time)
{
spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT;
spec->tv_nsec = packed_time & OCFS2_NSEC_MASK;
}
-static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
+static int ocfs2_refresh_inode_from_lvb(struct inode *inode)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
struct ocfs2_meta_lvb *lvb;
+ struct timespec64 ts;
mlog_meta_lvb(0, lockres);
lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+ if (inode_wrong_type(inode, be16_to_cpu(lvb->lvb_imode)))
+ return -ESTALE;
/* We're safe here without the lockres lock... */
spin_lock(&oi->ip_lock);
@@ -2099,13 +2237,14 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
i_gid_write(inode, be32_to_cpu(lvb->lvb_igid));
inode->i_mode = be16_to_cpu(lvb->lvb_imode);
set_nlink(inode, be16_to_cpu(lvb->lvb_inlink));
- ocfs2_unpack_timespec(&inode->i_atime,
- be64_to_cpu(lvb->lvb_iatime_packed));
- ocfs2_unpack_timespec(&inode->i_mtime,
- be64_to_cpu(lvb->lvb_imtime_packed));
- ocfs2_unpack_timespec(&inode->i_ctime,
- be64_to_cpu(lvb->lvb_ictime_packed));
+ ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_iatime_packed));
+ inode_set_atime_to_ts(inode, ts);
+ ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_imtime_packed));
+ inode_set_mtime_to_ts(inode, ts);
+ ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_ictime_packed));
+ inode_set_ctime_to_ts(inode, ts);
spin_unlock(&oi->ip_lock);
+ return 0;
}
static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
@@ -2208,7 +2347,8 @@ static int ocfs2_inode_lock_update(struct inode *inode,
if (ocfs2_meta_lvb_is_trustable(inode, lockres)) {
mlog(0, "Trusting LVB on inode %llu\n",
(unsigned long long)oi->ip_blkno);
- ocfs2_refresh_inode_from_lvb(inode);
+ status = ocfs2_refresh_inode_from_lvb(inode);
+ goto bail_refresh;
} else {
/* Boo, we have to go to disk. */
/* read bh, cast, ocfs2_refresh_inode */
@@ -2218,6 +2358,10 @@ static int ocfs2_inode_lock_update(struct inode *inode,
goto bail_refresh;
}
fe = (struct ocfs2_dinode *) (*bh)->b_data;
+ if (inode_wrong_type(inode, le16_to_cpu(fe->i_mode))) {
+ status = -ESTALE;
+ goto bail_refresh;
+ }
/* This is a good chance to make sure we're not
* locking an invalid object. ocfs2_read_inode_block()
@@ -2291,8 +2435,6 @@ int ocfs2_inode_lock_full_nested(struct inode *inode,
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct buffer_head *local_bh = NULL;
- BUG_ON(!inode);
-
mlog(0, "inode %llu, take %s META lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
ex ? "EXMODE" : "PRMODE");
@@ -2307,8 +2449,9 @@ int ocfs2_inode_lock_full_nested(struct inode *inode,
goto getbh;
}
- if (ocfs2_mount_local(osb))
- goto local;
+ if ((arg_flags & OCFS2_META_LOCK_GETBH) ||
+ ocfs2_mount_local(osb))
+ goto update;
if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
ocfs2_wait_for_recovery(osb);
@@ -2337,14 +2480,14 @@ int ocfs2_inode_lock_full_nested(struct inode *inode,
if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
ocfs2_wait_for_recovery(osb);
-local:
+update:
/*
* We only see this flag if we're being called from
* ocfs2_read_locked_inode(). It means we're locking an inode
* which hasn't been populated yet, so clear the refresh flag
* and let the caller handle it.
*/
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
status = 0;
if (lockres)
ocfs2_complete_lock_res_refresh(lockres, 0);
@@ -2381,44 +2524,41 @@ bail:
ocfs2_inode_unlock(inode, ex);
}
- if (local_bh)
- brelse(local_bh);
-
+ brelse(local_bh);
return status;
}
/*
* This is working around a lock inversion between tasks acquiring DLM
- * locks while holding a page lock and the downconvert thread which
- * blocks dlm lock acquiry while acquiring page locks.
+ * locks while holding a folio lock and the downconvert thread which
+ * blocks dlm lock acquiry while acquiring folio locks.
*
- * ** These _with_page variantes are only intended to be called from aop
- * methods that hold page locks and return a very specific *positive* error
+ * ** These _with_folio variants are only intended to be called from aop
+ * methods that hold folio locks and return a very specific *positive* error
* code that aop methods pass up to the VFS -- test for errors with != 0. **
*
* The DLM is called such that it returns -EAGAIN if it would have
* blocked waiting for the downconvert thread. In that case we unlock
- * our page so the downconvert thread can make progress. Once we've
+ * our folio so the downconvert thread can make progress. Once we've
* done this we have to return AOP_TRUNCATED_PAGE so the aop method
* that called us can bubble that back up into the VFS who will then
* immediately retry the aop call.
- *
- * We do a blocking lock and immediate unlock before returning, though, so that
- * the lock has a great chance of being cached on this node by the time the VFS
- * calls back to retry the aop. This has a potential to livelock as nodes
- * ping locks back and forth, but that's a risk we're willing to take to avoid
- * the lock inversion simply.
*/
-int ocfs2_inode_lock_with_page(struct inode *inode,
- struct buffer_head **ret_bh,
- int ex,
- struct page *page)
+int ocfs2_inode_lock_with_folio(struct inode *inode,
+ struct buffer_head **ret_bh, int ex, struct folio *folio)
{
int ret;
ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
if (ret == -EAGAIN) {
- unlock_page(page);
+ folio_unlock(folio);
+ /*
+ * If we can't get inode lock immediately, we should not return
+ * directly here, since this will lead to a softlockup problem.
+ * The method is to get a blocking lock and immediately unlock
+ * before returning, this can avoid CPU resource waste due to
+ * lots of retries, and benefits fairness in getting lock.
+ */
if (ocfs2_inode_lock(inode, ret_bh, ex) == 0)
ocfs2_inode_unlock(inode, ex);
ret = AOP_TRUNCATED_PAGE;
@@ -2429,13 +2569,18 @@ int ocfs2_inode_lock_with_page(struct inode *inode,
int ocfs2_inode_lock_atime(struct inode *inode,
struct vfsmount *vfsmnt,
- int *level)
+ int *level, int wait)
{
int ret;
- ret = ocfs2_inode_lock(inode, NULL, 0);
+ if (wait)
+ ret = ocfs2_inode_lock(inode, NULL, 0);
+ else
+ ret = ocfs2_try_inode_lock(inode, NULL, 0);
+
if (ret < 0) {
- mlog_errno(ret);
+ if (ret != -EAGAIN)
+ mlog_errno(ret);
return ret;
}
@@ -2447,16 +2592,20 @@ int ocfs2_inode_lock_atime(struct inode *inode,
struct buffer_head *bh = NULL;
ocfs2_inode_unlock(inode, 0);
- ret = ocfs2_inode_lock(inode, &bh, 1);
+ if (wait)
+ ret = ocfs2_inode_lock(inode, &bh, 1);
+ else
+ ret = ocfs2_try_inode_lock(inode, &bh, 1);
+
if (ret < 0) {
- mlog_errno(ret);
+ if (ret != -EAGAIN)
+ mlog_errno(ret);
return ret;
}
*level = 1;
if (ocfs2_should_update_atime(inode, vfsmnt))
ocfs2_update_inode_atime(inode, bh);
- if (bh)
- brelse(bh);
+ brelse(bh);
} else
*level = 0;
@@ -2474,9 +2623,126 @@ void ocfs2_inode_unlock(struct inode *inode,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
ex ? "EXMODE" : "PRMODE");
- if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) &&
+ if (!ocfs2_is_hard_readonly(osb) &&
!ocfs2_mount_local(osb))
- ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
+ ocfs2_cluster_unlock(osb, lockres, level);
+}
+
+/*
+ * This _tracker variants are introduced to deal with the recursive cluster
+ * locking issue. The idea is to keep track of a lock holder on the stack of
+ * the current process. If there's a lock holder on the stack, we know the
+ * task context is already protected by cluster locking. Currently, they're
+ * used in some VFS entry routines.
+ *
+ * return < 0 on error, return == 0 if there's no lock holder on the stack
+ * before this call, return == 1 if this call would be a recursive locking.
+ * return == -1 if this lock attempt will cause an upgrade which is forbidden.
+ *
+ * When taking lock levels into account,we face some different situations.
+ *
+ * 1. no lock is held
+ * In this case, just lock the inode as requested and return 0
+ *
+ * 2. We are holding a lock
+ * For this situation, things diverges into several cases
+ *
+ * wanted holding what to do
+ * ex ex see 2.1 below
+ * ex pr see 2.2 below
+ * pr ex see 2.1 below
+ * pr pr see 2.1 below
+ *
+ * 2.1 lock level that is been held is compatible
+ * with the wanted level, so no lock action will be tacken.
+ *
+ * 2.2 Otherwise, an upgrade is needed, but it is forbidden.
+ *
+ * Reason why upgrade within a process is forbidden is that
+ * lock upgrade may cause dead lock. The following illustrates
+ * how it happens.
+ *
+ * thread on node1 thread on node2
+ * ocfs2_inode_lock_tracker(ex=0)
+ *
+ * <====== ocfs2_inode_lock_tracker(ex=1)
+ *
+ * ocfs2_inode_lock_tracker(ex=1)
+ */
+int ocfs2_inode_lock_tracker(struct inode *inode,
+ struct buffer_head **ret_bh,
+ int ex,
+ struct ocfs2_lock_holder *oh)
+{
+ int status = 0;
+ struct ocfs2_lock_res *lockres;
+ struct ocfs2_lock_holder *tmp_oh;
+ struct pid *pid = task_pid(current);
+
+
+ lockres = &OCFS2_I(inode)->ip_inode_lockres;
+ tmp_oh = ocfs2_pid_holder(lockres, pid);
+
+ if (!tmp_oh) {
+ /*
+ * This corresponds to the case 1.
+ * We haven't got any lock before.
+ */
+ status = ocfs2_inode_lock_full(inode, ret_bh, ex, 0);
+ if (status < 0) {
+ if (status != -ENOENT)
+ mlog_errno(status);
+ return status;
+ }
+
+ oh->oh_ex = ex;
+ ocfs2_add_holder(lockres, oh);
+ return 0;
+ }
+
+ if (unlikely(ex && !tmp_oh->oh_ex)) {
+ /*
+ * case 2.2 upgrade may cause dead lock, forbid it.
+ */
+ mlog(ML_ERROR, "Recursive locking is not permitted to "
+ "upgrade to EX level from PR level.\n");
+ dump_stack();
+ return -EINVAL;
+ }
+
+ /*
+ * case 2.1 OCFS2_META_LOCK_GETBH flag make ocfs2_inode_lock_full.
+ * ignore the lock level and just update it.
+ */
+ if (ret_bh) {
+ status = ocfs2_inode_lock_full(inode, ret_bh, ex,
+ OCFS2_META_LOCK_GETBH);
+ if (status < 0) {
+ if (status != -ENOENT)
+ mlog_errno(status);
+ return status;
+ }
+ }
+ return 1;
+}
+
+void ocfs2_inode_unlock_tracker(struct inode *inode,
+ int ex,
+ struct ocfs2_lock_holder *oh,
+ int had_lock)
+{
+ struct ocfs2_lock_res *lockres;
+
+ lockres = &OCFS2_I(inode)->ip_inode_lockres;
+ /* had_lock means that the current process already takes the cluster
+ * lock previously.
+ * If had_lock is 1, we have nothing to do here.
+ * If had_lock is 0, we will release the lock.
+ */
+ if (!had_lock) {
+ ocfs2_inode_unlock(inode, oh->oh_ex);
+ ocfs2_remove_holder(lockres, oh);
+ }
}
int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno)
@@ -2544,11 +2810,6 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
* refreshed, so we do it here. Of course, making sense of
* everything is up to the caller :) */
status = ocfs2_should_refresh_lock_res(lockres);
- if (status < 0) {
- ocfs2_cluster_unlock(osb, lockres, level);
- mlog_errno(status);
- goto bail;
- }
if (status) {
status = ocfs2_refresh_slot_info(osb);
@@ -2608,14 +2869,25 @@ int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex)
if (ocfs2_is_hard_readonly(osb))
return -EROFS;
+ if (ex)
+ down_write(&osb->nfs_sync_rwlock);
+ else
+ down_read(&osb->nfs_sync_rwlock);
+
if (ocfs2_mount_local(osb))
return 0;
status = ocfs2_cluster_lock(osb, lockres, ex ? LKM_EXMODE : LKM_PRMODE,
0, 0);
- if (status < 0)
+ if (status < 0) {
mlog(ML_ERROR, "lock on nfs sync lock failed %d\n", status);
+ if (ex)
+ up_write(&osb->nfs_sync_rwlock);
+ else
+ up_read(&osb->nfs_sync_rwlock);
+ }
+
return status;
}
@@ -2626,6 +2898,74 @@ void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex)
if (!ocfs2_mount_local(osb))
ocfs2_cluster_unlock(osb, lockres,
ex ? LKM_EXMODE : LKM_PRMODE);
+ if (ex)
+ up_write(&osb->nfs_sync_rwlock);
+ else
+ up_read(&osb->nfs_sync_rwlock);
+}
+
+int ocfs2_trim_fs_lock(struct ocfs2_super *osb,
+ struct ocfs2_trim_fs_info *info, int trylock)
+{
+ int status;
+ struct ocfs2_trim_fs_lvb *lvb;
+ struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
+
+ if (info)
+ info->tf_valid = 0;
+
+ if (ocfs2_is_hard_readonly(osb))
+ return -EROFS;
+
+ if (ocfs2_mount_local(osb))
+ return 0;
+
+ status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX,
+ trylock ? DLM_LKF_NOQUEUE : 0, 0);
+ if (status < 0) {
+ if (status != -EAGAIN)
+ mlog_errno(status);
+ return status;
+ }
+
+ if (info) {
+ lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+ if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) &&
+ lvb->lvb_version == OCFS2_TRIMFS_LVB_VERSION) {
+ info->tf_valid = 1;
+ info->tf_success = lvb->lvb_success;
+ info->tf_nodenum = be32_to_cpu(lvb->lvb_nodenum);
+ info->tf_start = be64_to_cpu(lvb->lvb_start);
+ info->tf_len = be64_to_cpu(lvb->lvb_len);
+ info->tf_minlen = be64_to_cpu(lvb->lvb_minlen);
+ info->tf_trimlen = be64_to_cpu(lvb->lvb_trimlen);
+ }
+ }
+
+ return status;
+}
+
+void ocfs2_trim_fs_unlock(struct ocfs2_super *osb,
+ struct ocfs2_trim_fs_info *info)
+{
+ struct ocfs2_trim_fs_lvb *lvb;
+ struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
+
+ if (ocfs2_mount_local(osb))
+ return;
+
+ if (info) {
+ lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+ lvb->lvb_version = OCFS2_TRIMFS_LVB_VERSION;
+ lvb->lvb_success = info->tf_success;
+ lvb->lvb_nodenum = cpu_to_be32(info->tf_nodenum);
+ lvb->lvb_start = cpu_to_be64(info->tf_start);
+ lvb->lvb_len = cpu_to_be64(info->tf_len);
+ lvb->lvb_minlen = cpu_to_be64(info->tf_minlen);
+ lvb->lvb_trimlen = cpu_to_be64(info->tf_trimlen);
+ }
+
+ ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
}
int ocfs2_dentry_lock(struct dentry *dentry, int ex)
@@ -2698,7 +3038,7 @@ struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void)
kref_init(&dlm_debug->d_refcnt);
INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking);
- dlm_debug->d_locking_state = NULL;
+ dlm_debug->d_filter_secs = 0;
out:
return dlm_debug;
}
@@ -2769,6 +3109,7 @@ static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos)
struct ocfs2_lock_res *iter = v;
struct ocfs2_lock_res *dummy = &priv->p_iter_res;
+ (*pos)++;
spin_lock(&ocfs2_dlm_tracking_lock);
iter = ocfs2_dlm_next_res(iter, priv);
list_del_init(&dummy->l_debug_list);
@@ -2789,17 +3130,40 @@ static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos)
* - Lock stats printed
* New in version 3
* - Max time in lock stats is in usecs (instead of nsecs)
+ * New in version 4
+ * - Add last pr/ex unlock times and first lock wait time in usecs
*/
-#define OCFS2_DLM_DEBUG_STR_VERSION 3
+#define OCFS2_DLM_DEBUG_STR_VERSION 4
static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
{
int i;
char *lvb;
struct ocfs2_lock_res *lockres = v;
+#ifdef CONFIG_OCFS2_FS_STATS
+ u64 now, last;
+ struct ocfs2_dlm_debug *dlm_debug =
+ ((struct ocfs2_dlm_seq_priv *)m->private)->p_dlm_debug;
+#endif
if (!lockres)
return -EINVAL;
+#ifdef CONFIG_OCFS2_FS_STATS
+ if (!lockres->l_lock_wait && dlm_debug->d_filter_secs) {
+ now = ktime_to_us(ktime_get_real());
+ last = max(lockres->l_lock_prmode.ls_last,
+ lockres->l_lock_exmode.ls_last);
+ /*
+ * Use d_filter_secs field to filter lock resources dump,
+ * the default d_filter_secs(0) value filters nothing,
+ * otherwise, only dump the last N seconds active lock
+ * resources.
+ */
+ if (div_u64(now - last, 1000000) > dlm_debug->d_filter_secs)
+ return 0;
+ }
+#endif
+
seq_printf(m, "0x%x\t", OCFS2_DLM_DEBUG_STR_VERSION);
if (lockres->l_type == OCFS2_LOCK_TYPE_DENTRY)
@@ -2841,6 +3205,9 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
# define lock_max_prmode(_l) ((_l)->l_lock_prmode.ls_max)
# define lock_max_exmode(_l) ((_l)->l_lock_exmode.ls_max)
# define lock_refresh(_l) ((_l)->l_lock_refresh)
+# define lock_last_prmode(_l) ((_l)->l_lock_prmode.ls_last)
+# define lock_last_exmode(_l) ((_l)->l_lock_exmode.ls_last)
+# define lock_wait(_l) ((_l)->l_lock_wait)
#else
# define lock_num_prmode(_l) (0)
# define lock_num_exmode(_l) (0)
@@ -2851,6 +3218,9 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
# define lock_max_prmode(_l) (0)
# define lock_max_exmode(_l) (0)
# define lock_refresh(_l) (0)
+# define lock_last_prmode(_l) (0ULL)
+# define lock_last_exmode(_l) (0ULL)
+# define lock_wait(_l) (0ULL)
#endif
/* The following seq_print was added in version 2 of this output */
seq_printf(m, "%u\t"
@@ -2861,7 +3231,10 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
"%llu\t"
"%u\t"
"%u\t"
- "%u\t",
+ "%u\t"
+ "%llu\t"
+ "%llu\t"
+ "%llu\t",
lock_num_prmode(lockres),
lock_num_exmode(lockres),
lock_num_prmode_failed(lockres),
@@ -2870,7 +3243,10 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
lock_total_exmode(lockres),
lock_max_prmode(lockres),
lock_max_exmode(lockres),
- lock_refresh(lockres));
+ lock_refresh(lockres),
+ lock_last_prmode(lockres),
+ lock_last_exmode(lockres),
+ lock_wait(lockres));
/* End the line */
seq_printf(m, "\n");
@@ -2897,37 +3273,24 @@ static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file)
static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
{
- int ret;
struct ocfs2_dlm_seq_priv *priv;
- struct seq_file *seq;
struct ocfs2_super *osb;
- priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL);
+ priv = __seq_open_private(file, &ocfs2_dlm_seq_ops, sizeof(*priv));
if (!priv) {
- ret = -ENOMEM;
- mlog_errno(ret);
- goto out;
+ mlog_errno(-ENOMEM);
+ return -ENOMEM;
}
+
osb = inode->i_private;
ocfs2_get_dlm_debug(osb->osb_dlm_debug);
priv->p_dlm_debug = osb->osb_dlm_debug;
INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list);
- ret = seq_open(file, &ocfs2_dlm_seq_ops);
- if (ret) {
- kfree(priv);
- mlog_errno(ret);
- goto out;
- }
-
- seq = file->private_data;
- seq->private = priv;
-
ocfs2_add_lockres_tracking(&priv->p_iter_res,
priv->p_dlm_debug);
-out:
- return ret;
+ return 0;
}
static const struct file_operations ocfs2_dlm_debug_fops = {
@@ -2937,36 +3300,24 @@ static const struct file_operations ocfs2_dlm_debug_fops = {
.llseek = seq_lseek,
};
-static int ocfs2_dlm_init_debug(struct ocfs2_super *osb)
+static void ocfs2_dlm_init_debug(struct ocfs2_super *osb)
{
- int ret = 0;
struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
- dlm_debug->d_locking_state = debugfs_create_file("locking_state",
- S_IFREG|S_IRUSR,
- osb->osb_debug_root,
- osb,
- &ocfs2_dlm_debug_fops);
- if (!dlm_debug->d_locking_state) {
- ret = -EINVAL;
- mlog(ML_ERROR,
- "Unable to create locking state debugfs file.\n");
- goto out;
- }
+ debugfs_create_file("locking_state", S_IFREG|S_IRUSR,
+ osb->osb_debug_root, osb, &ocfs2_dlm_debug_fops);
+ debugfs_create_u32("locking_filter", 0600, osb->osb_debug_root,
+ &dlm_debug->d_filter_secs);
ocfs2_get_dlm_debug(dlm_debug);
-out:
- return ret;
}
static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
{
struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
- if (dlm_debug) {
- debugfs_remove(dlm_debug->d_locking_state);
+ if (dlm_debug)
ocfs2_put_dlm_debug(dlm_debug);
- }
}
int ocfs2_dlm_init(struct ocfs2_super *osb)
@@ -2979,14 +3330,11 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
goto local;
}
- status = ocfs2_dlm_init_debug(osb);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_dlm_init_debug(osb);
/* launch downconvert thread */
- osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc");
+ osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc-%s",
+ osb->uuid_str);
if (IS_ERR(osb->dc_task)) {
status = PTR_ERR(osb->dc_task);
osb->dc_task = NULL;
@@ -2996,6 +3344,8 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
/* for now, uuid == domain */
status = ocfs2_cluster_connect(osb->osb_cluster_stack,
+ osb->osb_cluster_name,
+ strlen(osb->osb_cluster_name),
osb->uuid_str,
strlen(osb->uuid_str),
&lproto, ocfs2_do_node_down, osb,
@@ -3005,7 +3355,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
goto bail;
}
- status = ocfs2_cluster_this_node(&osb->node_num);
+ status = ocfs2_cluster_this_node(conn, &osb->node_num);
if (status < 0) {
mlog_errno(status);
mlog(ML_ERROR,
@@ -3017,12 +3367,10 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
local:
ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
- ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
+ ocfs2_nfs_sync_lock_init(osb);
ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb);
osb->cconn = conn;
-
- status = 0;
bail:
if (status < 0) {
ocfs2_dlm_shutdown_debug(osb);
@@ -3054,10 +3402,12 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
ocfs2_lock_res_free(&osb->osb_orphan_scan.os_lockres);
- ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
- osb->cconn = NULL;
+ if (osb->cconn) {
+ ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
+ osb->cconn = NULL;
- ocfs2_dlm_shutdown_debug(osb);
+ ocfs2_dlm_shutdown_debug(osb);
+ }
}
static int ocfs2_drop_lock(struct ocfs2_super *osb,
@@ -3142,22 +3492,60 @@ out:
return 0;
}
+static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres);
+
/* Mark the lockres as being dropped. It will no longer be
* queued if blocking, but we still may have to wait on it
* being dequeued from the downconvert thread before we can consider
* it safe to drop.
*
* You can *not* attempt to call cluster_lock on this lockres anymore. */
-void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
+void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres)
{
int status;
struct ocfs2_mask_waiter mw;
- unsigned long flags;
+ unsigned long flags, flags2;
ocfs2_init_mask_waiter(&mw);
spin_lock_irqsave(&lockres->l_lock, flags);
lockres->l_flags |= OCFS2_LOCK_FREEING;
+ if (lockres->l_flags & OCFS2_LOCK_QUEUED && current == osb->dc_task) {
+ /*
+ * We know the downconvert is queued but not in progress
+ * because we are the downconvert thread and processing
+ * different lock. So we can just remove the lock from the
+ * queue. This is not only an optimization but also a way
+ * to avoid the following deadlock:
+ * ocfs2_dentry_post_unlock()
+ * ocfs2_dentry_lock_put()
+ * ocfs2_drop_dentry_lock()
+ * iput()
+ * ocfs2_evict_inode()
+ * ocfs2_clear_inode()
+ * ocfs2_mark_lockres_freeing()
+ * ... blocks waiting for OCFS2_LOCK_QUEUED
+ * since we are the downconvert thread which
+ * should clear the flag.
+ */
+ spin_unlock_irqrestore(&lockres->l_lock, flags);
+ spin_lock_irqsave(&osb->dc_task_lock, flags2);
+ list_del_init(&lockres->l_blocked_list);
+ osb->blocked_lock_count--;
+ spin_unlock_irqrestore(&osb->dc_task_lock, flags2);
+ /*
+ * Warn if we recurse into another post_unlock call. Strictly
+ * speaking it isn't a problem but we need to be careful if
+ * that happens (stack overflow, deadlocks, ...) so warn if
+ * ocfs2 grows a path for which this can happen.
+ */
+ WARN_ON_ONCE(lockres->l_ops->post_unlock);
+ /* Since the lock is freeing we don't do much in the fn below */
+ ocfs2_process_blocked_lock(osb, lockres);
+ return;
+ }
while (lockres->l_flags & OCFS2_LOCK_QUEUED) {
lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0);
spin_unlock_irqrestore(&lockres->l_lock, flags);
@@ -3178,7 +3566,7 @@ void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
{
int ret;
- ocfs2_mark_lockres_freeing(lockres);
+ ocfs2_mark_lockres_freeing(osb, lockres);
ret = ocfs2_drop_lock(osb, lockres);
if (ret)
mlog_errno(ret);
@@ -3264,6 +3652,16 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name,
lockres->l_level, new_level);
+ /*
+ * On DLM_LKF_VALBLK, fsdlm behaves differently with o2cb. It always
+ * expects DLM_LKF_VALBLK being set if the LKB has LVB, so that
+ * we can recover correctly from node failure. Otherwise, we may get
+ * invalid LVB in LKB, but without DLM_SBF_VALNOTVALID being set.
+ */
+ if (ocfs2_userspace_stack(osb) &&
+ lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
+ lvb = 1;
+
if (lvb)
dlm_flags |= DLM_LKF_VALBLK;
@@ -3403,9 +3801,9 @@ recheck:
* set when the ast is received for an upconvert just before the
* OCFS2_LOCK_BUSY flag is cleared. Now if the fs received a bast
* on the heels of the ast, we want to delay the downconvert just
- * enough to allow the up requestor to do its task. Because this
+ * enough to allow the up requester to do its task. Because this
* lock is in the blocked queue, the lock will be downconverted
- * as soon as the requestor is done with the lock.
+ * as soon as the requester is done with the lock.
*/
if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING)
goto leave_requeue;
@@ -3516,6 +3914,17 @@ downconvert:
spin_unlock_irqrestore(&lockres->l_lock, flags);
ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb,
gen);
+ /* The dlm lock convert is being cancelled in background,
+ * ocfs2_cancel_convert() is asynchronous in fs/dlm,
+ * requeue it, try again later.
+ */
+ if (ret == -EBUSY) {
+ ctl->requeue = 1;
+ mlog(ML_BASTS, "lockres %s, ReQ: Downconvert busy\n",
+ lockres->l_name);
+ ret = 0;
+ msleep(20);
+ }
leave:
if (ret)
@@ -3543,7 +3952,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
oi = OCFS2_I(inode);
oi->ip_dir_lock_gen++;
mlog(0, "generation: %u\n", oi->ip_dir_lock_gen);
- goto out;
+ goto out_forget;
}
if (!S_ISREG(inode->i_mode))
@@ -3574,6 +3983,9 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
filemap_fdatawait(mapping);
}
+out_forget:
+ forget_all_cached_acls(inode);
+
out:
return UNBLOCK_CONTINUE;
}
@@ -3703,8 +4115,10 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
break;
spin_unlock(&dentry_attach_lock);
- mlog(0, "d_delete(%.*s);\n", dentry->d_name.len,
- dentry->d_name.name);
+ if (S_ISDIR(dl->dl_inode->i_mode))
+ shrink_dcache_parent(dentry);
+
+ mlog(0, "d_delete(%pd);\n", dentry);
/*
* The following dcache calls may do an
@@ -3924,7 +4338,7 @@ unqueue:
ocfs2_schedule_blocked_lock(osb, lockres);
mlog(ML_BASTS, "lockres %s, requeue = %s.\n", lockres->l_name,
- ctl.requeue ? "yes" : "no");
+ str_yes_no(ctl.requeue));
spin_unlock_irqrestore(&lockres->l_lock, flags);
if (ctl.unblock_action != UNBLOCK_CONTINUE
@@ -3971,9 +4385,13 @@ static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
osb->dc_work_sequence = osb->dc_wake_sequence;
processed = osb->blocked_lock_count;
- while (processed) {
- BUG_ON(list_empty(&osb->blocked_lock_list));
-
+ /*
+ * blocked lock processing in this loop might call iput which can
+ * remove items off osb->blocked_lock_list. Downconvert up to
+ * 'processed' number of locks, but stop short if we had some
+ * removed in ocfs2_mark_lockres_freeing when downconverting.
+ */
+ while (processed && !list_empty(&osb->blocked_lock_list)) {
lockres = list_entry(osb->blocked_lock_list.next,
struct ocfs2_lock_res, l_blocked_list);
list_del_init(&lockres->l_blocked_list);
@@ -4018,7 +4436,6 @@ static int ocfs2_downconvert_thread_should_wake(struct ocfs2_super *osb)
static int ocfs2_downconvert_thread(void *arg)
{
- int status = 0;
struct ocfs2_super *osb = arg;
/* only quit once we've been asked to stop and there is no more
@@ -4036,7 +4453,7 @@ static int ocfs2_downconvert_thread(void *arg)
}
osb->dc_task = NULL;
- return status;
+ return 0;
}
void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb)