summaryrefslogtreecommitdiff
path: root/fs/ocfs2/stack_user.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ocfs2/stack_user.c')
-rw-r--r--fs/ocfs2/stack_user.c394
1 files changed, 296 insertions, 98 deletions
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 286edf1e231f..be0a5758bd40 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -1,29 +1,21 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+// SPDX-License-Identifier: GPL-2.0-only
+/*
* stack_user.c
*
* Code which interfaces ocfs2 with fs/dlm and a userspace stack.
*
* Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation, version 2.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#include <linux/module.h>
#include <linux/fs.h>
+#include <linux/filelock.h>
#include <linux/miscdevice.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/reboot.h>
-#include <asm/uaccess.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
#include "stackglue.h"
@@ -102,6 +94,12 @@
#define OCFS2_TEXT_UUID_LEN 32
#define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2
#define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8
+#define VERSION_LOCK "version_lock"
+
+enum ocfs2_connection_type {
+ WITH_CONTROLD,
+ NO_CONTROLD
+};
/*
* ocfs2_live_connection is refcounted because the filesystem and
@@ -110,6 +108,13 @@
struct ocfs2_live_connection {
struct list_head oc_list;
struct ocfs2_cluster_connection *oc_conn;
+ enum ocfs2_connection_type oc_type;
+ atomic_t oc_this_node;
+ int oc_our_slot;
+ struct dlm_lksb oc_version_lksb;
+ char oc_lvb[DLM_LVB_LEN];
+ struct completion oc_sync_wait;
+ wait_queue_head_t oc_wait;
};
struct ocfs2_control_private {
@@ -198,20 +203,15 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name)
* mount path. Since the VFS prevents multiple calls to
* fill_super(), we can't get dupes here.
*/
-static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn,
- struct ocfs2_live_connection **c_ret)
+static int ocfs2_live_connection_attach(struct ocfs2_cluster_connection *conn,
+ struct ocfs2_live_connection *c)
{
int rc = 0;
- struct ocfs2_live_connection *c;
-
- c = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
- if (!c)
- return -ENOMEM;
mutex_lock(&ocfs2_control_lock);
c->oc_conn = conn;
- if (atomic_read(&ocfs2_control_opened))
+ if ((c->oc_type == NO_CONTROLD) || atomic_read(&ocfs2_control_opened))
list_add(&c->oc_list, &ocfs2_live_connection_list);
else {
printk(KERN_ERR
@@ -220,12 +220,6 @@ static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn,
}
mutex_unlock(&ocfs2_control_lock);
-
- if (!rc)
- *c_ret = c;
- else
- kfree(c);
-
return rc;
}
@@ -366,7 +360,6 @@ static int ocfs2_control_do_setnode_msg(struct file *file,
struct ocfs2_control_message_setn *msg)
{
long nodenum;
- char *ptr = NULL;
struct ocfs2_control_private *p = file->private_data;
if (ocfs2_control_get_handshake_state(file) !=
@@ -381,8 +374,7 @@ static int ocfs2_control_do_setnode_msg(struct file *file,
return -EINVAL;
msg->space = msg->newline = '\0';
- nodenum = simple_strtol(msg->nodestr, &ptr, 16);
- if (!ptr || *ptr)
+ if (kstrtol(msg->nodestr, 16, &nodenum))
return -EINVAL;
if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) ||
@@ -395,9 +387,8 @@ static int ocfs2_control_do_setnode_msg(struct file *file,
static int ocfs2_control_do_setversion_msg(struct file *file,
struct ocfs2_control_message_setv *msg)
- {
+{
long major, minor;
- char *ptr = NULL;
struct ocfs2_control_private *p = file->private_data;
struct ocfs2_protocol_version *max =
&ocfs2_user_plugin.sp_max_proto;
@@ -415,11 +406,9 @@ static int ocfs2_control_do_setversion_msg(struct file *file,
return -EINVAL;
msg->space1 = msg->space2 = msg->newline = '\0';
- major = simple_strtol(msg->major, &ptr, 16);
- if (!ptr || *ptr)
+ if (kstrtol(msg->major, 16, &major))
return -EINVAL;
- minor = simple_strtol(msg->minor, &ptr, 16);
- if (!ptr || *ptr)
+ if (kstrtol(msg->minor, 16, &minor))
return -EINVAL;
/*
@@ -447,7 +436,6 @@ static int ocfs2_control_do_down_msg(struct file *file,
struct ocfs2_control_message_down *msg)
{
long nodenum;
- char *p = NULL;
if (ocfs2_control_get_handshake_state(file) !=
OCFS2_CONTROL_HANDSHAKE_VALID)
@@ -462,8 +450,7 @@ static int ocfs2_control_do_down_msg(struct file *file,
return -EINVAL;
msg->space1 = msg->space2 = msg->newline = '\0';
- nodenum = simple_strtol(msg->nodestr, &p, 16);
- if (!p || *p)
+ if (kstrtol(msg->nodestr, 16, &nodenum))
return -EINVAL;
if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) ||
@@ -588,7 +575,7 @@ static int ocfs2_control_release(struct inode *inode, struct file *file)
*/
ocfs2_control_this_node = -1;
running_proto.pv_major = 0;
- running_proto.pv_major = 0;
+ running_proto.pv_minor = 0;
}
out:
@@ -652,14 +639,7 @@ static int ocfs2_control_init(void)
static void ocfs2_control_exit(void)
{
- int rc;
-
- rc = misc_deregister(&ocfs2_control_device);
- if (rc)
- printk(KERN_ERR
- "ocfs2: Unable to deregister ocfs2_control device "
- "(errno %d)\n",
- -rc);
+ misc_deregister(&ocfs2_control_device);
}
static void fsdlm_lock_ast_wrapper(void *astarg)
@@ -697,28 +677,22 @@ static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
void *name,
unsigned int namelen)
{
- int ret;
-
if (!lksb->lksb_fsdlm.sb_lvbptr)
lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb +
sizeof(struct dlm_lksb);
- ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm,
- flags|DLM_LKF_NODLCKWT, name, namelen, 0,
- fsdlm_lock_ast_wrapper, lksb,
- fsdlm_blocking_ast_wrapper);
- return ret;
+ return dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm,
+ flags|DLM_LKF_NODLCKWT, name, namelen, 0,
+ fsdlm_lock_ast_wrapper, lksb,
+ fsdlm_blocking_ast_wrapper);
}
static int user_dlm_unlock(struct ocfs2_cluster_connection *conn,
struct ocfs2_dlm_lksb *lksb,
u32 flags)
{
- int ret;
-
- ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid,
- flags, &lksb->lksb_fsdlm, lksb);
- return ret;
+ return dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid,
+ flags, &lksb->lksb_fsdlm, lksb);
}
static int user_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
@@ -757,20 +731,13 @@ static int user_plock(struct ocfs2_cluster_connection *conn,
*
* Internally, fs/dlm will pass these to a misc device, which
* a userspace daemon will read and write to.
- *
- * For now, cancel requests (which happen internally only),
- * are turned into unlocks. Most of this function taken from
- * gfs2_lock.
*/
- if (cmd == F_CANCELLK) {
- cmd = F_SETLK;
- fl->fl_type = F_UNLCK;
- }
-
- if (IS_GETLK(cmd))
+ if (cmd == F_CANCELLK)
+ return dlm_posix_cancel(conn->cc_lockspace, ino, file, fl);
+ else if (IS_GETLK(cmd))
return dlm_posix_get(conn->cc_lockspace, ino, file, fl);
- else if (fl->fl_type == F_UNLCK)
+ else if (lock_is_unlock(fl))
return dlm_posix_unlock(conn->cc_lockspace, ino, file, fl);
else
return dlm_posix_lock(conn->cc_lockspace, ino, file, cmd, fl);
@@ -799,18 +766,257 @@ static int fs_protocol_compare(struct ocfs2_protocol_version *existing,
return 0;
}
+static void lvb_to_version(char *lvb, struct ocfs2_protocol_version *ver)
+{
+ struct ocfs2_protocol_version *pv =
+ (struct ocfs2_protocol_version *)lvb;
+ /*
+ * ocfs2_protocol_version has two u8 variables, so we don't
+ * need any endian conversion.
+ */
+ ver->pv_major = pv->pv_major;
+ ver->pv_minor = pv->pv_minor;
+}
+
+static void version_to_lvb(struct ocfs2_protocol_version *ver, char *lvb)
+{
+ struct ocfs2_protocol_version *pv =
+ (struct ocfs2_protocol_version *)lvb;
+ /*
+ * ocfs2_protocol_version has two u8 variables, so we don't
+ * need any endian conversion.
+ */
+ pv->pv_major = ver->pv_major;
+ pv->pv_minor = ver->pv_minor;
+}
+
+static void sync_wait_cb(void *arg)
+{
+ struct ocfs2_cluster_connection *conn = arg;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+ complete(&lc->oc_sync_wait);
+}
+
+static int sync_unlock(struct ocfs2_cluster_connection *conn,
+ struct dlm_lksb *lksb, char *name)
+{
+ int error;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+
+ error = dlm_unlock(conn->cc_lockspace, lksb->sb_lkid, 0, lksb, conn);
+ if (error) {
+ printk(KERN_ERR "%s lkid %x error %d\n",
+ name, lksb->sb_lkid, error);
+ return error;
+ }
+
+ wait_for_completion(&lc->oc_sync_wait);
+
+ if (lksb->sb_status != -DLM_EUNLOCK) {
+ printk(KERN_ERR "%s lkid %x status %d\n",
+ name, lksb->sb_lkid, lksb->sb_status);
+ return -1;
+ }
+ return 0;
+}
+
+static int sync_lock(struct ocfs2_cluster_connection *conn,
+ int mode, uint32_t flags,
+ struct dlm_lksb *lksb, char *name)
+{
+ int error, status;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+
+ error = dlm_lock(conn->cc_lockspace, mode, lksb, flags,
+ name, strlen(name),
+ 0, sync_wait_cb, conn, NULL);
+ if (error) {
+ printk(KERN_ERR "%s lkid %x flags %x mode %d error %d\n",
+ name, lksb->sb_lkid, flags, mode, error);
+ return error;
+ }
+
+ wait_for_completion(&lc->oc_sync_wait);
+
+ status = lksb->sb_status;
+
+ if (status && status != -EAGAIN) {
+ printk(KERN_ERR "%s lkid %x flags %x mode %d status %d\n",
+ name, lksb->sb_lkid, flags, mode, status);
+ }
+
+ return status;
+}
+
+
+static int version_lock(struct ocfs2_cluster_connection *conn, int mode,
+ int flags)
+{
+ struct ocfs2_live_connection *lc = conn->cc_private;
+ return sync_lock(conn, mode, flags,
+ &lc->oc_version_lksb, VERSION_LOCK);
+}
+
+static int version_unlock(struct ocfs2_cluster_connection *conn)
+{
+ struct ocfs2_live_connection *lc = conn->cc_private;
+ return sync_unlock(conn, &lc->oc_version_lksb, VERSION_LOCK);
+}
+
+/* get_protocol_version()
+ *
+ * To exchange ocfs2 versioning, we use the LVB of the version dlm lock.
+ * The algorithm is:
+ * 1. Attempt to take the lock in EX mode (non-blocking).
+ * 2. If successful (which means it is the first mount), write the
+ * version number and downconvert to PR lock.
+ * 3. If unsuccessful (returns -EAGAIN), read the version from the LVB after
+ * taking the PR lock.
+ */
+
+static int get_protocol_version(struct ocfs2_cluster_connection *conn)
+{
+ int ret;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+ struct ocfs2_protocol_version pv;
+
+ running_proto.pv_major =
+ ocfs2_user_plugin.sp_max_proto.pv_major;
+ running_proto.pv_minor =
+ ocfs2_user_plugin.sp_max_proto.pv_minor;
+
+ lc->oc_version_lksb.sb_lvbptr = lc->oc_lvb;
+ ret = version_lock(conn, DLM_LOCK_EX,
+ DLM_LKF_VALBLK|DLM_LKF_NOQUEUE);
+ if (!ret) {
+ conn->cc_version.pv_major = running_proto.pv_major;
+ conn->cc_version.pv_minor = running_proto.pv_minor;
+ version_to_lvb(&running_proto, lc->oc_lvb);
+ version_lock(conn, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_VALBLK);
+ } else if (ret == -EAGAIN) {
+ ret = version_lock(conn, DLM_LOCK_PR, DLM_LKF_VALBLK);
+ if (ret)
+ goto out;
+ lvb_to_version(lc->oc_lvb, &pv);
+
+ if ((pv.pv_major != running_proto.pv_major) ||
+ (pv.pv_minor > running_proto.pv_minor)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ conn->cc_version.pv_major = pv.pv_major;
+ conn->cc_version.pv_minor = pv.pv_minor;
+ }
+out:
+ return ret;
+}
+
+static void user_recover_prep(void *arg)
+{
+}
+
+static void user_recover_slot(void *arg, struct dlm_slot *slot)
+{
+ struct ocfs2_cluster_connection *conn = arg;
+ printk(KERN_INFO "ocfs2: Node %d/%d down. Initiating recovery.\n",
+ slot->nodeid, slot->slot);
+ conn->cc_recovery_handler(slot->nodeid, conn->cc_recovery_data);
+
+}
+
+static void user_recover_done(void *arg, struct dlm_slot *slots,
+ int num_slots, int our_slot,
+ uint32_t generation)
+{
+ struct ocfs2_cluster_connection *conn = arg;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+ int i;
+
+ for (i = 0; i < num_slots; i++)
+ if (slots[i].slot == our_slot) {
+ atomic_set(&lc->oc_this_node, slots[i].nodeid);
+ break;
+ }
+
+ lc->oc_our_slot = our_slot;
+ wake_up(&lc->oc_wait);
+}
+
+static const struct dlm_lockspace_ops ocfs2_ls_ops = {
+ .recover_prep = user_recover_prep,
+ .recover_slot = user_recover_slot,
+ .recover_done = user_recover_done,
+};
+
+static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn)
+{
+ version_unlock(conn);
+ dlm_release_lockspace(conn->cc_lockspace, DLM_RELEASE_NORMAL);
+ conn->cc_lockspace = NULL;
+ ocfs2_live_connection_drop(conn->cc_private);
+ conn->cc_private = NULL;
+ return 0;
+}
+
static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
{
dlm_lockspace_t *fsdlm;
- struct ocfs2_live_connection *uninitialized_var(control);
- int rc = 0;
+ struct ocfs2_live_connection *lc;
+ int rc, ops_rv;
BUG_ON(conn == NULL);
- rc = ocfs2_live_connection_new(conn, &control);
+ lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
+ if (!lc)
+ return -ENOMEM;
+
+ init_waitqueue_head(&lc->oc_wait);
+ init_completion(&lc->oc_sync_wait);
+ atomic_set(&lc->oc_this_node, 0);
+ conn->cc_private = lc;
+ lc->oc_type = NO_CONTROLD;
+
+ rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name,
+ DLM_LSFL_NEWEXCL, DLM_LVB_LEN,
+ &ocfs2_ls_ops, conn, &ops_rv, &fsdlm);
+ if (rc) {
+ if (rc == -EEXIST || rc == -EPROTO)
+ printk(KERN_ERR "ocfs2: Unable to create the "
+ "lockspace %s (%d), because a ocfs2-tools "
+ "program is running on this file system "
+ "with the same name lockspace\n",
+ conn->cc_name, rc);
+ goto out;
+ }
+
+ if (ops_rv == -EOPNOTSUPP) {
+ lc->oc_type = WITH_CONTROLD;
+ printk(KERN_NOTICE "ocfs2: You seem to be using an older "
+ "version of dlm_controld and/or ocfs2-tools."
+ " Please consider upgrading.\n");
+ } else if (ops_rv) {
+ rc = ops_rv;
+ goto out;
+ }
+ conn->cc_lockspace = fsdlm;
+
+ rc = ocfs2_live_connection_attach(conn, lc);
if (rc)
goto out;
+ if (lc->oc_type == NO_CONTROLD) {
+ rc = get_protocol_version(conn);
+ if (rc) {
+ printk(KERN_ERR "ocfs2: Could not determine"
+ " locking version\n");
+ user_cluster_disconnect(conn);
+ lc = NULL;
+ goto out;
+ }
+ wait_event(lc->oc_wait, (atomic_read(&lc->oc_this_node) > 0));
+ }
+
/*
* running_proto must have been set before we allowed any mounts
* to proceed.
@@ -818,42 +1024,34 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
if (fs_protocol_compare(&running_proto, &conn->cc_version)) {
printk(KERN_ERR
"Unable to mount with fs locking protocol version "
- "%u.%u because the userspace control daemon has "
- "negotiated %u.%u\n",
+ "%u.%u because negotiated protocol is %u.%u\n",
conn->cc_version.pv_major, conn->cc_version.pv_minor,
running_proto.pv_major, running_proto.pv_minor);
rc = -EPROTO;
- ocfs2_live_connection_drop(control);
- goto out;
+ ocfs2_live_connection_drop(lc);
+ lc = NULL;
}
- rc = dlm_new_lockspace(conn->cc_name, NULL, DLM_LSFL_FS, DLM_LVB_LEN,
- NULL, NULL, NULL, &fsdlm);
- if (rc) {
- ocfs2_live_connection_drop(control);
- goto out;
- }
-
- conn->cc_private = control;
- conn->cc_lockspace = fsdlm;
out:
+ if (rc)
+ kfree(lc);
return rc;
}
-static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn)
-{
- dlm_release_lockspace(conn->cc_lockspace, 2);
- conn->cc_lockspace = NULL;
- ocfs2_live_connection_drop(conn->cc_private);
- conn->cc_private = NULL;
- return 0;
-}
-static int user_cluster_this_node(unsigned int *this_node)
+static int user_cluster_this_node(struct ocfs2_cluster_connection *conn,
+ unsigned int *this_node)
{
int rc;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+
+ if (lc->oc_type == WITH_CONTROLD)
+ rc = ocfs2_control_get_this_node();
+ else if (lc->oc_type == NO_CONTROLD)
+ rc = atomic_read(&lc->oc_this_node);
+ else
+ rc = -EINVAL;
- rc = ocfs2_control_get_this_node();
if (rc < 0)
return rc;
@@ -861,7 +1059,7 @@ static int user_cluster_this_node(unsigned int *this_node)
return 0;
}
-static struct ocfs2_stack_operations ocfs2_user_plugin_ops = {
+static const struct ocfs2_stack_operations ocfs2_user_plugin_ops = {
.connect = user_cluster_connect,
.disconnect = user_cluster_disconnect,
.this_node = user_cluster_this_node,