From 61d9102b62129e13a2258c1e0566962f9a1732f0 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Tue, 12 Sep 2017 08:55:04 +0000
Subject: DLM: Eliminate CF_CONNECT_PENDING flag

Before this patch, there was a flag in the con structure that was
used to determine whether or not a connect was needed. The bit was
set here and there, and cleared here and there, so it left some
race conditions: the bit was set, work was queued, then the worker
cleared the bit, allowing someone else to set it while the worker
ran. For the most part, this worked okay, but we got into trouble
if connections were lost and it needed to reconnect.

This patch eliminates the flag in favor of simply checking if we
actually have a sock pointer while protected by the mutex.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Reviewed-by: Tadashi Miyauchi <miyauchi@toshiba-tops.co.jp>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lowcomms.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'fs/dlm')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 4813d0e0cd9b..6a7a49b93374 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -107,7 +107,6 @@ struct connection {
 	unsigned long flags;
 #define CF_READ_PENDING 1
 #define CF_WRITE_PENDING 2
-#define CF_CONNECT_PENDING 3
 #define CF_INIT_PENDING 4
 #define CF_IS_OTHERCON 5
 #define CF_CLOSE 6
@@ -435,8 +434,8 @@ static inline void lowcomms_connect_sock(struct connection *con)
 {
 	if (test_bit(CF_CLOSE, &con->flags))
 		return;
-	if (!test_and_set_bit(CF_CONNECT_PENDING, &con->flags))
-		queue_work(send_workqueue, &con->swork);
+	queue_work(send_workqueue, &con->swork);
+	cond_resched();
 }
 
 static void lowcomms_state_change(struct sock *sk)
@@ -579,7 +578,6 @@ static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
 static void close_connection(struct connection *con, bool and_other,
 			     bool tx, bool rx)
 {
-	clear_bit(CF_CONNECT_PENDING, &con->flags);
 	clear_bit(CF_WRITE_PENDING, &con->flags);
 	if (tx && cancel_work_sync(&con->swork))
 		log_print("canceled swork for node %d", con->nodeid);
@@ -1098,7 +1096,6 @@ socket_err:
 			  con->retries, result);
 		mutex_unlock(&con->sock_mutex);
 		msleep(1000);
-		clear_bit(CF_CONNECT_PENDING, &con->flags);
 		lowcomms_connect_sock(con);
 		return;
 	}
@@ -1194,7 +1191,6 @@ out_err:
 			  con->retries, result);
 		mutex_unlock(&con->sock_mutex);
 		msleep(1000);
-		clear_bit(CF_CONNECT_PENDING, &con->flags);
 		lowcomms_connect_sock(con);
 		return;
 	}
@@ -1593,7 +1589,7 @@ static void process_send_sockets(struct work_struct *work)
 {
 	struct connection *con = container_of(work, struct connection, swork);
 
-	if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags))
+	if (con->sock == NULL) /* not mutex protected so check it inside too */
 		con->connect_action(con);
 	if (test_and_clear_bit(CF_WRITE_PENDING, &con->flags))
 		send_to_sock(con);
-- 
cgit 


From 01da24d3fbed92dc6faf60b753e6bd50cdafb646 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Tue, 12 Sep 2017 08:55:14 +0000
Subject: DLM: Eliminate CF_WRITE_PENDING flag

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Reviewed-by: Tadashi Miyauchi <miyauchi@toshiba-tops.co.jp>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lowcomms.c | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

(limited to 'fs/dlm')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 6a7a49b93374..ed707d4323f4 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -106,7 +106,6 @@ struct connection {
 	struct mutex sock_mutex;
 	unsigned long flags;
 #define CF_READ_PENDING 1
-#define CF_WRITE_PENDING 2
 #define CF_INIT_PENDING 4
 #define CF_IS_OTHERCON 5
 #define CF_CLOSE 6
@@ -426,8 +425,7 @@ static void lowcomms_write_space(struct sock *sk)
 		clear_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags);
 	}
 
-	if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
-		queue_work(send_workqueue, &con->swork);
+	queue_work(send_workqueue, &con->swork);
 }
 
 static inline void lowcomms_connect_sock(struct connection *con)
@@ -578,7 +576,6 @@ static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
 static void close_connection(struct connection *con, bool and_other,
 			     bool tx, bool rx)
 {
-	clear_bit(CF_WRITE_PENDING, &con->flags);
 	if (tx && cancel_work_sync(&con->swork))
 		log_print("canceled swork for node %d", con->nodeid);
 	if (rx && cancel_work_sync(&con->rwork))
@@ -1077,7 +1074,6 @@ static void sctp_connect_to_sock(struct connection *con)
 	if (result == 0)
 		goto out;
 
-
 bind_err:
 	con->sock = NULL;
 	sock_release(sock);
@@ -1102,7 +1098,6 @@ socket_err:
 
 out:
 	mutex_unlock(&con->sock_mutex);
-	set_bit(CF_WRITE_PENDING, &con->flags);
 }
 
 /* Connect a new socket to its peer */
@@ -1196,7 +1191,6 @@ out_err:
 	}
 out:
 	mutex_unlock(&con->sock_mutex);
-	set_bit(CF_WRITE_PENDING, &con->flags);
 	return;
 }
 
@@ -1452,9 +1446,7 @@ void dlm_lowcomms_commit_buffer(void *mh)
 	e->len = e->end - e->offset;
 	spin_unlock(&con->writequeue_lock);
 
-	if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) {
-		queue_work(send_workqueue, &con->swork);
-	}
+	queue_work(send_workqueue, &con->swork);
 	return;
 
 out:
@@ -1524,12 +1516,15 @@ out:
 send_error:
 	mutex_unlock(&con->sock_mutex);
 	close_connection(con, false, false, true);
-	lowcomms_connect_sock(con);
+	/* Requeue the send work. When the work daemon runs again, it will try
+	   a new connection, then call this function again. */
+	queue_work(send_workqueue, &con->swork);
 	return;
 
 out_connect:
 	mutex_unlock(&con->sock_mutex);
-	lowcomms_connect_sock(con);
+	queue_work(send_workqueue, &con->swork);
+	cond_resched();
 }
 
 static void clean_one_writequeue(struct connection *con)
@@ -1591,7 +1586,7 @@ static void process_send_sockets(struct work_struct *work)
 
 	if (con->sock == NULL) /* not mutex protected so check it inside too */
 		con->connect_action(con);
-	if (test_and_clear_bit(CF_WRITE_PENDING, &con->flags))
+	if (!list_empty(&con->writequeue))
 		send_to_sock(con);
 }
 
-- 
cgit 


From cc661fc934a004526a714a7b804ee3f119d27093 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Tue, 12 Sep 2017 08:55:23 +0000
Subject: DLM: Fix saving of NULL callbacks

In a previous patch I noted that accept() often copies the struct
sock (sk) which overwrites the sock callbacks. However, in testing
we discovered that the dlm connection structures (con) are sometimes
deleted and recreated as connections come and go, and since they're
zeroed out by kmem_cache_zalloc, the saved callback pointers are
also initialized to zero. But with today's DLM code, the callbacks
are only saved when a socket is added.

During recovery testing, we discovered a common situation in which
the new con is initialized to zero, then a socket is added after
accept(). In this case, the sock's saved values are all NULL, but
the saved values are wiped out, due to accept(). Therefore, we
don't have a known good copy of the callbacks from which we can
restore.

Since the struct sock callbacks are always good after listen(),
this patch saves the known good values after listen(). These good
values are then used for subsequent restores.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Reviewed-by: Tadashi Miyauchi <miyauchi@toshiba-tops.co.jp>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lowcomms.c | 45 +++++++++++++++++++++++++--------------------
 1 file changed, 25 insertions(+), 20 deletions(-)

(limited to 'fs/dlm')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index ed707d4323f4..32b534f4a9b6 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -122,10 +122,6 @@ struct connection {
 	struct connection *othercon;
 	struct work_struct rwork; /* Receive workqueue */
 	struct work_struct swork; /* Send workqueue */
-	void (*orig_error_report)(struct sock *);
-	void (*orig_data_ready)(struct sock *);
-	void (*orig_state_change)(struct sock *);
-	void (*orig_write_space)(struct sock *);
 };
 #define sock2con(x) ((struct connection *)(x)->sk_user_data)
 
@@ -148,6 +144,13 @@ struct dlm_node_addr {
 	struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
 };
 
+static struct listen_sock_callbacks {
+	void (*sk_error_report)(struct sock *);
+	void (*sk_data_ready)(struct sock *);
+	void (*sk_state_change)(struct sock *);
+	void (*sk_write_space)(struct sock *);
+} listen_sock;
+
 static LIST_HEAD(dlm_node_addrs);
 static DEFINE_SPINLOCK(dlm_node_addrs_spin);
 
@@ -477,7 +480,7 @@ static void lowcomms_error_report(struct sock *sk)
 	if (con == NULL)
 		goto out;
 
-	orig_report = con->orig_error_report;
+	orig_report = listen_sock.sk_error_report;
 	if (con->sock == NULL ||
 	    kernel_getpeername(con->sock, (struct sockaddr *)&saddr, &buflen)) {
 		printk_ratelimited(KERN_ERR "dlm: node %d: socket error "
@@ -514,22 +517,26 @@ out:
 }
 
 /* Note: sk_callback_lock must be locked before calling this function. */
-static void save_callbacks(struct connection *con, struct sock *sk)
+static void save_listen_callbacks(struct socket *sock)
 {
-	con->orig_data_ready = sk->sk_data_ready;
-	con->orig_state_change = sk->sk_state_change;
-	con->orig_write_space = sk->sk_write_space;
-	con->orig_error_report = sk->sk_error_report;
+	struct sock *sk = sock->sk;
+
+	listen_sock.sk_data_ready = sk->sk_data_ready;
+	listen_sock.sk_state_change = sk->sk_state_change;
+	listen_sock.sk_write_space = sk->sk_write_space;
+	listen_sock.sk_error_report = sk->sk_error_report;
 }
 
-static void restore_callbacks(struct connection *con, struct sock *sk)
+static void restore_callbacks(struct socket *sock)
 {
+	struct sock *sk = sock->sk;
+
 	write_lock_bh(&sk->sk_callback_lock);
 	sk->sk_user_data = NULL;
-	sk->sk_data_ready = con->orig_data_ready;
-	sk->sk_state_change = con->orig_state_change;
-	sk->sk_write_space = con->orig_write_space;
-	sk->sk_error_report = con->orig_error_report;
+	sk->sk_data_ready = listen_sock.sk_data_ready;
+	sk->sk_state_change = listen_sock.sk_state_change;
+	sk->sk_write_space = listen_sock.sk_write_space;
+	sk->sk_error_report = listen_sock.sk_error_report;
 	write_unlock_bh(&sk->sk_callback_lock);
 }
 
@@ -542,8 +549,6 @@ static void add_sock(struct socket *sock, struct connection *con, bool save_cb)
 	con->sock = sock;
 
 	sk->sk_user_data = con;
-	if (save_cb)
-		save_callbacks(con, sk);
 	/* Install a data_ready callback */
 	sk->sk_data_ready = lowcomms_data_ready;
 	sk->sk_write_space = lowcomms_write_space;
@@ -583,8 +588,7 @@ static void close_connection(struct connection *con, bool and_other,
 
 	mutex_lock(&con->sock_mutex);
 	if (con->sock) {
-		if (!test_bit(CF_IS_OTHERCON, &con->flags))
-			restore_callbacks(con, con->sock->sk);
+		restore_callbacks(con->sock);
 		sock_release(con->sock);
 		con->sock = NULL;
 	}
@@ -1226,7 +1230,7 @@ static struct socket *tcp_create_listen_sock(struct connection *con,
 		log_print("Failed to set SO_REUSEADDR on socket: %d", result);
 	}
 	sock->sk->sk_user_data = con;
-
+	save_listen_callbacks(sock);
 	con->rx_action = tcp_accept_from_sock;
 	con->connect_action = tcp_connect_to_sock;
 
@@ -1310,6 +1314,7 @@ static int sctp_listen_for_all(void)
 	write_lock_bh(&sock->sk->sk_callback_lock);
 	/* Init con struct */
 	sock->sk->sk_user_data = con;
+	save_listen_callbacks(sock);
 	con->sock = sock;
 	con->sock->sk->sk_data_ready = lowcomms_data_ready;
 	con->rx_action = sctp_accept_from_sock;
-- 
cgit 


From 988419a9deab68035364d8163bc27adb694ab28e Mon Sep 17 00:00:00 2001
From: "tsutomu.owa@toshiba.co.jp" <tsutomu.owa@toshiba.co.jp>
Date: Tue, 12 Sep 2017 08:55:32 +0000
Subject: DLM: fix remove save_cb argument from add_sock()

save_cb argument is not used. We remove them.

Signed-off-by: Tadashi Miyauchi <miyauchi@toshiba-tops.co.jp>
Signed-off-by: Tsutomu Owa <tsutomu.owa@toshiba.co.jp>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lowcomms.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'fs/dlm')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 32b534f4a9b6..72247cb4bc5e 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -541,7 +541,7 @@ static void restore_callbacks(struct socket *sock)
 }
 
 /* Make a socket active */
-static void add_sock(struct socket *sock, struct connection *con, bool save_cb)
+static void add_sock(struct socket *sock, struct connection *con)
 {
 	struct sock *sk = sock->sk;
 
@@ -801,7 +801,7 @@ static int tcp_accept_from_sock(struct connection *con)
 			newcon->othercon = othercon;
 			othercon->sock = newsock;
 			newsock->sk->sk_user_data = othercon;
-			add_sock(newsock, othercon, false);
+			add_sock(newsock, othercon);
 			addcon = othercon;
 		}
 		else {
@@ -817,7 +817,7 @@ static int tcp_accept_from_sock(struct connection *con)
 		/* accept copies the sk after we've saved the callbacks, so we
 		   don't want to save them a second time or comm errors will
 		   result in calling sk_error_report recursively. */
-		add_sock(newsock, newcon, false);
+		add_sock(newsock, newcon);
 		addcon = newcon;
 	}
 
@@ -918,7 +918,7 @@ static int sctp_accept_from_sock(struct connection *con)
 			newcon->othercon = othercon;
 			othercon->sock = newsock;
 			newsock->sk->sk_user_data = othercon;
-			add_sock(newsock, othercon, false);
+			add_sock(newsock, othercon);
 			addcon = othercon;
 		} else {
 			printk("Extra connection from node %d attempted\n", nodeid);
@@ -929,7 +929,7 @@ static int sctp_accept_from_sock(struct connection *con)
 	} else {
 		newsock->sk->sk_user_data = newcon;
 		newcon->rx_action = receive_from_sock;
-		add_sock(newsock, newcon, false);
+		add_sock(newsock, newcon);
 		addcon = newcon;
 	}
 
@@ -1057,7 +1057,7 @@ static void sctp_connect_to_sock(struct connection *con)
 	sock->sk->sk_user_data = con;
 	con->rx_action = receive_from_sock;
 	con->connect_action = sctp_connect_to_sock;
-	add_sock(sock, con, true);
+	add_sock(sock, con);
 
 	/* Bind to all addresses. */
 	if (sctp_bind_addrs(con, 0))
@@ -1142,7 +1142,7 @@ static void tcp_connect_to_sock(struct connection *con)
 	sock->sk->sk_user_data = con;
 	con->rx_action = receive_from_sock;
 	con->connect_action = tcp_connect_to_sock;
-	add_sock(sock, con, true);
+	add_sock(sock, con);
 
 	/* Bind to our cluster-known address connecting to avoid
 	   routing problems */
@@ -1361,7 +1361,7 @@ static int tcp_listen_for_all(void)
 
 	sock = tcp_create_listen_sock(con, dlm_local_addr[0]);
 	if (sock) {
-		add_sock(sock, con, true);
+		add_sock(sock, con);
 		result = 0;
 	}
 	else {
-- 
cgit 


From f0fb83cb9201a9f272f8ac771eed6b1e5745375c Mon Sep 17 00:00:00 2001
From: "tsutomu.owa@toshiba.co.jp" <tsutomu.owa@toshiba.co.jp>
Date: Tue, 12 Sep 2017 08:55:40 +0000
Subject: DLM: fix double list_del()

dlm_lowcomms_stop() was not functioning properly. Correctly, we have to
wait until all processing is finished with send_workqueue and
recv_workqueue.
This problem causes the following issue. Senario is

1. dlm_send thread:
    send_to_sock refers con->writequeue
2. main thread:
    dlm_lowcomms_stop calls list_del
3. dlm_send thread:
    send_to_sock calls list_del in writequeue_entry_complete

[ 1925.770305] dlm: canceled swork for node 4
[ 1925.772374] general protection fault: 0000 [#1] SMP
[ 1925.777930] Modules linked in: ocfs2_stack_user ocfs2 ocfs2_nodemanager ocfs2_stackglue dlm fmxnet(O) fmx_api(O) fmx_cu(O) igb(O) kvm_intel kvm irqbypass autofs4
[ 1925.794131] CPU: 3 PID: 6994 Comm: kworker/u8:0 Tainted: G           O    4.4.39 #1
[ 1925.802684] Hardware name: TOSHIBA OX/OX, BIOS OX-P0015 12/03/2015
[ 1925.809595] Workqueue: dlm_send process_send_sockets [dlm]
[ 1925.815714] task: ffff8804398d3c00 ti: ffff88046910c000 task.ti: ffff88046910c000
[ 1925.824072] RIP: 0010:[<ffffffffa04bd158>]  [<ffffffffa04bd158>] process_send_sockets+0xf8/0x280 [dlm]
[ 1925.834480] RSP: 0018:ffff88046910fde0  EFLAGS: 00010246
[ 1925.840411] RAX: dead000000000200 RBX: 0000000000000001 RCX: 000000000000000a
[ 1925.848372] RDX: ffff88046bd980c0 RSI: 0000000000000000 RDI: ffff8804673c5670
[ 1925.856341] RBP: ffff88046910fe20 R08: 00000000000000c9 R09: 0000000000000010
[ 1925.864311] R10: ffffffff81e22fc0 R11: 0000000000000000 R12: ffff8804673c56d8
[ 1925.872281] R13: ffff8804673c5660 R14: ffff88046bd98440 R15: 0000000000000058
[ 1925.880251] FS:  0000000000000000(0000) GS:ffff88047fd80000(0000) knlGS:0000000000000000
[ 1925.889280] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 1925.895694] CR2: 00007fff09eadf58 CR3: 00000004690f5000 CR4: 00000000001006e0
[ 1925.903663] Stack:
[ 1925.905903]  ffff8804673c5630 ffff8804673c5620 ffff8804673c5670 ffff88007d219b40
[ 1925.914181]  ffff88046f095800 0000000000000100 ffff8800717a1400 ffff8804673c56d8
[ 1925.922459]  ffff88046910fe60 ffffffff81073db2 00ff880400000000 ffff88007d219b40
[ 1925.930736] Call Trace:
[ 1925.933468]  [<ffffffff81073db2>] process_one_work+0x162/0x450
[ 1925.939983]  [<ffffffff81074459>] worker_thread+0x69/0x4a0
[ 1925.946109]  [<ffffffff810743f0>] ? rescuer_thread+0x350/0x350
[ 1925.952622]  [<ffffffff8107956f>] kthread+0xef/0x110
[ 1925.958165]  [<ffffffff81079480>] ? kthread_park+0x60/0x60
[ 1925.964283]  [<ffffffff8186ab2f>] ret_from_fork+0x3f/0x70
[ 1925.970312]  [<ffffffff81079480>] ? kthread_park+0x60/0x60
[ 1925.976436] Code: 01 00 00 48 8b 7d d0 e8 07 d3 3a e1 45 01 7e 18 45 29 7e 1c 75 ab 41 8b 46 24 85 c0 75 a3 49 8b 16 49 8b 46 08 31 f6 48 89 42 08 <48> 89 10 48 b8 00 01 00 00 00 00 ad de 49 8b 7e 10 49 89 06 66
[ 1925.997791] RIP  [<ffffffffa04bd158>] process_send_sockets+0xf8/0x280 [dlm]
[ 1926.005577]  RSP <ffff88046910fde0>

Signed-off-by: Tadashi Miyauchi <miyauchi@toshiba-tops.co.jp>
Signed-off-by: Tsutomu Owa <tsutomu.owa@toshiba.co.jp>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lowcomms.c | 44 +++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 39 insertions(+), 5 deletions(-)

(limited to 'fs/dlm')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 72247cb4bc5e..980c58befd53 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1628,11 +1628,20 @@ static int work_start(void)
 	return 0;
 }
 
-static void stop_conn(struct connection *con)
+static void _stop_conn(struct connection *con, bool and_other)
 {
-	con->flags |= 0x0F;
+	mutex_lock(&con->sock_mutex);
+	set_bit(CF_READ_PENDING, &con->flags);
 	if (con->sock && con->sock->sk)
 		con->sock->sk->sk_user_data = NULL;
+	if (con->othercon && and_other)
+		_stop_conn(con->othercon, false);
+	mutex_unlock(&con->sock_mutex);
+}
+
+static void stop_conn(struct connection *con)
+{
+	_stop_conn(con, true);
 }
 
 static void free_conn(struct connection *con)
@@ -1644,6 +1653,32 @@ static void free_conn(struct connection *con)
 	kmem_cache_free(con_cache, con);
 }
 
+static void work_flush(void)
+{
+	int ok;
+	int i;
+	struct hlist_node *n;
+	struct connection *con;
+
+	flush_workqueue(recv_workqueue);
+	flush_workqueue(send_workqueue);
+	do {
+		ok = 1;
+		foreach_conn(stop_conn);
+		flush_workqueue(recv_workqueue);
+		flush_workqueue(send_workqueue);
+		for (i = 0; i < CONN_HASH_SIZE && ok; i++) {
+			hlist_for_each_entry_safe(con, n,
+						  &connection_hash[i], list) {
+				ok &= test_bit(CF_READ_PENDING, &con->flags);
+				if (con->othercon)
+					ok &= test_bit(CF_READ_PENDING,
+						       &con->othercon->flags);
+			}
+		}
+	} while (!ok);
+}
+
 void dlm_lowcomms_stop(void)
 {
 	/* Set all the flags to prevent any
@@ -1651,11 +1686,10 @@ void dlm_lowcomms_stop(void)
 	*/
 	mutex_lock(&connections_lock);
 	dlm_allow_conn = 0;
-	foreach_conn(stop_conn);
+	mutex_unlock(&connections_lock);
+	work_flush();
 	clean_writequeues();
 	foreach_conn(free_conn);
-	mutex_unlock(&connections_lock);
-
 	work_stop();
 
 	kmem_cache_destroy(con_cache);
-- 
cgit 


From b2a6662932c52304eee11323701f8a01aa110e37 Mon Sep 17 00:00:00 2001
From: "tsutomu.owa@toshiba.co.jp" <tsutomu.owa@toshiba.co.jp>
Date: Tue, 12 Sep 2017 08:55:50 +0000
Subject: DLM: fix race condition between dlm_send and dlm_recv

When kernel_sendpage(in send_to_sock) and kernel_recvmsg
(in receive_from_sock) return error, close_connection may works at the
same time. At that time, they may wait for each other by cancel_work_sync.

Signed-off-by: Tadashi Miyauchi <miayuchi@toshiba-tops.co.jp>
Signed-off-by: Tsutomu Owa <tsutomu.owa@toshiba.co.jp>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lowcomms.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs/dlm')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 980c58befd53..420946dcb7ca 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -110,6 +110,7 @@ struct connection {
 #define CF_IS_OTHERCON 5
 #define CF_CLOSE 6
 #define CF_APP_LIMITED 7
+#define CF_CLOSING 8
 	struct list_head writequeue;  /* List of outgoing writequeue_entries */
 	spinlock_t writequeue_lock;
 	int (*rx_action) (struct connection *);	/* What to do when active */
@@ -581,9 +582,11 @@ static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
 static void close_connection(struct connection *con, bool and_other,
 			     bool tx, bool rx)
 {
-	if (tx && cancel_work_sync(&con->swork))
+	bool closing = test_and_set_bit(CF_CLOSING, &con->flags);
+
+	if (tx && !closing && cancel_work_sync(&con->swork))
 		log_print("canceled swork for node %d", con->nodeid);
-	if (rx && cancel_work_sync(&con->rwork))
+	if (rx && !closing && cancel_work_sync(&con->rwork))
 		log_print("canceled rwork for node %d", con->nodeid);
 
 	mutex_lock(&con->sock_mutex);
@@ -603,6 +606,7 @@ static void close_connection(struct connection *con, bool and_other,
 
 	con->retries = 0;
 	mutex_unlock(&con->sock_mutex);
+	clear_bit(CF_CLOSING, &con->flags);
 }
 
 /* Data received from remote end */
-- 
cgit 


From c7355827b27c550824bbcc1f0586cf993bf83d94 Mon Sep 17 00:00:00 2001
From: "tsutomu.owa@toshiba.co.jp" <tsutomu.owa@toshiba.co.jp>
Date: Tue, 12 Sep 2017 08:56:00 +0000
Subject: DLM: fix to use sock_mutex correctly in xxx_accept_from_sock

In the current implementation, we think that exclusion control
for othercon in tcp_accept_from_sock() and sctp_accept_from_sock()
was not enough. We fix them.

Signed-off-by: Tadashi Miyauchi <miyauchi@toshiba-tops.co.jp>
Signed-off-by: Tsutomu Owa <tsutomu.owa@toshiba.co.jp>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lowcomms.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs/dlm')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 420946dcb7ca..b275813c9901 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -801,16 +801,19 @@ static int tcp_accept_from_sock(struct connection *con)
 			INIT_WORK(&othercon->rwork, process_recv_sockets);
 			set_bit(CF_IS_OTHERCON, &othercon->flags);
 		}
+		mutex_lock_nested(&othercon->sock_mutex, 2);
 		if (!othercon->sock) {
 			newcon->othercon = othercon;
 			othercon->sock = newsock;
 			newsock->sk->sk_user_data = othercon;
 			add_sock(newsock, othercon);
 			addcon = othercon;
+			mutex_unlock(&othercon->sock_mutex);
 		}
 		else {
 			printk("Extra connection from node %d attempted\n", nodeid);
 			result = -EAGAIN;
+			mutex_unlock(&othercon->sock_mutex);
 			mutex_unlock(&newcon->sock_mutex);
 			goto accept_err;
 		}
@@ -918,15 +921,18 @@ static int sctp_accept_from_sock(struct connection *con)
 			INIT_WORK(&othercon->rwork, process_recv_sockets);
 			set_bit(CF_IS_OTHERCON, &othercon->flags);
 		}
+		mutex_lock_nested(&othercon->sock_mutex, 2);
 		if (!othercon->sock) {
 			newcon->othercon = othercon;
 			othercon->sock = newsock;
 			newsock->sk->sk_user_data = othercon;
 			add_sock(newsock, othercon);
 			addcon = othercon;
+			mutex_unlock(&othercon->sock_mutex);
 		} else {
 			printk("Extra connection from node %d attempted\n", nodeid);
 			ret = -EAGAIN;
+			mutex_unlock(&othercon->sock_mutex);
 			mutex_unlock(&newcon->sock_mutex);
 			goto accept_err;
 		}
-- 
cgit 


From 5966121241b10a32396d770a0b39a41441511a8c Mon Sep 17 00:00:00 2001
From: "tsutomu.owa@toshiba.co.jp" <tsutomu.owa@toshiba.co.jp>
Date: Tue, 12 Sep 2017 08:56:08 +0000
Subject: DLM: retry rcom when dlm_wait_function is timed out.

If a node sends a DLM_RCOM_STATUS command and an error occurs on the
receiving side, the DLM_RCOM_STATUS_REPLY response may not be returned.
We retransmitted the DLM_RCOM_STATUS command so that we do not wait for
an infinite response.

Signed-off-by: Tadashi Miyauchi <miyauchi@toshiba-tops.co.jp>
Signed-off-by: Tsutomu Owa <tsutomu.owa@toshiba.co.jp>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/rcom.c    | 6 ++++++
 fs/dlm/recover.c | 4 ++++
 2 files changed, 10 insertions(+)

(limited to 'fs/dlm')

diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index f3f5e72a29ba..4ff061de927e 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -155,6 +155,7 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags)
 		goto out;
 	}
 
+retry:
 	error = create_rcom(ls, nodeid, DLM_RCOM_STATUS,
 			    sizeof(struct rcom_status), &rc, &mh);
 	if (error)
@@ -169,6 +170,8 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags)
 
 	error = dlm_wait_function(ls, &rcom_response);
 	disallow_sync_reply(ls);
+	if (error == -ETIMEDOUT)
+		goto retry;
 	if (error)
 		goto out;
 
@@ -276,6 +279,7 @@ int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
 
 	ls->ls_recover_nodeid = nodeid;
 
+retry:
 	error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, &rc, &mh);
 	if (error)
 		goto out;
@@ -288,6 +292,8 @@ int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
 
 	error = dlm_wait_function(ls, &rcom_response);
 	disallow_sync_reply(ls);
+	if (error == -ETIMEDOUT)
+		goto retry;
  out:
 	return error;
 }
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index eaea789bf97d..ce2aa54ca2e2 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -52,6 +52,10 @@ int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls))
 					dlm_config.ci_recover_timer * HZ);
 		if (rv)
 			break;
+		if (test_bit(LSFL_RCOM_WAIT, &ls->ls_flags)) {
+			log_debug(ls, "dlm_wait_function timed out");
+			return -ETIMEDOUT;
+		}
 	}
 
 	if (dlm_recovery_stopped(ls)) {
-- 
cgit 


From c553e173b022347dcb34721688f55167c654466d Mon Sep 17 00:00:00 2001
From: "tsutomu.owa@toshiba.co.jp" <tsutomu.owa@toshiba.co.jp>
Date: Tue, 12 Sep 2017 08:56:15 +0000
Subject: DLM: close othercon at send/receive error

If an error occurs in the sending / receiving process, if othercon
exists, sending / receiving processing using othercon may also result
in an error. We fix to pre-close othercon as well.

Signed-off-by: Tadashi Miyauchi <miyauchi@toshiba-tops.co.jp>
Signed-off-by: Tsutomu Owa <tsutomu.owa@toshiba.co.jp>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lowcomms.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/dlm')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index b275813c9901..b48640709bbd 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -703,7 +703,7 @@ out_resched:
 out_close:
 	mutex_unlock(&con->sock_mutex);
 	if (ret != -EAGAIN) {
-		close_connection(con, false, true, false);
+		close_connection(con, true, true, false);
 		/* Reconnect when there is something to send */
 	}
 	/* Don't return success if we really got EOF */
@@ -1530,7 +1530,7 @@ out:
 
 send_error:
 	mutex_unlock(&con->sock_mutex);
-	close_connection(con, false, false, true);
+	close_connection(con, true, false, true);
 	/* Requeue the send work. When the work daemon runs again, it will try
 	   a new connection, then call this function again. */
 	queue_work(send_workqueue, &con->swork);
-- 
cgit 


From e412f9201de255a503ae3d67f46aae580c79006c Mon Sep 17 00:00:00 2001
From: "tsutomu.owa@toshiba.co.jp" <tsutomu.owa@toshiba.co.jp>
Date: Tue, 12 Sep 2017 08:56:30 +0000
Subject: DLM: fix race condition between dlm_recoverd_stop and dlm_recoverd

When dlm_recoverd_stop() is called between kthread_should_stop() and
set_task_state(TASK_INTERRUPTIBLE), dlm_recoverd will not wake up.

Signed-off-by: Tadashi Miyauchi <miyauchi@toshiba-tops.co.jp>
Signed-off-by: Tsutomu Owa <tsutomu.owa@toshiba.co.jp>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/recoverd.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'fs/dlm')

diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 6859b4bf971e..d2ad1cab0f05 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -287,8 +287,17 @@ static int dlm_recoverd(void *arg)
 	set_bit(LSFL_RECOVER_LOCK, &ls->ls_flags);
 	wake_up(&ls->ls_recover_lock_wait);
 
-	while (!kthread_should_stop()) {
+	while (1) {
+		/*
+		 * We call kthread_should_stop() after set_current_state().
+		 * This is because it works correctly if kthread_stop() is
+		 * called just before set_current_state().
+		 */
 		set_current_state(TASK_INTERRUPTIBLE);
+		if (kthread_should_stop()) {
+			set_current_state(TASK_RUNNING);
+			break;
+		}
 		if (!test_bit(LSFL_RECOVER_WORK, &ls->ls_flags) &&
 		    !test_bit(LSFL_RECOVER_DOWN, &ls->ls_flags))
 			schedule();
-- 
cgit 


From 8a4abb0819769a556f9023845d3821a06f81452d Mon Sep 17 00:00:00 2001
From: "tsutomu.owa@toshiba.co.jp" <tsutomu.owa@toshiba.co.jp>
Date: Tue, 12 Sep 2017 09:01:16 +0000
Subject: DLM: Reanimate CF_WRITE_PENDING flag

CF_WRITE_PENDING flag has been reanimated to make dlm_send stop properly
when running dlm_lowcomms_stop.

Signed-off-by: Tadashi Miyauchi <miyauchi@toshiba-tops.co.jp>
Signed-off-by: Tsutomu Owa <tsutomu.owa@toshiba.co.jp>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lowcomms.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs/dlm')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index b48640709bbd..306b5fe0866f 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -106,6 +106,7 @@ struct connection {
 	struct mutex sock_mutex;
 	unsigned long flags;
 #define CF_READ_PENDING 1
+#define CF_WRITE_PENDING 2
 #define CF_INIT_PENDING 4
 #define CF_IS_OTHERCON 5
 #define CF_CLOSE 6
@@ -1599,6 +1600,7 @@ static void process_send_sockets(struct work_struct *work)
 {
 	struct connection *con = container_of(work, struct connection, swork);
 
+	clear_bit(CF_WRITE_PENDING, &con->flags);
 	if (con->sock == NULL) /* not mutex protected so check it inside too */
 		con->connect_action(con);
 	if (!list_empty(&con->writequeue))
@@ -1642,6 +1644,7 @@ static void _stop_conn(struct connection *con, bool and_other)
 {
 	mutex_lock(&con->sock_mutex);
 	set_bit(CF_READ_PENDING, &con->flags);
+	set_bit(CF_WRITE_PENDING, &con->flags);
 	if (con->sock && con->sock->sk)
 		con->sock->sk->sk_user_data = NULL;
 	if (con->othercon && and_other)
@@ -1681,9 +1684,13 @@ static void work_flush(void)
 			hlist_for_each_entry_safe(con, n,
 						  &connection_hash[i], list) {
 				ok &= test_bit(CF_READ_PENDING, &con->flags);
-				if (con->othercon)
+				ok &= test_bit(CF_WRITE_PENDING, &con->flags);
+				if (con->othercon) {
 					ok &= test_bit(CF_READ_PENDING,
 						       &con->othercon->flags);
+					ok &= test_bit(CF_WRITE_PENDING,
+						       &con->othercon->flags);
+				}
 			}
 		}
 	} while (!ok);
-- 
cgit 


From 173a31fe2b23b3ccc45d0b70edb225b1d836c31d Mon Sep 17 00:00:00 2001
From: "tsutomu.owa@toshiba.co.jp" <tsutomu.owa@toshiba.co.jp>
Date: Tue, 12 Sep 2017 09:01:24 +0000
Subject: DLM: use CF_CLOSE flag to stop dlm_send correctly

If reconnection fails while executing dlm_lowcomms_stop,
dlm_send will not stop.

Signed-off-by: Tadashi Miyauchi <miyauchi@toshiba-tops.co.jp>
Signed-off-by: Tsutomu Owa <tsutomu.owa@toshiba.co.jp>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lowcomms.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/dlm')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 306b5fe0866f..215515198edb 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1643,6 +1643,7 @@ static int work_start(void)
 static void _stop_conn(struct connection *con, bool and_other)
 {
 	mutex_lock(&con->sock_mutex);
+	set_bit(CF_CLOSE, &con->flags);
 	set_bit(CF_READ_PENDING, &con->flags);
 	set_bit(CF_WRITE_PENDING, &con->flags);
 	if (con->sock && con->sock->sk)
-- 
cgit 


From 294e7e458763dc1d229cbbe7147a6034bfc6e39a Mon Sep 17 00:00:00 2001
From: "tsutomu.owa@toshiba.co.jp" <tsutomu.owa@toshiba.co.jp>
Date: Fri, 15 Sep 2017 14:17:23 -0500
Subject: DLM: fix conversion deadlock when DLM_LKF_NODLCKWT flag is set

When the DLM_LKF_NODLCKWT flag was set, even if conversion deadlock
was detected, the caller of can_be_granted() was unknown.
We change the behavior of can_be_granted() and change it to detect
conversion deadlock regardless of whether the DLM_LKF_NODLCKWT flag
is set or not. And depending on whether the DLM_LKF_NODLCKWT flag
is set or not, we change the behavior at the caller of can_be_granted().

This fix has no effect except when using DLM_LKF_NODLCKWT flag.
Currently, ocfs2 uses the DLM_LKF_NODLCKWT flag and does not expect a
cancel operation from conversion deadlock when calling dlm_lock().
ocfs2 is implemented to perform a cancel operation by requesting
BASTs (callback).

Signed-off-by: Tadashi Miyauchi <miyauchi@toshiba-tops.co.jp>
Signed-off-by: Tsutomu Owa <tsutomu.owa@toshiba.co.jp>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lock.c | 42 +++++++++++++++++++++++-------------------
 1 file changed, 23 insertions(+), 19 deletions(-)

(limited to 'fs/dlm')

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index d4aaddec1b16..f145a2a9d6cb 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -2465,14 +2465,12 @@ static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now,
 		if (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) {
 			lkb->lkb_grmode = DLM_LOCK_NL;
 			lkb->lkb_sbflags |= DLM_SBF_DEMOTED;
-		} else if (!(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) {
-			if (err)
-				*err = -EDEADLK;
-			else {
-				log_print("can_be_granted deadlock %x now %d",
-					  lkb->lkb_id, now);
-				dlm_dump_rsb(r);
-			}
+		} else if (err) {
+			*err = -EDEADLK;
+		} else {
+			log_print("can_be_granted deadlock %x now %d",
+				  lkb->lkb_id, now);
+			dlm_dump_rsb(r);
 		}
 		goto out;
 	}
@@ -2501,13 +2499,6 @@ static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now,
 	return rv;
 }
 
-/* FIXME: I don't think that can_be_granted() can/will demote or find deadlock
-   for locks pending on the convert list.  Once verified (watch for these
-   log_prints), we should be able to just call _can_be_granted() and not
-   bother with the demote/deadlk cases here (and there's no easy way to deal
-   with a deadlk here, we'd have to generate something like grant_lock with
-   the deadlk error.) */
-
 /* Returns the highest requested mode of all blocked conversions; sets
    cw if there's a blocked conversion to DLM_LOCK_CW. */
 
@@ -2545,9 +2536,22 @@ static int grant_pending_convert(struct dlm_rsb *r, int high, int *cw,
 		}
 
 		if (deadlk) {
-			log_print("WARN: pending deadlock %x node %d %s",
-				  lkb->lkb_id, lkb->lkb_nodeid, r->res_name);
-			dlm_dump_rsb(r);
+			/*
+			 * If DLM_LKB_NODLKWT flag is set and conversion
+			 * deadlock is detected, we request blocking AST and
+			 * down (or cancel) conversion.
+			 */
+			if (lkb->lkb_exflags & DLM_LKF_NODLCKWT) {
+				if (lkb->lkb_highbast < lkb->lkb_rqmode) {
+					queue_bast(r, lkb, lkb->lkb_rqmode);
+					lkb->lkb_highbast = lkb->lkb_rqmode;
+				}
+			} else {
+				log_print("WARN: pending deadlock %x node %d %s",
+					  lkb->lkb_id, lkb->lkb_nodeid,
+					  r->res_name);
+				dlm_dump_rsb(r);
+			}
 			continue;
 		}
 
@@ -3123,7 +3127,7 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
 	   deadlock, so we leave it on the granted queue and return EDEADLK in
 	   the ast for the convert. */
 
-	if (deadlk) {
+	if (deadlk && !(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) {
 		/* it's left on the granted queue */
 		revert_lock(r, lkb);
 		queue_cast(r, lkb, -EDEADLK);
-- 
cgit 


From 3421fb15be01ff4714fed5a5d6db64849b59a1fd Mon Sep 17 00:00:00 2001
From: "tsutomu.owa@toshiba.co.jp" <tsutomu.owa@toshiba.co.jp>
Date: Tue, 12 Sep 2017 09:01:38 +0000
Subject: DLM: fix memory leak in tcp_accept_from_sock()

The sk member of the socket generated by sock_create_kern() is overwritten
by ops->accept(). So the previous sk will not be released.
We use kernel_accept() instead of sock_create_kern() and ops->accept().

Signed-off-by: Tadashi Miyauchi <miyauchi@toshiba-tops.co.jp>
Signed-off-by: Tsutomu Owa <tsutomu.owa@toshiba.co.jp>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lowcomms.c | 21 +++++++--------------
 1 file changed, 7 insertions(+), 14 deletions(-)

(limited to 'fs/dlm')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 215515198edb..a464a8c446ab 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -732,22 +732,14 @@ static int tcp_accept_from_sock(struct connection *con)
 	}
 	mutex_unlock(&connections_lock);
 
-	memset(&peeraddr, 0, sizeof(peeraddr));
-	result = sock_create_lite(dlm_local_addr[0]->ss_family,
-				  SOCK_STREAM, IPPROTO_TCP, &newsock);
-	if (result < 0)
-		return -ENOMEM;
-
 	mutex_lock_nested(&con->sock_mutex, 0);
 
-	result = -ENOTCONN;
-	if (con->sock == NULL)
-		goto accept_err;
-
-	newsock->type = con->sock->type;
-	newsock->ops = con->sock->ops;
+	if (!con->sock) {
+		mutex_unlock(&con->sock_mutex);
+		return -ENOTCONN;
+	}
 
-	result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK, true);
+	result = kernel_accept(con->sock, &newsock, O_NONBLOCK);
 	if (result < 0)
 		goto accept_err;
 
@@ -844,7 +836,8 @@ static int tcp_accept_from_sock(struct connection *con)
 
 accept_err:
 	mutex_unlock(&con->sock_mutex);
-	sock_release(newsock);
+	if (newsock)
+		sock_release(newsock);
 
 	if (result != -EAGAIN)
 		log_print("error accepting connection from node: %d", result);
-- 
cgit 


From ccbbea04326e061651e0b38eda0792c758ce4f91 Mon Sep 17 00:00:00 2001
From: "tsutomu.owa@toshiba.co.jp" <tsutomu.owa@toshiba.co.jp>
Date: Tue, 12 Sep 2017 09:01:46 +0000
Subject: DLM: fix overflow dlm_cb_seq

dlm_cb_seq is 64 bits. If dlm_cb_seq overflows and returns to 0,
dlm_rem_lkb_callback() will not work properly.

Signed-off-by: Tadashi Miyauchi <miyauchi@toshiba-tops.co.jp>
Signed-off-by: Tsutomu Owa <tsutomu.owa@toshiba.co.jp>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/ast.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs/dlm')

diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 07fed838d8fd..562fa8c3edff 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -181,6 +181,8 @@ void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status,
 
 	spin_lock(&dlm_cb_seq_spin);
 	new_seq = ++dlm_cb_seq;
+	if (!dlm_cb_seq)
+		new_seq = ++dlm_cb_seq;
 	spin_unlock(&dlm_cb_seq_spin);
 
 	if (lkb->lkb_flags & DLM_IFL_USER) {
-- 
cgit 


From 93eaadebe9e1c28528f01204cefccc7ba050c913 Mon Sep 17 00:00:00 2001
From: "tsutomu.owa@toshiba.co.jp" <tsutomu.owa@toshiba.co.jp>
Date: Tue, 12 Sep 2017 09:01:55 +0000
Subject: DLM: fix to use sk_callback_lock correctly

In the current implementation, we think that exclusion control between
processing to set the callback function to the connection structure and
processing to refer to the connection structure from the callback function
was not enough. We fix them.

Signed-off-by: Tadashi Miyauchi <miyauchi@toshiba-tops.co.jp>
Signed-off-by: Tsutomu Owa <tsutomu.owa@toshiba.co.jp>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lowcomms.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

(limited to 'fs/dlm')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index a464a8c446ab..5d0de91adc36 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -411,17 +411,23 @@ int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len)
 /* Data available on socket or listen socket received a connect */
 static void lowcomms_data_ready(struct sock *sk)
 {
-	struct connection *con = sock2con(sk);
+	struct connection *con;
+
+	read_lock_bh(&sk->sk_callback_lock);
+	con = sock2con(sk);
 	if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags))
 		queue_work(recv_workqueue, &con->rwork);
+	read_unlock_bh(&sk->sk_callback_lock);
 }
 
 static void lowcomms_write_space(struct sock *sk)
 {
-	struct connection *con = sock2con(sk);
+	struct connection *con;
 
+	read_lock_bh(&sk->sk_callback_lock);
+	con = sock2con(sk);
 	if (!con)
-		return;
+		goto out;
 
 	clear_bit(SOCK_NOSPACE, &con->sock->flags);
 
@@ -431,6 +437,8 @@ static void lowcomms_write_space(struct sock *sk)
 	}
 
 	queue_work(send_workqueue, &con->swork);
+out:
+	read_unlock_bh(&sk->sk_callback_lock);
 }
 
 static inline void lowcomms_connect_sock(struct connection *con)
@@ -797,8 +805,6 @@ static int tcp_accept_from_sock(struct connection *con)
 		mutex_lock_nested(&othercon->sock_mutex, 2);
 		if (!othercon->sock) {
 			newcon->othercon = othercon;
-			othercon->sock = newsock;
-			newsock->sk->sk_user_data = othercon;
 			add_sock(newsock, othercon);
 			addcon = othercon;
 			mutex_unlock(&othercon->sock_mutex);
@@ -812,7 +818,6 @@ static int tcp_accept_from_sock(struct connection *con)
 		}
 	}
 	else {
-		newsock->sk->sk_user_data = newcon;
 		newcon->rx_action = receive_from_sock;
 		/* accept copies the sk after we've saved the callbacks, so we
 		   don't want to save them a second time or comm errors will
@@ -918,8 +923,6 @@ static int sctp_accept_from_sock(struct connection *con)
 		mutex_lock_nested(&othercon->sock_mutex, 2);
 		if (!othercon->sock) {
 			newcon->othercon = othercon;
-			othercon->sock = newsock;
-			newsock->sk->sk_user_data = othercon;
 			add_sock(newsock, othercon);
 			addcon = othercon;
 			mutex_unlock(&othercon->sock_mutex);
@@ -931,7 +934,6 @@ static int sctp_accept_from_sock(struct connection *con)
 			goto accept_err;
 		}
 	} else {
-		newsock->sk->sk_user_data = newcon;
 		newcon->rx_action = receive_from_sock;
 		add_sock(newsock, newcon);
 		addcon = newcon;
@@ -1058,7 +1060,6 @@ static void sctp_connect_to_sock(struct connection *con)
 	if (result < 0)
 		goto socket_err;
 
-	sock->sk->sk_user_data = con;
 	con->rx_action = receive_from_sock;
 	con->connect_action = sctp_connect_to_sock;
 	add_sock(sock, con);
@@ -1143,7 +1144,6 @@ static void tcp_connect_to_sock(struct connection *con)
 		goto out_err;
 	}
 
-	sock->sk->sk_user_data = con;
 	con->rx_action = receive_from_sock;
 	con->connect_action = tcp_connect_to_sock;
 	add_sock(sock, con);
@@ -1233,10 +1233,12 @@ static struct socket *tcp_create_listen_sock(struct connection *con,
 	if (result < 0) {
 		log_print("Failed to set SO_REUSEADDR on socket: %d", result);
 	}
+	write_lock_bh(&sock->sk->sk_callback_lock);
 	sock->sk->sk_user_data = con;
 	save_listen_callbacks(sock);
 	con->rx_action = tcp_accept_from_sock;
 	con->connect_action = tcp_connect_to_sock;
+	write_unlock_bh(&sock->sk->sk_callback_lock);
 
 	/* Bind to our port */
 	make_sockaddr(saddr, dlm_config.ci_tcp_port, &addr_len);
@@ -1639,8 +1641,11 @@ static void _stop_conn(struct connection *con, bool and_other)
 	set_bit(CF_CLOSE, &con->flags);
 	set_bit(CF_READ_PENDING, &con->flags);
 	set_bit(CF_WRITE_PENDING, &con->flags);
-	if (con->sock && con->sock->sk)
+	if (con->sock && con->sock->sk) {
+		write_lock_bh(&con->sock->sk->sk_callback_lock);
 		con->sock->sk->sk_user_data = NULL;
+		write_unlock_bh(&con->sock->sk->sk_callback_lock);
+	}
 	if (con->othercon && and_other)
 		_stop_conn(con->othercon, false);
 	mutex_unlock(&con->sock_mutex);
-- 
cgit 


From 0aa18464c812e0154e8bafc9f60ca8002b3a8e7c Mon Sep 17 00:00:00 2001
From: "tsutomu.owa@toshiba.co.jp" <tsutomu.owa@toshiba.co.jp>
Date: Tue, 12 Sep 2017 09:02:02 +0000
Subject: DLM: fix to reschedule rwork

When an error occurs in kernel_recvmsg or kernel_sendpage and
close_connection is called and receive work is already scheduled,
receive work is canceled. In that case, the receive work will not
be scheduled forever after reconnection, because CF_READ_PENDING
flag is established.

Signed-off-by: Tadashi Miyauchi <miyauchi@toshiba-tops.co.jp>
Signed-off-by: Tsutomu Owa <tsutomu.owa@toshiba.co.jp>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lowcomms.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs/dlm')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 5d0de91adc36..c64e39f76ce8 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -593,10 +593,14 @@ static void close_connection(struct connection *con, bool and_other,
 {
 	bool closing = test_and_set_bit(CF_CLOSING, &con->flags);
 
-	if (tx && !closing && cancel_work_sync(&con->swork))
+	if (tx && !closing && cancel_work_sync(&con->swork)) {
 		log_print("canceled swork for node %d", con->nodeid);
-	if (rx && !closing && cancel_work_sync(&con->rwork))
+		clear_bit(CF_WRITE_PENDING, &con->flags);
+	}
+	if (rx && !closing && cancel_work_sync(&con->rwork)) {
 		log_print("canceled rwork for node %d", con->nodeid);
+		clear_bit(CF_READ_PENDING, &con->flags);
+	}
 
 	mutex_lock(&con->sock_mutex);
 	if (con->sock) {
-- 
cgit 


From 26b41099e7e97d6d44769fd159e822a98c98afa2 Mon Sep 17 00:00:00 2001
From: "tsutomu.owa@toshiba.co.jp" <tsutomu.owa@toshiba.co.jp>
Date: Tue, 12 Sep 2017 09:02:10 +0000
Subject: DLM: fix NULL pointer dereference in send_to_sock()

The writequeue and writequeue_lock member of othercon was not initialized.
If lowcomms_state_change() is called from network layer, othercon->swork
may be scheduled. In this case, send_to_sock() will generate a NULL pointer
reference. We avoid this problem by correctly initializing writequeue and
writequeue_lock member of othercon.

Signed-off-by: Tadashi Miyauchi <miyauchi@toshiba-tops.co.jp>
Signed-off-by: Tsutomu Owa <tsutomu.owa@toshiba.co.jp>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lowcomms.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs/dlm')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index c64e39f76ce8..05707850f93a 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -802,6 +802,8 @@ static int tcp_accept_from_sock(struct connection *con)
 			othercon->nodeid = nodeid;
 			othercon->rx_action = receive_from_sock;
 			mutex_init(&othercon->sock_mutex);
+			INIT_LIST_HEAD(&othercon->writequeue);
+			spin_lock_init(&othercon->writequeue_lock);
 			INIT_WORK(&othercon->swork, process_send_sockets);
 			INIT_WORK(&othercon->rwork, process_recv_sockets);
 			set_bit(CF_IS_OTHERCON, &othercon->flags);
@@ -920,6 +922,8 @@ static int sctp_accept_from_sock(struct connection *con)
 			othercon->nodeid = nodeid;
 			othercon->rx_action = receive_from_sock;
 			mutex_init(&othercon->sock_mutex);
+			INIT_LIST_HEAD(&othercon->writequeue);
+			spin_lock_init(&othercon->writequeue_lock);
 			INIT_WORK(&othercon->swork, process_send_sockets);
 			INIT_WORK(&othercon->rwork, process_recv_sockets);
 			set_bit(CF_IS_OTHERCON, &othercon->flags);
-- 
cgit 


From 9e1b0211c5dd4acbd21a8ec1b86fc38a497a4656 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Mon, 25 Sep 2017 15:47:50 +0800
Subject: dlm: recheck kthread_should_stop() before schedule()

Call schedule() here could make the thread miss wake
up from kthread_stop(), so it is better to recheck
kthread_should_stop() before call schedule(), a symptom
happened when I run indefinite test (which mostly created
clustered raid1, assemble it in other nodes, then stop
them) of clustered raid.

$ ps aux|grep md|grep D
root      4211  0.0  0.0  19760  2220 ?        Ds   02:58   0:00 mdadm -Ssq
$ cat /proc/4211/stack
kthread_stop+0x4d/0x150
dlm_recoverd_stop+0x15/0x20 [dlm]
dlm_release_lockspace+0x2ab/0x460 [dlm]
leave+0xbf/0x150 [md_cluster]
md_cluster_stop+0x18/0x30 [md_mod]
bitmap_free+0x12e/0x140 [md_mod]
bitmap_destroy+0x7f/0x90 [md_mod]
__md_stop+0x21/0xa0 [md_mod]
do_md_stop+0x15f/0x5c0 [md_mod]
md_ioctl+0xa65/0x18a0 [md_mod]
blkdev_ioctl+0x49e/0x8d0
block_ioctl+0x41/0x50
do_vfs_ioctl+0x96/0x5b0
SyS_ioctl+0x79/0x90
entry_SYSCALL_64_fastpath+0x1e/0xad

This maybe not resolve the issue completely since the
KTHREAD_SHOULD_STOP flag could be set between "break"
and "schedule", but at least the chance for the symptom
happen could be reduce a lot (The indefinite test runs
more than 20 hours without problem and it happens easily
without the change).

Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/recoverd.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs/dlm')

diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index d2ad1cab0f05..6f4e1d42d733 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -299,8 +299,11 @@ static int dlm_recoverd(void *arg)
 			break;
 		}
 		if (!test_bit(LSFL_RECOVER_WORK, &ls->ls_flags) &&
-		    !test_bit(LSFL_RECOVER_DOWN, &ls->ls_flags))
+		    !test_bit(LSFL_RECOVER_DOWN, &ls->ls_flags)) {
+			if (kthread_should_stop())
+				break;
 			schedule();
+		}
 		set_current_state(TASK_RUNNING);
 
 		if (test_and_clear_bit(LSFL_RECOVER_DOWN, &ls->ls_flags)) {
-- 
cgit 


From 9250e523592a8ced3ecd14abe29fbb1e036bd7eb Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Mon, 9 Oct 2017 09:29:31 -0500
Subject: dlm: remove dlm_send_rcom_lookup_dump

This function was only for debugging.  It would be
called in a condition that should not happen, and
should probably have been removed from the final
version of the original commit.

Remove it because it does mutex lock under spin lock.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lock.c |  1 -
 fs/dlm/rcom.c | 20 +-------------------
 fs/dlm/rcom.h |  1 -
 3 files changed, 1 insertion(+), 21 deletions(-)

(limited to 'fs/dlm')

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index f145a2a9d6cb..cc91963683de 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1003,7 +1003,6 @@ int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, char *name, int len,
 		if (r->res_master_nodeid == our_nodeid) {
 			log_error(ls, "from_master %d our_master", from_nodeid);
 			dlm_dump_rsb(r);
-			dlm_send_rcom_lookup_dump(r, from_nodeid);
 			goto out_found;
 		}
 
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 4ff061de927e..70c625999d36 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -338,25 +338,6 @@ int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
 	return error;
 }
 
-int dlm_send_rcom_lookup_dump(struct dlm_rsb *r, int to_nodeid)
-{
-	struct dlm_rcom *rc;
-	struct dlm_mhandle *mh;
-	struct dlm_ls *ls = r->res_ls;
-	int error;
-
-	error = create_rcom(ls, to_nodeid, DLM_RCOM_LOOKUP, r->res_length,
-			    &rc, &mh);
-	if (error)
-		goto out;
-	memcpy(rc->rc_buf, r->res_name, r->res_length);
-	rc->rc_id = 0xFFFFFFFF;
-
-	send_rcom(ls, mh, rc);
- out:
-	return error;
-}
-
 static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
 {
 	struct dlm_rcom *rc;
@@ -368,6 +349,7 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
 	if (error)
 		return;
 
+	/* Old code would send this special id to trigger a debug dump. */
 	if (rc_in->rc_id == 0xFFFFFFFF) {
 		log_error(ls, "receive_rcom_lookup dump from %d", nodeid);
 		dlm_dump_rsb_name(ls, rc_in->rc_buf, len);
diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h
index f8e243463c15..206723ab744d 100644
--- a/fs/dlm/rcom.h
+++ b/fs/dlm/rcom.h
@@ -17,7 +17,6 @@
 int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags);
 int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len);
 int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid);
-int dlm_send_rcom_lookup_dump(struct dlm_rsb *r, int to_nodeid);
 int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
 void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid);
 int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in);
-- 
cgit