From 8113aa91360a013ebe00763bb0823b5a41b11c4d Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 3 Jan 2023 19:37:49 +0800 Subject: fs: dlm: fix return value check in dlm_memory_init() It should check 'cb_cache', after calling kmem_cache_create("dlm_cb"). Fixes: 61bed0baa4db ("fs: dlm: use a non-static queue for callbacks") Signed-off-by: Yang Yingliang Signed-off-by: David Teigland --- fs/dlm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/dlm') diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c index eb7a08641fcf..cdbaa452fc05 100644 --- a/fs/dlm/memory.c +++ b/fs/dlm/memory.c @@ -51,7 +51,7 @@ int __init dlm_memory_init(void) cb_cache = kmem_cache_create("dlm_cb", sizeof(struct dlm_callback), __alignof__(struct dlm_callback), 0, NULL); - if (!rsb_cache) + if (!cb_cache) goto cb; return 0; -- cgit From aa7f4a21f6e5b0fa256b4de0924fea506ad801ef Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 4 Jan 2023 16:38:05 -0800 Subject: fs/dlm: Remove "select SRCU" Now that the SRCU Kconfig option is unconditionally selected, there is no longer any point in selecting it. Therefore, remove the "select SRCU" Kconfig statements. Signed-off-by: Paul E. McKenney Signed-off-by: David Teigland --- fs/dlm/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/dlm') diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig index 1105ce3c80cb..b3b86dbdc187 100644 --- a/fs/dlm/Kconfig +++ b/fs/dlm/Kconfig @@ -4,7 +4,6 @@ menuconfig DLM depends on INET depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n) select IP_SCTP - select SRCU help A general purpose distributed lock manager for kernel or userspace applications. -- cgit From aad633dc0cf90093998b1ae0ba9f19b5f1dab644 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 12 Jan 2023 17:10:31 -0500 Subject: fs: dlm: start midcomms before scand The scand kthread can send dlm messages out, especially dlm remove messages to free memory for unused rsb on other nodes. To send out dlm messages, midcomms must be initialized. This patch moves the midcomms start before scand is started. Cc: stable@vger.kernel.org Fixes: e7fd41792fc0 ("[DLM] The core of the DLM for GFS2/CLVM") Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lockspace.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs/dlm') diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index d0b4e2181a5f..99bc96f90779 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -381,23 +381,23 @@ static int threads_start(void) { int error; - error = dlm_scand_start(); + /* Thread for sending/receiving messages for all lockspace's */ + error = dlm_midcomms_start(); if (error) { - log_print("cannot start dlm_scand thread %d", error); + log_print("cannot start dlm midcomms %d", error); goto fail; } - /* Thread for sending/receiving messages for all lockspace's */ - error = dlm_midcomms_start(); + error = dlm_scand_start(); if (error) { - log_print("cannot start dlm midcomms %d", error); - goto scand_fail; + log_print("cannot start dlm_scand thread %d", error); + goto midcomms_fail; } return 0; - scand_fail: - dlm_scand_stop(); + midcomms_fail: + dlm_midcomms_stop(); fail: return error; } -- cgit From 724b6bab0d75f1dc01fdfbf7fe8d4217a5cb90ba Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 12 Jan 2023 17:10:32 -0500 Subject: fs: dlm: fix use after free in midcomms commit While working on processing dlm message in softirq context I experienced the following KASAN use-after-free warning: [ 151.760477] ================================================================== [ 151.761803] BUG: KASAN: use-after-free in dlm_midcomms_commit_mhandle+0x19d/0x4b0 [ 151.763414] Read of size 4 at addr ffff88811a980c60 by task lock_torture/1347 [ 151.765284] CPU: 7 PID: 1347 Comm: lock_torture Not tainted 6.1.0-rc4+ #2828 [ 151.766778] Hardware name: Red Hat KVM/RHEL-AV, BIOS 1.16.0-3.module+el8.7.0+16134+e5908aa2 04/01/2014 [ 151.768726] Call Trace: [ 151.769277] [ 151.769748] dump_stack_lvl+0x5b/0x86 [ 151.770556] print_report+0x180/0x4c8 [ 151.771378] ? kasan_complete_mode_report_info+0x7c/0x1e0 [ 151.772241] ? dlm_midcomms_commit_mhandle+0x19d/0x4b0 [ 151.773069] kasan_report+0x93/0x1a0 [ 151.773668] ? dlm_midcomms_commit_mhandle+0x19d/0x4b0 [ 151.774514] __asan_load4+0x7e/0xa0 [ 151.775089] dlm_midcomms_commit_mhandle+0x19d/0x4b0 [ 151.775890] ? create_message.isra.29.constprop.64+0x57/0xc0 [ 151.776770] send_common+0x19f/0x1b0 [ 151.777342] ? remove_from_waiters+0x60/0x60 [ 151.778017] ? lock_downgrade+0x410/0x410 [ 151.778648] ? __this_cpu_preempt_check+0x13/0x20 [ 151.779421] ? rcu_lockdep_current_cpu_online+0x88/0xc0 [ 151.780292] _convert_lock+0x46/0x150 [ 151.780893] convert_lock+0x7b/0xc0 [ 151.781459] dlm_lock+0x3ac/0x580 [ 151.781993] ? 0xffffffffc0540000 [ 151.782522] ? torture_stop+0x120/0x120 [dlm_locktorture] [ 151.783379] ? dlm_scan_rsbs+0xa70/0xa70 [ 151.784003] ? preempt_count_sub+0xd6/0x130 [ 151.784661] ? is_module_address+0x47/0x70 [ 151.785309] ? torture_stop+0x120/0x120 [dlm_locktorture] [ 151.786166] ? 0xffffffffc0540000 [ 151.786693] ? lockdep_init_map_type+0xc3/0x360 [ 151.787414] ? 0xffffffffc0540000 [ 151.787947] torture_dlm_lock_sync.isra.3+0xe9/0x150 [dlm_locktorture] [ 151.789004] ? torture_stop+0x120/0x120 [dlm_locktorture] [ 151.789858] ? 0xffffffffc0540000 [ 151.790392] ? lock_torture_cleanup+0x20/0x20 [dlm_locktorture] [ 151.791347] ? delay_tsc+0x94/0xc0 [ 151.791898] torture_ex_iter+0xc3/0xea [dlm_locktorture] [ 151.792735] ? torture_start+0x30/0x30 [dlm_locktorture] [ 151.793606] lock_torture+0x177/0x270 [dlm_locktorture] [ 151.794448] ? torture_dlm_lock_sync.isra.3+0x150/0x150 [dlm_locktorture] [ 151.795539] ? lock_torture_stats+0x80/0x80 [dlm_locktorture] [ 151.796476] ? do_raw_spin_lock+0x11e/0x1e0 [ 151.797152] ? mark_held_locks+0x34/0xb0 [ 151.797784] ? _raw_spin_unlock_irqrestore+0x30/0x70 [ 151.798581] ? __kthread_parkme+0x79/0x110 [ 151.799246] ? trace_preempt_on+0x2a/0xf0 [ 151.799902] ? __kthread_parkme+0x79/0x110 [ 151.800579] ? preempt_count_sub+0xd6/0x130 [ 151.801271] ? __kasan_check_read+0x11/0x20 [ 151.801963] ? __kthread_parkme+0xec/0x110 [ 151.802630] ? lock_torture_stats+0x80/0x80 [dlm_locktorture] [ 151.803569] kthread+0x192/0x1d0 [ 151.804104] ? kthread_complete_and_exit+0x30/0x30 [ 151.804881] ret_from_fork+0x1f/0x30 [ 151.805480] [ 151.806111] Allocated by task 1347: [ 151.806681] kasan_save_stack+0x26/0x50 [ 151.807308] kasan_set_track+0x25/0x30 [ 151.807920] kasan_save_alloc_info+0x1e/0x30 [ 151.808609] __kasan_slab_alloc+0x63/0x80 [ 151.809263] kmem_cache_alloc+0x1ad/0x830 [ 151.809916] dlm_allocate_mhandle+0x17/0x20 [ 151.810590] dlm_midcomms_get_mhandle+0x96/0x260 [ 151.811344] _create_message+0x95/0x180 [ 151.811994] create_message.isra.29.constprop.64+0x57/0xc0 [ 151.812880] send_common+0x129/0x1b0 [ 151.813467] _convert_lock+0x46/0x150 [ 151.814074] convert_lock+0x7b/0xc0 [ 151.814648] dlm_lock+0x3ac/0x580 [ 151.815199] torture_dlm_lock_sync.isra.3+0xe9/0x150 [dlm_locktorture] [ 151.816258] torture_ex_iter+0xc3/0xea [dlm_locktorture] [ 151.817129] lock_torture+0x177/0x270 [dlm_locktorture] [ 151.817986] kthread+0x192/0x1d0 [ 151.818518] ret_from_fork+0x1f/0x30 [ 151.819369] Freed by task 1336: [ 151.819890] kasan_save_stack+0x26/0x50 [ 151.820514] kasan_set_track+0x25/0x30 [ 151.821128] kasan_save_free_info+0x2e/0x50 [ 151.821812] __kasan_slab_free+0x107/0x1a0 [ 151.822483] kmem_cache_free+0x204/0x5e0 [ 151.823152] dlm_free_mhandle+0x18/0x20 [ 151.823781] dlm_mhandle_release+0x2e/0x40 [ 151.824454] rcu_core+0x583/0x1330 [ 151.825047] rcu_core_si+0xe/0x20 [ 151.825594] __do_softirq+0xf4/0x5c2 [ 151.826450] Last potentially related work creation: [ 151.827238] kasan_save_stack+0x26/0x50 [ 151.827870] __kasan_record_aux_stack+0xa2/0xc0 [ 151.828609] kasan_record_aux_stack_noalloc+0xb/0x20 [ 151.829415] call_rcu+0x4c/0x760 [ 151.829954] dlm_mhandle_delete+0x97/0xb0 [ 151.830718] dlm_process_incoming_buffer+0x2fc/0xb30 [ 151.831524] process_dlm_messages+0x16e/0x470 [ 151.832245] process_one_work+0x505/0xa10 [ 151.832905] worker_thread+0x67/0x650 [ 151.833507] kthread+0x192/0x1d0 [ 151.834046] ret_from_fork+0x1f/0x30 [ 151.834900] The buggy address belongs to the object at ffff88811a980c30 which belongs to the cache dlm_mhandle of size 88 [ 151.836894] The buggy address is located 48 bytes inside of 88-byte region [ffff88811a980c30, ffff88811a980c88) [ 151.839007] The buggy address belongs to the physical page: [ 151.839904] page:0000000076cf5d62 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x11a980 [ 151.841378] flags: 0x8000000000000200(slab|zone=2) [ 151.842141] raw: 8000000000000200 0000000000000000 dead000000000122 ffff8881089b43c0 [ 151.843401] raw: 0000000000000000 0000000000220022 00000001ffffffff 0000000000000000 [ 151.844640] page dumped because: kasan: bad access detected [ 151.845822] Memory state around the buggy address: [ 151.846602] ffff88811a980b00: fb fb fb fb fc fc fc fc fa fb fb fb fb fb fb fb [ 151.847761] ffff88811a980b80: fb fb fb fc fc fc fc fa fb fb fb fb fb fb fb fb [ 151.848921] >ffff88811a980c00: fb fb fc fc fc fc fa fb fb fb fb fb fb fb fb fb [ 151.850076] ^ [ 151.851085] ffff88811a980c80: fb fc fc fc fc fa fb fb fb fb fb fb fb fb fb fb [ 151.852269] ffff88811a980d00: fc fc fc fc fa fb fb fb fb fb fb fb fb fb fb fc [ 151.853428] ================================================================== [ 151.855618] Disabling lock debugging due to kernel taint It is accessing a mhandle in dlm_midcomms_commit_mhandle() and the mhandle was freed by a call_rcu() call in dlm_process_incoming_buffer(), dlm_mhandle_delete(). It looks like it was freed because an ack of this message was received. There is a short race between committing the dlm message to be transmitted and getting an ack back. If the ack is faster than returning from dlm_midcomms_commit_msg_3_2(), then we run into a use-after free because we still need to reference the mhandle when calling srcu_read_unlock(). To avoid that, we don't allow that mhandle to be freed between dlm_midcomms_commit_msg_3_2() and srcu_read_unlock() by using rcu read lock. We can do that because mhandle is protected by rcu handling. Cc: stable@vger.kernel.org Fixes: 489d8e559c65 ("fs: dlm: add reliable connection if reconnect") Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/midcomms.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs/dlm') diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index fc015a6abe17..2e60d9a2c883 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -1214,8 +1214,15 @@ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, dlm_free_mhandle(mh); break; case DLM_VERSION_3_2: + /* held rcu read lock here, because we sending the + * dlm message out, when we do that we could receive + * an ack back which releases the mhandle and we + * get a use after free. + */ + rcu_read_lock(); dlm_midcomms_commit_msg_3_2(mh, name, namelen); srcu_read_unlock(&nodes_srcu, mh->idx); + rcu_read_unlock(); break; default: srcu_read_unlock(&nodes_srcu, mh->idx); -- cgit From 7354fa4ef697191effedc2ae9a8293427708bbf5 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 12 Jan 2023 17:10:33 -0500 Subject: fs: dlm: be sure to call dlm_send_queue_flush() If we release a midcomms node structure, there should be nothing left inside the dlm midcomms send queue. However, sometimes this is not true because I believe some DLM_FIN message was not acked... if we run into a shutdown timeout, then we should be sure there is no pending send dlm message inside this queue when releasing midcomms node structure. Cc: stable@vger.kernel.org Fixes: 489d8e559c65 ("fs: dlm: add reliable connection if reconnect") Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/midcomms.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/dlm') diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 2e60d9a2c883..a3eb19c8cec5 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -1402,6 +1402,7 @@ static void midcomms_node_release(struct rcu_head *rcu) struct midcomms_node *node = container_of(rcu, struct midcomms_node, rcu); WARN_ON_ONCE(atomic_read(&node->send_queue_cnt)); + dlm_send_queue_flush(node); kfree(node); } -- cgit From 164272113b685927126c938b4a9cbd2075eb15ee Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 12 Jan 2023 17:10:34 -0500 Subject: fs: dlm: fix race setting stop tx flag This patch sets the stop tx flag before we commit the dlm message. This flag will report about unexpected transmissions after we send the DLM_FIN message out, which should be the last message sent. When we commit the dlm fin message, it could be that we already got an ack back and the CLOSED state change already happened. We should not set this flag when we are in CLOSED state. To avoid this race we simply set the tx flag before the state change can be in progress by moving it before dlm_midcomms_commit_mhandle(). Cc: stable@vger.kernel.org Fixes: 489d8e559c65 ("fs: dlm: add reliable connection if reconnect") Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/midcomms.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/dlm') diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index a3eb19c8cec5..9d459d5bf800 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -406,6 +406,7 @@ static int dlm_send_fin(struct midcomms_node *node, if (!mh) return -ENOMEM; + set_bit(DLM_NODE_FLAG_STOP_TX, &node->flags); mh->ack_rcv = ack_rcv; m_header = (struct dlm_header *)ppc; @@ -417,7 +418,6 @@ static int dlm_send_fin(struct midcomms_node *node, pr_debug("sending fin msg to node %d\n", node->nodeid); dlm_midcomms_commit_mhandle(mh, NULL, 0); - set_bit(DLM_NODE_FLAG_STOP_TX, &node->flags); return 0; } -- cgit From 15c63db8e86a72e0d5cfb9bf0cd1870e39a3e5fe Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 12 Jan 2023 17:10:35 -0500 Subject: fs: dlm: don't set stop rx flag after node reset Similar to the stop tx flag, the rx flag should warn about a dlm message being received at DLM_FIN state change, when we are assuming no other dlm application messages. If we receive a FIN message and we are in the state DLM_FIN_WAIT2 we call midcomms_node_reset() which puts the midcomms node into DLM_CLOSED state. Afterwards we should not set the DLM_NODE_FLAG_STOP_RX flag any more. This patch changes the setting DLM_NODE_FLAG_STOP_RX in those state changes when we receive a FIN message and we assume there will be no other dlm application messages received until we hit DLM_CLOSED state. Cc: stable@vger.kernel.org Fixes: 489d8e559c65 ("fs: dlm: add reliable connection if reconnect") Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/midcomms.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/dlm') diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 9d459d5bf800..1fef99214204 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -524,6 +524,7 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, break; case DLM_FIN_WAIT1: node->state = DLM_CLOSING; + set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); pr_debug("switch node %d to state %s\n", node->nodeid, dlm_state_str(node->state)); break; @@ -544,8 +545,6 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, return; } spin_unlock(&node->state_lock); - - set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); break; default: WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); -- cgit From a58496361802070996f9bd76e941d109c4a85ebd Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 12 Jan 2023 17:10:36 -0500 Subject: fs: dlm: move sending fin message into state change handling This patch moves the send fin handling, which should appear in a specific state change, into the state change handling while the per node state_lock is held. I experienced issues with other messages because we changed the state and a fin message was sent out in a different state. Cc: stable@vger.kernel.org Fixes: 489d8e559c65 ("fs: dlm: add reliable connection if reconnect") Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/midcomms.c | 33 +++++++++------------------------ 1 file changed, 9 insertions(+), 24 deletions(-) (limited to 'fs/dlm') diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 1fef99214204..fd9f413cabcd 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -402,7 +402,7 @@ static int dlm_send_fin(struct midcomms_node *node, struct dlm_mhandle *mh; char *ppc; - mh = dlm_midcomms_get_mhandle(node->nodeid, mb_len, GFP_NOFS, &ppc); + mh = dlm_midcomms_get_mhandle(node->nodeid, mb_len, GFP_ATOMIC, &ppc); if (!mh) return -ENOMEM; @@ -518,8 +518,8 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, node->state = DLM_LAST_ACK; pr_debug("switch node %d to state %s case 1\n", node->nodeid, dlm_state_str(node->state)); - spin_unlock(&node->state_lock); - goto send_fin; + set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); + dlm_send_fin(node, dlm_pas_fin_ack_rcv); } break; case DLM_FIN_WAIT1: @@ -563,12 +563,6 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, log_print_ratelimited("ignore dlm msg because seq mismatch, seq: %u, expected: %u, nodeid: %d", seq, node->seq_next, node->nodeid); } - - return; - -send_fin: - set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); - dlm_send_fin(node, dlm_pas_fin_ack_rcv); } static struct midcomms_node * @@ -1368,11 +1362,11 @@ void dlm_midcomms_remove_member(int nodeid) case DLM_CLOSE_WAIT: /* passive shutdown DLM_LAST_ACK case 2 */ node->state = DLM_LAST_ACK; - spin_unlock(&node->state_lock); - pr_debug("switch node %d to state %s case 2\n", node->nodeid, dlm_state_str(node->state)); - goto send_fin; + set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); + dlm_send_fin(node, dlm_pas_fin_ack_rcv); + break; case DLM_LAST_ACK: /* probably receive fin caught it, do nothing */ break; @@ -1388,12 +1382,6 @@ void dlm_midcomms_remove_member(int nodeid) spin_unlock(&node->state_lock); srcu_read_unlock(&nodes_srcu, idx); - return; - -send_fin: - set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); - dlm_send_fin(node, dlm_pas_fin_ack_rcv); - srcu_read_unlock(&nodes_srcu, idx); } static void midcomms_node_release(struct rcu_head *rcu) @@ -1425,6 +1413,7 @@ static void midcomms_shutdown(struct midcomms_node *node) node->state = DLM_FIN_WAIT1; pr_debug("switch node %d to state %s case 2\n", node->nodeid, dlm_state_str(node->state)); + dlm_send_fin(node, dlm_act_fin_ack_rcv); break; case DLM_CLOSED: /* we have what we want */ @@ -1438,12 +1427,8 @@ static void midcomms_shutdown(struct midcomms_node *node) } spin_unlock(&node->state_lock); - if (node->state == DLM_FIN_WAIT1) { - dlm_send_fin(node, dlm_act_fin_ack_rcv); - - if (DLM_DEBUG_FENCE_TERMINATION) - msleep(5000); - } + if (DLM_DEBUG_FENCE_TERMINATION) + msleep(5000); /* wait for other side dlm + fin */ ret = wait_event_timeout(node->shutdown_wait, -- cgit From 00908b3388255fc1d3782b744d07f327712f401f Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 12 Jan 2023 17:10:37 -0500 Subject: fs: dlm: send FIN ack back in right cases This patch moves to send a ack back for receiving a FIN message only when we are in valid states. In other cases and there might be a sender waiting for a ack we just let it timeout at the senders time and hopefully all other cleanups will remove the FIN message on their sending queue. As an example we should never send out an ACK being in LAST_ACK state or we cannot assume a working socket communication when we are in CLOSED state. Cc: stable@vger.kernel.org Fixes: 489d8e559c65 ("fs: dlm: add reliable connection if reconnect") Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/midcomms.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs/dlm') diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index fd9f413cabcd..ecfb3beb0bb8 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -375,7 +375,7 @@ static int dlm_send_ack(int nodeid, uint32_t seq) struct dlm_msg *msg; char *ppc; - msg = dlm_lowcomms_new_msg(nodeid, mb_len, GFP_NOFS, &ppc, + msg = dlm_lowcomms_new_msg(nodeid, mb_len, GFP_ATOMIC, &ppc, NULL, NULL); if (!msg) return -ENOMEM; @@ -498,15 +498,14 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, switch (p->header.h_cmd) { case DLM_FIN: - /* send ack before fin */ - dlm_send_ack(node->nodeid, node->seq_next); - spin_lock(&node->state_lock); pr_debug("receive fin msg from node %d with state %s\n", node->nodeid, dlm_state_str(node->state)); switch (node->state) { case DLM_ESTABLISHED: + dlm_send_ack(node->nodeid, node->seq_next); + node->state = DLM_CLOSE_WAIT; pr_debug("switch node %d to state %s\n", node->nodeid, dlm_state_str(node->state)); @@ -523,12 +522,14 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, } break; case DLM_FIN_WAIT1: + dlm_send_ack(node->nodeid, node->seq_next); node->state = DLM_CLOSING; set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); pr_debug("switch node %d to state %s\n", node->nodeid, dlm_state_str(node->state)); break; case DLM_FIN_WAIT2: + dlm_send_ack(node->nodeid, node->seq_next); midcomms_node_reset(node); pr_debug("switch node %d to state %s\n", node->nodeid, dlm_state_str(node->state)); -- cgit From 54fbe0c15d9a8072e2db7f6765f50c995834e2d7 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 12 Jan 2023 17:18:42 -0500 Subject: fs: dlm: bring back previous shutdown handling This patch mostly reverts commit 4f567acb0b86 ("fs: dlm: remove socket shutdown handling"). There can be situations where the dlm midcomms nodes hash and lowcomms connection hash are not equal, but we need to guarantee that the lowcomms are all closed on a last release of a dlm lockspace, when a shutdown is invoked. This patch guarantees that we always close all sockets managed by the lowcomms connection hash, and calls shutdown for the last message sent. This ensures we don't cut the socket, which could cause the peer to get a connection reset. In future we should try to merge the midcomms/lowcomms hashes into one hash and not handle both in separate hashes. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 77 ++++++++++++++++++++++++++++++++++++++++--------------- fs/dlm/midcomms.c | 20 +++++---------- 2 files changed, 63 insertions(+), 34 deletions(-) (limited to 'fs/dlm') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 4450721ec83c..61cd6c2628fa 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -61,6 +61,7 @@ #include "memory.h" #include "config.h" +#define DLM_SHUTDOWN_WAIT_TIMEOUT msecs_to_jiffies(5000) #define NEEDED_RMEM (4*1024*1024) struct connection { @@ -99,6 +100,7 @@ struct connection { struct connection *othercon; struct work_struct rwork; /* receive worker */ struct work_struct swork; /* send worker */ + wait_queue_head_t shutdown_wait; unsigned char rx_leftover_buf[DLM_MAX_SOCKET_BUFSIZE]; int rx_leftover; int mark; @@ -282,6 +284,7 @@ static void dlm_con_init(struct connection *con, int nodeid) INIT_WORK(&con->swork, process_send_sockets); INIT_WORK(&con->rwork, process_recv_sockets); spin_lock_init(&con->addrs_lock); + init_waitqueue_head(&con->shutdown_wait); } /* @@ -790,6 +793,43 @@ static void close_connection(struct connection *con, bool and_other) up_write(&con->sock_lock); } +static void shutdown_connection(struct connection *con, bool and_other) +{ + int ret; + + if (con->othercon && and_other) + shutdown_connection(con->othercon, false); + + flush_workqueue(io_workqueue); + down_read(&con->sock_lock); + /* nothing to shutdown */ + if (!con->sock) { + up_read(&con->sock_lock); + return; + } + + ret = kernel_sock_shutdown(con->sock, SHUT_WR); + up_read(&con->sock_lock); + if (ret) { + log_print("Connection %p failed to shutdown: %d will force close", + con, ret); + goto force_close; + } else { + ret = wait_event_timeout(con->shutdown_wait, !con->sock, + DLM_SHUTDOWN_WAIT_TIMEOUT); + if (ret == 0) { + log_print("Connection %p shutdown timed out, will force close", + con); + goto force_close; + } + } + + return; + +force_close: + close_connection(con, false); +} + static struct processqueue_entry *new_processqueue_entry(int nodeid, int buflen) { @@ -1488,6 +1528,7 @@ static void process_recv_sockets(struct work_struct *work) break; case DLM_IO_EOF: close_connection(con, false); + wake_up(&con->shutdown_wait); /* CF_RECV_PENDING cleared */ break; case DLM_IO_RESCHED: @@ -1695,6 +1736,9 @@ static int work_start(void) void dlm_lowcomms_shutdown(void) { + struct connection *con; + int i, idx; + /* stop lowcomms_listen_data_ready calls */ lock_sock(listen_con.sock->sk); listen_con.sock->sk->sk_data_ready = listen_sock.sk_data_ready; @@ -1703,29 +1747,20 @@ void dlm_lowcomms_shutdown(void) cancel_work_sync(&listen_con.rwork); dlm_close_sock(&listen_con.sock); - flush_workqueue(process_workqueue); -} - -void dlm_lowcomms_shutdown_node(int nodeid, bool force) -{ - struct connection *con; - int idx; - idx = srcu_read_lock(&connections_srcu); - con = nodeid2con(nodeid, 0); - if (WARN_ON_ONCE(!con)) { - srcu_read_unlock(&connections_srcu, idx); - return; - } + for (i = 0; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry_rcu(con, &connection_hash[i], list) { + shutdown_connection(con, true); + stop_connection_io(con); + flush_workqueue(process_workqueue); + close_connection(con, true); - flush_work(&con->swork); - stop_connection_io(con); - WARN_ON_ONCE(!force && !list_empty(&con->writequeue)); - close_connection(con, true); - clean_one_writequeue(con); - if (con->othercon) - clean_one_writequeue(con->othercon); - allow_connection_io(con); + clean_one_writequeue(con); + if (con->othercon) + clean_one_writequeue(con->othercon); + allow_connection_io(con); + } + } srcu_read_unlock(&connections_srcu, idx); } diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index ecfb3beb0bb8..ecd81018d1cf 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -1418,8 +1418,7 @@ static void midcomms_shutdown(struct midcomms_node *node) break; case DLM_CLOSED: /* we have what we want */ - spin_unlock(&node->state_lock); - return; + break; default: /* busy to enter DLM_FIN_WAIT1, wait until passive * done in shutdown_wait to enter DLM_CLOSED. @@ -1436,17 +1435,12 @@ static void midcomms_shutdown(struct midcomms_node *node) node->state == DLM_CLOSED || test_bit(DLM_NODE_FLAG_CLOSE, &node->flags), DLM_SHUTDOWN_TIMEOUT); - if (!ret || test_bit(DLM_NODE_FLAG_CLOSE, &node->flags)) { + if (!ret || test_bit(DLM_NODE_FLAG_CLOSE, &node->flags)) pr_debug("active shutdown timed out for node %d with state %s\n", node->nodeid, dlm_state_str(node->state)); - midcomms_node_reset(node); - dlm_lowcomms_shutdown_node(node->nodeid, true); - return; - } - - pr_debug("active shutdown done for node %d with state %s\n", - node->nodeid, dlm_state_str(node->state)); - dlm_lowcomms_shutdown_node(node->nodeid, false); + else + pr_debug("active shutdown done for node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); } void dlm_midcomms_shutdown(void) @@ -1454,8 +1448,6 @@ void dlm_midcomms_shutdown(void) struct midcomms_node *node; int i, idx; - dlm_lowcomms_shutdown(); - mutex_lock(&close_lock); idx = srcu_read_lock(&nodes_srcu); for (i = 0; i < CONN_HASH_SIZE; i++) { @@ -1473,6 +1465,8 @@ void dlm_midcomms_shutdown(void) } srcu_read_unlock(&nodes_srcu, idx); mutex_unlock(&close_lock); + + dlm_lowcomms_shutdown(); } int dlm_midcomms_close(int nodeid) -- cgit From 89835b064fe7421a60e40770c27d38024190a0c8 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 12 Jan 2023 17:18:43 -0500 Subject: fs: dlm: ignore unexpected non dlm opts msgs This patch ignores unexpected RCOM_NAMES/RCOM_STATUS messages. To be backwards compatible, those messages are not part of the new reliable DLM OPTS encapsulation header, and have their own retransmit handling using sequence number matching When we get unexpected non dlm opts messages, we should allow them and let RCOM message handling filter them out using sequence numbers. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/midcomms.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'fs/dlm') diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index ecd81018d1cf..dbc998b2748b 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -606,16 +606,8 @@ dlm_midcomms_recv_node_lookup(int nodeid, const union dlm_packet *p, case DLM_ESTABLISHED: break; default: - /* some invalid state passive shutdown - * was failed, we try to reset and - * hope it will go on. - */ - log_print("reset node %d because shutdown stuck", - node->nodeid); - - midcomms_node_reset(node); - node->state = DLM_ESTABLISHED; - break; + spin_unlock(&node->state_lock); + return NULL; } spin_unlock(&node->state_lock); } -- cgit From b8b750e0c99f39223115f2672ac4cfa96ecb9edd Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 12 Jan 2023 17:18:44 -0500 Subject: fs: dlm: wait until all midcomms nodes detect version The current dlm version detection is very complex due to backwards compatablilty with earlier dlm protocol versions. It takes some time to detect if a peer node has a specific DLM version. If it's not detected, we just cut the socket connection. There could be cases where the local node has not detected the version yet, but the peer node has. In these cases, we are trying to shutdown the dlm connection with a FIN/ACK message exchange to be sure the other peer is ready to shutdown the connection on dlm application level. However this mechanism is only available on DLM protocol version 3.2 and we need to be sure the DLM version is detected before. To make it more robust we introduce a a "best effort" wait to wait for the version detection before shutdown the dlm connection. This need to be done before the kthread recoverd for recovery handling is stopped, because recovery handling will trigger enough messages to have a version detection going on. It is a corner case which was detected by modprobe dlm_locktroture module and rmmod dlm_locktorture module directly afterwards (in a looping behaviour). In practice probably nobody would leave a lockspace immediately after joining it. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lockspace.c | 3 +++ fs/dlm/midcomms.c | 23 +++++++++++++++++++++++ fs/dlm/midcomms.h | 1 + 3 files changed, 27 insertions(+) (limited to 'fs/dlm') diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 99bc96f90779..d9dc0b734002 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -820,6 +820,9 @@ static int release_lockspace(struct dlm_ls *ls, int force) return rv; } + if (ls_count == 1) + dlm_midcomms_version_wait(); + dlm_device_deregister(ls); if (force < 3 && dlm_user_daemon_available()) diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index dbc998b2748b..cf91a5a11b4f 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -657,6 +657,7 @@ static int dlm_midcomms_version_check_3_2(struct midcomms_node *node) switch (node->version) { case DLM_VERSION_NOT_SET: node->version = DLM_VERSION_3_2; + wake_up(&node->shutdown_wait); log_print("version 0x%08x for node %d detected", DLM_VERSION_3_2, node->nodeid); break; @@ -826,6 +827,7 @@ static int dlm_midcomms_version_check_3_1(struct midcomms_node *node) switch (node->version) { case DLM_VERSION_NOT_SET: node->version = DLM_VERSION_3_1; + wake_up(&node->shutdown_wait); log_print("version 0x%08x for node %d detected", DLM_VERSION_3_1, node->nodeid); break; @@ -1386,6 +1388,27 @@ static void midcomms_node_release(struct rcu_head *rcu) kfree(node); } +void dlm_midcomms_version_wait(void) +{ + struct midcomms_node *node; + int i, idx, ret; + + idx = srcu_read_lock(&nodes_srcu); + for (i = 0; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry_rcu(node, &node_hash[i], hlist) { + ret = wait_event_timeout(node->shutdown_wait, + node->version != DLM_VERSION_NOT_SET || + node->state == DLM_CLOSED || + test_bit(DLM_NODE_FLAG_CLOSE, &node->flags), + DLM_SHUTDOWN_TIMEOUT); + if (!ret || test_bit(DLM_NODE_FLAG_CLOSE, &node->flags)) + pr_debug("version wait timed out for node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + } + } + srcu_read_unlock(&nodes_srcu, idx); +} + static void midcomms_shutdown(struct midcomms_node *node) { int ret; diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h index bea1cee4279c..9f8c9605013d 100644 --- a/fs/dlm/midcomms.h +++ b/fs/dlm/midcomms.h @@ -20,6 +20,7 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, gfp_t allocation, char **ppc); void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, const void *name, int namelen); +void dlm_midcomms_version_wait(void); int dlm_midcomms_close(int nodeid); int dlm_midcomms_start(void); void dlm_midcomms_stop(void); -- cgit From 317dd6ba6ccaa5e7c24039955c39df84072d5166 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 12 Jan 2023 17:18:45 -0500 Subject: fs: dlm: make dlm sequence id more robust When joining a new lockspace, use a random number to initialize a sequence number used in messages. This makes it easier to detect sequence number mismatches in message replies during tests that repeatedly join and leave a lockspace. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lockspace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/dlm') diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index d9dc0b734002..9f344d76afa3 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -572,7 +572,7 @@ static int new_lockspace(const char *name, const char *cluster, spin_lock_init(&ls->ls_rcom_spin); get_random_bytes(&ls->ls_rcom_seq, sizeof(uint64_t)); ls->ls_recover_status = 0; - ls->ls_recover_seq = 0; + ls->ls_recover_seq = get_random_u64(); ls->ls_recover_args = NULL; init_rwsem(&ls->ls_in_recovery); init_rwsem(&ls->ls_recv_active); -- cgit From 11605353f27c23902b6fb993a57bd2f0baaff3b6 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 12 Jan 2023 17:18:46 -0500 Subject: fs: dlm: reduce the shutdown timeout to 5 secs When a shutdown is stuck, time out after 5 seconds instead of 3 minutes. After this timeout we try a forced shutdown. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/midcomms.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/dlm') diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index cf91a5a11b4f..44e29f99de4e 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -146,8 +146,8 @@ /* init value for sequence numbers for testing purpose only e.g. overflows */ #define DLM_SEQ_INIT 0 -/* 3 minutes wait to sync ending of dlm */ -#define DLM_SHUTDOWN_TIMEOUT msecs_to_jiffies(3 * 60 * 1000) +/* 5 seconds wait to sync ending of dlm */ +#define DLM_SHUTDOWN_TIMEOUT msecs_to_jiffies(5000) #define DLM_VERSION_NOT_SET 0 struct midcomms_node { -- cgit From 3186409711e64ed0a0fa767fad555a5e655022da Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 12 Jan 2023 17:18:47 -0500 Subject: fs: dlm: remove newline in log_print There is an API difference between log_print() and other printk()s to put a newline or not. This one was introduced by mistake because log_print() adds a newline. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/midcomms.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/dlm') diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 44e29f99de4e..7827d049378c 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -467,7 +467,7 @@ static void dlm_pas_fin_ack_rcv(struct midcomms_node *node) break; default: spin_unlock(&node->state_lock); - log_print("%s: unexpected state: %d\n", + log_print("%s: unexpected state: %d", __func__, node->state); WARN_ON_ONCE(1); return; @@ -540,7 +540,7 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, break; default: spin_unlock(&node->state_lock); - log_print("%s: unexpected state: %d\n", + log_print("%s: unexpected state: %d", __func__, node->state); WARN_ON_ONCE(1); return; @@ -1269,7 +1269,7 @@ static void dlm_act_fin_ack_rcv(struct midcomms_node *node) break; default: spin_unlock(&node->state_lock); - log_print("%s: unexpected state: %d\n", + log_print("%s: unexpected state: %d", __func__, node->state); WARN_ON_ONCE(1); return; @@ -1369,7 +1369,7 @@ void dlm_midcomms_remove_member(int nodeid) /* already gone, do nothing */ break; default: - log_print("%s: unexpected state: %d\n", + log_print("%s: unexpected state: %d", __func__, node->state); break; } -- cgit From ef7ef015eb333c8954d7165d216ec0c774d7e4e4 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 12 Jan 2023 17:18:48 -0500 Subject: fs: dlm: move state change into else branch Currently we can switch at first into DLM_CLOSE_WAIT state and then do another state change if a condition is true. Instead of doing two state changes we handle the other state change inside an else branch of this condition. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/midcomms.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs/dlm') diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 7827d049378c..69e6f4705ebc 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -506,9 +506,6 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, case DLM_ESTABLISHED: dlm_send_ack(node->nodeid, node->seq_next); - node->state = DLM_CLOSE_WAIT; - pr_debug("switch node %d to state %s\n", - node->nodeid, dlm_state_str(node->state)); /* passive shutdown DLM_LAST_ACK case 1 * additional we check if the node is used by * cluster manager events at all. @@ -519,6 +516,10 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, node->nodeid, dlm_state_str(node->state)); set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); dlm_send_fin(node, dlm_pas_fin_ack_rcv); + } else { + node->state = DLM_CLOSE_WAIT; + pr_debug("switch node %d to state %s\n", + node->nodeid, dlm_state_str(node->state)); } break; case DLM_FIN_WAIT1: -- cgit From 723b197bbdf1e0adbab772b8e5e022c40db6a9fe Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 12 Jan 2023 17:18:49 -0500 Subject: fs: dlm: remove unnecessary waker_up() calls The wake_up() is already handled inside of midcomms_node_reset() when switching the state to CLOSED state. So there is not need to call it after midcomms_node_reset() again. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/midcomms.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/dlm') diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 69e6f4705ebc..c02c43e4980a 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -534,7 +534,6 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, midcomms_node_reset(node); pr_debug("switch node %d to state %s\n", node->nodeid, dlm_state_str(node->state)); - wake_up(&node->shutdown_wait); break; case DLM_LAST_ACK: /* probably remove_member caught it, do nothing */ @@ -1262,7 +1261,6 @@ static void dlm_act_fin_ack_rcv(struct midcomms_node *node) midcomms_node_reset(node); pr_debug("switch node %d to state %s\n", node->nodeid, dlm_state_str(node->state)); - wake_up(&node->shutdown_wait); break; case DLM_CLOSED: /* not valid but somehow we got what we want */ -- cgit