summaryrefslogtreecommitdiff
path: root/fs/afs/fs_probe.c
diff options
context:
space:
mode:
authorDavid Howells <dhowells@redhat.com>2023-10-31 16:30:37 +0000
committerDavid Howells <dhowells@redhat.com>2024-01-01 16:37:27 +0000
commitf49b594df3ebca53c91f4d6448680463f10aa479 (patch)
treed18f0048178320274cf73b7e62c4cbb3a8ae9b80 /fs/afs/fs_probe.c
parente6a7d7f71b17e0a44e2155bdad47eae7b5368503 (diff)
afs: Keep a record of the current fileserver endpoint state
Keep a record of the current fileserver endpoint state, including the probe state, and replace it when a new probe is started rather than just squelching the old state and overwriting it. Clearance of the old state can cause a race if there's another thread also currently trying to communicate with that server. It appears that this race might be the culprit for some occasions where kafs complains about invalid data in the RPC reply because the rotation algorithm fell all the way through without actually issuing an RPC call and the error return got filled in from the probe state (which has a zero error recorded). Whatever happens to be in the caller's reply buffer is then taken as the response. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org
Diffstat (limited to 'fs/afs/fs_probe.c')
-rw-r--r--fs/afs/fs_probe.c235
1 files changed, 143 insertions, 92 deletions
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
index c5702698b18b..a669aee033c5 100644
--- a/fs/afs/fs_probe.c
+++ b/fs/afs/fs_probe.c
@@ -15,6 +15,42 @@
static unsigned int afs_fs_probe_fast_poll_interval = 30 * HZ;
static unsigned int afs_fs_probe_slow_poll_interval = 5 * 60 * HZ;
+struct afs_endpoint_state *afs_get_endpoint_state(struct afs_endpoint_state *estate,
+ enum afs_estate_trace where)
+{
+ if (estate) {
+ int r;
+
+ __refcount_inc(&estate->ref, &r);
+ trace_afs_estate(estate->server_id, estate->probe_seq, r, where);
+ }
+ return estate;
+}
+
+static void afs_endpoint_state_rcu(struct rcu_head *rcu)
+{
+ struct afs_endpoint_state *estate = container_of(rcu, struct afs_endpoint_state, rcu);
+
+ trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref),
+ afs_estate_trace_free);
+ afs_put_addrlist(estate->addresses, afs_alist_trace_put_estate);
+ kfree(estate);
+}
+
+void afs_put_endpoint_state(struct afs_endpoint_state *estate, enum afs_estate_trace where)
+{
+ if (estate) {
+ unsigned int server_id = estate->server_id, probe_seq = estate->probe_seq;
+ bool dead;
+ int r;
+
+ dead = __refcount_dec_and_test(&estate->ref, &r);
+ trace_afs_estate(server_id, probe_seq, r, where);
+ if (dead)
+ call_rcu(&estate->rcu, afs_endpoint_state_rcu);
+ }
+}
+
/*
* Start the probe polling timer. We have to supply it with an inc on the
* outstanding server count.
@@ -38,9 +74,10 @@ static void afs_schedule_fs_probe(struct afs_net *net,
/*
* Handle the completion of a set of probes.
*/
-static void afs_finished_fs_probe(struct afs_net *net, struct afs_server *server)
+static void afs_finished_fs_probe(struct afs_net *net, struct afs_server *server,
+ struct afs_endpoint_state *estate)
{
- bool responded = server->probe.responded;
+ bool responded = estate->responded;
write_seqlock(&net->fs_lock);
if (responded) {
@@ -50,6 +87,7 @@ static void afs_finished_fs_probe(struct afs_net *net, struct afs_server *server
clear_bit(AFS_SERVER_FL_RESPONDING, &server->flags);
list_add_tail(&server->probe_link, &net->fs_probe_fast);
}
+
write_sequnlock(&net->fs_lock);
afs_schedule_fs_probe(net, server, !responded);
@@ -58,12 +96,13 @@ static void afs_finished_fs_probe(struct afs_net *net, struct afs_server *server
/*
* Handle the completion of a probe.
*/
-static void afs_done_one_fs_probe(struct afs_net *net, struct afs_server *server)
+static void afs_done_one_fs_probe(struct afs_net *net, struct afs_server *server,
+ struct afs_endpoint_state *estate)
{
_enter("");
- if (atomic_dec_and_test(&server->probe_outstanding))
- afs_finished_fs_probe(net, server);
+ if (atomic_dec_and_test(&estate->nr_probing))
+ afs_finished_fs_probe(net, server, estate);
wake_up_all(&server->probe_wq);
}
@@ -74,7 +113,7 @@ static void afs_done_one_fs_probe(struct afs_net *net, struct afs_server *server
*/
static void afs_fs_probe_not_done(struct afs_net *net,
struct afs_server *server,
- struct afs_addr_list *alist,
+ struct afs_endpoint_state *estate,
int index)
{
_enter("");
@@ -82,14 +121,14 @@ static void afs_fs_probe_not_done(struct afs_net *net,
trace_afs_io_error(0, -ENOMEM, afs_io_error_fs_probe_fail);
spin_lock(&server->probe_lock);
- server->probe.local_failure = true;
- if (server->probe.error == 0)
- server->probe.error = -ENOMEM;
+ estate->local_failure = true;
+ if (estate->error == 0)
+ estate->error = -ENOMEM;
- set_bit(index, &alist->probe_failed);
+ set_bit(index, &estate->failed_set);
spin_unlock(&server->probe_lock);
- return afs_done_one_fs_probe(net, server);
+ return afs_done_one_fs_probe(net, server, estate);
}
/*
@@ -98,7 +137,8 @@ static void afs_fs_probe_not_done(struct afs_net *net,
*/
void afs_fileserver_probe_result(struct afs_call *call)
{
- struct afs_addr_list *alist = call->probe_alist;
+ struct afs_endpoint_state *estate = call->probe;
+ struct afs_addr_list *alist = estate->addresses;
struct afs_address *addr = &alist->addrs[call->probe_index];
struct afs_server *server = call->server;
unsigned int index = call->probe_index;
@@ -113,18 +153,18 @@ void afs_fileserver_probe_result(struct afs_call *call)
switch (ret) {
case 0:
- server->probe.error = 0;
+ estate->error = 0;
goto responded;
case -ECONNABORTED:
- if (!server->probe.responded) {
- server->probe.abort_code = call->abort_code;
- server->probe.error = ret;
+ if (!estate->responded) {
+ estate->abort_code = call->abort_code;
+ estate->error = ret;
}
goto responded;
case -ENOMEM:
case -ENONET:
- clear_bit(index, &alist->responded);
- server->probe.local_failure = true;
+ clear_bit(index, &estate->responsive_set);
+ estate->local_failure = true;
trace_afs_io_error(call->debug_id, ret, afs_io_error_fs_probe_fail);
goto out;
case -ECONNRESET: /* Responded, but call expired. */
@@ -137,28 +177,28 @@ void afs_fileserver_probe_result(struct afs_call *call)
case -ETIMEDOUT:
case -ETIME:
default:
- clear_bit(index, &alist->responded);
- set_bit(index, &alist->probe_failed);
- if (!server->probe.responded &&
- (server->probe.error == 0 ||
- server->probe.error == -ETIMEDOUT ||
- server->probe.error == -ETIME))
- server->probe.error = ret;
+ clear_bit(index, &estate->responsive_set);
+ set_bit(index, &estate->failed_set);
+ if (!estate->responded &&
+ (estate->error == 0 ||
+ estate->error == -ETIMEDOUT ||
+ estate->error == -ETIME))
+ estate->error = ret;
trace_afs_io_error(call->debug_id, ret, afs_io_error_fs_probe_fail);
goto out;
}
responded:
- clear_bit(index, &alist->probe_failed);
+ clear_bit(index, &estate->failed_set);
if (call->service_id == YFS_FS_SERVICE) {
- server->probe.is_yfs = true;
+ estate->is_yfs = true;
set_bit(AFS_SERVER_FL_IS_YFS, &server->flags);
server->service_id = call->service_id;
} else {
- server->probe.not_yfs = true;
- if (!server->probe.is_yfs) {
- clear_bit(AFS_SERVER_FL_IS_YFS, &server->flags);
+ estate->not_yfs = true;
+ if (!estate->is_yfs) {
+ estate->is_yfs = false;
server->service_id = call->service_id;
}
cap0 = ntohl(call->tmp);
@@ -169,84 +209,90 @@ responded:
}
rtt_us = rxrpc_kernel_get_srtt(addr->peer);
- if (rtt_us < server->probe.rtt) {
- server->probe.rtt = rtt_us;
+ if (rtt_us < estate->rtt) {
+ estate->rtt = rtt_us;
server->rtt = rtt_us;
alist->preferred = index;
}
smp_wmb(); /* Set rtt before responded. */
- server->probe.responded = true;
- set_bit(index, &alist->responded);
+ estate->responded = true;
+ set_bit(index, &estate->responsive_set);
set_bit(AFS_SERVER_FL_RESPONDING, &server->flags);
out:
spin_unlock(&server->probe_lock);
- trace_afs_fs_probe(server, false, alist, index, call->error, call->abort_code, rtt_us);
- _debug("probe %pU [%u] %pISpc rtt=%d ret=%d",
- &server->uuid, index, rxrpc_kernel_remote_addr(alist->addrs[index].peer),
+ trace_afs_fs_probe(server, false, estate, index, call->error, call->abort_code, rtt_us);
+ _debug("probe[%x] %pU [%u] %pISpc rtt=%d ret=%d",
+ estate->probe_seq, &server->uuid, index,
+ rxrpc_kernel_remote_addr(alist->addrs[index].peer),
rtt_us, ret);
- return afs_done_one_fs_probe(call->net, server);
+ return afs_done_one_fs_probe(call->net, server, estate);
}
/*
- * Probe one or all of a fileserver's addresses to find out the best route and
- * to query its capabilities.
+ * Probe all of a fileserver's addresses to find out the best route and to
+ * query its capabilities.
*/
void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
- struct key *key, bool all)
+ struct afs_addr_list *new_alist, struct key *key)
{
+ struct afs_endpoint_state *estate, *old;
struct afs_addr_list *alist;
- unsigned int index;
+ unsigned long unprobed;
_enter("%pU", &server->uuid);
- read_lock(&server->fs_lock);
- alist = rcu_dereference_protected(server->addresses,
- lockdep_is_held(&server->fs_lock));
- afs_get_addrlist(alist, afs_alist_trace_get_probe);
- read_unlock(&server->fs_lock);
+ estate = kzalloc(sizeof(*estate), GFP_KERNEL);
+ if (!estate)
+ return;
+
+ refcount_set(&estate->ref, 1);
+ estate->server_id = server->debug_id;
+ estate->rtt = UINT_MAX;
+
+ write_lock(&server->fs_lock);
+
+ old = rcu_dereference_protected(server->endpoint_state,
+ lockdep_is_held(&server->fs_lock));
+ estate->responsive_set = old->responsive_set;
+ estate->addresses = afs_get_addrlist(new_alist ?: old->addresses,
+ afs_alist_trace_get_estate);
+ alist = estate->addresses;
+ estate->probe_seq = ++server->probe_counter;
+ atomic_set(&estate->nr_probing, alist->nr_addrs);
+
+ rcu_assign_pointer(server->endpoint_state, estate);
+ old->superseded = true;
+ write_unlock(&server->fs_lock);
+
+ trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref),
+ afs_estate_trace_alloc_probe);
afs_get_address_preferences(net, alist);
server->probed_at = jiffies;
- atomic_set(&server->probe_outstanding, all ? alist->nr_addrs : 1);
- memset(&server->probe, 0, sizeof(server->probe));
- server->probe.rtt = UINT_MAX;
-
- index = alist->preferred;
- if (index < 0 || index >= alist->nr_addrs)
- all = true;
-
- if (all) {
- unsigned long unprobed = (1UL << alist->nr_addrs) - 1;
- unsigned int i;
- int best_prio;
-
- while (unprobed) {
- best_prio = -1;
- index = 0;
- for (i = 0; i < alist->nr_addrs; i++) {
- if (test_bit(i, &unprobed) &&
- alist->addrs[i].prio > best_prio) {
- index = i;
- best_prio = alist->addrs[i].prio;
- }
+ unprobed = (1UL << alist->nr_addrs) - 1;
+ while (unprobed) {
+ unsigned int index = 0, i;
+ int best_prio = -1;
+
+ for (i = 0; i < alist->nr_addrs; i++) {
+ if (test_bit(i, &unprobed) &&
+ alist->addrs[i].prio > best_prio) {
+ index = i;
+ best_prio = alist->addrs[i].prio;
}
- __clear_bit(index, &unprobed);
-
- trace_afs_fs_probe(server, true, alist, index, 0, 0, 0);
- if (!afs_fs_get_capabilities(net, server, alist, index, key))
- afs_fs_probe_not_done(net, server, alist, index);
}
- } else {
- trace_afs_fs_probe(server, true, alist, index, 0, 0, 0);
- if (!afs_fs_get_capabilities(net, server, alist, index, key))
- afs_fs_probe_not_done(net, server, alist, index);
+ __clear_bit(index, &unprobed);
+
+ trace_afs_fs_probe(server, true, estate, index, 0, 0, 0);
+ if (!afs_fs_get_capabilities(net, server, estate, index, key))
+ afs_fs_probe_not_done(net, server, estate, index);
}
- afs_put_addrlist(alist, afs_alist_trace_put_probe);
+ afs_put_endpoint_state(old, afs_estate_trace_put_probe);
}
/*
@@ -254,6 +300,7 @@ void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
*/
int afs_wait_for_fs_probes(struct afs_server_list *slist, unsigned long untried)
{
+ struct afs_endpoint_state *estate;
struct wait_queue_entry *waits;
struct afs_server *server;
unsigned int rtt = UINT_MAX, rtt_s;
@@ -263,15 +310,18 @@ int afs_wait_for_fs_probes(struct afs_server_list *slist, unsigned long untried)
_enter("%u,%lx", slist->nr_servers, untried);
/* Only wait for servers that have a probe outstanding. */
+ rcu_read_lock();
for (i = 0; i < slist->nr_servers; i++) {
if (test_bit(i, &untried)) {
server = slist->servers[i].server;
- if (!atomic_read(&server->probe_outstanding))
+ estate = rcu_dereference(server->endpoint_state);
+ if (!atomic_read(&estate->nr_probing))
__clear_bit(i, &untried);
- if (server->probe.responded)
+ if (estate->responded)
have_responders = true;
}
}
+ rcu_read_unlock();
if (have_responders || !untried)
return 0;
@@ -294,9 +344,9 @@ int afs_wait_for_fs_probes(struct afs_server_list *slist, unsigned long untried)
for (i = 0; i < slist->nr_servers; i++) {
if (test_bit(i, &untried)) {
server = slist->servers[i].server;
- if (server->probe.responded)
+ if (estate->responded)
goto stop;
- if (atomic_read(&server->probe_outstanding))
+ if (atomic_read(&estate->nr_probing))
still_probing = true;
}
}
@@ -348,7 +398,7 @@ void afs_fs_probe_timer(struct timer_list *timer)
/*
* Dispatch a probe to a server.
*/
-static void afs_dispatch_fs_probe(struct afs_net *net, struct afs_server *server, bool all)
+static void afs_dispatch_fs_probe(struct afs_net *net, struct afs_server *server)
__releases(&net->fs_lock)
{
struct key *key = NULL;
@@ -361,7 +411,7 @@ static void afs_dispatch_fs_probe(struct afs_net *net, struct afs_server *server
afs_get_server(server, afs_server_trace_get_probe);
write_sequnlock(&net->fs_lock);
- afs_fs_probe_fileserver(net, server, key, all);
+ afs_fs_probe_fileserver(net, server, NULL, key);
afs_put_server(net, server, afs_server_trace_put_probe);
}
@@ -373,7 +423,7 @@ void afs_probe_fileserver(struct afs_net *net, struct afs_server *server)
{
write_seqlock(&net->fs_lock);
if (!list_empty(&server->probe_link))
- return afs_dispatch_fs_probe(net, server, true);
+ return afs_dispatch_fs_probe(net, server);
write_sequnlock(&net->fs_lock);
}
@@ -433,7 +483,7 @@ again:
_debug("probe %pU", &server->uuid);
if (server && (first_pass || !need_resched())) {
- afs_dispatch_fs_probe(net, server, server == fast);
+ afs_dispatch_fs_probe(net, server);
first_pass = false;
goto again;
}
@@ -457,12 +507,13 @@ again:
/*
* Wait for a probe on a particular fileserver to complete for 2s.
*/
-int afs_wait_for_one_fs_probe(struct afs_server *server, bool is_intr)
+int afs_wait_for_one_fs_probe(struct afs_server *server, struct afs_endpoint_state *estate,
+ bool is_intr)
{
struct wait_queue_entry wait;
unsigned long timo = 2 * HZ;
- if (atomic_read(&server->probe_outstanding) == 0)
+ if (atomic_read(&estate->nr_probing) == 0)
goto dont_wait;
init_wait_entry(&wait, 0);
@@ -470,8 +521,8 @@ int afs_wait_for_one_fs_probe(struct afs_server *server, bool is_intr)
prepare_to_wait_event(&server->probe_wq, &wait,
is_intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
if (timo == 0 ||
- server->probe.responded ||
- atomic_read(&server->probe_outstanding) == 0 ||
+ estate->responded ||
+ atomic_read(&estate->nr_probing) == 0 ||
(is_intr && signal_pending(current)))
break;
timo = schedule_timeout(timo);
@@ -480,7 +531,7 @@ int afs_wait_for_one_fs_probe(struct afs_server *server, bool is_intr)
finish_wait(&server->probe_wq, &wait);
dont_wait:
- if (server->probe.responded)
+ if (estate->responded)
return 0;
if (is_intr && signal_pending(current))
return -ERESTARTSYS;