summaryrefslogtreecommitdiff
path: root/net/rxrpc/io_thread.c
diff options
context:
space:
mode:
authorJakub Kicinski <kuba@kernel.org>2024-12-09 13:48:35 -0800
committerJakub Kicinski <kuba@kernel.org>2024-12-09 13:48:37 -0800
commitf9663b7cafa5e15b700efc1fdfabe33e31c133e7 (patch)
treea97e28155f864d42668731c195828491f01e33d1 /net/rxrpc/io_thread.c
parent6145fefc1e42c1895c0c1c2c8593de2c085d8c56 (diff)
parent7c482665931b6ce7bc72fa5feae6c35567070296 (diff)
Merge branch 'rxrpc-implement-jumbo-data-transmission-and-rack-tlp'
David Howells says: ==================== rxrpc: Implement jumbo DATA transmission and RACK-TLP Here's a series of patches to implement two main features: (1) The transmission of jumbo data packets whereby several DATA packets of a particular size can be glued together into a single UDP packet, allowing us to make use of larger MTU sizes. The basic jumbo subpacket capacity is 1412 bytes (RXRPC_JUMBO_DATALEN) and, say, an MTU of 8192 allows five of them to be transmitted as one. An alternative (and possibly more efficient way) would be to expand/shrink the capacity of each DATA packet to match the MTU and thus save on header and tail-gap overhead, but the Rx protocol does not provide a mechanism for splitting the data - especially as the transported data is encrypted per-packet - and so UDP fragmentation would be the only way to handle this. In fact, in the future, AF_RXRPC also needs to look at shrinking the packet size where the MTU is smaller - for instance in the case of being carried by IPv6 over wifi where there isn't capacity for a 1412 byte capacity. (2) RACK-TLP to manage packet loss and retransmission in conjunction with the congestion control algorithm. These allow for better data throughput and work towards being able to have larger transmission windows. To this end, the following changes are also made: (1) Use a single large array of kvec structs for the I/O thread rather than having one per transmission buffer. We need a much bigger collection of kvecs for ping padding (2) Implement path-MTU probing by sending padded PING ACK packets and monitoring for PING RESPONSE ACKs. The pmtud value determined is used to configure the construction of jumbo DATA packets. (3) The transmission queue is changed from a linked list of transmission buffer structs to a linked list of transmission-queue structs, each of which points to either 32 or 64 transmission buffers (depending on cpu word size) and various bits of metadata are concentrated in the queue structs rather than the buffers to make better use of the cpu cache. (4) SACK data is stored in the transmission-queue structures in batches of 32 or 64 making it faster to process rather than being spread amongst all the individual packet buffers. (5) Don't change the DF flag on the UDP socket unless we need to - and basically only enable it for path-MTU probing. There are also some additional bits: (1) Fix the handling of connection aborts to poke the aborted connections. (2) Don't set the MORE-PACKETS Rx header flag on the wire. No one actually checks it and it is, in any case, generated inconsistently between implementations. (3) Request an ACK when, during call transmission, there's a stall in the app generating the data to be transmitted. (4) Fix attention starvation in the I/O thread by making sure we go through all outstanding events rather than returning to the beginning of the check cycle after any time we process an event. (5) Don't use the skbuff timestamp in the calculation of timeouts and RTT as we really should include local processing time in that too. Further, getting receive skbuff timestamps may be expensive. (6) Make RTT tracking per call with the saving of the value between calls, even within the same connection channel. The initial call timeout starts off large to allow the server time to set up its state before the initial reply. (7) Don't allocate txbuf structs for ACK packets, but rather use page frags and MSG_SPLICE_PAGES. (8) Use irq-disabling locks for interactions between app threads and I/O threads so that the I/O thread doesn't get help up. (9) Make rxrpc set the REQUEST-ACK flag on an outgoing packet when cwnd is at RXRPC_MIN_CWND (currently 4), not at 2 which it can never reach. (10) Add some tracing bits and pieces (including displaying the userStatus field in an ACK header) and some more stats counters (including different sizes of jumbo packets sent/received). Link: https://lore.kernel.org/r/20240306000655.1100294-1-dhowells@redhat.com/ [1] ==================== Link: https://patch.msgid.link/20241204074710.990092-1-dhowells@redhat.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'net/rxrpc/io_thread.c')
-rw-r--r--net/rxrpc/io_thread.c113
1 files changed, 61 insertions, 52 deletions
diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c
index 07c74c77d802..2925c7fc82cf 100644
--- a/net/rxrpc/io_thread.c
+++ b/net/rxrpc/io_thread.c
@@ -338,7 +338,6 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn,
struct rxrpc_channel *chan;
struct rxrpc_call *call = NULL;
unsigned int channel;
- bool ret;
if (sp->hdr.securityIndex != conn->security_ix)
return rxrpc_direct_abort(skb, rxrpc_eproto_wrong_security,
@@ -364,6 +363,12 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn,
if (sp->hdr.callNumber == 0)
return rxrpc_input_conn_packet(conn, skb);
+ /* Deal with path MTU discovery probing. */
+ if (sp->hdr.type == RXRPC_PACKET_TYPE_ACK &&
+ conn->pmtud_probe &&
+ after_eq(sp->ack.acked_serial, conn->pmtud_probe))
+ rxrpc_input_probe_for_pmtud(conn, sp->ack.acked_serial, false);
+
/* Call-bound packets are routed by connection channel. */
channel = sp->hdr.cid & RXRPC_CHANNELMASK;
chan = &conn->channels[channel];
@@ -419,9 +424,9 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn,
peer_srx, skb);
}
- ret = rxrpc_input_call_event(call, skb);
+ rxrpc_queue_rx_call_packet(call, skb);
rxrpc_put_call(call, rxrpc_call_put_input);
- return ret;
+ return true;
}
/*
@@ -438,6 +443,8 @@ int rxrpc_io_thread(void *data)
ktime_t now;
#endif
bool should_stop;
+ LIST_HEAD(conn_attend_q);
+ LIST_HEAD(call_attend_q);
complete(&local->io_thread_ready);
@@ -448,43 +455,26 @@ int rxrpc_io_thread(void *data)
for (;;) {
rxrpc_inc_stat(local->rxnet, stat_io_loop);
- /* Deal with connections that want immediate attention. */
- conn = list_first_entry_or_null(&local->conn_attend_q,
- struct rxrpc_connection,
- attend_link);
- if (conn) {
- spin_lock_bh(&local->lock);
- list_del_init(&conn->attend_link);
- spin_unlock_bh(&local->lock);
-
- rxrpc_input_conn_event(conn, NULL);
- rxrpc_put_connection(conn, rxrpc_conn_put_poke);
- continue;
+ /* Inject a delay into packets if requested. */
+#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY
+ now = ktime_get_real();
+ while ((skb = skb_peek(&local->rx_delay_queue))) {
+ if (ktime_before(now, skb->tstamp))
+ break;
+ skb = skb_dequeue(&local->rx_delay_queue);
+ skb_queue_tail(&local->rx_queue, skb);
}
+#endif
- if (test_and_clear_bit(RXRPC_CLIENT_CONN_REAP_TIMER,
- &local->client_conn_flags))
- rxrpc_discard_expired_client_conns(local);
-
- /* Deal with calls that want immediate attention. */
- if ((call = list_first_entry_or_null(&local->call_attend_q,
- struct rxrpc_call,
- attend_link))) {
- spin_lock_bh(&local->lock);
- list_del_init(&call->attend_link);
- spin_unlock_bh(&local->lock);
-
- trace_rxrpc_call_poked(call);
- rxrpc_input_call_event(call, NULL);
- rxrpc_put_call(call, rxrpc_call_put_poke);
- continue;
+ if (!skb_queue_empty(&local->rx_queue)) {
+ spin_lock_irq(&local->rx_queue.lock);
+ skb_queue_splice_tail_init(&local->rx_queue, &rx_queue);
+ spin_unlock_irq(&local->rx_queue.lock);
+ trace_rxrpc_iothread_rx(local, skb_queue_len(&rx_queue));
}
- if (!list_empty(&local->new_client_calls))
- rxrpc_connect_client_calls(local);
-
- /* Process received packets and errors. */
- if ((skb = __skb_dequeue(&rx_queue))) {
+ /* Distribute packets and errors. */
+ while ((skb = __skb_dequeue(&rx_queue))) {
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
switch (skb->mark) {
case RXRPC_SKB_MARK_PACKET:
@@ -508,27 +498,46 @@ int rxrpc_io_thread(void *data)
rxrpc_free_skb(skb, rxrpc_skb_put_unknown);
break;
}
- continue;
}
- /* Inject a delay into packets if requested. */
-#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY
- now = ktime_get_real();
- while ((skb = skb_peek(&local->rx_delay_queue))) {
- if (ktime_before(now, skb->tstamp))
- break;
- skb = skb_dequeue(&local->rx_delay_queue);
- skb_queue_tail(&local->rx_queue, skb);
+ /* Deal with connections that want immediate attention. */
+ spin_lock_irq(&local->lock);
+ list_splice_tail_init(&local->conn_attend_q, &conn_attend_q);
+ spin_unlock_irq(&local->lock);
+
+ while ((conn = list_first_entry_or_null(&conn_attend_q,
+ struct rxrpc_connection,
+ attend_link))) {
+ spin_lock_bh(&local->lock);
+ list_del_init(&conn->attend_link);
+ spin_unlock_bh(&local->lock);
+ rxrpc_input_conn_event(conn, NULL);
+ rxrpc_put_connection(conn, rxrpc_conn_put_poke);
}
-#endif
- if (!skb_queue_empty(&local->rx_queue)) {
- spin_lock_irq(&local->rx_queue.lock);
- skb_queue_splice_tail_init(&local->rx_queue, &rx_queue);
- spin_unlock_irq(&local->rx_queue.lock);
- continue;
+ if (test_and_clear_bit(RXRPC_CLIENT_CONN_REAP_TIMER,
+ &local->client_conn_flags))
+ rxrpc_discard_expired_client_conns(local);
+
+ /* Deal with calls that want immediate attention. */
+ spin_lock_irq(&local->lock);
+ list_splice_tail_init(&local->call_attend_q, &call_attend_q);
+ spin_unlock_irq(&local->lock);
+
+ while ((call = list_first_entry_or_null(&call_attend_q,
+ struct rxrpc_call,
+ attend_link))) {
+ spin_lock_bh(&local->lock);
+ list_del_init(&call->attend_link);
+ spin_unlock_bh(&local->lock);
+ trace_rxrpc_call_poked(call);
+ rxrpc_input_call_event(call);
+ rxrpc_put_call(call, rxrpc_call_put_poke);
}
+ if (!list_empty(&local->new_client_calls))
+ rxrpc_connect_client_calls(local);
+
set_current_state(TASK_INTERRUPTIBLE);
should_stop = kthread_should_stop();
if (!skb_queue_empty(&local->rx_queue) ||
@@ -558,7 +567,7 @@ int rxrpc_io_thread(void *data)
}
timeout = nsecs_to_jiffies(delay_ns);
- timeout = max(timeout, 1UL);
+ timeout = umax(timeout, 1);
schedule_timeout(timeout);
__set_current_state(TASK_RUNNING);
continue;