path: root/net/rxrpc/sendmsg.c
author    David Howells <dhowells@redhat.com>  2022-03-31 23:55:08 +0100
committer David Howells <dhowells@redhat.com>  2022-11-08 16:42:28 +0000
commit    a4ea4c47761943d90cd5d1688b3c3c65922ff2b1 (patch)
tree      6b4197ba2e1ec518a512f7aaf9fbda875457794f /net/rxrpc/sendmsg.c
parent    5d7edbc9231ec6b60f9c5b7e7980e9a1cd92e6bb (diff)
rxrpc: Don't use a ring buffer for call Tx queue
Change the way the Tx queueing works to make the following ends easier to
achieve:

 (1) The filling of packets, the encryption of packets and the
     transmission of packets can be handled in parallel by separate
     threads, rather than rxrpc_sendmsg() allocating, filling, encrypting
     and transmitting each packet before moving onto the next one.

 (2) Get rid of the fixed-size ring which sets a hard limit on the number
     of packets that can be retained in the ring.  This allows the number
     of packets to increase without having to allocate a very large ring
     or having variable-sized rings.

     [Note: the downside of this is that it's then less efficient to
     locate a packet for retransmission as we then have to step through a
     list and examine each buffer in the list.]

 (3) Allow the filler/encrypter to run ahead of the transmission window.

 (4) Make it easier to do zero copy UDP from the packet buffers.

 (5) Make it easier to do zero copy from userspace to the packet buffers -
     and thence to UDP (only for unauthenticated connections).

To that end, the following changes are made:

 (1) Use the new rxrpc_txbuf struct instead of sk_buff for keeping packets
     to be transmitted in.  This allows them to be placed on multiple
     queues simultaneously.  An sk_buff isn't really necessary as it's
     never passed on to lower-level networking code.

 (2) Keep the transmissible packets in a linked list on the call struct
     rather than in a ring.  As a consequence, the annotation buffer isn't
     used either; rather a flag is set on the packet to indicate
     ackedness.

 (3) Use the RXRPC_CALL_TX_LAST flag to indicate that the last packet to
     be transmitted has been queued.  Add RXRPC_CALL_TX_ALL_ACKED to
     indicate that all packets up to and including the last got hard
     acked.

 (4) Wire headers are now stored in the txbuf rather than being concocted
     on the stack and they're stored immediately before the data, thereby
     allowing zerocopy of a single span.

 (5) Don't bother with instant-resend on transmission failure; rather,
     leave it for a timer or an ACK packet to trigger.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: linux-afs@lists.infradead.org
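[Editor's illustration] To make the queueing change concrete, the sketch below
condenses the new scheme into standalone C: each packet sits in its own buffer
structure chained on a list hung off the call, per-buffer flags replace the
ring's annotation slots, and window space is judged from tx_top against
acks_hard_ack.  The names used here (txbuf_sketch, call_sketch,
queue_packet(), shrink_tx_queue(), tx_space_available()) are invented for the
example and are only simplified stand-ins for struct rxrpc_txbuf, struct
rxrpc_call and their helpers; locking, refcounting and the wire-header layout
are omitted.

    /*
     * Simplified, userspace-only sketch of the list-based Tx queue.
     * Not the kernel implementation; names and layout are illustrative.
     */
    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdlib.h>

    struct txbuf_sketch {
        struct txbuf_sketch *next;  /* chained on the call, not slotted into a ring */
        uint32_t seq;               /* sequence number of this DATA packet */
        bool     last;              /* replaces the RXRPC_TX_ANNO_LAST annotation */
        bool     acked;             /* replaces the per-slot annotation buffer */
        size_t   len;               /* bytes of payload filled so far */
        unsigned char data[1412];   /* payload; the real txbuf keeps the wire header
                                     * immediately before the data */
    };

    struct call_sketch {
        struct txbuf_sketch *tx_head;  /* oldest buffer not yet hard-acked */
        struct txbuf_sketch *tx_tail;  /* most recently queued buffer */
        uint32_t tx_top;               /* highest seq queued for transmission */
        uint32_t acks_hard_ack;        /* highest seq hard-acked by the peer */
        uint32_t tx_winsize;           /* window limit (advertised/congestion) */
    };

    /* Queue a filled buffer for transmission (cf. rxrpc_queue_packet()). */
    void queue_packet(struct call_sketch *call, struct txbuf_sketch *txb)
    {
        txb->seq = call->tx_top + 1;
        txb->next = NULL;
        if (call->tx_tail)
            call->tx_tail->next = txb;
        else
            call->tx_head = txb;
        call->tx_tail = txb;
        call->tx_top = txb->seq;
    }

    /* Free hard-acked buffers from the front of the list; hard ACKs are
     * cumulative, so everything up to acks_hard_ack can go. */
    void shrink_tx_queue(struct call_sketch *call)
    {
        while (call->tx_head && call->tx_head->seq <= call->acks_hard_ack) {
            struct txbuf_sketch *txb = call->tx_head;

            call->tx_head = txb->next;
            if (!call->tx_head)
                call->tx_tail = NULL;
            free(txb);
        }
    }

    /* Is there room in the window for another packet?
     * (cf. rxrpc_check_tx_space()) */
    bool tx_space_available(const struct call_sketch *call)
    {
        return call->tx_top - call->acks_hard_ack < call->tx_winsize;
    }

Compared with the old scheme, where a retransmission could index straight into
call->rxtx_buffer[seq & RXRPC_RXTX_BUFF_MASK], finding a packet to resend now
means walking the list and checking each buffer's flags - the efficiency
trade-off called out in the note under point (2) above.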
Diffstat (limited to 'net/rxrpc/sendmsg.c')
-rw-r--r--  net/rxrpc/sendmsg.c  188
1 file changed, 72 insertions(+), 116 deletions(-)
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index e32805a49324..a96ae7f58148 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -25,7 +25,7 @@ static bool rxrpc_check_tx_space(struct rxrpc_call *call, rxrpc_seq_t *_tx_win)
unsigned int win_size =
min_t(unsigned int, call->tx_winsize,
call->cong_cwnd + call->cong_extra);
- rxrpc_seq_t tx_win = READ_ONCE(call->tx_hard_ack);
+ rxrpc_seq_t tx_win = smp_load_acquire(&call->acks_hard_ack);
if (_tx_win)
*_tx_win = tx_win;
@@ -50,7 +50,12 @@ static int rxrpc_wait_for_tx_window_intr(struct rxrpc_sock *rx,
if (signal_pending(current))
return sock_intr_errno(*timeo);
- trace_rxrpc_transmit(call, rxrpc_transmit_wait);
+ if (READ_ONCE(call->acks_hard_ack) != call->tx_bottom) {
+ rxrpc_shrink_call_tx_buffer(call);
+ continue;
+ }
+
+ trace_rxrpc_txqueue(call, rxrpc_txqueue_wait);
*timeo = schedule_timeout(*timeo);
}
}
@@ -71,12 +76,11 @@ static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx,
rtt = 2;
timeout = rtt;
- tx_start = READ_ONCE(call->tx_hard_ack);
+ tx_start = smp_load_acquire(&call->acks_hard_ack);
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
- tx_win = READ_ONCE(call->tx_hard_ack);
if (rxrpc_check_tx_space(call, &tx_win))
return 0;
@@ -87,12 +91,17 @@ static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx,
tx_win == tx_start && signal_pending(current))
return -EINTR;
+ if (READ_ONCE(call->acks_hard_ack) != call->tx_bottom) {
+ rxrpc_shrink_call_tx_buffer(call);
+ continue;
+ }
+
if (tx_win != tx_start) {
timeout = rtt;
tx_start = tx_win;
}
- trace_rxrpc_transmit(call, rxrpc_transmit_wait);
+ trace_rxrpc_txqueue(call, rxrpc_txqueue_wait);
timeout = schedule_timeout(timeout);
}
}
@@ -112,7 +121,12 @@ static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx,
if (call->state >= RXRPC_CALL_COMPLETE)
return call->error;
- trace_rxrpc_transmit(call, rxrpc_transmit_wait);
+ if (READ_ONCE(call->acks_hard_ack) != call->tx_bottom) {
+ rxrpc_shrink_call_tx_buffer(call);
+ continue;
+ }
+
+ trace_rxrpc_txqueue(call, rxrpc_txqueue_wait);
*timeo = schedule_timeout(*timeo);
}
}
@@ -129,8 +143,8 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx,
DECLARE_WAITQUEUE(myself, current);
int ret;
- _enter(",{%u,%u,%u}",
- call->tx_hard_ack, call->tx_top, call->tx_winsize);
+ _enter(",{%u,%u,%u,%u}",
+ call->tx_bottom, call->acks_hard_ack, call->tx_top, call->tx_winsize);
add_wait_queue(&call->waitq, &myself);
@@ -155,24 +169,6 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx,
}
/*
- * Schedule an instant Tx resend.
- */
-static inline void rxrpc_instant_resend(struct rxrpc_call *call, int ix)
-{
- spin_lock_bh(&call->lock);
-
- if (call->state < RXRPC_CALL_COMPLETE) {
- call->rxtx_annotations[ix] =
- (call->rxtx_annotations[ix] & RXRPC_TX_ANNO_LAST) |
- RXRPC_TX_ANNO_RETRANS;
- if (!test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events))
- rxrpc_queue_call(call);
- }
-
- spin_unlock_bh(&call->lock);
-}
-
-/*
* Notify the owner of the call that the transmit phase is ended and the last
* packet has been queued.
*/
@@ -188,40 +184,35 @@ static void rxrpc_notify_end_tx(struct rxrpc_sock *rx, struct rxrpc_call *call,
* the packet immediately. Returns the error from rxrpc_send_data_packet()
* in case the caller wants to do something with it.
*/
-static int rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call,
- struct sk_buff *skb, bool last,
- rxrpc_notify_end_tx_t notify_end_tx)
+static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call,
+ struct rxrpc_txbuf *txb,
+ rxrpc_notify_end_tx_t notify_end_tx)
{
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
unsigned long now;
- rxrpc_seq_t seq = sp->hdr.seq;
- int ret, ix;
- u8 annotation = RXRPC_TX_ANNO_UNACK;
-
- _net("queue skb %p [%d]", skb, seq);
+ rxrpc_seq_t seq = txb->seq;
+ bool last = test_bit(RXRPC_TXBUF_LAST, &txb->flags);
+ int ret;
rxrpc_inc_stat(call->rxnet, stat_tx_data);
ASSERTCMP(seq, ==, call->tx_top + 1);
- if (last)
- annotation |= RXRPC_TX_ANNO_LAST;
-
/* We have to set the timestamp before queueing as the retransmit
* algorithm can see the packet as soon as we queue it.
*/
- skb->tstamp = ktime_get_real();
+ txb->last_sent = ktime_get_real();
- ix = seq & RXRPC_RXTX_BUFF_MASK;
- rxrpc_get_skb(skb, rxrpc_skb_got);
- call->rxtx_annotations[ix] = annotation;
- smp_wmb();
- call->rxtx_buffer[ix] = skb;
+ /* Add the packet to the call's output buffer */
+ rxrpc_get_txbuf(txb, rxrpc_txbuf_get_buffer);
+ spin_lock(&call->tx_lock);
+ list_add_tail(&txb->call_link, &call->tx_buffer);
call->tx_top = seq;
+ spin_unlock(&call->tx_lock);
+
if (last)
- trace_rxrpc_transmit(call, rxrpc_transmit_queue_last);
+ trace_rxrpc_txqueue(call, rxrpc_txqueue_queue_last);
else
- trace_rxrpc_transmit(call, rxrpc_transmit_queue);
+ trace_rxrpc_txqueue(call, rxrpc_txqueue_queue);
if (last || call->state == RXRPC_CALL_SERVER_ACK_REQUEST) {
_debug("________awaiting reply/ACK__________");
@@ -254,7 +245,7 @@ static int rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call,
if (seq == 1 && rxrpc_is_client_call(call))
rxrpc_expose_client_call(call);
- ret = rxrpc_send_data_packet(call, skb, false);
+ ret = rxrpc_send_data_packet(call, txb);
if (ret < 0) {
switch (ret) {
case -ENETUNREACH:
@@ -264,8 +255,6 @@ static int rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call,
0, ret);
goto out;
}
- _debug("need instant resend %d", ret);
- rxrpc_instant_resend(call, ix);
} else {
unsigned long now = jiffies;
unsigned long resend_at = now + call->peer->rto_j;
@@ -276,9 +265,7 @@ static int rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call,
}
out:
- rxrpc_free_skb(skb, rxrpc_skb_freed);
- _leave(" = %d", ret);
- return ret;
+ rxrpc_put_txbuf(txb, rxrpc_txbuf_put_trans);
}
/*
@@ -292,8 +279,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
rxrpc_notify_end_tx_t notify_end_tx,
bool *_dropped_lock)
{
- struct rxrpc_skb_priv *sp;
- struct sk_buff *skb;
+ struct rxrpc_txbuf *txb;
struct sock *sk = &rx->sk;
enum rxrpc_call_state state;
long timeo;
@@ -327,14 +313,15 @@ reload:
goto maybe_error;
}
- skb = call->tx_pending;
+ txb = call->tx_pending;
call->tx_pending = NULL;
- rxrpc_see_skb(skb, rxrpc_skb_seen);
+ if (txb)
+ rxrpc_see_txbuf(txb, rxrpc_txbuf_see_send_more);
do {
rxrpc_transmit_ack_packets(call->peer->local);
- if (!skb) {
+ if (!txb) {
size_t remain, bufsize, chunk, offset;
_debug("alloc");
@@ -355,52 +342,31 @@ reload:
_debug("SIZE: %zu/%zu @%zu", chunk, bufsize, offset);
/* create a buffer that we can retain until it's ACK'd */
- skb = sock_alloc_send_skb(
- sk, bufsize, msg->msg_flags & MSG_DONTWAIT, &ret);
- if (!skb)
+ ret = -ENOMEM;
+ txb = rxrpc_alloc_txbuf(call, RXRPC_PACKET_TYPE_DATA,
+ GFP_KERNEL);
+ if (!txb)
goto maybe_error;
- sp = rxrpc_skb(skb);
- rxrpc_new_skb(skb, rxrpc_skb_new);
-
- _debug("ALLOC SEND %p", skb);
-
- ASSERTCMP(skb->mark, ==, 0);
-
- __skb_put(skb, offset);
-
- sp->remain = chunk;
- if (sp->remain > skb_tailroom(skb))
- sp->remain = skb_tailroom(skb);
-
- _net("skb: hr %d, tr %d, hl %d, rm %d",
- skb_headroom(skb),
- skb_tailroom(skb),
- skb_headlen(skb),
- sp->remain);
-
- skb->ip_summed = CHECKSUM_UNNECESSARY;
+ txb->offset = offset;
+ txb->space -= offset;
+ txb->space = min_t(size_t, chunk, txb->space);
}
_debug("append");
- sp = rxrpc_skb(skb);
/* append next segment of data to the current buffer */
if (msg_data_left(msg) > 0) {
- int copy = skb_tailroom(skb);
- ASSERTCMP(copy, >, 0);
- if (copy > msg_data_left(msg))
- copy = msg_data_left(msg);
- if (copy > sp->remain)
- copy = sp->remain;
-
- _debug("add");
- ret = skb_add_data(skb, &msg->msg_iter, copy);
- _debug("added");
- if (ret < 0)
+ size_t copy = min_t(size_t, txb->space, msg_data_left(msg));
+
+ _debug("add %zu", copy);
+ if (!copy_from_iter_full(txb->data + txb->offset, copy,
+ &msg->msg_iter))
goto efault;
- sp->remain -= copy;
- skb->mark += copy;
+ _debug("added");
+ txb->space -= copy;
+ txb->len += copy;
+ txb->offset += copy;
copied += copy;
if (call->tx_total_len != -1)
call->tx_total_len -= copy;
@@ -412,32 +378,22 @@ reload:
goto call_terminated;
/* add the packet to the send queue if it's now full */
- if (sp->remain <= 0 ||
+ if (!txb->space ||
(msg_data_left(msg) == 0 && !more)) {
- struct rxrpc_connection *conn = call->conn;
- uint32_t seq;
-
- seq = call->tx_top + 1;
-
- sp->hdr.seq = seq;
- sp->hdr._rsvd = 0;
- sp->hdr.flags = conn->out_clientflag;
-
- if (msg_data_left(msg) == 0 && !more)
- sp->hdr.flags |= RXRPC_LAST_PACKET;
- else if (call->tx_top - call->tx_hard_ack <
+ if (msg_data_left(msg) == 0 && !more) {
+ txb->wire.flags |= RXRPC_LAST_PACKET;
+ __set_bit(RXRPC_TXBUF_LAST, &txb->flags);
+ }
+ else if (call->tx_top - call->acks_hard_ack <
call->tx_winsize)
- sp->hdr.flags |= RXRPC_MORE_PACKETS;
+ txb->wire.flags |= RXRPC_MORE_PACKETS;
- ret = call->security->secure_packet(call, skb, skb->mark);
+ ret = call->security->secure_packet(call, txb);
if (ret < 0)
goto out;
- ret = rxrpc_queue_packet(rx, call, skb,
- !msg_data_left(msg) && !more,
- notify_end_tx);
- /* Should check for failure here */
- skb = NULL;
+ rxrpc_queue_packet(rx, call, txb, notify_end_tx);
+ txb = NULL;
}
} while (msg_data_left(msg) > 0);
@@ -450,12 +406,12 @@ success:
read_unlock_bh(&call->state_lock);
}
out:
- call->tx_pending = skb;
+ call->tx_pending = txb;
_leave(" = %d", ret);
return ret;
call_terminated:
- rxrpc_free_skb(skb, rxrpc_skb_freed);
+ rxrpc_put_txbuf(txb, rxrpc_txbuf_put_send_aborted);
_leave(" = %d", call->error);
return call->error;