author    Jakub Kicinski <kuba@kernel.org>    2024-12-09 13:48:35 -0800
committer Jakub Kicinski <kuba@kernel.org>    2024-12-09 13:48:37 -0800
commit    f9663b7cafa5e15b700efc1fdfabe33e31c133e7 (patch)
tree      a97e28155f864d42668731c195828491f01e33d1 /net
parent    6145fefc1e42c1895c0c1c2c8593de2c085d8c56 (diff)
parent    7c482665931b6ce7bc72fa5feae6c35567070296 (diff)
Merge branch 'rxrpc-implement-jumbo-data-transmission-and-rack-tlp'
David Howells says:

====================
rxrpc: Implement jumbo DATA transmission and RACK-TLP

Here's a series of patches to implement two main features:

 (1) The transmission of jumbo data packets whereby several DATA packets
     of a particular size can be glued together into a single UDP packet,
     allowing us to make use of larger MTU sizes.  The basic jumbo
     subpacket capacity is 1412 bytes (RXRPC_JUMBO_DATALEN) and, say, an
     MTU of 8192 allows five of them to be transmitted as one.

     An alternative (and possibly more efficient) way would be to
     expand/shrink the capacity of each DATA packet to match the MTU and
     thus save on header and tail-gap overhead, but the Rx protocol does
     not provide a mechanism for splitting the data - especially as the
     transported data is encrypted per-packet - and so UDP fragmentation
     would be the only way to handle this.

     In fact, in the future, AF_RXRPC also needs to look at shrinking the
     packet size where the MTU is smaller - for instance in the case of
     being carried by IPv6 over wifi where there isn't capacity for a
     1412-byte packet.

 (2) RACK-TLP to manage packet loss and retransmission in conjunction with
     the congestion control algorithm.

These allow for better data throughput and work towards being able to have
larger transmission windows.  To this end, the following changes are also
made:

 (1) Use a single large array of kvec structs for the I/O thread rather
     than having one per transmission buffer.  We need a much bigger
     collection of kvecs for ping padding.

 (2) Implement path-MTU probing by sending padded PING ACK packets and
     monitoring for PING RESPONSE ACKs.  The pmtud value determined is
     used to configure the construction of jumbo DATA packets.

 (3) The transmission queue is changed from a linked list of transmission
     buffer structs to a linked list of transmission-queue structs, each
     of which points to either 32 or 64 transmission buffers (depending on
     cpu word size), and various bits of metadata are concentrated in the
     queue structs rather than the buffers to make better use of the cpu
     cache.

 (4) SACK data is stored in the transmission-queue structures in batches
     of 32 or 64, making it faster to process rather than being spread
     amongst all the individual packet buffers.

 (5) Don't change the DF flag on the UDP socket unless we need to - and
     basically only enable it for path-MTU probing.

There are also some additional bits:

 (1) Fix the handling of connection aborts to poke the aborted
     connections.

 (2) Don't set the MORE-PACKETS Rx header flag on the wire.  No one
     actually checks it and it is, in any case, generated inconsistently
     between implementations.

 (3) Request an ACK when, during call transmission, there's a stall in the
     app generating the data to be transmitted.

 (4) Fix attention starvation in the I/O thread by making sure we go
     through all outstanding events rather than returning to the beginning
     of the check cycle any time we process an event.

 (5) Don't use the skbuff timestamp in the calculation of timeouts and RTT
     as we really should include local processing time in that too.
     Further, getting receive skbuff timestamps may be expensive.

 (6) Make RTT tracking per call with the saving of the value between
     calls, even within the same connection channel.  The initial call
     timeout starts off large to allow the server time to set up its state
     before the initial reply.

 (7) Don't allocate txbuf structs for ACK packets, but rather use page
     frags and MSG_SPLICE_PAGES.

 (8) Use irq-disabling locks for interactions between app threads and I/O
     threads so that the I/O thread doesn't get held up.

 (9) Make rxrpc set the REQUEST-ACK flag on an outgoing packet when cwnd
     is at RXRPC_MIN_CWND (currently 4), not at 2, which it can never
     reach.

(10) Add some tracing bits and pieces (including displaying the userStatus
     field in an ACK header) and some more stats counters (including
     different sizes of jumbo packets sent/received).

Link: https://lore.kernel.org/r/20240306000655.1100294-1-dhowells@redhat.com/ [1]
====================

Link: https://patch.msgid.link/20241204074710.990092-1-dhowells@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
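As a rough illustration of the jumbo sizing described in point (1) of the cover
letter, the standalone C sketch below estimates how many 1412-byte
(RXRPC_JUMBO_DATALEN) subpackets can be glued into one UDP packet for a given
path MTU.  The 4-byte jumbo secondary header, the 56-byte IPv4/UDP/RxRPC header
overhead and the helper name are assumptions made for the example; they are not
lifted from this patch set, which performs the equivalent calculation in the
peer MTU handling code.

#include <stdio.h>

#define JUMBO_DATALEN 1412            /* RXRPC_JUMBO_DATALEN: data per subpacket */
#define JUMBO_SUBHDR  4               /* assumed jumbo secondary header size */
#define HDR_OVERHEAD  (20 + 8 + 28)   /* assumed IPv4 + UDP + RxRPC wire header */

/* Hypothetical helper: how many full 1412-byte DATA subpackets fit in a
 * single UDP packet for the given path MTU.
 */
static unsigned int estimate_jumbo_subpackets(unsigned int mtu)
{
	unsigned int max_data, n = 1;

	if (mtu <= HDR_OVERHEAD + JUMBO_DATALEN)
		return 1;
	max_data = mtu - HDR_OVERHEAD;

	/* Each additional subpacket costs one jumbo sub-header plus another
	 * full 1412-byte data segment on top of what is already consumed.
	 */
	while ((n + 1) * JUMBO_DATALEN + n * JUMBO_SUBHDR <= max_data)
		n++;
	return n;
}

int main(void)
{
	printf("mtu=1500 -> %u subpacket(s)\n", estimate_jumbo_subpackets(1500));
	printf("mtu=8192 -> %u subpacket(s)\n", estimate_jumbo_subpackets(8192));
	return 0;
}

Run as written, this prints one subpacket for a 1500-byte MTU and five for an
8192-byte MTU, matching the five-subpacket example quoted in the cover letter.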
Diffstat (limited to 'net')
-rw-r--r--  net/rxrpc/Makefile       |    1
-rw-r--r--  net/rxrpc/af_rxrpc.c     |    4
-rw-r--r--  net/rxrpc/ar-internal.h  |  341
-rw-r--r--  net/rxrpc/call_accept.c  |   22
-rw-r--r--  net/rxrpc/call_event.c   |  385
-rw-r--r--  net/rxrpc/call_object.c  |   66
-rw-r--r--  net/rxrpc/conn_client.c  |   26
-rw-r--r--  net/rxrpc/conn_event.c   |   40
-rw-r--r--  net/rxrpc/conn_object.c  |   14
-rw-r--r--  net/rxrpc/input.c        |  706
-rw-r--r--  net/rxrpc/input_rack.c   |  418
-rw-r--r--  net/rxrpc/insecure.c     |    5
-rw-r--r--  net/rxrpc/io_thread.c    |  113
-rw-r--r--  net/rxrpc/local_object.c |    3
-rw-r--r--  net/rxrpc/misc.c         |    4
-rw-r--r--  net/rxrpc/output.c       |  568
-rw-r--r--  net/rxrpc/peer_event.c   |  114
-rw-r--r--  net/rxrpc/peer_object.c  |   30
-rw-r--r--  net/rxrpc/proc.c         |   61
-rw-r--r--  net/rxrpc/protocol.h     |   13
-rw-r--r--  net/rxrpc/recvmsg.c      |   18
-rw-r--r--  net/rxrpc/rtt.c          |  103
-rw-r--r--  net/rxrpc/rxkad.c        |   59
-rw-r--r--  net/rxrpc/rxperf.c       |    2
-rw-r--r--  net/rxrpc/security.c     |    4
-rw-r--r--  net/rxrpc/sendmsg.c      |   92
-rw-r--r--  net/rxrpc/sysctl.c       |    6
-rw-r--r--  net/rxrpc/txbuf.c        |  127
28 files changed, 2212 insertions, 1133 deletions
diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile
index ac5caf5a48e1..210b75e3179e 100644
--- a/net/rxrpc/Makefile
+++ b/net/rxrpc/Makefile
@@ -16,6 +16,7 @@ rxrpc-y := \
conn_object.o \
conn_service.o \
input.o \
+ input_rack.o \
insecure.o \
io_thread.o \
key.o \
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 9d8bd0b37e41..86873399f7d5 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -408,9 +408,9 @@ void rxrpc_kernel_shutdown_call(struct socket *sock, struct rxrpc_call *call)
/* Make sure we're not going to call back into a kernel service */
if (call->notify_rx) {
- spin_lock(&call->notify_lock);
+ spin_lock_irq(&call->notify_lock);
call->notify_rx = rxrpc_dummy_notify_rx;
- spin_unlock(&call->notify_lock);
+ spin_unlock_irq(&call->notify_lock);
}
}
mutex_unlock(&call->user_mutex);
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index d0fd37bdcfe9..0c0a3c89dba3 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -30,6 +30,7 @@ struct rxrpc_crypt {
struct key_preparsed_payload;
struct rxrpc_connection;
struct rxrpc_txbuf;
+struct rxrpc_txqueue;
/*
* Mark applied to socket buffers in skb->mark. skb->priority is used
@@ -98,6 +99,7 @@ struct rxrpc_net {
atomic_t stat_tx_data_send;
atomic_t stat_tx_data_send_frag;
atomic_t stat_tx_data_send_fail;
+ atomic_t stat_tx_data_send_msgsize;
atomic_t stat_tx_data_underflow;
atomic_t stat_tx_data_cwnd_reset;
atomic_t stat_rx_data;
@@ -109,6 +111,8 @@ struct rxrpc_net {
atomic_t stat_tx_ack_skip;
atomic_t stat_tx_acks[256];
atomic_t stat_rx_acks[256];
+ atomic_t stat_tx_jumbo[10];
+ atomic_t stat_rx_jumbo[10];
atomic_t stat_why_req_ack[8];
@@ -210,9 +214,8 @@ struct rxrpc_skb_priv {
rxrpc_seq_t first_ack; /* First packet in acks table */
rxrpc_seq_t prev_ack; /* Highest seq seen */
rxrpc_serial_t acked_serial; /* Packet in response to (or 0) */
+ u16 nr_acks; /* Number of acks+nacks */
u8 reason; /* Reason for ack */
- u8 nr_acks; /* Number of acks+nacks */
- u8 nr_nacks; /* Number of nacks */
} ack;
};
struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */
@@ -320,6 +323,12 @@ struct rxrpc_local {
struct list_head new_client_calls; /* Newly created client calls need connection */
spinlock_t client_call_lock; /* Lock for ->new_client_calls */
struct sockaddr_rxrpc srx; /* local address */
+ /* Provide a kvec table sufficiently large to manage either a DATA
+ * packet with a maximum set of jumbo subpackets or a PING ACK padded
+ * out to 64K with zeropages for PMTUD.
+ */
+ struct kvec kvec[RXRPC_MAX_NR_JUMBO > 3 + 16 ?
+ RXRPC_MAX_NR_JUMBO : 3 + 16];
};
/*
@@ -338,25 +347,28 @@ struct rxrpc_peer {
time64_t last_tx_at; /* Last time packet sent here */
seqlock_t service_conn_lock;
spinlock_t lock; /* access lock */
- unsigned int if_mtu; /* interface MTU for this peer */
- unsigned int mtu; /* network MTU for this peer */
- unsigned int maxdata; /* data size (MTU - hdrsize) */
- unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */
int debug_id; /* debug ID for printks */
struct sockaddr_rxrpc srx; /* remote address */
- /* calculated RTT cache */
-#define RXRPC_RTT_CACHE_SIZE 32
- spinlock_t rtt_input_lock; /* RTT lock for input routine */
- ktime_t rtt_last_req; /* Time of last RTT request */
- unsigned int rtt_count; /* Number of samples we've got */
+ /* Path MTU discovery [RFC8899] */
+ unsigned int pmtud_trial; /* Current MTU probe size */
+ unsigned int pmtud_good; /* Largest working MTU probe we've tried */
+ unsigned int pmtud_bad; /* Smallest non-working MTU probe we've tried */
+ bool pmtud_lost; /* T if MTU probe was lost */
+ bool pmtud_probing; /* T if we have an active probe outstanding */
+ bool pmtud_pending; /* T if a call to this peer should send a probe */
+ u8 pmtud_jumbo; /* Max jumbo packets for the MTU */
+ bool ackr_adv_pmtud; /* T if the peer advertises path-MTU */
+ unsigned int ackr_max_data; /* Maximum data advertised by peer */
+ seqcount_t mtu_lock; /* Lockless MTU access management */
+ unsigned int if_mtu; /* Local interface MTU (- hdrsize) for this peer */
+ unsigned int max_data; /* Maximum packet data capacity for this peer */
+ unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */
+ unsigned short tx_seg_max; /* Maximum number of transmissable segments */
- u32 srtt_us; /* smoothed round trip time << 3 in usecs */
- u32 mdev_us; /* medium deviation */
- u32 mdev_max_us; /* maximal mdev for the last rtt period */
- u32 rttvar_us; /* smoothed mdev_max */
- u32 rto_us; /* Retransmission timeout in usec */
- u8 backoff; /* Backoff timeout (as shift) */
+ /* Calculated RTT cache */
+ unsigned int recent_srtt_us;
+ unsigned int recent_rto_us;
u8 cong_ssthresh; /* Congestion slow-start threshold */
};
@@ -525,6 +537,8 @@ struct rxrpc_connection {
int debug_id; /* debug ID for printks */
rxrpc_serial_t tx_serial; /* Outgoing packet serial number counter */
unsigned int hi_serial; /* highest serial number received */
+ rxrpc_serial_t pmtud_probe; /* Serial of MTU probe (or 0) */
+ unsigned int pmtud_call; /* ID of call used for probe */
u32 service_id; /* Service ID, possibly upgraded */
u32 security_level; /* Security level selected */
u8 security_ix; /* security type */
@@ -599,13 +613,25 @@ enum rxrpc_call_state {
/*
* Call Tx congestion management modes.
*/
-enum rxrpc_congest_mode {
- RXRPC_CALL_SLOW_START,
- RXRPC_CALL_CONGEST_AVOIDANCE,
- RXRPC_CALL_PACKET_LOSS,
- RXRPC_CALL_FAST_RETRANSMIT,
- NR__RXRPC_CONGEST_MODES
-};
+enum rxrpc_ca_state {
+ RXRPC_CA_SLOW_START,
+ RXRPC_CA_CONGEST_AVOIDANCE,
+ RXRPC_CA_PACKET_LOSS,
+ RXRPC_CA_FAST_RETRANSMIT,
+ NR__RXRPC_CA_STATES
+} __mode(byte);
+
+/*
+ * Current purpose of call RACK timer. According to the RACK-TLP protocol
+ * [RFC8985], the transmission timer (call->rack_timo_at) may only be used for
+ * one of these at once.
+ */
+enum rxrpc_rack_timer_mode {
+ RXRPC_CALL_RACKTIMER_OFF, /* Timer not running */
+ RXRPC_CALL_RACKTIMER_RACK_REORDER, /* RACK reordering timer */
+ RXRPC_CALL_RACKTIMER_TLP_PTO, /* TLP timeout */
+ RXRPC_CALL_RACKTIMER_RTO, /* Retransmission timeout */
+} __mode(byte);
/*
* RxRPC call definition
@@ -624,8 +650,7 @@ struct rxrpc_call {
struct mutex user_mutex; /* User access mutex */
struct sockaddr_rxrpc dest_srx; /* Destination address */
ktime_t delay_ack_at; /* When DELAY ACK needs to happen */
- ktime_t ack_lost_at; /* When ACK is figured as lost */
- ktime_t resend_at; /* When next resend needs to happen */
+ ktime_t rack_timo_at; /* When ACK is figured as lost */
ktime_t ping_at; /* When next to send a ping */
ktime_t keepalive_at; /* When next to send a keepalive ping */
ktime_t expect_rx_by; /* When we expect to get a packet by */
@@ -670,21 +695,30 @@ struct rxrpc_call {
unsigned short rx_pkt_offset; /* Current recvmsg packet offset */
unsigned short rx_pkt_len; /* Current recvmsg packet len */
+ /* Sendmsg data tracking. */
+ rxrpc_seq_t send_top; /* Highest Tx slot filled by sendmsg. */
+ struct rxrpc_txqueue *send_queue; /* Queue that sendmsg is writing into */
+
/* Transmitted data tracking. */
- spinlock_t tx_lock; /* Transmit queue lock */
- struct list_head tx_sendmsg; /* Sendmsg prepared packets */
- struct list_head tx_buffer; /* Buffer of transmissible packets */
+ struct rxrpc_txqueue *tx_queue; /* Start of transmission buffers */
+ struct rxrpc_txqueue *tx_qtail; /* End of transmission buffers */
+ rxrpc_seq_t tx_qbase; /* First slot in tx_queue */
rxrpc_seq_t tx_bottom; /* First packet in buffer */
rxrpc_seq_t tx_transmitted; /* Highest packet transmitted */
- rxrpc_seq_t tx_prepared; /* Highest Tx slot prepared. */
rxrpc_seq_t tx_top; /* Highest Tx slot allocated. */
+ rxrpc_serial_t tx_last_serial; /* Serial of last DATA transmitted */
u16 tx_backoff; /* Delay to insert due to Tx failure (ms) */
- u8 tx_winsize; /* Maximum size of Tx window */
+ u16 tx_nr_sent; /* Number of packets sent, but unacked */
+ u16 tx_nr_lost; /* Number of packets marked lost */
+ u16 tx_nr_resent; /* Number of packets resent, but unacked */
+ u16 tx_winsize; /* Maximum size of Tx window */
#define RXRPC_TX_MAX_WINDOW 128
+ u8 tx_jumbo_max; /* Maximum subpkts peer will accept */
ktime_t tx_last_sent; /* Last time a transmission occurred */
/* Received data tracking */
struct sk_buff_head recvmsg_queue; /* Queue of packets ready for recvmsg() */
+ struct sk_buff_head rx_queue; /* Queue of packets for this call to receive */
struct sk_buff_head rx_oos_queue; /* Queue of out of sequence packets */
rxrpc_seq_t rx_highest_seq; /* Higest sequence number received */
@@ -698,14 +732,32 @@ struct rxrpc_call {
*/
#define RXRPC_TX_SMSS RXRPC_JUMBO_DATALEN
#define RXRPC_MIN_CWND 4
- u8 cong_cwnd; /* Congestion window size */
+ enum rxrpc_ca_state cong_ca_state; /* Congestion control state */
u8 cong_extra; /* Extra to send for congestion management */
- u8 cong_ssthresh; /* Slow-start threshold */
- enum rxrpc_congest_mode cong_mode:8; /* Congestion management mode */
- u8 cong_dup_acks; /* Count of ACKs showing missing packets */
- u8 cong_cumul_acks; /* Cumulative ACK count */
+ u16 cong_cwnd; /* Congestion window size */
+ u16 cong_ssthresh; /* Slow-start threshold */
+ u16 cong_dup_acks; /* Count of ACKs showing missing packets */
+ u16 cong_cumul_acks; /* Cumulative ACK count */
ktime_t cong_tstamp; /* Last time cwnd was changed */
- struct sk_buff *cong_last_nack; /* Last ACK with nacks received */
+
+ /* RACK-TLP [RFC8985] state. */
+ ktime_t rack_xmit_ts; /* Latest transmission timestamp */
+ ktime_t rack_rtt; /* RTT of most recently ACK'd segment */
+ ktime_t rack_rtt_ts; /* Timestamp of rack_rtt */
+ ktime_t rack_reo_wnd; /* Reordering window */
+ unsigned int rack_reo_wnd_mult; /* Multiplier applied to rack_reo_wnd */
+ int rack_reo_wnd_persist; /* Num loss recoveries before reset reo_wnd */
+ rxrpc_seq_t rack_fack; /* Highest sequence so far ACK'd */
+ rxrpc_seq_t rack_end_seq; /* Highest sequence seen */
+ rxrpc_seq_t rack_dsack_round; /* DSACK opt recv'd in latest roundtrip */
+ bool rack_dsack_round_none; /* T if dsack_round is "None" */
+ bool rack_reordering_seen; /* T if detected reordering event */
+ enum rxrpc_rack_timer_mode rack_timer_mode; /* Current mode of RACK timer */
+ bool tlp_is_retrans; /* T if unacked TLP retransmission */
+ rxrpc_serial_t tlp_serial; /* Serial of TLP probe (or 0 if none in progress) */
+ rxrpc_seq_t tlp_seq; /* Sequence of TLP probe */
+ unsigned int tlp_rtt_taken; /* Last time RTT taken */
+ ktime_t tlp_max_ack_delay; /* Sender budget for max delayed ACK interval */
/* Receive-phase ACK management (ACKs we send). */
u8 ackr_reason; /* reason to ACK */
@@ -730,32 +782,45 @@ struct rxrpc_call {
/* Transmission-phase ACK management (ACKs we've received). */
ktime_t acks_latest_ts; /* Timestamp of latest ACK received */
- rxrpc_seq_t acks_first_seq; /* first sequence number received */
+ rxrpc_seq_t acks_hard_ack; /* Highest sequence hard acked */
rxrpc_seq_t acks_prev_seq; /* Highest previousPacket received */
- rxrpc_seq_t acks_hard_ack; /* Latest hard-ack point */
rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */
rxrpc_serial_t acks_highest_serial; /* Highest serial number ACK'd */
+ unsigned short acks_nr_sacks; /* Number of soft acks recorded */
+ unsigned short acks_nr_snacks; /* Number of soft nacks recorded */
+
+ /* Calculated RTT cache */
+ ktime_t rtt_last_req; /* Time of last RTT request */
+ unsigned int rtt_count; /* Number of samples we've got */
+ unsigned int rtt_taken; /* Number of samples taken (wrapping) */
+ struct minmax min_rtt; /* Estimated minimum RTT */
+ u32 srtt_us; /* smoothed round trip time << 3 in usecs */
+ u32 mdev_us; /* medium deviation */
+ u32 mdev_max_us; /* maximal mdev for the last rtt period */
+ u32 rttvar_us; /* smoothed mdev_max */
+ u32 rto_us; /* Retransmission timeout in usec */
+ u8 backoff; /* Backoff timeout (as shift) */
};
/*
* Summary of a new ACK and the changes it made to the Tx buffer packet states.
*/
struct rxrpc_ack_summary {
- u16 nr_acks; /* Number of ACKs in packet */
- u16 nr_new_acks; /* Number of new ACKs in packet */
- u16 nr_new_nacks; /* Number of new nacks in packet */
- u16 nr_retained_nacks; /* Number of nacks retained between ACKs */
- u8 ack_reason;
- bool saw_nacks; /* Saw NACKs in packet */
- bool new_low_nack; /* T if new low NACK found */
- bool retrans_timeo; /* T if reTx due to timeout happened */
- u8 flight_size; /* Number of unreceived transmissions */
- /* Place to stash values for tracing */
- enum rxrpc_congest_mode mode:8;
- u8 cwnd;
- u8 ssthresh;
- u8 dup_acks;
- u8 cumulative_acks;
+ rxrpc_serial_t ack_serial; /* Serial number of ACK */
+ rxrpc_serial_t acked_serial; /* Serial number ACK'd */
+ u16 in_flight; /* Number of unreceived transmissions */
+ u16 nr_new_hacks; /* Number of rotated new ACKs */
+ u16 nr_new_sacks; /* Number of new soft ACKs in packet */
+ u16 nr_new_snacks; /* Number of new soft nacks in packet */
+ u8 ack_reason;
+ bool new_low_snack:1; /* T if new low soft NACK found */
+ bool retrans_timeo:1; /* T if reTx due to timeout happened */
+ bool need_retransmit:1; /* T if we need transmission */
+ bool rtt_sample_avail:1; /* T if RTT sample available */
+ bool in_fast_or_rto_recovery:1;
+ bool exiting_fast_or_rto_recovery:1;
+ bool tlp_probe_acked:1; /* T if the TLP probe seq was acked */
+ u8 /*enum rxrpc_congest_change*/ change;
};
/*
@@ -793,25 +858,23 @@ struct rxrpc_send_params {
* Buffer of data to be output as a packet.
*/
struct rxrpc_txbuf {
- struct list_head call_link; /* Link in call->tx_sendmsg/tx_buffer */
- struct list_head tx_link; /* Link in live Enc queue or Tx queue */
- ktime_t last_sent; /* Time at which last transmitted */
refcount_t ref;
rxrpc_seq_t seq; /* Sequence number of this packet */
rxrpc_serial_t serial; /* Last serial number transmitted with */
unsigned int call_debug_id;
unsigned int debug_id;
- unsigned int len; /* Amount of data in buffer */
- unsigned int space; /* Remaining data space */
- unsigned int offset; /* Offset of fill point */
+ unsigned short len; /* Amount of data in buffer */
+ unsigned short space; /* Remaining data space */
+ unsigned short offset; /* Offset of fill point */
+ unsigned short pkt_len; /* Size of packet content */
+ unsigned short alloc_size; /* Amount of bufferage allocated */
unsigned int flags;
#define RXRPC_TXBUF_WIRE_FLAGS 0xff /* The wire protocol flags */
#define RXRPC_TXBUF_RESENT 0x100 /* Set if has been resent */
__be16 cksum; /* Checksum to go in header */
- unsigned short ack_rwind; /* ACK receive window */
- u8 /*enum rxrpc_propose_ack_trace*/ ack_why; /* If ack, why */
+ bool jumboable; /* Can be non-terminal jumbo subpacket */
u8 nr_kvec; /* Amount of kvec[] used */
- struct kvec kvec[3];
+ struct kvec kvec[1];
};
static inline bool rxrpc_sending_to_server(const struct rxrpc_txbuf *txb)
@@ -824,6 +887,46 @@ static inline bool rxrpc_sending_to_client(const struct rxrpc_txbuf *txb)
return !rxrpc_sending_to_server(txb);
}
+/*
+ * Transmit queue element, including RACK [RFC8985] per-segment metadata. The
+ * transmission timestamp is in usec from the base.
+ */
+struct rxrpc_txqueue {
+ /* Start with the members we want to prefetch. */
+ struct rxrpc_txqueue *next;
+ ktime_t xmit_ts_base;
+ rxrpc_seq_t qbase;
+ u8 nr_reported_acks; /* Number of segments explicitly acked/nacked */
+ unsigned long segment_acked; /* Bit-per-buf: Set if ACK'd */
+ unsigned long segment_lost; /* Bit-per-buf: Set if declared lost */
+ unsigned long segment_retransmitted; /* Bit-per-buf: Set if retransmitted */
+ unsigned long rtt_samples; /* Bit-per-buf: Set if available for RTT */
+ unsigned long ever_retransmitted; /* Bit-per-buf: Set if ever retransmitted */
+
+ /* The arrays we want to pack into as few cache lines as possible. */
+ struct {
+#define RXRPC_NR_TXQUEUE BITS_PER_LONG
+#define RXRPC_TXQ_MASK (RXRPC_NR_TXQUEUE - 1)
+ struct rxrpc_txbuf *bufs[RXRPC_NR_TXQUEUE];
+ unsigned int segment_serial[RXRPC_NR_TXQUEUE];
+ unsigned int segment_xmit_ts[RXRPC_NR_TXQUEUE];
+ } ____cacheline_aligned;
+};
+
+/*
+ * Data transmission request.
+ */
+struct rxrpc_send_data_req {
+ ktime_t now; /* Current time */
+ struct rxrpc_txqueue *tq; /* Tx queue segment holding first DATA */
+ rxrpc_seq_t seq; /* Sequence of first data */
+ int n; /* Number of DATA packets to glue into jumbo */
+ bool retrans; /* T if this is a retransmission */
+ bool did_send; /* T if did actually send */
+ bool tlp_probe; /* T if this is a TLP probe */
+ int /* enum rxrpc_txdata_trace */ trace;
+};
+
#include <trace/events/rxrpc.h>
/*
@@ -841,6 +944,21 @@ static inline rxrpc_serial_t rxrpc_get_next_serial(struct rxrpc_connection *conn
}
/*
+ * Allocate the next serial n numbers on a connection. 0 must be skipped.
+ */
+static inline rxrpc_serial_t rxrpc_get_next_serials(struct rxrpc_connection *conn,
+ unsigned int n)
+{
+ rxrpc_serial_t serial;
+
+ serial = conn->tx_serial;
+ if (serial + n <= n)
+ serial = 1;
+ conn->tx_serial = serial + n;
+ return serial;
+}
+
+/*
* af_rxrpc.c
*/
extern atomic_t rxrpc_n_rx_skbs;
@@ -865,10 +983,10 @@ void rxrpc_propose_ping(struct rxrpc_call *call, u32 serial,
enum rxrpc_propose_ack_trace why);
void rxrpc_propose_delay_ACK(struct rxrpc_call *, rxrpc_serial_t,
enum rxrpc_propose_ack_trace);
-void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *);
-void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb);
-
-bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb);
+void rxrpc_resend_tlp(struct rxrpc_call *call);
+void rxrpc_transmit_some_data(struct rxrpc_call *call, unsigned int limit,
+ enum rxrpc_txdata_trace trace);
+bool rxrpc_input_call_event(struct rxrpc_call *call);
/*
* call_object.c
@@ -1047,6 +1165,32 @@ void rxrpc_input_call_packet(struct rxrpc_call *, struct sk_buff *);
void rxrpc_implicit_end_call(struct rxrpc_call *, struct sk_buff *);
/*
+ * input_rack.c
+ */
+void rxrpc_input_rack_one(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary,
+ struct rxrpc_txqueue *tq,
+ unsigned int ix);
+void rxrpc_input_rack(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary,
+ struct rxrpc_txqueue *tq,
+ unsigned long new_acks);
+void rxrpc_rack_detect_loss_and_arm_timer(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary);
+ktime_t rxrpc_tlp_calc_pto(struct rxrpc_call *call, ktime_t now);
+void rxrpc_tlp_send_probe(struct rxrpc_call *call);
+void rxrpc_tlp_process_ack(struct rxrpc_call *call, struct rxrpc_ack_summary *summary);
+void rxrpc_rack_timer_expired(struct rxrpc_call *call, ktime_t overran_by);
+
+/* Initialise TLP state [RFC8958 7.1]. */
+static inline void rxrpc_tlp_init(struct rxrpc_call *call)
+{
+ call->tlp_serial = 0;
+ call->tlp_seq = call->acks_hard_ack;
+ call->tlp_is_retrans = false;
+}
+
+/*
* io_thread.c
*/
int rxrpc_encap_rcv(struct sock *, struct sk_buff *);
@@ -1149,17 +1293,20 @@ static inline struct rxrpc_net *rxrpc_net(struct net *net)
*/
void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason,
rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why);
+void rxrpc_send_probe_for_pmtud(struct rxrpc_call *call);
int rxrpc_send_abort_packet(struct rxrpc_call *);
+void rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req *req);
void rxrpc_send_conn_abort(struct rxrpc_connection *conn);
void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb);
void rxrpc_send_keepalive(struct rxrpc_peer *);
-void rxrpc_transmit_one(struct rxrpc_call *call, struct rxrpc_txbuf *txb);
/*
* peer_event.c
*/
void rxrpc_input_error(struct rxrpc_local *, struct sk_buff *);
void rxrpc_peer_keepalive_worker(struct work_struct *);
+void rxrpc_input_probe_for_pmtud(struct rxrpc_connection *conn, rxrpc_serial_t acked_serial,
+ bool sendmsg_fail);
/*
* peer_object.c
@@ -1208,10 +1355,12 @@ static inline int rxrpc_abort_eproto(struct rxrpc_call *call,
/*
* rtt.c
*/
-void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace, int,
- rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t);
-ktime_t rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans);
-void rxrpc_peer_init_rtt(struct rxrpc_peer *);
+void rxrpc_call_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why,
+ int rtt_slot,
+ rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial,
+ ktime_t send_time, ktime_t resp_time);
+ktime_t rxrpc_get_rto_backoff(struct rxrpc_call *call, bool retrans);
+void rxrpc_call_init_rtt(struct rxrpc_call *call);
/*
* rxkad.c
@@ -1284,7 +1433,6 @@ static inline void rxrpc_sysctl_exit(void) {}
extern atomic_t rxrpc_nr_txbuf;
struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_size,
size_t data_align, gfp_t gfp);
-struct rxrpc_txbuf *rxrpc_alloc_ack_txbuf(struct rxrpc_call *call, size_t sack_size);
void rxrpc_get_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what);
void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what);
void rxrpc_put_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what);
@@ -1311,6 +1459,53 @@ static inline bool after_eq(u32 seq1, u32 seq2)
return (s32)(seq1 - seq2) >= 0;
}
+static inline u32 earliest(u32 seq1, u32 seq2)
+{
+ return before(seq1, seq2) ? seq1 : seq2;
+}
+
+static inline u32 latest(u32 seq1, u32 seq2)
+{
+ return after(seq1, seq2) ? seq1 : seq2;
+}
+
+static inline bool rxrpc_seq_in_txq(const struct rxrpc_txqueue *tq, rxrpc_seq_t seq)
+{
+ return (seq & (RXRPC_NR_TXQUEUE - 1)) == tq->qbase;
+}
+
+static inline void rxrpc_queue_rx_call_packet(struct rxrpc_call *call, struct sk_buff *skb)
+{
+ rxrpc_get_skb(skb, rxrpc_skb_get_call_rx);
+ __skb_queue_tail(&call->rx_queue, skb);
+ rxrpc_poke_call(call, rxrpc_call_poke_rx_packet);
+}
+
+/*
+ * Calculate how much space there is for transmitting more DATA packets.
+ */
+static inline unsigned int rxrpc_tx_window_space(const struct rxrpc_call *call)
+{
+ int winsize = umin(call->tx_winsize, call->cong_cwnd + call->cong_extra);
+ int transmitted = call->tx_top - call->tx_bottom;
+
+ return max(winsize - transmitted, 0);
+}
+
+static inline unsigned int rxrpc_left_out(const struct rxrpc_call *call)
+{
+ return call->acks_nr_sacks + call->tx_nr_lost;
+}
+
+/*
+ * Calculate the number of transmitted DATA packets assumed to be in flight
+ * [approx RFC6675].
+ */
+static inline unsigned int rxrpc_tx_in_flight(const struct rxrpc_call *call)
+{
+ return call->tx_nr_sent - rxrpc_left_out(call) + call->tx_nr_resent;
+}
+
/*
* debug tracing
*/
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index 0f5a1d77b890..e685034ce4f7 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -188,8 +188,8 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx)
/* Make sure that there aren't any incoming calls in progress before we
* clear the preallocation buffers.
*/
- spin_lock(&rx->incoming_lock);
- spin_unlock(&rx->incoming_lock);
+ spin_lock_irq(&rx->incoming_lock);
+ spin_unlock_irq(&rx->incoming_lock);
head = b->peer_backlog_head;
tail = b->peer_backlog_tail;
@@ -343,7 +343,7 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local,
if (sp->hdr.type != RXRPC_PACKET_TYPE_DATA)
return rxrpc_protocol_error(skb, rxrpc_eproto_no_service_call);
- read_lock(&local->services_lock);
+ read_lock_irq(&local->services_lock);
/* Weed out packets to services we're not offering. Packets that would
* begin a call are explicitly rejected and the rest are just
@@ -399,34 +399,34 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local,
spin_unlock(&conn->state_lock);
spin_unlock(&rx->incoming_lock);
- read_unlock(&local->services_lock);
+ read_unlock_irq(&local->services_lock);
if (hlist_unhashed(&call->error_link)) {
- spin_lock(&call->peer->lock);
+ spin_lock_irq(&call->peer->lock);
hlist_add_head(&call->error_link, &call->peer->error_targets);
- spin_unlock(&call->peer->lock);
+ spin_unlock_irq(&call->peer->lock);
}
_leave(" = %p{%d}", call, call->debug_id);
- rxrpc_input_call_event(call, skb);
+ rxrpc_queue_rx_call_packet(call, skb);
rxrpc_put_call(call, rxrpc_call_put_input);
return true;
unsupported_service:
- read_unlock(&local->services_lock);
+ read_unlock_irq(&local->services_lock);
return rxrpc_direct_abort(skb, rxrpc_abort_service_not_offered,
RX_INVALID_OPERATION, -EOPNOTSUPP);
unsupported_security:
- read_unlock(&local->services_lock);
+ read_unlock_irq(&local->services_lock);
return rxrpc_direct_abort(skb, rxrpc_abort_service_not_offered,
RX_INVALID_OPERATION, -EKEYREJECTED);
no_call:
spin_unlock(&rx->incoming_lock);
- read_unlock(&local->services_lock);
+ read_unlock_irq(&local->services_lock);
_leave(" = f [%u]", skb->mark);
return false;
discard:
- read_unlock(&local->services_lock);
+ read_unlock_irq(&local->services_lock);
return true;
}
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index 7bbb68504766..8e477f7f8850 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -44,8 +44,8 @@ void rxrpc_propose_delay_ACK(struct rxrpc_call *call, rxrpc_serial_t serial,
trace_rxrpc_propose_ack(call, why, RXRPC_ACK_DELAY, serial);
- if (call->peer->srtt_us)
- delay = (call->peer->srtt_us >> 3) * NSEC_PER_USEC;
+ if (call->srtt_us)
+ delay = (call->srtt_us >> 3) * NSEC_PER_USEC;
else
delay = ms_to_ktime(READ_ONCE(rxrpc_soft_ack_delay));
ktime_add_ms(delay, call->tx_backoff);
@@ -55,147 +55,104 @@ void rxrpc_propose_delay_ACK(struct rxrpc_call *call, rxrpc_serial_t serial,
}
/*
- * Handle congestion being detected by the retransmit timeout.
+ * Retransmit one or more packets.
*/
-static void rxrpc_congestion_timeout(struct rxrpc_call *call)
+static bool rxrpc_retransmit_data(struct rxrpc_call *call,
+ struct rxrpc_send_data_req *req)
{
- set_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags);
+ struct rxrpc_txqueue *tq = req->tq;
+ unsigned int ix = req->seq & RXRPC_TXQ_MASK;
+ struct rxrpc_txbuf *txb = tq->bufs[ix];
+
+ _enter("%x,%x,%x,%x", tq->qbase, req->seq, ix, txb->debug_id);
+
+ req->retrans = true;
+ trace_rxrpc_retransmit(call, req, txb);
+
+ txb->flags |= RXRPC_TXBUF_RESENT;
+ rxrpc_send_data_packet(call, req);
+ rxrpc_inc_stat(call->rxnet, stat_tx_data_retrans);
+
+ req->tq = NULL;
+ req->n = 0;
+ req->did_send = true;
+ req->now = ktime_get_real();
+ return true;
}
/*
* Perform retransmission of NAK'd and unack'd packets.
*/
-void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb)
+static void rxrpc_resend(struct rxrpc_call *call)
{
- struct rxrpc_ackpacket *ack = NULL;
- struct rxrpc_skb_priv *sp;
- struct rxrpc_txbuf *txb;
- rxrpc_seq_t transmitted = call->tx_transmitted;
- ktime_t next_resend = KTIME_MAX, rto = ns_to_ktime(call->peer->rto_us * NSEC_PER_USEC);
- ktime_t resend_at = KTIME_MAX, now, delay;
- bool unacked = false, did_send = false;
- unsigned int i;
-
- _enter("{%d,%d}", call->acks_hard_ack, call->tx_top);
-
- now = ktime_get_real();
-
- if (list_empty(&call->tx_buffer))
- goto no_resend;
+ struct rxrpc_send_data_req req = {
+ .now = ktime_get_real(),
+ .trace = rxrpc_txdata_retransmit,
+ };
+ struct rxrpc_txqueue *tq;
- trace_rxrpc_resend(call, ack_skb);
- txb = list_first_entry(&call->tx_buffer, struct rxrpc_txbuf, call_link);
+ _enter("{%d,%d}", call->tx_bottom, call->tx_top);
- /* Scan the soft ACK table without dropping the lock and resend any
- * explicitly NAK'd packets.
- */
- if (ack_skb) {
- sp = rxrpc_skb(ack_skb);
- ack = (void *)ack_skb->data + sizeof(struct rxrpc_wire_header);
+ trace_rxrpc_resend(call, call->acks_highest_serial);
- for (i = 0; i < sp->ack.nr_acks; i++) {
- rxrpc_seq_t seq;
+ /* Scan the transmission queue, looking for lost packets. */
+ for (tq = call->tx_queue; tq; tq = tq->next) {
+ unsigned long lost = tq->segment_lost;
- if (ack->acks[i] & 1)
- continue;
- seq = sp->ack.first_ack + i;
- if (after(txb->seq, transmitted))
- break;
- if (after(txb->seq, seq))
- continue; /* A new hard ACK probably came in */
- list_for_each_entry_from(txb, &call->tx_buffer, call_link) {
- if (txb->seq == seq)
- goto found_txb;
- }
- goto no_further_resend;
-
- found_txb:
- resend_at = ktime_add(txb->last_sent, rto);
- if (after(txb->serial, call->acks_highest_serial)) {
- if (ktime_after(resend_at, now) &&
- ktime_before(resend_at, next_resend))
- next_resend = resend_at;
- continue; /* Ack point not yet reached */
- }
+ if (after(tq->qbase, call->tx_transmitted))
+ break;
- rxrpc_see_txbuf(txb, rxrpc_txbuf_see_unacked);
+ _debug("retr %16lx %u c=%08x [%x]",
+ tq->segment_acked, tq->nr_reported_acks, call->debug_id, tq->qbase);
+ _debug("lost %16lx", lost);
- trace_rxrpc_retransmit(call, txb->seq, txb->serial,
- ktime_sub(resend_at, now));
+ trace_rxrpc_resend_lost(call, tq, lost);
+ while (lost) {
+ unsigned int ix = __ffs(lost);
+ struct rxrpc_txbuf *txb = tq->bufs[ix];
- txb->flags |= RXRPC_TXBUF_RESENT;
- rxrpc_transmit_one(call, txb);
- did_send = true;
- now = ktime_get_real();
+ __clear_bit(ix, &lost);
+ rxrpc_see_txbuf(txb, rxrpc_txbuf_see_lost);
- if (list_is_last(&txb->call_link, &call->tx_buffer))
- goto no_further_resend;
- txb = list_next_entry(txb, call_link);
+ req.tq = tq;
+ req.seq = tq->qbase + ix;
+ req.n = 1;
+ rxrpc_retransmit_data(call, &req);
}
}
- /* Fast-forward through the Tx queue to the point the peer says it has
- * seen. Anything between the soft-ACK table and that point will get
- * ACK'd or NACK'd in due course, so don't worry about it here; here we
- * need to consider retransmitting anything beyond that point.
- */
- if (after_eq(call->acks_prev_seq, call->tx_transmitted))
- goto no_further_resend;
-
- list_for_each_entry_from(txb, &call->tx_buffer, call_link) {
- resend_at = ktime_add(txb->last_sent, rto);
-
- if (before_eq(txb->seq, call->acks_prev_seq))
- continue;
- if (after(txb->seq, call->tx_transmitted))
- break; /* Not transmitted yet */
-
- if (ack && ack->reason == RXRPC_ACK_PING_RESPONSE &&
- before(txb->serial, ntohl(ack->serial)))
- goto do_resend; /* Wasn't accounted for by a more recent ping. */
-
- if (ktime_after(resend_at, now)) {
- if (ktime_before(resend_at, next_resend))
- next_resend = resend_at;
- continue;
- }
-
- do_resend:
- unacked = true;
-
- txb->flags |= RXRPC_TXBUF_RESENT;
- rxrpc_transmit_one(call, txb);
- did_send = true;
- rxrpc_inc_stat(call->rxnet, stat_tx_data_retrans);
- now = ktime_get_real();
- }
+ rxrpc_get_rto_backoff(call, req.did_send);
+ _leave("");
+}
-no_further_resend:
-no_resend:
- if (resend_at < KTIME_MAX) {
- delay = rxrpc_get_rto_backoff(call->peer, did_send);
- resend_at = ktime_add(resend_at, delay);
- trace_rxrpc_timer_set(call, resend_at - now, rxrpc_timer_trace_resend_reset);
+/*
+ * Resend the highest-seq DATA packet so far transmitted for RACK-TLP [RFC8985 7.3].
+ */
+void rxrpc_resend_tlp(struct rxrpc_call *call)
+{
+ struct rxrpc_send_data_req req = {
+ .now = ktime_get_real(),
+ .seq = call->tx_transmitted,
+ .n = 1,
+ .tlp_probe = true,
+ .trace = rxrpc_txdata_tlp_retransmit,
+ };
+
+ /* There's a chance it'll be on the tail segment of the queue. */
+ req.tq = READ_ONCE(call->tx_qtail);
+ if (req.tq &&
+ before(call->tx_transmitted, req.tq->qbase + RXRPC_NR_TXQUEUE)) {
+ rxrpc_retransmit_data(call, &req);
+ return;
}
- call->resend_at = resend_at;
-
- if (unacked)
- rxrpc_congestion_timeout(call);
-
- /* If there was nothing that needed retransmission then it's likely
- * that an ACK got lost somewhere. Send a ping to find out instead of
- * retransmitting data.
- */
- if (!did_send) {
- ktime_t next_ping = ktime_add_us(call->acks_latest_ts,
- call->peer->srtt_us >> 3);
- if (ktime_sub(next_ping, now) <= 0)
- rxrpc_send_ACK(call, RXRPC_ACK_PING, 0,
- rxrpc_propose_ack_ping_for_0_retrans);
+ for (req.tq = call->tx_queue; req.tq; req.tq = req.tq->next) {
+ if (after_eq(call->tx_transmitted, req.tq->qbase) &&
+ before(call->tx_transmitted, req.tq->qbase + RXRPC_NR_TXQUEUE)) {
+ rxrpc_retransmit_data(call, &req);
+ return;
+ }
}
-
- _leave("");
}
/*
@@ -231,68 +188,93 @@ static void rxrpc_close_tx_phase(struct rxrpc_call *call)
}
}
-static bool rxrpc_tx_window_has_space(struct rxrpc_call *call)
-{
- unsigned int winsize = min_t(unsigned int, call->tx_winsize,
- call->cong_cwnd + call->cong_extra);
- rxrpc_seq_t window = call->acks_hard_ack, wtop = window + winsize;
- rxrpc_seq_t tx_top = call->tx_top;
- int space;
-
- space = wtop - tx_top;
- return space > 0;
-}
-
/*
- * Decant some if the sendmsg prepared queue into the transmission buffer.
+ * Transmit some as-yet untransmitted data, to a maximum of the supplied limit.
*/
-static void rxrpc_decant_prepared_tx(struct rxrpc_call *call)
+static void rxrpc_transmit_fresh_data(struct rxrpc_call *call, unsigned int limit,
+ enum rxrpc_txdata_trace trace)
{
- struct rxrpc_txbuf *txb;
+ int space = rxrpc_tx_window_space(call);
if (!test_bit(RXRPC_CALL_EXPOSED, &call->flags)) {
- if (list_empty(&call->tx_sendmsg))
+ if (call->send_top == call->tx_top)
return;
rxrpc_expose_client_call(call);
}
- while ((txb = list_first_entry_or_null(&call->tx_sendmsg,
- struct rxrpc_txbuf, call_link))) {
- spin_lock(&call->tx_lock);
- list_del(&txb->call_link);
- spin_unlock(&call->tx_lock);
+ while (space > 0) {
+ struct rxrpc_send_data_req req = {
+ .now = ktime_get_real(),
+ .seq = call->tx_transmitted + 1,
+ .n = 0,
+ .trace = trace,
+ };
+ struct rxrpc_txqueue *tq;
+ struct rxrpc_txbuf *txb;
+ rxrpc_seq_t send_top, seq;
+ int limit = min(space, max(call->peer->pmtud_jumbo, 1));
+
+ /* Order send_top before the contents of the new txbufs and
+ * txqueue pointers
+ */
+ send_top = smp_load_acquire(&call->send_top);
+ if (call->tx_top == send_top)
+ break;
- call->tx_top = txb->seq;
- list_add_tail(&txb->call_link, &call->tx_buffer);
+ trace_rxrpc_transmit(call, send_top, space);
- if (txb->flags & RXRPC_LAST_PACKET)
- rxrpc_close_tx_phase(call);
+ tq = call->tx_qtail;
+ seq = call->tx_top;
+ trace_rxrpc_tq(call, tq, seq, rxrpc_tq_decant);
- rxrpc_transmit_one(call, txb);
+ do {
+ int ix;
- if (!rxrpc_tx_window_has_space(call))
- break;
+ seq++;
+ ix = seq & RXRPC_TXQ_MASK;
+ if (!ix) {
+ tq = tq->next;
+ trace_rxrpc_tq(call, tq, seq, rxrpc_tq_decant_advance);
+ }
+ if (!req.tq)
+ req.tq = tq;
+ txb = tq->bufs[ix];
+ req.n++;
+ if (!txb->jumboable)
+ break;
+ } while (req.n < limit && before(seq, send_top));
+
+ if (txb->flags & RXRPC_LAST_PACKET) {
+ rxrpc_close_tx_phase(call);
+ tq = NULL;
+ }
+ call->tx_qtail = tq;
+ call->tx_top = seq;
+
+ space -= req.n;
+ rxrpc_send_data_packet(call, &req);
}
}
-static void rxrpc_transmit_some_data(struct rxrpc_call *call)
+void rxrpc_transmit_some_data(struct rxrpc_call *call, unsigned int limit,
+ enum rxrpc_txdata_trace trace)
{
switch (__rxrpc_call_state(call)) {
case RXRPC_CALL_SERVER_ACK_REQUEST:
- if (list_empty(&call->tx_sendmsg))
+ if (call->tx_bottom == READ_ONCE(call->send_top))
return;
rxrpc_begin_service_reply(call);
fallthrough;
case RXRPC_CALL_SERVER_SEND_REPLY:
case RXRPC_CALL_CLIENT_SEND_REQUEST:
- if (!rxrpc_tx_window_has_space(call))
+ if (!rxrpc_tx_window_space(call))
return;
- if (list_empty(&call->tx_sendmsg)) {
+ if (call->tx_bottom == READ_ONCE(call->send_top)) {
rxrpc_inc_stat(call->rxnet, stat_tx_data_underflow);
return;
}
- rxrpc_decant_prepared_tx(call);
+ rxrpc_transmit_fresh_data(call, limit, trace);
break;
default:
return;
@@ -305,8 +287,8 @@ static void rxrpc_transmit_some_data(struct rxrpc_call *call)
*/
static void rxrpc_send_initial_ping(struct rxrpc_call *call)
{
- if (call->peer->rtt_count < 3 ||
- ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000),
+ if (call->rtt_count < 3 ||
+ ktime_before(ktime_add_ms(call->rtt_last_req, 1000),
ktime_get_real()))
rxrpc_send_ACK(call, RXRPC_ACK_PING, 0,
rxrpc_propose_ack_ping_for_params);
@@ -315,10 +297,11 @@ static void rxrpc_send_initial_ping(struct rxrpc_call *call)
/*
* Handle retransmission and deferred ACK/abort generation.
*/
-bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb)
+bool rxrpc_input_call_event(struct rxrpc_call *call)
{
+ struct sk_buff *skb;
ktime_t now, t;
- bool resend = false;
+ bool did_receive = false, saw_ack = false;
s32 abort_code;
rxrpc_see_call(call, rxrpc_call_see_input);
@@ -328,9 +311,6 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb)
call->debug_id, rxrpc_call_states[__rxrpc_call_state(call)],
call->events);
- if (__rxrpc_call_is_complete(call))
- goto out;
-
/* Handle abort request locklessly, vs rxrpc_propose_abort(). */
abort_code = smp_load_acquire(&call->send_abort);
if (abort_code) {
@@ -339,11 +319,33 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb)
goto out;
}
- if (skb && skb->mark == RXRPC_SKB_MARK_ERROR)
- goto out;
+ do {
+ skb = __skb_dequeue(&call->rx_queue);
+ if (skb) {
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+
+ if (__rxrpc_call_is_complete(call) ||
+ skb->mark == RXRPC_SKB_MARK_ERROR) {
+ rxrpc_free_skb(skb, rxrpc_skb_put_call_rx);
+ goto out;
+ }
+
+ saw_ack |= sp->hdr.type == RXRPC_PACKET_TYPE_ACK;
+
+ rxrpc_input_call_packet(call, skb);
+ rxrpc_free_skb(skb, rxrpc_skb_put_call_rx);
+ did_receive = true;
+ }
- if (skb)
- rxrpc_input_call_packet(call, skb);
+ t = ktime_sub(call->rack_timo_at, ktime_get_real());
+ if (t <= 0) {
+ trace_rxrpc_timer_exp(call, t,
+ rxrpc_timer_trace_rack_off + call->rack_timer_mode);
+ call->rack_timo_at = KTIME_MAX;
+ rxrpc_rack_timer_expired(call, t);
+ }
+
+ } while (!skb_queue_empty(&call->rx_queue));
/* If we see our async-event poke, check for timeout trippage. */
now = ktime_get_real();
@@ -376,13 +378,6 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb)
rxrpc_propose_ack_delayed_ack);
}
- t = ktime_sub(call->ack_lost_at, now);
- if (t <= 0) {
- trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_lost_ack);
- call->ack_lost_at = KTIME_MAX;
- set_bit(RXRPC_CALL_EV_ACK_LOST, &call->events);
- }
-
t = ktime_sub(call->ping_at, now);
if (t <= 0) {
trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_ping);
@@ -391,15 +386,6 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb)
rxrpc_propose_ack_ping_for_keepalive);
}
- t = ktime_sub(call->resend_at, now);
- if (t <= 0) {
- trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_resend);
- call->resend_at = KTIME_MAX;
- resend = true;
- }
-
- rxrpc_transmit_some_data(call);
-
now = ktime_get_real();
t = ktime_sub(call->keepalive_at, now);
if (t <= 0) {
@@ -409,35 +395,40 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb)
rxrpc_propose_ack_ping_for_keepalive);
}
- if (skb) {
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
-
- if (sp->hdr.type == RXRPC_PACKET_TYPE_ACK)
- rxrpc_congestion_degrade(call);
- }
-
if (test_and_clear_bit(RXRPC_CALL_EV_INITIAL_PING, &call->events))
rxrpc_send_initial_ping(call);
+ rxrpc_transmit_some_data(call, UINT_MAX, rxrpc_txdata_new_data);
+
+ if (saw_ack)
+ rxrpc_congestion_degrade(call);
+
+ if (did_receive &&
+ (__rxrpc_call_state(call) == RXRPC_CALL_CLIENT_SEND_REQUEST ||
+ __rxrpc_call_state(call) == RXRPC_CALL_SERVER_SEND_REPLY)) {
+ t = ktime_sub(call->rack_timo_at, ktime_get_real());
+ trace_rxrpc_rack(call, t);
+ }
+
/* Process events */
if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events))
rxrpc_send_ACK(call, RXRPC_ACK_PING, 0,
rxrpc_propose_ack_ping_for_lost_ack);
- if (resend &&
+ if (call->tx_nr_lost > 0 &&
__rxrpc_call_state(call) != RXRPC_CALL_CLIENT_RECV_REPLY &&
!test_bit(RXRPC_CALL_TX_ALL_ACKED, &call->flags))
- rxrpc_resend(call, NULL);
+ rxrpc_resend(call);
if (test_and_clear_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags))
rxrpc_send_ACK(call, RXRPC_ACK_IDLE, 0,
rxrpc_propose_ack_rx_idle);
if (call->ackr_nr_unacked > 2) {
- if (call->peer->rtt_count < 3)
+ if (call->rtt_count < 3)
rxrpc_send_ACK(call, RXRPC_ACK_PING, 0,
rxrpc_propose_ack_ping_for_rtt);
- else if (ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000),
+ else if (ktime_before(ktime_add_ms(call->rtt_last_req, 1000),
ktime_get_real()))
rxrpc_send_ACK(call, RXRPC_ACK_PING, 0,
rxrpc_propose_ack_ping_for_old_rtt);
@@ -455,8 +446,7 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb)
set(call->expect_req_by);
set(call->expect_rx_by);
set(call->delay_ack_at);
- set(call->ack_lost_at);
- set(call->resend_at);
+ set(call->rack_timo_at);
set(call->keepalive_at);
set(call->ping_at);
@@ -467,7 +457,7 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb)
} else {
unsigned long nowj = jiffies, delayj, nextj;
- delayj = max(nsecs_to_jiffies(delay), 1);
+ delayj = umax(nsecs_to_jiffies(delay), 1);
nextj = nowj + delayj;
if (time_before(nextj, call->timer.expires) ||
!timer_pending(&call->timer)) {
@@ -484,9 +474,12 @@ out:
rxrpc_disconnect_call(call);
if (call->security)
call->security->free_call_crypto(call);
+ } else {
+ if (did_receive &&
+ call->peer->ackr_adv_pmtud &&
+ call->peer->pmtud_pending)
+ rxrpc_send_probe_for_pmtud(call);
}
- if (call->acks_hard_ack != call->tx_bottom)
- rxrpc_shrink_call_tx_buffer(call);
_leave("");
return true;
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index f9e983a12c14..5a543c3f6fb0 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -49,7 +49,7 @@ void rxrpc_poke_call(struct rxrpc_call *call, enum rxrpc_call_poke_trace what)
bool busy;
if (!test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) {
- spin_lock_bh(&local->lock);
+ spin_lock_irq(&local->lock);
busy = !list_empty(&call->attend_link);
trace_rxrpc_poke_call(call, busy, what);
if (!busy && !rxrpc_try_get_call(call, rxrpc_call_get_poke))
@@ -57,7 +57,7 @@ void rxrpc_poke_call(struct rxrpc_call *call, enum rxrpc_call_poke_trace what)
if (!busy) {
list_add_tail(&call->attend_link, &local->call_attend_q);
}
- spin_unlock_bh(&local->lock);
+ spin_unlock_irq(&local->lock);
if (!busy)
rxrpc_wake_up_io_thread(local);
}
@@ -146,23 +146,21 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp,
INIT_LIST_HEAD(&call->recvmsg_link);
INIT_LIST_HEAD(&call->sock_link);
INIT_LIST_HEAD(&call->attend_link);
- INIT_LIST_HEAD(&call->tx_sendmsg);
- INIT_LIST_HEAD(&call->tx_buffer);
+ skb_queue_head_init(&call->rx_queue);
skb_queue_head_init(&call->recvmsg_queue);
skb_queue_head_init(&call->rx_oos_queue);
init_waitqueue_head(&call->waitq);
spin_lock_init(&call->notify_lock);
- spin_lock_init(&call->tx_lock);
refcount_set(&call->ref, 1);
call->debug_id = debug_id;
call->tx_total_len = -1;
+ call->tx_jumbo_max = 1;
call->next_rx_timo = 20 * HZ;
call->next_req_timo = 1 * HZ;
call->ackr_window = 1;
call->ackr_wtop = 1;
call->delay_ack_at = KTIME_MAX;
- call->ack_lost_at = KTIME_MAX;
- call->resend_at = KTIME_MAX;
+ call->rack_timo_at = KTIME_MAX;
call->ping_at = KTIME_MAX;
call->keepalive_at = KTIME_MAX;
call->expect_rx_by = KTIME_MAX;
@@ -177,6 +175,8 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp,
call->cong_cwnd = RXRPC_MIN_CWND;
call->cong_ssthresh = RXRPC_TX_MAX_WINDOW;
+ rxrpc_call_init_rtt(call);
+
call->rxnet = rxnet;
call->rtt_avail = RXRPC_CALL_RTT_AVAIL_MASK;
atomic_inc(&rxnet->nr_calls);
@@ -220,9 +220,9 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx,
__set_bit(RXRPC_CALL_EXCLUSIVE, &call->flags);
if (p->timeouts.normal)
- call->next_rx_timo = min(p->timeouts.normal, 1);
+ call->next_rx_timo = umin(p->timeouts.normal, 1);
if (p->timeouts.idle)
- call->next_req_timo = min(p->timeouts.idle, 1);
+ call->next_req_timo = umin(p->timeouts.idle, 1);
if (p->timeouts.hard)
call->hard_timo = p->timeouts.hard;
@@ -302,9 +302,9 @@ static int rxrpc_connect_call(struct rxrpc_call *call, gfp_t gfp)
trace_rxrpc_client(NULL, -1, rxrpc_client_queue_new_call);
rxrpc_get_call(call, rxrpc_call_get_io_thread);
- spin_lock(&local->client_call_lock);
+ spin_lock_irq(&local->client_call_lock);
list_add_tail(&call->wait_link, &local->new_client_calls);
- spin_unlock(&local->client_call_lock);
+ spin_unlock_irq(&local->client_call_lock);
rxrpc_wake_up_io_thread(local);
return 0;
@@ -434,7 +434,7 @@ error_attached_to_socket:
/*
* Set up an incoming call. call->conn points to the connection.
- * This is called in BH context and isn't allowed to fail.
+ * This is called with interrupts disabled and isn't allowed to fail.
*/
void rxrpc_incoming_call(struct rxrpc_sock *rx,
struct rxrpc_call *call,
@@ -531,11 +531,29 @@ void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace why)
}
/*
- * Clean up the Rx skb ring.
+ * Clean up the transmission buffers.
+ */
+static void rxrpc_cleanup_tx_buffers(struct rxrpc_call *call)
+{
+ struct rxrpc_txqueue *tq, *next;
+
+ for (tq = call->tx_queue; tq; tq = next) {
+ next = tq->next;
+ for (int i = 0; i < RXRPC_NR_TXQUEUE; i++)
+ if (tq->bufs[i])
+ rxrpc_put_txbuf(tq->bufs[i], rxrpc_txbuf_put_cleaned);
+ trace_rxrpc_tq(call, tq, 0, rxrpc_tq_cleaned);
+ kfree(tq);
+ }
+}
+
+/*
+ * Clean up the receive buffers.
*/
-static void rxrpc_cleanup_ring(struct rxrpc_call *call)
+static void rxrpc_cleanup_rx_buffers(struct rxrpc_call *call)
{
rxrpc_purge_queue(&call->recvmsg_queue);
+ rxrpc_purge_queue(&call->rx_queue);
rxrpc_purge_queue(&call->rx_oos_queue);
}
@@ -558,7 +576,7 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call)
rxrpc_put_call_slot(call);
/* Make sure we don't get any more notifications */
- spin_lock(&rx->recvmsg_lock);
+ spin_lock_irq(&rx->recvmsg_lock);
if (!list_empty(&call->recvmsg_link)) {
_debug("unlinking once-pending call %p { e=%lx f=%lx }",
@@ -571,7 +589,7 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call)
call->recvmsg_link.next = NULL;
call->recvmsg_link.prev = NULL;
- spin_unlock(&rx->recvmsg_lock);
+ spin_unlock_irq(&rx->recvmsg_lock);
if (put)
rxrpc_put_call(call, rxrpc_call_put_unnotify);
@@ -671,23 +689,11 @@ static void rxrpc_rcu_free_call(struct rcu_head *rcu)
static void rxrpc_destroy_call(struct work_struct *work)
{
struct rxrpc_call *call = container_of(work, struct rxrpc_call, destroyer);
- struct rxrpc_txbuf *txb;
del_timer_sync(&call->timer);
- rxrpc_free_skb(call->cong_last_nack, rxrpc_skb_put_last_nack);
- rxrpc_cleanup_ring(call);
- while ((txb = list_first_entry_or_null(&call->tx_sendmsg,
- struct rxrpc_txbuf, call_link))) {
- list_del(&txb->call_link);
- rxrpc_put_txbuf(txb, rxrpc_txbuf_put_cleaned);
- }
- while ((txb = list_first_entry_or_null(&call->tx_buffer,
- struct rxrpc_txbuf, call_link))) {
- list_del(&txb->call_link);
- rxrpc_put_txbuf(txb, rxrpc_txbuf_put_cleaned);
- }
-
+ rxrpc_cleanup_tx_buffers(call);
+ rxrpc_cleanup_rx_buffers(call);
rxrpc_put_txbuf(call->tx_pending, rxrpc_txbuf_put_cleaned);
rxrpc_put_connection(call->conn, rxrpc_conn_put_call);
rxrpc_deactivate_bundle(call->bundle);
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index bb11e8289d6d..db0099197890 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -231,7 +231,7 @@ static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn)
distance = id - id_cursor;
if (distance < 0)
distance = -distance;
- limit = max_t(unsigned long, atomic_read(&rxnet->nr_conns) * 4, 1024);
+ limit = umax(atomic_read(&rxnet->nr_conns) * 4, 1024);
if (distance > limit)
goto mark_dont_reuse;
@@ -437,9 +437,9 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn,
call->dest_srx.srx_service = conn->service_id;
call->cong_ssthresh = call->peer->cong_ssthresh;
if (call->cong_cwnd >= call->cong_ssthresh)
- call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE;
+ call->cong_ca_state = RXRPC_CA_CONGEST_AVOIDANCE;
else
- call->cong_mode = RXRPC_CALL_SLOW_START;
+ call->cong_ca_state = RXRPC_CA_SLOW_START;
chan->call_id = call_id;
chan->call_debug_id = call->debug_id;
@@ -508,16 +508,18 @@ static void rxrpc_activate_channels(struct rxrpc_bundle *bundle)
void rxrpc_connect_client_calls(struct rxrpc_local *local)
{
struct rxrpc_call *call;
+ LIST_HEAD(new_client_calls);
- while ((call = list_first_entry_or_null(&local->new_client_calls,
- struct rxrpc_call, wait_link))
- ) {
+ spin_lock_irq(&local->client_call_lock);
+ list_splice_tail_init(&local->new_client_calls, &new_client_calls);
+ spin_unlock_irq(&local->client_call_lock);
+
+ while ((call = list_first_entry_or_null(&new_client_calls,
+ struct rxrpc_call, wait_link))) {
struct rxrpc_bundle *bundle = call->bundle;
- spin_lock(&local->client_call_lock);
list_move_tail(&call->wait_link, &bundle->waiting_calls);
rxrpc_see_call(call, rxrpc_call_see_waiting_call);
- spin_unlock(&local->client_call_lock);
if (rxrpc_bundle_has_space(bundle))
rxrpc_activate_channels(bundle);
@@ -545,9 +547,9 @@ void rxrpc_expose_client_call(struct rxrpc_call *call)
set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags);
trace_rxrpc_client(conn, channel, rxrpc_client_exposed);
- spin_lock(&call->peer->lock);
+ spin_lock_irq(&call->peer->lock);
hlist_add_head(&call->error_link, &call->peer->error_targets);
- spin_unlock(&call->peer->lock);
+ spin_unlock_irq(&call->peer->lock);
}
}
@@ -588,9 +590,9 @@ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call
ASSERTCMP(call->call_id, ==, 0);
ASSERT(!test_bit(RXRPC_CALL_EXPOSED, &call->flags));
/* May still be on ->new_client_calls. */
- spin_lock(&local->client_call_lock);
+ spin_lock_irq(&local->client_call_lock);
list_del_init(&call->wait_link);
- spin_unlock(&local->client_call_lock);
+ spin_unlock_irq(&local->client_call_lock);
return;
}
diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index 598b4ee389fc..713e04394ceb 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -26,7 +26,7 @@ static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, struct sk_buff
bool aborted = false;
if (conn->state != RXRPC_CONN_ABORTED) {
- spin_lock(&conn->state_lock);
+ spin_lock_irq(&conn->state_lock);
if (conn->state != RXRPC_CONN_ABORTED) {
conn->abort_code = abort_code;
conn->error = err;
@@ -37,7 +37,7 @@ static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, struct sk_buff
set_bit(RXRPC_CONN_EV_ABORT_CALLS, &conn->events);
aborted = true;
}
- spin_unlock(&conn->state_lock);
+ spin_unlock_irq(&conn->state_lock);
}
return aborted;
@@ -63,11 +63,12 @@ int rxrpc_abort_conn(struct rxrpc_connection *conn, struct sk_buff *skb,
/*
* Mark a connection as being remotely aborted.
*/
-static bool rxrpc_input_conn_abort(struct rxrpc_connection *conn,
+static void rxrpc_input_conn_abort(struct rxrpc_connection *conn,
struct sk_buff *skb)
{
- return rxrpc_set_conn_aborted(conn, skb, skb->priority, -ECONNABORTED,
- RXRPC_CALL_REMOTELY_ABORTED);
+ trace_rxrpc_rx_conn_abort(conn, skb);
+ rxrpc_set_conn_aborted(conn, skb, skb->priority, -ECONNABORTED,
+ RXRPC_CALL_REMOTELY_ABORTED);
}
/*
@@ -91,7 +92,7 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
struct rxrpc_acktrailer trailer;
size_t len;
int ret, ioc;
- u32 serial, mtu, call_id, padding;
+ u32 serial, max_mtu, if_mtu, call_id, padding;
_enter("%d", conn->debug_id);
@@ -149,8 +150,13 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
break;
case RXRPC_PACKET_TYPE_ACK:
- mtu = conn->peer->if_mtu;
- mtu -= conn->peer->hdrsize;
+ if_mtu = conn->peer->if_mtu - conn->peer->hdrsize;
+ if (conn->peer->ackr_adv_pmtud) {
+ max_mtu = umax(conn->peer->max_data, rxrpc_rx_mtu);
+ } else {
+ if_mtu = umin(1444, if_mtu);
+ max_mtu = if_mtu;
+ }
pkt.ack.bufferSpace = 0;
pkt.ack.maxSkew = htons(skb ? skb->priority : 0);
pkt.ack.firstPacket = htonl(chan->last_seq + 1);
@@ -158,10 +164,10 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
pkt.ack.serial = htonl(skb ? sp->hdr.serial : 0);
pkt.ack.reason = skb ? RXRPC_ACK_DUPLICATE : RXRPC_ACK_IDLE;
pkt.ack.nAcks = 0;
- trailer.maxMTU = htonl(rxrpc_rx_mtu);
- trailer.ifMTU = htonl(mtu);
+ trailer.maxMTU = htonl(max_mtu);
+ trailer.ifMTU = htonl(if_mtu);
trailer.rwind = htonl(rxrpc_rx_window_size);
- trailer.jumbo_max = htonl(rxrpc_rx_jumbo_max);
+ trailer.jumbo_max = 0;
pkt.whdr.flags |= RXRPC_SLOW_START_OK;
padding = 0;
iov[0].iov_len += sizeof(pkt.ack);
@@ -171,7 +177,8 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
trace_rxrpc_tx_ack(chan->call_debug_id, serial,
ntohl(pkt.ack.firstPacket),
ntohl(pkt.ack.serial),
- pkt.ack.reason, 0, rxrpc_rx_window_size);
+ pkt.ack.reason, 0, rxrpc_rx_window_size,
+ rxrpc_propose_ack_retransmit);
break;
default:
@@ -202,11 +209,14 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn)
for (i = 0; i < RXRPC_MAXCALLS; i++) {
call = conn->channels[i].call;
- if (call)
+ if (call) {
+ rxrpc_see_call(call, rxrpc_call_see_conn_abort);
rxrpc_set_call_completion(call,
conn->completion,
conn->abort_code,
conn->error);
+ rxrpc_poke_call(call, rxrpc_call_poke_conn_abort);
+ }
}
_leave("");
@@ -252,10 +262,10 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
if (ret < 0)
return ret;
- spin_lock(&conn->state_lock);
+ spin_lock_irq(&conn->state_lock);
if (conn->state == RXRPC_CONN_SERVICE_CHALLENGING)
conn->state = RXRPC_CONN_SERVICE;
- spin_unlock(&conn->state_lock);
+ spin_unlock_irq(&conn->state_lock);
if (conn->state == RXRPC_CONN_SERVICE) {
/* Offload call state flipping to the I/O thread. As
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 694c4df7a1a3..7eba4d7d9a38 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -31,13 +31,13 @@ void rxrpc_poke_conn(struct rxrpc_connection *conn, enum rxrpc_conn_trace why)
if (WARN_ON_ONCE(!local))
return;
- spin_lock_bh(&local->lock);
+ spin_lock_irq(&local->lock);
busy = !list_empty(&conn->attend_link);
if (!busy) {
rxrpc_get_connection(conn, why);
list_add_tail(&conn->attend_link, &local->conn_attend_q);
}
- spin_unlock_bh(&local->lock);
+ spin_unlock_irq(&local->lock);
rxrpc_wake_up_io_thread(local);
}
@@ -196,9 +196,9 @@ void rxrpc_disconnect_call(struct rxrpc_call *call)
call->peer->cong_ssthresh = call->cong_ssthresh;
if (!hlist_unhashed(&call->error_link)) {
- spin_lock(&call->peer->lock);
+ spin_lock_irq(&call->peer->lock);
hlist_del_init(&call->error_link);
- spin_unlock(&call->peer->lock);
+ spin_unlock_irq(&call->peer->lock);
}
if (rxrpc_is_client_call(call)) {
@@ -321,6 +321,12 @@ static void rxrpc_clean_up_connection(struct work_struct *work)
list_del_init(&conn->proc_link);
write_unlock(&rxnet->conn_lock);
+ if (conn->pmtud_probe) {
+ trace_rxrpc_pmtud_lost(conn, 0);
+ conn->peer->pmtud_probing = false;
+ conn->peer->pmtud_pending = true;
+ }
+
rxrpc_purge_queue(&conn->rx_queue);
rxrpc_kill_client_conn(conn);
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 16d49a861dbb..4974b5accafa 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -27,80 +27,68 @@ static void rxrpc_proto_abort(struct rxrpc_call *call, rxrpc_seq_t seq,
}
/*
- * Do TCP-style congestion management [RFC 5681].
+ * Do TCP-style congestion management [RFC5681].
*/
static void rxrpc_congestion_management(struct rxrpc_call *call,
- struct sk_buff *skb,
- struct rxrpc_ack_summary *summary,
- rxrpc_serial_t acked_serial)
+ struct rxrpc_ack_summary *summary)
{
- enum rxrpc_congest_change change = rxrpc_cong_no_change;
- unsigned int cumulative_acks = call->cong_cumul_acks;
- unsigned int cwnd = call->cong_cwnd;
- bool resend = false;
-
- summary->flight_size =
- (call->tx_top - call->acks_hard_ack) - summary->nr_acks;
+ summary->change = rxrpc_cong_no_change;
+ summary->in_flight = rxrpc_tx_in_flight(call);
if (test_and_clear_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags)) {
summary->retrans_timeo = true;
- call->cong_ssthresh = max_t(unsigned int,
- summary->flight_size / 2, 2);
- cwnd = 1;
- if (cwnd >= call->cong_ssthresh &&
- call->cong_mode == RXRPC_CALL_SLOW_START) {
- call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE;
- call->cong_tstamp = skb->tstamp;
- cumulative_acks = 0;
+ call->cong_ssthresh = umax(summary->in_flight / 2, 2);
+ call->cong_cwnd = 1;
+ if (call->cong_cwnd >= call->cong_ssthresh &&
+ call->cong_ca_state == RXRPC_CA_SLOW_START) {
+ call->cong_ca_state = RXRPC_CA_CONGEST_AVOIDANCE;
+ call->cong_tstamp = call->acks_latest_ts;
+ call->cong_cumul_acks = 0;
}
}
- cumulative_acks += summary->nr_new_acks;
- if (cumulative_acks > 255)
- cumulative_acks = 255;
-
- summary->cwnd = call->cong_cwnd;
- summary->ssthresh = call->cong_ssthresh;
- summary->cumulative_acks = cumulative_acks;
- summary->dup_acks = call->cong_dup_acks;
+ call->cong_cumul_acks += summary->nr_new_sacks;
+ call->cong_cumul_acks += summary->nr_new_hacks;
+ if (call->cong_cumul_acks > 255)
+ call->cong_cumul_acks = 255;
- switch (call->cong_mode) {
- case RXRPC_CALL_SLOW_START:
- if (summary->saw_nacks)
+ switch (call->cong_ca_state) {
+ case RXRPC_CA_SLOW_START:
+ if (call->acks_nr_snacks > 0)
goto packet_loss_detected;
- if (summary->cumulative_acks > 0)
- cwnd += 1;
- if (cwnd >= call->cong_ssthresh) {
- call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE;
- call->cong_tstamp = skb->tstamp;
+ if (call->cong_cumul_acks > 0)
+ call->cong_cwnd += 1;
+ if (call->cong_cwnd >= call->cong_ssthresh) {
+ call->cong_ca_state = RXRPC_CA_CONGEST_AVOIDANCE;
+ call->cong_tstamp = call->acks_latest_ts;
}
goto out;
- case RXRPC_CALL_CONGEST_AVOIDANCE:
- if (summary->saw_nacks)
+ case RXRPC_CA_CONGEST_AVOIDANCE:
+ if (call->acks_nr_snacks > 0)
goto packet_loss_detected;
/* We analyse the number of packets that get ACK'd per RTT
* period and increase the window if we managed to fill it.
*/
- if (call->peer->rtt_count == 0)
+ if (call->rtt_count == 0)
goto out;
- if (ktime_before(skb->tstamp,
+ if (ktime_before(call->acks_latest_ts,
ktime_add_us(call->cong_tstamp,
- call->peer->srtt_us >> 3)))
+ call->srtt_us >> 3)))
goto out_no_clear_ca;
- change = rxrpc_cong_rtt_window_end;
- call->cong_tstamp = skb->tstamp;
- if (cumulative_acks >= cwnd)
- cwnd++;
+ summary->change = rxrpc_cong_rtt_window_end;
+ call->cong_tstamp = call->acks_latest_ts;
+ if (call->cong_cumul_acks >= call->cong_cwnd)
+ call->cong_cwnd++;
goto out;
- case RXRPC_CALL_PACKET_LOSS:
- if (!summary->saw_nacks)
+ case RXRPC_CA_PACKET_LOSS:
+ if (call->acks_nr_snacks == 0)
goto resume_normality;
- if (summary->new_low_nack) {
- change = rxrpc_cong_new_low_nack;
+ if (summary->new_low_snack) {
+ summary->change = rxrpc_cong_new_low_nack;
call->cong_dup_acks = 1;
if (call->cong_extra > 1)
call->cong_extra = 1;
@@ -111,31 +99,35 @@ static void rxrpc_congestion_management(struct rxrpc_call *call,
if (call->cong_dup_acks < 3)
goto send_extra_data;
- change = rxrpc_cong_begin_retransmission;
- call->cong_mode = RXRPC_CALL_FAST_RETRANSMIT;
- call->cong_ssthresh = max_t(unsigned int,
- summary->flight_size / 2, 2);
- cwnd = call->cong_ssthresh + 3;
+ summary->change = rxrpc_cong_begin_retransmission;
+ call->cong_ca_state = RXRPC_CA_FAST_RETRANSMIT;
+ call->cong_ssthresh = umax(summary->in_flight / 2, 2);
+ call->cong_cwnd = call->cong_ssthresh + 3;
call->cong_extra = 0;
call->cong_dup_acks = 0;
- resend = true;
+ summary->need_retransmit = true;
+ summary->in_fast_or_rto_recovery = true;
goto out;
- case RXRPC_CALL_FAST_RETRANSMIT:
- if (!summary->new_low_nack) {
- if (summary->nr_new_acks == 0)
- cwnd += 1;
+ case RXRPC_CA_FAST_RETRANSMIT:
+ rxrpc_tlp_init(call);
+ summary->in_fast_or_rto_recovery = true;
+ if (!summary->new_low_snack) {
+ if (summary->nr_new_sacks == 0)
+ call->cong_cwnd += 1;
call->cong_dup_acks++;
if (call->cong_dup_acks == 2) {
- change = rxrpc_cong_retransmit_again;
+ summary->change = rxrpc_cong_retransmit_again;
call->cong_dup_acks = 0;
- resend = true;
+ summary->need_retransmit = true;
}
} else {
- change = rxrpc_cong_progress;
- cwnd = call->cong_ssthresh;
- if (!summary->saw_nacks)
+ summary->change = rxrpc_cong_progress;
+ call->cong_cwnd = call->cong_ssthresh;
+ if (call->acks_nr_snacks == 0) {
+ summary->exiting_fast_or_rto_recovery = true;
goto resume_normality;
+ }
}
goto out;
@@ -145,30 +137,25 @@ static void rxrpc_congestion_management(struct rxrpc_call *call,
}
resume_normality:
- change = rxrpc_cong_cleared_nacks;
+ summary->change = rxrpc_cong_cleared_nacks;
call->cong_dup_acks = 0;
call->cong_extra = 0;
- call->cong_tstamp = skb->tstamp;
- if (cwnd < call->cong_ssthresh)
- call->cong_mode = RXRPC_CALL_SLOW_START;
+ call->cong_tstamp = call->acks_latest_ts;
+ if (call->cong_cwnd < call->cong_ssthresh)
+ call->cong_ca_state = RXRPC_CA_SLOW_START;
else
- call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE;
+ call->cong_ca_state = RXRPC_CA_CONGEST_AVOIDANCE;
out:
- cumulative_acks = 0;
+ call->cong_cumul_acks = 0;
out_no_clear_ca:
- if (cwnd >= RXRPC_TX_MAX_WINDOW)
- cwnd = RXRPC_TX_MAX_WINDOW;
- call->cong_cwnd = cwnd;
- call->cong_cumul_acks = cumulative_acks;
- summary->mode = call->cong_mode;
- trace_rxrpc_congest(call, summary, acked_serial, change);
- if (resend)
- rxrpc_resend(call, skb);
+ if (call->cong_cwnd >= RXRPC_TX_MAX_WINDOW)
+ call->cong_cwnd = RXRPC_TX_MAX_WINDOW;
+ trace_rxrpc_congest(call, summary);
return;
packet_loss_detected:
- change = rxrpc_cong_saw_nack;
- call->cong_mode = RXRPC_CALL_PACKET_LOSS;
+ summary->change = rxrpc_cong_saw_nack;
+ call->cong_ca_state = RXRPC_CA_PACKET_LOSS;
call->cong_dup_acks = 0;
goto send_extra_data;
@@ -177,7 +164,7 @@ send_extra_data:
* state.
*/
if (test_bit(RXRPC_CALL_TX_LAST, &call->flags) ||
- summary->nr_acks != call->tx_top - call->acks_hard_ack) {
+ call->acks_nr_sacks != call->tx_top - call->tx_bottom) {
call->cong_extra++;
wake_up(&call->waitq);
}
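The rewritten congestion management above keeps the classic RFC 5681 growth rules but moves all of the working state (cwnd, cumulative ACK count, CA state) into the call struct instead of stack locals. A minimal sketch of just those growth rules, using the field names from this patch (rxrpc_cwnd_grow_sketch is a hypothetical helper, not part of the patch):

	static void rxrpc_cwnd_grow_sketch(struct rxrpc_call *call)
	{
		switch (call->cong_ca_state) {
		case RXRPC_CA_SLOW_START:
			if (call->cong_cumul_acks > 0)
				call->cong_cwnd += 1;	/* one segment per batch of new ACKs */
			break;
		case RXRPC_CA_CONGEST_AVOIDANCE:
			if (call->cong_cumul_acks >= call->cong_cwnd)
				call->cong_cwnd += 1;	/* roughly one segment per RTT */
			break;
		default:
			break;
		}
		call->cong_cwnd = umin(call->cong_cwnd, RXRPC_TX_MAX_WINDOW);
	}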
@@ -189,26 +176,42 @@ send_extra_data:
*/
void rxrpc_congestion_degrade(struct rxrpc_call *call)
{
- ktime_t rtt, now;
+ ktime_t rtt, now, time_since;
- if (call->cong_mode != RXRPC_CALL_SLOW_START &&
- call->cong_mode != RXRPC_CALL_CONGEST_AVOIDANCE)
+ if (call->cong_ca_state != RXRPC_CA_SLOW_START &&
+ call->cong_ca_state != RXRPC_CA_CONGEST_AVOIDANCE)
return;
if (__rxrpc_call_state(call) == RXRPC_CALL_CLIENT_AWAIT_REPLY)
return;
- rtt = ns_to_ktime(call->peer->srtt_us * (1000 / 8));
+ rtt = ns_to_ktime(call->srtt_us * (NSEC_PER_USEC / 8));
now = ktime_get_real();
- if (!ktime_before(ktime_add(call->tx_last_sent, rtt), now))
+ time_since = ktime_sub(now, call->tx_last_sent);
+ if (ktime_before(time_since, rtt))
return;
- trace_rxrpc_reset_cwnd(call, now);
+ trace_rxrpc_reset_cwnd(call, time_since, rtt);
rxrpc_inc_stat(call->rxnet, stat_tx_data_cwnd_reset);
call->tx_last_sent = now;
- call->cong_mode = RXRPC_CALL_SLOW_START;
- call->cong_ssthresh = max_t(unsigned int, call->cong_ssthresh,
- call->cong_cwnd * 3 / 4);
- call->cong_cwnd = max_t(unsigned int, call->cong_cwnd / 2, RXRPC_MIN_CWND);
+ call->cong_ca_state = RXRPC_CA_SLOW_START;
+ call->cong_ssthresh = umax(call->cong_ssthresh, call->cong_cwnd * 3 / 4);
+ call->cong_cwnd = umax(call->cong_cwnd / 2, RXRPC_MIN_CWND);
+}
+
+/*
+ * Add an RTT sample derived from an ACK'd DATA packet.
+ */
+static void rxrpc_add_data_rtt_sample(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary,
+ struct rxrpc_txqueue *tq,
+ int ix)
+{
+ ktime_t xmit_ts = ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[ix]);
+
+ rxrpc_call_add_rtt(call, rxrpc_rtt_rx_data_ack, -1,
+ summary->acked_serial, summary->ack_serial,
+ xmit_ts, call->acks_latest_ts);
+ __clear_bit(ix, &tq->rtt_samples); /* Prevent repeat RTT sample */
}
/*
@@ -217,37 +220,120 @@ void rxrpc_congestion_degrade(struct rxrpc_call *call)
static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to,
struct rxrpc_ack_summary *summary)
{
- struct rxrpc_txbuf *txb;
- bool rot_last = false;
+ struct rxrpc_txqueue *tq = call->tx_queue;
+ rxrpc_seq_t seq = call->tx_bottom + 1;
+ bool rot_last = false, trace = false;
- list_for_each_entry_rcu(txb, &call->tx_buffer, call_link, false) {
- if (before_eq(txb->seq, call->acks_hard_ack))
- continue;
- if (txb->flags & RXRPC_LAST_PACKET) {
+ _enter("%x,%x", call->tx_bottom, to);
+
+ trace_rxrpc_tx_rotate(call, seq, to);
+ trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate);
+
+ if (call->acks_lowest_nak == call->tx_bottom) {
+ call->acks_lowest_nak = to;
+ } else if (after(to, call->acks_lowest_nak)) {
+ summary->new_low_snack = true;
+ call->acks_lowest_nak = to;
+ }
+
+ /* We may have a left over fully-consumed buffer at the front that we
+ * couldn't drop before (rotate_and_keep below).
+ */
+ if (seq == call->tx_qbase + RXRPC_NR_TXQUEUE) {
+ call->tx_qbase += RXRPC_NR_TXQUEUE;
+ call->tx_queue = tq->next;
+ trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate_and_free);
+ kfree(tq);
+ tq = call->tx_queue;
+ }
+
+ do {
+ unsigned int ix = seq - call->tx_qbase;
+
+ _debug("tq=%x seq=%x i=%d f=%x", tq->qbase, seq, ix, tq->bufs[ix]->flags);
+ if (tq->bufs[ix]->flags & RXRPC_LAST_PACKET) {
set_bit(RXRPC_CALL_TX_LAST, &call->flags);
rot_last = true;
}
- if (txb->seq == to)
- break;
- }
- if (rot_last)
- set_bit(RXRPC_CALL_TX_ALL_ACKED, &call->flags);
+ if (summary->acked_serial == tq->segment_serial[ix] &&
+ test_bit(ix, &tq->rtt_samples))
+ rxrpc_add_data_rtt_sample(call, summary, tq, ix);
+
+ if (ix == tq->nr_reported_acks) {
+ /* Packet directly hard ACK'd. */
+ tq->nr_reported_acks++;
+ rxrpc_input_rack_one(call, summary, tq, ix);
+ if (seq == call->tlp_seq)
+ summary->tlp_probe_acked = true;
+ summary->nr_new_hacks++;
+ __set_bit(ix, &tq->segment_acked);
+ trace_rxrpc_rotate(call, tq, summary, seq, rxrpc_rotate_trace_hack);
+ } else if (test_bit(ix, &tq->segment_acked)) {
+ /* Soft ACK -> hard ACK. */
+ call->acks_nr_sacks--;
+ trace_rxrpc_rotate(call, tq, summary, seq, rxrpc_rotate_trace_sack);
+ } else {
+ /* Soft NAK -> hard ACK. */
+ call->acks_nr_snacks--;
+ rxrpc_input_rack_one(call, summary, tq, ix);
+ if (seq == call->tlp_seq)
+ summary->tlp_probe_acked = true;
+ summary->nr_new_hacks++;
+ __set_bit(ix, &tq->segment_acked);
+ trace_rxrpc_rotate(call, tq, summary, seq, rxrpc_rotate_trace_snak);
+ }
- _enter("%x,%x,%x,%d", to, call->acks_hard_ack, call->tx_top, rot_last);
+ call->tx_nr_sent--;
+ if (__test_and_clear_bit(ix, &tq->segment_lost))
+ call->tx_nr_lost--;
+ if (__test_and_clear_bit(ix, &tq->segment_retransmitted))
+ call->tx_nr_resent--;
+ __clear_bit(ix, &tq->ever_retransmitted);
- if (call->acks_lowest_nak == call->acks_hard_ack) {
- call->acks_lowest_nak = to;
- } else if (after(to, call->acks_lowest_nak)) {
- summary->new_low_nack = true;
- call->acks_lowest_nak = to;
+ rxrpc_put_txbuf(tq->bufs[ix], rxrpc_txbuf_put_rotated);
+ tq->bufs[ix] = NULL;
+
+ WRITE_ONCE(call->tx_bottom, seq);
+ trace_rxrpc_txqueue(call, (rot_last ?
+ rxrpc_txqueue_rotate_last :
+ rxrpc_txqueue_rotate));
+
+ seq++;
+ trace = true;
+ if (!(seq & RXRPC_TXQ_MASK)) {
+ trace_rxrpc_rack_update(call, summary);
+ trace = false;
+ prefetch(tq->next);
+ if (tq != call->tx_qtail) {
+ call->tx_qbase += RXRPC_NR_TXQUEUE;
+ call->tx_queue = tq->next;
+ trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate_and_free);
+ kfree(tq);
+ tq = call->tx_queue;
+ } else {
+ trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate_and_keep);
+ tq = NULL;
+ break;
+ }
+ }
+
+ } while (before_eq(seq, to));
+
+ if (trace)
+ trace_rxrpc_rack_update(call, summary);
+
+ if (rot_last) {
+ set_bit(RXRPC_CALL_TX_ALL_ACKED, &call->flags);
+ if (tq) {
+ trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate_and_free);
+ kfree(tq);
+ call->tx_queue = NULL;
+ }
}
- smp_store_release(&call->acks_hard_ack, to);
+ _debug("%x,%x,%x,%d", to, call->tx_bottom, call->tx_top, rot_last);
- trace_rxrpc_txqueue(call, (rot_last ?
- rxrpc_txqueue_rotate_last :
- rxrpc_txqueue_rotate));
wake_up(&call->waitq);
return rot_last;
}
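Each rxrpc_txqueue spans RXRPC_NR_TXQUEUE consecutive DATA segments (32 or 64, matching the machine word size) starting at qbase, so the per-segment state consulted above (segment_acked, segment_lost, segment_retransmitted, rtt_samples) is one bit per segment in a single word and the slot index is simply seq - qbase. That is what lets the rotation and the RACK bookkeeping run as cheap bit operations instead of walking a list of individual txbufs.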
@@ -263,13 +349,10 @@ static void rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun,
{
ASSERT(test_bit(RXRPC_CALL_TX_LAST, &call->flags));
- call->resend_at = KTIME_MAX;
- trace_rxrpc_timer_can(call, rxrpc_timer_trace_resend);
-
- if (unlikely(call->cong_last_nack)) {
- rxrpc_free_skb(call->cong_last_nack, rxrpc_skb_put_last_nack);
- call->cong_last_nack = NULL;
- }
+ call->rack_timer_mode = RXRPC_CALL_RACKTIMER_OFF;
+ call->rack_timo_at = KTIME_MAX;
+ trace_rxrpc_rack_timer(call, 0, false);
+ trace_rxrpc_timer_can(call, rxrpc_timer_trace_rack_off + call->rack_timer_mode);
switch (__rxrpc_call_state(call)) {
case RXRPC_CALL_CLIENT_SEND_REQUEST:
@@ -365,7 +448,7 @@ static void rxrpc_input_queue_data(struct rxrpc_call *call, struct sk_buff *skb,
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
bool last = sp->hdr.flags & RXRPC_LAST_PACKET;
- __skb_queue_tail(&call->recvmsg_queue, skb);
+ skb_queue_tail(&call->recvmsg_queue, skb);
rxrpc_input_update_ack_window(call, window, wtop);
trace_rxrpc_receive(call, last ? why + 1 : why, sp->hdr.serial, sp->hdr.seq);
if (last)
@@ -442,7 +525,6 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb,
rxrpc_get_skb(skb, rxrpc_skb_get_to_recvmsg);
- spin_lock(&call->recvmsg_queue.lock);
rxrpc_input_queue_data(call, skb, window, wtop, rxrpc_receive_queue);
*_notify = true;
@@ -464,8 +546,6 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb,
rxrpc_receive_queue_oos);
}
- spin_unlock(&call->recvmsg_queue.lock);
-
call->ackr_sack_base = sack;
} else {
unsigned int slot;
@@ -530,7 +610,7 @@ static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb
unsigned int offset = sizeof(struct rxrpc_wire_header);
unsigned int len = skb->len - offset;
bool notify = false;
- int ack_reason = 0;
+ int ack_reason = 0, count = 1, stat_ix;
while (sp->hdr.flags & RXRPC_JUMBO_PACKET) {
if (len < RXRPC_JUMBO_SUBPKTLEN)
@@ -559,12 +639,16 @@ static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb
sp->hdr.serial++;
offset += RXRPC_JUMBO_SUBPKTLEN;
len -= RXRPC_JUMBO_SUBPKTLEN;
+ count++;
}
sp->offset = offset;
sp->len = len;
rxrpc_input_data_one(call, skb, &notify, &ack_serial, &ack_reason);
+ stat_ix = umin(count, ARRAY_SIZE(call->rxnet->stat_rx_jumbo)) - 1;
+ atomic_inc(&call->rxnet->stat_rx_jumbo[stat_ix]);
+
if (ack_reason > 0) {
rxrpc_send_ACK(call, ack_reason, ack_serial,
rxrpc_propose_ack_input_data);
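As a concrete case of the accounting added above: a UDP packet carrying three DATA subpackets bumps stat_rx_jumbo[2] (count starts at 1 and is incremented once per extra subpacket), and any jumbo with more subpackets than the stats array has buckets for is folded into the final bucket by the umin() clamp.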
@@ -667,7 +751,7 @@ static void rxrpc_complete_rtt_probe(struct rxrpc_call *call,
clear_bit(i + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail);
smp_mb(); /* Read data before setting avail bit */
set_bit(i, &call->rtt_avail);
- rxrpc_peer_add_rtt(call, type, i, acked_serial, ack_serial,
+ rxrpc_call_add_rtt(call, type, i, acked_serial, ack_serial,
sent_at, resp_time);
matched = true;
}
@@ -677,7 +761,7 @@ static void rxrpc_complete_rtt_probe(struct rxrpc_call *call,
*/
if (after(acked_serial, orig_serial)) {
trace_rxrpc_rtt_rx(call, rxrpc_rtt_rx_obsolete, i,
- orig_serial, acked_serial, 0, 0);
+ orig_serial, acked_serial, 0, 0, 0);
clear_bit(i + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail);
smp_wmb();
set_bit(i, &call->rtt_avail);
@@ -685,7 +769,7 @@ static void rxrpc_complete_rtt_probe(struct rxrpc_call *call,
}
if (!matched)
- trace_rxrpc_rtt_rx(call, rxrpc_rtt_rx_lost, 9, 0, acked_serial, 0, 0);
+ trace_rxrpc_rtt_rx(call, rxrpc_rtt_rx_lost, 9, 0, acked_serial, 0, 0, 0);
}
/*
@@ -695,10 +779,13 @@ static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb
struct rxrpc_acktrailer *trailer)
{
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- struct rxrpc_peer *peer;
- unsigned int mtu;
+ struct rxrpc_peer *peer = call->peer;
+ unsigned int max_data, capacity;
bool wake = false;
- u32 rwind = ntohl(trailer->rwind);
+ u32 max_mtu = ntohl(trailer->maxMTU);
+ //u32 if_mtu = ntohl(trailer->ifMTU);
+ u32 rwind = ntohl(trailer->rwind);
+ u32 jumbo_max = ntohl(trailer->jumbo_max);
if (rwind > RXRPC_TX_MAX_WINDOW)
rwind = RXRPC_TX_MAX_WINDOW;
@@ -709,54 +796,149 @@ static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb
call->tx_winsize = rwind;
}
- mtu = min(ntohl(trailer->maxMTU), ntohl(trailer->ifMTU));
+ max_mtu = clamp(max_mtu, 500, 65535);
+ peer->ackr_max_data = max_mtu;
- peer = call->peer;
- if (mtu < peer->maxdata) {
- spin_lock(&peer->lock);
- peer->maxdata = mtu;
- peer->mtu = mtu + peer->hdrsize;
- spin_unlock(&peer->lock);
+ if (max_mtu < peer->max_data) {
+ trace_rxrpc_pmtud_reduce(peer, sp->hdr.serial, max_mtu,
+ rxrpc_pmtud_reduce_ack);
+ write_seqcount_begin(&peer->mtu_lock);
+ peer->max_data = max_mtu;
+ write_seqcount_end(&peer->mtu_lock);
+ }
+
+ max_data = umin(max_mtu, peer->max_data);
+ capacity = max_data;
+ capacity += sizeof(struct rxrpc_jumbo_header); /* First subpacket has main hdr, not jumbo */
+ capacity /= sizeof(struct rxrpc_jumbo_header) + RXRPC_JUMBO_DATALEN;
+
+ if (jumbo_max == 0) {
+ /* The peer says it supports pmtu discovery */
+ peer->ackr_adv_pmtud = true;
+ } else {
+ peer->ackr_adv_pmtud = false;
+ capacity = clamp(capacity, 1, jumbo_max);
}
+ call->tx_jumbo_max = capacity;
+
if (wake)
wake_up(&call->waitq);
}
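To make the capacity arithmetic above concrete, assuming sizeof(struct rxrpc_jumbo_header) is 4 and RXRPC_JUMBO_DATALEN is 1412 (illustrative figures, not additional protocol constants):

	/* Suppose max_data works out at 8192: */
	capacity  = 8192 + 4;		/* first subpacket carries no jumbo header, so credit one back */
	capacity /= 4 + 1412;		/* 8196 / 1416 == 5 DATA subpackets per jumbo packet */

which matches the five 1412-byte subpackets that an 8192-byte MTU is expected to carry.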
+#if defined(CONFIG_X86) && __GNUC__ && !defined(__clang__)
+/* Clang doesn't support the %z constraint modifier */
+#define shiftr_adv_rotr(shift_from, rotate_into) ({ \
+ asm(" shr%z1 %1\n" \
+ " inc %0\n" \
+ " rcr%z2 %2\n" \
+ : "+d"(shift_from), "+m"(*(shift_from)), "+rm"(rotate_into) \
+ ); \
+ })
+#else
+#define shiftr_adv_rotr(shift_from, rotate_into) ({ \
+ typeof(rotate_into) __bit0 = *(shift_from) & 1; \
+ *(shift_from) >>= 1; \
+ shift_from++; \
+ rotate_into >>= 1; \
+ rotate_into |= __bit0 << (sizeof(rotate_into) * 8 - 1); \
+ })
+#endif
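The helper above consumes one SACK byte per step: it takes the byte's bottom bit (the low bit is what distinguishes an ACK from a NACK on the wire), advances the byte pointer and rotates that bit into the top of the accumulator, so that after a full batch the bit for segment qbase + i sits at bit position i. A plain-C model of the accumulator behaviour of a single step (the real macro also shifts the SACK byte in place; decant_one_sack is a hypothetical name):

	static void decant_one_sack(const u8 **acks, unsigned long *map)
	{
		unsigned long bit = *(*acks)++ & 1;

		*map = (*map >> 1) | (bit << (BITS_PER_LONG - 1));
	}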
+
/*
- * Determine how many nacks from the previous ACK have now been satisfied.
+ * Deal with RTT samples from soft ACKs.
*/
-static rxrpc_seq_t rxrpc_input_check_prev_ack(struct rxrpc_call *call,
- struct rxrpc_ack_summary *summary,
- rxrpc_seq_t seq)
+static void rxrpc_input_soft_rtt(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary,
+ struct rxrpc_txqueue *tq)
{
- struct sk_buff *skb = call->cong_last_nack;
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- unsigned int i, new_acks = 0, retained_nacks = 0;
- rxrpc_seq_t old_seq = sp->ack.first_ack;
- u8 *acks = skb->data + sizeof(struct rxrpc_wire_header) + sizeof(struct rxrpc_ackpacket);
+ for (int ix = 0; ix < RXRPC_NR_TXQUEUE; ix++)
+ if (summary->acked_serial == tq->segment_serial[ix])
+ return rxrpc_add_data_rtt_sample(call, summary, tq, ix);
+}
- if (after_eq(seq, old_seq + sp->ack.nr_acks)) {
- summary->nr_new_acks += sp->ack.nr_nacks;
- summary->nr_new_acks += seq - (old_seq + sp->ack.nr_acks);
- summary->nr_retained_nacks = 0;
- } else if (seq == old_seq) {
- summary->nr_retained_nacks = sp->ack.nr_nacks;
- } else {
- for (i = 0; i < sp->ack.nr_acks; i++) {
- if (acks[i] == RXRPC_ACK_TYPE_NACK) {
- if (before(old_seq + i, seq))
- new_acks++;
- else
- retained_nacks++;
- }
+/*
+ * Process a batch of soft ACKs specific to a transmission queue segment.
+ */
+static void rxrpc_input_soft_ack_tq(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary,
+ struct rxrpc_txqueue *tq,
+ unsigned long extracted_acks,
+ int nr_reported,
+ rxrpc_seq_t seq,
+ rxrpc_seq_t *lowest_nak)
+{
+ unsigned long old_reported = 0, flipped, new_acks = 0;
+ unsigned long a_to_n, n_to_a = 0;
+ int new, a, n;
+
+ if (tq->nr_reported_acks > 0)
+ old_reported = ~0UL >> (RXRPC_NR_TXQUEUE - tq->nr_reported_acks);
+
+ _enter("{%x,%lx,%d},%lx,%d,%x",
+ tq->qbase, tq->segment_acked, tq->nr_reported_acks,
+ extracted_acks, nr_reported, seq);
+
+ _debug("[%x]", tq->qbase);
+ _debug("tq %16lx %u", tq->segment_acked, tq->nr_reported_acks);
+ _debug("sack %16lx %u", extracted_acks, nr_reported);
+
+ /* See how many previously logged ACKs/NAKs have flipped. */
+ flipped = (tq->segment_acked ^ extracted_acks) & old_reported;
+ if (flipped) {
+ n_to_a = ~tq->segment_acked & flipped; /* Old NAK -> ACK */
+ a_to_n = tq->segment_acked & flipped; /* Old ACK -> NAK */
+ a = hweight_long(n_to_a);
+ n = hweight_long(a_to_n);
+ _debug("flip %16lx", flipped);
+ _debug("ntoa %16lx %d", n_to_a, a);
+ _debug("aton %16lx %d", a_to_n, n);
+ call->acks_nr_sacks += a - n;
+ call->acks_nr_snacks += n - a;
+ summary->nr_new_sacks += a;
+ summary->nr_new_snacks += n;
+ }
+
+ /* See how many new ACKs/NAKs have been acquired. */
+ new = nr_reported - tq->nr_reported_acks;
+ if (new > 0) {
+ new_acks = extracted_acks & ~old_reported;
+ if (new_acks) {
+ a = hweight_long(new_acks);
+ n = new - a;
+ _debug("new_a %16lx new=%d a=%d n=%d", new_acks, new, a, n);
+ call->acks_nr_sacks += a;
+ call->acks_nr_snacks += n;
+ summary->nr_new_sacks += a;
+ summary->nr_new_snacks += n;
+ } else {
+ call->acks_nr_snacks += new;
+ summary->nr_new_snacks += new;
}
+ }
+
+ tq->nr_reported_acks = nr_reported;
+ tq->segment_acked = extracted_acks;
+ trace_rxrpc_apply_acks(call, tq);
- summary->nr_new_acks += new_acks;
- summary->nr_retained_nacks = retained_nacks;
+ if (extracted_acks != ~0UL) {
+ rxrpc_seq_t lowest = seq + ffz(extracted_acks);
+
+ if (before(lowest, *lowest_nak))
+ *lowest_nak = lowest;
}
- return old_seq + sp->ack.nr_acks;
+ if (summary->acked_serial)
+ rxrpc_input_soft_rtt(call, summary, tq);
+
+ new_acks |= n_to_a;
+ if (new_acks)
+ rxrpc_input_rack(call, summary, tq, new_acks);
+
+ if (call->tlp_serial &&
+ rxrpc_seq_in_txq(tq, call->tlp_seq) &&
+ test_bit(call->tlp_seq - tq->qbase, &new_acks))
+ summary->tlp_probe_acked = true;
}
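To make the flip accounting above concrete, consider a toy batch of eight previously reported segments (the values are illustrative only):

	unsigned long old_map = 0xca;				/* previously reported: 11001010 */
	unsigned long new_map = 0xe8;				/* newly extracted:     11101000 */
	unsigned long flipped = (old_map ^ new_map) & 0xff;	/* 00100010 */
	unsigned long n_to_a  = ~old_map & flipped;		/* 0x20: one NAK became a SACK */
	unsigned long a_to_n  =  old_map & flipped;		/* 0x02: one SACK became a NAK */

acks_nr_sacks then moves by hweight(n_to_a) - hweight(a_to_n) and acks_nr_snacks by the opposite amount, exactly as in the hweight_long() arithmetic above.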
/*
@@ -770,39 +952,50 @@ static rxrpc_seq_t rxrpc_input_check_prev_ack(struct rxrpc_call *call,
*/
static void rxrpc_input_soft_acks(struct rxrpc_call *call,
struct rxrpc_ack_summary *summary,
- struct sk_buff *skb,
- rxrpc_seq_t seq,
- rxrpc_seq_t since)
+ struct sk_buff *skb)
{
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- unsigned int i, old_nacks = 0;
+ struct rxrpc_txqueue *tq = call->tx_queue;
+ unsigned long extracted = ~0UL;
+ unsigned int nr = 0;
+ rxrpc_seq_t seq = call->acks_hard_ack + 1;
rxrpc_seq_t lowest_nak = seq + sp->ack.nr_acks;
u8 *acks = skb->data + sizeof(struct rxrpc_wire_header) + sizeof(struct rxrpc_ackpacket);
- for (i = 0; i < sp->ack.nr_acks; i++) {
- if (acks[i] == RXRPC_ACK_TYPE_ACK) {
- summary->nr_acks++;
- if (after_eq(seq, since))
- summary->nr_new_acks++;
- } else {
- summary->saw_nacks = true;
- if (before(seq, since)) {
- /* Overlap with previous ACK */
- old_nacks++;
- } else {
- summary->nr_new_nacks++;
- sp->ack.nr_nacks++;
- }
+ _enter("%x,%x,%u", tq->qbase, seq, sp->ack.nr_acks);
+
+ while (after(seq, tq->qbase + RXRPC_NR_TXQUEUE - 1))
+ tq = tq->next;
- if (before(seq, lowest_nak))
- lowest_nak = seq;
+ for (unsigned int i = 0; i < sp->ack.nr_acks; i++) {
+ /* Decant ACKs until we hit a txqueue boundary. */
+ shiftr_adv_rotr(acks, extracted);
+ if (i == 256) {
+ acks -= i;
+ i = 0;
}
seq++;
+ nr++;
+ if ((seq & RXRPC_TXQ_MASK) != 0)
+ continue;
+
+ _debug("bound %16lx %u", extracted, nr);
+
+ rxrpc_input_soft_ack_tq(call, summary, tq, extracted, RXRPC_NR_TXQUEUE,
+ seq - RXRPC_NR_TXQUEUE, &lowest_nak);
+ extracted = ~0UL;
+ nr = 0;
+ tq = tq->next;
+ prefetch(tq);
}
- if (lowest_nak != call->acks_lowest_nak) {
- call->acks_lowest_nak = lowest_nak;
- summary->new_low_nack = true;
+ if (nr) {
+ unsigned int nr_reported = seq & RXRPC_TXQ_MASK;
+
+ extracted >>= RXRPC_NR_TXQUEUE - nr_reported;
+ _debug("tail %16lx %u", extracted, nr_reported);
+ rxrpc_input_soft_ack_tq(call, summary, tq, extracted, nr_reported,
+ seq & ~RXRPC_TXQ_MASK, &lowest_nak);
}
/* We *can* have more nacks than we did - the peer is permitted to drop
@@ -810,9 +1003,14 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call,
* possible for the nack distribution to change whilst the number of
* nacks stays the same or goes down.
*/
- if (old_nacks < summary->nr_retained_nacks)
- summary->nr_new_acks += summary->nr_retained_nacks - old_nacks;
- summary->nr_retained_nacks = old_nacks;
+ if (lowest_nak != call->acks_lowest_nak) {
+ call->acks_lowest_nak = lowest_nak;
+ summary->new_low_snack = true;
+ }
+
+ _debug("summary A=%d+%d N=%d+%d",
+ call->acks_nr_sacks, summary->nr_new_sacks,
+ call->acks_nr_snacks, summary->nr_new_snacks);
}
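Note how the loop above flushes a complete word's worth of SACKs each time seq crosses a txqueue boundary and, for the trailing partial batch, right-shifts the accumulator by RXRPC_NR_TXQUEUE - nr_reported so that the bit for segment (seq & ~RXRPC_TXQ_MASK) + i lands back at position i before being applied.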
/*
@@ -820,21 +1018,21 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call,
* with respect to the ack state conveyed by preceding ACKs.
*/
static bool rxrpc_is_ack_valid(struct rxrpc_call *call,
- rxrpc_seq_t first_pkt, rxrpc_seq_t prev_pkt)
+ rxrpc_seq_t hard_ack, rxrpc_seq_t prev_pkt)
{
- rxrpc_seq_t base = READ_ONCE(call->acks_first_seq);
+ rxrpc_seq_t base = READ_ONCE(call->acks_hard_ack);
- if (after(first_pkt, base))
+ if (after(hard_ack, base))
return true; /* The window advanced */
- if (before(first_pkt, base))
+ if (before(hard_ack, base))
return false; /* firstPacket regressed */
if (after_eq(prev_pkt, call->acks_prev_seq))
return true; /* previousPacket hasn't regressed. */
/* Some rx implementations put a serial number in previousPacket. */
- if (after_eq(prev_pkt, base + call->tx_winsize))
+ if (after(prev_pkt, base + call->tx_winsize))
return false;
return true;
}
@@ -852,53 +1050,34 @@ static bool rxrpc_is_ack_valid(struct rxrpc_call *call,
static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
{
struct rxrpc_ack_summary summary = { 0 };
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
struct rxrpc_acktrailer trailer;
- rxrpc_serial_t ack_serial, acked_serial;
- rxrpc_seq_t first_soft_ack, hard_ack, prev_pkt, since;
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ rxrpc_seq_t first_soft_ack, hard_ack, prev_pkt;
int nr_acks, offset, ioffset;
_enter("");
offset = sizeof(struct rxrpc_wire_header) + sizeof(struct rxrpc_ackpacket);
- ack_serial = sp->hdr.serial;
- acked_serial = sp->ack.acked_serial;
- first_soft_ack = sp->ack.first_ack;
- prev_pkt = sp->ack.prev_ack;
- nr_acks = sp->ack.nr_acks;
- hard_ack = first_soft_ack - 1;
- summary.ack_reason = (sp->ack.reason < RXRPC_ACK__INVALID ?
- sp->ack.reason : RXRPC_ACK__INVALID);
-
- trace_rxrpc_rx_ack(call, ack_serial, acked_serial,
- first_soft_ack, prev_pkt,
- summary.ack_reason, nr_acks);
- rxrpc_inc_stat(call->rxnet, stat_rx_acks[summary.ack_reason]);
+ summary.ack_serial = sp->hdr.serial;
+ first_soft_ack = sp->ack.first_ack;
+ prev_pkt = sp->ack.prev_ack;
+ nr_acks = sp->ack.nr_acks;
+ hard_ack = first_soft_ack - 1;
+ summary.acked_serial = sp->ack.acked_serial;
+ summary.ack_reason = (sp->ack.reason < RXRPC_ACK__INVALID ?
+ sp->ack.reason : RXRPC_ACK__INVALID);
- if (acked_serial != 0) {
- switch (summary.ack_reason) {
- case RXRPC_ACK_PING_RESPONSE:
- rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial,
- rxrpc_rtt_rx_ping_response);
- break;
- case RXRPC_ACK_REQUESTED:
- rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial,
- rxrpc_rtt_rx_requested_ack);
- break;
- default:
- rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial,
- rxrpc_rtt_rx_other_ack);
- break;
- }
- }
+ trace_rxrpc_rx_ack(call, sp);
+ rxrpc_inc_stat(call->rxnet, stat_rx_acks[summary.ack_reason]);
+ prefetch(call->tx_queue);
/* If we get an EXCEEDS_WINDOW ACK from the server, it probably
* indicates that the client address changed due to NAT. The server
* lost the call because it switched to a different peer.
*/
if (unlikely(summary.ack_reason == RXRPC_ACK_EXCEEDS_WINDOW) &&
- first_soft_ack == 1 &&
+ hard_ack == 0 &&
prev_pkt == 0 &&
rxrpc_is_client_call(call)) {
rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED,
@@ -911,9 +1090,9 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
* if we still have it buffered to the beginning.
*/
if (unlikely(summary.ack_reason == RXRPC_ACK_OUT_OF_SEQUENCE) &&
- first_soft_ack == 1 &&
+ hard_ack == 0 &&
prev_pkt == 0 &&
- call->acks_hard_ack == 0 &&
+ call->tx_bottom == 0 &&
rxrpc_is_client_call(call)) {
rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED,
0, -ENETRESET);
@@ -921,11 +1100,9 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
}
/* Discard any out-of-order or duplicate ACKs (outside lock). */
- if (!rxrpc_is_ack_valid(call, first_soft_ack, prev_pkt)) {
- trace_rxrpc_rx_discard_ack(call->debug_id, ack_serial,
- first_soft_ack, call->acks_first_seq,
- prev_pkt, call->acks_prev_seq);
- goto send_response;
+ if (!rxrpc_is_ack_valid(call, hard_ack, prev_pkt)) {
+ trace_rxrpc_rx_discard_ack(call, summary.ack_serial, hard_ack, prev_pkt);
+ goto send_response; /* Still respond if requested. */
}
trailer.maxMTU = 0;
@@ -937,34 +1114,30 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
if (nr_acks > 0)
skb_condense(skb);
- if (call->cong_last_nack) {
- since = rxrpc_input_check_prev_ack(call, &summary, first_soft_ack);
- rxrpc_free_skb(call->cong_last_nack, rxrpc_skb_put_last_nack);
- call->cong_last_nack = NULL;
- } else {
- summary.nr_new_acks = first_soft_ack - call->acks_first_seq;
- call->acks_lowest_nak = first_soft_ack + nr_acks;
- since = first_soft_ack;
- }
-
- call->acks_latest_ts = skb->tstamp;
- call->acks_first_seq = first_soft_ack;
+ call->acks_latest_ts = ktime_get_real();
+ call->acks_hard_ack = hard_ack;
call->acks_prev_seq = prev_pkt;
- switch (summary.ack_reason) {
- case RXRPC_ACK_PING:
- break;
- default:
- if (acked_serial && after(acked_serial, call->acks_highest_serial))
- call->acks_highest_serial = acked_serial;
- break;
+ if (summary.acked_serial) {
+ switch (summary.ack_reason) {
+ case RXRPC_ACK_PING_RESPONSE:
+ rxrpc_complete_rtt_probe(call, call->acks_latest_ts,
+ summary.acked_serial, summary.ack_serial,
+ rxrpc_rtt_rx_ping_response);
+ break;
+ default:
+ if (after(summary.acked_serial, call->acks_highest_serial))
+ call->acks_highest_serial = summary.acked_serial;
+ summary.rtt_sample_avail = true;
+ break;
+ }
}
/* Parse rwind and mtu sizes if provided. */
if (trailer.maxMTU)
rxrpc_input_ack_trailer(call, skb, &trailer);
- if (first_soft_ack == 0)
+ if (hard_ack + 1 == 0)
return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_zero);
/* Ignore ACKs unless we are or have just been transmitting. */
@@ -978,13 +1151,13 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
goto send_response;
}
- if (before(hard_ack, call->acks_hard_ack) ||
+ if (before(hard_ack, call->tx_bottom) ||
after(hard_ack, call->tx_top))
return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_outside_window);
if (nr_acks > call->tx_top - hard_ack)
return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_sack_overflow);
- if (after(hard_ack, call->acks_hard_ack)) {
+ if (after(hard_ack, call->tx_bottom)) {
if (rxrpc_rotate_tx_window(call, hard_ack, &summary)) {
rxrpc_end_tx_phase(call, false, rxrpc_eproto_unexpected_ack);
goto send_response;
@@ -994,25 +1167,30 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
if (nr_acks > 0) {
if (offset > (int)skb->len - nr_acks)
return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_short_sack);
- rxrpc_input_soft_acks(call, &summary, skb, first_soft_ack, since);
- rxrpc_get_skb(skb, rxrpc_skb_get_last_nack);
- call->cong_last_nack = skb;
+ rxrpc_input_soft_acks(call, &summary, skb);
}
if (test_bit(RXRPC_CALL_TX_LAST, &call->flags) &&
- summary.nr_acks == call->tx_top - hard_ack &&
+ call->acks_nr_sacks == call->tx_top - hard_ack &&
rxrpc_is_client_call(call))
- rxrpc_propose_ping(call, ack_serial,
+ rxrpc_propose_ping(call, summary.ack_serial,
rxrpc_propose_ack_ping_for_lost_reply);
- rxrpc_congestion_management(call, skb, &summary, acked_serial);
+ /* Drive the congestion management algorithm first and then RACK-TLP as
+ * the latter depends on the state/change in state in the former.
+ */
+ rxrpc_congestion_management(call, &summary);
+ rxrpc_rack_detect_loss_and_arm_timer(call, &summary);
+ rxrpc_tlp_process_ack(call, &summary);
+ if (call->tlp_serial && after_eq(summary.acked_serial, call->tlp_serial))
+ call->tlp_serial = 0;
send_response:
if (summary.ack_reason == RXRPC_ACK_PING)
- rxrpc_send_ACK(call, RXRPC_ACK_PING_RESPONSE, ack_serial,
+ rxrpc_send_ACK(call, RXRPC_ACK_PING_RESPONSE, summary.ack_serial,
rxrpc_propose_ack_respond_to_ping);
else if (sp->hdr.flags & RXRPC_REQUEST_ACK)
- rxrpc_send_ACK(call, RXRPC_ACK_REQUESTED, ack_serial,
+ rxrpc_send_ACK(call, RXRPC_ACK_REQUESTED, summary.ack_serial,
rxrpc_propose_ack_respond_to_ack);
}
@@ -1111,5 +1289,5 @@ void rxrpc_implicit_end_call(struct rxrpc_call *call, struct sk_buff *skb)
break;
}
- rxrpc_input_call_event(call, skb);
+ rxrpc_input_call_event(call);
}
diff --git a/net/rxrpc/input_rack.c b/net/rxrpc/input_rack.c
new file mode 100644
index 000000000000..13c371261e0a
--- /dev/null
+++ b/net/rxrpc/input_rack.c
@@ -0,0 +1,418 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* RACK-TLP [RFC8958] Implementation
+ *
+ * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "ar-internal.h"
+
+static bool rxrpc_rack_sent_after(ktime_t t1, rxrpc_seq_t seq1,
+ ktime_t t2, rxrpc_seq_t seq2)
+{
+ if (ktime_after(t1, t2))
+ return true;
+ return t1 == t2 && after(seq1, seq2);
+}
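This is the RFC 8958 "sent after" relation: transmission times are compared first and the sequence number only breaks ties, which keeps the ordering well defined when several segments leave with the same timestamp (for instance as subpackets of one jumbo packet).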
+
+/*
+ * Mark a packet lost.
+ */
+static void rxrpc_rack_mark_lost(struct rxrpc_call *call,
+ struct rxrpc_txqueue *tq, unsigned int ix)
+{
+ if (__test_and_set_bit(ix, &tq->segment_lost)) {
+ if (__test_and_clear_bit(ix, &tq->segment_retransmitted))
+ call->tx_nr_resent--;
+ } else {
+ call->tx_nr_lost++;
+ }
+ tq->segment_xmit_ts[ix] = UINT_MAX;
+}
+
+/*
+ * Get the transmission time of a packet in the Tx queue.
+ */
+static ktime_t rxrpc_get_xmit_ts(const struct rxrpc_txqueue *tq, unsigned int ix)
+{
+ if (tq->segment_xmit_ts[ix] == UINT_MAX)
+ return KTIME_MAX;
+ return ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[ix]);
+}
+
+/*
+ * Get a bitmask of nack bits for a queue segment and mask off any that aren't
+ * yet reported.
+ */
+static unsigned long rxrpc_tq_nacks(const struct rxrpc_txqueue *tq)
+{
+ unsigned long nacks = ~tq->segment_acked;
+
+ if (tq->nr_reported_acks < RXRPC_NR_TXQUEUE)
+ nacks &= (1UL << tq->nr_reported_acks) - 1;
+ return nacks;
+}
+
+/*
+ * Update the RACK state for the most recently sent packet that has been
+ * delivered [RFC8958 6.2 Step 2].
+ */
+static void rxrpc_rack_update(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary,
+ struct rxrpc_txqueue *tq,
+ unsigned int ix)
+{
+ rxrpc_seq_t seq = tq->qbase + ix;
+ ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix);
+ ktime_t rtt = ktime_sub(call->acks_latest_ts, xmit_ts);
+
+ if (__test_and_clear_bit(ix, &tq->segment_lost))
+ call->tx_nr_lost--;
+
+ if (test_bit(ix, &tq->segment_retransmitted)) {
+ /* Use Rx.serial instead of TCP.ACK.ts_option.echo_reply. */
+ if (before(call->acks_highest_serial, tq->segment_serial[ix]))
+ return;
+ if (rtt < minmax_get(&call->min_rtt))
+ return;
+ }
+
+ /* The RACK algorithm requires the segment ACKs to be traversed in
+ * order of segment transmission - but the only thing this seems to
+ * matter for is that RACK.rtt is set to the rtt of the most recently
+ * transmitted segment. We should be able to achieve the same by only
+ * setting RACK.rtt if the xmit time is greater.
+ */
+ if (ktime_after(xmit_ts, call->rack_rtt_ts)) {
+ call->rack_rtt = rtt;
+ call->rack_rtt_ts = xmit_ts;
+ }
+
+ if (rxrpc_rack_sent_after(xmit_ts, seq, call->rack_xmit_ts, call->rack_end_seq)) {
+ call->rack_rtt = rtt;
+ call->rack_xmit_ts = xmit_ts;
+ call->rack_end_seq = seq;
+ }
+}
+
+/*
+ * Detect data segment reordering [RFC8958 6.2 Step 3].
+ */
+static void rxrpc_rack_detect_reordering(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary,
+ struct rxrpc_txqueue *tq,
+ unsigned int ix)
+{
+ rxrpc_seq_t seq = tq->qbase + ix;
+
+ /* Track the highest sequence number so far ACK'd. This is not
+ * necessarily the same as ack.firstPacket + ack.nAcks - 1 as the peer
+ * could put a NACK in the last SACK slot.
+ */
+ if (after(seq, call->rack_fack))
+ call->rack_fack = seq;
+ else if (before(seq, call->rack_fack) &&
+ test_bit(ix, &tq->segment_retransmitted))
+ call->rack_reordering_seen = true;
+}
+
+void rxrpc_input_rack_one(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary,
+ struct rxrpc_txqueue *tq,
+ unsigned int ix)
+{
+ rxrpc_rack_update(call, summary, tq, ix);
+ rxrpc_rack_detect_reordering(call, summary, tq, ix);
+}
+
+void rxrpc_input_rack(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary,
+ struct rxrpc_txqueue *tq,
+ unsigned long new_acks)
+{
+ while (new_acks) {
+ unsigned int ix = __ffs(new_acks);
+
+ __clear_bit(ix, &new_acks);
+ rxrpc_input_rack_one(call, summary, tq, ix);
+ }
+
+ trace_rxrpc_rack_update(call, summary);
+}
+
+/*
+ * Update the reordering window [RFC8958 6.2 Step 4]. Returns the updated
+ * duration of the reordering window.
+ *
+ * Note that the Rx protocol doesn't have a 'DSACK option' per se, but ACKs can
+ * be given a 'DUPLICATE' reason with the serial number referring to the
+ * duplicated DATA packet. Rx does not inform as to whether this was a
+ * reception of the same packet twice or of a retransmission of a packet we
+ * already received (though this could be determined by the transmitter based
+ * on the serial number).
+ */
+static ktime_t rxrpc_rack_update_reo_wnd(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary)
+{
+ rxrpc_seq_t snd_una = call->acks_lowest_nak; /* Lowest unack'd seq */
+ rxrpc_seq_t snd_nxt = call->tx_transmitted + 1; /* Next seq to be sent */
+ bool have_dsack_option = summary->ack_reason == RXRPC_ACK_DUPLICATE;
+ int dup_thresh = 3;
+
+ /* DSACK-based reordering window adaptation */
+ if (!call->rack_dsack_round_none &&
+ after_eq(snd_una, call->rack_dsack_round))
+ call->rack_dsack_round_none = true;
+
+ /* Grow the reordering window per round that sees DSACK. Reset the
+ * window after 16 DSACK-free recoveries.
+ */
+ if (call->rack_dsack_round_none && have_dsack_option) {
+ call->rack_dsack_round_none = false;
+ call->rack_dsack_round = snd_nxt;
+ call->rack_reo_wnd_mult++;
+ call->rack_reo_wnd_persist = 16;
+ } else if (summary->exiting_fast_or_rto_recovery) {
+ call->rack_reo_wnd_persist--;
+ if (call->rack_reo_wnd_persist <= 0)
+ call->rack_reo_wnd_mult = 1;
+ }
+
+ if (!call->rack_reordering_seen) {
+ if (summary->in_fast_or_rto_recovery)
+ return 0;
+ if (call->acks_nr_sacks >= dup_thresh)
+ return 0;
+ }
+
+ return us_to_ktime(umin(call->rack_reo_wnd_mult * minmax_get(&call->min_rtt) / 4,
+ call->srtt_us >> 3));
+}
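The value computed here is RFC 8958 step 4's reordering window, reo_wnd = min(reo_wnd_mult * min_RTT / 4, SRTT), with min_RTT taken from the windowed minimum tracker and SRTT recovered from the 8x-scaled srtt_us.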
+
+/*
+ * Detect losses [RFC8958 6.2 Step 5].
+ */
+static ktime_t rxrpc_rack_detect_loss(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary)
+{
+ struct rxrpc_txqueue *tq;
+ ktime_t timeout = 0, lost_after, now = ktime_get_real();
+
+ call->rack_reo_wnd = rxrpc_rack_update_reo_wnd(call, summary);
+ lost_after = ktime_add(call->rack_rtt, call->rack_reo_wnd);
+ trace_rxrpc_rack_scan_loss(call);
+
+ for (tq = call->tx_queue; tq; tq = tq->next) {
+ unsigned long nacks = rxrpc_tq_nacks(tq);
+
+ if (after(tq->qbase, call->tx_transmitted))
+ break;
+ trace_rxrpc_rack_scan_loss_tq(call, tq, nacks);
+
+ /* Skip ones marked lost but not yet retransmitted */
+ nacks &= ~tq->segment_lost | tq->segment_retransmitted;
+
+ while (nacks) {
+ unsigned int ix = __ffs(nacks);
+ rxrpc_seq_t seq = tq->qbase + ix;
+ ktime_t remaining;
+ ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix);
+
+ __clear_bit(ix, &nacks);
+
+ if (rxrpc_rack_sent_after(call->rack_xmit_ts, call->rack_end_seq,
+ xmit_ts, seq)) {
+ remaining = ktime_sub(ktime_add(xmit_ts, lost_after), now);
+ if (remaining <= 0) {
+ rxrpc_rack_mark_lost(call, tq, ix);
+ trace_rxrpc_rack_detect_loss(call, summary, seq);
+ } else {
+ timeout = max(remaining, timeout);
+ }
+ }
+ }
+ }
+
+ return timeout;
+}
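A NACK'd segment is therefore declared lost once now - xmit_ts exceeds RACK.rtt + reo_wnd, provided it was sent before the most recently delivered segment; anything not yet that old instead contributes its largest remaining time to the reorder timer armed by the caller below.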
+
+/*
+ * Detect losses and set a timer to retry the detection [RFC8958 6.2 Step 5].
+ */
+void rxrpc_rack_detect_loss_and_arm_timer(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary)
+{
+ ktime_t timeout = rxrpc_rack_detect_loss(call, summary);
+
+ if (timeout) {
+ call->rack_timer_mode = RXRPC_CALL_RACKTIMER_RACK_REORDER;
+ call->rack_timo_at = ktime_add(ktime_get_real(), timeout);
+ trace_rxrpc_rack_timer(call, timeout, false);
+ trace_rxrpc_timer_set(call, timeout, rxrpc_timer_trace_rack_reo);
+ }
+}
+
+/*
+ * Handle RACK-TLP RTO expiration [RFC8958 6.3].
+ */
+static void rxrpc_rack_mark_losses_on_rto(struct rxrpc_call *call)
+{
+ struct rxrpc_txqueue *tq;
+ rxrpc_seq_t snd_una = call->acks_lowest_nak; /* Lowest unack'd seq */
+ ktime_t lost_after = ktime_add(call->rack_rtt, call->rack_reo_wnd);
+ ktime_t deadline = ktime_sub(ktime_get_real(), lost_after);
+
+ for (tq = call->tx_queue; tq; tq = tq->next) {
+ unsigned long unacked = ~tq->segment_acked;
+
+ trace_rxrpc_rack_mark_loss_tq(call, tq);
+ while (unacked) {
+ unsigned int ix = __ffs(unacked);
+ rxrpc_seq_t seq = tq->qbase + ix;
+ ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix);
+
+ if (after(seq, call->tx_transmitted))
+ return;
+ __clear_bit(ix, &unacked);
+
+ if (seq == snd_una ||
+ ktime_before(xmit_ts, deadline))
+ rxrpc_rack_mark_lost(call, tq, ix);
+ }
+ }
+}
+
+/*
+ * Calculate the TLP loss probe timeout (PTO) [RFC8958 7.2].
+ */
+ktime_t rxrpc_tlp_calc_pto(struct rxrpc_call *call, ktime_t now)
+{
+ unsigned int flight_size = rxrpc_tx_in_flight(call);
+ ktime_t rto_at = ktime_add(call->tx_last_sent,
+ rxrpc_get_rto_backoff(call, false));
+ ktime_t pto;
+
+ if (call->rtt_count > 0) {
+ /* Use 2*SRTT as the timeout. */
+ pto = ns_to_ktime(call->srtt_us * NSEC_PER_USEC / 4);
+ if (flight_size)
+ pto = ktime_add(pto, call->tlp_max_ack_delay);
+ } else {
+ pto = NSEC_PER_SEC;
+ }
+
+ if (ktime_after(ktime_add(now, pto), rto_at))
+ pto = ktime_sub(rto_at, now);
+ return pto;
+}
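The timeout above follows RFC 8958 section 7.2: twice the smoothed RTT, plus the delayed-ACK allowance when data is in flight, clamped so the probe can never fire later than the RTO would. Because srtt_us is stored scaled by 8, dividing by 4 yields 2 * SRTT; a worked example with illustrative figures:

	u32     srtt_us = 8 * 5000;					/* a 5 ms SRTT, stored scaled by 8 */
	ktime_t pto     = ns_to_ktime(srtt_us * NSEC_PER_USEC / 4);	/* 10,000,000 ns == 2 * SRTT */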
+
+/*
+ * Send a TLP loss probe on PTO expiration [RFC8958 7.3].
+ */
+void rxrpc_tlp_send_probe(struct rxrpc_call *call)
+{
+ unsigned int in_flight = rxrpc_tx_in_flight(call);
+
+ if (after_eq(call->acks_hard_ack, call->tx_transmitted))
+ return; /* Everything we transmitted has been acked. */
+
+ /* There must be no other loss probe still in flight and we need to
+ * have taken a new RTT sample since last probe or the start of
+ * connection.
+ */
+ if (!call->tlp_serial &&
+ call->tlp_rtt_taken != call->rtt_taken) {
+ call->tlp_is_retrans = false;
+ if (after(call->send_top, call->tx_transmitted) &&
+ rxrpc_tx_window_space(call) > 0) {
+ /* Transmit the lowest-sequence unsent DATA */
+ call->tx_last_serial = 0;
+ rxrpc_transmit_some_data(call, 1, rxrpc_txdata_tlp_new_data);
+ call->tlp_serial = call->tx_last_serial;
+ call->tlp_seq = call->tx_transmitted;
+ trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_transmit_new);
+ in_flight = rxrpc_tx_in_flight(call);
+ } else {
+ /* Retransmit the highest-sequence DATA sent */
+ call->tx_last_serial = 0;
+ rxrpc_resend_tlp(call);
+ call->tlp_is_retrans = true;
+ trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_retransmit);
+ }
+ } else {
+ trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_busy);
+ }
+
+ if (in_flight != 0) {
+ ktime_t rto = rxrpc_get_rto_backoff(call, false);
+
+ call->rack_timer_mode = RXRPC_CALL_RACKTIMER_RTO;
+ call->rack_timo_at = ktime_add(ktime_get_real(), rto);
+ trace_rxrpc_rack_timer(call, rto, false);
+ trace_rxrpc_timer_set(call, rto, rxrpc_timer_trace_rack_rto);
+ }
+}
+
+/*
+ * Detect losses using the ACK of a TLP loss probe [RFC8958 7.4].
+ */
+void rxrpc_tlp_process_ack(struct rxrpc_call *call, struct rxrpc_ack_summary *summary)
+{
+ if (!call->tlp_serial || after(call->tlp_seq, call->acks_hard_ack))
+ return;
+
+ if (!call->tlp_is_retrans) {
+ /* TLP of new data delivered */
+ trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_new_data);
+ call->tlp_serial = 0;
+ } else if (summary->ack_reason == RXRPC_ACK_DUPLICATE &&
+ summary->acked_serial == call->tlp_serial) {
+ /* General Case: Detected packet losses using RACK [7.4.1] */
+ trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_dup_acked);
+ call->tlp_serial = 0;
+ } else if (after(call->acks_hard_ack, call->tlp_seq)) {
+ /* Repaired the single loss */
+ trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_hard_beyond);
+ call->tlp_serial = 0;
+ // TODO: Invoke congestion control to react to the loss
+ // event the probe has repaired
+ } else if (summary->tlp_probe_acked) {
+ trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_acked);
+ /* Special Case: Detected a single loss repaired by the loss
+ * probe [7.4.2]
+ */
+ call->tlp_serial = 0;
+ } else {
+ trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_incomplete);
+ }
+}
+
+/*
+ * Handle RACK-TLP timer expiration, dispatching on the mode the timer was armed in.
+ */
+void rxrpc_rack_timer_expired(struct rxrpc_call *call, ktime_t overran_by)
+{
+ struct rxrpc_ack_summary summary = {};
+ enum rxrpc_rack_timer_mode mode = call->rack_timer_mode;
+
+ trace_rxrpc_rack_timer(call, overran_by, true);
+ call->rack_timer_mode = RXRPC_CALL_RACKTIMER_OFF;
+
+ switch (mode) {
+ case RXRPC_CALL_RACKTIMER_RACK_REORDER:
+ rxrpc_rack_detect_loss_and_arm_timer(call, &summary);
+ break;
+ case RXRPC_CALL_RACKTIMER_TLP_PTO:
+ rxrpc_tlp_send_probe(call);
+ break;
+ case RXRPC_CALL_RACKTIMER_RTO:
+ // Might need to poke the congestion algo in some way
+ rxrpc_rack_mark_losses_on_rto(call);
+ break;
+ //case RXRPC_CALL_RACKTIMER_ZEROWIN:
+ default:
+ pr_warn("Unexpected rack timer %u", call->rack_timer_mode);
+ }
+}
diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c
index 6716c021a532..e068f9b79d02 100644
--- a/net/rxrpc/insecure.c
+++ b/net/rxrpc/insecure.c
@@ -19,11 +19,14 @@ static int none_init_connection_security(struct rxrpc_connection *conn,
*/
static struct rxrpc_txbuf *none_alloc_txbuf(struct rxrpc_call *call, size_t remain, gfp_t gfp)
{
- return rxrpc_alloc_data_txbuf(call, min_t(size_t, remain, RXRPC_JUMBO_DATALEN), 1, gfp);
+ return rxrpc_alloc_data_txbuf(call, umin(remain, RXRPC_JUMBO_DATALEN), 1, gfp);
}
static int none_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
{
+ txb->pkt_len = txb->len;
+ if (txb->len == RXRPC_JUMBO_DATALEN)
+ txb->jumboable = true;
return 0;
}
diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c
index 07c74c77d802..2925c7fc82cf 100644
--- a/net/rxrpc/io_thread.c
+++ b/net/rxrpc/io_thread.c
@@ -338,7 +338,6 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn,
struct rxrpc_channel *chan;
struct rxrpc_call *call = NULL;
unsigned int channel;
- bool ret;
if (sp->hdr.securityIndex != conn->security_ix)
return rxrpc_direct_abort(skb, rxrpc_eproto_wrong_security,
@@ -364,6 +363,12 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn,
if (sp->hdr.callNumber == 0)
return rxrpc_input_conn_packet(conn, skb);
+ /* Deal with path MTU discovery probing. */
+ if (sp->hdr.type == RXRPC_PACKET_TYPE_ACK &&
+ conn->pmtud_probe &&
+ after_eq(sp->ack.acked_serial, conn->pmtud_probe))
+ rxrpc_input_probe_for_pmtud(conn, sp->ack.acked_serial, false);
+
/* Call-bound packets are routed by connection channel. */
channel = sp->hdr.cid & RXRPC_CHANNELMASK;
chan = &conn->channels[channel];
@@ -419,9 +424,9 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn,
peer_srx, skb);
}
- ret = rxrpc_input_call_event(call, skb);
+ rxrpc_queue_rx_call_packet(call, skb);
rxrpc_put_call(call, rxrpc_call_put_input);
- return ret;
+ return true;
}
/*
@@ -438,6 +443,8 @@ int rxrpc_io_thread(void *data)
ktime_t now;
#endif
bool should_stop;
+ LIST_HEAD(conn_attend_q);
+ LIST_HEAD(call_attend_q);
complete(&local->io_thread_ready);
@@ -448,43 +455,26 @@ int rxrpc_io_thread(void *data)
for (;;) {
rxrpc_inc_stat(local->rxnet, stat_io_loop);
- /* Deal with connections that want immediate attention. */
- conn = list_first_entry_or_null(&local->conn_attend_q,
- struct rxrpc_connection,
- attend_link);
- if (conn) {
- spin_lock_bh(&local->lock);
- list_del_init(&conn->attend_link);
- spin_unlock_bh(&local->lock);
-
- rxrpc_input_conn_event(conn, NULL);
- rxrpc_put_connection(conn, rxrpc_conn_put_poke);
- continue;
+ /* Inject a delay into packets if requested. */
+#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY
+ now = ktime_get_real();
+ while ((skb = skb_peek(&local->rx_delay_queue))) {
+ if (ktime_before(now, skb->tstamp))
+ break;
+ skb = skb_dequeue(&local->rx_delay_queue);
+ skb_queue_tail(&local->rx_queue, skb);
}
+#endif
- if (test_and_clear_bit(RXRPC_CLIENT_CONN_REAP_TIMER,
- &local->client_conn_flags))
- rxrpc_discard_expired_client_conns(local);
-
- /* Deal with calls that want immediate attention. */
- if ((call = list_first_entry_or_null(&local->call_attend_q,
- struct rxrpc_call,
- attend_link))) {
- spin_lock_bh(&local->lock);
- list_del_init(&call->attend_link);
- spin_unlock_bh(&local->lock);
-
- trace_rxrpc_call_poked(call);
- rxrpc_input_call_event(call, NULL);
- rxrpc_put_call(call, rxrpc_call_put_poke);
- continue;
+ if (!skb_queue_empty(&local->rx_queue)) {
+ spin_lock_irq(&local->rx_queue.lock);
+ skb_queue_splice_tail_init(&local->rx_queue, &rx_queue);
+ spin_unlock_irq(&local->rx_queue.lock);
+ trace_rxrpc_iothread_rx(local, skb_queue_len(&rx_queue));
}
- if (!list_empty(&local->new_client_calls))
- rxrpc_connect_client_calls(local);
-
- /* Process received packets and errors. */
- if ((skb = __skb_dequeue(&rx_queue))) {
+ /* Distribute packets and errors. */
+ while ((skb = __skb_dequeue(&rx_queue))) {
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
switch (skb->mark) {
case RXRPC_SKB_MARK_PACKET:
@@ -508,27 +498,46 @@ int rxrpc_io_thread(void *data)
rxrpc_free_skb(skb, rxrpc_skb_put_unknown);
break;
}
- continue;
}
- /* Inject a delay into packets if requested. */
-#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY
- now = ktime_get_real();
- while ((skb = skb_peek(&local->rx_delay_queue))) {
- if (ktime_before(now, skb->tstamp))
- break;
- skb = skb_dequeue(&local->rx_delay_queue);
- skb_queue_tail(&local->rx_queue, skb);
+ /* Deal with connections that want immediate attention. */
+ spin_lock_irq(&local->lock);
+ list_splice_tail_init(&local->conn_attend_q, &conn_attend_q);
+ spin_unlock_irq(&local->lock);
+
+ while ((conn = list_first_entry_or_null(&conn_attend_q,
+ struct rxrpc_connection,
+ attend_link))) {
+ spin_lock_bh(&local->lock);
+ list_del_init(&conn->attend_link);
+ spin_unlock_bh(&local->lock);
+ rxrpc_input_conn_event(conn, NULL);
+ rxrpc_put_connection(conn, rxrpc_conn_put_poke);
}
-#endif
- if (!skb_queue_empty(&local->rx_queue)) {
- spin_lock_irq(&local->rx_queue.lock);
- skb_queue_splice_tail_init(&local->rx_queue, &rx_queue);
- spin_unlock_irq(&local->rx_queue.lock);
- continue;
+ if (test_and_clear_bit(RXRPC_CLIENT_CONN_REAP_TIMER,
+ &local->client_conn_flags))
+ rxrpc_discard_expired_client_conns(local);
+
+ /* Deal with calls that want immediate attention. */
+ spin_lock_irq(&local->lock);
+ list_splice_tail_init(&local->call_attend_q, &call_attend_q);
+ spin_unlock_irq(&local->lock);
+
+ while ((call = list_first_entry_or_null(&call_attend_q,
+ struct rxrpc_call,
+ attend_link))) {
+ spin_lock_bh(&local->lock);
+ list_del_init(&call->attend_link);
+ spin_unlock_bh(&local->lock);
+ trace_rxrpc_call_poked(call);
+ rxrpc_input_call_event(call);
+ rxrpc_put_call(call, rxrpc_call_put_poke);
}
+ if (!list_empty(&local->new_client_calls))
+ rxrpc_connect_client_calls(local);
+
set_current_state(TASK_INTERRUPTIBLE);
should_stop = kthread_should_stop();
if (!skb_queue_empty(&local->rx_queue) ||
@@ -558,7 +567,7 @@ int rxrpc_io_thread(void *data)
}
timeout = nsecs_to_jiffies(delay_ns);
- timeout = max(timeout, 1UL);
+ timeout = umax(timeout, 1);
schedule_timeout(timeout);
__set_current_state(TASK_RUNNING);
continue;
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 2792d2304605..a74a4b43904f 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -215,9 +215,6 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net)
/* we want to set the don't fragment bit */
rxrpc_local_dont_fragment(local, true);
-
- /* We want receive timestamps. */
- sock_enable_timestamps(usk);
break;
default:
diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c
index 657cf35089a6..8fcc8139d771 100644
--- a/net/rxrpc/misc.c
+++ b/net/rxrpc/misc.c
@@ -46,13 +46,13 @@ unsigned int rxrpc_rx_window_size = 255;
* Maximum Rx MTU size. This indicates to the sender the size of jumbo packet
* made by gluing normal packets together that we're willing to handle.
*/
-unsigned int rxrpc_rx_mtu = 5692;
+unsigned int rxrpc_rx_mtu = RXRPC_JUMBO(46);
/*
* The maximum number of fragments in a received jumbo packet that we tell the
* sender that we're willing to handle.
*/
-unsigned int rxrpc_rx_jumbo_max = 4;
+unsigned int rxrpc_rx_jumbo_max = 46;
#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY
/*
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index 5ea9601efd05..6f7a125d6e90 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -72,22 +72,96 @@ static void rxrpc_set_keepalive(struct rxrpc_call *call, ktime_t now)
}
/*
+ * Allocate transmission buffers for an ACK and attach them to local->kv[].
+ */
+static int rxrpc_alloc_ack(struct rxrpc_call *call, size_t sack_size)
+{
+ struct rxrpc_wire_header *whdr;
+ struct rxrpc_acktrailer *trailer;
+ struct rxrpc_ackpacket *ack;
+ struct kvec *kv = call->local->kvec;
+ gfp_t gfp = rcu_read_lock_held() ? GFP_ATOMIC | __GFP_NOWARN : GFP_NOFS;
+ void *buf, *buf2 = NULL;
+ u8 *filler;
+
+ buf = page_frag_alloc(&call->local->tx_alloc,
+ sizeof(*whdr) + sizeof(*ack) + 1 + 3 + sizeof(*trailer), gfp);
+ if (!buf)
+ return -ENOMEM;
+
+ if (sack_size) {
+ buf2 = page_frag_alloc(&call->local->tx_alloc, sack_size, gfp);
+ if (!buf2) {
+ page_frag_free(buf);
+ return -ENOMEM;
+ }
+ }
+
+ whdr = buf;
+ ack = buf + sizeof(*whdr);
+ filler = buf + sizeof(*whdr) + sizeof(*ack) + 1;
+ trailer = buf + sizeof(*whdr) + sizeof(*ack) + 1 + 3;
+
+ kv[0].iov_base = whdr;
+ kv[0].iov_len = sizeof(*whdr) + sizeof(*ack);
+ kv[1].iov_base = buf2;
+ kv[1].iov_len = sack_size;
+ kv[2].iov_base = filler;
+ kv[2].iov_len = 3 + sizeof(*trailer);
+ return 3; /* Number of kvec[] used. */
+}
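For orientation, a sketch of the kvec layout the allocation above produces (the buffer split mirrors the on-wire ACK format):

	kv[0]: wire header + ACK header               (first page frag)
	kv[1]: SACK table, up to sack_size bytes      (second frag, may be absent)
	kv[2]: 3 padding bytes + ACK trailer          (tail of the first frag)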
+
+static void rxrpc_free_ack(struct rxrpc_call *call)
+{
+ page_frag_free(call->local->kvec[0].iov_base);
+ if (call->local->kvec[1].iov_base)
+ page_frag_free(call->local->kvec[1].iov_base);
+}
+
+/*
+ * Record the beginning of an RTT probe.
+ */
+static void rxrpc_begin_rtt_probe(struct rxrpc_call *call, rxrpc_serial_t serial,
+ ktime_t now, enum rxrpc_rtt_tx_trace why)
+{
+ unsigned long avail = call->rtt_avail;
+ int rtt_slot = 9;
+
+ if (!(avail & RXRPC_CALL_RTT_AVAIL_MASK))
+ goto no_slot;
+
+ rtt_slot = __ffs(avail & RXRPC_CALL_RTT_AVAIL_MASK);
+ if (!test_and_clear_bit(rtt_slot, &call->rtt_avail))
+ goto no_slot;
+
+ call->rtt_serial[rtt_slot] = serial;
+ call->rtt_sent_at[rtt_slot] = now;
+ smp_wmb(); /* Write data before avail bit */
+ set_bit(rtt_slot + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail);
+
+ trace_rxrpc_rtt_tx(call, why, rtt_slot, serial);
+ return;
+
+no_slot:
+ trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_no_slot, rtt_slot, serial);
+}
+
+/*
* Fill out an ACK packet.
*/
-static void rxrpc_fill_out_ack(struct rxrpc_call *call,
- struct rxrpc_txbuf *txb,
- u8 ack_reason,
- rxrpc_serial_t serial)
+static int rxrpc_fill_out_ack(struct rxrpc_call *call, int nr_kv, u8 ack_reason,
+ rxrpc_serial_t serial_to_ack, rxrpc_serial_t *_ack_serial)
{
- struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base;
- struct rxrpc_acktrailer *trailer = txb->kvec[2].iov_base + 3;
+ struct kvec *kv = call->local->kvec;
+ struct rxrpc_wire_header *whdr = kv[0].iov_base;
+ struct rxrpc_acktrailer *trailer = kv[2].iov_base + 3;
struct rxrpc_ackpacket *ack = (struct rxrpc_ackpacket *)(whdr + 1);
- unsigned int qsize, sack, wrap, to;
+ unsigned int qsize, sack, wrap, to, max_mtu, if_mtu;
rxrpc_seq_t window, wtop;
+ ktime_t now = ktime_get_real();
int rsize;
- u32 mtu, jmax;
- u8 *filler = txb->kvec[2].iov_base;
- u8 *sackp = txb->kvec[1].iov_base;
+ u8 *filler = kv[2].iov_base;
+ u8 *sackp = kv[1].iov_base;
rxrpc_inc_stat(call->rxnet, stat_tx_ack_fill);
@@ -95,14 +169,25 @@ static void rxrpc_fill_out_ack(struct rxrpc_call *call,
wtop = call->ackr_wtop;
sack = call->ackr_sack_base % RXRPC_SACK_SIZE;
+ *_ack_serial = rxrpc_get_next_serial(call->conn);
+
+ whdr->epoch = htonl(call->conn->proto.epoch);
+ whdr->cid = htonl(call->cid);
+ whdr->callNumber = htonl(call->call_id);
+ whdr->serial = htonl(*_ack_serial);
whdr->seq = 0;
whdr->type = RXRPC_PACKET_TYPE_ACK;
- txb->flags |= RXRPC_SLOW_START_OK;
+ whdr->flags = call->conn->out_clientflag | RXRPC_SLOW_START_OK;
+ whdr->userStatus = 0;
+ whdr->securityIndex = call->security_ix;
+ whdr->_rsvd = 0;
+ whdr->serviceId = htons(call->dest_srx.srx_service);
+
ack->bufferSpace = 0;
ack->maxSkew = 0;
ack->firstPacket = htonl(window);
ack->previousPacket = htonl(call->rx_highest_seq);
- ack->serial = htonl(serial);
+ ack->serial = htonl(serial_to_ack);
ack->reason = ack_reason;
ack->nAcks = wtop - window;
filler[0] = 0;
@@ -110,15 +195,13 @@ static void rxrpc_fill_out_ack(struct rxrpc_call *call,
filler[2] = 0;
if (ack_reason == RXRPC_ACK_PING)
- txb->flags |= RXRPC_REQUEST_ACK;
+ whdr->flags |= RXRPC_REQUEST_ACK;
if (after(wtop, window)) {
- txb->len += ack->nAcks;
- txb->kvec[1].iov_base = sackp;
- txb->kvec[1].iov_len = ack->nAcks;
+ kv[1].iov_len = ack->nAcks;
wrap = RXRPC_SACK_SIZE - sack;
- to = min_t(unsigned int, ack->nAcks, RXRPC_SACK_SIZE);
+ to = umin(ack->nAcks, RXRPC_SACK_SIZE);
if (sack + ack->nAcks <= RXRPC_SACK_SIZE) {
memcpy(sackp, call->ackr_sack_table + sack, ack->nAcks);
@@ -132,56 +215,42 @@ static void rxrpc_fill_out_ack(struct rxrpc_call *call,
ack->reason = RXRPC_ACK_IDLE;
}
- mtu = call->peer->if_mtu;
- mtu -= call->peer->hdrsize;
- jmax = rxrpc_rx_jumbo_max;
qsize = (window - 1) - call->rx_consumed;
rsize = max_t(int, call->rx_winsize - qsize, 0);
- txb->ack_rwind = rsize;
- trailer->maxMTU = htonl(rxrpc_rx_mtu);
- trailer->ifMTU = htonl(mtu);
- trailer->rwind = htonl(rsize);
- trailer->jumbo_max = htonl(jmax);
-}
-
-/*
- * Record the beginning of an RTT probe.
- */
-static void rxrpc_begin_rtt_probe(struct rxrpc_call *call, rxrpc_serial_t serial,
- ktime_t now, enum rxrpc_rtt_tx_trace why)
-{
- unsigned long avail = call->rtt_avail;
- int rtt_slot = 9;
-
- if (!(avail & RXRPC_CALL_RTT_AVAIL_MASK))
- goto no_slot;
-
- rtt_slot = __ffs(avail & RXRPC_CALL_RTT_AVAIL_MASK);
- if (!test_and_clear_bit(rtt_slot, &call->rtt_avail))
- goto no_slot;
- call->rtt_serial[rtt_slot] = serial;
- call->rtt_sent_at[rtt_slot] = now;
- smp_wmb(); /* Write data before avail bit */
- set_bit(rtt_slot + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail);
+ if_mtu = call->peer->if_mtu - call->peer->hdrsize;
+ if (call->peer->ackr_adv_pmtud) {
+ max_mtu = umax(call->peer->max_data, rxrpc_rx_mtu);
+ } else {
+ if_mtu = umin(if_mtu, 1444);
+ max_mtu = if_mtu;
+ }
- trace_rxrpc_rtt_tx(call, why, rtt_slot, serial);
- return;
+ trailer->maxMTU = htonl(max_mtu);
+ trailer->ifMTU = htonl(if_mtu);
+ trailer->rwind = htonl(rsize);
+ trailer->jumbo_max = 0; /* Advertise pmtu discovery */
-no_slot:
- trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_no_slot, rtt_slot, serial);
+ if (ack_reason == RXRPC_ACK_PING)
+ rxrpc_begin_rtt_probe(call, *_ack_serial, now, rxrpc_rtt_tx_ping);
+ if (whdr->flags & RXRPC_REQUEST_ACK)
+ call->rtt_last_req = now;
+ rxrpc_set_keepalive(call, now);
+ return nr_kv;
}
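A hedged worked example of the trailer values filled in above: for an IPv4 peer on a plain 1500-byte Ethernet path, hdrsize is 20 (IP) + 8 (UDP) + 28 (wire header) = 56, so if_mtu - hdrsize = 1444. A peer that has not advertised path-MTU discovery support is therefore offered at most 1444 in both ifMTU and maxMTU, whereas a pmtud-capable peer gets maxMTU = max(peer->max_data, rxrpc_rx_mtu), allowing probe results to raise the ceiling. jumbo_max is now always 0, which doubles as the advertisement that this side performs pmtud.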
/*
* Transmit an ACK packet.
*/
-static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
+static void rxrpc_send_ack_packet(struct rxrpc_call *call, int nr_kv, size_t len,
+ rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why)
{
- struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base;
+ struct kvec *kv = call->local->kvec;
+ struct rxrpc_wire_header *whdr = kv[0].iov_base;
+ struct rxrpc_acktrailer *trailer = kv[2].iov_base + 3;
struct rxrpc_connection *conn;
struct rxrpc_ackpacket *ack = (struct rxrpc_ackpacket *)(whdr + 1);
struct msghdr msg;
- ktime_t now;
int ret;
if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags))
@@ -195,33 +264,34 @@ static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t
msg.msg_controllen = 0;
msg.msg_flags = MSG_SPLICE_PAGES;
- whdr->flags = txb->flags & RXRPC_TXBUF_WIRE_FLAGS;
-
- txb->serial = rxrpc_get_next_serial(conn);
- whdr->serial = htonl(txb->serial);
- trace_rxrpc_tx_ack(call->debug_id, txb->serial,
+ trace_rxrpc_tx_ack(call->debug_id, serial,
ntohl(ack->firstPacket),
ntohl(ack->serial), ack->reason, ack->nAcks,
- txb->ack_rwind);
+ ntohl(trailer->rwind), why);
rxrpc_inc_stat(call->rxnet, stat_tx_ack_send);
- iov_iter_kvec(&msg.msg_iter, WRITE, txb->kvec, txb->nr_kvec, txb->len);
- rxrpc_local_dont_fragment(conn->local, false);
- ret = do_udp_sendmsg(conn->local->socket, &msg, txb->len);
+ iov_iter_kvec(&msg.msg_iter, WRITE, kv, nr_kv, len);
+ rxrpc_local_dont_fragment(conn->local, why == rxrpc_propose_ack_ping_for_mtu_probe);
+
+ ret = do_udp_sendmsg(conn->local->socket, &msg, len);
call->peer->last_tx_at = ktime_get_seconds();
if (ret < 0) {
- trace_rxrpc_tx_fail(call->debug_id, txb->serial, ret,
+ trace_rxrpc_tx_fail(call->debug_id, serial, ret,
rxrpc_tx_point_call_ack);
+ if (why == rxrpc_propose_ack_ping_for_mtu_probe &&
+ ret == -EMSGSIZE)
+ rxrpc_input_probe_for_pmtud(conn, serial, true);
} else {
trace_rxrpc_tx_packet(call->debug_id, whdr,
rxrpc_tx_point_call_ack);
- now = ktime_get_real();
- if (ack->reason == RXRPC_ACK_PING)
- rxrpc_begin_rtt_probe(call, txb->serial, now, rxrpc_rtt_tx_ping);
- if (txb->flags & RXRPC_REQUEST_ACK)
- call->peer->rtt_last_req = now;
- rxrpc_set_keepalive(call, now);
+ if (why == rxrpc_propose_ack_ping_for_mtu_probe) {
+ call->peer->pmtud_pending = false;
+ call->peer->pmtud_probing = true;
+ call->conn->pmtud_probe = serial;
+ call->conn->pmtud_call = call->debug_id;
+ trace_rxrpc_pmtud_tx(call);
+ }
}
rxrpc_tx_backoff(call, ret);
}
@@ -230,31 +300,62 @@ static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t
* Queue an ACK for immediate transmission.
*/
void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason,
- rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why)
+ rxrpc_serial_t serial_to_ack, enum rxrpc_propose_ack_trace why)
{
- struct rxrpc_txbuf *txb;
+ struct kvec *kv = call->local->kvec;
+ rxrpc_serial_t ack_serial;
+ size_t len;
+ int nr_kv;
if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags))
return;
rxrpc_inc_stat(call->rxnet, stat_tx_acks[ack_reason]);
- txb = rxrpc_alloc_ack_txbuf(call, call->ackr_wtop - call->ackr_window);
- if (!txb) {
+ nr_kv = rxrpc_alloc_ack(call, call->ackr_wtop - call->ackr_window);
+ if (nr_kv < 0) {
kleave(" = -ENOMEM");
return;
}
- txb->ack_why = why;
+ nr_kv = rxrpc_fill_out_ack(call, nr_kv, ack_reason, serial_to_ack, &ack_serial);
+ len = kv[0].iov_len;
+ len += kv[1].iov_len;
+ len += kv[2].iov_len;
+
+ /* Extend a path MTU probe ACK. */
+ if (why == rxrpc_propose_ack_ping_for_mtu_probe) {
+ size_t probe_mtu = call->peer->pmtud_trial + sizeof(struct rxrpc_wire_header);
+
+ if (len > probe_mtu)
+ goto skip;
+ while (len < probe_mtu) {
+ size_t part = umin(probe_mtu - len, PAGE_SIZE);
+
+ kv[nr_kv].iov_base = page_address(ZERO_PAGE(0));
+ kv[nr_kv].iov_len = part;
+ len += part;
+ nr_kv++;
+ }
+ }
- rxrpc_fill_out_ack(call, txb, ack_reason, serial);
call->ackr_nr_unacked = 0;
atomic_set(&call->ackr_nr_consumed, 0);
clear_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags);
- trace_rxrpc_send_ack(call, why, ack_reason, serial);
- rxrpc_send_ack_packet(call, txb);
- rxrpc_put_txbuf(txb, rxrpc_txbuf_put_ack_tx);
+ trace_rxrpc_send_ack(call, why, ack_reason, ack_serial);
+ rxrpc_send_ack_packet(call, nr_kv, len, ack_serial, why);
+skip:
+ rxrpc_free_ack(call);
+}
+
+/*
+ * Send an ACK probe for path MTU discovery.
+ */
+void rxrpc_send_probe_for_pmtud(struct rxrpc_call *call)
+{
+ rxrpc_send_ACK(call, RXRPC_ACK_PING, 0,
+ rxrpc_propose_ack_ping_for_mtu_probe);
}
/*
@@ -324,14 +425,21 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call)
/*
* Prepare a (sub)packet for transmission.
*/
-static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_txbuf *txb,
- rxrpc_serial_t serial)
+static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call,
+ struct rxrpc_send_data_req *req,
+ struct rxrpc_txbuf *txb,
+ rxrpc_serial_t serial, int subpkt)
{
struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base;
+ struct rxrpc_jumbo_header *jumbo = (void *)(whdr + 1) - sizeof(*jumbo);
enum rxrpc_req_ack_trace why;
struct rxrpc_connection *conn = call->conn;
+ struct kvec *kv = &call->local->kvec[subpkt];
+ size_t len = txb->pkt_len;
+ bool last;
+ u8 flags;
- _enter("%x,{%d}", txb->seq, txb->len);
+ _enter("%x,%zd", txb->seq, len);
txb->serial = serial;
@@ -339,6 +447,15 @@ static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_t
txb->seq == 1)
whdr->userStatus = RXRPC_USERSTATUS_SERVICE_UPGRADE;
+ txb->flags &= ~RXRPC_REQUEST_ACK;
+ flags = txb->flags & RXRPC_TXBUF_WIRE_FLAGS;
+ last = txb->flags & RXRPC_LAST_PACKET;
+
+ if (subpkt < req->n - 1) {
+ len = RXRPC_JUMBO_DATALEN;
+ goto dont_set_request_ack;
+ }
+
/* If our RTT cache needs working on, request an ACK. Also request
* ACKs if a DATA packet appears to have been lost.
*
@@ -346,113 +463,188 @@ static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_t
* service call, lest OpenAFS incorrectly send us an ACK with some
* soft-ACKs in it and then never follow up with a proper hard ACK.
*/
- if (txb->flags & RXRPC_REQUEST_ACK)
- why = rxrpc_reqack_already_on;
- else if ((txb->flags & RXRPC_LAST_PACKET) && rxrpc_sending_to_client(txb))
+ if (last && rxrpc_sending_to_client(txb))
why = rxrpc_reqack_no_srv_last;
else if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events))
why = rxrpc_reqack_ack_lost;
else if (txb->flags & RXRPC_TXBUF_RESENT)
why = rxrpc_reqack_retrans;
- else if (call->cong_mode == RXRPC_CALL_SLOW_START && call->cong_cwnd <= 2)
+ else if (call->cong_ca_state == RXRPC_CA_SLOW_START && call->cong_cwnd <= RXRPC_MIN_CWND)
why = rxrpc_reqack_slow_start;
else if (call->tx_winsize <= 2)
why = rxrpc_reqack_small_txwin;
- else if (call->peer->rtt_count < 3 && txb->seq & 1)
+ else if (call->rtt_count < 3)
why = rxrpc_reqack_more_rtt;
- else if (ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), ktime_get_real()))
+ else if (ktime_before(ktime_add_ms(call->rtt_last_req, 1000), ktime_get_real()))
why = rxrpc_reqack_old_rtt;
+ else if (!last && !after(READ_ONCE(call->send_top), txb->seq))
+ why = rxrpc_reqack_app_stall;
else
goto dont_set_request_ack;
rxrpc_inc_stat(call->rxnet, stat_why_req_ack[why]);
trace_rxrpc_req_ack(call->debug_id, txb->seq, why);
- if (why != rxrpc_reqack_no_srv_last)
- txb->flags |= RXRPC_REQUEST_ACK;
+ if (why != rxrpc_reqack_no_srv_last) {
+ flags |= RXRPC_REQUEST_ACK;
+ trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_data, -1, serial);
+ call->rtt_last_req = req->now;
+ }
dont_set_request_ack:
- whdr->flags = txb->flags & RXRPC_TXBUF_WIRE_FLAGS;
- whdr->serial = htonl(txb->serial);
- whdr->cksum = txb->cksum;
+ /* The jumbo header overlays the wire header in the txbuf. */
+ if (subpkt < req->n - 1)
+ flags |= RXRPC_JUMBO_PACKET;
+ else
+ flags &= ~RXRPC_JUMBO_PACKET;
+ if (subpkt == 0) {
+ whdr->flags = flags;
+ whdr->serial = htonl(txb->serial);
+ whdr->cksum = txb->cksum;
+ whdr->serviceId = htons(conn->service_id);
+ kv->iov_base = whdr;
+ len += sizeof(*whdr);
+ } else {
+ jumbo->flags = flags;
+ jumbo->pad = 0;
+ jumbo->cksum = txb->cksum;
+ kv->iov_base = jumbo;
+ len += sizeof(*jumbo);
+ }
- trace_rxrpc_tx_data(call, txb->seq, txb->serial, txb->flags, false);
+ trace_rxrpc_tx_data(call, txb->seq, txb->serial, flags, req->trace);
+ kv->iov_len = len;
+ return len;
}
/*
- * Prepare a packet for transmission.
+ * Prepare a transmission queue object for initial transmission. Returns the
+ * number of microseconds since the transmission queue base timestamp.
*/
-static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
+static unsigned int rxrpc_prepare_txqueue(struct rxrpc_txqueue *tq,
+ struct rxrpc_send_data_req *req)
{
- rxrpc_serial_t serial;
-
- /* Each transmission of a Tx packet needs a new serial number */
- serial = rxrpc_get_next_serial(call->conn);
-
- rxrpc_prepare_data_subpacket(call, txb, serial);
-
- return txb->len;
+ if (!tq)
+ return 0;
+ if (tq->xmit_ts_base == KTIME_MIN) {
+ tq->xmit_ts_base = req->now;
+ return 0;
+ }
+ return ktime_to_us(ktime_sub(req->now, tq->xmit_ts_base));
}
/*
- * Set timeouts after transmitting a packet.
+ * Prepare a (jumbo) packet for transmission.
*/
-static void rxrpc_tstamp_data_packets(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
+static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req *req)
{
- ktime_t now = ktime_get_real();
- bool ack_requested = txb->flags & RXRPC_REQUEST_ACK;
+ struct rxrpc_txqueue *tq = req->tq;
+ rxrpc_serial_t serial;
+ unsigned int xmit_ts;
+ rxrpc_seq_t seq = req->seq;
+ size_t len = 0;
+ bool start_tlp = false;
- call->tx_last_sent = now;
- txb->last_sent = now;
+ trace_rxrpc_tq(call, tq, seq, rxrpc_tq_transmit);
- if (ack_requested) {
- rxrpc_begin_rtt_probe(call, txb->serial, now, rxrpc_rtt_tx_data);
+ /* Each transmission of a Tx packet needs a new serial number */
+ serial = rxrpc_get_next_serials(call->conn, req->n);
+
+ call->tx_last_serial = serial + req->n - 1;
+ call->tx_last_sent = req->now;
+ xmit_ts = rxrpc_prepare_txqueue(tq, req);
+ prefetch(tq->next);
+
+ for (int i = 0;;) {
+ int ix = seq & RXRPC_TXQ_MASK;
+ struct rxrpc_txbuf *txb = tq->bufs[seq & RXRPC_TXQ_MASK];
+
+ _debug("prep[%u] tq=%x q=%x", i, tq->qbase, seq);
+
+ /* Record (re-)transmission for RACK [RFC8985 6.1]. */
+ if (__test_and_clear_bit(ix, &tq->segment_lost))
+ call->tx_nr_lost--;
+ if (req->retrans) {
+ __set_bit(ix, &tq->ever_retransmitted);
+ __set_bit(ix, &tq->segment_retransmitted);
+ call->tx_nr_resent++;
+ } else {
+ call->tx_nr_sent++;
+ start_tlp = true;
+ }
+ tq->segment_xmit_ts[ix] = xmit_ts;
+ tq->segment_serial[ix] = serial;
+ if (i + 1 == req->n)
+ /* Only sample the last subpacket in a jumbo. */
+ __set_bit(ix, &tq->rtt_samples);
+ len += rxrpc_prepare_data_subpacket(call, req, txb, serial, i);
+ serial++;
+ seq++;
+ i++;
+ if (i >= req->n)
+ break;
+ if (!(seq & RXRPC_TXQ_MASK)) {
+ tq = tq->next;
+ trace_rxrpc_tq(call, tq, seq, rxrpc_tq_transmit_advance);
+ xmit_ts = rxrpc_prepare_txqueue(tq, req);
+ }
+ }
- call->peer->rtt_last_req = now;
- if (call->peer->rtt_count > 1) {
- ktime_t delay = rxrpc_get_rto_backoff(call->peer, false);
+ /* Set timeouts */
+ if (req->tlp_probe) {
+ /* Sending TLP loss probe [RFC8985 7.3]. */
+ call->tlp_serial = serial - 1;
+ call->tlp_seq = seq - 1;
+ } else if (start_tlp) {
+ /* Schedule TLP loss probe [RFC8985 7.2]. */
+ ktime_t pto;
+
+ if (!test_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags))
+ /* The first packet may take longer to elicit a response. */
+ pto = NSEC_PER_SEC;
+ else
+ pto = rxrpc_tlp_calc_pto(call, req->now);
- call->ack_lost_at = ktime_add(now, delay);
- trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_lost_ack);
- }
+ call->rack_timer_mode = RXRPC_CALL_RACKTIMER_TLP_PTO;
+ call->rack_timo_at = ktime_add(req->now, pto);
+ trace_rxrpc_rack_timer(call, pto, false);
+ trace_rxrpc_timer_set(call, pto, rxrpc_timer_trace_rack_tlp_pto);
}
if (!test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags)) {
ktime_t delay = ms_to_ktime(READ_ONCE(call->next_rx_timo));
- call->expect_rx_by = ktime_add(now, delay);
+ call->expect_rx_by = ktime_add(req->now, delay);
trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_expect_rx);
}
- rxrpc_set_keepalive(call, now);
+ rxrpc_set_keepalive(call, req->now);
+ return len;
}
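For reference, the loss-probe timeout scheduled above follows the shape of RFC 8985 7.2; the exact computation lives in rxrpc_tlp_calc_pto(), added elsewhere in this series, but roughly:

	PTO = 2 * SRTT          (plus a worst-case delayed-ACK allowance when
	                         only one packet is in flight)
	PTO = min(PTO, RTO)

with a fixed 1-second value used for the first transmission on a call, before any RTT sample exists.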
/*
- * send a packet through the transport endpoint
+ * Send one or more packets through the transport endpoint
*/
-static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
+void rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req *req)
{
- struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base;
struct rxrpc_connection *conn = call->conn;
enum rxrpc_tx_point frag;
+ struct rxrpc_txqueue *tq = req->tq;
+ struct rxrpc_txbuf *txb;
struct msghdr msg;
+ rxrpc_seq_t seq = req->seq;
size_t len;
- int ret;
+ bool new_call = test_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags);
+ int ret, stat_ix;
- _enter("%x,{%d}", txb->seq, txb->len);
+ _enter("%x,%x-%x", tq->qbase, seq, seq + req->n - 1);
- len = rxrpc_prepare_data_packet(call, txb);
+ stat_ix = umin(req->n, ARRAY_SIZE(call->rxnet->stat_tx_jumbo)) - 1;
+ atomic_inc(&call->rxnet->stat_tx_jumbo[stat_ix]);
- if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) {
- static int lose;
- if ((lose++ & 7) == 7) {
- ret = 0;
- trace_rxrpc_tx_data(call, txb->seq, txb->serial,
- txb->flags, true);
- goto done;
- }
- }
+ len = rxrpc_prepare_data_packet(call, req);
+ txb = tq->bufs[seq & RXRPC_TXQ_MASK];
- iov_iter_kvec(&msg.msg_iter, WRITE, txb->kvec, txb->nr_kvec, len);
+ iov_iter_kvec(&msg.msg_iter, WRITE, call->local->kvec, req->n, len);
msg.msg_name = &call->peer->srx.transport;
msg.msg_namelen = call->peer->srx.transport_len;
@@ -460,16 +652,11 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t
msg.msg_controllen = 0;
msg.msg_flags = MSG_SPLICE_PAGES;
- /* Track what we've attempted to transmit at least once so that the
- * retransmission algorithm doesn't try to resend what we haven't sent
- * yet.
+	/* Send the packet with the don't fragment bit set unless we think it's
+	 * too big or this is a retransmission.
+	 */
- if (txb->seq == call->tx_transmitted + 1)
- call->tx_transmitted = txb->seq;
-
- /* send the packet with the don't fragment bit set if we currently
- * think it's small enough */
- if (txb->len >= call->peer->maxdata) {
+ if (seq == call->tx_transmitted + 1 &&
+ len >= sizeof(struct rxrpc_wire_header) + call->peer->max_data) {
rxrpc_local_dont_fragment(conn->local, false);
frag = rxrpc_tx_point_call_data_frag;
} else {
@@ -477,7 +664,25 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t
frag = rxrpc_tx_point_call_data_nofrag;
}
-retry:
+ /* Track what we've attempted to transmit at least once so that the
+ * retransmission algorithm doesn't try to resend what we haven't sent
+ * yet.
+ */
+ if (seq == call->tx_transmitted + 1)
+ call->tx_transmitted = seq + req->n - 1;
+
+ if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) {
+ static int lose;
+
+ if ((lose++ & 7) == 7) {
+ ret = 0;
+ trace_rxrpc_tx_data(call, txb->seq, txb->serial, txb->flags,
+ rxrpc_txdata_inject_loss);
+ conn->peer->last_tx_at = ktime_get_seconds();
+ goto done;
+ }
+ }
+
/* send the packet by UDP
* - returns -EMSGSIZE if UDP would have to fragment the packet
* to go out of the interface
@@ -488,36 +693,35 @@ retry:
ret = do_udp_sendmsg(conn->local->socket, &msg, len);
conn->peer->last_tx_at = ktime_get_seconds();
- if (ret < 0) {
+ if (ret == -EMSGSIZE) {
+ rxrpc_inc_stat(call->rxnet, stat_tx_data_send_msgsize);
+ trace_rxrpc_tx_packet(call->debug_id, call->local->kvec[0].iov_base, frag);
+ ret = 0;
+ } else if (ret < 0) {
rxrpc_inc_stat(call->rxnet, stat_tx_data_send_fail);
trace_rxrpc_tx_fail(call->debug_id, txb->serial, ret, frag);
} else {
- trace_rxrpc_tx_packet(call->debug_id, whdr, frag);
+ trace_rxrpc_tx_packet(call->debug_id, call->local->kvec[0].iov_base, frag);
}
rxrpc_tx_backoff(call, ret);
- if (ret == -EMSGSIZE && frag == rxrpc_tx_point_call_data_frag) {
- rxrpc_local_dont_fragment(conn->local, false);
- frag = rxrpc_tx_point_call_data_frag;
- goto retry;
- }
-done:
- if (ret >= 0) {
- rxrpc_tstamp_data_packets(call, txb);
- } else {
- /* Cancel the call if the initial transmission fails,
- * particularly if that's due to network routing issues that
- * aren't going away anytime soon. The layer above can arrange
- * the retransmission.
+ if (ret < 0) {
+	if (ret < 0) {
+		/* Cancel the call if the initial transmission fails or if we hit
+		 * an error due to network routing issues that aren't going away
+		 * anytime soon.  The layer above can arrange the
+ * retransmission.
*/
- if (!test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags))
+ if (new_call ||
+ ret == -ENETUNREACH ||
+ ret == -EHOSTUNREACH ||
+ ret == -ECONNREFUSED)
rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR,
RX_USER_ABORT, ret);
}
- _leave(" = %d [%u]", ret, call->peer->maxdata);
- return ret;
+done:
+ _leave(" = %d [%u]", ret, call->peer->max_data);
}
/*
@@ -692,41 +896,3 @@ void rxrpc_send_keepalive(struct rxrpc_peer *peer)
peer->last_tx_at = ktime_get_seconds();
_leave("");
}
-
-/*
- * Schedule an instant Tx resend.
- */
-static inline void rxrpc_instant_resend(struct rxrpc_call *call,
- struct rxrpc_txbuf *txb)
-{
- if (!__rxrpc_call_is_complete(call))
- kdebug("resend");
-}
-
-/*
- * Transmit one packet.
- */
-void rxrpc_transmit_one(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
-{
- int ret;
-
- ret = rxrpc_send_data_packet(call, txb);
- if (ret < 0) {
- switch (ret) {
- case -ENETUNREACH:
- case -EHOSTUNREACH:
- case -ECONNREFUSED:
- rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR,
- 0, ret);
- break;
- default:
- _debug("need instant resend %d", ret);
- rxrpc_instant_resend(call, txb);
- }
- } else {
- ktime_t delay = ns_to_ktime(call->peer->rto_us * NSEC_PER_USEC);
-
- call->resend_at = ktime_add(ktime_get_real(), delay);
- trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_resend_tx);
- }
-}
diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c
index 552ba84a255c..d82e44a3901b 100644
--- a/net/rxrpc/peer_event.c
+++ b/net/rxrpc/peer_event.c
@@ -102,6 +102,8 @@ static struct rxrpc_peer *rxrpc_lookup_peer_local_rcu(struct rxrpc_local *local,
*/
static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu)
{
+ unsigned int max_data;
+
/* wind down the local interface MTU */
if (mtu > 0 && peer->if_mtu == 65535 && mtu < peer->if_mtu)
peer->if_mtu = mtu;
@@ -120,11 +122,17 @@ static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu)
}
}
- if (mtu < peer->mtu) {
- spin_lock(&peer->lock);
- peer->mtu = mtu;
- peer->maxdata = peer->mtu - peer->hdrsize;
- spin_unlock(&peer->lock);
+ max_data = max_t(int, mtu - peer->hdrsize, 500);
+ if (max_data < peer->max_data) {
+ if (peer->pmtud_good > max_data)
+ peer->pmtud_good = max_data;
+ if (peer->pmtud_bad > max_data + 1)
+ peer->pmtud_bad = max_data + 1;
+
+ trace_rxrpc_pmtud_reduce(peer, 0, max_data, rxrpc_pmtud_reduce_icmp);
+ write_seqcount_begin(&peer->mtu_lock);
+ peer->max_data = max_data;
+ write_seqcount_end(&peer->mtu_lock);
}
}
@@ -205,23 +213,23 @@ static void rxrpc_distribute_error(struct rxrpc_peer *peer, struct sk_buff *skb,
struct rxrpc_call *call;
HLIST_HEAD(error_targets);
- spin_lock(&peer->lock);
+ spin_lock_irq(&peer->lock);
hlist_move_list(&peer->error_targets, &error_targets);
while (!hlist_empty(&error_targets)) {
call = hlist_entry(error_targets.first,
struct rxrpc_call, error_link);
hlist_del_init(&call->error_link);
- spin_unlock(&peer->lock);
+ spin_unlock_irq(&peer->lock);
rxrpc_see_call(call, rxrpc_call_see_distribute_error);
rxrpc_set_call_completion(call, compl, 0, -err);
- rxrpc_input_call_event(call, skb);
+ rxrpc_input_call_event(call);
- spin_lock(&peer->lock);
+ spin_lock_irq(&peer->lock);
}
- spin_unlock(&peer->lock);
+ spin_unlock_irq(&peer->lock);
}
/*
@@ -347,3 +355,89 @@ void rxrpc_peer_keepalive_worker(struct work_struct *work)
_leave("");
}
+
+/*
+ * Do path MTU probing.
+ */
+void rxrpc_input_probe_for_pmtud(struct rxrpc_connection *conn, rxrpc_serial_t acked_serial,
+ bool sendmsg_fail)
+{
+ struct rxrpc_peer *peer = conn->peer;
+ unsigned int max_data = peer->max_data;
+ int good, trial, bad, jumbo;
+
+ good = peer->pmtud_good;
+ trial = peer->pmtud_trial;
+ bad = peer->pmtud_bad;
+ if (good >= bad - 1) {
+ conn->pmtud_probe = 0;
+ peer->pmtud_lost = false;
+ return;
+ }
+
+ if (!peer->pmtud_probing)
+ goto send_probe;
+
+ if (sendmsg_fail || after(acked_serial, conn->pmtud_probe)) {
+ /* Retry a lost probe. */
+ if (!peer->pmtud_lost) {
+ trace_rxrpc_pmtud_lost(conn, acked_serial);
+ conn->pmtud_probe = 0;
+ peer->pmtud_lost = true;
+ goto send_probe;
+ }
+
+ /* The probed size didn't seem to get through. */
+ bad = trial;
+ peer->pmtud_bad = bad;
+ if (bad <= max_data)
+ max_data = bad - 1;
+ } else {
+ /* It did get through. */
+ good = trial;
+ peer->pmtud_good = good;
+ if (good > max_data)
+ max_data = good;
+ }
+
+ max_data = umin(max_data, peer->ackr_max_data);
+ if (max_data != peer->max_data) {
+ preempt_disable();
+ write_seqcount_begin(&peer->mtu_lock);
+ peer->max_data = max_data;
+ write_seqcount_end(&peer->mtu_lock);
+ preempt_enable();
+ }
+
+ jumbo = max_data + sizeof(struct rxrpc_jumbo_header);
+ jumbo /= RXRPC_JUMBO_SUBPKTLEN;
+ peer->pmtud_jumbo = jumbo;
+
+ trace_rxrpc_pmtud_rx(conn, acked_serial);
+ conn->pmtud_probe = 0;
+ peer->pmtud_lost = false;
+
+ if (good < RXRPC_JUMBO(2) && bad > RXRPC_JUMBO(2))
+ trial = RXRPC_JUMBO(2);
+ else if (good < RXRPC_JUMBO(4) && bad > RXRPC_JUMBO(4))
+ trial = RXRPC_JUMBO(4);
+ else if (good < RXRPC_JUMBO(3) && bad > RXRPC_JUMBO(3))
+ trial = RXRPC_JUMBO(3);
+ else if (good < RXRPC_JUMBO(6) && bad > RXRPC_JUMBO(6))
+ trial = RXRPC_JUMBO(6);
+ else if (good < RXRPC_JUMBO(5) && bad > RXRPC_JUMBO(5))
+ trial = RXRPC_JUMBO(5);
+ else if (good < RXRPC_JUMBO(8) && bad > RXRPC_JUMBO(8))
+ trial = RXRPC_JUMBO(8);
+ else if (good < RXRPC_JUMBO(7) && bad > RXRPC_JUMBO(7))
+ trial = RXRPC_JUMBO(7);
+ else
+ trial = (good + bad) / 2;
+ peer->pmtud_trial = trial;
+
+ if (good >= bad)
+ return;
+
+send_probe:
+ peer->pmtud_pending = true;
+}
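A minimal user-space sketch of the trial-size selection above (assuming only the RXRPC_JUMBO() macro from protocol.h): walk a fixed ladder of jumbo sizes - 2, 4, 3, 6, 5, 8, 7 subpackets - take the first rung strictly between the last known-good and the smallest known-bad size, and bisect once the ladder is exhausted.

	/* Illustrative re-statement of the ladder above; not kernel code. */
	static int pick_pmtud_trial(int good, int bad)
	{
		static const int rungs[] = { 2, 4, 3, 6, 5, 8, 7 };

		for (size_t i = 0; i < sizeof(rungs) / sizeof(rungs[0]); i++) {
			int size = RXRPC_JUMBO(rungs[i]);

			if (good < size && bad > size)
				return size;
		}
		return (good + bad) / 2;	/* plain bisection between the bounds */
	}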
diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c
index 49dcda67a0d5..e1c63129586b 100644
--- a/net/rxrpc/peer_object.c
+++ b/net/rxrpc/peer_object.c
@@ -162,6 +162,11 @@ static void rxrpc_assess_MTU_size(struct rxrpc_local *local,
#endif
peer->if_mtu = 1500;
+ if (peer->max_data < peer->if_mtu - peer->hdrsize) {
+ trace_rxrpc_pmtud_reduce(peer, 0, peer->if_mtu - peer->hdrsize,
+ rxrpc_pmtud_reduce_route);
+ peer->max_data = peer->if_mtu - peer->hdrsize;
+ }
memset(&fl, 0, sizeof(fl));
switch (peer->srx.transport.family) {
@@ -199,8 +204,16 @@ static void rxrpc_assess_MTU_size(struct rxrpc_local *local,
}
peer->if_mtu = dst_mtu(dst);
+ peer->hdrsize += dst->header_len + dst->trailer_len;
+ peer->tx_seg_max = dst->dev->gso_max_segs;
dst_release(dst);
+ peer->max_data = umin(RXRPC_JUMBO(1), peer->if_mtu - peer->hdrsize);
+ peer->pmtud_good = 500;
+ peer->pmtud_bad = peer->if_mtu - peer->hdrsize + 1;
+ peer->pmtud_trial = umin(peer->max_data, peer->pmtud_bad - 1);
+ peer->pmtud_pending = true;
+
_leave(" [if_mtu %u]", peer->if_mtu);
}
@@ -222,11 +235,9 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp,
peer->service_conns = RB_ROOT;
seqlock_init(&peer->service_conn_lock);
spin_lock_init(&peer->lock);
- spin_lock_init(&peer->rtt_input_lock);
+ seqcount_init(&peer->mtu_lock);
peer->debug_id = atomic_inc_return(&rxrpc_debug_id);
-
- rxrpc_peer_init_rtt(peer);
-
+ peer->recent_srtt_us = UINT_MAX;
peer->cong_ssthresh = RXRPC_TX_MAX_WINDOW;
trace_rxrpc_peer(peer->debug_id, 1, why);
}
@@ -242,9 +253,7 @@ static void rxrpc_init_peer(struct rxrpc_local *local, struct rxrpc_peer *peer,
unsigned long hash_key)
{
peer->hash_key = hash_key;
- rxrpc_assess_MTU_size(local, peer);
- peer->mtu = peer->if_mtu;
- peer->rtt_last_req = ktime_get_real();
+
switch (peer->srx.transport.family) {
case AF_INET:
@@ -268,7 +277,9 @@ static void rxrpc_init_peer(struct rxrpc_local *local, struct rxrpc_peer *peer,
}
peer->hdrsize += sizeof(struct rxrpc_wire_header);
- peer->maxdata = peer->mtu - peer->hdrsize;
+ peer->max_data = peer->if_mtu - peer->hdrsize;
+
+ rxrpc_assess_MTU_size(local, peer);
}
/*
@@ -304,6 +315,7 @@ static void rxrpc_free_peer(struct rxrpc_peer *peer)
* Set up a new incoming peer. There shouldn't be any other matching peers
* since we've already done a search in the list from the non-reentrant context
* (the data_ready handler) that is the only place we can add new peers.
+ * Called with interrupts disabled.
*/
void rxrpc_new_incoming_peer(struct rxrpc_local *local, struct rxrpc_peer *peer)
{
@@ -479,7 +491,7 @@ EXPORT_SYMBOL(rxrpc_kernel_get_call_peer);
*/
unsigned int rxrpc_kernel_get_srtt(const struct rxrpc_peer *peer)
{
- return peer->rtt_count > 0 ? peer->srtt_us >> 3 : UINT_MAX;
+ return READ_ONCE(peer->recent_srtt_us);
}
EXPORT_SYMBOL(rxrpc_kernel_get_srtt);
diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c
index 263a2251e3d2..d803562ca0ac 100644
--- a/net/rxrpc/proc.c
+++ b/net/rxrpc/proc.c
@@ -52,7 +52,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
struct rxrpc_call *call;
struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq));
enum rxrpc_call_state state;
- rxrpc_seq_t acks_hard_ack;
+ rxrpc_seq_t tx_bottom;
char lbuff[50], rbuff[50];
long timeout = 0;
@@ -79,7 +79,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
if (state != RXRPC_CALL_SERVER_PREALLOC)
timeout = ktime_ms_delta(READ_ONCE(call->expect_rx_by), ktime_get_real());
- acks_hard_ack = READ_ONCE(call->acks_hard_ack);
+ tx_bottom = READ_ONCE(call->tx_bottom);
seq_printf(seq,
"UDP %-47.47s %-47.47s %4x %08x %08x %s %3u"
" %-8.8s %08x %08x %08x %02x %08x %02x %08x %02x %06lx\n",
@@ -93,7 +93,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
rxrpc_call_states[state],
call->abort_code,
call->debug_id,
- acks_hard_ack, READ_ONCE(call->tx_top) - acks_hard_ack,
+ tx_bottom, READ_ONCE(call->tx_top) - tx_bottom,
call->ackr_window, call->ackr_wtop - call->ackr_window,
call->rx_serial,
call->cong_cwnd,
@@ -283,9 +283,7 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v)
if (v == SEQ_START_TOKEN) {
seq_puts(seq,
- "Proto Local "
- " Remote "
- " Use SST MTU LastUse RTT RTO\n"
+ "Proto Local Remote Use SST Maxd LastUse RTT RTO\n"
);
return 0;
}
@@ -298,16 +296,15 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v)
now = ktime_get_seconds();
seq_printf(seq,
- "UDP %-47.47s %-47.47s %3u"
- " %3u %5u %6llus %8u %8u\n",
+ "UDP %-47.47s %-47.47s %3u %4u %5u %6llus %8d %8d\n",
lbuff,
rbuff,
refcount_read(&peer->ref),
peer->cong_ssthresh,
- peer->mtu,
+ peer->max_data,
now - peer->last_tx_at,
- peer->srtt_us >> 3,
- peer->rto_us);
+ READ_ONCE(peer->recent_srtt_us),
+ READ_ONCE(peer->recent_rto_us));
return 0;
}
@@ -476,10 +473,11 @@ int rxrpc_stats_show(struct seq_file *seq, void *v)
struct rxrpc_net *rxnet = rxrpc_net(seq_file_single_net(seq));
seq_printf(seq,
- "Data : send=%u sendf=%u fail=%u\n",
+ "Data : send=%u sendf=%u fail=%u emsz=%u\n",
atomic_read(&rxnet->stat_tx_data_send),
atomic_read(&rxnet->stat_tx_data_send_frag),
- atomic_read(&rxnet->stat_tx_data_send_fail));
+ atomic_read(&rxnet->stat_tx_data_send_fail),
+ atomic_read(&rxnet->stat_tx_data_send_msgsize));
seq_printf(seq,
"Data-Tx : nr=%u retrans=%u uf=%u cwr=%u\n",
atomic_read(&rxnet->stat_tx_data),
@@ -508,7 +506,7 @@ int rxrpc_stats_show(struct seq_file *seq, void *v)
atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_DELAY]),
atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_IDLE]));
seq_printf(seq,
- "Ack-Rx : req=%u dup=%u oos=%u exw=%u nos=%u png=%u prs=%u dly=%u idl=%u\n",
+ "Ack-Rx : req=%u dup=%u oos=%u exw=%u nos=%u png=%u prs=%u dly=%u idl=%u z=%u\n",
atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_REQUESTED]),
atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_DUPLICATE]),
atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_OUT_OF_SEQUENCE]),
@@ -517,13 +515,14 @@ int rxrpc_stats_show(struct seq_file *seq, void *v)
atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_PING]),
atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_PING_RESPONSE]),
atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_DELAY]),
- atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_IDLE]));
+ atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_IDLE]),
+ atomic_read(&rxnet->stat_rx_acks[0]));
seq_printf(seq,
- "Why-Req-A: acklost=%u already=%u mrtt=%u ortt=%u\n",
+ "Why-Req-A: acklost=%u mrtt=%u ortt=%u stall=%u\n",
atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_ack_lost]),
- atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_already_on]),
atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_more_rtt]),
- atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_old_rtt]));
+ atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_old_rtt]),
+ atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_app_stall]));
seq_printf(seq,
"Why-Req-A: nolast=%u retx=%u slows=%u smtxw=%u\n",
atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_no_srv_last]),
@@ -531,6 +530,30 @@ int rxrpc_stats_show(struct seq_file *seq, void *v)
atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_slow_start]),
atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_small_txwin]));
seq_printf(seq,
+ "Jumbo-Tx : %u,%u,%u,%u,%u,%u,%u,%u,%u,%u\n",
+ atomic_read(&rxnet->stat_tx_jumbo[0]),
+ atomic_read(&rxnet->stat_tx_jumbo[1]),
+ atomic_read(&rxnet->stat_tx_jumbo[2]),
+ atomic_read(&rxnet->stat_tx_jumbo[3]),
+ atomic_read(&rxnet->stat_tx_jumbo[4]),
+ atomic_read(&rxnet->stat_tx_jumbo[5]),
+ atomic_read(&rxnet->stat_tx_jumbo[6]),
+ atomic_read(&rxnet->stat_tx_jumbo[7]),
+ atomic_read(&rxnet->stat_tx_jumbo[8]),
+ atomic_read(&rxnet->stat_tx_jumbo[9]));
+ seq_printf(seq,
+ "Jumbo-Rx : %u,%u,%u,%u,%u,%u,%u,%u,%u,%u\n",
+ atomic_read(&rxnet->stat_rx_jumbo[0]),
+ atomic_read(&rxnet->stat_rx_jumbo[1]),
+ atomic_read(&rxnet->stat_rx_jumbo[2]),
+ atomic_read(&rxnet->stat_rx_jumbo[3]),
+ atomic_read(&rxnet->stat_rx_jumbo[4]),
+ atomic_read(&rxnet->stat_rx_jumbo[5]),
+ atomic_read(&rxnet->stat_rx_jumbo[6]),
+ atomic_read(&rxnet->stat_rx_jumbo[7]),
+ atomic_read(&rxnet->stat_rx_jumbo[8]),
+ atomic_read(&rxnet->stat_rx_jumbo[9]));
+ seq_printf(seq,
"Buffers : txb=%u rxb=%u\n",
atomic_read(&rxrpc_nr_txbuf),
atomic_read(&rxrpc_n_rx_skbs));
@@ -567,6 +590,8 @@ int rxrpc_stats_clear(struct file *file, char *buf, size_t size)
atomic_set(&rxnet->stat_tx_ack_skip, 0);
memset(&rxnet->stat_tx_acks, 0, sizeof(rxnet->stat_tx_acks));
memset(&rxnet->stat_rx_acks, 0, sizeof(rxnet->stat_rx_acks));
+ memset(&rxnet->stat_tx_jumbo, 0, sizeof(rxnet->stat_tx_jumbo));
+ memset(&rxnet->stat_rx_jumbo, 0, sizeof(rxnet->stat_rx_jumbo));
memset(&rxnet->stat_why_req_ack, 0, sizeof(rxnet->stat_why_req_ack));
diff --git a/net/rxrpc/protocol.h b/net/rxrpc/protocol.h
index 4fe6b4d20ada..42f70e4636f8 100644
--- a/net/rxrpc/protocol.h
+++ b/net/rxrpc/protocol.h
@@ -92,11 +92,16 @@ struct rxrpc_jumbo_header {
/*
* The maximum number of subpackets that can possibly fit in a UDP packet is:
*
- * ((max_IP - IP_hdr - UDP_hdr) / RXRPC_JUMBO_SUBPKTLEN) + 1
- * = ((65535 - 28 - 28) / 1416) + 1
- * = 46 non-terminal packets and 1 terminal packet.
+ * (max_UDP - wirehdr + jumbohdr) / (jumbohdr + 1412)
+ * = ((65535 - 28 + 4) / 1416)
+ * = 45 non-terminal packets and 1 terminal packet.
*/
-#define RXRPC_MAX_NR_JUMBO 47
+#define RXRPC_MAX_NR_JUMBO 46
+
+/* Size of a jumbo packet with N subpackets, excluding UDP+IP */
+#define RXRPC_JUMBO(N) ((int)sizeof(struct rxrpc_wire_header) + \
+ RXRPC_JUMBO_DATALEN + \
+ ((N) - 1) * RXRPC_JUMBO_SUBPKTLEN)
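As a rough cross-check of the macro (assuming RXRPC_JUMBO_DATALEN = 1412, a 4-byte jumbo header giving RXRPC_JUMBO_SUBPKTLEN = 1416, and a 28-byte wire header):

	RXRPC_JUMBO(1)  = 28 + 1412             = 1440
	RXRPC_JUMBO(2)  = 28 + 1412 + 1 * 1416  = 2856
	RXRPC_JUMBO(46) = 28 + 1412 + 45 * 1416 = 65160

which stays below the 65507-byte maximum UDP payload (65535 minus 28 bytes of IPv4/UDP header), consistent with the 46-subpacket limit.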
/*****************************************************************************/
/*
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index a482f88c5fc5..32cd5f1d541d 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -36,16 +36,16 @@ void rxrpc_notify_socket(struct rxrpc_call *call)
sk = &rx->sk;
if (rx && sk->sk_state < RXRPC_CLOSE) {
if (call->notify_rx) {
- spin_lock(&call->notify_lock);
+ spin_lock_irq(&call->notify_lock);
call->notify_rx(sk, call, call->user_call_ID);
- spin_unlock(&call->notify_lock);
+ spin_unlock_irq(&call->notify_lock);
} else {
- spin_lock(&rx->recvmsg_lock);
+ spin_lock_irq(&rx->recvmsg_lock);
if (list_empty(&call->recvmsg_link)) {
rxrpc_get_call(call, rxrpc_call_get_notify_socket);
list_add_tail(&call->recvmsg_link, &rx->recvmsg_q);
}
- spin_unlock(&rx->recvmsg_lock);
+ spin_unlock_irq(&rx->recvmsg_lock);
if (!sock_flag(sk, SOCK_DEAD)) {
_debug("call %ps", sk->sk_data_ready);
@@ -337,14 +337,14 @@ try_again:
* We also want to weed out calls that got requeued whilst we were
* shovelling data out.
*/
- spin_lock(&rx->recvmsg_lock);
+ spin_lock_irq(&rx->recvmsg_lock);
l = rx->recvmsg_q.next;
call = list_entry(l, struct rxrpc_call, recvmsg_link);
if (!rxrpc_call_is_complete(call) &&
skb_queue_empty(&call->recvmsg_queue)) {
list_del_init(&call->recvmsg_link);
- spin_unlock(&rx->recvmsg_lock);
+ spin_unlock_irq(&rx->recvmsg_lock);
release_sock(&rx->sk);
trace_rxrpc_recvmsg(call->debug_id, rxrpc_recvmsg_unqueue, 0);
rxrpc_put_call(call, rxrpc_call_put_recvmsg);
@@ -355,7 +355,7 @@ try_again:
list_del_init(&call->recvmsg_link);
else
rxrpc_get_call(call, rxrpc_call_get_recvmsg);
- spin_unlock(&rx->recvmsg_lock);
+ spin_unlock_irq(&rx->recvmsg_lock);
call_debug_id = call->debug_id;
trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_dequeue, 0);
@@ -445,9 +445,9 @@ error_unlock_call:
error_requeue_call:
if (!(flags & MSG_PEEK)) {
- spin_lock(&rx->recvmsg_lock);
+ spin_lock_irq(&rx->recvmsg_lock);
list_add(&call->recvmsg_link, &rx->recvmsg_q);
- spin_unlock(&rx->recvmsg_lock);
+ spin_unlock_irq(&rx->recvmsg_lock);
trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_requeue, 0);
} else {
rxrpc_put_call(call, rxrpc_call_put_recvmsg);
diff --git a/net/rxrpc/rtt.c b/net/rxrpc/rtt.c
index cdab7b7d08a0..7474f88d7b18 100644
--- a/net/rxrpc/rtt.c
+++ b/net/rxrpc/rtt.c
@@ -12,22 +12,22 @@
#include "ar-internal.h"
#define RXRPC_RTO_MAX (120 * USEC_PER_SEC)
-#define RXRPC_TIMEOUT_INIT ((unsigned int)(1 * MSEC_PER_SEC)) /* RFC6298 2.1 initial RTO value */
+#define RXRPC_TIMEOUT_INIT ((unsigned int)(1 * USEC_PER_SEC)) /* RFC6298 2.1 initial RTO value */
#define rxrpc_jiffies32 ((u32)jiffies) /* As rxrpc_jiffies32 */
-static u32 rxrpc_rto_min_us(struct rxrpc_peer *peer)
+static u32 rxrpc_rto_min_us(struct rxrpc_call *call)
{
return 200;
}
-static u32 __rxrpc_set_rto(const struct rxrpc_peer *peer)
+static u32 __rxrpc_set_rto(const struct rxrpc_call *call)
{
- return (peer->srtt_us >> 3) + peer->rttvar_us;
+ return (call->srtt_us >> 3) + call->rttvar_us;
}
static u32 rxrpc_bound_rto(u32 rto)
{
- return min(rto, RXRPC_RTO_MAX);
+ return clamp(200000, rto + 100000, RXRPC_RTO_MAX);
}
/*
@@ -40,10 +40,10 @@ static u32 rxrpc_bound_rto(u32 rto)
* To save cycles in the RFC 1323 implementation it was better to break
* it up into three procedures. -- erics
*/
-static void rxrpc_rtt_estimator(struct rxrpc_peer *peer, long sample_rtt_us)
+static void rxrpc_rtt_estimator(struct rxrpc_call *call, long sample_rtt_us)
{
long m = sample_rtt_us; /* RTT */
- u32 srtt = peer->srtt_us;
+ u32 srtt = call->srtt_us;
/* The following amusing code comes from Jacobson's
* article in SIGCOMM '88. Note that rtt and mdev
@@ -66,7 +66,7 @@ static void rxrpc_rtt_estimator(struct rxrpc_peer *peer, long sample_rtt_us)
srtt += m; /* rtt = 7/8 rtt + 1/8 new */
if (m < 0) {
m = -m; /* m is now abs(error) */
- m -= (peer->mdev_us >> 2); /* similar update on mdev */
+ m -= (call->mdev_us >> 2); /* similar update on mdev */
/* This is similar to one of Eifel findings.
* Eifel blocks mdev updates when rtt decreases.
* This solution is a bit different: we use finer gain
@@ -78,31 +78,31 @@ static void rxrpc_rtt_estimator(struct rxrpc_peer *peer, long sample_rtt_us)
if (m > 0)
m >>= 3;
} else {
- m -= (peer->mdev_us >> 2); /* similar update on mdev */
+ m -= (call->mdev_us >> 2); /* similar update on mdev */
}
- peer->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */
- if (peer->mdev_us > peer->mdev_max_us) {
- peer->mdev_max_us = peer->mdev_us;
- if (peer->mdev_max_us > peer->rttvar_us)
- peer->rttvar_us = peer->mdev_max_us;
+ call->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */
+ if (call->mdev_us > call->mdev_max_us) {
+ call->mdev_max_us = call->mdev_us;
+ if (call->mdev_max_us > call->rttvar_us)
+ call->rttvar_us = call->mdev_max_us;
}
} else {
/* no previous measure. */
srtt = m << 3; /* take the measured time to be rtt */
- peer->mdev_us = m << 1; /* make sure rto = 3*rtt */
- peer->rttvar_us = max(peer->mdev_us, rxrpc_rto_min_us(peer));
- peer->mdev_max_us = peer->rttvar_us;
+ call->mdev_us = m << 1; /* make sure rto = 3*rtt */
+ call->rttvar_us = umax(call->mdev_us, rxrpc_rto_min_us(call));
+ call->mdev_max_us = call->rttvar_us;
}
- peer->srtt_us = max(1U, srtt);
+ call->srtt_us = umax(srtt, 1);
}
/*
* Calculate rto without backoff. This is the second half of Van Jacobson's
* routine referred to above.
*/
-static void rxrpc_set_rto(struct rxrpc_peer *peer)
+static void rxrpc_set_rto(struct rxrpc_call *call)
{
u32 rto;
@@ -113,7 +113,7 @@ static void rxrpc_set_rto(struct rxrpc_peer *peer)
* is invisible. Actually, Linux-2.4 also generates erratic
* ACKs in some circumstances.
*/
- rto = __rxrpc_set_rto(peer);
+ rto = __rxrpc_set_rto(call);
/* 2. Fixups made earlier cannot be right.
* If we do not estimate RTO correctly without them,
@@ -124,61 +124,73 @@ static void rxrpc_set_rto(struct rxrpc_peer *peer)
/* NOTE: clamping at RXRPC_RTO_MIN is not required, current algo
* guarantees that rto is higher.
*/
- peer->rto_us = rxrpc_bound_rto(rto);
+ call->rto_us = rxrpc_bound_rto(rto);
}
-static void rxrpc_ack_update_rtt(struct rxrpc_peer *peer, long rtt_us)
+static void rxrpc_update_rtt_min(struct rxrpc_call *call, ktime_t resp_time, long rtt_us)
+{
+ /* Window size 5mins in approx usec (ipv4.sysctl_tcp_min_rtt_wlen) */
+ u32 wlen_us = 5ULL * NSEC_PER_SEC / 1024;
+
+ minmax_running_min(&call->min_rtt, wlen_us, resp_time / 1024,
+ (u32)rtt_us ? : jiffies_to_usecs(1));
+}
+
+static void rxrpc_ack_update_rtt(struct rxrpc_call *call, ktime_t resp_time, long rtt_us)
{
if (rtt_us < 0)
return;
- //rxrpc_update_rtt_min(peer, rtt_us);
- rxrpc_rtt_estimator(peer, rtt_us);
- rxrpc_set_rto(peer);
+ /* Update RACK min RTT [RFC8985 6.1 Step 1]. */
+ rxrpc_update_rtt_min(call, resp_time, rtt_us);
+
+ rxrpc_rtt_estimator(call, rtt_us);
+ rxrpc_set_rto(call);
- /* RFC6298: only reset backoff on valid RTT measurement. */
- peer->backoff = 0;
+ /* Only reset backoff on valid RTT measurement [RFC6298]. */
+ call->backoff = 0;
}
/*
* Add RTT information to cache. This is called in softirq mode and has
- * exclusive access to the peer RTT data.
+ * exclusive access to the call RTT data.
*/
-void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why,
+void rxrpc_call_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why,
int rtt_slot,
rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial,
ktime_t send_time, ktime_t resp_time)
{
- struct rxrpc_peer *peer = call->peer;
s64 rtt_us;
rtt_us = ktime_to_us(ktime_sub(resp_time, send_time));
if (rtt_us < 0)
return;
- spin_lock(&peer->rtt_input_lock);
- rxrpc_ack_update_rtt(peer, rtt_us);
- if (peer->rtt_count < 3)
- peer->rtt_count++;
- spin_unlock(&peer->rtt_input_lock);
+ rxrpc_ack_update_rtt(call, resp_time, rtt_us);
+ if (call->rtt_count < 3)
+ call->rtt_count++;
+ call->rtt_taken++;
+
+ WRITE_ONCE(call->peer->recent_srtt_us, call->srtt_us / 8);
+ WRITE_ONCE(call->peer->recent_rto_us, call->rto_us);
trace_rxrpc_rtt_rx(call, why, rtt_slot, send_serial, resp_serial,
- peer->srtt_us >> 3, peer->rto_us);
+ rtt_us, call->srtt_us, call->rto_us);
}
/*
* Get the retransmission timeout to set in nanoseconds, backing it off each
* time we retransmit.
*/
-ktime_t rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans)
+ktime_t rxrpc_get_rto_backoff(struct rxrpc_call *call, bool retrans)
{
u64 timo_us;
- u32 backoff = READ_ONCE(peer->backoff);
+ u32 backoff = READ_ONCE(call->backoff);
- timo_us = peer->rto_us;
+ timo_us = call->rto_us;
timo_us <<= backoff;
if (retrans && timo_us * 2 <= RXRPC_RTO_MAX)
- WRITE_ONCE(peer->backoff, backoff + 1);
+ WRITE_ONCE(call->backoff, backoff + 1);
if (timo_us < 1)
timo_us = 1;
@@ -186,10 +198,11 @@ ktime_t rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans)
return ns_to_ktime(timo_us * NSEC_PER_USEC);
}
-void rxrpc_peer_init_rtt(struct rxrpc_peer *peer)
+void rxrpc_call_init_rtt(struct rxrpc_call *call)
{
- peer->rto_us = RXRPC_TIMEOUT_INIT;
- peer->mdev_us = RXRPC_TIMEOUT_INIT;
- peer->backoff = 0;
- //minmax_reset(&peer->rtt_min, rxrpc_jiffies32, ~0U);
+ call->rtt_last_req = KTIME_MIN;
+ call->rto_us = RXRPC_TIMEOUT_INIT;
+ call->mdev_us = RXRPC_TIMEOUT_INIT;
+ call->backoff = 0;
+ //minmax_reset(&call->rtt_min, rxrpc_jiffies32, ~0U);
}
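In plain terms, the estimator above is the classic Jacobson/Karels smoothing, kept in 8x fixed point in srtt_us; roughly:

	srtt   <- 7/8 * srtt + 1/8 * m              m = new RTT sample
	mdev   <- 3/4 * mdev + 1/4 * |m - srtt|     smoothed deviation
	rttvar <- running max of mdev
	rto    <- srtt + rttvar, then bounded by rxrpc_bound_rto()

with the first sample seeding srtt = m and mdev = 2*m, so the initial rto comes out near 3*rtt as the in-code comment notes.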
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index 48a1475e6b06..62b09d23ec08 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -148,14 +148,14 @@ error:
static struct rxrpc_txbuf *rxkad_alloc_txbuf(struct rxrpc_call *call, size_t remain, gfp_t gfp)
{
struct rxrpc_txbuf *txb;
- size_t shdr, space;
+ size_t shdr, alloc, limit, part;
- remain = min(remain, 65535 - sizeof(struct rxrpc_wire_header));
+ remain = umin(remain, 65535 - sizeof(struct rxrpc_wire_header));
switch (call->conn->security_level) {
default:
- space = min_t(size_t, remain, RXRPC_JUMBO_DATALEN);
- return rxrpc_alloc_data_txbuf(call, space, 1, gfp);
+ alloc = umin(remain, RXRPC_JUMBO_DATALEN);
+ return rxrpc_alloc_data_txbuf(call, alloc, 1, gfp);
case RXRPC_SECURITY_AUTH:
shdr = sizeof(struct rxkad_level1_hdr);
break;
@@ -164,15 +164,21 @@ static struct rxrpc_txbuf *rxkad_alloc_txbuf(struct rxrpc_call *call, size_t rem
break;
}
- space = min_t(size_t, round_down(RXRPC_JUMBO_DATALEN, RXKAD_ALIGN), remain + shdr);
- space = round_up(space, RXKAD_ALIGN);
+ limit = round_down(RXRPC_JUMBO_DATALEN, RXKAD_ALIGN) - shdr;
+ if (remain < limit) {
+ part = remain;
+ alloc = round_up(shdr + part, RXKAD_ALIGN);
+ } else {
+ part = limit;
+ alloc = RXRPC_JUMBO_DATALEN;
+ }
- txb = rxrpc_alloc_data_txbuf(call, space, RXKAD_ALIGN, gfp);
+ txb = rxrpc_alloc_data_txbuf(call, alloc, RXKAD_ALIGN, gfp);
if (!txb)
return NULL;
txb->offset += shdr;
- txb->space -= shdr;
+ txb->space = part;
return txb;
}
@@ -263,13 +269,13 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
check = txb->seq ^ call->call_id;
hdr->data_size = htonl((u32)check << 16 | txb->len);
- txb->len += sizeof(struct rxkad_level1_hdr);
- pad = txb->len;
+ txb->pkt_len = sizeof(struct rxkad_level1_hdr) + txb->len;
+ pad = txb->pkt_len;
pad = RXKAD_ALIGN - pad;
pad &= RXKAD_ALIGN - 1;
if (pad) {
memset(txb->kvec[0].iov_base + txb->offset, 0, pad);
- txb->len += pad;
+ txb->pkt_len += pad;
}
/* start the encryption afresh */
@@ -298,7 +304,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
struct rxkad_level2_hdr *rxkhdr = (void *)(whdr + 1);
struct rxrpc_crypt iv;
struct scatterlist sg;
- size_t pad;
+ size_t content, pad;
u16 check;
int ret;
@@ -309,23 +315,20 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
rxkhdr->data_size = htonl(txb->len | (u32)check << 16);
rxkhdr->checksum = 0;
- txb->len += sizeof(struct rxkad_level2_hdr);
- pad = txb->len;
- pad = RXKAD_ALIGN - pad;
- pad &= RXKAD_ALIGN - 1;
- if (pad) {
+ content = sizeof(struct rxkad_level2_hdr) + txb->len;
+ txb->pkt_len = round_up(content, RXKAD_ALIGN);
+ pad = txb->pkt_len - content;
+ if (pad)
memset(txb->kvec[0].iov_base + txb->offset, 0, pad);
- txb->len += pad;
- }
/* encrypt from the session key */
token = call->conn->key->payload.data[0];
memcpy(&iv, token->kad->session_key, sizeof(iv));
- sg_init_one(&sg, rxkhdr, txb->len);
+ sg_init_one(&sg, rxkhdr, txb->pkt_len);
skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher);
skcipher_request_set_callback(req, 0, NULL, NULL);
- skcipher_request_set_crypt(req, &sg, &sg, txb->len, iv.x);
+ skcipher_request_set_crypt(req, &sg, &sg, txb->pkt_len, iv.x);
ret = crypto_skcipher_encrypt(req);
skcipher_request_zero(req);
return ret;
@@ -384,19 +387,33 @@ static int rxkad_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
switch (call->conn->security_level) {
case RXRPC_SECURITY_PLAIN:
+ txb->pkt_len = txb->len;
ret = 0;
break;
case RXRPC_SECURITY_AUTH:
ret = rxkad_secure_packet_auth(call, txb, req);
+ if (txb->alloc_size == RXRPC_JUMBO_DATALEN)
+ txb->jumboable = true;
break;
case RXRPC_SECURITY_ENCRYPT:
ret = rxkad_secure_packet_encrypt(call, txb, req);
+ if (txb->alloc_size == RXRPC_JUMBO_DATALEN)
+ txb->jumboable = true;
break;
default:
ret = -EPERM;
break;
}
+ /* Clear excess space in the packet */
+ if (txb->pkt_len < txb->alloc_size) {
+ struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base;
+ size_t gap = txb->alloc_size - txb->pkt_len;
+ void *p = whdr + 1;
+
+ memset(p + txb->pkt_len, 0, gap);
+ }
+
skcipher_request_free(req);
_leave(" = %d [set %x]", ret, y);
return ret;
diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c
index 085e7892d310..7ef93407be83 100644
--- a/net/rxrpc/rxperf.c
+++ b/net/rxrpc/rxperf.c
@@ -503,7 +503,7 @@ static int rxperf_process_call(struct rxperf_call *call)
reply_len + sizeof(rxperf_magic_cookie));
while (reply_len > 0) {
- len = min_t(size_t, reply_len, PAGE_SIZE);
+ len = umin(reply_len, PAGE_SIZE);
bvec_set_page(&bv, ZERO_PAGE(0), len, 0);
iov_iter_bvec(&msg.msg_iter, WRITE, &bv, 1, len);
msg.msg_flags = MSG_MORE;
diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c
index cb8dd1d3b1d4..9784adc8f275 100644
--- a/net/rxrpc/security.c
+++ b/net/rxrpc/security.c
@@ -114,10 +114,10 @@ found:
if (conn->state == RXRPC_CONN_CLIENT_UNSECURED) {
ret = conn->security->init_connection_security(conn, token);
if (ret == 0) {
- spin_lock(&conn->state_lock);
+ spin_lock_irq(&conn->state_lock);
if (conn->state == RXRPC_CONN_CLIENT_UNSECURED)
conn->state = RXRPC_CONN_CLIENT;
- spin_unlock(&conn->state_lock);
+ spin_unlock_irq(&conn->state_lock);
}
}
mutex_unlock(&conn->security_lock);
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index 6abb8eec1b2b..c4c8b718cafa 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -94,9 +94,11 @@ no_wait:
*/
static bool rxrpc_check_tx_space(struct rxrpc_call *call, rxrpc_seq_t *_tx_win)
{
+ rxrpc_seq_t tx_bottom = READ_ONCE(call->tx_bottom);
+
if (_tx_win)
- *_tx_win = call->tx_bottom;
- return call->tx_prepared - call->tx_bottom < 256;
+ *_tx_win = tx_bottom;
+ return call->send_top - tx_bottom < 256;
}
/*
@@ -132,13 +134,13 @@ static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx,
rxrpc_seq_t tx_start, tx_win;
signed long rtt, timeout;
- rtt = READ_ONCE(call->peer->srtt_us) >> 3;
+ rtt = READ_ONCE(call->srtt_us) >> 3;
rtt = usecs_to_jiffies(rtt) * 2;
if (rtt < 2)
rtt = 2;
timeout = rtt;
- tx_start = smp_load_acquire(&call->acks_hard_ack);
+ tx_start = READ_ONCE(call->tx_bottom);
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
@@ -195,8 +197,8 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx,
DECLARE_WAITQUEUE(myself, current);
int ret;
- _enter(",{%u,%u,%u,%u}",
- call->tx_bottom, call->acks_hard_ack, call->tx_top, call->tx_winsize);
+ _enter(",{%u,%u,%u}",
+ call->tx_bottom, call->tx_top, call->tx_winsize);
add_wait_queue(&call->waitq, &myself);
@@ -240,37 +242,76 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call,
struct rxrpc_txbuf *txb,
rxrpc_notify_end_tx_t notify_end_tx)
{
+ struct rxrpc_txqueue *sq = call->send_queue;
rxrpc_seq_t seq = txb->seq;
bool poke, last = txb->flags & RXRPC_LAST_PACKET;
-
+ int ix = seq & RXRPC_TXQ_MASK;
rxrpc_inc_stat(call->rxnet, stat_tx_data);
- ASSERTCMP(txb->seq, ==, call->tx_prepared + 1);
-
- /* We have to set the timestamp before queueing as the retransmit
- * algorithm can see the packet as soon as we queue it.
- */
- txb->last_sent = ktime_get_real();
+ ASSERTCMP(txb->seq, ==, call->send_top + 1);
if (last)
trace_rxrpc_txqueue(call, rxrpc_txqueue_queue_last);
else
trace_rxrpc_txqueue(call, rxrpc_txqueue_queue);
+ if (WARN_ON_ONCE(sq->bufs[ix]))
+ trace_rxrpc_tq(call, sq, seq, rxrpc_tq_queue_dup);
+ else
+ trace_rxrpc_tq(call, sq, seq, rxrpc_tq_queue);
+
/* Add the packet to the call's output buffer */
- spin_lock(&call->tx_lock);
- poke = list_empty(&call->tx_sendmsg);
- list_add_tail(&txb->call_link, &call->tx_sendmsg);
- call->tx_prepared = seq;
- if (last)
+ poke = (READ_ONCE(call->tx_bottom) == call->send_top);
+ sq->bufs[ix] = txb;
+ /* Order send_top after the queue->next pointer and txb content. */
+ smp_store_release(&call->send_top, seq);
+ if (last) {
rxrpc_notify_end_tx(rx, call, notify_end_tx);
- spin_unlock(&call->tx_lock);
+ call->send_queue = NULL;
+ }
if (poke)
rxrpc_poke_call(call, rxrpc_call_poke_start);
}
/*
+ * Allocate a new txqueue unit and add it to the transmission queue.
+ */
+static int rxrpc_alloc_txqueue(struct sock *sk, struct rxrpc_call *call)
+{
+ struct rxrpc_txqueue *tq;
+
+ tq = kzalloc(sizeof(*tq), sk->sk_allocation);
+ if (!tq)
+ return -ENOMEM;
+
+ tq->xmit_ts_base = KTIME_MIN;
+ for (int i = 0; i < RXRPC_NR_TXQUEUE; i++)
+ tq->segment_xmit_ts[i] = UINT_MAX;
+
+ if (call->send_queue) {
+ tq->qbase = call->send_top + 1;
+ call->send_queue->next = tq;
+ call->send_queue = tq;
+ } else if (WARN_ON(call->tx_queue)) {
+ kfree(tq);
+ return -ENOMEM;
+ } else {
+ /* We start at seq 1, so pretend seq 0 is hard-acked. */
+ tq->nr_reported_acks = 1;
+ tq->segment_acked = 1UL;
+ tq->qbase = 0;
+ call->tx_qbase = 0;
+ call->send_queue = tq;
+ call->tx_qtail = tq;
+ call->tx_queue = tq;
+ }
+
+ trace_rxrpc_tq(call, tq, call->send_top, rxrpc_tq_alloc);
+ return 0;
+}
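A hedged example of how sequence numbers map onto these queue units (assuming RXRPC_NR_TXQUEUE is 64 on a 64-bit build, so RXRPC_TXQ_MASK is 63): DATA seq 130 lives in the rxrpc_txqueue whose qbase is 128, at bufs[130 & 63] = bufs[2], and its per-segment metadata (segment_xmit_ts, segment_serial, the acked/lost/retransmitted bitmaps) sits in that queue struct rather than in the txbuf itself.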
+
+/*
* send data through a socket
* - must be called in process context
* - The caller holds the call user access mutex, but not the socket lock.
@@ -344,6 +385,13 @@ reload:
if (!rxrpc_check_tx_space(call, NULL))
goto wait_for_space;
+ /* See if we need to begin/extend the Tx queue. */
+ if (!call->send_queue || !((call->send_top + 1) & RXRPC_TXQ_MASK)) {
+ ret = rxrpc_alloc_txqueue(sk, call);
+ if (ret < 0)
+ goto maybe_error;
+ }
+
/* Work out the maximum size of a packet. Assume that
* the security header is going to be in the padded
* region (enc blocksize), but the trailer is not.
@@ -360,7 +408,7 @@ reload:
/* append next segment of data to the current buffer */
if (msg_data_left(msg) > 0) {
- size_t copy = min_t(size_t, txb->space, msg_data_left(msg));
+ size_t copy = umin(txb->space, msg_data_left(msg));
_debug("add %zu", copy);
if (!copy_from_iter_full(txb->kvec[0].iov_base + txb->offset,
@@ -385,16 +433,12 @@ reload:
(msg_data_left(msg) == 0 && !more)) {
if (msg_data_left(msg) == 0 && !more)
txb->flags |= RXRPC_LAST_PACKET;
- else if (call->tx_top - call->acks_hard_ack <
- call->tx_winsize)
- txb->flags |= RXRPC_MORE_PACKETS;
ret = call->security->secure_packet(call, txb);
if (ret < 0)
goto out;
txb->kvec[0].iov_len += txb->len;
- txb->len = txb->kvec[0].iov_len;
rxrpc_queue_packet(rx, call, txb, notify_end_tx);
txb = NULL;
}
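The sendmsg loop modified above fills each buffer with umin(txb->space, msg_data_left(msg)) bytes and only hands it to the queueing path once it is full or the message is exhausted with no MSG_MORE pending, tagging that final buffer as the last packet. A compact userspace analogue of this fill-then-queue chunking strategy, with made-up sizes and names:

/* Sketch of the fill-then-queue chunking loop; names and sizes are
 * illustrative only. */
#include <stdio.h>

#define BUF_SPACE 1412			/* assumed per-buffer payload space */

struct chunk {
	size_t len;
	int last;
};

static void queue_chunk(const struct chunk *c, int n)
{
	printf("chunk %d: %zu bytes%s\n", n, c->len, c->last ? " (last)" : "");
}

int main(void)
{
	size_t msg_left = 5000;		/* pretend sendmsg() payload size */
	int n = 0;

	while (msg_left) {
		struct chunk c = { .len = 0, .last = 0 };

		/* Copy as much as fits: min(space, data left). */
		size_t copy = msg_left < BUF_SPACE ? msg_left : BUF_SPACE;

		c.len = copy;
		msg_left -= copy;

		/* Queue when the buffer is full or the message is done;
		 * the final buffer carries the last-packet flag. */
		if (msg_left == 0)
			c.last = 1;
		queue_chunk(&c, n++);
	}
	return 0;
}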
diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c
index 9bf9a1f6e4cb..46a20cf4c402 100644
--- a/net/rxrpc/sysctl.c
+++ b/net/rxrpc/sysctl.c
@@ -11,6 +11,8 @@
#include "ar-internal.h"
static struct ctl_table_header *rxrpc_sysctl_reg_table;
+static const unsigned int rxrpc_rx_mtu_min = 500;
+static const unsigned int rxrpc_jumbo_max = RXRPC_MAX_NR_JUMBO;
static const unsigned int four = 4;
static const unsigned int max_backlog = RXRPC_BACKLOG_MAX - 1;
static const unsigned int n_65535 = 65535;
@@ -115,7 +117,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = (void *)SYSCTL_ONE,
+ .extra1 = (void *)&rxrpc_rx_mtu_min,
.extra2 = (void *)&n_65535,
},
{
@@ -125,7 +127,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = (void *)SYSCTL_ONE,
- .extra2 = (void *)&four,
+ .extra2 = (void *)&rxrpc_jumbo_max,
},
};
diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c
index c3913d8a50d3..131d9e55c8e9 100644
--- a/net/rxrpc/txbuf.c
+++ b/net/rxrpc/txbuf.c
@@ -24,7 +24,7 @@ struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_
size_t total, hoff;
void *buf;
- txb = kmalloc(sizeof(*txb), gfp);
+ txb = kzalloc(sizeof(*txb), gfp);
if (!txb)
return NULL;
@@ -43,20 +43,14 @@ struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_
whdr = buf + hoff;
- INIT_LIST_HEAD(&txb->call_link);
- INIT_LIST_HEAD(&txb->tx_link);
refcount_set(&txb->ref, 1);
- txb->last_sent = KTIME_MIN;
txb->call_debug_id = call->debug_id;
txb->debug_id = atomic_inc_return(&rxrpc_txbuf_debug_ids);
+ txb->alloc_size = data_size;
txb->space = data_size;
- txb->len = 0;
txb->offset = sizeof(*whdr);
txb->flags = call->conn->out_clientflag;
- txb->ack_why = 0;
- txb->seq = call->tx_prepared + 1;
- txb->serial = 0;
- txb->cksum = 0;
+ txb->seq = call->send_top + 1;
txb->nr_kvec = 1;
txb->kvec[0].iov_base = whdr;
txb->kvec[0].iov_len = sizeof(*whdr);
@@ -79,84 +73,6 @@ struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_
return txb;
}
-/*
- * Allocate and partially initialise an ACK packet.
- */
-struct rxrpc_txbuf *rxrpc_alloc_ack_txbuf(struct rxrpc_call *call, size_t sack_size)
-{
- struct rxrpc_wire_header *whdr;
- struct rxrpc_acktrailer *trailer;
- struct rxrpc_ackpacket *ack;
- struct rxrpc_txbuf *txb;
- gfp_t gfp = rcu_read_lock_held() ? GFP_ATOMIC | __GFP_NOWARN : GFP_NOFS;
- void *buf, *buf2 = NULL;
- u8 *filler;
-
- txb = kmalloc(sizeof(*txb), gfp);
- if (!txb)
- return NULL;
-
- buf = page_frag_alloc(&call->local->tx_alloc,
- sizeof(*whdr) + sizeof(*ack) + 1 + 3 + sizeof(*trailer), gfp);
- if (!buf) {
- kfree(txb);
- return NULL;
- }
-
- if (sack_size) {
- buf2 = page_frag_alloc(&call->local->tx_alloc, sack_size, gfp);
- if (!buf2) {
- page_frag_free(buf);
- kfree(txb);
- return NULL;
- }
- }
-
- whdr = buf;
- ack = buf + sizeof(*whdr);
- filler = buf + sizeof(*whdr) + sizeof(*ack) + 1;
- trailer = buf + sizeof(*whdr) + sizeof(*ack) + 1 + 3;
-
- INIT_LIST_HEAD(&txb->call_link);
- INIT_LIST_HEAD(&txb->tx_link);
- refcount_set(&txb->ref, 1);
- txb->call_debug_id = call->debug_id;
- txb->debug_id = atomic_inc_return(&rxrpc_txbuf_debug_ids);
- txb->space = 0;
- txb->len = sizeof(*whdr) + sizeof(*ack) + 3 + sizeof(*trailer);
- txb->offset = 0;
- txb->flags = call->conn->out_clientflag;
- txb->ack_rwind = 0;
- txb->seq = 0;
- txb->serial = 0;
- txb->cksum = 0;
- txb->nr_kvec = 3;
- txb->kvec[0].iov_base = whdr;
- txb->kvec[0].iov_len = sizeof(*whdr) + sizeof(*ack);
- txb->kvec[1].iov_base = buf2;
- txb->kvec[1].iov_len = sack_size;
- txb->kvec[2].iov_base = filler;
- txb->kvec[2].iov_len = 3 + sizeof(*trailer);
-
- whdr->epoch = htonl(call->conn->proto.epoch);
- whdr->cid = htonl(call->cid);
- whdr->callNumber = htonl(call->call_id);
- whdr->seq = 0;
- whdr->type = RXRPC_PACKET_TYPE_ACK;
- whdr->flags = 0;
- whdr->userStatus = 0;
- whdr->securityIndex = call->security_ix;
- whdr->_rsvd = 0;
- whdr->serviceId = htons(call->dest_srx.srx_service);
-
- get_page(virt_to_head_page(trailer));
-
- trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 1,
- rxrpc_txbuf_alloc_ack);
- atomic_inc(&rxrpc_nr_txbuf);
- return txb;
-}
-
void rxrpc_get_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what)
{
int r;
@@ -179,7 +95,8 @@ static void rxrpc_free_txbuf(struct rxrpc_txbuf *txb)
trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 0,
rxrpc_txbuf_free);
for (i = 0; i < txb->nr_kvec; i++)
- if (txb->kvec[i].iov_base)
+ if (txb->kvec[i].iov_base &&
+ !is_zero_pfn(page_to_pfn(virt_to_page(txb->kvec[i].iov_base))))
page_frag_free(txb->kvec[i].iov_base);
kfree(txb);
atomic_dec(&rxrpc_nr_txbuf);
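The is_zero_pfn() test added above keeps the free loop from handing a pointer into the shared zero page to page_frag_free(): a kvec that was simply pointed at zero-filled padding, rather than at a page-frag allocation of its own, must be skipped. A userspace analogue of excluding a shared, statically owned region from per-buffer freeing (all names here are assumptions for illustration):

/* Sketch: skip kvec-like entries that alias a shared padding buffer
 * instead of owning their own allocation.  Illustrative only. */
#include <stdlib.h>
#include <stdio.h>

#define NR_VEC 3

static char shared_padding[64];		/* stands in for the zero page */

struct vec {
	void *base;
	size_t len;
};

static void free_vecs(struct vec *v, int n)
{
	for (int i = 0; i < n; i++) {
		/* Only free memory this buffer actually owns. */
		if (v[i].base && v[i].base != (void *)shared_padding)
			free(v[i].base);
	}
}

int main(void)
{
	struct vec v[NR_VEC] = {
		{ malloc(128), 128 },		/* owned allocation */
		{ shared_padding, 16 },		/* shared padding, not owned */
		{ NULL, 0 },			/* unused */
	};

	free_vecs(v, NR_VEC);
	puts("freed owned buffers, skipped shared padding");
	return 0;
}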
@@ -202,37 +119,3 @@ void rxrpc_put_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what)
rxrpc_free_txbuf(txb);
}
}
-
-/*
- * Shrink the transmit buffer.
- */
-void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *call)
-{
- struct rxrpc_txbuf *txb;
- rxrpc_seq_t hard_ack = smp_load_acquire(&call->acks_hard_ack);
- bool wake = false;
-
- _enter("%x/%x/%x", call->tx_bottom, call->acks_hard_ack, call->tx_top);
-
- while ((txb = list_first_entry_or_null(&call->tx_buffer,
- struct rxrpc_txbuf, call_link))) {
- hard_ack = smp_load_acquire(&call->acks_hard_ack);
- if (before(hard_ack, txb->seq))
- break;
-
- if (txb->seq != call->tx_bottom + 1)
- rxrpc_see_txbuf(txb, rxrpc_txbuf_see_out_of_step);
- ASSERTCMP(txb->seq, ==, call->tx_bottom + 1);
- smp_store_release(&call->tx_bottom, call->tx_bottom + 1);
- list_del_rcu(&txb->call_link);
-
- trace_rxrpc_txqueue(call, rxrpc_txqueue_dequeue);
-
- rxrpc_put_txbuf(txb, rxrpc_txbuf_put_rotated);
- if (after(call->acks_hard_ack, call->tx_bottom + 128))
- wake = true;
- }
-
- if (wake)
- wake_up(&call->waitq);
-}