diff options
Diffstat (limited to 'net/rxrpc')
-rw-r--r-- | net/rxrpc/Makefile | 1 | ||||
-rw-r--r-- | net/rxrpc/af_rxrpc.c | 25 | ||||
-rw-r--r-- | net/rxrpc/ar-internal.h | 420 | ||||
-rw-r--r-- | net/rxrpc/call_accept.c | 22 | ||||
-rw-r--r-- | net/rxrpc/call_event.c | 572 | ||||
-rw-r--r-- | net/rxrpc/call_object.c | 127 | ||||
-rw-r--r-- | net/rxrpc/conn_client.c | 30 | ||||
-rw-r--r-- | net/rxrpc/conn_event.c | 71 | ||||
-rw-r--r-- | net/rxrpc/conn_object.c | 26 | ||||
-rw-r--r-- | net/rxrpc/input.c | 821 | ||||
-rw-r--r-- | net/rxrpc/input_rack.c | 418 | ||||
-rw-r--r-- | net/rxrpc/insecure.c | 14 | ||||
-rw-r--r-- | net/rxrpc/io_thread.c | 134 | ||||
-rw-r--r-- | net/rxrpc/local_object.c | 6 | ||||
-rw-r--r-- | net/rxrpc/misc.c | 12 | ||||
-rw-r--r-- | net/rxrpc/output.c | 763 | ||||
-rw-r--r-- | net/rxrpc/peer_event.c | 130 | ||||
-rw-r--r-- | net/rxrpc/peer_object.c | 37 | ||||
-rw-r--r-- | net/rxrpc/proc.c | 69 | ||||
-rw-r--r-- | net/rxrpc/protocol.h | 19 | ||||
-rw-r--r-- | net/rxrpc/recvmsg.c | 18 | ||||
-rw-r--r-- | net/rxrpc/rtt.c | 121 | ||||
-rw-r--r-- | net/rxrpc/rxkad.c | 91 | ||||
-rw-r--r-- | net/rxrpc/rxperf.c | 14 | ||||
-rw-r--r-- | net/rxrpc/security.c | 4 | ||||
-rw-r--r-- | net/rxrpc/sendmsg.c | 172 | ||||
-rw-r--r-- | net/rxrpc/sysctl.c | 23 | ||||
-rw-r--r-- | net/rxrpc/txbuf.c | 114 |
28 files changed, 2712 insertions, 1562 deletions
diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile index ac5caf5a48e1..210b75e3179e 100644 --- a/net/rxrpc/Makefile +++ b/net/rxrpc/Makefile @@ -16,6 +16,7 @@ rxrpc-y := \ conn_object.o \ conn_service.o \ input.o \ + input_rack.o \ insecure.o \ io_thread.o \ key.o \ diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 465bfe5eb061..86873399f7d5 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -65,7 +65,7 @@ static void rxrpc_write_space(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible(&wq->wait); - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); } @@ -408,9 +408,9 @@ void rxrpc_kernel_shutdown_call(struct socket *sock, struct rxrpc_call *call) /* Make sure we're not going to call back into a kernel service */ if (call->notify_rx) { - spin_lock(&call->notify_lock); + spin_lock_irq(&call->notify_lock); call->notify_rx = rxrpc_dummy_notify_rx; - spin_unlock(&call->notify_lock); + spin_unlock_irq(&call->notify_lock); } } mutex_unlock(&call->user_mutex); @@ -487,7 +487,7 @@ EXPORT_SYMBOL(rxrpc_kernel_new_call_notification); * rxrpc_kernel_set_max_life - Set maximum lifespan on a call * @sock: The socket the call is on * @call: The call to configure - * @hard_timeout: The maximum lifespan of the call in jiffies + * @hard_timeout: The maximum lifespan of the call in ms * * Set the maximum lifespan of a call. The call will end with ETIME or * ETIMEDOUT if it takes longer than this. @@ -495,14 +495,14 @@ EXPORT_SYMBOL(rxrpc_kernel_new_call_notification); void rxrpc_kernel_set_max_life(struct socket *sock, struct rxrpc_call *call, unsigned long hard_timeout) { - unsigned long now; + ktime_t delay = ms_to_ktime(hard_timeout), expect_term_by; mutex_lock(&call->user_mutex); - now = jiffies; - hard_timeout += now; - WRITE_ONCE(call->expect_term_by, hard_timeout); - rxrpc_reduce_call_timer(call, hard_timeout, now, rxrpc_timer_set_for_hard); + expect_term_by = ktime_add(ktime_get_real(), delay); + WRITE_ONCE(call->expect_term_by, expect_term_by); + trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_hard); + rxrpc_poke_call(call, rxrpc_call_poke_set_timeout); mutex_unlock(&call->user_mutex); } @@ -707,9 +707,10 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, ret = -EISCONN; if (rx->sk.sk_state != RXRPC_UNBOUND) goto error; - ret = copy_from_sockptr(&min_sec_level, optval, - sizeof(unsigned int)); - if (ret < 0) + ret = copy_safe_from_sockptr(&min_sec_level, + sizeof(min_sec_level), + optval, optlen); + if (ret) goto error; ret = -EINVAL; if (min_sec_level > RXRPC_SECURITY_MAX) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 7818aae1be8e..a64a0cab1bf7 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -30,6 +30,7 @@ struct rxrpc_crypt { struct key_preparsed_payload; struct rxrpc_connection; struct rxrpc_txbuf; +struct rxrpc_txqueue; /* * Mark applied to socket buffers in skb->mark. skb->priority is used @@ -98,6 +99,7 @@ struct rxrpc_net { atomic_t stat_tx_data_send; atomic_t stat_tx_data_send_frag; atomic_t stat_tx_data_send_fail; + atomic_t stat_tx_data_send_msgsize; atomic_t stat_tx_data_underflow; atomic_t stat_tx_data_cwnd_reset; atomic_t stat_rx_data; @@ -109,6 +111,8 @@ struct rxrpc_net { atomic_t stat_tx_ack_skip; atomic_t stat_tx_acks[256]; atomic_t stat_rx_acks[256]; + atomic_t stat_tx_jumbo[10]; + atomic_t stat_rx_jumbo[10]; atomic_t stat_why_req_ack[8]; @@ -198,8 +202,8 @@ struct rxrpc_host_header { * - max 48 bytes (struct sk_buff::cb) */ struct rxrpc_skb_priv { - struct rxrpc_connection *conn; /* Connection referred to (poke packet) */ union { + struct rxrpc_connection *conn; /* Connection referred to (poke packet) */ struct { u16 offset; /* Offset of data */ u16 len; /* Length of data */ @@ -208,9 +212,11 @@ struct rxrpc_skb_priv { }; struct { rxrpc_seq_t first_ack; /* First packet in acks table */ - u8 nr_acks; /* Number of acks+nacks */ - u8 nr_nacks; /* Number of nacks */ - }; + rxrpc_seq_t prev_ack; /* Highest seq seen */ + rxrpc_serial_t acked_serial; /* Packet in response to (or 0) */ + u16 nr_acks; /* Number of acks+nacks */ + u8 reason; /* Reason for ack */ + } ack; }; struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */ }; @@ -248,10 +254,9 @@ struct rxrpc_security { struct rxrpc_key_token *); /* Work out how much data we can store in a packet, given an estimate - * of the amount of data remaining. + * of the amount of data remaining and allocate a data buffer. */ - int (*how_much_data)(struct rxrpc_call *, size_t, - size_t *, size_t *, size_t *); + struct rxrpc_txbuf *(*alloc_txbuf)(struct rxrpc_call *call, size_t remaining, gfp_t gfp); /* impose security on a packet */ int (*secure_packet)(struct rxrpc_call *, struct rxrpc_txbuf *); @@ -292,6 +297,7 @@ struct rxrpc_local { struct socket *socket; /* my UDP socket */ struct task_struct *io_thread; struct completion io_thread_ready; /* Indication that the I/O thread started */ + struct page_frag_cache tx_alloc; /* Tx control packet allocation (I/O thread only) */ struct rxrpc_sock *service; /* Service(s) listening on this endpoint */ #ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY struct sk_buff_head rx_delay_queue; /* Delay injection queue */ @@ -317,6 +323,12 @@ struct rxrpc_local { struct list_head new_client_calls; /* Newly created client calls need connection */ spinlock_t client_call_lock; /* Lock for ->new_client_calls */ struct sockaddr_rxrpc srx; /* local address */ + /* Provide a kvec table sufficiently large to manage either a DATA + * packet with a maximum set of jumbo subpackets or a PING ACK padded + * out to 64K with zeropages for PMTUD. + */ + struct kvec kvec[1 + RXRPC_MAX_NR_JUMBO > 3 + 16 ? + 1 + RXRPC_MAX_NR_JUMBO : 3 + 16]; }; /* @@ -335,25 +347,27 @@ struct rxrpc_peer { time64_t last_tx_at; /* Last time packet sent here */ seqlock_t service_conn_lock; spinlock_t lock; /* access lock */ - unsigned int if_mtu; /* interface MTU for this peer */ - unsigned int mtu; /* network MTU for this peer */ - unsigned int maxdata; /* data size (MTU - hdrsize) */ - unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */ int debug_id; /* debug ID for printks */ struct sockaddr_rxrpc srx; /* remote address */ - /* calculated RTT cache */ -#define RXRPC_RTT_CACHE_SIZE 32 - spinlock_t rtt_input_lock; /* RTT lock for input routine */ - ktime_t rtt_last_req; /* Time of last RTT request */ - unsigned int rtt_count; /* Number of samples we've got */ + /* Path MTU discovery [RFC8899] */ + unsigned int pmtud_trial; /* Current MTU probe size */ + unsigned int pmtud_good; /* Largest working MTU probe we've tried */ + unsigned int pmtud_bad; /* Smallest non-working MTU probe we've tried */ + bool pmtud_lost; /* T if MTU probe was lost */ + bool pmtud_probing; /* T if we have an active probe outstanding */ + bool pmtud_pending; /* T if a call to this peer should send a probe */ + u8 pmtud_jumbo; /* Max jumbo packets for the MTU */ + bool ackr_adv_pmtud; /* T if the peer advertises path-MTU */ + unsigned int ackr_max_data; /* Maximum data advertised by peer */ + unsigned int if_mtu; /* Local interface MTU (- hdrsize) for this peer */ + unsigned int max_data; /* Maximum packet data capacity for this peer */ + unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */ + unsigned short tx_seg_max; /* Maximum number of transmissable segments */ - u32 srtt_us; /* smoothed round trip time << 3 in usecs */ - u32 mdev_us; /* medium deviation */ - u32 mdev_max_us; /* maximal mdev for the last rtt period */ - u32 rttvar_us; /* smoothed mdev_max */ - u32 rto_j; /* Retransmission timeout in jiffies */ - u8 backoff; /* Backoff timeout */ + /* Calculated RTT cache */ + unsigned int recent_srtt_us; + unsigned int recent_rto_us; u8 cong_ssthresh; /* Congestion slow-start threshold */ }; @@ -500,6 +514,8 @@ struct rxrpc_connection { struct list_head proc_link; /* link in procfs list */ struct list_head link; /* link in master connection list */ struct sk_buff_head rx_queue; /* received conn-level packets */ + struct page_frag_cache tx_data_alloc; /* Tx DATA packet allocation */ + struct mutex tx_data_alloc_lock; struct mutex security_lock; /* Lock for security management */ const struct rxrpc_security *security; /* applied security module */ @@ -520,6 +536,8 @@ struct rxrpc_connection { int debug_id; /* debug ID for printks */ rxrpc_serial_t tx_serial; /* Outgoing packet serial number counter */ unsigned int hi_serial; /* highest serial number received */ + rxrpc_serial_t pmtud_probe; /* Serial of MTU probe (or 0) */ + unsigned int pmtud_call; /* ID of call used for probe */ u32 service_id; /* Service ID, possibly upgraded */ u32 security_level; /* Security level selected */ u8 security_ix; /* security type */ @@ -552,6 +570,7 @@ enum rxrpc_call_flag { RXRPC_CALL_RX_LAST, /* Received the last packet (at rxtx_top) */ RXRPC_CALL_TX_LAST, /* Last packet in Tx buffer (at rxtx_top) */ RXRPC_CALL_TX_ALL_ACKED, /* Last packet has been hard-acked */ + RXRPC_CALL_TX_NO_MORE, /* No more data to transmit (MSG_MORE deasserted) */ RXRPC_CALL_SEND_PING, /* A ping will need to be sent */ RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */ RXRPC_CALL_BEGAN_RX_TIMER, /* We began the expect_rx_by timer */ @@ -562,6 +581,7 @@ enum rxrpc_call_flag { RXRPC_CALL_EXCLUSIVE, /* The call uses a once-only connection */ RXRPC_CALL_RX_IS_IDLE, /* recvmsg() is idle - send an ACK */ RXRPC_CALL_RECVMSG_READ_ALL, /* recvmsg() read all of the received data */ + RXRPC_CALL_CONN_CHALLENGING, /* The connection is being challenged */ }; /* @@ -582,7 +602,6 @@ enum rxrpc_call_state { RXRPC_CALL_CLIENT_AWAIT_REPLY, /* - client awaiting reply */ RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */ RXRPC_CALL_SERVER_PREALLOC, /* - service preallocation */ - RXRPC_CALL_SERVER_SECURING, /* - server securing request connection */ RXRPC_CALL_SERVER_RECV_REQUEST, /* - server receiving request */ RXRPC_CALL_SERVER_ACK_REQUEST, /* - server pending ACK of request */ RXRPC_CALL_SERVER_SEND_REPLY, /* - server sending reply */ @@ -594,13 +613,25 @@ enum rxrpc_call_state { /* * Call Tx congestion management modes. */ -enum rxrpc_congest_mode { - RXRPC_CALL_SLOW_START, - RXRPC_CALL_CONGEST_AVOIDANCE, - RXRPC_CALL_PACKET_LOSS, - RXRPC_CALL_FAST_RETRANSMIT, - NR__RXRPC_CONGEST_MODES -}; +enum rxrpc_ca_state { + RXRPC_CA_SLOW_START, + RXRPC_CA_CONGEST_AVOIDANCE, + RXRPC_CA_PACKET_LOSS, + RXRPC_CA_FAST_RETRANSMIT, + NR__RXRPC_CA_STATES +} __mode(byte); + +/* + * Current purpose of call RACK timer. According to the RACK-TLP protocol + * [RFC8985], the transmission timer (call->rack_timo_at) may only be used for + * one of these at once. + */ +enum rxrpc_rack_timer_mode { + RXRPC_CALL_RACKTIMER_OFF, /* Timer not running */ + RXRPC_CALL_RACKTIMER_RACK_REORDER, /* RACK reordering timer */ + RXRPC_CALL_RACKTIMER_TLP_PTO, /* TLP timeout */ + RXRPC_CALL_RACKTIMER_RTO, /* Retransmission timeout */ +} __mode(byte); /* * RxRPC call definition @@ -618,17 +649,16 @@ struct rxrpc_call { const struct rxrpc_security *security; /* applied security module */ struct mutex user_mutex; /* User access mutex */ struct sockaddr_rxrpc dest_srx; /* Destination address */ - unsigned long delay_ack_at; /* When DELAY ACK needs to happen */ - unsigned long ack_lost_at; /* When ACK is figured as lost */ - unsigned long resend_at; /* When next resend needs to happen */ - unsigned long ping_at; /* When next to send a ping */ - unsigned long keepalive_at; /* When next to send a keepalive ping */ - unsigned long expect_rx_by; /* When we expect to get a packet by */ - unsigned long expect_req_by; /* When we expect to get a request DATA packet by */ - unsigned long expect_term_by; /* When we expect call termination by */ - u32 next_rx_timo; /* Timeout for next Rx packet (jif) */ - u32 next_req_timo; /* Timeout for next Rx request packet (jif) */ - u32 hard_timo; /* Maximum lifetime or 0 (jif) */ + ktime_t delay_ack_at; /* When DELAY ACK needs to happen */ + ktime_t rack_timo_at; /* When ACK is figured as lost */ + ktime_t ping_at; /* When next to send a ping */ + ktime_t keepalive_at; /* When next to send a keepalive ping */ + ktime_t expect_rx_by; /* When we expect to get a packet by */ + ktime_t expect_req_by; /* When we expect to get a request DATA packet by */ + ktime_t expect_term_by; /* When we expect call termination by */ + u32 next_rx_timo; /* Timeout for next Rx packet (ms) */ + u32 next_req_timo; /* Timeout for next Rx request packet (ms) */ + u32 hard_timo; /* Maximum lifetime or 0 (s) */ struct timer_list timer; /* Combined event timer */ struct work_struct destroyer; /* In-process-context destroyer */ rxrpc_notify_rx_t notify_rx; /* kernel service Rx notification function */ @@ -665,21 +695,30 @@ struct rxrpc_call { unsigned short rx_pkt_offset; /* Current recvmsg packet offset */ unsigned short rx_pkt_len; /* Current recvmsg packet len */ + /* Sendmsg data tracking. */ + rxrpc_seq_t send_top; /* Highest Tx slot filled by sendmsg. */ + struct rxrpc_txqueue *send_queue; /* Queue that sendmsg is writing into */ + /* Transmitted data tracking. */ - spinlock_t tx_lock; /* Transmit queue lock */ - struct list_head tx_sendmsg; /* Sendmsg prepared packets */ - struct list_head tx_buffer; /* Buffer of transmissible packets */ + struct rxrpc_txqueue *tx_queue; /* Start of transmission buffers */ + struct rxrpc_txqueue *tx_qtail; /* End of transmission buffers */ + rxrpc_seq_t tx_qbase; /* First slot in tx_queue */ rxrpc_seq_t tx_bottom; /* First packet in buffer */ rxrpc_seq_t tx_transmitted; /* Highest packet transmitted */ - rxrpc_seq_t tx_prepared; /* Highest Tx slot prepared. */ rxrpc_seq_t tx_top; /* Highest Tx slot allocated. */ - u16 tx_backoff; /* Delay to insert due to Tx failure */ - u8 tx_winsize; /* Maximum size of Tx window */ + rxrpc_serial_t tx_last_serial; /* Serial of last DATA transmitted */ + u16 tx_backoff; /* Delay to insert due to Tx failure (ms) */ + u16 tx_nr_sent; /* Number of packets sent, but unacked */ + u16 tx_nr_lost; /* Number of packets marked lost */ + u16 tx_nr_resent; /* Number of packets resent, but unacked */ + u16 tx_winsize; /* Maximum size of Tx window */ #define RXRPC_TX_MAX_WINDOW 128 + u8 tx_jumbo_max; /* Maximum subpkts peer will accept */ ktime_t tx_last_sent; /* Last time a transmission occurred */ /* Received data tracking */ struct sk_buff_head recvmsg_queue; /* Queue of packets ready for recvmsg() */ + struct sk_buff_head rx_queue; /* Queue of packets for this call to receive */ struct sk_buff_head rx_oos_queue; /* Queue of out of sequence packets */ rxrpc_seq_t rx_highest_seq; /* Higest sequence number received */ @@ -692,15 +731,33 @@ struct rxrpc_call { * packets) rather than bytes. */ #define RXRPC_TX_SMSS RXRPC_JUMBO_DATALEN -#define RXRPC_MIN_CWND (RXRPC_TX_SMSS > 2190 ? 2 : RXRPC_TX_SMSS > 1095 ? 3 : 4) - u8 cong_cwnd; /* Congestion window size */ +#define RXRPC_MIN_CWND 4 + enum rxrpc_ca_state cong_ca_state; /* Congestion control state */ u8 cong_extra; /* Extra to send for congestion management */ - u8 cong_ssthresh; /* Slow-start threshold */ - enum rxrpc_congest_mode cong_mode:8; /* Congestion management mode */ - u8 cong_dup_acks; /* Count of ACKs showing missing packets */ - u8 cong_cumul_acks; /* Cumulative ACK count */ + u16 cong_cwnd; /* Congestion window size */ + u16 cong_ssthresh; /* Slow-start threshold */ + u16 cong_dup_acks; /* Count of ACKs showing missing packets */ + u16 cong_cumul_acks; /* Cumulative ACK count */ ktime_t cong_tstamp; /* Last time cwnd was changed */ - struct sk_buff *cong_last_nack; /* Last ACK with nacks received */ + + /* RACK-TLP [RFC8985] state. */ + ktime_t rack_xmit_ts; /* Latest transmission timestamp */ + ktime_t rack_rtt; /* RTT of most recently ACK'd segment */ + ktime_t rack_rtt_ts; /* Timestamp of rack_rtt */ + ktime_t rack_reo_wnd; /* Reordering window */ + unsigned int rack_reo_wnd_mult; /* Multiplier applied to rack_reo_wnd */ + int rack_reo_wnd_persist; /* Num loss recoveries before reset reo_wnd */ + rxrpc_seq_t rack_fack; /* Highest sequence so far ACK'd */ + rxrpc_seq_t rack_end_seq; /* Highest sequence seen */ + rxrpc_seq_t rack_dsack_round; /* DSACK opt recv'd in latest roundtrip */ + bool rack_dsack_round_none; /* T if dsack_round is "None" */ + bool rack_reordering_seen; /* T if detected reordering event */ + enum rxrpc_rack_timer_mode rack_timer_mode; /* Current mode of RACK timer */ + bool tlp_is_retrans; /* T if unacked TLP retransmission */ + rxrpc_serial_t tlp_serial; /* Serial of TLP probe (or 0 if none in progress) */ + rxrpc_seq_t tlp_seq; /* Sequence of TLP probe */ + unsigned int tlp_rtt_taken; /* Last time RTT taken */ + ktime_t tlp_max_ack_delay; /* Sender budget for max delayed ACK interval */ /* Receive-phase ACK management (ACKs we send). */ u8 ackr_reason; /* reason to ACK */ @@ -725,32 +782,45 @@ struct rxrpc_call { /* Transmission-phase ACK management (ACKs we've received). */ ktime_t acks_latest_ts; /* Timestamp of latest ACK received */ - rxrpc_seq_t acks_first_seq; /* first sequence number received */ + rxrpc_seq_t acks_hard_ack; /* Highest sequence hard acked */ rxrpc_seq_t acks_prev_seq; /* Highest previousPacket received */ - rxrpc_seq_t acks_hard_ack; /* Latest hard-ack point */ rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */ rxrpc_serial_t acks_highest_serial; /* Highest serial number ACK'd */ + unsigned short acks_nr_sacks; /* Number of soft acks recorded */ + unsigned short acks_nr_snacks; /* Number of soft nacks recorded */ + + /* Calculated RTT cache */ + ktime_t rtt_last_req; /* Time of last RTT request */ + unsigned int rtt_count; /* Number of samples we've got */ + unsigned int rtt_taken; /* Number of samples taken (wrapping) */ + struct minmax min_rtt; /* Estimated minimum RTT */ + u32 srtt_us; /* smoothed round trip time << 3 in usecs */ + u32 mdev_us; /* medium deviation */ + u32 mdev_max_us; /* maximal mdev for the last rtt period */ + u32 rttvar_us; /* smoothed mdev_max */ + u32 rto_us; /* Retransmission timeout in usec */ + u8 backoff; /* Backoff timeout (as shift) */ }; /* * Summary of a new ACK and the changes it made to the Tx buffer packet states. */ struct rxrpc_ack_summary { - u16 nr_acks; /* Number of ACKs in packet */ - u16 nr_new_acks; /* Number of new ACKs in packet */ - u16 nr_new_nacks; /* Number of new nacks in packet */ - u16 nr_retained_nacks; /* Number of nacks retained between ACKs */ - u8 ack_reason; - bool saw_nacks; /* Saw NACKs in packet */ - bool new_low_nack; /* T if new low NACK found */ - bool retrans_timeo; /* T if reTx due to timeout happened */ - u8 flight_size; /* Number of unreceived transmissions */ - /* Place to stash values for tracing */ - enum rxrpc_congest_mode mode:8; - u8 cwnd; - u8 ssthresh; - u8 dup_acks; - u8 cumulative_acks; + rxrpc_serial_t ack_serial; /* Serial number of ACK */ + rxrpc_serial_t acked_serial; /* Serial number ACK'd */ + u16 in_flight; /* Number of unreceived transmissions */ + u16 nr_new_hacks; /* Number of rotated new ACKs */ + u16 nr_new_sacks; /* Number of new soft ACKs in packet */ + u16 nr_new_snacks; /* Number of new soft nacks in packet */ + u8 ack_reason; + bool new_low_snack:1; /* T if new low soft NACK found */ + bool retrans_timeo:1; /* T if reTx due to timeout happened */ + bool need_retransmit:1; /* T if we need transmission */ + bool rtt_sample_avail:1; /* T if RTT sample available */ + bool in_fast_or_rto_recovery:1; + bool exiting_fast_or_rto_recovery:1; + bool tlp_probe_acked:1; /* T if the TLP probe seq was acked */ + u8 /*enum rxrpc_congest_change*/ change; }; /* @@ -788,40 +858,27 @@ struct rxrpc_send_params { * Buffer of data to be output as a packet. */ struct rxrpc_txbuf { - struct rcu_head rcu; - struct list_head call_link; /* Link in call->tx_sendmsg/tx_buffer */ - struct list_head tx_link; /* Link in live Enc queue or Tx queue */ - ktime_t last_sent; /* Time at which last transmitted */ refcount_t ref; rxrpc_seq_t seq; /* Sequence number of this packet */ + rxrpc_serial_t serial; /* Last serial number transmitted with */ unsigned int call_debug_id; unsigned int debug_id; - unsigned int len; /* Amount of data in buffer */ - unsigned int space; /* Remaining data space */ - unsigned int offset; /* Offset of fill point */ - unsigned long flags; -#define RXRPC_TXBUF_LAST 0 /* Set if last packet in Tx phase */ -#define RXRPC_TXBUF_RESENT 1 /* Set if has been resent */ - u8 /*enum rxrpc_propose_ack_trace*/ ack_why; /* If ack, why */ - struct { - /* The packet for encrypting and DMA'ing. We align it such - * that data[] aligns correctly for any crypto blocksize. - */ - u8 pad[64 - sizeof(struct rxrpc_wire_header)]; - struct rxrpc_wire_header wire; /* Network-ready header */ - union { - u8 data[RXRPC_JUMBO_DATALEN]; /* Data packet */ - struct { - struct rxrpc_ackpacket ack; - DECLARE_FLEX_ARRAY(u8, acks); - }; - }; - } __aligned(64); + unsigned short len; /* Amount of data in buffer */ + unsigned short space; /* Remaining data space */ + unsigned short offset; /* Offset of fill point */ + unsigned short pkt_len; /* Size of packet content */ + unsigned short alloc_size; /* Amount of bufferage allocated */ + unsigned int flags; +#define RXRPC_TXBUF_WIRE_FLAGS 0xff /* The wire protocol flags */ +#define RXRPC_TXBUF_RESENT 0x100 /* Set if has been resent */ + __be16 cksum; /* Checksum to go in header */ + bool jumboable; /* Can be non-terminal jumbo subpacket */ + void *data; /* Data with preceding jumbo header */ }; static inline bool rxrpc_sending_to_server(const struct rxrpc_txbuf *txb) { - return txb->wire.flags & RXRPC_CLIENT_INITIATED; + return txb->flags & RXRPC_CLIENT_INITIATED; } static inline bool rxrpc_sending_to_client(const struct rxrpc_txbuf *txb) @@ -829,6 +886,46 @@ static inline bool rxrpc_sending_to_client(const struct rxrpc_txbuf *txb) return !rxrpc_sending_to_server(txb); } +/* + * Transmit queue element, including RACK [RFC8985] per-segment metadata. The + * transmission timestamp is in usec from the base. + */ +struct rxrpc_txqueue { + /* Start with the members we want to prefetch. */ + struct rxrpc_txqueue *next; + ktime_t xmit_ts_base; + rxrpc_seq_t qbase; + u8 nr_reported_acks; /* Number of segments explicitly acked/nacked */ + unsigned long segment_acked; /* Bit-per-buf: Set if ACK'd */ + unsigned long segment_lost; /* Bit-per-buf: Set if declared lost */ + unsigned long segment_retransmitted; /* Bit-per-buf: Set if retransmitted */ + unsigned long rtt_samples; /* Bit-per-buf: Set if available for RTT */ + unsigned long ever_retransmitted; /* Bit-per-buf: Set if ever retransmitted */ + + /* The arrays we want to pack into as few cache lines as possible. */ + struct { +#define RXRPC_NR_TXQUEUE BITS_PER_LONG +#define RXRPC_TXQ_MASK (RXRPC_NR_TXQUEUE - 1) + struct rxrpc_txbuf *bufs[RXRPC_NR_TXQUEUE]; + unsigned int segment_serial[RXRPC_NR_TXQUEUE]; + unsigned int segment_xmit_ts[RXRPC_NR_TXQUEUE]; + } ____cacheline_aligned; +}; + +/* + * Data transmission request. + */ +struct rxrpc_send_data_req { + ktime_t now; /* Current time */ + struct rxrpc_txqueue *tq; /* Tx queue segment holding first DATA */ + rxrpc_seq_t seq; /* Sequence of first data */ + int n; /* Number of DATA packets to glue into jumbo */ + bool retrans; /* T if this is a retransmission */ + bool did_send; /* T if did actually send */ + bool tlp_probe; /* T if this is a TLP probe */ + int /* enum rxrpc_txdata_trace */ trace; +}; + #include <trace/events/rxrpc.h> /* @@ -846,6 +943,21 @@ static inline rxrpc_serial_t rxrpc_get_next_serial(struct rxrpc_connection *conn } /* + * Allocate the next serial n numbers on a connection. 0 must be skipped. + */ +static inline rxrpc_serial_t rxrpc_get_next_serials(struct rxrpc_connection *conn, + unsigned int n) +{ + rxrpc_serial_t serial; + + serial = conn->tx_serial; + if (serial + n <= n) + serial = 1; + conn->tx_serial = serial + n; + return serial; +} + +/* * af_rxrpc.c */ extern atomic_t rxrpc_n_rx_skbs; @@ -861,7 +973,6 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local, struct rxrpc_connection *conn, struct sockaddr_rxrpc *peer_srx, struct sk_buff *skb); -void rxrpc_accept_incoming_calls(struct rxrpc_local *); int rxrpc_user_charge_accept(struct rxrpc_sock *, unsigned long); /* @@ -869,18 +980,12 @@ int rxrpc_user_charge_accept(struct rxrpc_sock *, unsigned long); */ void rxrpc_propose_ping(struct rxrpc_call *call, u32 serial, enum rxrpc_propose_ack_trace why); -void rxrpc_send_ACK(struct rxrpc_call *, u8, rxrpc_serial_t, enum rxrpc_propose_ack_trace); void rxrpc_propose_delay_ACK(struct rxrpc_call *, rxrpc_serial_t, enum rxrpc_propose_ack_trace); -void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *); -void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb); - -void rxrpc_reduce_call_timer(struct rxrpc_call *call, - unsigned long expire_at, - unsigned long now, - enum rxrpc_timer_trace why); - -bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb); +void rxrpc_resend_tlp(struct rxrpc_call *call); +void rxrpc_transmit_some_data(struct rxrpc_call *call, unsigned int limit, + enum rxrpc_txdata_trace trace); +bool rxrpc_input_call_event(struct rxrpc_call *call); /* * call_object.c @@ -980,7 +1085,6 @@ void rxrpc_connect_client_calls(struct rxrpc_local *local); void rxrpc_expose_client_call(struct rxrpc_call *); void rxrpc_disconnect_client_call(struct rxrpc_bundle *, struct rxrpc_call *); void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle); -void rxrpc_put_client_conn(struct rxrpc_connection *, enum rxrpc_conn_trace); void rxrpc_discard_expired_client_conns(struct rxrpc_local *local); void rxrpc_clean_up_local_conns(struct rxrpc_local *); @@ -1060,6 +1164,32 @@ void rxrpc_input_call_packet(struct rxrpc_call *, struct sk_buff *); void rxrpc_implicit_end_call(struct rxrpc_call *, struct sk_buff *); /* + * input_rack.c + */ +void rxrpc_input_rack_one(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + unsigned int ix); +void rxrpc_input_rack(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + unsigned long new_acks); +void rxrpc_rack_detect_loss_and_arm_timer(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary); +ktime_t rxrpc_tlp_calc_pto(struct rxrpc_call *call, ktime_t now); +void rxrpc_tlp_send_probe(struct rxrpc_call *call); +void rxrpc_tlp_process_ack(struct rxrpc_call *call, struct rxrpc_ack_summary *summary); +void rxrpc_rack_timer_expired(struct rxrpc_call *call, ktime_t overran_by); + +/* Initialise TLP state [RFC8958 7.1]. */ +static inline void rxrpc_tlp_init(struct rxrpc_call *call) +{ + call->tlp_serial = 0; + call->tlp_seq = call->acks_hard_ack; + call->tlp_is_retrans = false; +} + +/* * io_thread.c */ int rxrpc_encap_rcv(struct sock *, struct sk_buff *); @@ -1069,7 +1199,7 @@ bool rxrpc_direct_abort(struct sk_buff *skb, enum rxrpc_abort_reason why, int rxrpc_io_thread(void *data); static inline void rxrpc_wake_up_io_thread(struct rxrpc_local *local) { - wake_up_process(local->io_thread); + wake_up_process(READ_ONCE(local->io_thread)); } static inline bool rxrpc_protocol_error(struct sk_buff *skb, enum rxrpc_abort_reason why) @@ -1160,19 +1290,22 @@ static inline struct rxrpc_net *rxrpc_net(struct net *net) /* * output.c */ -int rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb); +void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, + rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why); +void rxrpc_send_probe_for_pmtud(struct rxrpc_call *call); int rxrpc_send_abort_packet(struct rxrpc_call *); -int rxrpc_send_data_packet(struct rxrpc_call *, struct rxrpc_txbuf *); +void rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req *req); void rxrpc_send_conn_abort(struct rxrpc_connection *conn); void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb); void rxrpc_send_keepalive(struct rxrpc_peer *); -void rxrpc_transmit_one(struct rxrpc_call *call, struct rxrpc_txbuf *txb); /* * peer_event.c */ void rxrpc_input_error(struct rxrpc_local *, struct sk_buff *); void rxrpc_peer_keepalive_worker(struct work_struct *); +void rxrpc_input_probe_for_pmtud(struct rxrpc_connection *conn, rxrpc_serial_t acked_serial, + bool sendmsg_fail); /* * peer_object.c @@ -1221,10 +1354,12 @@ static inline int rxrpc_abort_eproto(struct rxrpc_call *call, /* * rtt.c */ -void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace, int, - rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t); -unsigned long rxrpc_get_rto_backoff(struct rxrpc_peer *, bool); -void rxrpc_peer_init_rtt(struct rxrpc_peer *); +void rxrpc_call_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, + int rtt_slot, + rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, + ktime_t send_time, ktime_t resp_time); +ktime_t rxrpc_get_rto_backoff(struct rxrpc_call *call, bool retrans); +void rxrpc_call_init_rtt(struct rxrpc_call *call); /* * rxkad.c @@ -1295,8 +1430,8 @@ static inline void rxrpc_sysctl_exit(void) {} * txbuf.c */ extern atomic_t rxrpc_nr_txbuf; -struct rxrpc_txbuf *rxrpc_alloc_txbuf(struct rxrpc_call *call, u8 packet_type, - gfp_t gfp); +struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_size, + size_t data_align, gfp_t gfp); void rxrpc_get_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what); void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what); void rxrpc_put_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what); @@ -1323,6 +1458,53 @@ static inline bool after_eq(u32 seq1, u32 seq2) return (s32)(seq1 - seq2) >= 0; } +static inline u32 earliest(u32 seq1, u32 seq2) +{ + return before(seq1, seq2) ? seq1 : seq2; +} + +static inline u32 latest(u32 seq1, u32 seq2) +{ + return after(seq1, seq2) ? seq1 : seq2; +} + +static inline bool rxrpc_seq_in_txq(const struct rxrpc_txqueue *tq, rxrpc_seq_t seq) +{ + return (seq & (RXRPC_NR_TXQUEUE - 1)) == tq->qbase; +} + +static inline void rxrpc_queue_rx_call_packet(struct rxrpc_call *call, struct sk_buff *skb) +{ + rxrpc_get_skb(skb, rxrpc_skb_get_call_rx); + __skb_queue_tail(&call->rx_queue, skb); + rxrpc_poke_call(call, rxrpc_call_poke_rx_packet); +} + +/* + * Calculate how much space there is for transmitting more DATA packets. + */ +static inline unsigned int rxrpc_tx_window_space(const struct rxrpc_call *call) +{ + int winsize = umin(call->tx_winsize, call->cong_cwnd + call->cong_extra); + int transmitted = call->tx_top - call->tx_bottom; + + return max(winsize - transmitted, 0); +} + +static inline unsigned int rxrpc_left_out(const struct rxrpc_call *call) +{ + return call->acks_nr_sacks + call->tx_nr_lost; +} + +/* + * Calculate the number of transmitted DATA packets assumed to be in flight + * [approx RFC6675]. + */ +static inline unsigned int rxrpc_tx_in_flight(const struct rxrpc_call *call) +{ + return call->tx_nr_sent - rxrpc_left_out(call) + call->tx_nr_resent; +} + /* * debug tracing */ diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 0f5a1d77b890..e685034ce4f7 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -188,8 +188,8 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) /* Make sure that there aren't any incoming calls in progress before we * clear the preallocation buffers. */ - spin_lock(&rx->incoming_lock); - spin_unlock(&rx->incoming_lock); + spin_lock_irq(&rx->incoming_lock); + spin_unlock_irq(&rx->incoming_lock); head = b->peer_backlog_head; tail = b->peer_backlog_tail; @@ -343,7 +343,7 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local, if (sp->hdr.type != RXRPC_PACKET_TYPE_DATA) return rxrpc_protocol_error(skb, rxrpc_eproto_no_service_call); - read_lock(&local->services_lock); + read_lock_irq(&local->services_lock); /* Weed out packets to services we're not offering. Packets that would * begin a call are explicitly rejected and the rest are just @@ -399,34 +399,34 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local, spin_unlock(&conn->state_lock); spin_unlock(&rx->incoming_lock); - read_unlock(&local->services_lock); + read_unlock_irq(&local->services_lock); if (hlist_unhashed(&call->error_link)) { - spin_lock(&call->peer->lock); + spin_lock_irq(&call->peer->lock); hlist_add_head(&call->error_link, &call->peer->error_targets); - spin_unlock(&call->peer->lock); + spin_unlock_irq(&call->peer->lock); } _leave(" = %p{%d}", call, call->debug_id); - rxrpc_input_call_event(call, skb); + rxrpc_queue_rx_call_packet(call, skb); rxrpc_put_call(call, rxrpc_call_put_input); return true; unsupported_service: - read_unlock(&local->services_lock); + read_unlock_irq(&local->services_lock); return rxrpc_direct_abort(skb, rxrpc_abort_service_not_offered, RX_INVALID_OPERATION, -EOPNOTSUPP); unsupported_security: - read_unlock(&local->services_lock); + read_unlock_irq(&local->services_lock); return rxrpc_direct_abort(skb, rxrpc_abort_service_not_offered, RX_INVALID_OPERATION, -EKEYREJECTED); no_call: spin_unlock(&rx->incoming_lock); - read_unlock(&local->services_lock); + read_unlock_irq(&local->services_lock); _leave(" = f [%u]", skb->mark); return false; discard: - read_unlock(&local->services_lock); + read_unlock_irq(&local->services_lock); return true; } diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 0f78544d043b..8e477f7f8850 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -23,14 +23,14 @@ void rxrpc_propose_ping(struct rxrpc_call *call, u32 serial, enum rxrpc_propose_ack_trace why) { - unsigned long now = jiffies; - unsigned long ping_at = now + rxrpc_idle_ack_delay; - - if (time_before(ping_at, call->ping_at)) { - WRITE_ONCE(call->ping_at, ping_at); - rxrpc_reduce_call_timer(call, ping_at, now, - rxrpc_timer_set_for_ping); - trace_rxrpc_propose_ack(call, why, RXRPC_ACK_PING, serial); + ktime_t delay = ms_to_ktime(READ_ONCE(rxrpc_idle_ack_delay)); + ktime_t now = ktime_get_real(); + ktime_t ping_at = ktime_add(now, delay); + + trace_rxrpc_propose_ack(call, why, RXRPC_ACK_PING, serial); + if (ktime_before(ping_at, call->ping_at)) { + call->ping_at = ping_at; + trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_ping); } } @@ -40,215 +40,119 @@ void rxrpc_propose_ping(struct rxrpc_call *call, u32 serial, void rxrpc_propose_delay_ACK(struct rxrpc_call *call, rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why) { - unsigned long expiry = rxrpc_soft_ack_delay; - unsigned long now = jiffies, ack_at; + ktime_t now = ktime_get_real(), delay; - if (rxrpc_soft_ack_delay < expiry) - expiry = rxrpc_soft_ack_delay; - if (call->peer->srtt_us != 0) - ack_at = usecs_to_jiffies(call->peer->srtt_us >> 3); + trace_rxrpc_propose_ack(call, why, RXRPC_ACK_DELAY, serial); + + if (call->srtt_us) + delay = (call->srtt_us >> 3) * NSEC_PER_USEC; else - ack_at = expiry; - - ack_at += READ_ONCE(call->tx_backoff); - ack_at += now; - if (time_before(ack_at, call->delay_ack_at)) { - WRITE_ONCE(call->delay_ack_at, ack_at); - rxrpc_reduce_call_timer(call, ack_at, now, - rxrpc_timer_set_for_ack); - } + delay = ms_to_ktime(READ_ONCE(rxrpc_soft_ack_delay)); + ktime_add_ms(delay, call->tx_backoff); - trace_rxrpc_propose_ack(call, why, RXRPC_ACK_DELAY, serial); + call->delay_ack_at = ktime_add(now, delay); + trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_delayed_ack); } /* - * Queue an ACK for immediate transmission. + * Retransmit one or more packets. */ -void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, - rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why) +static bool rxrpc_retransmit_data(struct rxrpc_call *call, + struct rxrpc_send_data_req *req) { - struct rxrpc_txbuf *txb; + struct rxrpc_txqueue *tq = req->tq; + unsigned int ix = req->seq & RXRPC_TXQ_MASK; + struct rxrpc_txbuf *txb = tq->bufs[ix]; - if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) - return; + _enter("%x,%x,%x,%x", tq->qbase, req->seq, ix, txb->debug_id); - rxrpc_inc_stat(call->rxnet, stat_tx_acks[ack_reason]); + req->retrans = true; + trace_rxrpc_retransmit(call, req, txb); - txb = rxrpc_alloc_txbuf(call, RXRPC_PACKET_TYPE_ACK, - rcu_read_lock_held() ? GFP_ATOMIC | __GFP_NOWARN : GFP_NOFS); - if (!txb) { - kleave(" = -ENOMEM"); - return; - } - - txb->ack_why = why; - txb->wire.seq = 0; - txb->wire.type = RXRPC_PACKET_TYPE_ACK; - txb->wire.flags |= RXRPC_SLOW_START_OK; - txb->ack.bufferSpace = 0; - txb->ack.maxSkew = 0; - txb->ack.firstPacket = 0; - txb->ack.previousPacket = 0; - txb->ack.serial = htonl(serial); - txb->ack.reason = ack_reason; - txb->ack.nAcks = 0; - - trace_rxrpc_send_ack(call, why, ack_reason, serial); - rxrpc_send_ack_packet(call, txb); - rxrpc_put_txbuf(txb, rxrpc_txbuf_put_ack_tx); -} + txb->flags |= RXRPC_TXBUF_RESENT; + rxrpc_send_data_packet(call, req); + rxrpc_inc_stat(call->rxnet, stat_tx_data_retrans); -/* - * Handle congestion being detected by the retransmit timeout. - */ -static void rxrpc_congestion_timeout(struct rxrpc_call *call) -{ - set_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags); + req->tq = NULL; + req->n = 0; + req->did_send = true; + req->now = ktime_get_real(); + return true; } /* * Perform retransmission of NAK'd and unack'd packets. */ -void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb) +static void rxrpc_resend(struct rxrpc_call *call) { - struct rxrpc_ackpacket *ack = NULL; - struct rxrpc_skb_priv *sp; - struct rxrpc_txbuf *txb; - unsigned long resend_at; - rxrpc_seq_t transmitted = READ_ONCE(call->tx_transmitted); - ktime_t now, max_age, oldest, ack_ts; - bool unacked = false; - unsigned int i; - LIST_HEAD(retrans_queue); - - _enter("{%d,%d}", call->acks_hard_ack, call->tx_top); - - now = ktime_get_real(); - max_age = ktime_sub_us(now, jiffies_to_usecs(call->peer->rto_j)); - oldest = now; - - if (list_empty(&call->tx_buffer)) - goto no_resend; - - if (list_empty(&call->tx_buffer)) - goto no_further_resend; + struct rxrpc_send_data_req req = { + .now = ktime_get_real(), + .trace = rxrpc_txdata_retransmit, + }; + struct rxrpc_txqueue *tq; - trace_rxrpc_resend(call, ack_skb); - txb = list_first_entry(&call->tx_buffer, struct rxrpc_txbuf, call_link); + _enter("{%d,%d}", call->tx_bottom, call->tx_top); - /* Scan the soft ACK table without dropping the lock and resend any - * explicitly NAK'd packets. - */ - if (ack_skb) { - sp = rxrpc_skb(ack_skb); - ack = (void *)ack_skb->data + sizeof(struct rxrpc_wire_header); + trace_rxrpc_resend(call, call->acks_highest_serial); - for (i = 0; i < sp->nr_acks; i++) { - rxrpc_seq_t seq; + /* Scan the transmission queue, looking for lost packets. */ + for (tq = call->tx_queue; tq; tq = tq->next) { + unsigned long lost = tq->segment_lost; - if (ack->acks[i] & 1) - continue; - seq = sp->first_ack + i; - if (after(txb->seq, transmitted)) - break; - if (after(txb->seq, seq)) - continue; /* A new hard ACK probably came in */ - list_for_each_entry_from(txb, &call->tx_buffer, call_link) { - if (txb->seq == seq) - goto found_txb; - } - goto no_further_resend; - - found_txb: - if (after(ntohl(txb->wire.serial), call->acks_highest_serial)) - continue; /* Ack point not yet reached */ + if (after(tq->qbase, call->tx_transmitted)) + break; - rxrpc_see_txbuf(txb, rxrpc_txbuf_see_unacked); + _debug("retr %16lx %u c=%08x [%x]", + tq->segment_acked, tq->nr_reported_acks, call->debug_id, tq->qbase); + _debug("lost %16lx", lost); - if (list_empty(&txb->tx_link)) { - list_add_tail(&txb->tx_link, &retrans_queue); - set_bit(RXRPC_TXBUF_RESENT, &txb->flags); - } + trace_rxrpc_resend_lost(call, tq, lost); + while (lost) { + unsigned int ix = __ffs(lost); + struct rxrpc_txbuf *txb = tq->bufs[ix]; - trace_rxrpc_retransmit(call, txb->seq, - ktime_to_ns(ktime_sub(txb->last_sent, - max_age))); + __clear_bit(ix, &lost); + rxrpc_see_txbuf(txb, rxrpc_txbuf_see_lost); - if (list_is_last(&txb->call_link, &call->tx_buffer)) - goto no_further_resend; - txb = list_next_entry(txb, call_link); + req.tq = tq; + req.seq = tq->qbase + ix; + req.n = 1; + rxrpc_retransmit_data(call, &req); } } - /* Fast-forward through the Tx queue to the point the peer says it has - * seen. Anything between the soft-ACK table and that point will get - * ACK'd or NACK'd in due course, so don't worry about it here; here we - * need to consider retransmitting anything beyond that point. - * - * Note that ACK for a packet can beat the update of tx_transmitted. - */ - if (after_eq(READ_ONCE(call->acks_prev_seq), READ_ONCE(call->tx_transmitted))) - goto no_further_resend; - - list_for_each_entry_from(txb, &call->tx_buffer, call_link) { - if (before_eq(txb->seq, READ_ONCE(call->acks_prev_seq))) - continue; - if (after(txb->seq, READ_ONCE(call->tx_transmitted))) - break; /* Not transmitted yet */ - - if (ack && ack->reason == RXRPC_ACK_PING_RESPONSE && - before(ntohl(txb->wire.serial), ntohl(ack->serial))) - goto do_resend; /* Wasn't accounted for by a more recent ping. */ - - if (ktime_after(txb->last_sent, max_age)) { - if (ktime_before(txb->last_sent, oldest)) - oldest = txb->last_sent; - continue; - } - - do_resend: - unacked = true; - if (list_empty(&txb->tx_link)) { - list_add_tail(&txb->tx_link, &retrans_queue); - set_bit(RXRPC_TXBUF_RESENT, &txb->flags); - rxrpc_inc_stat(call->rxnet, stat_tx_data_retrans); - } - } + rxrpc_get_rto_backoff(call, req.did_send); + _leave(""); +} -no_further_resend: -no_resend: - resend_at = nsecs_to_jiffies(ktime_to_ns(ktime_sub(now, oldest))); - resend_at += jiffies + rxrpc_get_rto_backoff(call->peer, - !list_empty(&retrans_queue)); - WRITE_ONCE(call->resend_at, resend_at); - - if (unacked) - rxrpc_congestion_timeout(call); - - /* If there was nothing that needed retransmission then it's likely - * that an ACK got lost somewhere. Send a ping to find out instead of - * retransmitting data. - */ - if (list_empty(&retrans_queue)) { - rxrpc_reduce_call_timer(call, resend_at, jiffies, - rxrpc_timer_set_for_resend); - ack_ts = ktime_sub(now, call->acks_latest_ts); - if (ktime_to_us(ack_ts) < (call->peer->srtt_us >> 3)) - goto out; - rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, - rxrpc_propose_ack_ping_for_lost_ack); - goto out; +/* + * Resend the highest-seq DATA packet so far transmitted for RACK-TLP [RFC8985 7.3]. + */ +void rxrpc_resend_tlp(struct rxrpc_call *call) +{ + struct rxrpc_send_data_req req = { + .now = ktime_get_real(), + .seq = call->tx_transmitted, + .n = 1, + .tlp_probe = true, + .trace = rxrpc_txdata_tlp_retransmit, + }; + + /* There's a chance it'll be on the tail segment of the queue. */ + req.tq = READ_ONCE(call->tx_qtail); + if (req.tq && + before(call->tx_transmitted, req.tq->qbase + RXRPC_NR_TXQUEUE)) { + rxrpc_retransmit_data(call, &req); + return; } - /* Retransmit the queue */ - while ((txb = list_first_entry_or_null(&retrans_queue, - struct rxrpc_txbuf, tx_link))) { - list_del_init(&txb->tx_link); - rxrpc_transmit_one(call, txb); + for (req.tq = call->tx_queue; req.tq; req.tq = req.tq->next) { + if (after_eq(call->tx_transmitted, req.tq->qbase) && + before(call->tx_transmitted, req.tq->qbase + RXRPC_NR_TXQUEUE)) { + rxrpc_retransmit_data(call, &req); + return; + } } - -out: - _leave(""); } /* @@ -257,13 +161,11 @@ out: */ static void rxrpc_begin_service_reply(struct rxrpc_call *call) { - unsigned long now = jiffies; - rxrpc_set_call_state(call, RXRPC_CALL_SERVER_SEND_REPLY); - WRITE_ONCE(call->delay_ack_at, now + MAX_JIFFY_OFFSET); if (call->ackr_reason == RXRPC_ACK_DELAY) call->ackr_reason = 0; - trace_rxrpc_timer(call, rxrpc_timer_init_for_send_reply, now); + call->delay_ack_at = KTIME_MAX; + trace_rxrpc_timer_can(call, rxrpc_timer_trace_delayed_ack); } /* @@ -286,68 +188,93 @@ static void rxrpc_close_tx_phase(struct rxrpc_call *call) } } -static bool rxrpc_tx_window_has_space(struct rxrpc_call *call) -{ - unsigned int winsize = min_t(unsigned int, call->tx_winsize, - call->cong_cwnd + call->cong_extra); - rxrpc_seq_t window = call->acks_hard_ack, wtop = window + winsize; - rxrpc_seq_t tx_top = call->tx_top; - int space; - - space = wtop - tx_top; - return space > 0; -} - /* - * Decant some if the sendmsg prepared queue into the transmission buffer. + * Transmit some as-yet untransmitted data, to a maximum of the supplied limit. */ -static void rxrpc_decant_prepared_tx(struct rxrpc_call *call) +static void rxrpc_transmit_fresh_data(struct rxrpc_call *call, unsigned int limit, + enum rxrpc_txdata_trace trace) { - struct rxrpc_txbuf *txb; + int space = rxrpc_tx_window_space(call); if (!test_bit(RXRPC_CALL_EXPOSED, &call->flags)) { - if (list_empty(&call->tx_sendmsg)) + if (call->send_top == call->tx_top) return; rxrpc_expose_client_call(call); } - while ((txb = list_first_entry_or_null(&call->tx_sendmsg, - struct rxrpc_txbuf, call_link))) { - spin_lock(&call->tx_lock); - list_del(&txb->call_link); - spin_unlock(&call->tx_lock); + while (space > 0) { + struct rxrpc_send_data_req req = { + .now = ktime_get_real(), + .seq = call->tx_transmitted + 1, + .n = 0, + .trace = trace, + }; + struct rxrpc_txqueue *tq; + struct rxrpc_txbuf *txb; + rxrpc_seq_t send_top, seq; + int limit = min(space, max(call->peer->pmtud_jumbo, 1)); + + /* Order send_top before the contents of the new txbufs and + * txqueue pointers + */ + send_top = smp_load_acquire(&call->send_top); + if (call->tx_top == send_top) + break; - call->tx_top = txb->seq; - list_add_tail(&txb->call_link, &call->tx_buffer); + trace_rxrpc_transmit(call, send_top, space); - if (txb->wire.flags & RXRPC_LAST_PACKET) - rxrpc_close_tx_phase(call); + tq = call->tx_qtail; + seq = call->tx_top; + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_decant); - rxrpc_transmit_one(call, txb); + do { + int ix; - if (!rxrpc_tx_window_has_space(call)) - break; + seq++; + ix = seq & RXRPC_TXQ_MASK; + if (!ix) { + tq = tq->next; + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_decant_advance); + } + if (!req.tq) + req.tq = tq; + txb = tq->bufs[ix]; + req.n++; + if (!txb->jumboable) + break; + } while (req.n < limit && before(seq, send_top)); + + if (txb->flags & RXRPC_LAST_PACKET) { + rxrpc_close_tx_phase(call); + tq = NULL; + } + call->tx_qtail = tq; + call->tx_top = seq; + + space -= req.n; + rxrpc_send_data_packet(call, &req); } } -static void rxrpc_transmit_some_data(struct rxrpc_call *call) +void rxrpc_transmit_some_data(struct rxrpc_call *call, unsigned int limit, + enum rxrpc_txdata_trace trace) { switch (__rxrpc_call_state(call)) { case RXRPC_CALL_SERVER_ACK_REQUEST: - if (list_empty(&call->tx_sendmsg)) + if (call->tx_bottom == READ_ONCE(call->send_top)) return; rxrpc_begin_service_reply(call); fallthrough; case RXRPC_CALL_SERVER_SEND_REPLY: case RXRPC_CALL_CLIENT_SEND_REQUEST: - if (!rxrpc_tx_window_has_space(call)) + if (!rxrpc_tx_window_space(call)) return; - if (list_empty(&call->tx_sendmsg)) { + if (call->tx_bottom == READ_ONCE(call->send_top)) { rxrpc_inc_stat(call->rxnet, stat_tx_data_underflow); return; } - rxrpc_decant_prepared_tx(call); + rxrpc_transmit_fresh_data(call, limit, trace); break; default: return; @@ -360,8 +287,8 @@ static void rxrpc_transmit_some_data(struct rxrpc_call *call) */ static void rxrpc_send_initial_ping(struct rxrpc_call *call) { - if (call->peer->rtt_count < 3 || - ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), + if (call->rtt_count < 3 || + ktime_before(ktime_add_ms(call->rtt_last_req, 1000), ktime_get_real())) rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, rxrpc_propose_ack_ping_for_params); @@ -370,10 +297,11 @@ static void rxrpc_send_initial_ping(struct rxrpc_call *call) /* * Handle retransmission and deferred ACK/abort generation. */ -bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) +bool rxrpc_input_call_event(struct rxrpc_call *call) { - unsigned long now, next, t; - bool resend = false, expired = false; + struct sk_buff *skb; + ktime_t now, t; + bool did_receive = false, saw_ack = false; s32 abort_code; rxrpc_see_call(call, rxrpc_call_see_input); @@ -383,9 +311,6 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) call->debug_id, rxrpc_call_states[__rxrpc_call_state(call)], call->events); - if (__rxrpc_call_is_complete(call)) - goto out; - /* Handle abort request locklessly, vs rxrpc_propose_abort(). */ abort_code = smp_load_acquire(&call->send_abort); if (abort_code) { @@ -394,113 +319,116 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) goto out; } - if (skb && skb->mark == RXRPC_SKB_MARK_ERROR) - goto out; + do { + skb = __skb_dequeue(&call->rx_queue); + if (skb) { + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + if (__rxrpc_call_is_complete(call) || + skb->mark == RXRPC_SKB_MARK_ERROR) { + rxrpc_free_skb(skb, rxrpc_skb_put_call_rx); + goto out; + } + + saw_ack |= sp->hdr.type == RXRPC_PACKET_TYPE_ACK; + + rxrpc_input_call_packet(call, skb); + rxrpc_free_skb(skb, rxrpc_skb_put_call_rx); + did_receive = true; + } + + t = ktime_sub(call->rack_timo_at, ktime_get_real()); + if (t <= 0) { + trace_rxrpc_timer_exp(call, t, + rxrpc_timer_trace_rack_off + call->rack_timer_mode); + call->rack_timo_at = KTIME_MAX; + rxrpc_rack_timer_expired(call, t); + } + + } while (!skb_queue_empty(&call->rx_queue)); /* If we see our async-event poke, check for timeout trippage. */ - now = jiffies; - t = READ_ONCE(call->expect_rx_by); - if (time_after_eq(now, t)) { - trace_rxrpc_timer(call, rxrpc_timer_exp_normal, now); - expired = true; + now = ktime_get_real(); + t = ktime_sub(call->expect_rx_by, now); + if (t <= 0) { + trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_expect_rx); + goto expired; } - t = READ_ONCE(call->expect_req_by); - if (__rxrpc_call_state(call) == RXRPC_CALL_SERVER_RECV_REQUEST && - time_after_eq(now, t)) { - trace_rxrpc_timer(call, rxrpc_timer_exp_idle, now); - expired = true; + t = ktime_sub(call->expect_req_by, now); + if (t <= 0) { + call->expect_req_by = KTIME_MAX; + if (__rxrpc_call_state(call) == RXRPC_CALL_SERVER_RECV_REQUEST) { + trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_idle); + goto expired; + } } - t = READ_ONCE(call->expect_term_by); - if (time_after_eq(now, t)) { - trace_rxrpc_timer(call, rxrpc_timer_exp_hard, now); - expired = true; + t = ktime_sub(READ_ONCE(call->expect_term_by), now); + if (t <= 0) { + trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_hard); + goto expired; } - t = READ_ONCE(call->delay_ack_at); - if (time_after_eq(now, t)) { - trace_rxrpc_timer(call, rxrpc_timer_exp_ack, now); - cmpxchg(&call->delay_ack_at, t, now + MAX_JIFFY_OFFSET); + t = ktime_sub(call->delay_ack_at, now); + if (t <= 0) { + trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_delayed_ack); + call->delay_ack_at = KTIME_MAX; rxrpc_send_ACK(call, RXRPC_ACK_DELAY, 0, - rxrpc_propose_ack_ping_for_lost_ack); - } - - t = READ_ONCE(call->ack_lost_at); - if (time_after_eq(now, t)) { - trace_rxrpc_timer(call, rxrpc_timer_exp_lost_ack, now); - cmpxchg(&call->ack_lost_at, t, now + MAX_JIFFY_OFFSET); - set_bit(RXRPC_CALL_EV_ACK_LOST, &call->events); + rxrpc_propose_ack_delayed_ack); } - t = READ_ONCE(call->keepalive_at); - if (time_after_eq(now, t)) { - trace_rxrpc_timer(call, rxrpc_timer_exp_keepalive, now); - cmpxchg(&call->keepalive_at, t, now + MAX_JIFFY_OFFSET); + t = ktime_sub(call->ping_at, now); + if (t <= 0) { + trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_ping); + call->ping_at = KTIME_MAX; rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, rxrpc_propose_ack_ping_for_keepalive); } - t = READ_ONCE(call->ping_at); - if (time_after_eq(now, t)) { - trace_rxrpc_timer(call, rxrpc_timer_exp_ping, now); - cmpxchg(&call->ping_at, t, now + MAX_JIFFY_OFFSET); + now = ktime_get_real(); + t = ktime_sub(call->keepalive_at, now); + if (t <= 0) { + trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_keepalive); + call->keepalive_at = KTIME_MAX; rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, rxrpc_propose_ack_ping_for_keepalive); } - t = READ_ONCE(call->resend_at); - if (time_after_eq(now, t)) { - trace_rxrpc_timer(call, rxrpc_timer_exp_resend, now); - cmpxchg(&call->resend_at, t, now + MAX_JIFFY_OFFSET); - resend = true; - } - - if (skb) - rxrpc_input_call_packet(call, skb); + if (test_and_clear_bit(RXRPC_CALL_EV_INITIAL_PING, &call->events)) + rxrpc_send_initial_ping(call); - rxrpc_transmit_some_data(call); + rxrpc_transmit_some_data(call, UINT_MAX, rxrpc_txdata_new_data); - if (skb) { - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + if (saw_ack) + rxrpc_congestion_degrade(call); - if (sp->hdr.type == RXRPC_PACKET_TYPE_ACK) - rxrpc_congestion_degrade(call); + if (did_receive && + (__rxrpc_call_state(call) == RXRPC_CALL_CLIENT_SEND_REQUEST || + __rxrpc_call_state(call) == RXRPC_CALL_SERVER_SEND_REPLY)) { + t = ktime_sub(call->rack_timo_at, ktime_get_real()); + trace_rxrpc_rack(call, t); } - if (test_and_clear_bit(RXRPC_CALL_EV_INITIAL_PING, &call->events)) - rxrpc_send_initial_ping(call); - /* Process events */ - if (expired) { - if (test_bit(RXRPC_CALL_RX_HEARD, &call->flags) && - (int)call->conn->hi_serial - (int)call->rx_serial > 0) { - trace_rxrpc_call_reset(call); - rxrpc_abort_call(call, 0, RX_CALL_DEAD, -ECONNRESET, - rxrpc_abort_call_reset); - } else { - rxrpc_abort_call(call, 0, RX_CALL_TIMEOUT, -ETIME, - rxrpc_abort_call_timeout); - } - goto out; - } - if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events)) rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, rxrpc_propose_ack_ping_for_lost_ack); - if (resend && __rxrpc_call_state(call) != RXRPC_CALL_CLIENT_RECV_REPLY) - rxrpc_resend(call, NULL); + if (call->tx_nr_lost > 0 && + __rxrpc_call_state(call) != RXRPC_CALL_CLIENT_RECV_REPLY && + !test_bit(RXRPC_CALL_TX_ALL_ACKED, &call->flags)) + rxrpc_resend(call); if (test_and_clear_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags)) rxrpc_send_ACK(call, RXRPC_ACK_IDLE, 0, rxrpc_propose_ack_rx_idle); if (call->ackr_nr_unacked > 2) { - if (call->peer->rtt_count < 3) + if (call->rtt_count < 3) rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, rxrpc_propose_ack_ping_for_rtt); - else if (ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), + else if (ktime_before(ktime_add_ms(call->rtt_last_req, 1000), ktime_get_real())) rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, rxrpc_propose_ack_ping_for_old_rtt); @@ -511,23 +439,32 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) /* Make sure the timer is restarted */ if (!__rxrpc_call_is_complete(call)) { - next = call->expect_rx_by; + ktime_t next = READ_ONCE(call->expect_term_by), delay; -#define set(T) { t = READ_ONCE(T); if (time_before(t, next)) next = t; } +#define set(T) { ktime_t _t = (T); if (ktime_before(_t, next)) next = _t; } set(call->expect_req_by); - set(call->expect_term_by); + set(call->expect_rx_by); set(call->delay_ack_at); - set(call->ack_lost_at); - set(call->resend_at); + set(call->rack_timo_at); set(call->keepalive_at); set(call->ping_at); - now = jiffies; - if (time_after_eq(now, next)) + now = ktime_get_real(); + delay = ktime_sub(next, now); + if (delay <= 0) { rxrpc_poke_call(call, rxrpc_call_poke_timer_now); - - rxrpc_reduce_call_timer(call, next, now, rxrpc_timer_restart); + } else { + unsigned long nowj = jiffies, delayj, nextj; + + delayj = umax(nsecs_to_jiffies(delay), 1); + nextj = nowj + delayj; + if (time_before(nextj, call->timer.expires) || + !timer_pending(&call->timer)) { + trace_rxrpc_timer_restart(call, delay, delayj); + timer_reduce(&call->timer, nextj); + } + } } out: @@ -537,9 +474,24 @@ out: rxrpc_disconnect_call(call); if (call->security) call->security->free_call_crypto(call); + } else { + if (did_receive && + call->peer->ackr_adv_pmtud && + call->peer->pmtud_pending) + rxrpc_send_probe_for_pmtud(call); } - if (call->acks_hard_ack != call->tx_bottom) - rxrpc_shrink_call_tx_buffer(call); _leave(""); return true; + +expired: + if (test_bit(RXRPC_CALL_RX_HEARD, &call->flags) && + (int)call->conn->hi_serial - (int)call->rx_serial > 0) { + trace_rxrpc_call_reset(call); + rxrpc_abort_call(call, 0, RX_CALL_DEAD, -ECONNRESET, + rxrpc_abort_call_reset); + } else { + rxrpc_abort_call(call, 0, RX_CALL_TIMEOUT, -ETIME, + rxrpc_abort_call_timeout); + } + goto out; } diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 9fc9a6c3f685..c4c8b46a68c6 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -22,7 +22,6 @@ const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = { [RXRPC_CALL_CLIENT_AWAIT_REPLY] = "ClAwtRpl", [RXRPC_CALL_CLIENT_RECV_REPLY] = "ClRcvRpl", [RXRPC_CALL_SERVER_PREALLOC] = "SvPrealc", - [RXRPC_CALL_SERVER_SECURING] = "SvSecure", [RXRPC_CALL_SERVER_RECV_REQUEST] = "SvRcvReq", [RXRPC_CALL_SERVER_ACK_REQUEST] = "SvAckReq", [RXRPC_CALL_SERVER_SEND_REPLY] = "SvSndRpl", @@ -49,7 +48,7 @@ void rxrpc_poke_call(struct rxrpc_call *call, enum rxrpc_call_poke_trace what) bool busy; if (!test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) { - spin_lock_bh(&local->lock); + spin_lock_irq(&local->lock); busy = !list_empty(&call->attend_link); trace_rxrpc_poke_call(call, busy, what); if (!busy && !rxrpc_try_get_call(call, rxrpc_call_get_poke)) @@ -57,7 +56,7 @@ void rxrpc_poke_call(struct rxrpc_call *call, enum rxrpc_call_poke_trace what) if (!busy) { list_add_tail(&call->attend_link, &local->call_attend_q); } - spin_unlock_bh(&local->lock); + spin_unlock_irq(&local->lock); if (!busy) rxrpc_wake_up_io_thread(local); } @@ -70,20 +69,11 @@ static void rxrpc_call_timer_expired(struct timer_list *t) _enter("%d", call->debug_id); if (!__rxrpc_call_is_complete(call)) { - trace_rxrpc_timer_expired(call, jiffies); + trace_rxrpc_timer_expired(call); rxrpc_poke_call(call, rxrpc_call_poke_timer); } } -void rxrpc_reduce_call_timer(struct rxrpc_call *call, - unsigned long expire_at, - unsigned long now, - enum rxrpc_timer_trace why) -{ - trace_rxrpc_timer(call, why, now); - timer_reduce(&call->timer, expire_at); -} - static struct lock_class_key rxrpc_call_user_mutex_lock_class_key; static void rxrpc_destroy_call(struct work_struct *); @@ -155,34 +145,37 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, INIT_LIST_HEAD(&call->recvmsg_link); INIT_LIST_HEAD(&call->sock_link); INIT_LIST_HEAD(&call->attend_link); - INIT_LIST_HEAD(&call->tx_sendmsg); - INIT_LIST_HEAD(&call->tx_buffer); + skb_queue_head_init(&call->rx_queue); skb_queue_head_init(&call->recvmsg_queue); skb_queue_head_init(&call->rx_oos_queue); init_waitqueue_head(&call->waitq); spin_lock_init(&call->notify_lock); - spin_lock_init(&call->tx_lock); refcount_set(&call->ref, 1); - call->debug_id = debug_id; - call->tx_total_len = -1; - call->next_rx_timo = 20 * HZ; - call->next_req_timo = 1 * HZ; - call->ackr_window = 1; - call->ackr_wtop = 1; + call->debug_id = debug_id; + call->tx_total_len = -1; + call->tx_jumbo_max = 1; + call->next_rx_timo = 20 * HZ; + call->next_req_timo = 1 * HZ; + call->ackr_window = 1; + call->ackr_wtop = 1; + call->delay_ack_at = KTIME_MAX; + call->rack_timo_at = KTIME_MAX; + call->ping_at = KTIME_MAX; + call->keepalive_at = KTIME_MAX; + call->expect_rx_by = KTIME_MAX; + call->expect_req_by = KTIME_MAX; + call->expect_term_by = KTIME_MAX; memset(&call->sock_node, 0xed, sizeof(call->sock_node)); call->rx_winsize = rxrpc_rx_window_size; call->tx_winsize = 16; - if (RXRPC_TX_SMSS > 2190) - call->cong_cwnd = 2; - else if (RXRPC_TX_SMSS > 1095) - call->cong_cwnd = 3; - else - call->cong_cwnd = 4; + call->cong_cwnd = RXRPC_MIN_CWND; call->cong_ssthresh = RXRPC_TX_MAX_WINDOW; + rxrpc_call_init_rtt(call); + call->rxnet = rxnet; call->rtt_avail = RXRPC_CALL_RTT_AVAIL_MASK; atomic_inc(&rxnet->nr_calls); @@ -226,11 +219,11 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, __set_bit(RXRPC_CALL_EXCLUSIVE, &call->flags); if (p->timeouts.normal) - call->next_rx_timo = min(msecs_to_jiffies(p->timeouts.normal), 1UL); + call->next_rx_timo = umin(p->timeouts.normal, 1); if (p->timeouts.idle) - call->next_req_timo = min(msecs_to_jiffies(p->timeouts.idle), 1UL); + call->next_req_timo = umin(p->timeouts.idle, 1); if (p->timeouts.hard) - call->hard_timo = p->timeouts.hard * HZ; + call->hard_timo = p->timeouts.hard; ret = rxrpc_init_client_call_security(call); if (ret < 0) { @@ -253,18 +246,13 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, */ void rxrpc_start_call_timer(struct rxrpc_call *call) { - unsigned long now = jiffies; - unsigned long j = now + MAX_JIFFY_OFFSET; - - call->delay_ack_at = j; - call->ack_lost_at = j; - call->resend_at = j; - call->ping_at = j; - call->keepalive_at = j; - call->expect_rx_by = j; - call->expect_req_by = j; - call->expect_term_by = j + call->hard_timo; - call->timer.expires = now; + if (call->hard_timo) { + ktime_t delay = ms_to_ktime(call->hard_timo * 1000); + + call->expect_term_by = ktime_add(ktime_get_real(), delay); + trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_hard); + } + call->timer.expires = jiffies; } /* @@ -313,9 +301,9 @@ static int rxrpc_connect_call(struct rxrpc_call *call, gfp_t gfp) trace_rxrpc_client(NULL, -1, rxrpc_client_queue_new_call); rxrpc_get_call(call, rxrpc_call_get_io_thread); - spin_lock(&local->client_call_lock); + spin_lock_irq(&local->client_call_lock); list_add_tail(&call->wait_link, &local->new_client_calls); - spin_unlock(&local->client_call_lock); + spin_unlock_irq(&local->client_call_lock); rxrpc_wake_up_io_thread(local); return 0; @@ -445,7 +433,7 @@ error_attached_to_socket: /* * Set up an incoming call. call->conn points to the connection. - * This is called in BH context and isn't allowed to fail. + * This is called with interrupts disabled and isn't allowed to fail. */ void rxrpc_incoming_call(struct rxrpc_sock *rx, struct rxrpc_call *call, @@ -464,17 +452,16 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx, call->cong_tstamp = skb->tstamp; __set_bit(RXRPC_CALL_EXPOSED, &call->flags); - rxrpc_set_call_state(call, RXRPC_CALL_SERVER_SECURING); + rxrpc_set_call_state(call, RXRPC_CALL_SERVER_RECV_REQUEST); spin_lock(&conn->state_lock); switch (conn->state) { case RXRPC_CONN_SERVICE_UNSECURED: case RXRPC_CONN_SERVICE_CHALLENGING: - rxrpc_set_call_state(call, RXRPC_CALL_SERVER_SECURING); + __set_bit(RXRPC_CALL_CONN_CHALLENGING, &call->flags); break; case RXRPC_CONN_SERVICE: - rxrpc_set_call_state(call, RXRPC_CALL_SERVER_RECV_REQUEST); break; case RXRPC_CONN_ABORTED: @@ -542,11 +529,29 @@ void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace why) } /* - * Clean up the Rx skb ring. + * Clean up the transmission buffers. + */ +static void rxrpc_cleanup_tx_buffers(struct rxrpc_call *call) +{ + struct rxrpc_txqueue *tq, *next; + + for (tq = call->tx_queue; tq; tq = next) { + next = tq->next; + for (int i = 0; i < RXRPC_NR_TXQUEUE; i++) + if (tq->bufs[i]) + rxrpc_put_txbuf(tq->bufs[i], rxrpc_txbuf_put_cleaned); + trace_rxrpc_tq(call, tq, 0, rxrpc_tq_cleaned); + kfree(tq); + } +} + +/* + * Clean up the receive buffers. */ -static void rxrpc_cleanup_ring(struct rxrpc_call *call) +static void rxrpc_cleanup_rx_buffers(struct rxrpc_call *call) { rxrpc_purge_queue(&call->recvmsg_queue); + rxrpc_purge_queue(&call->rx_queue); rxrpc_purge_queue(&call->rx_oos_queue); } @@ -569,7 +574,7 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) rxrpc_put_call_slot(call); /* Make sure we don't get any more notifications */ - spin_lock(&rx->recvmsg_lock); + spin_lock_irq(&rx->recvmsg_lock); if (!list_empty(&call->recvmsg_link)) { _debug("unlinking once-pending call %p { e=%lx f=%lx }", @@ -582,7 +587,7 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) call->recvmsg_link.next = NULL; call->recvmsg_link.prev = NULL; - spin_unlock(&rx->recvmsg_lock); + spin_unlock_irq(&rx->recvmsg_lock); if (put) rxrpc_put_call(call, rxrpc_call_put_unnotify); @@ -682,23 +687,11 @@ static void rxrpc_rcu_free_call(struct rcu_head *rcu) static void rxrpc_destroy_call(struct work_struct *work) { struct rxrpc_call *call = container_of(work, struct rxrpc_call, destroyer); - struct rxrpc_txbuf *txb; del_timer_sync(&call->timer); - rxrpc_free_skb(call->cong_last_nack, rxrpc_skb_put_last_nack); - rxrpc_cleanup_ring(call); - while ((txb = list_first_entry_or_null(&call->tx_sendmsg, - struct rxrpc_txbuf, call_link))) { - list_del(&txb->call_link); - rxrpc_put_txbuf(txb, rxrpc_txbuf_put_cleaned); - } - while ((txb = list_first_entry_or_null(&call->tx_buffer, - struct rxrpc_txbuf, call_link))) { - list_del(&txb->call_link); - rxrpc_put_txbuf(txb, rxrpc_txbuf_put_cleaned); - } - + rxrpc_cleanup_tx_buffers(call); + rxrpc_cleanup_rx_buffers(call); rxrpc_put_txbuf(call->tx_pending, rxrpc_txbuf_put_cleaned); rxrpc_put_connection(call->conn, rxrpc_conn_put_call); rxrpc_deactivate_bundle(call->bundle); diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 3b9b267a4431..db0099197890 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -231,7 +231,7 @@ static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn) distance = id - id_cursor; if (distance < 0) distance = -distance; - limit = max_t(unsigned long, atomic_read(&rxnet->nr_conns) * 4, 1024); + limit = umax(atomic_read(&rxnet->nr_conns) * 4, 1024); if (distance > limit) goto mark_dont_reuse; @@ -437,9 +437,9 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, call->dest_srx.srx_service = conn->service_id; call->cong_ssthresh = call->peer->cong_ssthresh; if (call->cong_cwnd >= call->cong_ssthresh) - call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; + call->cong_ca_state = RXRPC_CA_CONGEST_AVOIDANCE; else - call->cong_mode = RXRPC_CALL_SLOW_START; + call->cong_ca_state = RXRPC_CA_SLOW_START; chan->call_id = call_id; chan->call_debug_id = call->debug_id; @@ -508,15 +508,18 @@ static void rxrpc_activate_channels(struct rxrpc_bundle *bundle) void rxrpc_connect_client_calls(struct rxrpc_local *local) { struct rxrpc_call *call; + LIST_HEAD(new_client_calls); - while ((call = list_first_entry_or_null(&local->new_client_calls, - struct rxrpc_call, wait_link)) - ) { + spin_lock_irq(&local->client_call_lock); + list_splice_tail_init(&local->new_client_calls, &new_client_calls); + spin_unlock_irq(&local->client_call_lock); + + while ((call = list_first_entry_or_null(&new_client_calls, + struct rxrpc_call, wait_link))) { struct rxrpc_bundle *bundle = call->bundle; - spin_lock(&local->client_call_lock); list_move_tail(&call->wait_link, &bundle->waiting_calls); - spin_unlock(&local->client_call_lock); + rxrpc_see_call(call, rxrpc_call_see_waiting_call); if (rxrpc_bundle_has_space(bundle)) rxrpc_activate_channels(bundle); @@ -544,9 +547,9 @@ void rxrpc_expose_client_call(struct rxrpc_call *call) set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); trace_rxrpc_client(conn, channel, rxrpc_client_exposed); - spin_lock(&call->peer->lock); + spin_lock_irq(&call->peer->lock); hlist_add_head(&call->error_link, &call->peer->error_targets); - spin_unlock(&call->peer->lock); + spin_unlock_irq(&call->peer->lock); } } @@ -586,7 +589,10 @@ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call _debug("call is waiting"); ASSERTCMP(call->call_id, ==, 0); ASSERT(!test_bit(RXRPC_CALL_EXPOSED, &call->flags)); + /* May still be on ->new_client_calls. */ + spin_lock_irq(&local->client_call_lock); list_del_init(&call->wait_link); + spin_unlock_irq(&local->client_call_lock); return; } @@ -636,7 +642,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call test_bit(RXRPC_CALL_EXPOSED, &call->flags)) { unsigned long final_ack_at = jiffies + 2; - WRITE_ONCE(chan->final_ack_at, final_ack_at); + chan->final_ack_at = final_ack_at; smp_wmb(); /* vs rxrpc_process_delayed_final_acks() */ set_bit(RXRPC_CONN_FINAL_ACK_0 + channel, &conn->flags); rxrpc_reduce_conn_timer(conn, final_ack_at); @@ -770,7 +776,7 @@ next: conn_expires_at = conn->idle_timestamp + expiry; - now = READ_ONCE(jiffies); + now = jiffies; if (time_after(conn_expires_at, now)) goto not_yet_expired; } diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 1f251d758cb9..4d9c5e21ba78 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -26,7 +26,7 @@ static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, struct sk_buff bool aborted = false; if (conn->state != RXRPC_CONN_ABORTED) { - spin_lock(&conn->state_lock); + spin_lock_irq(&conn->state_lock); if (conn->state != RXRPC_CONN_ABORTED) { conn->abort_code = abort_code; conn->error = err; @@ -37,7 +37,7 @@ static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, struct sk_buff set_bit(RXRPC_CONN_EV_ABORT_CALLS, &conn->events); aborted = true; } - spin_unlock(&conn->state_lock); + spin_unlock_irq(&conn->state_lock); } return aborted; @@ -63,11 +63,12 @@ int rxrpc_abort_conn(struct rxrpc_connection *conn, struct sk_buff *skb, /* * Mark a connection as being remotely aborted. */ -static bool rxrpc_input_conn_abort(struct rxrpc_connection *conn, +static void rxrpc_input_conn_abort(struct rxrpc_connection *conn, struct sk_buff *skb) { - return rxrpc_set_conn_aborted(conn, skb, skb->priority, -ECONNABORTED, - RXRPC_CALL_REMOTELY_ABORTED); + trace_rxrpc_rx_conn_abort(conn, skb); + rxrpc_set_conn_aborted(conn, skb, skb->priority, -ECONNABORTED, + RXRPC_CALL_REMOTELY_ABORTED); } /* @@ -88,10 +89,10 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, struct rxrpc_ackpacket ack; }; } __attribute__((packed)) pkt; - struct rxrpc_ackinfo ack_info; + struct rxrpc_acktrailer trailer; size_t len; int ret, ioc; - u32 serial, mtu, call_id, padding; + u32 serial, max_mtu, if_mtu, call_id, padding; _enter("%d", conn->debug_id); @@ -122,8 +123,8 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, iov[0].iov_len = sizeof(pkt.whdr); iov[1].iov_base = &padding; iov[1].iov_len = 3; - iov[2].iov_base = &ack_info; - iov[2].iov_len = sizeof(ack_info); + iov[2].iov_base = &trailer; + iov[2].iov_len = sizeof(trailer); serial = rxrpc_get_next_serial(conn); @@ -149,8 +150,13 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, break; case RXRPC_PACKET_TYPE_ACK: - mtu = conn->peer->if_mtu; - mtu -= conn->peer->hdrsize; + if_mtu = conn->peer->if_mtu - conn->peer->hdrsize; + if (conn->peer->ackr_adv_pmtud) { + max_mtu = umax(conn->peer->max_data, rxrpc_rx_mtu); + } else { + if_mtu = umin(1444, if_mtu); + max_mtu = if_mtu; + } pkt.ack.bufferSpace = 0; pkt.ack.maxSkew = htons(skb ? skb->priority : 0); pkt.ack.firstPacket = htonl(chan->last_seq + 1); @@ -158,20 +164,21 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, pkt.ack.serial = htonl(skb ? sp->hdr.serial : 0); pkt.ack.reason = skb ? RXRPC_ACK_DUPLICATE : RXRPC_ACK_IDLE; pkt.ack.nAcks = 0; - ack_info.rxMTU = htonl(rxrpc_rx_mtu); - ack_info.maxMTU = htonl(mtu); - ack_info.rwind = htonl(rxrpc_rx_window_size); - ack_info.jumbo_max = htonl(rxrpc_rx_jumbo_max); + trailer.maxMTU = htonl(max_mtu); + trailer.ifMTU = htonl(if_mtu); + trailer.rwind = htonl(rxrpc_rx_window_size); + trailer.jumbo_max = 0; pkt.whdr.flags |= RXRPC_SLOW_START_OK; padding = 0; iov[0].iov_len += sizeof(pkt.ack); - len += sizeof(pkt.ack) + 3 + sizeof(ack_info); + len += sizeof(pkt.ack) + 3 + sizeof(trailer); ioc = 3; trace_rxrpc_tx_ack(chan->call_debug_id, serial, ntohl(pkt.ack.firstPacket), ntohl(pkt.ack.serial), - pkt.ack.reason, 0, rxrpc_rx_window_size); + pkt.ack.reason, 0, rxrpc_rx_window_size, + rxrpc_propose_ack_retransmit); break; default: @@ -202,11 +209,14 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn) for (i = 0; i < RXRPC_MAXCALLS; i++) { call = conn->channels[i].call; - if (call) + if (call) { + rxrpc_see_call(call, rxrpc_call_see_conn_abort); rxrpc_set_call_completion(call, conn->completion, conn->abort_code, conn->error); + rxrpc_poke_call(call, rxrpc_call_poke_conn_abort); + } } _leave(""); @@ -218,10 +228,8 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn) */ static void rxrpc_call_is_secure(struct rxrpc_call *call) { - if (call && __rxrpc_call_state(call) == RXRPC_CALL_SERVER_SECURING) { - rxrpc_set_call_state(call, RXRPC_CALL_SERVER_RECV_REQUEST); + if (call && __test_and_clear_bit(RXRPC_CALL_CONN_CHALLENGING, &call->flags)) rxrpc_notify_socket(call); - } } /* @@ -252,16 +260,17 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, if (ret < 0) return ret; - spin_lock(&conn->state_lock); + spin_lock_irq(&conn->state_lock); if (conn->state == RXRPC_CONN_SERVICE_CHALLENGING) conn->state = RXRPC_CONN_SERVICE; - spin_unlock(&conn->state_lock); + spin_unlock_irq(&conn->state_lock); if (conn->state == RXRPC_CONN_SERVICE) { /* Offload call state flipping to the I/O thread. As * we've already received the packet, put it on the * front of the queue. */ + sp->conn = rxrpc_get_connection(conn, rxrpc_conn_get_poke_secured); skb->mark = RXRPC_SKB_MARK_SERVICE_CONN_SECURED; rxrpc_get_skb(skb, rxrpc_skb_get_conn_secured); skb_queue_head(&conn->local->rx_queue, skb); @@ -427,14 +436,16 @@ void rxrpc_input_conn_event(struct rxrpc_connection *conn, struct sk_buff *skb) if (test_and_clear_bit(RXRPC_CONN_EV_ABORT_CALLS, &conn->events)) rxrpc_abort_calls(conn); - switch (skb->mark) { - case RXRPC_SKB_MARK_SERVICE_CONN_SECURED: - if (conn->state != RXRPC_CONN_SERVICE) - break; + if (skb) { + switch (skb->mark) { + case RXRPC_SKB_MARK_SERVICE_CONN_SECURED: + if (conn->state != RXRPC_CONN_SERVICE) + break; - for (loop = 0; loop < RXRPC_MAXCALLS; loop++) - rxrpc_call_is_secure(conn->channels[loop].call); - break; + for (loop = 0; loop < RXRPC_MAXCALLS; loop++) + rxrpc_call_is_secure(conn->channels[loop].call); + break; + } } /* Process delayed ACKs whose time has come. */ diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index df8a271948a1..2f1fd1e2e7e4 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -31,13 +31,13 @@ void rxrpc_poke_conn(struct rxrpc_connection *conn, enum rxrpc_conn_trace why) if (WARN_ON_ONCE(!local)) return; - spin_lock_bh(&local->lock); + spin_lock_irq(&local->lock); busy = !list_empty(&conn->attend_link); if (!busy) { rxrpc_get_connection(conn, why); list_add_tail(&conn->attend_link, &local->conn_attend_q); } - spin_unlock_bh(&local->lock); + spin_unlock_irq(&local->lock); rxrpc_wake_up_io_thread(local); } @@ -67,7 +67,9 @@ struct rxrpc_connection *rxrpc_alloc_connection(struct rxrpc_net *rxnet, INIT_WORK(&conn->destructor, rxrpc_clean_up_connection); INIT_LIST_HEAD(&conn->proc_link); INIT_LIST_HEAD(&conn->link); + INIT_LIST_HEAD(&conn->attend_link); mutex_init(&conn->security_lock); + mutex_init(&conn->tx_data_alloc_lock); skb_queue_head_init(&conn->rx_queue); conn->rxnet = rxnet; conn->security = &rxrpc_no_security; @@ -118,18 +120,13 @@ struct rxrpc_connection *rxrpc_find_client_connection_rcu(struct rxrpc_local *lo switch (srx->transport.family) { case AF_INET: if (peer->srx.transport.sin.sin_port != - srx->transport.sin.sin_port || - peer->srx.transport.sin.sin_addr.s_addr != - srx->transport.sin.sin_addr.s_addr) + srx->transport.sin.sin_port) goto not_found; break; #ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: if (peer->srx.transport.sin6.sin6_port != - srx->transport.sin6.sin6_port || - memcmp(&peer->srx.transport.sin6.sin6_addr, - &srx->transport.sin6.sin6_addr, - sizeof(struct in6_addr)) != 0) + srx->transport.sin6.sin6_port) goto not_found; break; #endif @@ -200,9 +197,9 @@ void rxrpc_disconnect_call(struct rxrpc_call *call) call->peer->cong_ssthresh = call->cong_ssthresh; if (!hlist_unhashed(&call->error_link)) { - spin_lock(&call->peer->lock); + spin_lock_irq(&call->peer->lock); hlist_del_init(&call->error_link); - spin_unlock(&call->peer->lock); + spin_unlock_irq(&call->peer->lock); } if (rxrpc_is_client_call(call)) { @@ -325,6 +322,12 @@ static void rxrpc_clean_up_connection(struct work_struct *work) list_del_init(&conn->proc_link); write_unlock(&rxnet->conn_lock); + if (conn->pmtud_probe) { + trace_rxrpc_pmtud_lost(conn, 0); + conn->peer->pmtud_probing = false; + conn->peer->pmtud_pending = true; + } + rxrpc_purge_queue(&conn->rx_queue); rxrpc_kill_client_conn(conn); @@ -341,6 +344,7 @@ static void rxrpc_clean_up_connection(struct work_struct *work) */ rxrpc_purge_queue(&conn->rx_queue); + page_frag_cache_drain(&conn->tx_data_alloc); call_rcu(&conn->rcu, rxrpc_rcu_free_connection); } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 9691de00ade7..24aceb183c2c 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -9,6 +9,17 @@ #include "ar-internal.h" +/* Override priority when generating ACKs for received DATA */ +static const u8 rxrpc_ack_priority[RXRPC_ACK__INVALID] = { + [RXRPC_ACK_IDLE] = 1, + [RXRPC_ACK_DELAY] = 2, + [RXRPC_ACK_REQUESTED] = 3, + [RXRPC_ACK_DUPLICATE] = 4, + [RXRPC_ACK_EXCEEDS_WINDOW] = 5, + [RXRPC_ACK_NOSPACE] = 6, + [RXRPC_ACK_OUT_OF_SEQUENCE] = 7, +}; + static void rxrpc_proto_abort(struct rxrpc_call *call, rxrpc_seq_t seq, enum rxrpc_abort_reason why) { @@ -16,80 +27,68 @@ static void rxrpc_proto_abort(struct rxrpc_call *call, rxrpc_seq_t seq, } /* - * Do TCP-style congestion management [RFC 5681]. + * Do TCP-style congestion management [RFC5681]. */ static void rxrpc_congestion_management(struct rxrpc_call *call, - struct sk_buff *skb, - struct rxrpc_ack_summary *summary, - rxrpc_serial_t acked_serial) + struct rxrpc_ack_summary *summary) { - enum rxrpc_congest_change change = rxrpc_cong_no_change; - unsigned int cumulative_acks = call->cong_cumul_acks; - unsigned int cwnd = call->cong_cwnd; - bool resend = false; - - summary->flight_size = - (call->tx_top - call->acks_hard_ack) - summary->nr_acks; + summary->change = rxrpc_cong_no_change; + summary->in_flight = rxrpc_tx_in_flight(call); if (test_and_clear_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags)) { summary->retrans_timeo = true; - call->cong_ssthresh = max_t(unsigned int, - summary->flight_size / 2, 2); - cwnd = 1; - if (cwnd >= call->cong_ssthresh && - call->cong_mode == RXRPC_CALL_SLOW_START) { - call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; - call->cong_tstamp = skb->tstamp; - cumulative_acks = 0; + call->cong_ssthresh = umax(summary->in_flight / 2, 2); + call->cong_cwnd = 1; + if (call->cong_cwnd >= call->cong_ssthresh && + call->cong_ca_state == RXRPC_CA_SLOW_START) { + call->cong_ca_state = RXRPC_CA_CONGEST_AVOIDANCE; + call->cong_tstamp = call->acks_latest_ts; + call->cong_cumul_acks = 0; } } - cumulative_acks += summary->nr_new_acks; - if (cumulative_acks > 255) - cumulative_acks = 255; + call->cong_cumul_acks += summary->nr_new_sacks; + call->cong_cumul_acks += summary->nr_new_hacks; + if (call->cong_cumul_acks > 255) + call->cong_cumul_acks = 255; - summary->cwnd = call->cong_cwnd; - summary->ssthresh = call->cong_ssthresh; - summary->cumulative_acks = cumulative_acks; - summary->dup_acks = call->cong_dup_acks; - - switch (call->cong_mode) { - case RXRPC_CALL_SLOW_START: - if (summary->saw_nacks) + switch (call->cong_ca_state) { + case RXRPC_CA_SLOW_START: + if (call->acks_nr_snacks > 0) goto packet_loss_detected; - if (summary->cumulative_acks > 0) - cwnd += 1; - if (cwnd >= call->cong_ssthresh) { - call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; - call->cong_tstamp = skb->tstamp; + if (call->cong_cumul_acks > 0) + call->cong_cwnd += 1; + if (call->cong_cwnd >= call->cong_ssthresh) { + call->cong_ca_state = RXRPC_CA_CONGEST_AVOIDANCE; + call->cong_tstamp = call->acks_latest_ts; } goto out; - case RXRPC_CALL_CONGEST_AVOIDANCE: - if (summary->saw_nacks) + case RXRPC_CA_CONGEST_AVOIDANCE: + if (call->acks_nr_snacks > 0) goto packet_loss_detected; /* We analyse the number of packets that get ACK'd per RTT * period and increase the window if we managed to fill it. */ - if (call->peer->rtt_count == 0) + if (call->rtt_count == 0) goto out; - if (ktime_before(skb->tstamp, + if (ktime_before(call->acks_latest_ts, ktime_add_us(call->cong_tstamp, - call->peer->srtt_us >> 3))) + call->srtt_us >> 3))) goto out_no_clear_ca; - change = rxrpc_cong_rtt_window_end; - call->cong_tstamp = skb->tstamp; - if (cumulative_acks >= cwnd) - cwnd++; + summary->change = rxrpc_cong_rtt_window_end; + call->cong_tstamp = call->acks_latest_ts; + if (call->cong_cumul_acks >= call->cong_cwnd) + call->cong_cwnd++; goto out; - case RXRPC_CALL_PACKET_LOSS: - if (!summary->saw_nacks) + case RXRPC_CA_PACKET_LOSS: + if (call->acks_nr_snacks == 0) goto resume_normality; - if (summary->new_low_nack) { - change = rxrpc_cong_new_low_nack; + if (summary->new_low_snack) { + summary->change = rxrpc_cong_new_low_nack; call->cong_dup_acks = 1; if (call->cong_extra > 1) call->cong_extra = 1; @@ -100,31 +99,35 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, if (call->cong_dup_acks < 3) goto send_extra_data; - change = rxrpc_cong_begin_retransmission; - call->cong_mode = RXRPC_CALL_FAST_RETRANSMIT; - call->cong_ssthresh = max_t(unsigned int, - summary->flight_size / 2, 2); - cwnd = call->cong_ssthresh + 3; + summary->change = rxrpc_cong_begin_retransmission; + call->cong_ca_state = RXRPC_CA_FAST_RETRANSMIT; + call->cong_ssthresh = umax(summary->in_flight / 2, 2); + call->cong_cwnd = call->cong_ssthresh + 3; call->cong_extra = 0; call->cong_dup_acks = 0; - resend = true; + summary->need_retransmit = true; + summary->in_fast_or_rto_recovery = true; goto out; - case RXRPC_CALL_FAST_RETRANSMIT: - if (!summary->new_low_nack) { - if (summary->nr_new_acks == 0) - cwnd += 1; + case RXRPC_CA_FAST_RETRANSMIT: + rxrpc_tlp_init(call); + summary->in_fast_or_rto_recovery = true; + if (!summary->new_low_snack) { + if (summary->nr_new_sacks == 0) + call->cong_cwnd += 1; call->cong_dup_acks++; if (call->cong_dup_acks == 2) { - change = rxrpc_cong_retransmit_again; + summary->change = rxrpc_cong_retransmit_again; call->cong_dup_acks = 0; - resend = true; + summary->need_retransmit = true; } } else { - change = rxrpc_cong_progress; - cwnd = call->cong_ssthresh; - if (!summary->saw_nacks) + summary->change = rxrpc_cong_progress; + call->cong_cwnd = call->cong_ssthresh; + if (call->acks_nr_snacks == 0) { + summary->exiting_fast_or_rto_recovery = true; goto resume_normality; + } } goto out; @@ -134,30 +137,25 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, } resume_normality: - change = rxrpc_cong_cleared_nacks; + summary->change = rxrpc_cong_cleared_nacks; call->cong_dup_acks = 0; call->cong_extra = 0; - call->cong_tstamp = skb->tstamp; - if (cwnd < call->cong_ssthresh) - call->cong_mode = RXRPC_CALL_SLOW_START; + call->cong_tstamp = call->acks_latest_ts; + if (call->cong_cwnd < call->cong_ssthresh) + call->cong_ca_state = RXRPC_CA_SLOW_START; else - call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; + call->cong_ca_state = RXRPC_CA_CONGEST_AVOIDANCE; out: - cumulative_acks = 0; + call->cong_cumul_acks = 0; out_no_clear_ca: - if (cwnd >= RXRPC_TX_MAX_WINDOW) - cwnd = RXRPC_TX_MAX_WINDOW; - call->cong_cwnd = cwnd; - call->cong_cumul_acks = cumulative_acks; - summary->mode = call->cong_mode; - trace_rxrpc_congest(call, summary, acked_serial, change); - if (resend) - rxrpc_resend(call, skb); + if (call->cong_cwnd >= RXRPC_TX_MAX_WINDOW) + call->cong_cwnd = RXRPC_TX_MAX_WINDOW; + trace_rxrpc_congest(call, summary); return; packet_loss_detected: - change = rxrpc_cong_saw_nack; - call->cong_mode = RXRPC_CALL_PACKET_LOSS; + summary->change = rxrpc_cong_saw_nack; + call->cong_ca_state = RXRPC_CA_PACKET_LOSS; call->cong_dup_acks = 0; goto send_extra_data; @@ -166,7 +164,7 @@ send_extra_data: * state. */ if (test_bit(RXRPC_CALL_TX_LAST, &call->flags) || - summary->nr_acks != call->tx_top - call->acks_hard_ack) { + call->acks_nr_sacks != call->tx_top - call->tx_bottom) { call->cong_extra++; wake_up(&call->waitq); } @@ -178,26 +176,42 @@ send_extra_data: */ void rxrpc_congestion_degrade(struct rxrpc_call *call) { - ktime_t rtt, now; + ktime_t rtt, now, time_since; - if (call->cong_mode != RXRPC_CALL_SLOW_START && - call->cong_mode != RXRPC_CALL_CONGEST_AVOIDANCE) + if (call->cong_ca_state != RXRPC_CA_SLOW_START && + call->cong_ca_state != RXRPC_CA_CONGEST_AVOIDANCE) return; if (__rxrpc_call_state(call) == RXRPC_CALL_CLIENT_AWAIT_REPLY) return; - rtt = ns_to_ktime(call->peer->srtt_us * (1000 / 8)); + rtt = ns_to_ktime(call->srtt_us * (NSEC_PER_USEC / 8)); now = ktime_get_real(); - if (!ktime_before(ktime_add(call->tx_last_sent, rtt), now)) + time_since = ktime_sub(now, call->tx_last_sent); + if (ktime_before(time_since, rtt)) return; - trace_rxrpc_reset_cwnd(call, now); + trace_rxrpc_reset_cwnd(call, time_since, rtt); rxrpc_inc_stat(call->rxnet, stat_tx_data_cwnd_reset); call->tx_last_sent = now; - call->cong_mode = RXRPC_CALL_SLOW_START; - call->cong_ssthresh = max_t(unsigned int, call->cong_ssthresh, - call->cong_cwnd * 3 / 4); - call->cong_cwnd = max_t(unsigned int, call->cong_cwnd / 2, RXRPC_MIN_CWND); + call->cong_ca_state = RXRPC_CA_SLOW_START; + call->cong_ssthresh = umax(call->cong_ssthresh, call->cong_cwnd * 3 / 4); + call->cong_cwnd = umax(call->cong_cwnd / 2, RXRPC_MIN_CWND); +} + +/* + * Add an RTT sample derived from an ACK'd DATA packet. + */ +static void rxrpc_add_data_rtt_sample(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + int ix) +{ + ktime_t xmit_ts = ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[ix]); + + rxrpc_call_add_rtt(call, rxrpc_rtt_rx_data_ack, -1, + summary->acked_serial, summary->ack_serial, + xmit_ts, call->acks_latest_ts); + __clear_bit(ix, &tq->rtt_samples); /* Prevent repeat RTT sample */ } /* @@ -206,37 +220,120 @@ void rxrpc_congestion_degrade(struct rxrpc_call *call) static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, struct rxrpc_ack_summary *summary) { - struct rxrpc_txbuf *txb; - bool rot_last = false; + struct rxrpc_txqueue *tq = call->tx_queue; + rxrpc_seq_t seq = call->tx_bottom + 1; + bool rot_last = false, trace = false; - list_for_each_entry_rcu(txb, &call->tx_buffer, call_link, false) { - if (before_eq(txb->seq, call->acks_hard_ack)) - continue; - if (test_bit(RXRPC_TXBUF_LAST, &txb->flags)) { + _enter("%x,%x", call->tx_bottom, to); + + trace_rxrpc_tx_rotate(call, seq, to); + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate); + + if (call->acks_lowest_nak == call->tx_bottom) { + call->acks_lowest_nak = to; + } else if (after(to, call->acks_lowest_nak)) { + summary->new_low_snack = true; + call->acks_lowest_nak = to; + } + + /* We may have a left over fully-consumed buffer at the front that we + * couldn't drop before (rotate_and_keep below). + */ + if (seq == call->tx_qbase + RXRPC_NR_TXQUEUE) { + call->tx_qbase += RXRPC_NR_TXQUEUE; + call->tx_queue = tq->next; + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate_and_free); + kfree(tq); + tq = call->tx_queue; + } + + do { + unsigned int ix = seq - call->tx_qbase; + + _debug("tq=%x seq=%x i=%d f=%x", tq->qbase, seq, ix, tq->bufs[ix]->flags); + if (tq->bufs[ix]->flags & RXRPC_LAST_PACKET) { set_bit(RXRPC_CALL_TX_LAST, &call->flags); rot_last = true; } - if (txb->seq == to) - break; - } - if (rot_last) - set_bit(RXRPC_CALL_TX_ALL_ACKED, &call->flags); + if (summary->acked_serial == tq->segment_serial[ix] && + test_bit(ix, &tq->rtt_samples)) + rxrpc_add_data_rtt_sample(call, summary, tq, ix); + + if (ix == tq->nr_reported_acks) { + /* Packet directly hard ACK'd. */ + tq->nr_reported_acks++; + rxrpc_input_rack_one(call, summary, tq, ix); + if (seq == call->tlp_seq) + summary->tlp_probe_acked = true; + summary->nr_new_hacks++; + __set_bit(ix, &tq->segment_acked); + trace_rxrpc_rotate(call, tq, summary, seq, rxrpc_rotate_trace_hack); + } else if (test_bit(ix, &tq->segment_acked)) { + /* Soft ACK -> hard ACK. */ + call->acks_nr_sacks--; + trace_rxrpc_rotate(call, tq, summary, seq, rxrpc_rotate_trace_sack); + } else { + /* Soft NAK -> hard ACK. */ + call->acks_nr_snacks--; + rxrpc_input_rack_one(call, summary, tq, ix); + if (seq == call->tlp_seq) + summary->tlp_probe_acked = true; + summary->nr_new_hacks++; + __set_bit(ix, &tq->segment_acked); + trace_rxrpc_rotate(call, tq, summary, seq, rxrpc_rotate_trace_snak); + } - _enter("%x,%x,%x,%d", to, call->acks_hard_ack, call->tx_top, rot_last); + call->tx_nr_sent--; + if (__test_and_clear_bit(ix, &tq->segment_lost)) + call->tx_nr_lost--; + if (__test_and_clear_bit(ix, &tq->segment_retransmitted)) + call->tx_nr_resent--; + __clear_bit(ix, &tq->ever_retransmitted); - if (call->acks_lowest_nak == call->acks_hard_ack) { - call->acks_lowest_nak = to; - } else if (after(to, call->acks_lowest_nak)) { - summary->new_low_nack = true; - call->acks_lowest_nak = to; + rxrpc_put_txbuf(tq->bufs[ix], rxrpc_txbuf_put_rotated); + tq->bufs[ix] = NULL; + + WRITE_ONCE(call->tx_bottom, seq); + trace_rxrpc_txqueue(call, (rot_last ? + rxrpc_txqueue_rotate_last : + rxrpc_txqueue_rotate)); + + seq++; + trace = true; + if (!(seq & RXRPC_TXQ_MASK)) { + trace_rxrpc_rack_update(call, summary); + trace = false; + prefetch(tq->next); + if (tq != call->tx_qtail) { + call->tx_qbase += RXRPC_NR_TXQUEUE; + call->tx_queue = tq->next; + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate_and_free); + kfree(tq); + tq = call->tx_queue; + } else { + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate_and_keep); + tq = NULL; + break; + } + } + + } while (before_eq(seq, to)); + + if (trace) + trace_rxrpc_rack_update(call, summary); + + if (rot_last) { + set_bit(RXRPC_CALL_TX_ALL_ACKED, &call->flags); + if (tq) { + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate_and_free); + kfree(tq); + call->tx_queue = NULL; + } } - smp_store_release(&call->acks_hard_ack, to); + _debug("%x,%x,%x,%d", to, call->tx_bottom, call->tx_top, rot_last); - trace_rxrpc_txqueue(call, (rot_last ? - rxrpc_txqueue_rotate_last : - rxrpc_txqueue_rotate)); wake_up(&call->waitq); return rot_last; } @@ -252,10 +349,10 @@ static void rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun, { ASSERT(test_bit(RXRPC_CALL_TX_LAST, &call->flags)); - if (unlikely(call->cong_last_nack)) { - rxrpc_free_skb(call->cong_last_nack, rxrpc_skb_put_last_nack); - call->cong_last_nack = NULL; - } + call->rack_timer_mode = RXRPC_CALL_RACKTIMER_OFF; + call->rack_timo_at = KTIME_MAX; + trace_rxrpc_rack_timer(call, 0, false); + trace_rxrpc_timer_can(call, rxrpc_timer_trace_rack_off + call->rack_timer_mode); switch (__rxrpc_call_state(call)) { case RXRPC_CALL_CLIENT_SEND_REQUEST: @@ -288,15 +385,11 @@ static void rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun, static bool rxrpc_receiving_reply(struct rxrpc_call *call) { struct rxrpc_ack_summary summary = { 0 }; - unsigned long now, timo; rxrpc_seq_t top = READ_ONCE(call->tx_top); if (call->ackr_reason) { - now = jiffies; - timo = now + MAX_JIFFY_OFFSET; - - WRITE_ONCE(call->delay_ack_at, timo); - trace_rxrpc_timer(call, rxrpc_timer_init_for_reply, now); + call->delay_ack_at = KTIME_MAX; + trace_rxrpc_timer_can(call, rxrpc_timer_trace_delayed_ack); } if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags)) { @@ -329,7 +422,7 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial) case RXRPC_CALL_SERVER_RECV_REQUEST: rxrpc_set_call_state(call, RXRPC_CALL_SERVER_ACK_REQUEST); - call->expect_req_by = jiffies + MAX_JIFFY_OFFSET; + call->expect_req_by = KTIME_MAX; rxrpc_propose_delay_ACK(call, serial, rxrpc_propose_ack_processing_op); break; @@ -355,18 +448,26 @@ static void rxrpc_input_queue_data(struct rxrpc_call *call, struct sk_buff *skb, struct rxrpc_skb_priv *sp = rxrpc_skb(skb); bool last = sp->hdr.flags & RXRPC_LAST_PACKET; + spin_lock_irq(&call->recvmsg_queue.lock); + __skb_queue_tail(&call->recvmsg_queue, skb); rxrpc_input_update_ack_window(call, window, wtop); trace_rxrpc_receive(call, last ? why + 1 : why, sp->hdr.serial, sp->hdr.seq); if (last) + /* Change the state inside the lock so that recvmsg syncs + * correctly with it and using sendmsg() to send a reply + * doesn't race. + */ rxrpc_end_rx_phase(call, sp->hdr.serial); + + spin_unlock_irq(&call->recvmsg_queue.lock); } /* * Process a DATA packet. */ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb, - bool *_notify) + bool *_notify, rxrpc_serial_t *_ack_serial, int *_ack_reason) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct sk_buff *oos; @@ -419,8 +520,6 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb, /* Send an immediate ACK if we fill in a hole */ else if (!skb_queue_empty(&call->rx_oos_queue)) ack_reason = RXRPC_ACK_DELAY; - else - call->ackr_nr_unacked++; window++; if (after(window, wtop)) { @@ -434,7 +533,6 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb, rxrpc_get_skb(skb, rxrpc_skb_get_to_recvmsg); - spin_lock(&call->recvmsg_queue.lock); rxrpc_input_queue_data(call, skb, window, wtop, rxrpc_receive_queue); *_notify = true; @@ -456,8 +554,6 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb, rxrpc_receive_queue_oos); } - spin_unlock(&call->recvmsg_queue.lock); - call->ackr_sack_base = sack; } else { unsigned int slot; @@ -498,12 +594,16 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb, } send_ack: - if (ack_reason >= 0) - rxrpc_send_ACK(call, ack_reason, serial, - rxrpc_propose_ack_input_data); - else - rxrpc_propose_delay_ACK(call, serial, - rxrpc_propose_ack_input_data); + if (ack_reason >= 0) { + if (rxrpc_ack_priority[ack_reason] > rxrpc_ack_priority[*_ack_reason]) { + *_ack_serial = serial; + *_ack_reason = ack_reason; + } else if (rxrpc_ack_priority[ack_reason] == rxrpc_ack_priority[*_ack_reason] && + ack_reason == RXRPC_ACK_REQUESTED) { + *_ack_serial = serial; + *_ack_reason = ack_reason; + } + } } /* @@ -514,9 +614,11 @@ static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb struct rxrpc_jumbo_header jhdr; struct rxrpc_skb_priv *sp = rxrpc_skb(skb), *jsp; struct sk_buff *jskb; + rxrpc_serial_t ack_serial = 0; unsigned int offset = sizeof(struct rxrpc_wire_header); unsigned int len = skb->len - offset; bool notify = false; + int ack_reason = 0, count = 1, stat_ix; while (sp->hdr.flags & RXRPC_JUMBO_PACKET) { if (len < RXRPC_JUMBO_SUBPKTLEN) @@ -536,7 +638,7 @@ static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb jsp = rxrpc_skb(jskb); jsp->offset = offset; jsp->len = RXRPC_JUMBO_DATALEN; - rxrpc_input_data_one(call, jskb, ¬ify); + rxrpc_input_data_one(call, jskb, ¬ify, &ack_serial, &ack_reason); rxrpc_free_skb(jskb, rxrpc_skb_put_jumbo_subpacket); sp->hdr.flags = jhdr.flags; @@ -545,12 +647,25 @@ static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb sp->hdr.serial++; offset += RXRPC_JUMBO_SUBPKTLEN; len -= RXRPC_JUMBO_SUBPKTLEN; + count++; } sp->offset = offset; sp->len = len; - rxrpc_input_data_one(call, skb, ¬ify); - if (notify) { + rxrpc_input_data_one(call, skb, ¬ify, &ack_serial, &ack_reason); + + stat_ix = umin(count, ARRAY_SIZE(call->rxnet->stat_rx_jumbo)) - 1; + atomic_inc(&call->rxnet->stat_rx_jumbo[stat_ix]); + + if (ack_reason > 0) { + rxrpc_send_ACK(call, ack_reason, ack_serial, + rxrpc_propose_ack_input_data); + } else { + call->ackr_nr_unacked++; + rxrpc_propose_delay_ACK(call, sp->hdr.serial, + rxrpc_propose_ack_input_data); + } + if (notify && !test_bit(RXRPC_CALL_CONN_CHALLENGING, &call->flags)) { trace_rxrpc_notify_socket(call->debug_id, sp->hdr.serial); rxrpc_notify_socket(call); } @@ -589,14 +704,12 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) case RXRPC_CALL_SERVER_RECV_REQUEST: { unsigned long timo = READ_ONCE(call->next_req_timo); - unsigned long now, expect_req_by; if (timo) { - now = jiffies; - expect_req_by = now + timo; - WRITE_ONCE(call->expect_req_by, expect_req_by); - rxrpc_reduce_call_timer(call, expect_req_by, now, - rxrpc_timer_set_for_idle); + ktime_t delay = ms_to_ktime(timo); + + call->expect_req_by = ktime_add(ktime_get_real(), delay); + trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_idle); } break; } @@ -646,7 +759,7 @@ static void rxrpc_complete_rtt_probe(struct rxrpc_call *call, clear_bit(i + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail); smp_mb(); /* Read data before setting avail bit */ set_bit(i, &call->rtt_avail); - rxrpc_peer_add_rtt(call, type, i, acked_serial, ack_serial, + rxrpc_call_add_rtt(call, type, i, acked_serial, ack_serial, sent_at, resp_time); matched = true; } @@ -656,7 +769,7 @@ static void rxrpc_complete_rtt_probe(struct rxrpc_call *call, */ if (after(acked_serial, orig_serial)) { trace_rxrpc_rtt_rx(call, rxrpc_rtt_rx_obsolete, i, - orig_serial, acked_serial, 0, 0); + orig_serial, acked_serial, 0, 0, 0); clear_bit(i + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail); smp_wmb(); set_bit(i, &call->rtt_avail); @@ -664,20 +777,23 @@ static void rxrpc_complete_rtt_probe(struct rxrpc_call *call, } if (!matched) - trace_rxrpc_rtt_rx(call, rxrpc_rtt_rx_lost, 9, 0, acked_serial, 0, 0); + trace_rxrpc_rtt_rx(call, rxrpc_rtt_rx_lost, 9, 0, acked_serial, 0, 0, 0); } /* * Process the extra information that may be appended to an ACK packet */ -static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb, - struct rxrpc_ackinfo *ackinfo) +static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb, + struct rxrpc_acktrailer *trailer) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct rxrpc_peer *peer; - unsigned int mtu; + struct rxrpc_peer *peer = call->peer; + unsigned int max_data, capacity; bool wake = false; - u32 rwind = ntohl(ackinfo->rwind); + u32 max_mtu = ntohl(trailer->maxMTU); + //u32 if_mtu = ntohl(trailer->ifMTU); + u32 rwind = ntohl(trailer->rwind); + u32 jumbo_max = ntohl(trailer->jumbo_max); if (rwind > RXRPC_TX_MAX_WINDOW) rwind = RXRPC_TX_MAX_WINDOW; @@ -688,58 +804,147 @@ static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb, call->tx_winsize = rwind; } - if (call->cong_ssthresh > rwind) - call->cong_ssthresh = rwind; + max_mtu = clamp(max_mtu, 500, 65535); + peer->ackr_max_data = max_mtu; - mtu = min(ntohl(ackinfo->rxMTU), ntohl(ackinfo->maxMTU)); + if (max_mtu < peer->max_data) { + trace_rxrpc_pmtud_reduce(peer, sp->hdr.serial, max_mtu, + rxrpc_pmtud_reduce_ack); + peer->max_data = max_mtu; + } + + max_data = umin(max_mtu, peer->max_data); + capacity = max_data; + capacity += sizeof(struct rxrpc_jumbo_header); /* First subpacket has main hdr, not jumbo */ + capacity /= sizeof(struct rxrpc_jumbo_header) + RXRPC_JUMBO_DATALEN; - peer = call->peer; - if (mtu < peer->maxdata) { - spin_lock(&peer->lock); - peer->maxdata = mtu; - peer->mtu = mtu + peer->hdrsize; - spin_unlock(&peer->lock); + if (jumbo_max == 0) { + /* The peer says it supports pmtu discovery */ + peer->ackr_adv_pmtud = true; + } else { + peer->ackr_adv_pmtud = false; + capacity = clamp(capacity, 1, jumbo_max); } + call->tx_jumbo_max = capacity; + if (wake) wake_up(&call->waitq); } +#if defined(CONFIG_X86) && __GNUC__ && !defined(__clang__) +/* Clang doesn't support the %z constraint modifier */ +#define shiftr_adv_rotr(shift_from, rotate_into) ({ \ + asm(" shr%z1 %1\n" \ + " inc %0\n" \ + " rcr%z2 %2\n" \ + : "+d"(shift_from), "+m"(*(shift_from)), "+rm"(rotate_into) \ + ); \ + }) +#else +#define shiftr_adv_rotr(shift_from, rotate_into) ({ \ + typeof(rotate_into) __bit0 = *(shift_from) & 1; \ + *(shift_from) >>= 1; \ + shift_from++; \ + rotate_into >>= 1; \ + rotate_into |= __bit0 << (sizeof(rotate_into) * 8 - 1); \ + }) +#endif + /* - * Determine how many nacks from the previous ACK have now been satisfied. + * Deal with RTT samples from soft ACKs. */ -static rxrpc_seq_t rxrpc_input_check_prev_ack(struct rxrpc_call *call, - struct rxrpc_ack_summary *summary, - rxrpc_seq_t seq) +static void rxrpc_input_soft_rtt(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq) { - struct sk_buff *skb = call->cong_last_nack; - struct rxrpc_ackpacket ack; - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - unsigned int i, new_acks = 0, retained_nacks = 0; - rxrpc_seq_t old_seq = sp->first_ack; - u8 *acks = skb->data + sizeof(struct rxrpc_wire_header) + sizeof(ack); - - if (after_eq(seq, old_seq + sp->nr_acks)) { - summary->nr_new_acks += sp->nr_nacks; - summary->nr_new_acks += seq - (old_seq + sp->nr_acks); - summary->nr_retained_nacks = 0; - } else if (seq == old_seq) { - summary->nr_retained_nacks = sp->nr_nacks; - } else { - for (i = 0; i < sp->nr_acks; i++) { - if (acks[i] == RXRPC_ACK_TYPE_NACK) { - if (before(old_seq + i, seq)) - new_acks++; - else - retained_nacks++; - } + for (int ix = 0; ix < RXRPC_NR_TXQUEUE; ix++) + if (summary->acked_serial == tq->segment_serial[ix]) + return rxrpc_add_data_rtt_sample(call, summary, tq, ix); +} + +/* + * Process a batch of soft ACKs specific to a transmission queue segment. + */ +static void rxrpc_input_soft_ack_tq(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + unsigned long extracted_acks, + int nr_reported, + rxrpc_seq_t seq, + rxrpc_seq_t *lowest_nak) +{ + unsigned long old_reported = 0, flipped, new_acks = 0; + unsigned long a_to_n, n_to_a = 0; + int new, a, n; + + if (tq->nr_reported_acks > 0) + old_reported = ~0UL >> (RXRPC_NR_TXQUEUE - tq->nr_reported_acks); + + _enter("{%x,%lx,%d},%lx,%d,%x", + tq->qbase, tq->segment_acked, tq->nr_reported_acks, + extracted_acks, nr_reported, seq); + + _debug("[%x]", tq->qbase); + _debug("tq %16lx %u", tq->segment_acked, tq->nr_reported_acks); + _debug("sack %16lx %u", extracted_acks, nr_reported); + + /* See how many previously logged ACKs/NAKs have flipped. */ + flipped = (tq->segment_acked ^ extracted_acks) & old_reported; + if (flipped) { + n_to_a = ~tq->segment_acked & flipped; /* Old NAK -> ACK */ + a_to_n = tq->segment_acked & flipped; /* Old ACK -> NAK */ + a = hweight_long(n_to_a); + n = hweight_long(a_to_n); + _debug("flip %16lx", flipped); + _debug("ntoa %16lx %d", n_to_a, a); + _debug("aton %16lx %d", a_to_n, n); + call->acks_nr_sacks += a - n; + call->acks_nr_snacks += n - a; + summary->nr_new_sacks += a; + summary->nr_new_snacks += n; + } + + /* See how many new ACKs/NAKs have been acquired. */ + new = nr_reported - tq->nr_reported_acks; + if (new > 0) { + new_acks = extracted_acks & ~old_reported; + if (new_acks) { + a = hweight_long(new_acks); + n = new - a; + _debug("new_a %16lx new=%d a=%d n=%d", new_acks, new, a, n); + call->acks_nr_sacks += a; + call->acks_nr_snacks += n; + summary->nr_new_sacks += a; + summary->nr_new_snacks += n; + } else { + call->acks_nr_snacks += new; + summary->nr_new_snacks += new; } + } + + tq->nr_reported_acks = nr_reported; + tq->segment_acked = extracted_acks; + trace_rxrpc_apply_acks(call, tq); + + if (extracted_acks != ~0UL) { + rxrpc_seq_t lowest = seq + ffz(extracted_acks); - summary->nr_new_acks += new_acks; - summary->nr_retained_nacks = retained_nacks; + if (before(lowest, *lowest_nak)) + *lowest_nak = lowest; } - return old_seq + sp->nr_acks; + if (summary->acked_serial) + rxrpc_input_soft_rtt(call, summary, tq); + + new_acks |= n_to_a; + if (new_acks) + rxrpc_input_rack(call, summary, tq, new_acks); + + if (call->tlp_serial && + rxrpc_seq_in_txq(tq, call->tlp_seq) && + test_bit(call->tlp_seq - tq->qbase, &new_acks)) + summary->tlp_probe_acked = true; } /* @@ -753,39 +958,50 @@ static rxrpc_seq_t rxrpc_input_check_prev_ack(struct rxrpc_call *call, */ static void rxrpc_input_soft_acks(struct rxrpc_call *call, struct rxrpc_ack_summary *summary, - struct sk_buff *skb, - rxrpc_seq_t seq, - rxrpc_seq_t since) + struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - unsigned int i, old_nacks = 0; - rxrpc_seq_t lowest_nak = seq + sp->nr_acks; + struct rxrpc_txqueue *tq = call->tx_queue; + unsigned long extracted = ~0UL; + unsigned int nr = 0; + rxrpc_seq_t seq = call->acks_hard_ack + 1; + rxrpc_seq_t lowest_nak = seq + sp->ack.nr_acks; u8 *acks = skb->data + sizeof(struct rxrpc_wire_header) + sizeof(struct rxrpc_ackpacket); - for (i = 0; i < sp->nr_acks; i++) { - if (acks[i] == RXRPC_ACK_TYPE_ACK) { - summary->nr_acks++; - if (after_eq(seq, since)) - summary->nr_new_acks++; - } else { - summary->saw_nacks = true; - if (before(seq, since)) { - /* Overlap with previous ACK */ - old_nacks++; - } else { - summary->nr_new_nacks++; - sp->nr_nacks++; - } + _enter("%x,%x,%u", tq->qbase, seq, sp->ack.nr_acks); + + while (after(seq, tq->qbase + RXRPC_NR_TXQUEUE - 1)) + tq = tq->next; - if (before(seq, lowest_nak)) - lowest_nak = seq; + for (unsigned int i = 0; i < sp->ack.nr_acks; i++) { + /* Decant ACKs until we hit a txqueue boundary. */ + shiftr_adv_rotr(acks, extracted); + if (i == 256) { + acks -= i; + i = 0; } seq++; + nr++; + if ((seq & RXRPC_TXQ_MASK) != 0) + continue; + + _debug("bound %16lx %u", extracted, nr); + + rxrpc_input_soft_ack_tq(call, summary, tq, extracted, RXRPC_NR_TXQUEUE, + seq - RXRPC_NR_TXQUEUE, &lowest_nak); + extracted = ~0UL; + nr = 0; + tq = tq->next; + prefetch(tq); } - if (lowest_nak != call->acks_lowest_nak) { - call->acks_lowest_nak = lowest_nak; - summary->new_low_nack = true; + if (nr) { + unsigned int nr_reported = seq & RXRPC_TXQ_MASK; + + extracted >>= RXRPC_NR_TXQUEUE - nr_reported; + _debug("tail %16lx %u", extracted, nr_reported); + rxrpc_input_soft_ack_tq(call, summary, tq, extracted, nr_reported, + seq & ~RXRPC_TXQ_MASK, &lowest_nak); } /* We *can* have more nacks than we did - the peer is permitted to drop @@ -793,9 +1009,14 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, * possible for the nack distribution to change whilst the number of * nacks stays the same or goes down. */ - if (old_nacks < summary->nr_retained_nacks) - summary->nr_new_acks += summary->nr_retained_nacks - old_nacks; - summary->nr_retained_nacks = old_nacks; + if (lowest_nak != call->acks_lowest_nak) { + call->acks_lowest_nak = lowest_nak; + summary->new_low_snack = true; + } + + _debug("summary A=%d+%d N=%d+%d", + call->acks_nr_sacks, summary->nr_new_sacks, + call->acks_nr_snacks, summary->nr_new_snacks); } /* @@ -803,21 +1024,21 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, * with respect to the ack state conveyed by preceding ACKs. */ static bool rxrpc_is_ack_valid(struct rxrpc_call *call, - rxrpc_seq_t first_pkt, rxrpc_seq_t prev_pkt) + rxrpc_seq_t hard_ack, rxrpc_seq_t prev_pkt) { - rxrpc_seq_t base = READ_ONCE(call->acks_first_seq); + rxrpc_seq_t base = READ_ONCE(call->acks_hard_ack); - if (after(first_pkt, base)) + if (after(hard_ack, base)) return true; /* The window advanced */ - if (before(first_pkt, base)) + if (before(hard_ack, base)) return false; /* firstPacket regressed */ if (after_eq(prev_pkt, call->acks_prev_seq)) return true; /* previousPacket hasn't regressed. */ /* Some rx implementations put a serial number in previousPacket. */ - if (after_eq(prev_pkt, base + call->tx_winsize)) + if (after(prev_pkt, base + call->tx_winsize)) return false; return true; } @@ -835,59 +1056,34 @@ static bool rxrpc_is_ack_valid(struct rxrpc_call *call, static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_ack_summary summary = { 0 }; - struct rxrpc_ackpacket ack; + struct rxrpc_acktrailer trailer; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct rxrpc_ackinfo info; - rxrpc_serial_t ack_serial, acked_serial; - rxrpc_seq_t first_soft_ack, hard_ack, prev_pkt, since; + rxrpc_seq_t first_soft_ack, hard_ack, prev_pkt; int nr_acks, offset, ioffset; _enter(""); - offset = sizeof(struct rxrpc_wire_header); - if (skb_copy_bits(skb, offset, &ack, sizeof(ack)) < 0) - return rxrpc_proto_abort(call, 0, rxrpc_badmsg_short_ack); - offset += sizeof(ack); - - ack_serial = sp->hdr.serial; - acked_serial = ntohl(ack.serial); - first_soft_ack = ntohl(ack.firstPacket); - prev_pkt = ntohl(ack.previousPacket); - hard_ack = first_soft_ack - 1; - nr_acks = ack.nAcks; - sp->first_ack = first_soft_ack; - sp->nr_acks = nr_acks; - summary.ack_reason = (ack.reason < RXRPC_ACK__INVALID ? - ack.reason : RXRPC_ACK__INVALID); - - trace_rxrpc_rx_ack(call, ack_serial, acked_serial, - first_soft_ack, prev_pkt, - summary.ack_reason, nr_acks); - rxrpc_inc_stat(call->rxnet, stat_rx_acks[ack.reason]); - - if (acked_serial != 0) { - switch (ack.reason) { - case RXRPC_ACK_PING_RESPONSE: - rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial, - rxrpc_rtt_rx_ping_response); - break; - case RXRPC_ACK_REQUESTED: - rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial, - rxrpc_rtt_rx_requested_ack); - break; - default: - rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial, - rxrpc_rtt_rx_other_ack); - break; - } - } + offset = sizeof(struct rxrpc_wire_header) + sizeof(struct rxrpc_ackpacket); + + summary.ack_serial = sp->hdr.serial; + first_soft_ack = sp->ack.first_ack; + prev_pkt = sp->ack.prev_ack; + nr_acks = sp->ack.nr_acks; + hard_ack = first_soft_ack - 1; + summary.acked_serial = sp->ack.acked_serial; + summary.ack_reason = (sp->ack.reason < RXRPC_ACK__INVALID ? + sp->ack.reason : RXRPC_ACK__INVALID); + + trace_rxrpc_rx_ack(call, sp); + rxrpc_inc_stat(call->rxnet, stat_rx_acks[summary.ack_reason]); + prefetch(call->tx_queue); /* If we get an EXCEEDS_WINDOW ACK from the server, it probably * indicates that the client address changed due to NAT. The server * lost the call because it switched to a different peer. */ - if (unlikely(ack.reason == RXRPC_ACK_EXCEEDS_WINDOW) && - first_soft_ack == 1 && + if (unlikely(summary.ack_reason == RXRPC_ACK_EXCEEDS_WINDOW) && + hard_ack == 0 && prev_pkt == 0 && rxrpc_is_client_call(call)) { rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, @@ -899,10 +1095,10 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) * indicate a change of address. However, we can retransmit the call * if we still have it buffered to the beginning. */ - if (unlikely(ack.reason == RXRPC_ACK_OUT_OF_SEQUENCE) && - first_soft_ack == 1 && + if (unlikely(summary.ack_reason == RXRPC_ACK_OUT_OF_SEQUENCE) && + hard_ack == 0 && prev_pkt == 0 && - call->acks_hard_ack == 0 && + call->tx_bottom == 0 && rxrpc_is_client_call(call)) { rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, 0, -ENETRESET); @@ -910,50 +1106,44 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) } /* Discard any out-of-order or duplicate ACKs (outside lock). */ - if (!rxrpc_is_ack_valid(call, first_soft_ack, prev_pkt)) { - trace_rxrpc_rx_discard_ack(call->debug_id, ack_serial, - first_soft_ack, call->acks_first_seq, - prev_pkt, call->acks_prev_seq); - goto send_response; + if (!rxrpc_is_ack_valid(call, hard_ack, prev_pkt)) { + trace_rxrpc_rx_discard_ack(call, summary.ack_serial, hard_ack, prev_pkt); + goto send_response; /* Still respond if requested. */ } - info.rxMTU = 0; + trailer.maxMTU = 0; ioffset = offset + nr_acks + 3; - if (skb->len >= ioffset + sizeof(info) && - skb_copy_bits(skb, ioffset, &info, sizeof(info)) < 0) - return rxrpc_proto_abort(call, 0, rxrpc_badmsg_short_ack_info); + if (skb->len >= ioffset + sizeof(trailer) && + skb_copy_bits(skb, ioffset, &trailer, sizeof(trailer)) < 0) + return rxrpc_proto_abort(call, 0, rxrpc_badmsg_short_ack_trailer); if (nr_acks > 0) skb_condense(skb); - if (call->cong_last_nack) { - since = rxrpc_input_check_prev_ack(call, &summary, first_soft_ack); - rxrpc_free_skb(call->cong_last_nack, rxrpc_skb_put_last_nack); - call->cong_last_nack = NULL; - } else { - summary.nr_new_acks = first_soft_ack - call->acks_first_seq; - call->acks_lowest_nak = first_soft_ack + nr_acks; - since = first_soft_ack; - } - - call->acks_latest_ts = skb->tstamp; - call->acks_first_seq = first_soft_ack; + call->acks_latest_ts = ktime_get_real(); + call->acks_hard_ack = hard_ack; call->acks_prev_seq = prev_pkt; - switch (ack.reason) { - case RXRPC_ACK_PING: - break; - default: - if (acked_serial && after(acked_serial, call->acks_highest_serial)) - call->acks_highest_serial = acked_serial; - break; + if (summary.acked_serial) { + switch (summary.ack_reason) { + case RXRPC_ACK_PING_RESPONSE: + rxrpc_complete_rtt_probe(call, call->acks_latest_ts, + summary.acked_serial, summary.ack_serial, + rxrpc_rtt_rx_ping_response); + break; + default: + if (after(summary.acked_serial, call->acks_highest_serial)) + call->acks_highest_serial = summary.acked_serial; + summary.rtt_sample_avail = true; + break; + } } /* Parse rwind and mtu sizes if provided. */ - if (info.rxMTU) - rxrpc_input_ackinfo(call, skb, &info); + if (trailer.maxMTU) + rxrpc_input_ack_trailer(call, skb, &trailer); - if (first_soft_ack == 0) + if (hard_ack + 1 == 0) return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_zero); /* Ignore ACKs unless we are or have just been transmitting. */ @@ -967,13 +1157,13 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) goto send_response; } - if (before(hard_ack, call->acks_hard_ack) || + if (before(hard_ack, call->tx_bottom) || after(hard_ack, call->tx_top)) return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_outside_window); if (nr_acks > call->tx_top - hard_ack) return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_sack_overflow); - if (after(hard_ack, call->acks_hard_ack)) { + if (after(hard_ack, call->tx_bottom)) { if (rxrpc_rotate_tx_window(call, hard_ack, &summary)) { rxrpc_end_tx_phase(call, false, rxrpc_eproto_unexpected_ack); goto send_response; @@ -983,25 +1173,30 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) if (nr_acks > 0) { if (offset > (int)skb->len - nr_acks) return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_short_sack); - rxrpc_input_soft_acks(call, &summary, skb, first_soft_ack, since); - rxrpc_get_skb(skb, rxrpc_skb_get_last_nack); - call->cong_last_nack = skb; + rxrpc_input_soft_acks(call, &summary, skb); } if (test_bit(RXRPC_CALL_TX_LAST, &call->flags) && - summary.nr_acks == call->tx_top - hard_ack && + call->acks_nr_sacks == call->tx_top - hard_ack && rxrpc_is_client_call(call)) - rxrpc_propose_ping(call, ack_serial, + rxrpc_propose_ping(call, summary.ack_serial, rxrpc_propose_ack_ping_for_lost_reply); - rxrpc_congestion_management(call, skb, &summary, acked_serial); + /* Drive the congestion management algorithm first and then RACK-TLP as + * the latter depends on the state/change in state in the former. + */ + rxrpc_congestion_management(call, &summary); + rxrpc_rack_detect_loss_and_arm_timer(call, &summary); + rxrpc_tlp_process_ack(call, &summary); + if (call->tlp_serial && after_eq(summary.acked_serial, call->tlp_serial)) + call->tlp_serial = 0; send_response: - if (ack.reason == RXRPC_ACK_PING) - rxrpc_send_ACK(call, RXRPC_ACK_PING_RESPONSE, ack_serial, + if (summary.ack_reason == RXRPC_ACK_PING) + rxrpc_send_ACK(call, RXRPC_ACK_PING_RESPONSE, summary.ack_serial, rxrpc_propose_ack_respond_to_ping); else if (sp->hdr.flags & RXRPC_REQUEST_ACK) - rxrpc_send_ACK(call, RXRPC_ACK_REQUESTED, ack_serial, + rxrpc_send_ACK(call, RXRPC_ACK_REQUESTED, summary.ack_serial, rxrpc_propose_ack_respond_to_ack); } @@ -1048,12 +1243,10 @@ void rxrpc_input_call_packet(struct rxrpc_call *call, struct sk_buff *skb) timo = READ_ONCE(call->next_rx_timo); if (timo) { - unsigned long now = jiffies, expect_rx_by; + ktime_t delay = ms_to_ktime(timo); - expect_rx_by = now + timo; - WRITE_ONCE(call->expect_rx_by, expect_rx_by); - rxrpc_reduce_call_timer(call, expect_rx_by, now, - rxrpc_timer_set_for_normal); + call->expect_rx_by = ktime_add(ktime_get_real(), delay); + trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_expect_rx); } switch (sp->hdr.type) { @@ -1102,5 +1295,5 @@ void rxrpc_implicit_end_call(struct rxrpc_call *call, struct sk_buff *skb) break; } - rxrpc_input_call_event(call, skb); + rxrpc_input_call_event(call); } diff --git a/net/rxrpc/input_rack.c b/net/rxrpc/input_rack.c new file mode 100644 index 000000000000..13c371261e0a --- /dev/null +++ b/net/rxrpc/input_rack.c @@ -0,0 +1,418 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* RACK-TLP [RFC8958] Implementation + * + * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include "ar-internal.h" + +static bool rxrpc_rack_sent_after(ktime_t t1, rxrpc_seq_t seq1, + ktime_t t2, rxrpc_seq_t seq2) +{ + if (ktime_after(t1, t2)) + return true; + return t1 == t2 && after(seq1, seq2); +} + +/* + * Mark a packet lost. + */ +static void rxrpc_rack_mark_lost(struct rxrpc_call *call, + struct rxrpc_txqueue *tq, unsigned int ix) +{ + if (__test_and_set_bit(ix, &tq->segment_lost)) { + if (__test_and_clear_bit(ix, &tq->segment_retransmitted)) + call->tx_nr_resent--; + } else { + call->tx_nr_lost++; + } + tq->segment_xmit_ts[ix] = UINT_MAX; +} + +/* + * Get the transmission time of a packet in the Tx queue. + */ +static ktime_t rxrpc_get_xmit_ts(const struct rxrpc_txqueue *tq, unsigned int ix) +{ + if (tq->segment_xmit_ts[ix] == UINT_MAX) + return KTIME_MAX; + return ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[ix]); +} + +/* + * Get a bitmask of nack bits for a queue segment and mask off any that aren't + * yet reported. + */ +static unsigned long rxrpc_tq_nacks(const struct rxrpc_txqueue *tq) +{ + unsigned long nacks = ~tq->segment_acked; + + if (tq->nr_reported_acks < RXRPC_NR_TXQUEUE) + nacks &= (1UL << tq->nr_reported_acks) - 1; + return nacks; +} + +/* + * Update the RACK state for the most recently sent packet that has been + * delivered [RFC8958 6.2 Step 2]. + */ +static void rxrpc_rack_update(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + unsigned int ix) +{ + rxrpc_seq_t seq = tq->qbase + ix; + ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix); + ktime_t rtt = ktime_sub(call->acks_latest_ts, xmit_ts); + + if (__test_and_clear_bit(ix, &tq->segment_lost)) + call->tx_nr_lost--; + + if (test_bit(ix, &tq->segment_retransmitted)) { + /* Use Rx.serial instead of TCP.ACK.ts_option.echo_reply. */ + if (before(call->acks_highest_serial, tq->segment_serial[ix])) + return; + if (rtt < minmax_get(&call->min_rtt)) + return; + } + + /* The RACK algorithm requires the segment ACKs to be traversed in + * order of segment transmission - but the only thing this seems to + * matter for is that RACK.rtt is set to the rtt of the most recently + * transmitted segment. We should be able to achieve the same by only + * setting RACK.rtt if the xmit time is greater. + */ + if (ktime_after(xmit_ts, call->rack_rtt_ts)) { + call->rack_rtt = rtt; + call->rack_rtt_ts = xmit_ts; + } + + if (rxrpc_rack_sent_after(xmit_ts, seq, call->rack_xmit_ts, call->rack_end_seq)) { + call->rack_rtt = rtt; + call->rack_xmit_ts = xmit_ts; + call->rack_end_seq = seq; + } +} + +/* + * Detect data segment reordering [RFC8958 6.2 Step 3]. + */ +static void rxrpc_rack_detect_reordering(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + unsigned int ix) +{ + rxrpc_seq_t seq = tq->qbase + ix; + + /* Track the highest sequence number so far ACK'd. This is not + * necessarily the same as ack.firstPacket + ack.nAcks - 1 as the peer + * could put a NACK in the last SACK slot. + */ + if (after(seq, call->rack_fack)) + call->rack_fack = seq; + else if (before(seq, call->rack_fack) && + test_bit(ix, &tq->segment_retransmitted)) + call->rack_reordering_seen = true; +} + +void rxrpc_input_rack_one(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + unsigned int ix) +{ + rxrpc_rack_update(call, summary, tq, ix); + rxrpc_rack_detect_reordering(call, summary, tq, ix); +} + +void rxrpc_input_rack(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + unsigned long new_acks) +{ + while (new_acks) { + unsigned int ix = __ffs(new_acks); + + __clear_bit(ix, &new_acks); + rxrpc_input_rack_one(call, summary, tq, ix); + } + + trace_rxrpc_rack_update(call, summary); +} + +/* + * Update the reordering window [RFC8958 6.2 Step 4]. Returns the updated + * duration of the reordering window. + * + * Note that the Rx protocol doesn't have a 'DSACK option' per se, but ACKs can + * be given a 'DUPLICATE' reason with the serial number referring to the + * duplicated DATA packet. Rx does not inform as to whether this was a + * reception of the same packet twice or of a retransmission of a packet we + * already received (though this could be determined by the transmitter based + * on the serial number). + */ +static ktime_t rxrpc_rack_update_reo_wnd(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary) +{ + rxrpc_seq_t snd_una = call->acks_lowest_nak; /* Lowest unack'd seq */ + rxrpc_seq_t snd_nxt = call->tx_transmitted + 1; /* Next seq to be sent */ + bool have_dsack_option = summary->ack_reason == RXRPC_ACK_DUPLICATE; + int dup_thresh = 3; + + /* DSACK-based reordering window adaptation */ + if (!call->rack_dsack_round_none && + after_eq(snd_una, call->rack_dsack_round)) + call->rack_dsack_round_none = true; + + /* Grow the reordering window per round that sees DSACK. Reset the + * window after 16 DSACK-free recoveries. + */ + if (call->rack_dsack_round_none && have_dsack_option) { + call->rack_dsack_round_none = false; + call->rack_dsack_round = snd_nxt; + call->rack_reo_wnd_mult++; + call->rack_reo_wnd_persist = 16; + } else if (summary->exiting_fast_or_rto_recovery) { + call->rack_reo_wnd_persist--; + if (call->rack_reo_wnd_persist <= 0) + call->rack_reo_wnd_mult = 1; + } + + if (!call->rack_reordering_seen) { + if (summary->in_fast_or_rto_recovery) + return 0; + if (call->acks_nr_sacks >= dup_thresh) + return 0; + } + + return us_to_ktime(umin(call->rack_reo_wnd_mult * minmax_get(&call->min_rtt) / 4, + call->srtt_us >> 3)); +} + +/* + * Detect losses [RFC8958 6.2 Step 5]. + */ +static ktime_t rxrpc_rack_detect_loss(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary) +{ + struct rxrpc_txqueue *tq; + ktime_t timeout = 0, lost_after, now = ktime_get_real(); + + call->rack_reo_wnd = rxrpc_rack_update_reo_wnd(call, summary); + lost_after = ktime_add(call->rack_rtt, call->rack_reo_wnd); + trace_rxrpc_rack_scan_loss(call); + + for (tq = call->tx_queue; tq; tq = tq->next) { + unsigned long nacks = rxrpc_tq_nacks(tq); + + if (after(tq->qbase, call->tx_transmitted)) + break; + trace_rxrpc_rack_scan_loss_tq(call, tq, nacks); + + /* Skip ones marked lost but not yet retransmitted */ + nacks &= ~tq->segment_lost | tq->segment_retransmitted; + + while (nacks) { + unsigned int ix = __ffs(nacks); + rxrpc_seq_t seq = tq->qbase + ix; + ktime_t remaining; + ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix); + + __clear_bit(ix, &nacks); + + if (rxrpc_rack_sent_after(call->rack_xmit_ts, call->rack_end_seq, + xmit_ts, seq)) { + remaining = ktime_sub(ktime_add(xmit_ts, lost_after), now); + if (remaining <= 0) { + rxrpc_rack_mark_lost(call, tq, ix); + trace_rxrpc_rack_detect_loss(call, summary, seq); + } else { + timeout = max(remaining, timeout); + } + } + } + } + + return timeout; +} + +/* + * Detect losses and set a timer to retry the detection [RFC8958 6.2 Step 5]. + */ +void rxrpc_rack_detect_loss_and_arm_timer(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary) +{ + ktime_t timeout = rxrpc_rack_detect_loss(call, summary); + + if (timeout) { + call->rack_timer_mode = RXRPC_CALL_RACKTIMER_RACK_REORDER; + call->rack_timo_at = ktime_add(ktime_get_real(), timeout); + trace_rxrpc_rack_timer(call, timeout, false); + trace_rxrpc_timer_set(call, timeout, rxrpc_timer_trace_rack_reo); + } +} + +/* + * Handle RACK-TLP RTO expiration [RFC8958 6.3]. + */ +static void rxrpc_rack_mark_losses_on_rto(struct rxrpc_call *call) +{ + struct rxrpc_txqueue *tq; + rxrpc_seq_t snd_una = call->acks_lowest_nak; /* Lowest unack'd seq */ + ktime_t lost_after = ktime_add(call->rack_rtt, call->rack_reo_wnd); + ktime_t deadline = ktime_sub(ktime_get_real(), lost_after); + + for (tq = call->tx_queue; tq; tq = tq->next) { + unsigned long unacked = ~tq->segment_acked; + + trace_rxrpc_rack_mark_loss_tq(call, tq); + while (unacked) { + unsigned int ix = __ffs(unacked); + rxrpc_seq_t seq = tq->qbase + ix; + ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix); + + if (after(seq, call->tx_transmitted)) + return; + __clear_bit(ix, &unacked); + + if (seq == snd_una || + ktime_before(xmit_ts, deadline)) + rxrpc_rack_mark_lost(call, tq, ix); + } + } +} + +/* + * Calculate the TLP loss probe timeout (PTO) [RFC8958 7.2]. + */ +ktime_t rxrpc_tlp_calc_pto(struct rxrpc_call *call, ktime_t now) +{ + unsigned int flight_size = rxrpc_tx_in_flight(call); + ktime_t rto_at = ktime_add(call->tx_last_sent, + rxrpc_get_rto_backoff(call, false)); + ktime_t pto; + + if (call->rtt_count > 0) { + /* Use 2*SRTT as the timeout. */ + pto = ns_to_ktime(call->srtt_us * NSEC_PER_USEC / 4); + if (flight_size) + pto = ktime_add(pto, call->tlp_max_ack_delay); + } else { + pto = NSEC_PER_SEC; + } + + if (ktime_after(ktime_add(now, pto), rto_at)) + pto = ktime_sub(rto_at, now); + return pto; +} + +/* + * Send a TLP loss probe on PTO expiration [RFC8958 7.3]. + */ +void rxrpc_tlp_send_probe(struct rxrpc_call *call) +{ + unsigned int in_flight = rxrpc_tx_in_flight(call); + + if (after_eq(call->acks_hard_ack, call->tx_transmitted)) + return; /* Everything we transmitted has been acked. */ + + /* There must be no other loss probe still in flight and we need to + * have taken a new RTT sample since last probe or the start of + * connection. + */ + if (!call->tlp_serial && + call->tlp_rtt_taken != call->rtt_taken) { + call->tlp_is_retrans = false; + if (after(call->send_top, call->tx_transmitted) && + rxrpc_tx_window_space(call) > 0) { + /* Transmit the lowest-sequence unsent DATA */ + call->tx_last_serial = 0; + rxrpc_transmit_some_data(call, 1, rxrpc_txdata_tlp_new_data); + call->tlp_serial = call->tx_last_serial; + call->tlp_seq = call->tx_transmitted; + trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_transmit_new); + in_flight = rxrpc_tx_in_flight(call); + } else { + /* Retransmit the highest-sequence DATA sent */ + call->tx_last_serial = 0; + rxrpc_resend_tlp(call); + call->tlp_is_retrans = true; + trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_retransmit); + } + } else { + trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_busy); + } + + if (in_flight != 0) { + ktime_t rto = rxrpc_get_rto_backoff(call, false); + + call->rack_timer_mode = RXRPC_CALL_RACKTIMER_RTO; + call->rack_timo_at = ktime_add(ktime_get_real(), rto); + trace_rxrpc_rack_timer(call, rto, false); + trace_rxrpc_timer_set(call, rto, rxrpc_timer_trace_rack_rto); + } +} + +/* + * Detect losses using the ACK of a TLP loss probe [RFC8958 7.4]. + */ +void rxrpc_tlp_process_ack(struct rxrpc_call *call, struct rxrpc_ack_summary *summary) +{ + if (!call->tlp_serial || after(call->tlp_seq, call->acks_hard_ack)) + return; + + if (!call->tlp_is_retrans) { + /* TLP of new data delivered */ + trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_new_data); + call->tlp_serial = 0; + } else if (summary->ack_reason == RXRPC_ACK_DUPLICATE && + summary->acked_serial == call->tlp_serial) { + /* General Case: Detected packet losses using RACK [7.4.1] */ + trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_dup_acked); + call->tlp_serial = 0; + } else if (after(call->acks_hard_ack, call->tlp_seq)) { + /* Repaired the single loss */ + trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_hard_beyond); + call->tlp_serial = 0; + // TODO: Invoke congestion control to react to the loss + // event the probe has repaired + } else if (summary->tlp_probe_acked) { + trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_acked); + /* Special Case: Detected a single loss repaired by the loss + * probe [7.4.2] + */ + call->tlp_serial = 0; + } else { + trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_incomplete); + } +} + +/* + * Handle RACK timer expiration; returns true to request a resend. + */ +void rxrpc_rack_timer_expired(struct rxrpc_call *call, ktime_t overran_by) +{ + struct rxrpc_ack_summary summary = {}; + enum rxrpc_rack_timer_mode mode = call->rack_timer_mode; + + trace_rxrpc_rack_timer(call, overran_by, true); + call->rack_timer_mode = RXRPC_CALL_RACKTIMER_OFF; + + switch (mode) { + case RXRPC_CALL_RACKTIMER_RACK_REORDER: + rxrpc_rack_detect_loss_and_arm_timer(call, &summary); + break; + case RXRPC_CALL_RACKTIMER_TLP_PTO: + rxrpc_tlp_send_probe(call); + break; + case RXRPC_CALL_RACKTIMER_RTO: + // Might need to poke the congestion algo in some way + rxrpc_rack_mark_losses_on_rto(call); + break; + //case RXRPC_CALL_RACKTIMER_ZEROWIN: + default: + pr_warn("Unexpected rack timer %u", call->rack_timer_mode); + } +} diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index 34353b6e584b..e068f9b79d02 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -15,18 +15,18 @@ static int none_init_connection_security(struct rxrpc_connection *conn, } /* - * Work out how much data we can put in an unsecured packet. + * Allocate an appropriately sized buffer for the amount of data remaining. */ -static int none_how_much_data(struct rxrpc_call *call, size_t remain, - size_t *_buf_size, size_t *_data_size, size_t *_offset) +static struct rxrpc_txbuf *none_alloc_txbuf(struct rxrpc_call *call, size_t remain, gfp_t gfp) { - *_buf_size = *_data_size = min_t(size_t, remain, RXRPC_JUMBO_DATALEN); - *_offset = 0; - return 0; + return rxrpc_alloc_data_txbuf(call, umin(remain, RXRPC_JUMBO_DATALEN), 1, gfp); } static int none_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) { + txb->pkt_len = txb->len; + if (txb->len == RXRPC_JUMBO_DATALEN) + txb->jumboable = true; return 0; } @@ -79,7 +79,7 @@ const struct rxrpc_security rxrpc_no_security = { .exit = none_exit, .init_connection_security = none_init_connection_security, .free_call_crypto = none_free_call_crypto, - .how_much_data = none_how_much_data, + .alloc_txbuf = none_alloc_txbuf, .secure_packet = none_secure_packet, .verify_packet = none_verify_packet, .respond_to_challenge = none_respond_to_challenge, diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index 4a3a08a0e2cd..64f8d77b8731 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -27,11 +27,17 @@ int rxrpc_encap_rcv(struct sock *udp_sk, struct sk_buff *skb) { struct sk_buff_head *rx_queue; struct rxrpc_local *local = rcu_dereference_sk_user_data(udp_sk); + struct task_struct *io_thread; if (unlikely(!local)) { kfree_skb(skb); return 0; } + io_thread = READ_ONCE(local->io_thread); + if (!io_thread) { + kfree_skb(skb); + return 0; + } if (skb->tstamp == 0) skb->tstamp = ktime_get_real(); @@ -47,7 +53,7 @@ int rxrpc_encap_rcv(struct sock *udp_sk, struct sk_buff *skb) #endif skb_queue_tail(rx_queue, skb); - rxrpc_wake_up_io_thread(local); + wake_up_process(io_thread); return 0; } @@ -124,6 +130,7 @@ static bool rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) { struct rxrpc_wire_header whdr; + struct rxrpc_ackpacket ack; /* dig out the RxRPC connection details */ if (skb_copy_bits(skb, 0, &whdr, sizeof(whdr)) < 0) @@ -141,6 +148,16 @@ static bool rxrpc_extract_header(struct rxrpc_skb_priv *sp, sp->hdr.securityIndex = whdr.securityIndex; sp->hdr._rsvd = ntohs(whdr._rsvd); sp->hdr.serviceId = ntohs(whdr.serviceId); + + if (sp->hdr.type == RXRPC_PACKET_TYPE_ACK) { + if (skb_copy_bits(skb, sizeof(whdr), &ack, sizeof(ack)) < 0) + return rxrpc_bad_message(skb, rxrpc_badmsg_short_ack); + sp->ack.first_ack = ntohl(ack.firstPacket); + sp->ack.prev_ack = ntohl(ack.previousPacket); + sp->ack.acked_serial = ntohl(ack.serial); + sp->ack.reason = ack.reason; + sp->ack.nr_acks = ack.nAcks; + } return true; } @@ -321,7 +338,6 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, struct rxrpc_channel *chan; struct rxrpc_call *call = NULL; unsigned int channel; - bool ret; if (sp->hdr.securityIndex != conn->security_ix) return rxrpc_direct_abort(skb, rxrpc_eproto_wrong_security, @@ -347,6 +363,12 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, if (sp->hdr.callNumber == 0) return rxrpc_input_conn_packet(conn, skb); + /* Deal with path MTU discovery probing. */ + if (sp->hdr.type == RXRPC_PACKET_TYPE_ACK && + conn->pmtud_probe && + after_eq(sp->ack.acked_serial, conn->pmtud_probe)) + rxrpc_input_probe_for_pmtud(conn, sp->ack.acked_serial, false); + /* Call-bound packets are routed by connection channel. */ channel = sp->hdr.cid & RXRPC_CHANNELMASK; chan = &conn->channels[channel]; @@ -402,9 +424,9 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, peer_srx, skb); } - ret = rxrpc_input_call_event(call, skb); + rxrpc_queue_rx_call_packet(call, skb); rxrpc_put_call(call, rxrpc_call_put_input); - return ret; + return true; } /* @@ -421,6 +443,8 @@ int rxrpc_io_thread(void *data) ktime_t now; #endif bool should_stop; + LIST_HEAD(conn_attend_q); + LIST_HEAD(call_attend_q); complete(&local->io_thread_ready); @@ -431,43 +455,26 @@ int rxrpc_io_thread(void *data) for (;;) { rxrpc_inc_stat(local->rxnet, stat_io_loop); - /* Deal with connections that want immediate attention. */ - conn = list_first_entry_or_null(&local->conn_attend_q, - struct rxrpc_connection, - attend_link); - if (conn) { - spin_lock_bh(&local->lock); - list_del_init(&conn->attend_link); - spin_unlock_bh(&local->lock); - - rxrpc_input_conn_event(conn, NULL); - rxrpc_put_connection(conn, rxrpc_conn_put_poke); - continue; + /* Inject a delay into packets if requested. */ +#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY + now = ktime_get_real(); + while ((skb = skb_peek(&local->rx_delay_queue))) { + if (ktime_before(now, skb->tstamp)) + break; + skb = skb_dequeue(&local->rx_delay_queue); + skb_queue_tail(&local->rx_queue, skb); } +#endif - if (test_and_clear_bit(RXRPC_CLIENT_CONN_REAP_TIMER, - &local->client_conn_flags)) - rxrpc_discard_expired_client_conns(local); - - /* Deal with calls that want immediate attention. */ - if ((call = list_first_entry_or_null(&local->call_attend_q, - struct rxrpc_call, - attend_link))) { - spin_lock_bh(&local->lock); - list_del_init(&call->attend_link); - spin_unlock_bh(&local->lock); - - trace_rxrpc_call_poked(call); - rxrpc_input_call_event(call, NULL); - rxrpc_put_call(call, rxrpc_call_put_poke); - continue; + if (!skb_queue_empty(&local->rx_queue)) { + spin_lock_irq(&local->rx_queue.lock); + skb_queue_splice_tail_init(&local->rx_queue, &rx_queue); + spin_unlock_irq(&local->rx_queue.lock); + trace_rxrpc_iothread_rx(local, skb_queue_len(&rx_queue)); } - if (!list_empty(&local->new_client_calls)) - rxrpc_connect_client_calls(local); - - /* Process received packets and errors. */ - if ((skb = __skb_dequeue(&rx_queue))) { + /* Distribute packets and errors. */ + while ((skb = __skb_dequeue(&rx_queue))) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); switch (skb->mark) { case RXRPC_SKB_MARK_PACKET: @@ -491,27 +498,46 @@ int rxrpc_io_thread(void *data) rxrpc_free_skb(skb, rxrpc_skb_put_unknown); break; } - continue; } - /* Inject a delay into packets if requested. */ -#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY - now = ktime_get_real(); - while ((skb = skb_peek(&local->rx_delay_queue))) { - if (ktime_before(now, skb->tstamp)) - break; - skb = skb_dequeue(&local->rx_delay_queue); - skb_queue_tail(&local->rx_queue, skb); + /* Deal with connections that want immediate attention. */ + spin_lock_irq(&local->lock); + list_splice_tail_init(&local->conn_attend_q, &conn_attend_q); + spin_unlock_irq(&local->lock); + + while ((conn = list_first_entry_or_null(&conn_attend_q, + struct rxrpc_connection, + attend_link))) { + spin_lock_irq(&local->lock); + list_del_init(&conn->attend_link); + spin_unlock_irq(&local->lock); + rxrpc_input_conn_event(conn, NULL); + rxrpc_put_connection(conn, rxrpc_conn_put_poke); } -#endif - if (!skb_queue_empty(&local->rx_queue)) { - spin_lock_irq(&local->rx_queue.lock); - skb_queue_splice_tail_init(&local->rx_queue, &rx_queue); - spin_unlock_irq(&local->rx_queue.lock); - continue; + if (test_and_clear_bit(RXRPC_CLIENT_CONN_REAP_TIMER, + &local->client_conn_flags)) + rxrpc_discard_expired_client_conns(local); + + /* Deal with calls that want immediate attention. */ + spin_lock_irq(&local->lock); + list_splice_tail_init(&local->call_attend_q, &call_attend_q); + spin_unlock_irq(&local->lock); + + while ((call = list_first_entry_or_null(&call_attend_q, + struct rxrpc_call, + attend_link))) { + spin_lock_irq(&local->lock); + list_del_init(&call->attend_link); + spin_unlock_irq(&local->lock); + trace_rxrpc_call_poked(call); + rxrpc_input_call_event(call); + rxrpc_put_call(call, rxrpc_call_put_poke); } + if (!list_empty(&local->new_client_calls)) + rxrpc_connect_client_calls(local); + set_current_state(TASK_INTERRUPTIBLE); should_stop = kthread_should_stop(); if (!skb_queue_empty(&local->rx_queue) || @@ -541,7 +567,7 @@ int rxrpc_io_thread(void *data) } timeout = nsecs_to_jiffies(delay_ns); - timeout = max(timeout, 1UL); + timeout = umax(timeout, 1); schedule_timeout(timeout); __set_current_state(TASK_RUNNING); continue; @@ -554,7 +580,7 @@ int rxrpc_io_thread(void *data) __set_current_state(TASK_RUNNING); rxrpc_see_local(local, rxrpc_local_stop); rxrpc_destroy_local(local); - local->io_thread = NULL; + WRITE_ONCE(local->io_thread, NULL); rxrpc_see_local(local, rxrpc_local_stopped); return 0; } diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 34d307368135..a74a4b43904f 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -215,9 +215,6 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) /* we want to set the don't fragment bit */ rxrpc_local_dont_fragment(local, true); - - /* We want receive timestamps. */ - sock_enable_timestamps(usk); break; default: @@ -232,7 +229,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) } wait_for_completion(&local->io_thread_ready); - local->io_thread = io_thread; + WRITE_ONCE(local->io_thread, io_thread); _leave(" = 0"); return 0; @@ -452,6 +449,7 @@ void rxrpc_destroy_local(struct rxrpc_local *local) #endif rxrpc_purge_queue(&local->rx_queue); rxrpc_purge_client_connections(local); + page_frag_cache_drain(&local->tx_alloc); } /* diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 825b81183046..8fcc8139d771 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -17,22 +17,22 @@ unsigned int rxrpc_max_backlog __read_mostly = 10; /* - * How long to wait before scheduling an ACK with subtype DELAY (in jiffies). + * How long to wait before scheduling an ACK with subtype DELAY (in ms). * * We use this when we've received new data packets. If those packets aren't * all consumed within this time we will send a DELAY ACK if an ACK was not * requested to let the sender know it doesn't need to resend. */ -unsigned long rxrpc_soft_ack_delay = HZ; +unsigned long rxrpc_soft_ack_delay = 1000; /* - * How long to wait before scheduling an ACK with subtype IDLE (in jiffies). + * How long to wait before scheduling an ACK with subtype IDLE (in ms). * * We use this when we've consumed some previously soft-ACK'd packets when * further packets aren't immediately received to decide when to send an IDLE * ACK let the other end know that it can free up its Tx buffer space. */ -unsigned long rxrpc_idle_ack_delay = HZ / 2; +unsigned long rxrpc_idle_ack_delay = 500; /* * Receive window size in packets. This indicates the maximum number of @@ -46,13 +46,13 @@ unsigned int rxrpc_rx_window_size = 255; * Maximum Rx MTU size. This indicates to the sender the size of jumbo packet * made by gluing normal packets together that we're willing to handle. */ -unsigned int rxrpc_rx_mtu = 5692; +unsigned int rxrpc_rx_mtu = RXRPC_JUMBO(46); /* * The maximum number of fragments in a received jumbo packet that we tell the * sender that we're willing to handle. */ -unsigned int rxrpc_rx_jumbo_max = 4; +unsigned int rxrpc_rx_jumbo_max = 46; #ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY /* diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 4a292f860ae3..95905b85a8d7 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -48,12 +48,10 @@ static const char rxrpc_keepalive_string[] = ""; static void rxrpc_tx_backoff(struct rxrpc_call *call, int ret) { if (ret < 0) { - u16 tx_backoff = READ_ONCE(call->tx_backoff); - - if (tx_backoff < HZ) - WRITE_ONCE(call->tx_backoff, tx_backoff + 1); + if (call->tx_backoff < 1000) + call->tx_backoff += 100; } else { - WRITE_ONCE(call->tx_backoff, 0); + call->tx_backoff = 0; } } @@ -65,84 +63,66 @@ static void rxrpc_tx_backoff(struct rxrpc_call *call, int ret) * Receiving a response to the ping will prevent the ->expect_rx_by timer from * expiring. */ -static void rxrpc_set_keepalive(struct rxrpc_call *call) +static void rxrpc_set_keepalive(struct rxrpc_call *call, ktime_t now) { - unsigned long now = jiffies, keepalive_at = call->next_rx_timo / 6; + ktime_t delay = ms_to_ktime(READ_ONCE(call->next_rx_timo) / 6); - keepalive_at += now; - WRITE_ONCE(call->keepalive_at, keepalive_at); - rxrpc_reduce_call_timer(call, keepalive_at, now, - rxrpc_timer_set_for_keepalive); + call->keepalive_at = ktime_add(ktime_get_real(), delay); + trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_keepalive); } /* - * Fill out an ACK packet. + * Allocate transmission buffers for an ACK and attach them to local->kv[]. */ -static size_t rxrpc_fill_out_ack(struct rxrpc_connection *conn, - struct rxrpc_call *call, - struct rxrpc_txbuf *txb, - u16 *_rwind) +static int rxrpc_alloc_ack(struct rxrpc_call *call, size_t sack_size) { - struct rxrpc_ackinfo ackinfo; - unsigned int qsize, sack, wrap, to; - rxrpc_seq_t window, wtop; - int rsize; - u32 mtu, jmax; - u8 *ackp = txb->acks; - - call->ackr_nr_unacked = 0; - atomic_set(&call->ackr_nr_consumed, 0); - rxrpc_inc_stat(call->rxnet, stat_tx_ack_fill); - clear_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags); - - window = call->ackr_window; - wtop = call->ackr_wtop; - sack = call->ackr_sack_base % RXRPC_SACK_SIZE; - txb->ack.firstPacket = htonl(window); - txb->ack.nAcks = wtop - window; - - if (after(wtop, window)) { - wrap = RXRPC_SACK_SIZE - sack; - to = min_t(unsigned int, txb->ack.nAcks, RXRPC_SACK_SIZE); - - if (sack + txb->ack.nAcks <= RXRPC_SACK_SIZE) { - memcpy(txb->acks, call->ackr_sack_table + sack, txb->ack.nAcks); - } else { - memcpy(txb->acks, call->ackr_sack_table + sack, wrap); - memcpy(txb->acks + wrap, call->ackr_sack_table, - to - wrap); + struct rxrpc_wire_header *whdr; + struct rxrpc_acktrailer *trailer; + struct rxrpc_ackpacket *ack; + struct kvec *kv = call->local->kvec; + gfp_t gfp = rcu_read_lock_held() ? GFP_ATOMIC | __GFP_NOWARN : GFP_NOFS; + void *buf, *buf2 = NULL; + u8 *filler; + + buf = page_frag_alloc(&call->local->tx_alloc, + sizeof(*whdr) + sizeof(*ack) + 1 + 3 + sizeof(*trailer), gfp); + if (!buf) + return -ENOMEM; + + if (sack_size) { + buf2 = page_frag_alloc(&call->local->tx_alloc, sack_size, gfp); + if (!buf2) { + page_frag_free(buf); + return -ENOMEM; } - - ackp += to; - } else if (before(wtop, window)) { - pr_warn("ack window backward %x %x", window, wtop); - } else if (txb->ack.reason == RXRPC_ACK_DELAY) { - txb->ack.reason = RXRPC_ACK_IDLE; } - mtu = conn->peer->if_mtu; - mtu -= conn->peer->hdrsize; - jmax = rxrpc_rx_jumbo_max; - qsize = (window - 1) - call->rx_consumed; - rsize = max_t(int, call->rx_winsize - qsize, 0); - *_rwind = rsize; - ackinfo.rxMTU = htonl(rxrpc_rx_mtu); - ackinfo.maxMTU = htonl(mtu); - ackinfo.rwind = htonl(rsize); - ackinfo.jumbo_max = htonl(jmax); - - *ackp++ = 0; - *ackp++ = 0; - *ackp++ = 0; - memcpy(ackp, &ackinfo, sizeof(ackinfo)); - return txb->ack.nAcks + 3 + sizeof(ackinfo); + whdr = buf; + ack = buf + sizeof(*whdr); + filler = buf + sizeof(*whdr) + sizeof(*ack) + 1; + trailer = buf + sizeof(*whdr) + sizeof(*ack) + 1 + 3; + + kv[0].iov_base = whdr; + kv[0].iov_len = sizeof(*whdr) + sizeof(*ack); + kv[1].iov_base = buf2; + kv[1].iov_len = sack_size; + kv[2].iov_base = filler; + kv[2].iov_len = 3 + sizeof(*trailer); + return 3; /* Number of kvec[] used. */ +} + +static void rxrpc_free_ack(struct rxrpc_call *call) +{ + page_frag_free(call->local->kvec[0].iov_base); + if (call->local->kvec[1].iov_base) + page_frag_free(call->local->kvec[1].iov_base); } /* * Record the beginning of an RTT probe. */ -static int rxrpc_begin_rtt_probe(struct rxrpc_call *call, rxrpc_serial_t serial, - enum rxrpc_rtt_tx_trace why) +static void rxrpc_begin_rtt_probe(struct rxrpc_call *call, rxrpc_serial_t serial, + ktime_t now, enum rxrpc_rtt_tx_trace why) { unsigned long avail = call->rtt_avail; int rtt_slot = 9; @@ -155,47 +135,126 @@ static int rxrpc_begin_rtt_probe(struct rxrpc_call *call, rxrpc_serial_t serial, goto no_slot; call->rtt_serial[rtt_slot] = serial; - call->rtt_sent_at[rtt_slot] = ktime_get_real(); + call->rtt_sent_at[rtt_slot] = now; smp_wmb(); /* Write data before avail bit */ set_bit(rtt_slot + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail); trace_rxrpc_rtt_tx(call, why, rtt_slot, serial); - return rtt_slot; + return; no_slot: trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_no_slot, rtt_slot, serial); - return -1; } /* - * Cancel an RTT probe. + * Fill out an ACK packet. */ -static void rxrpc_cancel_rtt_probe(struct rxrpc_call *call, - rxrpc_serial_t serial, int rtt_slot) +static int rxrpc_fill_out_ack(struct rxrpc_call *call, int nr_kv, u8 ack_reason, + rxrpc_serial_t serial_to_ack, rxrpc_serial_t *_ack_serial) { - if (rtt_slot != -1) { - clear_bit(rtt_slot + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail); - smp_wmb(); /* Clear pending bit before setting slot */ - set_bit(rtt_slot, &call->rtt_avail); - trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_cancel, rtt_slot, serial); + struct kvec *kv = call->local->kvec; + struct rxrpc_wire_header *whdr = kv[0].iov_base; + struct rxrpc_acktrailer *trailer = kv[2].iov_base + 3; + struct rxrpc_ackpacket *ack = (struct rxrpc_ackpacket *)(whdr + 1); + unsigned int qsize, sack, wrap, to, max_mtu, if_mtu; + rxrpc_seq_t window, wtop; + ktime_t now = ktime_get_real(); + int rsize; + u8 *filler = kv[2].iov_base; + u8 *sackp = kv[1].iov_base; + + rxrpc_inc_stat(call->rxnet, stat_tx_ack_fill); + + window = call->ackr_window; + wtop = call->ackr_wtop; + sack = call->ackr_sack_base % RXRPC_SACK_SIZE; + + *_ack_serial = rxrpc_get_next_serial(call->conn); + + whdr->epoch = htonl(call->conn->proto.epoch); + whdr->cid = htonl(call->cid); + whdr->callNumber = htonl(call->call_id); + whdr->serial = htonl(*_ack_serial); + whdr->seq = 0; + whdr->type = RXRPC_PACKET_TYPE_ACK; + whdr->flags = call->conn->out_clientflag | RXRPC_SLOW_START_OK; + whdr->userStatus = 0; + whdr->securityIndex = call->security_ix; + whdr->_rsvd = 0; + whdr->serviceId = htons(call->dest_srx.srx_service); + + ack->bufferSpace = 0; + ack->maxSkew = 0; + ack->firstPacket = htonl(window); + ack->previousPacket = htonl(call->rx_highest_seq); + ack->serial = htonl(serial_to_ack); + ack->reason = ack_reason; + ack->nAcks = wtop - window; + filler[0] = 0; + filler[1] = 0; + filler[2] = 0; + + if (ack_reason == RXRPC_ACK_PING) + whdr->flags |= RXRPC_REQUEST_ACK; + + if (after(wtop, window)) { + kv[1].iov_len = ack->nAcks; + + wrap = RXRPC_SACK_SIZE - sack; + to = umin(ack->nAcks, RXRPC_SACK_SIZE); + + if (sack + ack->nAcks <= RXRPC_SACK_SIZE) { + memcpy(sackp, call->ackr_sack_table + sack, ack->nAcks); + } else { + memcpy(sackp, call->ackr_sack_table + sack, wrap); + memcpy(sackp + wrap, call->ackr_sack_table, to - wrap); + } + } else if (before(wtop, window)) { + pr_warn("ack window backward %x %x", window, wtop); + } else if (ack->reason == RXRPC_ACK_DELAY) { + ack->reason = RXRPC_ACK_IDLE; + } + + qsize = (window - 1) - call->rx_consumed; + rsize = max_t(int, call->rx_winsize - qsize, 0); + + if_mtu = call->peer->if_mtu - call->peer->hdrsize; + if (call->peer->ackr_adv_pmtud) { + max_mtu = umax(call->peer->max_data, rxrpc_rx_mtu); + } else { + if_mtu = umin(if_mtu, 1444); + max_mtu = if_mtu; } + + trailer->maxMTU = htonl(max_mtu); + trailer->ifMTU = htonl(if_mtu); + trailer->rwind = htonl(rsize); + trailer->jumbo_max = 0; /* Advertise pmtu discovery */ + + if (ack_reason == RXRPC_ACK_PING) + rxrpc_begin_rtt_probe(call, *_ack_serial, now, rxrpc_rtt_tx_ping); + if (whdr->flags & RXRPC_REQUEST_ACK) + call->rtt_last_req = now; + rxrpc_set_keepalive(call, now); + return nr_kv; } /* * Transmit an ACK packet. */ -int rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) +static void rxrpc_send_ack_packet(struct rxrpc_call *call, int nr_kv, size_t len, + rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why) { + struct kvec *kv = call->local->kvec; + struct rxrpc_wire_header *whdr = kv[0].iov_base; + struct rxrpc_acktrailer *trailer = kv[2].iov_base + 3; struct rxrpc_connection *conn; + struct rxrpc_ackpacket *ack = (struct rxrpc_ackpacket *)(whdr + 1); struct msghdr msg; - struct kvec iov[1]; - rxrpc_serial_t serial; - size_t len, n; - int ret, rtt_slot = -1; - u16 rwind; + int ret; if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) - return -ECONNRESET; + return; conn = call->conn; @@ -203,55 +262,100 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) msg.msg_namelen = call->peer->srx.transport_len; msg.msg_control = NULL; msg.msg_controllen = 0; - msg.msg_flags = 0; + msg.msg_flags = MSG_SPLICE_PAGES; - if (txb->ack.reason == RXRPC_ACK_PING) - txb->wire.flags |= RXRPC_REQUEST_ACK; - - n = rxrpc_fill_out_ack(conn, call, txb, &rwind); - if (n == 0) - return 0; - - iov[0].iov_base = &txb->wire; - iov[0].iov_len = sizeof(txb->wire) + sizeof(txb->ack) + n; - len = iov[0].iov_len; - - serial = rxrpc_get_next_serial(conn); - txb->wire.serial = htonl(serial); trace_rxrpc_tx_ack(call->debug_id, serial, - ntohl(txb->ack.firstPacket), - ntohl(txb->ack.serial), txb->ack.reason, txb->ack.nAcks, - rwind); - - if (txb->ack.reason == RXRPC_ACK_PING) - rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_ping); + ntohl(ack->firstPacket), + ntohl(ack->serial), ack->reason, ack->nAcks, + ntohl(trailer->rwind), why); rxrpc_inc_stat(call->rxnet, stat_tx_ack_send); - /* Grab the highest received seq as late as possible */ - txb->ack.previousPacket = htonl(call->rx_highest_seq); + iov_iter_kvec(&msg.msg_iter, WRITE, kv, nr_kv, len); + rxrpc_local_dont_fragment(conn->local, why == rxrpc_propose_ack_ping_for_mtu_probe); - iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, len); ret = do_udp_sendmsg(conn->local->socket, &msg, len); call->peer->last_tx_at = ktime_get_seconds(); if (ret < 0) { trace_rxrpc_tx_fail(call->debug_id, serial, ret, rxrpc_tx_point_call_ack); + if (why == rxrpc_propose_ack_ping_for_mtu_probe && + ret == -EMSGSIZE) + rxrpc_input_probe_for_pmtud(conn, serial, true); } else { - trace_rxrpc_tx_packet(call->debug_id, &txb->wire, + trace_rxrpc_tx_packet(call->debug_id, whdr, rxrpc_tx_point_call_ack); - if (txb->wire.flags & RXRPC_REQUEST_ACK) - call->peer->rtt_last_req = ktime_get_real(); + if (why == rxrpc_propose_ack_ping_for_mtu_probe) { + call->peer->pmtud_pending = false; + call->peer->pmtud_probing = true; + call->conn->pmtud_probe = serial; + call->conn->pmtud_call = call->debug_id; + trace_rxrpc_pmtud_tx(call); + } } rxrpc_tx_backoff(call, ret); +} - if (!__rxrpc_call_is_complete(call)) { - if (ret < 0) - rxrpc_cancel_rtt_probe(call, serial, rtt_slot); - rxrpc_set_keepalive(call); +/* + * Queue an ACK for immediate transmission. + */ +void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, + rxrpc_serial_t serial_to_ack, enum rxrpc_propose_ack_trace why) +{ + struct kvec *kv = call->local->kvec; + rxrpc_serial_t ack_serial; + size_t len; + int nr_kv; + + if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) + return; + + rxrpc_inc_stat(call->rxnet, stat_tx_acks[ack_reason]); + + nr_kv = rxrpc_alloc_ack(call, call->ackr_wtop - call->ackr_window); + if (nr_kv < 0) { + kleave(" = -ENOMEM"); + return; } - return ret; + nr_kv = rxrpc_fill_out_ack(call, nr_kv, ack_reason, serial_to_ack, &ack_serial); + len = kv[0].iov_len; + len += kv[1].iov_len; + len += kv[2].iov_len; + + /* Extend a path MTU probe ACK. */ + if (why == rxrpc_propose_ack_ping_for_mtu_probe) { + size_t probe_mtu = call->peer->pmtud_trial + sizeof(struct rxrpc_wire_header); + + if (len > probe_mtu) + goto skip; + while (len < probe_mtu) { + size_t part = umin(probe_mtu - len, PAGE_SIZE); + + kv[nr_kv].iov_base = page_address(ZERO_PAGE(0)); + kv[nr_kv].iov_len = part; + len += part; + nr_kv++; + } + } + + call->ackr_nr_unacked = 0; + atomic_set(&call->ackr_nr_consumed, 0); + clear_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags); + + trace_rxrpc_send_ack(call, why, ack_reason, ack_serial); + rxrpc_send_ack_packet(call, nr_kv, len, ack_serial, why); +skip: + rxrpc_free_ack(call); +} + +/* + * Send an ACK probe for path MTU discovery. + */ +void rxrpc_send_probe_for_pmtud(struct rxrpc_call *call) +{ + rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, + rxrpc_propose_ack_ping_for_mtu_probe); } /* @@ -319,38 +423,38 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call) } /* - * send a packet through the transport endpoint + * Prepare a (sub)packet for transmission. */ -int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) +static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, + struct rxrpc_send_data_req *req, + struct rxrpc_txbuf *txb, + struct rxrpc_wire_header *whdr, + rxrpc_serial_t serial, int subpkt) { + struct rxrpc_jumbo_header *jumbo = txb->data - sizeof(*jumbo); enum rxrpc_req_ack_trace why; struct rxrpc_connection *conn = call->conn; - struct msghdr msg; - struct kvec iov[1]; - rxrpc_serial_t serial; - size_t len; - int ret, rtt_slot = -1; + struct kvec *kv = &call->local->kvec[1 + subpkt]; + size_t len = txb->pkt_len; + bool last; + u8 flags; - _enter("%x,{%d}", txb->seq, txb->len); + _enter("%x,%zd", txb->seq, len); - /* Each transmission of a Tx packet needs a new serial number */ - serial = rxrpc_get_next_serial(conn); - txb->wire.serial = htonl(serial); + txb->serial = serial; if (test_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags) && txb->seq == 1) - txb->wire.userStatus = RXRPC_USERSTATUS_SERVICE_UPGRADE; + whdr->userStatus = RXRPC_USERSTATUS_SERVICE_UPGRADE; - iov[0].iov_base = &txb->wire; - iov[0].iov_len = sizeof(txb->wire) + txb->len; - len = iov[0].iov_len; - iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, len); + txb->flags &= ~RXRPC_REQUEST_ACK; + flags = txb->flags & RXRPC_TXBUF_WIRE_FLAGS; + last = txb->flags & RXRPC_LAST_PACKET; - msg.msg_name = &call->peer->srx.transport; - msg.msg_namelen = call->peer->srx.transport_len; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_flags = 0; + if (subpkt < req->n - 1) { + len = RXRPC_JUMBO_DATALEN; + goto dont_set_request_ack; + } /* If our RTT cache needs working on, request an ACK. Also request * ACKs if a DATA packet appears to have been lost. @@ -359,63 +463,245 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) * service call, lest OpenAFS incorrectly send us an ACK with some * soft-ACKs in it and then never follow up with a proper hard ACK. */ - if (txb->wire.flags & RXRPC_REQUEST_ACK) - why = rxrpc_reqack_already_on; - else if (test_bit(RXRPC_TXBUF_LAST, &txb->flags) && rxrpc_sending_to_client(txb)) + if (last && rxrpc_sending_to_client(txb)) why = rxrpc_reqack_no_srv_last; else if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events)) why = rxrpc_reqack_ack_lost; - else if (test_bit(RXRPC_TXBUF_RESENT, &txb->flags)) + else if (txb->flags & RXRPC_TXBUF_RESENT) why = rxrpc_reqack_retrans; - else if (call->cong_mode == RXRPC_CALL_SLOW_START && call->cong_cwnd <= 2) + else if (call->cong_ca_state == RXRPC_CA_SLOW_START && call->cong_cwnd <= RXRPC_MIN_CWND) why = rxrpc_reqack_slow_start; else if (call->tx_winsize <= 2) why = rxrpc_reqack_small_txwin; - else if (call->peer->rtt_count < 3 && txb->seq & 1) + else if (call->rtt_count < 3) why = rxrpc_reqack_more_rtt; - else if (ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), ktime_get_real())) + else if (ktime_before(ktime_add_ms(call->rtt_last_req, 1000), ktime_get_real())) why = rxrpc_reqack_old_rtt; + else if (!last && !after(READ_ONCE(call->send_top), txb->seq)) + why = rxrpc_reqack_app_stall; else goto dont_set_request_ack; rxrpc_inc_stat(call->rxnet, stat_why_req_ack[why]); trace_rxrpc_req_ack(call->debug_id, txb->seq, why); - if (why != rxrpc_reqack_no_srv_last) - txb->wire.flags |= RXRPC_REQUEST_ACK; + if (why != rxrpc_reqack_no_srv_last) { + flags |= RXRPC_REQUEST_ACK; + trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_data, -1, serial); + call->rtt_last_req = req->now; + } dont_set_request_ack: - if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { - static int lose; - if ((lose++ & 7) == 7) { - ret = 0; - trace_rxrpc_tx_data(call, txb->seq, serial, - txb->wire.flags, - test_bit(RXRPC_TXBUF_RESENT, &txb->flags), - true); - goto done; + /* There's a jumbo header prepended to the data if we need it. */ + if (subpkt < req->n - 1) + flags |= RXRPC_JUMBO_PACKET; + else + flags &= ~RXRPC_JUMBO_PACKET; + if (subpkt == 0) { + whdr->flags = flags; + whdr->cksum = txb->cksum; + kv->iov_base = txb->data; + } else { + jumbo->flags = flags; + jumbo->pad = 0; + jumbo->cksum = txb->cksum; + kv->iov_base = jumbo; + len += sizeof(*jumbo); + } + + trace_rxrpc_tx_data(call, txb->seq, txb->serial, flags, req->trace); + kv->iov_len = len; + return len; +} + +/* + * Prepare a transmission queue object for initial transmission. Returns the + * number of microseconds since the transmission queue base timestamp. + */ +static unsigned int rxrpc_prepare_txqueue(struct rxrpc_txqueue *tq, + struct rxrpc_send_data_req *req) +{ + if (!tq) + return 0; + if (tq->xmit_ts_base == KTIME_MIN) { + tq->xmit_ts_base = req->now; + return 0; + } + return ktime_to_us(ktime_sub(req->now, tq->xmit_ts_base)); +} + +/* + * Prepare a (jumbo) packet for transmission. + */ +static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, + struct rxrpc_send_data_req *req, + struct rxrpc_wire_header *whdr) +{ + struct rxrpc_txqueue *tq = req->tq; + rxrpc_serial_t serial; + unsigned int xmit_ts; + rxrpc_seq_t seq = req->seq; + size_t len = 0; + bool start_tlp = false; + + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_transmit); + + /* Each transmission of a Tx packet needs a new serial number */ + serial = rxrpc_get_next_serials(call->conn, req->n); + + whdr->epoch = htonl(call->conn->proto.epoch); + whdr->cid = htonl(call->cid); + whdr->callNumber = htonl(call->call_id); + whdr->seq = htonl(seq); + whdr->serial = htonl(serial); + whdr->type = RXRPC_PACKET_TYPE_DATA; + whdr->flags = 0; + whdr->userStatus = 0; + whdr->securityIndex = call->security_ix; + whdr->_rsvd = 0; + whdr->serviceId = htons(call->conn->service_id); + + call->tx_last_serial = serial + req->n - 1; + call->tx_last_sent = req->now; + xmit_ts = rxrpc_prepare_txqueue(tq, req); + prefetch(tq->next); + + for (int i = 0;;) { + int ix = seq & RXRPC_TXQ_MASK; + struct rxrpc_txbuf *txb = tq->bufs[seq & RXRPC_TXQ_MASK]; + + _debug("prep[%u] tq=%x q=%x", i, tq->qbase, seq); + + /* Record (re-)transmission for RACK [RFC8985 6.1]. */ + if (__test_and_clear_bit(ix, &tq->segment_lost)) + call->tx_nr_lost--; + if (req->retrans) { + __set_bit(ix, &tq->ever_retransmitted); + __set_bit(ix, &tq->segment_retransmitted); + call->tx_nr_resent++; + } else { + call->tx_nr_sent++; + start_tlp = true; + } + tq->segment_xmit_ts[ix] = xmit_ts; + tq->segment_serial[ix] = serial; + if (i + 1 == req->n) + /* Only sample the last subpacket in a jumbo. */ + __set_bit(ix, &tq->rtt_samples); + len += rxrpc_prepare_data_subpacket(call, req, txb, whdr, serial, i); + serial++; + seq++; + i++; + if (i >= req->n) + break; + if (!(seq & RXRPC_TXQ_MASK)) { + tq = tq->next; + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_transmit_advance); + xmit_ts = rxrpc_prepare_txqueue(tq, req); } } - trace_rxrpc_tx_data(call, txb->seq, serial, txb->wire.flags, - test_bit(RXRPC_TXBUF_RESENT, &txb->flags), false); + /* Set timeouts */ + if (req->tlp_probe) { + /* Sending TLP loss probe [RFC8985 7.3]. */ + call->tlp_serial = serial - 1; + call->tlp_seq = seq - 1; + } else if (start_tlp) { + /* Schedule TLP loss probe [RFC8985 7.2]. */ + ktime_t pto; + + if (!test_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags)) + /* The first packet may take longer to elicit a response. */ + pto = NSEC_PER_SEC; + else + pto = rxrpc_tlp_calc_pto(call, req->now); + + call->rack_timer_mode = RXRPC_CALL_RACKTIMER_TLP_PTO; + call->rack_timo_at = ktime_add(req->now, pto); + trace_rxrpc_rack_timer(call, pto, false); + trace_rxrpc_timer_set(call, pto, rxrpc_timer_trace_rack_tlp_pto); + } + + if (!test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags)) { + ktime_t delay = ms_to_ktime(READ_ONCE(call->next_rx_timo)); + + call->expect_rx_by = ktime_add(req->now, delay); + trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_expect_rx); + } + + rxrpc_set_keepalive(call, req->now); + page_frag_free(whdr); + return len; +} + +/* + * Send one or more packets through the transport endpoint + */ +void rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req *req) +{ + struct rxrpc_wire_header *whdr; + struct rxrpc_connection *conn = call->conn; + enum rxrpc_tx_point frag; + struct rxrpc_txqueue *tq = req->tq; + struct rxrpc_txbuf *txb; + struct msghdr msg; + rxrpc_seq_t seq = req->seq; + size_t len = sizeof(*whdr); + bool new_call = test_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags); + int ret, stat_ix; + + _enter("%x,%x-%x", tq->qbase, seq, seq + req->n - 1); + + whdr = page_frag_alloc(&call->local->tx_alloc, sizeof(*whdr), GFP_NOFS); + if (!whdr) + return; /* Drop the packet if no memory. */ + + call->local->kvec[0].iov_base = whdr; + call->local->kvec[0].iov_len = sizeof(*whdr); + + stat_ix = umin(req->n, ARRAY_SIZE(call->rxnet->stat_tx_jumbo)) - 1; + atomic_inc(&call->rxnet->stat_tx_jumbo[stat_ix]); + + len += rxrpc_prepare_data_packet(call, req, whdr); + txb = tq->bufs[seq & RXRPC_TXQ_MASK]; + + iov_iter_kvec(&msg.msg_iter, WRITE, call->local->kvec, 1 + req->n, len); + + msg.msg_name = &call->peer->srx.transport; + msg.msg_namelen = call->peer->srx.transport_len; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = MSG_SPLICE_PAGES; + + /* Send the packet with the don't fragment bit set unless we think it's + * too big or if this is a retransmission. + */ + if (seq == call->tx_transmitted + 1 && + len >= sizeof(struct rxrpc_wire_header) + call->peer->max_data) { + rxrpc_local_dont_fragment(conn->local, false); + frag = rxrpc_tx_point_call_data_frag; + } else { + rxrpc_local_dont_fragment(conn->local, true); + frag = rxrpc_tx_point_call_data_nofrag; + } /* Track what we've attempted to transmit at least once so that the * retransmission algorithm doesn't try to resend what we haven't sent - * yet. However, this can race as we can receive an ACK before we get - * to this point. But, OTOH, if we won't get an ACK mentioning this - * packet unless the far side received it (though it could have - * discarded it anyway and NAK'd it). + * yet. */ - cmpxchg(&call->tx_transmitted, txb->seq - 1, txb->seq); + if (seq == call->tx_transmitted + 1) + call->tx_transmitted = seq + req->n - 1; - /* send the packet with the don't fragment bit set if we currently - * think it's small enough */ - if (txb->len >= call->peer->maxdata) - goto send_fragmentable; + if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { + static int lose; - txb->last_sent = ktime_get_real(); - if (txb->wire.flags & RXRPC_REQUEST_ACK) - rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_data); + if ((lose++ & 7) == 7) { + ret = 0; + trace_rxrpc_tx_data(call, txb->seq, txb->serial, txb->flags, + rxrpc_txdata_inject_loss); + conn->peer->last_tx_at = ktime_get_seconds(); + goto done; + } + } /* send the packet by UDP * - returns -EMSGSIZE if UDP would have to fragment the packet @@ -427,96 +713,35 @@ dont_set_request_ack: ret = do_udp_sendmsg(conn->local->socket, &msg, len); conn->peer->last_tx_at = ktime_get_seconds(); - if (ret < 0) { + if (ret == -EMSGSIZE) { + rxrpc_inc_stat(call->rxnet, stat_tx_data_send_msgsize); + trace_rxrpc_tx_packet(call->debug_id, whdr, frag); + ret = 0; + } else if (ret < 0) { rxrpc_inc_stat(call->rxnet, stat_tx_data_send_fail); - rxrpc_cancel_rtt_probe(call, serial, rtt_slot); - trace_rxrpc_tx_fail(call->debug_id, serial, ret, - rxrpc_tx_point_call_data_nofrag); + trace_rxrpc_tx_fail(call->debug_id, txb->serial, ret, frag); } else { - trace_rxrpc_tx_packet(call->debug_id, &txb->wire, - rxrpc_tx_point_call_data_nofrag); + trace_rxrpc_tx_packet(call->debug_id, whdr, frag); } rxrpc_tx_backoff(call, ret); - if (ret == -EMSGSIZE) - goto send_fragmentable; - -done: - if (ret >= 0) { - call->tx_last_sent = txb->last_sent; - if (txb->wire.flags & RXRPC_REQUEST_ACK) { - call->peer->rtt_last_req = txb->last_sent; - if (call->peer->rtt_count > 1) { - unsigned long nowj = jiffies, ack_lost_at; - - ack_lost_at = rxrpc_get_rto_backoff(call->peer, false); - ack_lost_at += nowj; - WRITE_ONCE(call->ack_lost_at, ack_lost_at); - rxrpc_reduce_call_timer(call, ack_lost_at, nowj, - rxrpc_timer_set_for_lost_ack); - } - } - - if (txb->seq == 1 && - !test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER, - &call->flags)) { - unsigned long nowj = jiffies, expect_rx_by; - - expect_rx_by = nowj + call->next_rx_timo; - WRITE_ONCE(call->expect_rx_by, expect_rx_by); - rxrpc_reduce_call_timer(call, expect_rx_by, nowj, - rxrpc_timer_set_for_normal); - } - rxrpc_set_keepalive(call); - } else { - /* Cancel the call if the initial transmission fails, - * particularly if that's due to network routing issues that - * aren't going away anytime soon. The layer above can arrange - * the retransmission. + if (ret < 0) { + /* Cancel the call if the initial transmission fails or if we + * hit due to network routing issues that aren't going away + * anytime soon. The layer above can arrange the + * retransmission. */ - if (!test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags)) + if (new_call || + ret == -ENETUNREACH || + ret == -EHOSTUNREACH || + ret == -ECONNREFUSED) rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, RX_USER_ABORT, ret); } - _leave(" = %d [%u]", ret, call->peer->maxdata); - return ret; - -send_fragmentable: - /* attempt to send this message with fragmentation enabled */ - _debug("send fragment"); - - txb->last_sent = ktime_get_real(); - if (txb->wire.flags & RXRPC_REQUEST_ACK) - rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_data); - - switch (conn->local->srx.transport.family) { - case AF_INET6: - case AF_INET: - rxrpc_local_dont_fragment(conn->local, false); - rxrpc_inc_stat(call->rxnet, stat_tx_data_send_frag); - ret = do_udp_sendmsg(conn->local->socket, &msg, len); - conn->peer->last_tx_at = ktime_get_seconds(); - - rxrpc_local_dont_fragment(conn->local, true); - break; - - default: - BUG(); - } - - if (ret < 0) { - rxrpc_inc_stat(call->rxnet, stat_tx_data_send_fail); - rxrpc_cancel_rtt_probe(call, serial, rtt_slot); - trace_rxrpc_tx_fail(call->debug_id, serial, ret, - rxrpc_tx_point_call_data_frag); - } else { - trace_rxrpc_tx_packet(call->debug_id, &txb->wire, - rxrpc_tx_point_call_data_frag); - } - rxrpc_tx_backoff(call, ret); - goto done; +done: + _leave(" = %d [%u]", ret, call->peer->max_data); } /* @@ -691,43 +916,3 @@ void rxrpc_send_keepalive(struct rxrpc_peer *peer) peer->last_tx_at = ktime_get_seconds(); _leave(""); } - -/* - * Schedule an instant Tx resend. - */ -static inline void rxrpc_instant_resend(struct rxrpc_call *call, - struct rxrpc_txbuf *txb) -{ - if (!__rxrpc_call_is_complete(call)) - kdebug("resend"); -} - -/* - * Transmit one packet. - */ -void rxrpc_transmit_one(struct rxrpc_call *call, struct rxrpc_txbuf *txb) -{ - int ret; - - ret = rxrpc_send_data_packet(call, txb); - if (ret < 0) { - switch (ret) { - case -ENETUNREACH: - case -EHOSTUNREACH: - case -ECONNREFUSED: - rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, - 0, ret); - break; - default: - _debug("need instant resend %d", ret); - rxrpc_instant_resend(call, txb); - } - } else { - unsigned long now = jiffies; - unsigned long resend_at = now + call->peer->rto_j; - - WRITE_ONCE(call->resend_at, resend_at); - rxrpc_reduce_call_timer(call, resend_at, now, - rxrpc_timer_set_for_send); - } -} diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 552ba84a255c..7f4729234957 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -102,6 +102,8 @@ static struct rxrpc_peer *rxrpc_lookup_peer_local_rcu(struct rxrpc_local *local, */ static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu) { + unsigned int max_data; + /* wind down the local interface MTU */ if (mtu > 0 && peer->if_mtu == 65535 && mtu < peer->if_mtu) peer->if_mtu = mtu; @@ -120,11 +122,15 @@ static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu) } } - if (mtu < peer->mtu) { - spin_lock(&peer->lock); - peer->mtu = mtu; - peer->maxdata = peer->mtu - peer->hdrsize; - spin_unlock(&peer->lock); + max_data = max_t(int, mtu - peer->hdrsize, 500); + if (max_data < peer->max_data) { + if (peer->pmtud_good > max_data) + peer->pmtud_good = max_data; + if (peer->pmtud_bad > max_data + 1) + peer->pmtud_bad = max_data + 1; + + trace_rxrpc_pmtud_reduce(peer, 0, max_data, rxrpc_pmtud_reduce_icmp); + peer->max_data = max_data; } } @@ -161,6 +167,13 @@ void rxrpc_input_error(struct rxrpc_local *local, struct sk_buff *skb) goto out; } + if ((serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6 && + serr->ee.ee_type == ICMPV6_PKT_TOOBIG && + serr->ee.ee_code == 0)) { + rxrpc_adjust_mtu(peer, serr->ee.ee_info); + goto out; + } + rxrpc_store_error(peer, skb); out: rxrpc_put_peer(peer, rxrpc_peer_put_input_error); @@ -205,23 +218,23 @@ static void rxrpc_distribute_error(struct rxrpc_peer *peer, struct sk_buff *skb, struct rxrpc_call *call; HLIST_HEAD(error_targets); - spin_lock(&peer->lock); + spin_lock_irq(&peer->lock); hlist_move_list(&peer->error_targets, &error_targets); while (!hlist_empty(&error_targets)) { call = hlist_entry(error_targets.first, struct rxrpc_call, error_link); hlist_del_init(&call->error_link); - spin_unlock(&peer->lock); + spin_unlock_irq(&peer->lock); rxrpc_see_call(call, rxrpc_call_see_distribute_error); rxrpc_set_call_completion(call, compl, 0, -err); - rxrpc_input_call_event(call, skb); + rxrpc_input_call_event(call); - spin_lock(&peer->lock); + spin_lock_irq(&peer->lock); } - spin_unlock(&peer->lock); + spin_unlock_irq(&peer->lock); } /* @@ -238,7 +251,7 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, bool use; int slot; - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); while (!list_empty(collector)) { peer = list_entry(collector->next, @@ -249,7 +262,7 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, continue; use = __rxrpc_use_local(peer->local, rxrpc_local_use_peer_keepalive); - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); if (use) { keepalive_at = peer->last_tx_at + RXRPC_KEEPALIVE_TIME; @@ -269,17 +282,17 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, */ slot += cursor; slot &= mask; - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); list_add_tail(&peer->keepalive_link, &rxnet->peer_keepalive[slot & mask]); - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); rxrpc_unuse_local(peer->local, rxrpc_local_unuse_peer_keepalive); } rxrpc_put_peer(peer, rxrpc_peer_put_keepalive); - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); } - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); } /* @@ -309,7 +322,7 @@ void rxrpc_peer_keepalive_worker(struct work_struct *work) * second; the bucket at cursor + 1 goes at now + 1s and so * on... */ - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); list_splice_init(&rxnet->peer_keepalive_new, &collector); stop = cursor + ARRAY_SIZE(rxnet->peer_keepalive); @@ -321,7 +334,7 @@ void rxrpc_peer_keepalive_worker(struct work_struct *work) } base = now; - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); rxnet->peer_keepalive_base = base; rxnet->peer_keepalive_cursor = cursor; @@ -347,3 +360,84 @@ void rxrpc_peer_keepalive_worker(struct work_struct *work) _leave(""); } + +/* + * Do path MTU probing. + */ +void rxrpc_input_probe_for_pmtud(struct rxrpc_connection *conn, rxrpc_serial_t acked_serial, + bool sendmsg_fail) +{ + struct rxrpc_peer *peer = conn->peer; + unsigned int max_data = peer->max_data; + int good, trial, bad, jumbo; + + good = peer->pmtud_good; + trial = peer->pmtud_trial; + bad = peer->pmtud_bad; + if (good >= bad - 1) { + conn->pmtud_probe = 0; + peer->pmtud_lost = false; + return; + } + + if (!peer->pmtud_probing) + goto send_probe; + + if (sendmsg_fail || after(acked_serial, conn->pmtud_probe)) { + /* Retry a lost probe. */ + if (!peer->pmtud_lost) { + trace_rxrpc_pmtud_lost(conn, acked_serial); + conn->pmtud_probe = 0; + peer->pmtud_lost = true; + goto send_probe; + } + + /* The probed size didn't seem to get through. */ + bad = trial; + peer->pmtud_bad = bad; + if (bad <= max_data) + max_data = bad - 1; + } else { + /* It did get through. */ + good = trial; + peer->pmtud_good = good; + if (good > max_data) + max_data = good; + } + + max_data = umin(max_data, peer->ackr_max_data); + if (max_data != peer->max_data) + peer->max_data = max_data; + + jumbo = max_data + sizeof(struct rxrpc_jumbo_header); + jumbo /= RXRPC_JUMBO_SUBPKTLEN; + peer->pmtud_jumbo = jumbo; + + trace_rxrpc_pmtud_rx(conn, acked_serial); + conn->pmtud_probe = 0; + peer->pmtud_lost = false; + + if (good < RXRPC_JUMBO(2) && bad > RXRPC_JUMBO(2)) + trial = RXRPC_JUMBO(2); + else if (good < RXRPC_JUMBO(4) && bad > RXRPC_JUMBO(4)) + trial = RXRPC_JUMBO(4); + else if (good < RXRPC_JUMBO(3) && bad > RXRPC_JUMBO(3)) + trial = RXRPC_JUMBO(3); + else if (good < RXRPC_JUMBO(6) && bad > RXRPC_JUMBO(6)) + trial = RXRPC_JUMBO(6); + else if (good < RXRPC_JUMBO(5) && bad > RXRPC_JUMBO(5)) + trial = RXRPC_JUMBO(5); + else if (good < RXRPC_JUMBO(8) && bad > RXRPC_JUMBO(8)) + trial = RXRPC_JUMBO(8); + else if (good < RXRPC_JUMBO(7) && bad > RXRPC_JUMBO(7)) + trial = RXRPC_JUMBO(7); + else + trial = (good + bad) / 2; + peer->pmtud_trial = trial; + + if (good >= bad) + return; + +send_probe: + peer->pmtud_pending = true; +} diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 49dcda67a0d5..56e09d161a97 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -162,6 +162,11 @@ static void rxrpc_assess_MTU_size(struct rxrpc_local *local, #endif peer->if_mtu = 1500; + if (peer->max_data < peer->if_mtu - peer->hdrsize) { + trace_rxrpc_pmtud_reduce(peer, 0, peer->if_mtu - peer->hdrsize, + rxrpc_pmtud_reduce_route); + peer->max_data = peer->if_mtu - peer->hdrsize; + } memset(&fl, 0, sizeof(fl)); switch (peer->srx.transport.family) { @@ -199,8 +204,16 @@ static void rxrpc_assess_MTU_size(struct rxrpc_local *local, } peer->if_mtu = dst_mtu(dst); + peer->hdrsize += dst->header_len + dst->trailer_len; + peer->tx_seg_max = dst->dev->gso_max_segs; dst_release(dst); + peer->max_data = umin(RXRPC_JUMBO(1), peer->if_mtu - peer->hdrsize); + peer->pmtud_good = 500; + peer->pmtud_bad = peer->if_mtu - peer->hdrsize + 1; + peer->pmtud_trial = umin(peer->max_data, peer->pmtud_bad - 1); + peer->pmtud_pending = true; + _leave(" [if_mtu %u]", peer->if_mtu); } @@ -222,11 +235,8 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp, peer->service_conns = RB_ROOT; seqlock_init(&peer->service_conn_lock); spin_lock_init(&peer->lock); - spin_lock_init(&peer->rtt_input_lock); peer->debug_id = atomic_inc_return(&rxrpc_debug_id); - - rxrpc_peer_init_rtt(peer); - + peer->recent_srtt_us = UINT_MAX; peer->cong_ssthresh = RXRPC_TX_MAX_WINDOW; trace_rxrpc_peer(peer->debug_id, 1, why); } @@ -242,9 +252,7 @@ static void rxrpc_init_peer(struct rxrpc_local *local, struct rxrpc_peer *peer, unsigned long hash_key) { peer->hash_key = hash_key; - rxrpc_assess_MTU_size(local, peer); - peer->mtu = peer->if_mtu; - peer->rtt_last_req = ktime_get_real(); + switch (peer->srx.transport.family) { case AF_INET: @@ -268,7 +276,9 @@ static void rxrpc_init_peer(struct rxrpc_local *local, struct rxrpc_peer *peer, } peer->hdrsize += sizeof(struct rxrpc_wire_header); - peer->maxdata = peer->mtu - peer->hdrsize; + peer->max_data = peer->if_mtu - peer->hdrsize; + + rxrpc_assess_MTU_size(local, peer); } /* @@ -304,6 +314,7 @@ static void rxrpc_free_peer(struct rxrpc_peer *peer) * Set up a new incoming peer. There shouldn't be any other matching peers * since we've already done a search in the list from the non-reentrant context * (the data_ready handler) that is the only place we can add new peers. + * Called with interrupts disabled. */ void rxrpc_new_incoming_peer(struct rxrpc_local *local, struct rxrpc_peer *peer) { @@ -348,7 +359,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, return NULL; } - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); /* Need to check that we aren't racing with someone else */ peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); @@ -361,7 +372,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, &rxnet->peer_keepalive_new); } - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); if (peer) rxrpc_free_peer(candidate); @@ -411,10 +422,10 @@ static void __rxrpc_put_peer(struct rxrpc_peer *peer) ASSERT(hlist_empty(&peer->error_targets)); - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); hash_del_rcu(&peer->hash_link); list_del_init(&peer->keepalive_link); - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); rxrpc_free_peer(peer); } @@ -479,7 +490,7 @@ EXPORT_SYMBOL(rxrpc_kernel_get_call_peer); */ unsigned int rxrpc_kernel_get_srtt(const struct rxrpc_peer *peer) { - return peer->rtt_count > 0 ? peer->srtt_us >> 3 : UINT_MAX; + return READ_ONCE(peer->recent_srtt_us); } EXPORT_SYMBOL(rxrpc_kernel_get_srtt); diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 26dc2f26d92d..d803562ca0ac 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -52,9 +52,9 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) struct rxrpc_call *call; struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); enum rxrpc_call_state state; - unsigned long timeout = 0; - rxrpc_seq_t acks_hard_ack; + rxrpc_seq_t tx_bottom; char lbuff[50], rbuff[50]; + long timeout = 0; if (v == &rxnet->calls) { seq_puts(seq, @@ -76,12 +76,10 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) sprintf(rbuff, "%pISpc", &call->dest_srx.transport); state = rxrpc_call_state(call); - if (state != RXRPC_CALL_SERVER_PREALLOC) { - timeout = READ_ONCE(call->expect_rx_by); - timeout -= jiffies; - } + if (state != RXRPC_CALL_SERVER_PREALLOC) + timeout = ktime_ms_delta(READ_ONCE(call->expect_rx_by), ktime_get_real()); - acks_hard_ack = READ_ONCE(call->acks_hard_ack); + tx_bottom = READ_ONCE(call->tx_bottom); seq_printf(seq, "UDP %-47.47s %-47.47s %4x %08x %08x %s %3u" " %-8.8s %08x %08x %08x %02x %08x %02x %08x %02x %06lx\n", @@ -95,7 +93,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) rxrpc_call_states[state], call->abort_code, call->debug_id, - acks_hard_ack, READ_ONCE(call->tx_top) - acks_hard_ack, + tx_bottom, READ_ONCE(call->tx_top) - tx_bottom, call->ackr_window, call->ackr_wtop - call->ackr_window, call->rx_serial, call->cong_cwnd, @@ -285,9 +283,7 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v) if (v == SEQ_START_TOKEN) { seq_puts(seq, - "Proto Local " - " Remote " - " Use SST MTU LastUse RTT RTO\n" + "Proto Local Remote Use SST Maxd LastUse RTT RTO\n" ); return 0; } @@ -300,16 +296,15 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v) now = ktime_get_seconds(); seq_printf(seq, - "UDP %-47.47s %-47.47s %3u" - " %3u %5u %6llus %8u %8u\n", + "UDP %-47.47s %-47.47s %3u %4u %5u %6llus %8d %8d\n", lbuff, rbuff, refcount_read(&peer->ref), peer->cong_ssthresh, - peer->mtu, + peer->max_data, now - peer->last_tx_at, - peer->srtt_us >> 3, - jiffies_to_usecs(peer->rto_j)); + READ_ONCE(peer->recent_srtt_us), + READ_ONCE(peer->recent_rto_us)); return 0; } @@ -478,10 +473,11 @@ int rxrpc_stats_show(struct seq_file *seq, void *v) struct rxrpc_net *rxnet = rxrpc_net(seq_file_single_net(seq)); seq_printf(seq, - "Data : send=%u sendf=%u fail=%u\n", + "Data : send=%u sendf=%u fail=%u emsz=%u\n", atomic_read(&rxnet->stat_tx_data_send), atomic_read(&rxnet->stat_tx_data_send_frag), - atomic_read(&rxnet->stat_tx_data_send_fail)); + atomic_read(&rxnet->stat_tx_data_send_fail), + atomic_read(&rxnet->stat_tx_data_send_msgsize)); seq_printf(seq, "Data-Tx : nr=%u retrans=%u uf=%u cwr=%u\n", atomic_read(&rxnet->stat_tx_data), @@ -510,7 +506,7 @@ int rxrpc_stats_show(struct seq_file *seq, void *v) atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_DELAY]), atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_IDLE])); seq_printf(seq, - "Ack-Rx : req=%u dup=%u oos=%u exw=%u nos=%u png=%u prs=%u dly=%u idl=%u\n", + "Ack-Rx : req=%u dup=%u oos=%u exw=%u nos=%u png=%u prs=%u dly=%u idl=%u z=%u\n", atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_REQUESTED]), atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_DUPLICATE]), atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_OUT_OF_SEQUENCE]), @@ -519,13 +515,14 @@ int rxrpc_stats_show(struct seq_file *seq, void *v) atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_PING]), atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_PING_RESPONSE]), atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_DELAY]), - atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_IDLE])); + atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_IDLE]), + atomic_read(&rxnet->stat_rx_acks[0])); seq_printf(seq, - "Why-Req-A: acklost=%u already=%u mrtt=%u ortt=%u\n", + "Why-Req-A: acklost=%u mrtt=%u ortt=%u stall=%u\n", atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_ack_lost]), - atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_already_on]), atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_more_rtt]), - atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_old_rtt])); + atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_old_rtt]), + atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_app_stall])); seq_printf(seq, "Why-Req-A: nolast=%u retx=%u slows=%u smtxw=%u\n", atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_no_srv_last]), @@ -533,6 +530,30 @@ int rxrpc_stats_show(struct seq_file *seq, void *v) atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_slow_start]), atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_small_txwin])); seq_printf(seq, + "Jumbo-Tx : %u,%u,%u,%u,%u,%u,%u,%u,%u,%u\n", + atomic_read(&rxnet->stat_tx_jumbo[0]), + atomic_read(&rxnet->stat_tx_jumbo[1]), + atomic_read(&rxnet->stat_tx_jumbo[2]), + atomic_read(&rxnet->stat_tx_jumbo[3]), + atomic_read(&rxnet->stat_tx_jumbo[4]), + atomic_read(&rxnet->stat_tx_jumbo[5]), + atomic_read(&rxnet->stat_tx_jumbo[6]), + atomic_read(&rxnet->stat_tx_jumbo[7]), + atomic_read(&rxnet->stat_tx_jumbo[8]), + atomic_read(&rxnet->stat_tx_jumbo[9])); + seq_printf(seq, + "Jumbo-Rx : %u,%u,%u,%u,%u,%u,%u,%u,%u,%u\n", + atomic_read(&rxnet->stat_rx_jumbo[0]), + atomic_read(&rxnet->stat_rx_jumbo[1]), + atomic_read(&rxnet->stat_rx_jumbo[2]), + atomic_read(&rxnet->stat_rx_jumbo[3]), + atomic_read(&rxnet->stat_rx_jumbo[4]), + atomic_read(&rxnet->stat_rx_jumbo[5]), + atomic_read(&rxnet->stat_rx_jumbo[6]), + atomic_read(&rxnet->stat_rx_jumbo[7]), + atomic_read(&rxnet->stat_rx_jumbo[8]), + atomic_read(&rxnet->stat_rx_jumbo[9])); + seq_printf(seq, "Buffers : txb=%u rxb=%u\n", atomic_read(&rxrpc_nr_txbuf), atomic_read(&rxrpc_n_rx_skbs)); @@ -569,6 +590,8 @@ int rxrpc_stats_clear(struct file *file, char *buf, size_t size) atomic_set(&rxnet->stat_tx_ack_skip, 0); memset(&rxnet->stat_tx_acks, 0, sizeof(rxnet->stat_tx_acks)); memset(&rxnet->stat_rx_acks, 0, sizeof(rxnet->stat_rx_acks)); + memset(&rxnet->stat_tx_jumbo, 0, sizeof(rxnet->stat_tx_jumbo)); + memset(&rxnet->stat_rx_jumbo, 0, sizeof(rxnet->stat_rx_jumbo)); memset(&rxnet->stat_why_req_ack, 0, sizeof(rxnet->stat_why_req_ack)); diff --git a/net/rxrpc/protocol.h b/net/rxrpc/protocol.h index e8ee4af43ca8..42f70e4636f8 100644 --- a/net/rxrpc/protocol.h +++ b/net/rxrpc/protocol.h @@ -92,11 +92,16 @@ struct rxrpc_jumbo_header { /* * The maximum number of subpackets that can possibly fit in a UDP packet is: * - * ((max_IP - IP_hdr - UDP_hdr) / RXRPC_JUMBO_SUBPKTLEN) + 1 - * = ((65535 - 28 - 28) / 1416) + 1 - * = 46 non-terminal packets and 1 terminal packet. + * (max_UDP - wirehdr + jumbohdr) / (jumbohdr + 1412) + * = ((65535 - 28 + 4) / 1416) + * = 45 non-terminal packets and 1 terminal packet. */ -#define RXRPC_MAX_NR_JUMBO 47 +#define RXRPC_MAX_NR_JUMBO 46 + +/* Size of a jumbo packet with N subpackets, excluding UDP+IP */ +#define RXRPC_JUMBO(N) ((int)sizeof(struct rxrpc_wire_header) + \ + RXRPC_JUMBO_DATALEN + \ + ((N) - 1) * RXRPC_JUMBO_SUBPKTLEN) /*****************************************************************************/ /* @@ -135,9 +140,9 @@ struct rxrpc_ackpacket { /* * ACK packets can have a further piece of information tagged on the end */ -struct rxrpc_ackinfo { - __be32 rxMTU; /* maximum Rx MTU size (bytes) [AFS 3.3] */ - __be32 maxMTU; /* maximum interface MTU size (bytes) [AFS 3.3] */ +struct rxrpc_acktrailer { + __be32 maxMTU; /* maximum Rx MTU size (bytes) [AFS 3.3] */ + __be32 ifMTU; /* maximum interface MTU size (bytes) [AFS 3.3] */ __be32 rwind; /* Rx window size (packets) [AFS 3.4] */ __be32 jumbo_max; /* max packets to stick into a jumbo packet [AFS 3.5] */ }; diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index a482f88c5fc5..32cd5f1d541d 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -36,16 +36,16 @@ void rxrpc_notify_socket(struct rxrpc_call *call) sk = &rx->sk; if (rx && sk->sk_state < RXRPC_CLOSE) { if (call->notify_rx) { - spin_lock(&call->notify_lock); + spin_lock_irq(&call->notify_lock); call->notify_rx(sk, call, call->user_call_ID); - spin_unlock(&call->notify_lock); + spin_unlock_irq(&call->notify_lock); } else { - spin_lock(&rx->recvmsg_lock); + spin_lock_irq(&rx->recvmsg_lock); if (list_empty(&call->recvmsg_link)) { rxrpc_get_call(call, rxrpc_call_get_notify_socket); list_add_tail(&call->recvmsg_link, &rx->recvmsg_q); } - spin_unlock(&rx->recvmsg_lock); + spin_unlock_irq(&rx->recvmsg_lock); if (!sock_flag(sk, SOCK_DEAD)) { _debug("call %ps", sk->sk_data_ready); @@ -337,14 +337,14 @@ try_again: * We also want to weed out calls that got requeued whilst we were * shovelling data out. */ - spin_lock(&rx->recvmsg_lock); + spin_lock_irq(&rx->recvmsg_lock); l = rx->recvmsg_q.next; call = list_entry(l, struct rxrpc_call, recvmsg_link); if (!rxrpc_call_is_complete(call) && skb_queue_empty(&call->recvmsg_queue)) { list_del_init(&call->recvmsg_link); - spin_unlock(&rx->recvmsg_lock); + spin_unlock_irq(&rx->recvmsg_lock); release_sock(&rx->sk); trace_rxrpc_recvmsg(call->debug_id, rxrpc_recvmsg_unqueue, 0); rxrpc_put_call(call, rxrpc_call_put_recvmsg); @@ -355,7 +355,7 @@ try_again: list_del_init(&call->recvmsg_link); else rxrpc_get_call(call, rxrpc_call_get_recvmsg); - spin_unlock(&rx->recvmsg_lock); + spin_unlock_irq(&rx->recvmsg_lock); call_debug_id = call->debug_id; trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_dequeue, 0); @@ -445,9 +445,9 @@ error_unlock_call: error_requeue_call: if (!(flags & MSG_PEEK)) { - spin_lock(&rx->recvmsg_lock); + spin_lock_irq(&rx->recvmsg_lock); list_add(&call->recvmsg_link, &rx->recvmsg_q); - spin_unlock(&rx->recvmsg_lock); + spin_unlock_irq(&rx->recvmsg_lock); trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_requeue, 0); } else { rxrpc_put_call(call, rxrpc_call_put_recvmsg); diff --git a/net/rxrpc/rtt.c b/net/rxrpc/rtt.c index be61d6f5be8d..7474f88d7b18 100644 --- a/net/rxrpc/rtt.c +++ b/net/rxrpc/rtt.c @@ -11,23 +11,23 @@ #include <linux/net.h> #include "ar-internal.h" -#define RXRPC_RTO_MAX ((unsigned)(120 * HZ)) -#define RXRPC_TIMEOUT_INIT ((unsigned)(1*HZ)) /* RFC6298 2.1 initial RTO value */ +#define RXRPC_RTO_MAX (120 * USEC_PER_SEC) +#define RXRPC_TIMEOUT_INIT ((unsigned int)(1 * USEC_PER_SEC)) /* RFC6298 2.1 initial RTO value */ #define rxrpc_jiffies32 ((u32)jiffies) /* As rxrpc_jiffies32 */ -static u32 rxrpc_rto_min_us(struct rxrpc_peer *peer) +static u32 rxrpc_rto_min_us(struct rxrpc_call *call) { return 200; } -static u32 __rxrpc_set_rto(const struct rxrpc_peer *peer) +static u32 __rxrpc_set_rto(const struct rxrpc_call *call) { - return usecs_to_jiffies((peer->srtt_us >> 3) + peer->rttvar_us); + return (call->srtt_us >> 3) + call->rttvar_us; } static u32 rxrpc_bound_rto(u32 rto) { - return min(rto, RXRPC_RTO_MAX); + return clamp(200000, rto + 100000, RXRPC_RTO_MAX); } /* @@ -40,10 +40,10 @@ static u32 rxrpc_bound_rto(u32 rto) * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ -static void rxrpc_rtt_estimator(struct rxrpc_peer *peer, long sample_rtt_us) +static void rxrpc_rtt_estimator(struct rxrpc_call *call, long sample_rtt_us) { long m = sample_rtt_us; /* RTT */ - u32 srtt = peer->srtt_us; + u32 srtt = call->srtt_us; /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev @@ -66,7 +66,7 @@ static void rxrpc_rtt_estimator(struct rxrpc_peer *peer, long sample_rtt_us) srtt += m; /* rtt = 7/8 rtt + 1/8 new */ if (m < 0) { m = -m; /* m is now abs(error) */ - m -= (peer->mdev_us >> 2); /* similar update on mdev */ + m -= (call->mdev_us >> 2); /* similar update on mdev */ /* This is similar to one of Eifel findings. * Eifel blocks mdev updates when rtt decreases. * This solution is a bit different: we use finer gain @@ -78,31 +78,31 @@ static void rxrpc_rtt_estimator(struct rxrpc_peer *peer, long sample_rtt_us) if (m > 0) m >>= 3; } else { - m -= (peer->mdev_us >> 2); /* similar update on mdev */ + m -= (call->mdev_us >> 2); /* similar update on mdev */ } - peer->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */ - if (peer->mdev_us > peer->mdev_max_us) { - peer->mdev_max_us = peer->mdev_us; - if (peer->mdev_max_us > peer->rttvar_us) - peer->rttvar_us = peer->mdev_max_us; + call->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */ + if (call->mdev_us > call->mdev_max_us) { + call->mdev_max_us = call->mdev_us; + if (call->mdev_max_us > call->rttvar_us) + call->rttvar_us = call->mdev_max_us; } } else { /* no previous measure. */ srtt = m << 3; /* take the measured time to be rtt */ - peer->mdev_us = m << 1; /* make sure rto = 3*rtt */ - peer->rttvar_us = max(peer->mdev_us, rxrpc_rto_min_us(peer)); - peer->mdev_max_us = peer->rttvar_us; + call->mdev_us = m << 1; /* make sure rto = 3*rtt */ + call->rttvar_us = umax(call->mdev_us, rxrpc_rto_min_us(call)); + call->mdev_max_us = call->rttvar_us; } - peer->srtt_us = max(1U, srtt); + call->srtt_us = umax(srtt, 1); } /* * Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ -static void rxrpc_set_rto(struct rxrpc_peer *peer) +static void rxrpc_set_rto(struct rxrpc_call *call) { u32 rto; @@ -113,7 +113,7 @@ static void rxrpc_set_rto(struct rxrpc_peer *peer) * is invisible. Actually, Linux-2.4 also generates erratic * ACKs in some circumstances. */ - rto = __rxrpc_set_rto(peer); + rto = __rxrpc_set_rto(call); /* 2. Fixups made earlier cannot be right. * If we do not estimate RTO correctly without them, @@ -124,72 +124,85 @@ static void rxrpc_set_rto(struct rxrpc_peer *peer) /* NOTE: clamping at RXRPC_RTO_MIN is not required, current algo * guarantees that rto is higher. */ - peer->rto_j = rxrpc_bound_rto(rto); + call->rto_us = rxrpc_bound_rto(rto); } -static void rxrpc_ack_update_rtt(struct rxrpc_peer *peer, long rtt_us) +static void rxrpc_update_rtt_min(struct rxrpc_call *call, ktime_t resp_time, long rtt_us) +{ + /* Window size 5mins in approx usec (ipv4.sysctl_tcp_min_rtt_wlen) */ + u32 wlen_us = 5ULL * NSEC_PER_SEC / 1024; + + minmax_running_min(&call->min_rtt, wlen_us, resp_time / 1024, + (u32)rtt_us ? : jiffies_to_usecs(1)); +} + +static void rxrpc_ack_update_rtt(struct rxrpc_call *call, ktime_t resp_time, long rtt_us) { if (rtt_us < 0) return; - //rxrpc_update_rtt_min(peer, rtt_us); - rxrpc_rtt_estimator(peer, rtt_us); - rxrpc_set_rto(peer); + /* Update RACK min RTT [RFC8985 6.1 Step 1]. */ + rxrpc_update_rtt_min(call, resp_time, rtt_us); + + rxrpc_rtt_estimator(call, rtt_us); + rxrpc_set_rto(call); - /* RFC6298: only reset backoff on valid RTT measurement. */ - peer->backoff = 0; + /* Only reset backoff on valid RTT measurement [RFC6298]. */ + call->backoff = 0; } /* * Add RTT information to cache. This is called in softirq mode and has - * exclusive access to the peer RTT data. + * exclusive access to the call RTT data. */ -void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, +void rxrpc_call_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, int rtt_slot, rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, ktime_t send_time, ktime_t resp_time) { - struct rxrpc_peer *peer = call->peer; s64 rtt_us; rtt_us = ktime_to_us(ktime_sub(resp_time, send_time)); if (rtt_us < 0) return; - spin_lock(&peer->rtt_input_lock); - rxrpc_ack_update_rtt(peer, rtt_us); - if (peer->rtt_count < 3) - peer->rtt_count++; - spin_unlock(&peer->rtt_input_lock); + rxrpc_ack_update_rtt(call, resp_time, rtt_us); + if (call->rtt_count < 3) + call->rtt_count++; + call->rtt_taken++; + + WRITE_ONCE(call->peer->recent_srtt_us, call->srtt_us / 8); + WRITE_ONCE(call->peer->recent_rto_us, call->rto_us); trace_rxrpc_rtt_rx(call, why, rtt_slot, send_serial, resp_serial, - peer->srtt_us >> 3, peer->rto_j); + rtt_us, call->srtt_us, call->rto_us); } /* - * Get the retransmission timeout to set in jiffies, backing it off each time - * we retransmit. + * Get the retransmission timeout to set in nanoseconds, backing it off each + * time we retransmit. */ -unsigned long rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans) +ktime_t rxrpc_get_rto_backoff(struct rxrpc_call *call, bool retrans) { - u64 timo_j; - u8 backoff = READ_ONCE(peer->backoff); + u64 timo_us; + u32 backoff = READ_ONCE(call->backoff); - timo_j = peer->rto_j; - timo_j <<= backoff; - if (retrans && timo_j * 2 <= RXRPC_RTO_MAX) - WRITE_ONCE(peer->backoff, backoff + 1); + timo_us = call->rto_us; + timo_us <<= backoff; + if (retrans && timo_us * 2 <= RXRPC_RTO_MAX) + WRITE_ONCE(call->backoff, backoff + 1); - if (timo_j < 1) - timo_j = 1; + if (timo_us < 1) + timo_us = 1; - return timo_j; + return ns_to_ktime(timo_us * NSEC_PER_USEC); } -void rxrpc_peer_init_rtt(struct rxrpc_peer *peer) +void rxrpc_call_init_rtt(struct rxrpc_call *call) { - peer->rto_j = RXRPC_TIMEOUT_INIT; - peer->mdev_us = jiffies_to_usecs(RXRPC_TIMEOUT_INIT); - peer->backoff = 0; - //minmax_reset(&peer->rtt_min, rxrpc_jiffies32, ~0U); + call->rtt_last_req = KTIME_MIN; + call->rto_us = RXRPC_TIMEOUT_INIT; + call->mdev_us = RXRPC_TIMEOUT_INIT; + call->backoff = 0; + //minmax_reset(&call->rtt_min, rxrpc_jiffies32, ~0U); } diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 6b32d61d4cdc..6cb37b0eb77f 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -145,16 +145,17 @@ error: /* * Work out how much data we can put in a packet. */ -static int rxkad_how_much_data(struct rxrpc_call *call, size_t remain, - size_t *_buf_size, size_t *_data_size, size_t *_offset) +static struct rxrpc_txbuf *rxkad_alloc_txbuf(struct rxrpc_call *call, size_t remain, gfp_t gfp) { - size_t shdr, buf_size, chunk; + struct rxrpc_txbuf *txb; + size_t shdr, alloc, limit, part; + + remain = umin(remain, 65535 - sizeof(struct rxrpc_wire_header)); switch (call->conn->security_level) { default: - buf_size = chunk = min_t(size_t, remain, RXRPC_JUMBO_DATALEN); - shdr = 0; - goto out; + alloc = umin(remain, RXRPC_JUMBO_DATALEN); + return rxrpc_alloc_data_txbuf(call, alloc, 1, gfp); case RXRPC_SECURITY_AUTH: shdr = sizeof(struct rxkad_level1_hdr); break; @@ -163,17 +164,22 @@ static int rxkad_how_much_data(struct rxrpc_call *call, size_t remain, break; } - buf_size = round_down(RXRPC_JUMBO_DATALEN, RXKAD_ALIGN); + limit = round_down(RXRPC_JUMBO_DATALEN, RXKAD_ALIGN) - shdr; + if (remain < limit) { + part = remain; + alloc = round_up(shdr + part, RXKAD_ALIGN); + } else { + part = limit; + alloc = RXRPC_JUMBO_DATALEN; + } - chunk = buf_size - shdr; - if (remain < chunk) - buf_size = round_up(shdr + remain, RXKAD_ALIGN); + txb = rxrpc_alloc_data_txbuf(call, alloc, RXKAD_ALIGN, gfp); + if (!txb) + return NULL; -out: - *_buf_size = buf_size; - *_data_size = chunk; - *_offset = shdr; - return 0; + txb->offset += shdr; + txb->space = part; + return txb; } /* @@ -251,7 +257,7 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, struct rxrpc_txbuf *txb, struct skcipher_request *req) { - struct rxkad_level1_hdr *hdr = (void *)txb->data; + struct rxkad_level1_hdr *hdr = txb->data; struct rxrpc_crypt iv; struct scatterlist sg; size_t pad; @@ -259,22 +265,22 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, _enter(""); - check = txb->seq ^ ntohl(txb->wire.callNumber); + check = txb->seq ^ call->call_id; hdr->data_size = htonl((u32)check << 16 | txb->len); - txb->len += sizeof(struct rxkad_level1_hdr); - pad = txb->len; + txb->pkt_len = sizeof(struct rxkad_level1_hdr) + txb->len; + pad = txb->pkt_len; pad = RXKAD_ALIGN - pad; pad &= RXKAD_ALIGN - 1; if (pad) { memset(txb->data + txb->offset, 0, pad); - txb->len += pad; + txb->pkt_len += pad; } /* start the encryption afresh */ memset(&iv, 0, sizeof(iv)); - sg_init_one(&sg, txb->data, 8); + sg_init_one(&sg, hdr, 8); skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x); @@ -293,37 +299,34 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, struct skcipher_request *req) { const struct rxrpc_key_token *token; - struct rxkad_level2_hdr *rxkhdr = (void *)txb->data; + struct rxkad_level2_hdr *rxkhdr = txb->data; struct rxrpc_crypt iv; struct scatterlist sg; - size_t pad; + size_t content, pad; u16 check; int ret; _enter(""); - check = txb->seq ^ ntohl(txb->wire.callNumber); + check = txb->seq ^ call->call_id; rxkhdr->data_size = htonl(txb->len | (u32)check << 16); rxkhdr->checksum = 0; - txb->len += sizeof(struct rxkad_level2_hdr); - pad = txb->len; - pad = RXKAD_ALIGN - pad; - pad &= RXKAD_ALIGN - 1; - if (pad) { + content = sizeof(struct rxkad_level2_hdr) + txb->len; + txb->pkt_len = round_up(content, RXKAD_ALIGN); + pad = txb->pkt_len - content; + if (pad) memset(txb->data + txb->offset, 0, pad); - txb->len += pad; - } /* encrypt from the session key */ token = call->conn->key->payload.data[0]; memcpy(&iv, token->kad->session_key, sizeof(iv)); - sg_init_one(&sg, txb->data, txb->len); + sg_init_one(&sg, rxkhdr, txb->pkt_len); skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, &sg, &sg, txb->len, iv.x); + skcipher_request_set_crypt(req, &sg, &sg, txb->pkt_len, iv.x); ret = crypto_skcipher_encrypt(req); skcipher_request_zero(req); return ret; @@ -362,9 +365,9 @@ static int rxkad_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) memcpy(&iv, call->conn->rxkad.csum_iv.x, sizeof(iv)); /* calculate the security checksum */ - x = (ntohl(txb->wire.cid) & RXRPC_CHANNELMASK) << (32 - RXRPC_CIDSHIFT); + x = (call->cid & RXRPC_CHANNELMASK) << (32 - RXRPC_CIDSHIFT); x |= txb->seq & 0x3fffffff; - crypto.buf[0] = txb->wire.callNumber; + crypto.buf[0] = htonl(call->call_id); crypto.buf[1] = htonl(x); sg_init_one(&sg, crypto.buf, 8); @@ -378,23 +381,36 @@ static int rxkad_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) y = (y >> 16) & 0xffff; if (y == 0) y = 1; /* zero checksums are not permitted */ - txb->wire.cksum = htons(y); + txb->cksum = htons(y); switch (call->conn->security_level) { case RXRPC_SECURITY_PLAIN: + txb->pkt_len = txb->len; ret = 0; break; case RXRPC_SECURITY_AUTH: ret = rxkad_secure_packet_auth(call, txb, req); + if (txb->alloc_size == RXRPC_JUMBO_DATALEN) + txb->jumboable = true; break; case RXRPC_SECURITY_ENCRYPT: ret = rxkad_secure_packet_encrypt(call, txb, req); + if (txb->alloc_size == RXRPC_JUMBO_DATALEN) + txb->jumboable = true; break; default: ret = -EPERM; break; } + /* Clear excess space in the packet */ + if (txb->pkt_len < txb->alloc_size) { + size_t gap = txb->alloc_size - txb->pkt_len; + void *p = txb->data; + + memset(p + txb->pkt_len, 0, gap); + } + skcipher_request_free(req); _leave(" = %d [set %x]", ret, y); return ret; @@ -726,7 +742,6 @@ static int rxkad_send_response(struct rxrpc_connection *conn, rxrpc_local_dont_fragment(conn->local, false); ret = kernel_sendmsg(conn->local->socket, &msg, iov, 3, len); - rxrpc_local_dont_fragment(conn->local, true); if (ret < 0) { trace_rxrpc_tx_fail(conn->debug_id, serial, ret, rxrpc_tx_point_rxkad_response); @@ -1256,7 +1271,7 @@ const struct rxrpc_security rxkad = { .free_preparse_server_key = rxkad_free_preparse_server_key, .destroy_server_key = rxkad_destroy_server_key, .init_connection_security = rxkad_init_connection_security, - .how_much_data = rxkad_how_much_data, + .alloc_txbuf = rxkad_alloc_txbuf, .secure_packet = rxkad_secure_packet, .verify_packet = rxkad_verify_packet, .free_call_crypto = rxkad_free_call_crypto, diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c index 085e7892d310..e848a4777b8c 100644 --- a/net/rxrpc/rxperf.c +++ b/net/rxrpc/rxperf.c @@ -478,6 +478,18 @@ static int rxperf_deliver_request(struct rxperf_call *call) call->unmarshal++; fallthrough; case 2: + ret = rxperf_extract_data(call, true); + if (ret < 0) + return ret; + + /* Deal with the terminal magic cookie. */ + call->iov_len = 4; + call->kvec[0].iov_len = call->iov_len; + call->kvec[0].iov_base = call->tmp; + iov_iter_kvec(&call->iter, READ, call->kvec, 1, call->iov_len); + call->unmarshal++; + fallthrough; + case 3: ret = rxperf_extract_data(call, false); if (ret < 0) return ret; @@ -503,7 +515,7 @@ static int rxperf_process_call(struct rxperf_call *call) reply_len + sizeof(rxperf_magic_cookie)); while (reply_len > 0) { - len = min_t(size_t, reply_len, PAGE_SIZE); + len = umin(reply_len, PAGE_SIZE); bvec_set_page(&bv, ZERO_PAGE(0), len, 0); iov_iter_bvec(&msg.msg_iter, WRITE, &bv, 1, len); msg.msg_flags = MSG_MORE; diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c index cb8dd1d3b1d4..9784adc8f275 100644 --- a/net/rxrpc/security.c +++ b/net/rxrpc/security.c @@ -114,10 +114,10 @@ found: if (conn->state == RXRPC_CONN_CLIENT_UNSECURED) { ret = conn->security->init_connection_security(conn, token); if (ret == 0) { - spin_lock(&conn->state_lock); + spin_lock_irq(&conn->state_lock); if (conn->state == RXRPC_CONN_CLIENT_UNSECURED) conn->state = RXRPC_CONN_CLIENT; - spin_unlock(&conn->state_lock); + spin_unlock_irq(&conn->state_lock); } } mutex_unlock(&conn->security_lock); diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 5677d5690a02..84dc6c94f23b 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -29,6 +29,7 @@ bool rxrpc_propose_abort(struct rxrpc_call *call, s32 abort_code, int error, call->send_abort_why = why; call->send_abort_err = error; call->send_abort_seq = 0; + trace_rxrpc_abort_call(call, abort_code); /* Request abort locklessly vs rxrpc_input_call_event(). */ smp_store_release(&call->send_abort, abort_code); rxrpc_poke_call(call, rxrpc_call_poke_abort); @@ -93,9 +94,11 @@ no_wait: */ static bool rxrpc_check_tx_space(struct rxrpc_call *call, rxrpc_seq_t *_tx_win) { + rxrpc_seq_t tx_bottom = READ_ONCE(call->tx_bottom); + if (_tx_win) - *_tx_win = call->tx_bottom; - return call->tx_prepared - call->tx_bottom < 256; + *_tx_win = tx_bottom; + return call->send_top - tx_bottom < 256; } /* @@ -131,13 +134,13 @@ static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx, rxrpc_seq_t tx_start, tx_win; signed long rtt, timeout; - rtt = READ_ONCE(call->peer->srtt_us) >> 3; + rtt = READ_ONCE(call->srtt_us) >> 3; rtt = usecs_to_jiffies(rtt) * 2; if (rtt < 2) rtt = 2; timeout = rtt; - tx_start = smp_load_acquire(&call->acks_hard_ack); + tx_start = READ_ONCE(call->tx_bottom); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); @@ -194,8 +197,8 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, DECLARE_WAITQUEUE(myself, current); int ret; - _enter(",{%u,%u,%u,%u}", - call->tx_bottom, call->acks_hard_ack, call->tx_top, call->tx_winsize); + _enter(",{%u,%u,%u}", + call->tx_bottom, call->tx_top, call->tx_winsize); add_wait_queue(&call->waitq, &myself); @@ -239,37 +242,77 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, struct rxrpc_txbuf *txb, rxrpc_notify_end_tx_t notify_end_tx) { + struct rxrpc_txqueue *sq = call->send_queue; rxrpc_seq_t seq = txb->seq; - bool last = test_bit(RXRPC_TXBUF_LAST, &txb->flags), poke; - + bool poke, last = txb->flags & RXRPC_LAST_PACKET; + int ix = seq & RXRPC_TXQ_MASK; rxrpc_inc_stat(call->rxnet, stat_tx_data); - ASSERTCMP(txb->seq, ==, call->tx_prepared + 1); - - /* We have to set the timestamp before queueing as the retransmit - * algorithm can see the packet as soon as we queue it. - */ - txb->last_sent = ktime_get_real(); + ASSERTCMP(txb->seq, ==, call->send_top + 1); if (last) trace_rxrpc_txqueue(call, rxrpc_txqueue_queue_last); else trace_rxrpc_txqueue(call, rxrpc_txqueue_queue); + if (WARN_ON_ONCE(sq->bufs[ix])) + trace_rxrpc_tq(call, sq, seq, rxrpc_tq_queue_dup); + else + trace_rxrpc_tq(call, sq, seq, rxrpc_tq_queue); + /* Add the packet to the call's output buffer */ - spin_lock(&call->tx_lock); - poke = list_empty(&call->tx_sendmsg); - list_add_tail(&txb->call_link, &call->tx_sendmsg); - call->tx_prepared = seq; - if (last) + poke = (READ_ONCE(call->tx_bottom) == call->send_top); + sq->bufs[ix] = txb; + /* Order send_top after the queue->next pointer and txb content. */ + smp_store_release(&call->send_top, seq); + if (last) { + set_bit(RXRPC_CALL_TX_NO_MORE, &call->flags); rxrpc_notify_end_tx(rx, call, notify_end_tx); - spin_unlock(&call->tx_lock); + call->send_queue = NULL; + } if (poke) rxrpc_poke_call(call, rxrpc_call_poke_start); } /* + * Allocate a new txqueue unit and add it to the transmission queue. + */ +static int rxrpc_alloc_txqueue(struct sock *sk, struct rxrpc_call *call) +{ + struct rxrpc_txqueue *tq; + + tq = kzalloc(sizeof(*tq), sk->sk_allocation); + if (!tq) + return -ENOMEM; + + tq->xmit_ts_base = KTIME_MIN; + for (int i = 0; i < RXRPC_NR_TXQUEUE; i++) + tq->segment_xmit_ts[i] = UINT_MAX; + + if (call->send_queue) { + tq->qbase = call->send_top + 1; + call->send_queue->next = tq; + call->send_queue = tq; + } else if (WARN_ON(call->tx_queue)) { + kfree(tq); + return -ENOMEM; + } else { + /* We start at seq 1, so pretend seq 0 is hard-acked. */ + tq->nr_reported_acks = 1; + tq->segment_acked = 1UL; + tq->qbase = 0; + call->tx_qbase = 0; + call->send_queue = tq; + call->tx_qtail = tq; + call->tx_queue = tq; + } + + trace_rxrpc_tq(call, tq, call->send_top, rxrpc_tq_alloc); + return 0; +} + +/* * send data through a socket * - must be called in process context * - The caller holds the call user access mutex, but not the socket lock. @@ -287,6 +330,13 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, bool more = msg->msg_flags & MSG_MORE; int ret, copied = 0; + if (test_bit(RXRPC_CALL_TX_NO_MORE, &call->flags)) { + trace_rxrpc_abort(call->debug_id, rxrpc_sendmsg_late_send, + call->cid, call->call_id, call->rx_consumed, + 0, -EPROTO); + return -EPROTO; + } + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); ret = rxrpc_wait_to_be_connected(call, &timeo); @@ -303,6 +353,11 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); reload: + txb = call->tx_pending; + call->tx_pending = NULL; + if (txb) + rxrpc_see_txbuf(txb, rxrpc_txbuf_see_send_more); + ret = -EPIPE; if (sk->sk_shutdown & SEND_SHUTDOWN) goto maybe_error; @@ -329,53 +384,43 @@ reload: goto maybe_error; } - txb = call->tx_pending; - call->tx_pending = NULL; - if (txb) - rxrpc_see_txbuf(txb, rxrpc_txbuf_see_send_more); - do { if (!txb) { - size_t remain, bufsize, chunk, offset; + size_t remain; _debug("alloc"); if (!rxrpc_check_tx_space(call, NULL)) goto wait_for_space; + /* See if we need to begin/extend the Tx queue. */ + if (!call->send_queue || !((call->send_top + 1) & RXRPC_TXQ_MASK)) { + ret = rxrpc_alloc_txqueue(sk, call); + if (ret < 0) + goto maybe_error; + } + /* Work out the maximum size of a packet. Assume that * the security header is going to be in the padded * region (enc blocksize), but the trailer is not. */ remain = more ? INT_MAX : msg_data_left(msg); - ret = call->conn->security->how_much_data(call, remain, - &bufsize, &chunk, &offset); - if (ret < 0) + txb = call->conn->security->alloc_txbuf(call, remain, sk->sk_allocation); + if (!txb) { + ret = -ENOMEM; goto maybe_error; - - _debug("SIZE: %zu/%zu @%zu", chunk, bufsize, offset); - - /* create a buffer that we can retain until it's ACK'd */ - ret = -ENOMEM; - txb = rxrpc_alloc_txbuf(call, RXRPC_PACKET_TYPE_DATA, - GFP_KERNEL); - if (!txb) - goto maybe_error; - - txb->offset = offset; - txb->space -= offset; - txb->space = min_t(size_t, chunk, txb->space); + } } _debug("append"); /* append next segment of data to the current buffer */ if (msg_data_left(msg) > 0) { - size_t copy = min_t(size_t, txb->space, msg_data_left(msg)); + size_t copy = umin(txb->space, msg_data_left(msg)); _debug("add %zu", copy); - if (!copy_from_iter_full(txb->data + txb->offset, copy, - &msg->msg_iter)) + if (!copy_from_iter_full(txb->data + txb->offset, + copy, &msg->msg_iter)) goto efault; _debug("added"); txb->space -= copy; @@ -394,18 +439,12 @@ reload: /* add the packet to the send queue if it's now full */ if (!txb->space || (msg_data_left(msg) == 0 && !more)) { - if (msg_data_left(msg) == 0 && !more) { - txb->wire.flags |= RXRPC_LAST_PACKET; - __set_bit(RXRPC_TXBUF_LAST, &txb->flags); - } - else if (call->tx_top - call->acks_hard_ack < - call->tx_winsize) - txb->wire.flags |= RXRPC_MORE_PACKETS; + if (msg_data_left(msg) == 0 && !more) + txb->flags |= RXRPC_LAST_PACKET; ret = call->security->secure_packet(call, txb); if (ret < 0) goto out; - rxrpc_queue_packet(rx, call, txb, notify_end_tx); txb = NULL; } @@ -621,7 +660,6 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) __releases(&rx->sk.sk_lock.slock) { struct rxrpc_call *call; - unsigned long now, j; bool dropped_lock = false; int ret; @@ -667,7 +705,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) } else { switch (rxrpc_call_state(call)) { case RXRPC_CALL_CLIENT_AWAIT_CONN: - case RXRPC_CALL_SERVER_SECURING: + case RXRPC_CALL_SERVER_RECV_REQUEST: if (p.command == RXRPC_CMD_SEND_ABORT) break; fallthrough; @@ -699,25 +737,21 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) switch (p.call.nr_timeouts) { case 3: - j = msecs_to_jiffies(p.call.timeouts.normal); - if (p.call.timeouts.normal > 0 && j == 0) - j = 1; - WRITE_ONCE(call->next_rx_timo, j); + WRITE_ONCE(call->next_rx_timo, p.call.timeouts.normal); fallthrough; case 2: - j = msecs_to_jiffies(p.call.timeouts.idle); - if (p.call.timeouts.idle > 0 && j == 0) - j = 1; - WRITE_ONCE(call->next_req_timo, j); + WRITE_ONCE(call->next_req_timo, p.call.timeouts.idle); fallthrough; case 1: if (p.call.timeouts.hard > 0) { - j = p.call.timeouts.hard * HZ; - now = jiffies; - j += now; - WRITE_ONCE(call->expect_term_by, j); - rxrpc_reduce_call_timer(call, j, now, - rxrpc_timer_set_for_hard); + ktime_t delay = ms_to_ktime(p.call.timeouts.hard * MSEC_PER_SEC); + + WRITE_ONCE(call->expect_term_by, + ktime_add(p.call.timeouts.hard, + ktime_get_real())); + trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_hard); + rxrpc_poke_call(call, rxrpc_call_poke_set_timeout); + } break; } diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index ecaeb4ecfb58..46a20cf4c402 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -11,10 +11,14 @@ #include "ar-internal.h" static struct ctl_table_header *rxrpc_sysctl_reg_table; +static const unsigned int rxrpc_rx_mtu_min = 500; +static const unsigned int rxrpc_jumbo_max = RXRPC_MAX_NR_JUMBO; static const unsigned int four = 4; static const unsigned int max_backlog = RXRPC_BACKLOG_MAX - 1; static const unsigned int n_65535 = 65535; static const unsigned int n_max_acks = 255; +static const unsigned long one_ms = 1; +static const unsigned long max_ms = 1000; static const unsigned long one_jiffy = 1; static const unsigned long max_jiffies = MAX_JIFFY_OFFSET; #ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY @@ -28,24 +32,24 @@ static const unsigned long max_500 = 500; * information on the individual parameters. */ static struct ctl_table rxrpc_sysctl_table[] = { - /* Values measured in milliseconds but used in jiffies */ + /* Values measured in milliseconds */ { .procname = "soft_ack_delay", .data = &rxrpc_soft_ack_delay, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_doulongvec_ms_jiffies_minmax, - .extra1 = (void *)&one_jiffy, - .extra2 = (void *)&max_jiffies, + .proc_handler = proc_doulongvec_minmax, + .extra1 = (void *)&one_ms, + .extra2 = (void *)&max_ms, }, { .procname = "idle_ack_delay", .data = &rxrpc_idle_ack_delay, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_doulongvec_ms_jiffies_minmax, - .extra1 = (void *)&one_jiffy, - .extra2 = (void *)&max_jiffies, + .proc_handler = proc_doulongvec_minmax, + .extra1 = (void *)&one_ms, + .extra2 = (void *)&max_ms, }, { .procname = "idle_conn_expiry", @@ -113,7 +117,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)SYSCTL_ONE, + .extra1 = (void *)&rxrpc_rx_mtu_min, .extra2 = (void *)&n_65535, }, { @@ -123,9 +127,8 @@ static struct ctl_table rxrpc_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = (void *)SYSCTL_ONE, - .extra2 = (void *)&four, + .extra2 = (void *)&rxrpc_jumbo_max, }, - { } }; int __init rxrpc_sysctl_init(void) diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c index d43be8512386..c550991d48fa 100644 --- a/net/rxrpc/txbuf.c +++ b/net/rxrpc/txbuf.c @@ -14,45 +14,49 @@ static atomic_t rxrpc_txbuf_debug_ids; atomic_t rxrpc_nr_txbuf; /* - * Allocate and partially initialise an I/O request structure. + * Allocate and partially initialise a data transmission buffer. */ -struct rxrpc_txbuf *rxrpc_alloc_txbuf(struct rxrpc_call *call, u8 packet_type, - gfp_t gfp) +struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_size, + size_t data_align, gfp_t gfp) { struct rxrpc_txbuf *txb; - - txb = kmalloc(sizeof(*txb), gfp); - if (txb) { - INIT_LIST_HEAD(&txb->call_link); - INIT_LIST_HEAD(&txb->tx_link); - refcount_set(&txb->ref, 1); - txb->call_debug_id = call->debug_id; - txb->debug_id = atomic_inc_return(&rxrpc_txbuf_debug_ids); - txb->space = sizeof(txb->data); - txb->len = 0; - txb->offset = 0; - txb->flags = 0; - txb->ack_why = 0; - txb->seq = call->tx_prepared + 1; - txb->wire.epoch = htonl(call->conn->proto.epoch); - txb->wire.cid = htonl(call->cid); - txb->wire.callNumber = htonl(call->call_id); - txb->wire.seq = htonl(txb->seq); - txb->wire.type = packet_type; - txb->wire.flags = call->conn->out_clientflag; - txb->wire.userStatus = 0; - txb->wire.securityIndex = call->security_ix; - txb->wire._rsvd = 0; - txb->wire.serviceId = htons(call->dest_srx.srx_service); - - trace_rxrpc_txbuf(txb->debug_id, - txb->call_debug_id, txb->seq, 1, - packet_type == RXRPC_PACKET_TYPE_DATA ? - rxrpc_txbuf_alloc_data : - rxrpc_txbuf_alloc_ack); - atomic_inc(&rxrpc_nr_txbuf); + size_t total, doff, jsize = sizeof(struct rxrpc_jumbo_header); + void *buf; + + txb = kzalloc(sizeof(*txb), gfp); + if (!txb) + return NULL; + + /* We put a jumbo header in the buffer, but not a full wire header to + * avoid delayed-corruption problems with zerocopy. + */ + doff = round_up(jsize, data_align); + total = doff + data_size; + + data_align = umax(data_align, L1_CACHE_BYTES); + mutex_lock(&call->conn->tx_data_alloc_lock); + buf = page_frag_alloc_align(&call->conn->tx_data_alloc, total, gfp, + data_align); + mutex_unlock(&call->conn->tx_data_alloc_lock); + if (!buf) { + kfree(txb); + return NULL; } + refcount_set(&txb->ref, 1); + txb->call_debug_id = call->debug_id; + txb->debug_id = atomic_inc_return(&rxrpc_txbuf_debug_ids); + txb->alloc_size = data_size; + txb->space = data_size; + txb->offset = 0; + txb->flags = call->conn->out_clientflag; + txb->seq = call->send_top + 1; + txb->data = buf + doff; + + trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 1, + rxrpc_txbuf_alloc_data); + + atomic_inc(&rxrpc_nr_txbuf); return txb; } @@ -71,12 +75,12 @@ void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what) trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, r, what); } -static void rxrpc_free_txbuf(struct rcu_head *rcu) +static void rxrpc_free_txbuf(struct rxrpc_txbuf *txb) { - struct rxrpc_txbuf *txb = container_of(rcu, struct rxrpc_txbuf, rcu); - trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 0, rxrpc_txbuf_free); + if (txb->data) + page_frag_free(txb->data); kfree(txb); atomic_dec(&rxrpc_nr_txbuf); } @@ -95,40 +99,6 @@ void rxrpc_put_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what) dead = __refcount_dec_and_test(&txb->ref, &r); trace_rxrpc_txbuf(debug_id, call_debug_id, seq, r - 1, what); if (dead) - call_rcu(&txb->rcu, rxrpc_free_txbuf); - } -} - -/* - * Shrink the transmit buffer. - */ -void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *call) -{ - struct rxrpc_txbuf *txb; - rxrpc_seq_t hard_ack = smp_load_acquire(&call->acks_hard_ack); - bool wake = false; - - _enter("%x/%x/%x", call->tx_bottom, call->acks_hard_ack, call->tx_top); - - while ((txb = list_first_entry_or_null(&call->tx_buffer, - struct rxrpc_txbuf, call_link))) { - hard_ack = smp_load_acquire(&call->acks_hard_ack); - if (before(hard_ack, txb->seq)) - break; - - if (txb->seq != call->tx_bottom + 1) - rxrpc_see_txbuf(txb, rxrpc_txbuf_see_out_of_step); - ASSERTCMP(txb->seq, ==, call->tx_bottom + 1); - smp_store_release(&call->tx_bottom, call->tx_bottom + 1); - list_del_rcu(&txb->call_link); - - trace_rxrpc_txqueue(call, rxrpc_txqueue_dequeue); - - rxrpc_put_txbuf(txb, rxrpc_txbuf_put_rotated); - if (after(call->acks_hard_ack, call->tx_bottom + 128)) - wake = true; + rxrpc_free_txbuf(txb); } - - if (wake) - wake_up(&call->waitq); } |