diff options
Diffstat (limited to 'net/rxrpc')
37 files changed, 5935 insertions, 1507 deletions
diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig index a20986806fea..f60b81c66078 100644 --- a/net/rxrpc/Kconfig +++ b/net/rxrpc/Kconfig @@ -67,6 +67,29 @@ config RXKAD See Documentation/networking/rxrpc.rst. +config RXGK + bool "RxRPC GSSAPI security" + select CRYPTO_KRB5 + select CRYPTO_MANAGER + select CRYPTO_KRB5ENC + select CRYPTO_AUTHENC + select CRYPTO_SKCIPHER + select CRYPTO_HASH_INFO + select CRYPTO_HMAC + select CRYPTO_CMAC + select CRYPTO_SHA1 + select CRYPTO_SHA256 + select CRYPTO_SHA512 + select CRYPTO_CBC + select CRYPTO_CTS + select CRYPTO_AES + select CRYPTO_CAMELLIA + help + Provide the GSSAPI-based RxGK security class for AFS. Keys are added + with add_key(). + + See Documentation/networking/rxrpc.rst. + config RXPERF tristate "RxRPC test service" help diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile index ac5caf5a48e1..c0542bae719e 100644 --- a/net/rxrpc/Makefile +++ b/net/rxrpc/Makefile @@ -16,6 +16,7 @@ rxrpc-y := \ conn_object.o \ conn_service.o \ input.o \ + input_rack.o \ insecure.o \ io_thread.o \ key.o \ @@ -23,6 +24,7 @@ rxrpc-y := \ local_object.o \ misc.o \ net_ns.o \ + oob.o \ output.o \ peer_event.o \ peer_object.o \ @@ -38,6 +40,9 @@ rxrpc-y := \ rxrpc-$(CONFIG_PROC_FS) += proc.o rxrpc-$(CONFIG_RXKAD) += rxkad.o rxrpc-$(CONFIG_SYSCTL) += sysctl.o - +rxrpc-$(CONFIG_RXGK) += \ + rxgk.o \ + rxgk_app.o \ + rxgk_kdf.o obj-$(CONFIG_RXPERF) += rxperf.o diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 5222bc97d192..36df0274d7b7 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -65,7 +65,7 @@ static void rxrpc_write_space(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible(&wq->wait); - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); } @@ -265,7 +265,10 @@ static int rxrpc_listen(struct socket *sock, int backlog) * @gfp: Allocation flags * * Lookup or create a remote transport endpoint record for the specified - * address and return it with a ref held. + * address. + * + * Return: The peer record found with a reference, %NULL if no record is found + * or a negative error code if the address is invalid or unsupported. */ struct rxrpc_peer *rxrpc_kernel_lookup_peer(struct socket *sock, struct sockaddr_rxrpc *srx, gfp_t gfp) @@ -283,9 +286,11 @@ EXPORT_SYMBOL(rxrpc_kernel_lookup_peer); /** * rxrpc_kernel_get_peer - Get a reference on a peer - * @peer: The peer to get a reference on. + * @peer: The peer to get a reference on (may be NULL). + * + * Get a reference for a remote peer record (if not NULL). * - * Get a record for the remote peer in a call. + * Return: The @peer argument. */ struct rxrpc_peer *rxrpc_kernel_get_peer(struct rxrpc_peer *peer) { @@ -296,6 +301,8 @@ EXPORT_SYMBOL(rxrpc_kernel_get_peer); /** * rxrpc_kernel_put_peer - Allow a kernel app to drop a peer reference * @peer: The peer to drop a ref on + * + * Drop a reference on a peer record. */ void rxrpc_kernel_put_peer(struct rxrpc_peer *peer) { @@ -320,10 +327,12 @@ EXPORT_SYMBOL(rxrpc_kernel_put_peer); * * Allow a kernel service to begin a call on the nominated socket. This just * sets up all the internal tracking structures and allocates connection and - * call IDs as appropriate. The call to be used is returned. + * call IDs as appropriate. * * The default socket destination address and security may be overridden by * supplying @srx and @key. + * + * Return: The new call or an error code. */ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, struct rxrpc_peer *peer, @@ -408,9 +417,9 @@ void rxrpc_kernel_shutdown_call(struct socket *sock, struct rxrpc_call *call) /* Make sure we're not going to call back into a kernel service */ if (call->notify_rx) { - spin_lock(&call->notify_lock); + spin_lock_irq(&call->notify_lock); call->notify_rx = rxrpc_dummy_notify_rx; - spin_unlock(&call->notify_lock); + spin_unlock_irq(&call->notify_lock); } } mutex_unlock(&call->user_mutex); @@ -437,6 +446,8 @@ EXPORT_SYMBOL(rxrpc_kernel_put_call); * * Allow a kernel service to find out whether a call is still alive - whether * it has completed successfully and all received data has been consumed. + * + * Return: %true if the call is still ongoing and %false if it has completed. */ bool rxrpc_kernel_check_life(const struct socket *sock, const struct rxrpc_call *call) @@ -450,63 +461,20 @@ bool rxrpc_kernel_check_life(const struct socket *sock, EXPORT_SYMBOL(rxrpc_kernel_check_life); /** - * rxrpc_kernel_get_epoch - Retrieve the epoch value from a call. - * @sock: The socket the call is on - * @call: The call to query - * - * Allow a kernel service to retrieve the epoch value from a service call to - * see if the client at the other end rebooted. - */ -u32 rxrpc_kernel_get_epoch(struct socket *sock, struct rxrpc_call *call) -{ - return call->conn->proto.epoch; -} -EXPORT_SYMBOL(rxrpc_kernel_get_epoch); - -/** - * rxrpc_kernel_new_call_notification - Get notifications of new calls - * @sock: The socket to intercept received messages on - * @notify_new_call: Function to be called when new calls appear - * @discard_new_call: Function to discard preallocated calls + * rxrpc_kernel_set_notifications - Set table of callback operations + * @sock: The socket to install table upon + * @app_ops: Callback operation table to set * - * Allow a kernel service to be given notifications about new calls. + * Allow a kernel service to set a table of event notifications on a socket. */ -void rxrpc_kernel_new_call_notification( - struct socket *sock, - rxrpc_notify_new_call_t notify_new_call, - rxrpc_discard_new_call_t discard_new_call) +void rxrpc_kernel_set_notifications(struct socket *sock, + const struct rxrpc_kernel_ops *app_ops) { struct rxrpc_sock *rx = rxrpc_sk(sock->sk); - rx->notify_new_call = notify_new_call; - rx->discard_new_call = discard_new_call; + rx->app_ops = app_ops; } -EXPORT_SYMBOL(rxrpc_kernel_new_call_notification); - -/** - * rxrpc_kernel_set_max_life - Set maximum lifespan on a call - * @sock: The socket the call is on - * @call: The call to configure - * @hard_timeout: The maximum lifespan of the call in ms - * - * Set the maximum lifespan of a call. The call will end with ETIME or - * ETIMEDOUT if it takes longer than this. - */ -void rxrpc_kernel_set_max_life(struct socket *sock, struct rxrpc_call *call, - unsigned long hard_timeout) -{ - ktime_t delay = ms_to_ktime(hard_timeout), expect_term_by; - - mutex_lock(&call->user_mutex); - - expect_term_by = ktime_add(ktime_get_real(), delay); - WRITE_ONCE(call->expect_term_by, expect_term_by); - trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_hard); - rxrpc_poke_call(call, rxrpc_call_poke_set_timeout); - - mutex_unlock(&call->user_mutex); -} -EXPORT_SYMBOL(rxrpc_kernel_set_max_life); +EXPORT_SYMBOL(rxrpc_kernel_set_notifications); /* * connect an RxRPC socket @@ -624,7 +592,10 @@ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len) fallthrough; case RXRPC_SERVER_BOUND: case RXRPC_SERVER_LISTENING: - ret = rxrpc_do_sendmsg(rx, m, len); + if (m->msg_flags & MSG_OOB) + ret = rxrpc_sendmsg_oob(rx, m, len); + else + ret = rxrpc_do_sendmsg(rx, m, len); /* The socket has been unlocked */ goto out; default: @@ -659,7 +630,7 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct rxrpc_sock *rx = rxrpc_sk(sock->sk); - unsigned int min_sec_level; + unsigned int min_sec_level, val; u16 service_upgrade[2]; int ret; @@ -707,9 +678,10 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, ret = -EISCONN; if (rx->sk.sk_state != RXRPC_UNBOUND) goto error; - ret = copy_from_sockptr(&min_sec_level, optval, - sizeof(unsigned int)); - if (ret < 0) + ret = copy_safe_from_sockptr(&min_sec_level, + sizeof(min_sec_level), + optval, optlen); + if (ret) goto error; ret = -EINVAL; if (min_sec_level > RXRPC_SECURITY_MAX) @@ -739,6 +711,26 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, rx->service_upgrade.to = service_upgrade[1]; goto success; + case RXRPC_MANAGE_RESPONSE: + ret = -EINVAL; + if (optlen != sizeof(unsigned int)) + goto error; + ret = -EISCONN; + if (rx->sk.sk_state != RXRPC_UNBOUND) + goto error; + ret = copy_safe_from_sockptr(&val, sizeof(val), + optval, optlen); + if (ret) + goto error; + ret = -EINVAL; + if (val > 1) + goto error; + if (val) + set_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); + else + clear_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); + goto success; + default: break; } @@ -845,6 +837,8 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol, rx->calls = RB_ROOT; spin_lock_init(&rx->incoming_lock); + skb_queue_head_init(&rx->recvmsg_oobq); + rx->pending_oobq = RB_ROOT; INIT_LIST_HEAD(&rx->sock_calls); INIT_LIST_HEAD(&rx->to_be_accepted); INIT_LIST_HEAD(&rx->recvmsg_q); @@ -878,8 +872,10 @@ static int rxrpc_shutdown(struct socket *sock, int flags) lock_sock(sk); if (sk->sk_state < RXRPC_CLOSE) { + spin_lock_irq(&rx->recvmsg_lock); sk->sk_state = RXRPC_CLOSE; sk->sk_shutdown = SHUTDOWN_MASK; + spin_unlock_irq(&rx->recvmsg_lock); } else { ret = -ESHUTDOWN; } @@ -891,12 +887,30 @@ static int rxrpc_shutdown(struct socket *sock, int flags) } /* + * Purge the out-of-band queue. + */ +static void rxrpc_purge_oob_queue(struct sock *sk) +{ + struct rxrpc_sock *rx = rxrpc_sk(sk); + struct sk_buff *skb; + + while ((skb = skb_dequeue(&rx->recvmsg_oobq))) + rxrpc_kernel_free_oob(skb); + while (!RB_EMPTY_ROOT(&rx->pending_oobq)) { + skb = rb_entry(rx->pending_oobq.rb_node, struct sk_buff, rbnode); + rb_erase(&skb->rbnode, &rx->pending_oobq); + rxrpc_kernel_free_oob(skb); + } +} + +/* * RxRPC socket destructor */ static void rxrpc_sock_destructor(struct sock *sk) { _enter("%p", sk); + rxrpc_purge_oob_queue(sk); rxrpc_purge_queue(&sk->sk_receive_queue); WARN_ON(refcount_read(&sk->sk_wmem_alloc)); @@ -935,7 +949,9 @@ static int rxrpc_release_sock(struct sock *sk) break; } + spin_lock_irq(&rx->recvmsg_lock); sk->sk_state = RXRPC_CLOSE; + spin_unlock_irq(&rx->recvmsg_lock); if (rx->local && rx->local->service == rx) { write_lock(&rx->local->services_lock); @@ -947,6 +963,7 @@ static int rxrpc_release_sock(struct sock *sk) rxrpc_discard_prealloc(rx); rxrpc_release_calls_on_socket(rx); flush_workqueue(rxrpc_workqueue); + rxrpc_purge_oob_queue(sk); rxrpc_purge_queue(&sk->sk_receive_queue); rxrpc_unuse_local(rx->local, rxrpc_local_unuse_release_sock); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 08c0a32db8c7..376e33dce8c1 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -30,6 +30,8 @@ struct rxrpc_crypt { struct key_preparsed_payload; struct rxrpc_connection; struct rxrpc_txbuf; +struct rxrpc_txqueue; +struct rxgk_context; /* * Mark applied to socket buffers in skb->mark. skb->priority is used @@ -38,6 +40,7 @@ struct rxrpc_txbuf; enum rxrpc_skb_mark { RXRPC_SKB_MARK_PACKET, /* Received packet */ RXRPC_SKB_MARK_ERROR, /* Error notification */ + RXRPC_SKB_MARK_CHALLENGE, /* Challenge notification */ RXRPC_SKB_MARK_SERVICE_CONN_SECURED, /* Service connection response has been verified */ RXRPC_SKB_MARK_REJECT_BUSY, /* Reject with BUSY */ RXRPC_SKB_MARK_REJECT_ABORT, /* Reject with ABORT (code in skb->priority) */ @@ -98,6 +101,7 @@ struct rxrpc_net { atomic_t stat_tx_data_send; atomic_t stat_tx_data_send_frag; atomic_t stat_tx_data_send_fail; + atomic_t stat_tx_data_send_msgsize; atomic_t stat_tx_data_underflow; atomic_t stat_tx_data_cwnd_reset; atomic_t stat_rx_data; @@ -109,6 +113,8 @@ struct rxrpc_net { atomic_t stat_tx_ack_skip; atomic_t stat_tx_acks[256]; atomic_t stat_rx_acks[256]; + atomic_t stat_tx_jumbo[10]; + atomic_t stat_rx_jumbo[10]; atomic_t stat_why_req_ack[8]; @@ -142,10 +148,12 @@ struct rxrpc_backlog { struct rxrpc_sock { /* WARNING: sk has to be the first member */ struct sock sk; - rxrpc_notify_new_call_t notify_new_call; /* Func to notify of new call */ - rxrpc_discard_new_call_t discard_new_call; /* Func to discard a new call */ + const struct rxrpc_kernel_ops *app_ops; /* Table of kernel app notification funcs */ struct rxrpc_local *local; /* local endpoint */ struct rxrpc_backlog *backlog; /* Preallocation for services */ + struct sk_buff_head recvmsg_oobq; /* OOB messages for recvmsg to pick up */ + struct rb_root pending_oobq; /* OOB messages awaiting userspace to respond to */ + u64 oob_id_counter; /* OOB message ID counter */ spinlock_t incoming_lock; /* Incoming call vs service shutdown lock */ struct list_head sock_calls; /* List of calls owned by this socket */ struct list_head to_be_accepted; /* calls awaiting acceptance */ @@ -156,6 +164,7 @@ struct rxrpc_sock { struct rb_root calls; /* User ID -> call mapping */ unsigned long flags; #define RXRPC_SOCK_CONNECTED 0 /* connect_srx is set */ +#define RXRPC_SOCK_MANAGE_RESPONSE 1 /* User wants to manage RESPONSE packets */ rwlock_t call_lock; /* lock for calls */ u32 min_sec_level; /* minimum security level */ #define RXRPC_SECURITY_MAX RXRPC_SECURITY_ENCRYPT @@ -199,7 +208,7 @@ struct rxrpc_host_header { */ struct rxrpc_skb_priv { union { - struct rxrpc_connection *conn; /* Connection referred to (poke packet) */ + struct rxrpc_connection *poke_conn; /* Conn referred to (poke packet) */ struct { u16 offset; /* Offset of data */ u16 len; /* Length of data */ @@ -210,10 +219,22 @@ struct rxrpc_skb_priv { rxrpc_seq_t first_ack; /* First packet in acks table */ rxrpc_seq_t prev_ack; /* Highest seq seen */ rxrpc_serial_t acked_serial; /* Packet in response to (or 0) */ + u16 nr_acks; /* Number of acks+nacks */ u8 reason; /* Reason for ack */ - u8 nr_acks; /* Number of acks+nacks */ - u8 nr_nacks; /* Number of nacks */ } ack; + struct { + struct rxrpc_connection *conn; /* Connection referred to */ + union { + u32 rxkad_nonce; + }; + } chall; + struct { + rxrpc_serial_t challenge_serial; + u32 kvno; + u32 version; + u16 len; + u16 ticket_len; + } resp; }; struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */ }; @@ -267,9 +288,24 @@ struct rxrpc_security { /* issue a challenge */ int (*issue_challenge)(struct rxrpc_connection *); + /* Validate a challenge packet */ + bool (*validate_challenge)(struct rxrpc_connection *conn, + struct sk_buff *skb); + + /* Fill out the cmsg for recvmsg() to pass on a challenge to userspace. + * The security class gets to add additional information. + */ + int (*challenge_to_recvmsg)(struct rxrpc_connection *conn, + struct sk_buff *challenge, + struct msghdr *msg); + + /* Parse sendmsg() control message and respond to challenge. */ + int (*sendmsg_respond_to_challenge)(struct sk_buff *challenge, + struct msghdr *msg); + /* respond to a challenge */ - int (*respond_to_challenge)(struct rxrpc_connection *, - struct sk_buff *); + int (*respond_to_challenge)(struct rxrpc_connection *conn, + struct sk_buff *challenge); /* verify a response */ int (*verify_response)(struct rxrpc_connection *, @@ -277,6 +313,11 @@ struct rxrpc_security { /* clear connection security */ void (*clear)(struct rxrpc_connection *); + + /* Default ticket -> key decoder */ + int (*default_decode_ticket)(struct rxrpc_connection *conn, struct sk_buff *skb, + unsigned int ticket_offset, unsigned int ticket_len, + struct key **_key); }; /* @@ -320,6 +361,15 @@ struct rxrpc_local { struct list_head new_client_calls; /* Newly created client calls need connection */ spinlock_t client_call_lock; /* Lock for ->new_client_calls */ struct sockaddr_rxrpc srx; /* local address */ + union { + /* Provide a kvec table sufficiently large to manage either a + * DATA packet with a maximum set of jumbo subpackets or a PING + * ACK padded out to 64K with zeropages for PMTUD. + */ + struct kvec kvec[1 + RXRPC_MAX_NR_JUMBO > 3 + 16 ? + 1 + RXRPC_MAX_NR_JUMBO : 3 + 16]; + struct bio_vec bvec[3 + 16]; + }; }; /* @@ -335,28 +385,31 @@ struct rxrpc_peer { struct hlist_head error_targets; /* targets for net error distribution */ struct rb_root service_conns; /* Service connections */ struct list_head keepalive_link; /* Link in net->peer_keepalive[] */ + unsigned long app_data; /* Application data (e.g. afs_server) */ time64_t last_tx_at; /* Last time packet sent here */ seqlock_t service_conn_lock; spinlock_t lock; /* access lock */ - unsigned int if_mtu; /* interface MTU for this peer */ - unsigned int mtu; /* network MTU for this peer */ - unsigned int maxdata; /* data size (MTU - hdrsize) */ - unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */ int debug_id; /* debug ID for printks */ struct sockaddr_rxrpc srx; /* remote address */ - /* calculated RTT cache */ -#define RXRPC_RTT_CACHE_SIZE 32 - spinlock_t rtt_input_lock; /* RTT lock for input routine */ - ktime_t rtt_last_req; /* Time of last RTT request */ - unsigned int rtt_count; /* Number of samples we've got */ + /* Path MTU discovery [RFC8899] */ + unsigned int pmtud_trial; /* Current MTU probe size */ + unsigned int pmtud_good; /* Largest working MTU probe we've tried */ + unsigned int pmtud_bad; /* Smallest non-working MTU probe we've tried */ + bool pmtud_lost; /* T if MTU probe was lost */ + bool pmtud_probing; /* T if we have an active probe outstanding */ + bool pmtud_pending; /* T if a call to this peer should send a probe */ + u8 pmtud_jumbo; /* Max jumbo packets for the MTU */ + bool ackr_adv_pmtud; /* T if the peer advertises path-MTU */ + unsigned int ackr_max_data; /* Maximum data advertised by peer */ + unsigned int if_mtu; /* Local interface MTU (- hdrsize) for this peer */ + unsigned int max_data; /* Maximum packet data capacity for this peer */ + unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */ + unsigned short tx_seg_max; /* Maximum number of transmissable segments */ - u32 srtt_us; /* smoothed round trip time << 3 in usecs */ - u32 mdev_us; /* medium deviation */ - u32 mdev_max_us; /* maximal mdev for the last rtt period */ - u32 rttvar_us; /* smoothed mdev_max */ - u32 rto_us; /* Retransmission timeout in usec */ - u8 backoff; /* Backoff timeout (as shift) */ + /* Calculated RTT cache */ + unsigned int recent_srtt_us; + unsigned int recent_rto_us; u8 cong_ssthresh; /* Congestion slow-start threshold */ }; @@ -514,7 +567,17 @@ struct rxrpc_connection { struct rxrpc_crypt csum_iv; /* packet checksum base */ u32 nonce; /* response re-use preventer */ } rxkad; + struct { + struct rxgk_context *keys[4]; /* (Re-)keying buffer */ + u64 start_time; /* The start time for TK derivation */ + u8 nonce[20]; /* Response re-use preventer */ + u32 enctype; /* Kerberos 5 encoding type */ + u32 key_number; /* Current key number */ + } rxgk; }; + rwlock_t security_use_lock; /* Security use/modification lock */ + struct sk_buff *tx_response; /* Response packet to be transmitted */ + unsigned long flags; unsigned long events; unsigned long idle_timestamp; /* Time at which last became idle */ @@ -525,6 +588,8 @@ struct rxrpc_connection { int debug_id; /* debug ID for printks */ rxrpc_serial_t tx_serial; /* Outgoing packet serial number counter */ unsigned int hi_serial; /* highest serial number received */ + rxrpc_serial_t pmtud_probe; /* Serial of MTU probe (or 0) */ + unsigned int pmtud_call; /* ID of call used for probe */ u32 service_id; /* Service ID, possibly upgraded */ u32 security_level; /* Security level selected */ u8 security_ix; /* security type */ @@ -557,6 +622,7 @@ enum rxrpc_call_flag { RXRPC_CALL_RX_LAST, /* Received the last packet (at rxtx_top) */ RXRPC_CALL_TX_LAST, /* Last packet in Tx buffer (at rxtx_top) */ RXRPC_CALL_TX_ALL_ACKED, /* Last packet has been hard-acked */ + RXRPC_CALL_TX_NO_MORE, /* No more data to transmit (MSG_MORE deasserted) */ RXRPC_CALL_SEND_PING, /* A ping will need to be sent */ RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */ RXRPC_CALL_BEGAN_RX_TIMER, /* We began the expect_rx_by timer */ @@ -567,6 +633,7 @@ enum rxrpc_call_flag { RXRPC_CALL_EXCLUSIVE, /* The call uses a once-only connection */ RXRPC_CALL_RX_IS_IDLE, /* recvmsg() is idle - send an ACK */ RXRPC_CALL_RECVMSG_READ_ALL, /* recvmsg() read all of the received data */ + RXRPC_CALL_CONN_CHALLENGING, /* The connection is being challenged */ }; /* @@ -587,7 +654,6 @@ enum rxrpc_call_state { RXRPC_CALL_CLIENT_AWAIT_REPLY, /* - client awaiting reply */ RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */ RXRPC_CALL_SERVER_PREALLOC, /* - service preallocation */ - RXRPC_CALL_SERVER_SECURING, /* - server securing request connection */ RXRPC_CALL_SERVER_RECV_REQUEST, /* - server receiving request */ RXRPC_CALL_SERVER_ACK_REQUEST, /* - server pending ACK of request */ RXRPC_CALL_SERVER_SEND_REPLY, /* - server sending reply */ @@ -599,13 +665,25 @@ enum rxrpc_call_state { /* * Call Tx congestion management modes. */ -enum rxrpc_congest_mode { - RXRPC_CALL_SLOW_START, - RXRPC_CALL_CONGEST_AVOIDANCE, - RXRPC_CALL_PACKET_LOSS, - RXRPC_CALL_FAST_RETRANSMIT, - NR__RXRPC_CONGEST_MODES -}; +enum rxrpc_ca_state { + RXRPC_CA_SLOW_START, + RXRPC_CA_CONGEST_AVOIDANCE, + RXRPC_CA_PACKET_LOSS, + RXRPC_CA_FAST_RETRANSMIT, + NR__RXRPC_CA_STATES +} __mode(byte); + +/* + * Current purpose of call RACK timer. According to the RACK-TLP protocol + * [RFC8985], the transmission timer (call->rack_timo_at) may only be used for + * one of these at once. + */ +enum rxrpc_rack_timer_mode { + RXRPC_CALL_RACKTIMER_OFF, /* Timer not running */ + RXRPC_CALL_RACKTIMER_RACK_REORDER, /* RACK reordering timer */ + RXRPC_CALL_RACKTIMER_TLP_PTO, /* TLP timeout */ + RXRPC_CALL_RACKTIMER_RTO, /* Retransmission timeout */ +} __mode(byte); /* * RxRPC call definition @@ -624,8 +702,7 @@ struct rxrpc_call { struct mutex user_mutex; /* User access mutex */ struct sockaddr_rxrpc dest_srx; /* Destination address */ ktime_t delay_ack_at; /* When DELAY ACK needs to happen */ - ktime_t ack_lost_at; /* When ACK is figured as lost */ - ktime_t resend_at; /* When next resend needs to happen */ + ktime_t rack_timo_at; /* When ACK is figured as lost */ ktime_t ping_at; /* When next to send a ping */ ktime_t keepalive_at; /* When next to send a keepalive ping */ ktime_t expect_rx_by; /* When we expect to get a packet by */ @@ -666,25 +743,35 @@ struct rxrpc_call { u32 call_id; /* call ID on connection */ u32 cid; /* connection ID plus channel index */ u32 security_level; /* Security level selected */ + u32 security_enctype; /* Security-specific encoding type (or 0) */ int debug_id; /* debug ID for printks */ unsigned short rx_pkt_offset; /* Current recvmsg packet offset */ unsigned short rx_pkt_len; /* Current recvmsg packet len */ + /* Sendmsg data tracking. */ + rxrpc_seq_t send_top; /* Highest Tx slot filled by sendmsg. */ + struct rxrpc_txqueue *send_queue; /* Queue that sendmsg is writing into */ + /* Transmitted data tracking. */ - spinlock_t tx_lock; /* Transmit queue lock */ - struct list_head tx_sendmsg; /* Sendmsg prepared packets */ - struct list_head tx_buffer; /* Buffer of transmissible packets */ + struct rxrpc_txqueue *tx_queue; /* Start of transmission buffers */ + struct rxrpc_txqueue *tx_qtail; /* End of transmission buffers */ + rxrpc_seq_t tx_qbase; /* First slot in tx_queue */ rxrpc_seq_t tx_bottom; /* First packet in buffer */ rxrpc_seq_t tx_transmitted; /* Highest packet transmitted */ - rxrpc_seq_t tx_prepared; /* Highest Tx slot prepared. */ rxrpc_seq_t tx_top; /* Highest Tx slot allocated. */ + rxrpc_serial_t tx_last_serial; /* Serial of last DATA transmitted */ u16 tx_backoff; /* Delay to insert due to Tx failure (ms) */ - u8 tx_winsize; /* Maximum size of Tx window */ + u16 tx_nr_sent; /* Number of packets sent, but unacked */ + u16 tx_nr_lost; /* Number of packets marked lost */ + u16 tx_nr_resent; /* Number of packets resent, but unacked */ + u16 tx_winsize; /* Maximum size of Tx window */ #define RXRPC_TX_MAX_WINDOW 128 + u8 tx_jumbo_max; /* Maximum subpkts peer will accept */ ktime_t tx_last_sent; /* Last time a transmission occurred */ /* Received data tracking */ struct sk_buff_head recvmsg_queue; /* Queue of packets ready for recvmsg() */ + struct sk_buff_head rx_queue; /* Queue of packets for this call to receive */ struct sk_buff_head rx_oos_queue; /* Queue of out of sequence packets */ rxrpc_seq_t rx_highest_seq; /* Higest sequence number received */ @@ -697,15 +784,33 @@ struct rxrpc_call { * packets) rather than bytes. */ #define RXRPC_TX_SMSS RXRPC_JUMBO_DATALEN -#define RXRPC_MIN_CWND (RXRPC_TX_SMSS > 2190 ? 2 : RXRPC_TX_SMSS > 1095 ? 3 : 4) - u8 cong_cwnd; /* Congestion window size */ +#define RXRPC_MIN_CWND 4 + enum rxrpc_ca_state cong_ca_state; /* Congestion control state */ u8 cong_extra; /* Extra to send for congestion management */ - u8 cong_ssthresh; /* Slow-start threshold */ - enum rxrpc_congest_mode cong_mode:8; /* Congestion management mode */ - u8 cong_dup_acks; /* Count of ACKs showing missing packets */ - u8 cong_cumul_acks; /* Cumulative ACK count */ + u16 cong_cwnd; /* Congestion window size */ + u16 cong_ssthresh; /* Slow-start threshold */ + u16 cong_dup_acks; /* Count of ACKs showing missing packets */ + u16 cong_cumul_acks; /* Cumulative ACK count */ ktime_t cong_tstamp; /* Last time cwnd was changed */ - struct sk_buff *cong_last_nack; /* Last ACK with nacks received */ + + /* RACK-TLP [RFC8985] state. */ + ktime_t rack_xmit_ts; /* Latest transmission timestamp */ + ktime_t rack_rtt; /* RTT of most recently ACK'd segment */ + ktime_t rack_rtt_ts; /* Timestamp of rack_rtt */ + ktime_t rack_reo_wnd; /* Reordering window */ + unsigned int rack_reo_wnd_mult; /* Multiplier applied to rack_reo_wnd */ + int rack_reo_wnd_persist; /* Num loss recoveries before reset reo_wnd */ + rxrpc_seq_t rack_fack; /* Highest sequence so far ACK'd */ + rxrpc_seq_t rack_end_seq; /* Highest sequence seen */ + rxrpc_seq_t rack_dsack_round; /* DSACK opt recv'd in latest roundtrip */ + bool rack_dsack_round_none; /* T if dsack_round is "None" */ + bool rack_reordering_seen; /* T if detected reordering event */ + enum rxrpc_rack_timer_mode rack_timer_mode; /* Current mode of RACK timer */ + bool tlp_is_retrans; /* T if unacked TLP retransmission */ + rxrpc_serial_t tlp_serial; /* Serial of TLP probe (or 0 if none in progress) */ + rxrpc_seq_t tlp_seq; /* Sequence of TLP probe */ + unsigned int tlp_rtt_taken; /* Last time RTT taken */ + ktime_t tlp_max_ack_delay; /* Sender budget for max delayed ACK interval */ /* Receive-phase ACK management (ACKs we send). */ u8 ackr_reason; /* reason to ACK */ @@ -730,32 +835,45 @@ struct rxrpc_call { /* Transmission-phase ACK management (ACKs we've received). */ ktime_t acks_latest_ts; /* Timestamp of latest ACK received */ - rxrpc_seq_t acks_first_seq; /* first sequence number received */ + rxrpc_seq_t acks_hard_ack; /* Highest sequence hard acked */ rxrpc_seq_t acks_prev_seq; /* Highest previousPacket received */ - rxrpc_seq_t acks_hard_ack; /* Latest hard-ack point */ rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */ rxrpc_serial_t acks_highest_serial; /* Highest serial number ACK'd */ + unsigned short acks_nr_sacks; /* Number of soft acks recorded */ + unsigned short acks_nr_snacks; /* Number of soft nacks recorded */ + + /* Calculated RTT cache */ + ktime_t rtt_last_req; /* Time of last RTT request */ + unsigned int rtt_count; /* Number of samples we've got */ + unsigned int rtt_taken; /* Number of samples taken (wrapping) */ + struct minmax min_rtt; /* Estimated minimum RTT */ + u32 srtt_us; /* smoothed round trip time << 3 in usecs */ + u32 mdev_us; /* medium deviation */ + u32 mdev_max_us; /* maximal mdev for the last rtt period */ + u32 rttvar_us; /* smoothed mdev_max */ + u32 rto_us; /* Retransmission timeout in usec */ + u8 backoff; /* Backoff timeout (as shift) */ }; /* * Summary of a new ACK and the changes it made to the Tx buffer packet states. */ struct rxrpc_ack_summary { - u16 nr_acks; /* Number of ACKs in packet */ - u16 nr_new_acks; /* Number of new ACKs in packet */ - u16 nr_new_nacks; /* Number of new nacks in packet */ - u16 nr_retained_nacks; /* Number of nacks retained between ACKs */ - u8 ack_reason; - bool saw_nacks; /* Saw NACKs in packet */ - bool new_low_nack; /* T if new low NACK found */ - bool retrans_timeo; /* T if reTx due to timeout happened */ - u8 flight_size; /* Number of unreceived transmissions */ - /* Place to stash values for tracing */ - enum rxrpc_congest_mode mode:8; - u8 cwnd; - u8 ssthresh; - u8 dup_acks; - u8 cumulative_acks; + rxrpc_serial_t ack_serial; /* Serial number of ACK */ + rxrpc_serial_t acked_serial; /* Serial number ACK'd */ + u16 in_flight; /* Number of unreceived transmissions */ + u16 nr_new_hacks; /* Number of rotated new ACKs */ + u16 nr_new_sacks; /* Number of new soft ACKs in packet */ + u16 nr_new_snacks; /* Number of new soft nacks in packet */ + u8 ack_reason; + bool new_low_snack:1; /* T if new low soft NACK found */ + bool retrans_timeo:1; /* T if reTx due to timeout happened */ + bool need_retransmit:1; /* T if we need transmission */ + bool rtt_sample_avail:1; /* T if RTT sample available */ + bool in_fast_or_rto_recovery:1; + bool exiting_fast_or_rto_recovery:1; + bool tlp_probe_acked:1; /* T if the TLP probe seq was acked */ + u8 /*enum rxrpc_congest_change*/ change; }; /* @@ -793,25 +911,24 @@ struct rxrpc_send_params { * Buffer of data to be output as a packet. */ struct rxrpc_txbuf { - struct list_head call_link; /* Link in call->tx_sendmsg/tx_buffer */ - struct list_head tx_link; /* Link in live Enc queue or Tx queue */ - ktime_t last_sent; /* Time at which last transmitted */ refcount_t ref; rxrpc_seq_t seq; /* Sequence number of this packet */ rxrpc_serial_t serial; /* Last serial number transmitted with */ unsigned int call_debug_id; unsigned int debug_id; - unsigned int len; /* Amount of data in buffer */ - unsigned int space; /* Remaining data space */ - unsigned int offset; /* Offset of fill point */ + unsigned short len; /* Amount of data in buffer */ + unsigned short space; /* Remaining data space */ + unsigned short offset; /* Offset of fill point */ + unsigned short crypto_header; /* Size of crypto header */ + unsigned short sec_header; /* Size of security header */ + unsigned short pkt_len; /* Size of packet content */ + unsigned short alloc_size; /* Amount of bufferage allocated */ unsigned int flags; #define RXRPC_TXBUF_WIRE_FLAGS 0xff /* The wire protocol flags */ #define RXRPC_TXBUF_RESENT 0x100 /* Set if has been resent */ __be16 cksum; /* Checksum to go in header */ - unsigned short ack_rwind; /* ACK receive window */ - u8 /*enum rxrpc_propose_ack_trace*/ ack_why; /* If ack, why */ - u8 nr_kvec; /* Amount of kvec[] used */ - struct kvec kvec[3]; + bool jumboable; /* Can be non-terminal jumbo subpacket */ + void *data; /* Data with preceding jumbo header */ }; static inline bool rxrpc_sending_to_server(const struct rxrpc_txbuf *txb) @@ -824,6 +941,46 @@ static inline bool rxrpc_sending_to_client(const struct rxrpc_txbuf *txb) return !rxrpc_sending_to_server(txb); } +/* + * Transmit queue element, including RACK [RFC8985] per-segment metadata. The + * transmission timestamp is in usec from the base. + */ +struct rxrpc_txqueue { + /* Start with the members we want to prefetch. */ + struct rxrpc_txqueue *next; + ktime_t xmit_ts_base; + rxrpc_seq_t qbase; + u8 nr_reported_acks; /* Number of segments explicitly acked/nacked */ + unsigned long segment_acked; /* Bit-per-buf: Set if ACK'd */ + unsigned long segment_lost; /* Bit-per-buf: Set if declared lost */ + unsigned long segment_retransmitted; /* Bit-per-buf: Set if retransmitted */ + unsigned long rtt_samples; /* Bit-per-buf: Set if available for RTT */ + unsigned long ever_retransmitted; /* Bit-per-buf: Set if ever retransmitted */ + + /* The arrays we want to pack into as few cache lines as possible. */ + struct { +#define RXRPC_NR_TXQUEUE BITS_PER_LONG +#define RXRPC_TXQ_MASK (RXRPC_NR_TXQUEUE - 1) + struct rxrpc_txbuf *bufs[RXRPC_NR_TXQUEUE]; + unsigned int segment_serial[RXRPC_NR_TXQUEUE]; + unsigned int segment_xmit_ts[RXRPC_NR_TXQUEUE]; + } ____cacheline_aligned; +}; + +/* + * Data transmission request. + */ +struct rxrpc_send_data_req { + ktime_t now; /* Current time */ + struct rxrpc_txqueue *tq; /* Tx queue segment holding first DATA */ + rxrpc_seq_t seq; /* Sequence of first data */ + int n; /* Number of DATA packets to glue into jumbo */ + bool retrans; /* T if this is a retransmission */ + bool did_send; /* T if did actually send */ + bool tlp_probe; /* T if this is a TLP probe */ + int /* enum rxrpc_txdata_trace */ trace; +}; + #include <trace/events/rxrpc.h> /* @@ -841,6 +998,21 @@ static inline rxrpc_serial_t rxrpc_get_next_serial(struct rxrpc_connection *conn } /* + * Allocate the next serial n numbers on a connection. 0 must be skipped. + */ +static inline rxrpc_serial_t rxrpc_get_next_serials(struct rxrpc_connection *conn, + unsigned int n) +{ + rxrpc_serial_t serial; + + serial = conn->tx_serial; + if (serial + n <= n) + serial = 1; + conn->tx_serial = serial + n; + return serial; +} + +/* * af_rxrpc.c */ extern atomic_t rxrpc_n_rx_skbs; @@ -856,7 +1028,6 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local, struct rxrpc_connection *conn, struct sockaddr_rxrpc *peer_srx, struct sk_buff *skb); -void rxrpc_accept_incoming_calls(struct rxrpc_local *); int rxrpc_user_charge_accept(struct rxrpc_sock *, unsigned long); /* @@ -866,10 +1037,10 @@ void rxrpc_propose_ping(struct rxrpc_call *call, u32 serial, enum rxrpc_propose_ack_trace why); void rxrpc_propose_delay_ACK(struct rxrpc_call *, rxrpc_serial_t, enum rxrpc_propose_ack_trace); -void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *); -void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb); - -bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb); +void rxrpc_resend_tlp(struct rxrpc_call *call); +void rxrpc_transmit_some_data(struct rxrpc_call *call, unsigned int limit, + enum rxrpc_txdata_trace trace); +bool rxrpc_input_call_event(struct rxrpc_call *call); /* * call_object.c @@ -884,7 +1055,9 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *, gfp_t, unsigned int); struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, struct rxrpc_conn_parameters *, struct rxrpc_call_params *, gfp_t, - unsigned int); + unsigned int) + __releases(&rx->sk.sk_lock) + __acquires(&call->user_mutex); void rxrpc_start_call_timer(struct rxrpc_call *call); void rxrpc_incoming_call(struct rxrpc_sock *, struct rxrpc_call *, struct sk_buff *); @@ -969,7 +1142,6 @@ void rxrpc_connect_client_calls(struct rxrpc_local *local); void rxrpc_expose_client_call(struct rxrpc_call *); void rxrpc_disconnect_client_call(struct rxrpc_bundle *, struct rxrpc_call *); void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle); -void rxrpc_put_client_conn(struct rxrpc_connection *, enum rxrpc_conn_trace); void rxrpc_discard_expired_client_conns(struct rxrpc_local *local); void rxrpc_clean_up_local_conns(struct rxrpc_local *); @@ -1049,6 +1221,32 @@ void rxrpc_input_call_packet(struct rxrpc_call *, struct sk_buff *); void rxrpc_implicit_end_call(struct rxrpc_call *, struct sk_buff *); /* + * input_rack.c + */ +void rxrpc_input_rack_one(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + unsigned int ix); +void rxrpc_input_rack(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + unsigned long new_acks); +void rxrpc_rack_detect_loss_and_arm_timer(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary); +ktime_t rxrpc_tlp_calc_pto(struct rxrpc_call *call, ktime_t now); +void rxrpc_tlp_send_probe(struct rxrpc_call *call); +void rxrpc_tlp_process_ack(struct rxrpc_call *call, struct rxrpc_ack_summary *summary); +void rxrpc_rack_timer_expired(struct rxrpc_call *call, ktime_t overran_by); + +/* Initialise TLP state [RFC8958 7.1]. */ +static inline void rxrpc_tlp_init(struct rxrpc_call *call) +{ + call->tlp_serial = 0; + call->tlp_seq = call->acks_hard_ack; + call->tlp_is_retrans = false; +} + +/* * io_thread.c */ int rxrpc_encap_rcv(struct sock *, struct sk_buff *); @@ -1056,9 +1254,12 @@ void rxrpc_error_report(struct sock *); bool rxrpc_direct_abort(struct sk_buff *skb, enum rxrpc_abort_reason why, s32 abort_code, int err); int rxrpc_io_thread(void *data); +void rxrpc_post_response(struct rxrpc_connection *conn, struct sk_buff *skb); static inline void rxrpc_wake_up_io_thread(struct rxrpc_local *local) { - wake_up_process(local->io_thread); + if (!local->io_thread) + return; + wake_up_process(READ_ONCE(local->io_thread)); } static inline bool rxrpc_protocol_error(struct sk_buff *skb, enum rxrpc_abort_reason why) @@ -1147,21 +1348,33 @@ static inline struct rxrpc_net *rxrpc_net(struct net *net) } /* + * out_of_band.c + */ +void rxrpc_notify_socket_oob(struct rxrpc_call *call, struct sk_buff *skb); +void rxrpc_add_pending_oob(struct rxrpc_sock *rx, struct sk_buff *skb); +int rxrpc_sendmsg_oob(struct rxrpc_sock *rx, struct msghdr *msg, size_t len); + +/* * output.c */ +ssize_t do_udp_sendmsg(struct socket *socket, struct msghdr *msg, size_t len); void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why); +void rxrpc_send_probe_for_pmtud(struct rxrpc_call *call); int rxrpc_send_abort_packet(struct rxrpc_call *); +void rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req *req); void rxrpc_send_conn_abort(struct rxrpc_connection *conn); void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb); void rxrpc_send_keepalive(struct rxrpc_peer *); -void rxrpc_transmit_one(struct rxrpc_call *call, struct rxrpc_txbuf *txb); +void rxrpc_send_response(struct rxrpc_connection *conn, struct sk_buff *skb); /* * peer_event.c */ void rxrpc_input_error(struct rxrpc_local *, struct sk_buff *); void rxrpc_peer_keepalive_worker(struct work_struct *); +void rxrpc_input_probe_for_pmtud(struct rxrpc_connection *conn, rxrpc_serial_t acked_serial, + bool sendmsg_fail); /* * peer_object.c @@ -1210,10 +1423,17 @@ static inline int rxrpc_abort_eproto(struct rxrpc_call *call, /* * rtt.c */ -void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace, int, - rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t); -ktime_t rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans); -void rxrpc_peer_init_rtt(struct rxrpc_peer *); +void rxrpc_call_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, + int rtt_slot, + rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, + ktime_t send_time, ktime_t resp_time); +ktime_t rxrpc_get_rto_backoff(struct rxrpc_call *call, bool retrans); +void rxrpc_call_init_rtt(struct rxrpc_call *call); + +/* + * rxgk.c + */ +extern const struct rxrpc_security rxgk_yfs; /* * rxkad.c @@ -1286,8 +1506,6 @@ static inline void rxrpc_sysctl_exit(void) {} extern atomic_t rxrpc_nr_txbuf; struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_size, size_t data_align, gfp_t gfp); -struct rxrpc_txbuf *rxrpc_alloc_ack_txbuf(struct rxrpc_call *call, size_t sack_size); -void rxrpc_get_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what); void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what); void rxrpc_put_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what); @@ -1313,6 +1531,53 @@ static inline bool after_eq(u32 seq1, u32 seq2) return (s32)(seq1 - seq2) >= 0; } +static inline u32 earliest(u32 seq1, u32 seq2) +{ + return before(seq1, seq2) ? seq1 : seq2; +} + +static inline u32 latest(u32 seq1, u32 seq2) +{ + return after(seq1, seq2) ? seq1 : seq2; +} + +static inline bool rxrpc_seq_in_txq(const struct rxrpc_txqueue *tq, rxrpc_seq_t seq) +{ + return (seq & (RXRPC_NR_TXQUEUE - 1)) == tq->qbase; +} + +static inline void rxrpc_queue_rx_call_packet(struct rxrpc_call *call, struct sk_buff *skb) +{ + rxrpc_get_skb(skb, rxrpc_skb_get_call_rx); + __skb_queue_tail(&call->rx_queue, skb); + rxrpc_poke_call(call, rxrpc_call_poke_rx_packet); +} + +/* + * Calculate how much space there is for transmitting more DATA packets. + */ +static inline unsigned int rxrpc_tx_window_space(const struct rxrpc_call *call) +{ + int winsize = umin(call->tx_winsize, call->cong_cwnd + call->cong_extra); + int transmitted = call->tx_top - call->tx_bottom; + + return max(winsize - transmitted, 0); +} + +static inline unsigned int rxrpc_left_out(const struct rxrpc_call *call) +{ + return call->acks_nr_sacks + call->tx_nr_lost; +} + +/* + * Calculate the number of transmitted DATA packets assumed to be in flight + * [approx RFC6675]. + */ +static inline unsigned int rxrpc_tx_in_flight(const struct rxrpc_call *call) +{ + return call->tx_nr_sent - rxrpc_left_out(call) + call->tx_nr_resent; +} + /* * debug tracing */ diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 0f5a1d77b890..49fccee1a726 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -34,7 +34,6 @@ static void rxrpc_dummy_notify(struct sock *sk, struct rxrpc_call *call, static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, struct rxrpc_backlog *b, rxrpc_notify_rx_t notify_rx, - rxrpc_user_attach_call_t user_attach_call, unsigned long user_call_ID, gfp_t gfp, unsigned int debug_id) { @@ -123,9 +122,10 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, call->user_call_ID = user_call_ID; call->notify_rx = notify_rx; - if (user_attach_call) { + if (rx->app_ops && + rx->app_ops->user_attach_call) { rxrpc_get_call(call, rxrpc_call_get_kernel_service); - user_attach_call(call, user_call_ID); + rx->app_ops->user_attach_call(call, user_call_ID); } rxrpc_get_call(call, rxrpc_call_get_userid); @@ -149,6 +149,7 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, id_in_use: write_unlock(&rx->call_lock); + rxrpc_prefail_call(call, RXRPC_CALL_LOCAL_ERROR, -EBADSLT); rxrpc_cleanup_call(call); _leave(" = -EBADSLT"); return -EBADSLT; @@ -188,8 +189,8 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) /* Make sure that there aren't any incoming calls in progress before we * clear the preallocation buffers. */ - spin_lock(&rx->incoming_lock); - spin_unlock(&rx->incoming_lock); + spin_lock_irq(&rx->incoming_lock); + spin_unlock_irq(&rx->incoming_lock); head = b->peer_backlog_head; tail = b->peer_backlog_tail; @@ -219,9 +220,10 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) while (CIRC_CNT(head, tail, size) > 0) { struct rxrpc_call *call = b->call_backlog[tail]; rcu_assign_pointer(call->socket, rx); - if (rx->discard_new_call) { + if (rx->app_ops && + rx->app_ops->discard_new_call) { _debug("discard %lx", call->user_call_ID); - rx->discard_new_call(call, call->user_call_ID); + rx->app_ops->discard_new_call(call, call->user_call_ID); if (call->notify_rx) call->notify_rx = rxrpc_dummy_notify; rxrpc_put_call(call, rxrpc_call_put_kernel); @@ -253,6 +255,9 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, unsigned short call_tail, conn_tail, peer_tail; unsigned short call_count, conn_count; + if (!b) + return NULL; + /* #calls >= #conns >= #peers must hold true. */ call_head = smp_load_acquire(&b->call_backlog_head); call_tail = b->call_backlog_tail; @@ -343,7 +348,7 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local, if (sp->hdr.type != RXRPC_PACKET_TYPE_DATA) return rxrpc_protocol_error(skb, rxrpc_eproto_no_service_call); - read_lock(&local->services_lock); + read_lock_irq(&local->services_lock); /* Weed out packets to services we're not offering. Packets that would * begin a call are explicitly rejected and the rest are just @@ -387,8 +392,9 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local, rxrpc_incoming_call(rx, call, skb); conn = call->conn; - if (rx->notify_new_call) - rx->notify_new_call(&rx->sk, call, call->user_call_ID); + if (rx->app_ops && + rx->app_ops->notify_new_call) + rx->app_ops->notify_new_call(&rx->sk, call, call->user_call_ID); spin_lock(&conn->state_lock); if (conn->state == RXRPC_CONN_SERVICE_UNSECURED) { @@ -399,34 +405,34 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local, spin_unlock(&conn->state_lock); spin_unlock(&rx->incoming_lock); - read_unlock(&local->services_lock); + read_unlock_irq(&local->services_lock); if (hlist_unhashed(&call->error_link)) { - spin_lock(&call->peer->lock); + spin_lock_irq(&call->peer->lock); hlist_add_head(&call->error_link, &call->peer->error_targets); - spin_unlock(&call->peer->lock); + spin_unlock_irq(&call->peer->lock); } _leave(" = %p{%d}", call, call->debug_id); - rxrpc_input_call_event(call, skb); + rxrpc_queue_rx_call_packet(call, skb); rxrpc_put_call(call, rxrpc_call_put_input); return true; unsupported_service: - read_unlock(&local->services_lock); + read_unlock_irq(&local->services_lock); return rxrpc_direct_abort(skb, rxrpc_abort_service_not_offered, RX_INVALID_OPERATION, -EOPNOTSUPP); unsupported_security: - read_unlock(&local->services_lock); + read_unlock_irq(&local->services_lock); return rxrpc_direct_abort(skb, rxrpc_abort_service_not_offered, RX_INVALID_OPERATION, -EKEYREJECTED); no_call: spin_unlock(&rx->incoming_lock); - read_unlock(&local->services_lock); + read_unlock_irq(&local->services_lock); _leave(" = f [%u]", skb->mark); return false; discard: - read_unlock(&local->services_lock); + read_unlock_irq(&local->services_lock); return true; } @@ -440,8 +446,7 @@ int rxrpc_user_charge_accept(struct rxrpc_sock *rx, unsigned long user_call_ID) if (rx->sk.sk_state == RXRPC_CLOSE) return -ESHUTDOWN; - return rxrpc_service_prealloc_one(rx, b, NULL, NULL, user_call_ID, - GFP_KERNEL, + return rxrpc_service_prealloc_one(rx, b, NULL, user_call_ID, GFP_KERNEL, atomic_inc_return(&rxrpc_debug_id)); } @@ -449,20 +454,18 @@ int rxrpc_user_charge_accept(struct rxrpc_sock *rx, unsigned long user_call_ID) * rxrpc_kernel_charge_accept - Charge up socket with preallocated calls * @sock: The socket on which to preallocate * @notify_rx: Event notification function for the call - * @user_attach_call: Func to attach call to user_call_ID * @user_call_ID: The tag to attach to the preallocated call * @gfp: The allocation conditions. * @debug_id: The tracing debug ID. * - * Charge up the socket with preallocated calls, each with a user ID. A - * function should be provided to effect the attachment from the user's side. - * The user is given a ref to hold on the call. + * Charge up the socket with preallocated calls, each with a user ID. The + * ->user_attach_call() callback function should be provided to effect the + * attachment from the user's side. The user is given a ref to hold on the + * call. * * Note that the call may be come connected before this function returns. */ -int rxrpc_kernel_charge_accept(struct socket *sock, - rxrpc_notify_rx_t notify_rx, - rxrpc_user_attach_call_t user_attach_call, +int rxrpc_kernel_charge_accept(struct socket *sock, rxrpc_notify_rx_t notify_rx, unsigned long user_call_ID, gfp_t gfp, unsigned int debug_id) { @@ -472,8 +475,7 @@ int rxrpc_kernel_charge_accept(struct socket *sock, if (sock->sk->sk_state == RXRPC_CLOSE) return -ESHUTDOWN; - return rxrpc_service_prealloc_one(rx, b, notify_rx, - user_attach_call, user_call_ID, + return rxrpc_service_prealloc_one(rx, b, notify_rx, user_call_ID, gfp, debug_id); } EXPORT_SYMBOL(rxrpc_kernel_charge_accept); diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 7bbb68504766..fec59d9338b9 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -44,8 +44,8 @@ void rxrpc_propose_delay_ACK(struct rxrpc_call *call, rxrpc_serial_t serial, trace_rxrpc_propose_ack(call, why, RXRPC_ACK_DELAY, serial); - if (call->peer->srtt_us) - delay = (call->peer->srtt_us >> 3) * NSEC_PER_USEC; + if (call->srtt_us) + delay = (call->srtt_us >> 3) * NSEC_PER_USEC; else delay = ms_to_ktime(READ_ONCE(rxrpc_soft_ack_delay)); ktime_add_ms(delay, call->tx_backoff); @@ -55,147 +55,104 @@ void rxrpc_propose_delay_ACK(struct rxrpc_call *call, rxrpc_serial_t serial, } /* - * Handle congestion being detected by the retransmit timeout. + * Retransmit one or more packets. */ -static void rxrpc_congestion_timeout(struct rxrpc_call *call) +static bool rxrpc_retransmit_data(struct rxrpc_call *call, + struct rxrpc_send_data_req *req) { - set_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags); + struct rxrpc_txqueue *tq = req->tq; + unsigned int ix = req->seq & RXRPC_TXQ_MASK; + struct rxrpc_txbuf *txb = tq->bufs[ix]; + + _enter("%x,%x,%x,%x", tq->qbase, req->seq, ix, txb->debug_id); + + req->retrans = true; + trace_rxrpc_retransmit(call, req, txb); + + txb->flags |= RXRPC_TXBUF_RESENT; + rxrpc_send_data_packet(call, req); + rxrpc_inc_stat(call->rxnet, stat_tx_data_retrans); + + req->tq = NULL; + req->n = 0; + req->did_send = true; + req->now = ktime_get_real(); + return true; } /* * Perform retransmission of NAK'd and unack'd packets. */ -void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb) +static void rxrpc_resend(struct rxrpc_call *call) { - struct rxrpc_ackpacket *ack = NULL; - struct rxrpc_skb_priv *sp; - struct rxrpc_txbuf *txb; - rxrpc_seq_t transmitted = call->tx_transmitted; - ktime_t next_resend = KTIME_MAX, rto = ns_to_ktime(call->peer->rto_us * NSEC_PER_USEC); - ktime_t resend_at = KTIME_MAX, now, delay; - bool unacked = false, did_send = false; - unsigned int i; - - _enter("{%d,%d}", call->acks_hard_ack, call->tx_top); - - now = ktime_get_real(); - - if (list_empty(&call->tx_buffer)) - goto no_resend; + struct rxrpc_send_data_req req = { + .now = ktime_get_real(), + .trace = rxrpc_txdata_retransmit, + }; + struct rxrpc_txqueue *tq; - trace_rxrpc_resend(call, ack_skb); - txb = list_first_entry(&call->tx_buffer, struct rxrpc_txbuf, call_link); + _enter("{%d,%d}", call->tx_bottom, call->tx_top); - /* Scan the soft ACK table without dropping the lock and resend any - * explicitly NAK'd packets. - */ - if (ack_skb) { - sp = rxrpc_skb(ack_skb); - ack = (void *)ack_skb->data + sizeof(struct rxrpc_wire_header); + trace_rxrpc_resend(call, call->acks_highest_serial); - for (i = 0; i < sp->ack.nr_acks; i++) { - rxrpc_seq_t seq; + /* Scan the transmission queue, looking for lost packets. */ + for (tq = call->tx_queue; tq; tq = tq->next) { + unsigned long lost = tq->segment_lost; - if (ack->acks[i] & 1) - continue; - seq = sp->ack.first_ack + i; - if (after(txb->seq, transmitted)) - break; - if (after(txb->seq, seq)) - continue; /* A new hard ACK probably came in */ - list_for_each_entry_from(txb, &call->tx_buffer, call_link) { - if (txb->seq == seq) - goto found_txb; - } - goto no_further_resend; - - found_txb: - resend_at = ktime_add(txb->last_sent, rto); - if (after(txb->serial, call->acks_highest_serial)) { - if (ktime_after(resend_at, now) && - ktime_before(resend_at, next_resend)) - next_resend = resend_at; - continue; /* Ack point not yet reached */ - } + if (after(tq->qbase, call->tx_transmitted)) + break; - rxrpc_see_txbuf(txb, rxrpc_txbuf_see_unacked); + _debug("retr %16lx %u c=%08x [%x]", + tq->segment_acked, tq->nr_reported_acks, call->debug_id, tq->qbase); + _debug("lost %16lx", lost); - trace_rxrpc_retransmit(call, txb->seq, txb->serial, - ktime_sub(resend_at, now)); + trace_rxrpc_resend_lost(call, tq, lost); + while (lost) { + unsigned int ix = __ffs(lost); + struct rxrpc_txbuf *txb = tq->bufs[ix]; - txb->flags |= RXRPC_TXBUF_RESENT; - rxrpc_transmit_one(call, txb); - did_send = true; - now = ktime_get_real(); + __clear_bit(ix, &lost); + rxrpc_see_txbuf(txb, rxrpc_txbuf_see_lost); - if (list_is_last(&txb->call_link, &call->tx_buffer)) - goto no_further_resend; - txb = list_next_entry(txb, call_link); + req.tq = tq; + req.seq = tq->qbase + ix; + req.n = 1; + rxrpc_retransmit_data(call, &req); } } - /* Fast-forward through the Tx queue to the point the peer says it has - * seen. Anything between the soft-ACK table and that point will get - * ACK'd or NACK'd in due course, so don't worry about it here; here we - * need to consider retransmitting anything beyond that point. - */ - if (after_eq(call->acks_prev_seq, call->tx_transmitted)) - goto no_further_resend; - - list_for_each_entry_from(txb, &call->tx_buffer, call_link) { - resend_at = ktime_add(txb->last_sent, rto); - - if (before_eq(txb->seq, call->acks_prev_seq)) - continue; - if (after(txb->seq, call->tx_transmitted)) - break; /* Not transmitted yet */ - - if (ack && ack->reason == RXRPC_ACK_PING_RESPONSE && - before(txb->serial, ntohl(ack->serial))) - goto do_resend; /* Wasn't accounted for by a more recent ping. */ - - if (ktime_after(resend_at, now)) { - if (ktime_before(resend_at, next_resend)) - next_resend = resend_at; - continue; - } - - do_resend: - unacked = true; - - txb->flags |= RXRPC_TXBUF_RESENT; - rxrpc_transmit_one(call, txb); - did_send = true; - rxrpc_inc_stat(call->rxnet, stat_tx_data_retrans); - now = ktime_get_real(); - } + rxrpc_get_rto_backoff(call, req.did_send); + _leave(""); +} -no_further_resend: -no_resend: - if (resend_at < KTIME_MAX) { - delay = rxrpc_get_rto_backoff(call->peer, did_send); - resend_at = ktime_add(resend_at, delay); - trace_rxrpc_timer_set(call, resend_at - now, rxrpc_timer_trace_resend_reset); +/* + * Resend the highest-seq DATA packet so far transmitted for RACK-TLP [RFC8985 7.3]. + */ +void rxrpc_resend_tlp(struct rxrpc_call *call) +{ + struct rxrpc_send_data_req req = { + .now = ktime_get_real(), + .seq = call->tx_transmitted, + .n = 1, + .tlp_probe = true, + .trace = rxrpc_txdata_tlp_retransmit, + }; + + /* There's a chance it'll be on the tail segment of the queue. */ + req.tq = READ_ONCE(call->tx_qtail); + if (req.tq && + before(call->tx_transmitted, req.tq->qbase + RXRPC_NR_TXQUEUE)) { + rxrpc_retransmit_data(call, &req); + return; } - call->resend_at = resend_at; - - if (unacked) - rxrpc_congestion_timeout(call); - - /* If there was nothing that needed retransmission then it's likely - * that an ACK got lost somewhere. Send a ping to find out instead of - * retransmitting data. - */ - if (!did_send) { - ktime_t next_ping = ktime_add_us(call->acks_latest_ts, - call->peer->srtt_us >> 3); - if (ktime_sub(next_ping, now) <= 0) - rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, - rxrpc_propose_ack_ping_for_0_retrans); + for (req.tq = call->tx_queue; req.tq; req.tq = req.tq->next) { + if (after_eq(call->tx_transmitted, req.tq->qbase) && + before(call->tx_transmitted, req.tq->qbase + RXRPC_NR_TXQUEUE)) { + rxrpc_retransmit_data(call, &req); + return; + } } - - _leave(""); } /* @@ -231,68 +188,93 @@ static void rxrpc_close_tx_phase(struct rxrpc_call *call) } } -static bool rxrpc_tx_window_has_space(struct rxrpc_call *call) -{ - unsigned int winsize = min_t(unsigned int, call->tx_winsize, - call->cong_cwnd + call->cong_extra); - rxrpc_seq_t window = call->acks_hard_ack, wtop = window + winsize; - rxrpc_seq_t tx_top = call->tx_top; - int space; - - space = wtop - tx_top; - return space > 0; -} - /* - * Decant some if the sendmsg prepared queue into the transmission buffer. + * Transmit some as-yet untransmitted data, to a maximum of the supplied limit. */ -static void rxrpc_decant_prepared_tx(struct rxrpc_call *call) +static void rxrpc_transmit_fresh_data(struct rxrpc_call *call, unsigned int limit, + enum rxrpc_txdata_trace trace) { - struct rxrpc_txbuf *txb; + int space = rxrpc_tx_window_space(call); if (!test_bit(RXRPC_CALL_EXPOSED, &call->flags)) { - if (list_empty(&call->tx_sendmsg)) + if (call->send_top == call->tx_top) return; rxrpc_expose_client_call(call); } - while ((txb = list_first_entry_or_null(&call->tx_sendmsg, - struct rxrpc_txbuf, call_link))) { - spin_lock(&call->tx_lock); - list_del(&txb->call_link); - spin_unlock(&call->tx_lock); + while (space > 0) { + struct rxrpc_send_data_req req = { + .now = ktime_get_real(), + .seq = call->tx_transmitted + 1, + .n = 0, + .trace = trace, + }; + struct rxrpc_txqueue *tq; + struct rxrpc_txbuf *txb; + rxrpc_seq_t send_top, seq; + int limit = min(space, max(call->peer->pmtud_jumbo, 1)); + + /* Order send_top before the contents of the new txbufs and + * txqueue pointers + */ + send_top = smp_load_acquire(&call->send_top); + if (call->tx_top == send_top) + break; - call->tx_top = txb->seq; - list_add_tail(&txb->call_link, &call->tx_buffer); + trace_rxrpc_transmit(call, send_top, space); - if (txb->flags & RXRPC_LAST_PACKET) - rxrpc_close_tx_phase(call); + tq = call->tx_qtail; + seq = call->tx_top; + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_decant); - rxrpc_transmit_one(call, txb); + do { + int ix; - if (!rxrpc_tx_window_has_space(call)) - break; + seq++; + ix = seq & RXRPC_TXQ_MASK; + if (!ix) { + tq = tq->next; + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_decant_advance); + } + if (!req.tq) + req.tq = tq; + txb = tq->bufs[ix]; + req.n++; + if (!txb->jumboable) + break; + } while (req.n < limit && before(seq, send_top)); + + if (txb->flags & RXRPC_LAST_PACKET) { + rxrpc_close_tx_phase(call); + tq = NULL; + } + call->tx_qtail = tq; + call->tx_top = seq; + + space -= req.n; + rxrpc_send_data_packet(call, &req); } } -static void rxrpc_transmit_some_data(struct rxrpc_call *call) +void rxrpc_transmit_some_data(struct rxrpc_call *call, unsigned int limit, + enum rxrpc_txdata_trace trace) { switch (__rxrpc_call_state(call)) { case RXRPC_CALL_SERVER_ACK_REQUEST: - if (list_empty(&call->tx_sendmsg)) + if (call->tx_bottom == READ_ONCE(call->send_top)) return; rxrpc_begin_service_reply(call); fallthrough; case RXRPC_CALL_SERVER_SEND_REPLY: case RXRPC_CALL_CLIENT_SEND_REQUEST: - if (!rxrpc_tx_window_has_space(call)) + if (!rxrpc_tx_window_space(call)) return; - if (list_empty(&call->tx_sendmsg)) { + if (call->tx_bottom == READ_ONCE(call->send_top)) { rxrpc_inc_stat(call->rxnet, stat_tx_data_underflow); return; } - rxrpc_decant_prepared_tx(call); + rxrpc_transmit_fresh_data(call, limit, trace); break; default: return; @@ -305,8 +287,8 @@ static void rxrpc_transmit_some_data(struct rxrpc_call *call) */ static void rxrpc_send_initial_ping(struct rxrpc_call *call) { - if (call->peer->rtt_count < 3 || - ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), + if (call->rtt_count < 3 || + ktime_before(ktime_add_ms(call->rtt_last_req, 1000), ktime_get_real())) rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, rxrpc_propose_ack_ping_for_params); @@ -315,10 +297,11 @@ static void rxrpc_send_initial_ping(struct rxrpc_call *call) /* * Handle retransmission and deferred ACK/abort generation. */ -bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) +bool rxrpc_input_call_event(struct rxrpc_call *call) { + struct sk_buff *skb; ktime_t now, t; - bool resend = false; + bool did_receive = false, saw_ack = false; s32 abort_code; rxrpc_see_call(call, rxrpc_call_see_input); @@ -328,9 +311,6 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) call->debug_id, rxrpc_call_states[__rxrpc_call_state(call)], call->events); - if (__rxrpc_call_is_complete(call)) - goto out; - /* Handle abort request locklessly, vs rxrpc_propose_abort(). */ abort_code = smp_load_acquire(&call->send_abort); if (abort_code) { @@ -339,11 +319,33 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) goto out; } - if (skb && skb->mark == RXRPC_SKB_MARK_ERROR) - goto out; + do { + skb = __skb_dequeue(&call->rx_queue); + if (skb) { + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + if (__rxrpc_call_is_complete(call) || + skb->mark == RXRPC_SKB_MARK_ERROR) { + rxrpc_free_skb(skb, rxrpc_skb_put_call_rx); + goto out; + } + + saw_ack |= sp->hdr.type == RXRPC_PACKET_TYPE_ACK; + + rxrpc_input_call_packet(call, skb); + rxrpc_free_skb(skb, rxrpc_skb_put_call_rx); + did_receive = true; + } - if (skb) - rxrpc_input_call_packet(call, skb); + t = ktime_sub(call->rack_timo_at, ktime_get_real()); + if (t <= 0) { + trace_rxrpc_timer_exp(call, t, + rxrpc_timer_trace_rack_off + call->rack_timer_mode); + call->rack_timo_at = KTIME_MAX; + rxrpc_rack_timer_expired(call, t); + } + + } while (!skb_queue_empty(&call->rx_queue)); /* If we see our async-event poke, check for timeout trippage. */ now = ktime_get_real(); @@ -376,13 +378,6 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_propose_ack_delayed_ack); } - t = ktime_sub(call->ack_lost_at, now); - if (t <= 0) { - trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_lost_ack); - call->ack_lost_at = KTIME_MAX; - set_bit(RXRPC_CALL_EV_ACK_LOST, &call->events); - } - t = ktime_sub(call->ping_at, now); if (t <= 0) { trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_ping); @@ -391,15 +386,6 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_propose_ack_ping_for_keepalive); } - t = ktime_sub(call->resend_at, now); - if (t <= 0) { - trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_resend); - call->resend_at = KTIME_MAX; - resend = true; - } - - rxrpc_transmit_some_data(call); - now = ktime_get_real(); t = ktime_sub(call->keepalive_at, now); if (t <= 0) { @@ -409,35 +395,40 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_propose_ack_ping_for_keepalive); } - if (skb) { - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - - if (sp->hdr.type == RXRPC_PACKET_TYPE_ACK) - rxrpc_congestion_degrade(call); - } - if (test_and_clear_bit(RXRPC_CALL_EV_INITIAL_PING, &call->events)) rxrpc_send_initial_ping(call); + rxrpc_transmit_some_data(call, UINT_MAX, rxrpc_txdata_new_data); + + if (saw_ack) + rxrpc_congestion_degrade(call); + + if (did_receive && + (__rxrpc_call_state(call) == RXRPC_CALL_CLIENT_SEND_REQUEST || + __rxrpc_call_state(call) == RXRPC_CALL_SERVER_SEND_REPLY)) { + t = ktime_sub(call->rack_timo_at, ktime_get_real()); + trace_rxrpc_rack(call, t); + } + /* Process events */ if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events)) rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, rxrpc_propose_ack_ping_for_lost_ack); - if (resend && + if (call->tx_nr_lost > 0 && __rxrpc_call_state(call) != RXRPC_CALL_CLIENT_RECV_REPLY && !test_bit(RXRPC_CALL_TX_ALL_ACKED, &call->flags)) - rxrpc_resend(call, NULL); + rxrpc_resend(call); if (test_and_clear_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags)) rxrpc_send_ACK(call, RXRPC_ACK_IDLE, 0, rxrpc_propose_ack_rx_idle); if (call->ackr_nr_unacked > 2) { - if (call->peer->rtt_count < 3) + if (call->rtt_count < 3) rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, rxrpc_propose_ack_ping_for_rtt); - else if (ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), + else if (ktime_before(ktime_add_ms(call->rtt_last_req, 1000), ktime_get_real())) rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, rxrpc_propose_ack_ping_for_old_rtt); @@ -455,8 +446,7 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) set(call->expect_req_by); set(call->expect_rx_by); set(call->delay_ack_at); - set(call->ack_lost_at); - set(call->resend_at); + set(call->rack_timo_at); set(call->keepalive_at); set(call->ping_at); @@ -467,7 +457,7 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) } else { unsigned long nowj = jiffies, delayj, nextj; - delayj = max(nsecs_to_jiffies(delay), 1); + delayj = umax(nsecs_to_jiffies(delay), 1); nextj = nowj + delayj; if (time_before(nextj, call->timer.expires) || !timer_pending(&call->timer)) { @@ -479,14 +469,17 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) out: if (__rxrpc_call_is_complete(call)) { - del_timer_sync(&call->timer); + timer_delete_sync(&call->timer); if (!test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) rxrpc_disconnect_call(call); if (call->security) call->security->free_call_crypto(call); + } else { + if (did_receive && + call->peer->ackr_adv_pmtud && + call->peer->pmtud_pending) + rxrpc_send_probe_for_pmtud(call); } - if (call->acks_hard_ack != call->tx_bottom) - rxrpc_shrink_call_tx_buffer(call); _leave(""); return true; diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 01fa71e8b1f7..15067ff7b1f2 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -22,7 +22,6 @@ const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = { [RXRPC_CALL_CLIENT_AWAIT_REPLY] = "ClAwtRpl", [RXRPC_CALL_CLIENT_RECV_REPLY] = "ClRcvRpl", [RXRPC_CALL_SERVER_PREALLOC] = "SvPrealc", - [RXRPC_CALL_SERVER_SECURING] = "SvSecure", [RXRPC_CALL_SERVER_RECV_REQUEST] = "SvRcvReq", [RXRPC_CALL_SERVER_ACK_REQUEST] = "SvAckReq", [RXRPC_CALL_SERVER_SEND_REPLY] = "SvSndRpl", @@ -49,7 +48,7 @@ void rxrpc_poke_call(struct rxrpc_call *call, enum rxrpc_call_poke_trace what) bool busy; if (!test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) { - spin_lock_bh(&local->lock); + spin_lock_irq(&local->lock); busy = !list_empty(&call->attend_link); trace_rxrpc_poke_call(call, busy, what); if (!busy && !rxrpc_try_get_call(call, rxrpc_call_get_poke)) @@ -57,7 +56,7 @@ void rxrpc_poke_call(struct rxrpc_call *call, enum rxrpc_call_poke_trace what) if (!busy) { list_add_tail(&call->attend_link, &local->call_attend_q); } - spin_unlock_bh(&local->lock); + spin_unlock_irq(&local->lock); if (!busy) rxrpc_wake_up_io_thread(local); } @@ -65,7 +64,7 @@ void rxrpc_poke_call(struct rxrpc_call *call, enum rxrpc_call_poke_trace what) static void rxrpc_call_timer_expired(struct timer_list *t) { - struct rxrpc_call *call = from_timer(call, t, timer); + struct rxrpc_call *call = timer_container_of(call, t, timer); _enter("%d", call->debug_id); @@ -146,23 +145,21 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, INIT_LIST_HEAD(&call->recvmsg_link); INIT_LIST_HEAD(&call->sock_link); INIT_LIST_HEAD(&call->attend_link); - INIT_LIST_HEAD(&call->tx_sendmsg); - INIT_LIST_HEAD(&call->tx_buffer); skb_queue_head_init(&call->recvmsg_queue); + skb_queue_head_init(&call->rx_queue); skb_queue_head_init(&call->rx_oos_queue); init_waitqueue_head(&call->waitq); spin_lock_init(&call->notify_lock); - spin_lock_init(&call->tx_lock); refcount_set(&call->ref, 1); call->debug_id = debug_id; call->tx_total_len = -1; + call->tx_jumbo_max = 1; call->next_rx_timo = 20 * HZ; call->next_req_timo = 1 * HZ; call->ackr_window = 1; call->ackr_wtop = 1; call->delay_ack_at = KTIME_MAX; - call->ack_lost_at = KTIME_MAX; - call->resend_at = KTIME_MAX; + call->rack_timo_at = KTIME_MAX; call->ping_at = KTIME_MAX; call->keepalive_at = KTIME_MAX; call->expect_rx_by = KTIME_MAX; @@ -174,14 +171,11 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, call->rx_winsize = rxrpc_rx_window_size; call->tx_winsize = 16; - if (RXRPC_TX_SMSS > 2190) - call->cong_cwnd = 2; - else if (RXRPC_TX_SMSS > 1095) - call->cong_cwnd = 3; - else - call->cong_cwnd = 4; + call->cong_cwnd = RXRPC_MIN_CWND; call->cong_ssthresh = RXRPC_TX_MAX_WINDOW; + rxrpc_call_init_rtt(call); + call->rxnet = rxnet; call->rtt_avail = RXRPC_CALL_RTT_AVAIL_MASK; atomic_inc(&rxnet->nr_calls); @@ -225,9 +219,9 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, __set_bit(RXRPC_CALL_EXCLUSIVE, &call->flags); if (p->timeouts.normal) - call->next_rx_timo = min(p->timeouts.normal, 1); + call->next_rx_timo = umin(p->timeouts.normal, 1); if (p->timeouts.idle) - call->next_req_timo = min(p->timeouts.idle, 1); + call->next_req_timo = umin(p->timeouts.idle, 1); if (p->timeouts.hard) call->hard_timo = p->timeouts.hard; @@ -307,9 +301,9 @@ static int rxrpc_connect_call(struct rxrpc_call *call, gfp_t gfp) trace_rxrpc_client(NULL, -1, rxrpc_client_queue_new_call); rxrpc_get_call(call, rxrpc_call_get_io_thread); - spin_lock(&local->client_call_lock); + spin_lock_irq(&local->client_call_lock); list_add_tail(&call->wait_link, &local->new_client_calls); - spin_unlock(&local->client_call_lock); + spin_unlock_irq(&local->client_call_lock); rxrpc_wake_up_io_thread(local); return 0; @@ -328,7 +322,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, struct rxrpc_call_params *p, gfp_t gfp, unsigned int debug_id) - __releases(&rx->sk.sk_lock.slock) + __releases(&rx->sk.sk_lock) __acquires(&call->user_mutex) { struct rxrpc_call *call, *xcall; @@ -439,7 +433,7 @@ error_attached_to_socket: /* * Set up an incoming call. call->conn points to the connection. - * This is called in BH context and isn't allowed to fail. + * This is called with interrupts disabled and isn't allowed to fail. */ void rxrpc_incoming_call(struct rxrpc_sock *rx, struct rxrpc_call *call, @@ -458,17 +452,16 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx, call->cong_tstamp = skb->tstamp; __set_bit(RXRPC_CALL_EXPOSED, &call->flags); - rxrpc_set_call_state(call, RXRPC_CALL_SERVER_SECURING); + rxrpc_set_call_state(call, RXRPC_CALL_SERVER_RECV_REQUEST); spin_lock(&conn->state_lock); switch (conn->state) { case RXRPC_CONN_SERVICE_UNSECURED: case RXRPC_CONN_SERVICE_CHALLENGING: - rxrpc_set_call_state(call, RXRPC_CALL_SERVER_SECURING); + __set_bit(RXRPC_CALL_CONN_CHALLENGING, &call->flags); break; case RXRPC_CONN_SERVICE: - rxrpc_set_call_state(call, RXRPC_CALL_SERVER_RECV_REQUEST); break; case RXRPC_CONN_ABORTED: @@ -536,11 +529,29 @@ void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace why) } /* - * Clean up the Rx skb ring. + * Clean up the transmission buffers. */ -static void rxrpc_cleanup_ring(struct rxrpc_call *call) +static void rxrpc_cleanup_tx_buffers(struct rxrpc_call *call) +{ + struct rxrpc_txqueue *tq, *next; + + for (tq = call->tx_queue; tq; tq = next) { + next = tq->next; + for (int i = 0; i < RXRPC_NR_TXQUEUE; i++) + if (tq->bufs[i]) + rxrpc_put_txbuf(tq->bufs[i], rxrpc_txbuf_put_cleaned); + trace_rxrpc_tq(call, tq, 0, rxrpc_tq_cleaned); + kfree(tq); + } +} + +/* + * Clean up the receive buffers. + */ +static void rxrpc_cleanup_rx_buffers(struct rxrpc_call *call) { rxrpc_purge_queue(&call->recvmsg_queue); + rxrpc_purge_queue(&call->rx_queue); rxrpc_purge_queue(&call->rx_oos_queue); } @@ -563,7 +574,7 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) rxrpc_put_call_slot(call); /* Make sure we don't get any more notifications */ - spin_lock(&rx->recvmsg_lock); + spin_lock_irq(&rx->recvmsg_lock); if (!list_empty(&call->recvmsg_link)) { _debug("unlinking once-pending call %p { e=%lx f=%lx }", @@ -576,7 +587,7 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) call->recvmsg_link.next = NULL; call->recvmsg_link.prev = NULL; - spin_unlock(&rx->recvmsg_lock); + spin_unlock_irq(&rx->recvmsg_lock); if (put) rxrpc_put_call(call, rxrpc_call_put_unnotify); @@ -676,23 +687,11 @@ static void rxrpc_rcu_free_call(struct rcu_head *rcu) static void rxrpc_destroy_call(struct work_struct *work) { struct rxrpc_call *call = container_of(work, struct rxrpc_call, destroyer); - struct rxrpc_txbuf *txb; - del_timer_sync(&call->timer); - - rxrpc_free_skb(call->cong_last_nack, rxrpc_skb_put_last_nack); - rxrpc_cleanup_ring(call); - while ((txb = list_first_entry_or_null(&call->tx_sendmsg, - struct rxrpc_txbuf, call_link))) { - list_del(&txb->call_link); - rxrpc_put_txbuf(txb, rxrpc_txbuf_put_cleaned); - } - while ((txb = list_first_entry_or_null(&call->tx_buffer, - struct rxrpc_txbuf, call_link))) { - list_del(&txb->call_link); - rxrpc_put_txbuf(txb, rxrpc_txbuf_put_cleaned); - } + timer_delete_sync(&call->timer); + rxrpc_cleanup_tx_buffers(call); + rxrpc_cleanup_rx_buffers(call); rxrpc_put_txbuf(call->tx_pending, rxrpc_txbuf_put_cleaned); rxrpc_put_connection(call->conn, rxrpc_conn_put_call); rxrpc_deactivate_bundle(call->bundle); @@ -712,7 +711,7 @@ void rxrpc_cleanup_call(struct rxrpc_call *call) ASSERTCMP(__rxrpc_call_state(call), ==, RXRPC_CALL_COMPLETE); ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags)); - del_timer(&call->timer); + timer_delete(&call->timer); if (rcu_read_lock_held()) /* Can't use the rxrpc workqueue as we need to cancel/flush @@ -761,3 +760,23 @@ void rxrpc_destroy_all_calls(struct rxrpc_net *rxnet) atomic_dec(&rxnet->nr_calls); wait_var_event(&rxnet->nr_calls, !atomic_read(&rxnet->nr_calls)); } + +/** + * rxrpc_kernel_query_call_security - Query call's security parameters + * @call: The call to query + * @_service_id: Where to return the service ID + * @_enctype: Where to return the "encoding type" + * + * This queries the security parameters of a call, setting *@_service_id and + * *@_enctype and returning the security class. + * + * Return: The security class protocol number. + */ +u8 rxrpc_kernel_query_call_security(struct rxrpc_call *call, + u16 *_service_id, u32 *_enctype) +{ + *_service_id = call->dest_srx.srx_service; + *_enctype = call->security_enctype; + return call->security_ix; +} +EXPORT_SYMBOL(rxrpc_kernel_query_call_security); diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index d25bf1cf3670..63bbcc567f59 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -231,7 +231,7 @@ static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn) distance = id - id_cursor; if (distance < 0) distance = -distance; - limit = max_t(unsigned long, atomic_read(&rxnet->nr_conns) * 4, 1024); + limit = umax(atomic_read(&rxnet->nr_conns) * 4, 1024); if (distance > limit) goto mark_dont_reuse; @@ -437,9 +437,9 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, call->dest_srx.srx_service = conn->service_id; call->cong_ssthresh = call->peer->cong_ssthresh; if (call->cong_cwnd >= call->cong_ssthresh) - call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; + call->cong_ca_state = RXRPC_CA_CONGEST_AVOIDANCE; else - call->cong_mode = RXRPC_CALL_SLOW_START; + call->cong_ca_state = RXRPC_CA_SLOW_START; chan->call_id = call_id; chan->call_debug_id = call->debug_id; @@ -508,15 +508,18 @@ static void rxrpc_activate_channels(struct rxrpc_bundle *bundle) void rxrpc_connect_client_calls(struct rxrpc_local *local) { struct rxrpc_call *call; + LIST_HEAD(new_client_calls); - while ((call = list_first_entry_or_null(&local->new_client_calls, - struct rxrpc_call, wait_link)) - ) { + spin_lock_irq(&local->client_call_lock); + list_splice_tail_init(&local->new_client_calls, &new_client_calls); + spin_unlock_irq(&local->client_call_lock); + + while ((call = list_first_entry_or_null(&new_client_calls, + struct rxrpc_call, wait_link))) { struct rxrpc_bundle *bundle = call->bundle; - spin_lock(&local->client_call_lock); list_move_tail(&call->wait_link, &bundle->waiting_calls); - spin_unlock(&local->client_call_lock); + rxrpc_see_call(call, rxrpc_call_see_waiting_call); if (rxrpc_bundle_has_space(bundle)) rxrpc_activate_channels(bundle); @@ -544,9 +547,9 @@ void rxrpc_expose_client_call(struct rxrpc_call *call) set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); trace_rxrpc_client(conn, channel, rxrpc_client_exposed); - spin_lock(&call->peer->lock); + spin_lock_irq(&call->peer->lock); hlist_add_head(&call->error_link, &call->peer->error_targets); - spin_unlock(&call->peer->lock); + spin_unlock_irq(&call->peer->lock); } } @@ -586,7 +589,10 @@ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call _debug("call is waiting"); ASSERTCMP(call->call_id, ==, 0); ASSERT(!test_bit(RXRPC_CALL_EXPOSED, &call->flags)); + /* May still be on ->new_client_calls. */ + spin_lock_irq(&local->client_call_lock); list_del_init(&call->wait_link); + spin_unlock_irq(&local->client_call_lock); return; } @@ -812,7 +818,7 @@ void rxrpc_clean_up_local_conns(struct rxrpc_local *local) local->kill_all_client_conns = true; - del_timer_sync(&local->client_conn_reap_timer); + timer_delete_sync(&local->client_conn_reap_timer); while ((conn = list_first_entry_or_null(&local->idle_client_conns, struct rxrpc_connection, cache_link))) { diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 598b4ee389fc..232b6986da83 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -19,14 +19,14 @@ /* * Set the completion state on an aborted connection. */ -static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, struct sk_buff *skb, +static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, s32 abort_code, int err, enum rxrpc_call_completion compl) { bool aborted = false; if (conn->state != RXRPC_CONN_ABORTED) { - spin_lock(&conn->state_lock); + spin_lock_irq(&conn->state_lock); if (conn->state != RXRPC_CONN_ABORTED) { conn->abort_code = abort_code; conn->error = err; @@ -37,7 +37,7 @@ static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, struct sk_buff set_bit(RXRPC_CONN_EV_ABORT_CALLS, &conn->events); aborted = true; } - spin_unlock(&conn->state_lock); + spin_unlock_irq(&conn->state_lock); } return aborted; @@ -49,12 +49,20 @@ static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, struct sk_buff int rxrpc_abort_conn(struct rxrpc_connection *conn, struct sk_buff *skb, s32 abort_code, int err, enum rxrpc_abort_reason why) { - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - if (rxrpc_set_conn_aborted(conn, skb, abort_code, err, + u32 cid = conn->proto.cid, call = 0, seq = 0; + + if (skb) { + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + cid = sp->hdr.cid; + call = sp->hdr.callNumber; + seq = sp->hdr.seq; + } + + if (rxrpc_set_conn_aborted(conn, abort_code, err, RXRPC_CALL_LOCALLY_ABORTED)) { - trace_rxrpc_abort(0, why, sp->hdr.cid, sp->hdr.callNumber, - sp->hdr.seq, abort_code, err); + trace_rxrpc_abort(0, why, cid, call, seq, abort_code, err); rxrpc_poke_conn(conn, rxrpc_conn_get_poke_abort); } return -EPROTO; @@ -63,11 +71,12 @@ int rxrpc_abort_conn(struct rxrpc_connection *conn, struct sk_buff *skb, /* * Mark a connection as being remotely aborted. */ -static bool rxrpc_input_conn_abort(struct rxrpc_connection *conn, +static void rxrpc_input_conn_abort(struct rxrpc_connection *conn, struct sk_buff *skb) { - return rxrpc_set_conn_aborted(conn, skb, skb->priority, -ECONNABORTED, - RXRPC_CALL_REMOTELY_ABORTED); + trace_rxrpc_rx_conn_abort(conn, skb); + rxrpc_set_conn_aborted(conn, skb->priority, -ECONNABORTED, + RXRPC_CALL_REMOTELY_ABORTED); } /* @@ -91,7 +100,7 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, struct rxrpc_acktrailer trailer; size_t len; int ret, ioc; - u32 serial, mtu, call_id, padding; + u32 serial, max_mtu, if_mtu, call_id, padding; _enter("%d", conn->debug_id); @@ -149,8 +158,13 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, break; case RXRPC_PACKET_TYPE_ACK: - mtu = conn->peer->if_mtu; - mtu -= conn->peer->hdrsize; + if_mtu = conn->peer->if_mtu - conn->peer->hdrsize; + if (conn->peer->ackr_adv_pmtud) { + max_mtu = umax(conn->peer->max_data, rxrpc_rx_mtu); + } else { + if_mtu = umin(1444, if_mtu); + max_mtu = if_mtu; + } pkt.ack.bufferSpace = 0; pkt.ack.maxSkew = htons(skb ? skb->priority : 0); pkt.ack.firstPacket = htonl(chan->last_seq + 1); @@ -158,10 +172,10 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, pkt.ack.serial = htonl(skb ? sp->hdr.serial : 0); pkt.ack.reason = skb ? RXRPC_ACK_DUPLICATE : RXRPC_ACK_IDLE; pkt.ack.nAcks = 0; - trailer.maxMTU = htonl(rxrpc_rx_mtu); - trailer.ifMTU = htonl(mtu); + trailer.maxMTU = htonl(max_mtu); + trailer.ifMTU = htonl(if_mtu); trailer.rwind = htonl(rxrpc_rx_window_size); - trailer.jumbo_max = htonl(rxrpc_rx_jumbo_max); + trailer.jumbo_max = 0; pkt.whdr.flags |= RXRPC_SLOW_START_OK; padding = 0; iov[0].iov_len += sizeof(pkt.ack); @@ -171,7 +185,8 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, trace_rxrpc_tx_ack(chan->call_debug_id, serial, ntohl(pkt.ack.firstPacket), ntohl(pkt.ack.serial), - pkt.ack.reason, 0, rxrpc_rx_window_size); + pkt.ack.reason, 0, rxrpc_rx_window_size, + rxrpc_propose_ack_retransmit); break; default: @@ -202,11 +217,14 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn) for (i = 0; i < RXRPC_MAXCALLS; i++) { call = conn->channels[i].call; - if (call) + if (call) { + rxrpc_see_call(call, rxrpc_call_see_conn_abort); rxrpc_set_call_completion(call, conn->completion, conn->abort_code, conn->error); + rxrpc_poke_call(call, rxrpc_call_poke_conn_abort); + } } _leave(""); @@ -218,10 +236,8 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn) */ static void rxrpc_call_is_secure(struct rxrpc_call *call) { - if (call && __rxrpc_call_state(call) == RXRPC_CALL_SERVER_SECURING) { - rxrpc_set_call_state(call, RXRPC_CALL_SERVER_RECV_REQUEST); + if (call && __test_and_clear_bit(RXRPC_CALL_CONN_CHALLENGING, &call->flags)) rxrpc_notify_socket(call); - } } /* @@ -240,7 +256,10 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_CHALLENGE: - return conn->security->respond_to_challenge(conn, skb); + ret = conn->security->respond_to_challenge(conn, skb); + sp->chall.conn = NULL; + rxrpc_put_connection(conn, rxrpc_conn_put_challenge_input); + return ret; case RXRPC_PACKET_TYPE_RESPONSE: ret = conn->security->verify_response(conn, skb); @@ -252,16 +271,18 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, if (ret < 0) return ret; - spin_lock(&conn->state_lock); + spin_lock_irq(&conn->state_lock); if (conn->state == RXRPC_CONN_SERVICE_CHALLENGING) conn->state = RXRPC_CONN_SERVICE; - spin_unlock(&conn->state_lock); + spin_unlock_irq(&conn->state_lock); if (conn->state == RXRPC_CONN_SERVICE) { /* Offload call state flipping to the I/O thread. As * we've already received the packet, put it on the * front of the queue. */ + sp->poke_conn = rxrpc_get_connection( + conn, rxrpc_conn_get_poke_secured); skb->mark = RXRPC_SKB_MARK_SERVICE_CONN_SECURED; rxrpc_get_skb(skb, rxrpc_skb_get_conn_secured); skb_queue_head(&conn->local->rx_queue, skb); @@ -383,6 +404,61 @@ static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn, } /* + * Post a CHALLENGE packet to the socket of one of a connection's calls so that + * it can get application data to include in the packet, possibly querying + * userspace. + */ +static bool rxrpc_post_challenge(struct rxrpc_connection *conn, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_call *call = NULL; + struct rxrpc_sock *rx; + bool respond = false; + + sp->chall.conn = + rxrpc_get_connection(conn, rxrpc_conn_get_challenge_input); + + if (!conn->security->challenge_to_recvmsg) { + rxrpc_post_packet_to_conn(conn, skb); + return true; + } + + rcu_read_lock(); + + for (int i = 0; i < ARRAY_SIZE(conn->channels); i++) { + if (conn->channels[i].call) { + call = conn->channels[i].call; + rx = rcu_dereference(call->socket); + if (!rx) { + call = NULL; + continue; + } + + respond = true; + if (test_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags)) + break; + call = NULL; + } + } + + if (!respond) { + rcu_read_unlock(); + rxrpc_put_connection(conn, rxrpc_conn_put_challenge_input); + sp->chall.conn = NULL; + return false; + } + + if (call) + rxrpc_notify_socket_oob(call, skb); + rcu_read_unlock(); + + if (!call) + rxrpc_post_packet_to_conn(conn, skb); + return true; +} + +/* * Input a connection-level packet. */ bool rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb) @@ -402,6 +478,16 @@ bool rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb) return true; case RXRPC_PACKET_TYPE_CHALLENGE: + rxrpc_see_skb(skb, rxrpc_skb_see_oob_challenge); + if (rxrpc_is_conn_aborted(conn)) { + if (conn->completion == RXRPC_CALL_LOCALLY_ABORTED) + rxrpc_send_conn_abort(conn); + return true; + } + if (!conn->security->validate_challenge(conn, skb)) + return false; + return rxrpc_post_challenge(conn, skb); + case RXRPC_PACKET_TYPE_RESPONSE: if (rxrpc_is_conn_aborted(conn)) { if (conn->completion == RXRPC_CALL_LOCALLY_ABORTED) @@ -427,17 +513,60 @@ void rxrpc_input_conn_event(struct rxrpc_connection *conn, struct sk_buff *skb) if (test_and_clear_bit(RXRPC_CONN_EV_ABORT_CALLS, &conn->events)) rxrpc_abort_calls(conn); - switch (skb->mark) { - case RXRPC_SKB_MARK_SERVICE_CONN_SECURED: - if (conn->state != RXRPC_CONN_SERVICE) - break; + if (conn->tx_response) { + struct sk_buff *skb; - for (loop = 0; loop < RXRPC_MAXCALLS; loop++) - rxrpc_call_is_secure(conn->channels[loop].call); - break; + spin_lock_irq(&conn->local->lock); + skb = conn->tx_response; + conn->tx_response = NULL; + spin_unlock_irq(&conn->local->lock); + + if (conn->state != RXRPC_CONN_ABORTED) + rxrpc_send_response(conn, skb); + rxrpc_free_skb(skb, rxrpc_skb_put_response); + } + + if (skb) { + switch (skb->mark) { + case RXRPC_SKB_MARK_SERVICE_CONN_SECURED: + if (conn->state != RXRPC_CONN_SERVICE) + break; + + for (loop = 0; loop < RXRPC_MAXCALLS; loop++) + rxrpc_call_is_secure(conn->channels[loop].call); + break; + } } /* Process delayed ACKs whose time has come. */ if (conn->flags & RXRPC_CONN_FINAL_ACK_MASK) rxrpc_process_delayed_final_acks(conn, false); } + +/* + * Post a RESPONSE message to the I/O thread for transmission. + */ +void rxrpc_post_response(struct rxrpc_connection *conn, struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_local *local = conn->local; + struct sk_buff *old; + + _enter("%x", sp->resp.challenge_serial); + + spin_lock_irq(&local->lock); + old = conn->tx_response; + if (old) { + struct rxrpc_skb_priv *osp = rxrpc_skb(skb); + + /* Always go with the response to the most recent challenge. */ + if (after(sp->resp.challenge_serial, osp->resp.challenge_serial)) + conn->tx_response = old; + else + old = skb; + } else { + conn->tx_response = skb; + } + spin_unlock_irq(&local->lock); + rxrpc_poke_conn(conn, rxrpc_conn_get_poke_response); +} diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 0af4642aeec4..37340becb224 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -31,13 +31,13 @@ void rxrpc_poke_conn(struct rxrpc_connection *conn, enum rxrpc_conn_trace why) if (WARN_ON_ONCE(!local)) return; - spin_lock_bh(&local->lock); + spin_lock_irq(&local->lock); busy = !list_empty(&conn->attend_link); if (!busy) { rxrpc_get_connection(conn, why); list_add_tail(&conn->attend_link, &local->conn_attend_q); } - spin_unlock_bh(&local->lock); + spin_unlock_irq(&local->lock); rxrpc_wake_up_io_thread(local); } @@ -67,11 +67,13 @@ struct rxrpc_connection *rxrpc_alloc_connection(struct rxrpc_net *rxnet, INIT_WORK(&conn->destructor, rxrpc_clean_up_connection); INIT_LIST_HEAD(&conn->proc_link); INIT_LIST_HEAD(&conn->link); + INIT_LIST_HEAD(&conn->attend_link); mutex_init(&conn->security_lock); mutex_init(&conn->tx_data_alloc_lock); skb_queue_head_init(&conn->rx_queue); conn->rxnet = rxnet; conn->security = &rxrpc_no_security; + rwlock_init(&conn->security_use_lock); spin_lock_init(&conn->state_lock); conn->debug_id = atomic_inc_return(&rxrpc_debug_id); conn->idle_timestamp = jiffies; @@ -119,18 +121,13 @@ struct rxrpc_connection *rxrpc_find_client_connection_rcu(struct rxrpc_local *lo switch (srx->transport.family) { case AF_INET: if (peer->srx.transport.sin.sin_port != - srx->transport.sin.sin_port || - peer->srx.transport.sin.sin_addr.s_addr != - srx->transport.sin.sin_addr.s_addr) + srx->transport.sin.sin_port) goto not_found; break; #ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: if (peer->srx.transport.sin6.sin6_port != - srx->transport.sin6.sin6_port || - memcmp(&peer->srx.transport.sin6.sin6_addr, - &srx->transport.sin6.sin6_addr, - sizeof(struct in6_addr)) != 0) + srx->transport.sin6.sin6_port) goto not_found; break; #endif @@ -201,9 +198,9 @@ void rxrpc_disconnect_call(struct rxrpc_call *call) call->peer->cong_ssthresh = call->cong_ssthresh; if (!hlist_unhashed(&call->error_link)) { - spin_lock(&call->peer->lock); + spin_lock_irq(&call->peer->lock); hlist_del_init(&call->error_link); - spin_unlock(&call->peer->lock); + spin_unlock_irq(&call->peer->lock); } if (rxrpc_is_client_call(call)) { @@ -318,15 +315,22 @@ static void rxrpc_clean_up_connection(struct work_struct *work) !conn->channels[3].call); ASSERT(list_empty(&conn->cache_link)); - del_timer_sync(&conn->timer); + timer_delete_sync(&conn->timer); cancel_work_sync(&conn->processor); /* Processing may restart the timer */ - del_timer_sync(&conn->timer); + timer_delete_sync(&conn->timer); write_lock(&rxnet->conn_lock); list_del_init(&conn->proc_link); write_unlock(&rxnet->conn_lock); + if (conn->pmtud_probe) { + trace_rxrpc_pmtud_lost(conn, 0); + conn->peer->pmtud_probing = false; + conn->peer->pmtud_pending = true; + } + rxrpc_purge_queue(&conn->rx_queue); + rxrpc_free_skb(conn->tx_response, rxrpc_skb_put_response); rxrpc_kill_client_conn(conn); @@ -342,9 +346,7 @@ static void rxrpc_clean_up_connection(struct work_struct *work) */ rxrpc_purge_queue(&conn->rx_queue); - if (conn->tx_data_alloc.va) - __page_frag_cache_drain(virt_to_page(conn->tx_data_alloc.va), - conn->tx_data_alloc.pagecnt_bias); + page_frag_cache_drain(&conn->tx_data_alloc); call_rcu(&conn->rcu, rxrpc_rcu_free_connection); } @@ -365,7 +367,7 @@ void rxrpc_put_connection(struct rxrpc_connection *conn, dead = __refcount_dec_and_test(&conn->ref, &r); trace_rxrpc_conn(debug_id, r - 1, why); if (dead) { - del_timer(&conn->timer); + timer_delete(&conn->timer); cancel_work(&conn->processor); if (in_softirq() || work_busy(&conn->processor) || @@ -470,7 +472,7 @@ void rxrpc_destroy_all_connections(struct rxrpc_net *rxnet) atomic_dec(&rxnet->nr_conns); - del_timer_sync(&rxnet->service_conn_reap_timer); + timer_delete_sync(&rxnet->service_conn_reap_timer); rxrpc_queue_work(&rxnet->service_conn_reaper); flush_workqueue(rxrpc_workqueue); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 3dedb8c0618c..24aceb183c2c 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -9,6 +9,17 @@ #include "ar-internal.h" +/* Override priority when generating ACKs for received DATA */ +static const u8 rxrpc_ack_priority[RXRPC_ACK__INVALID] = { + [RXRPC_ACK_IDLE] = 1, + [RXRPC_ACK_DELAY] = 2, + [RXRPC_ACK_REQUESTED] = 3, + [RXRPC_ACK_DUPLICATE] = 4, + [RXRPC_ACK_EXCEEDS_WINDOW] = 5, + [RXRPC_ACK_NOSPACE] = 6, + [RXRPC_ACK_OUT_OF_SEQUENCE] = 7, +}; + static void rxrpc_proto_abort(struct rxrpc_call *call, rxrpc_seq_t seq, enum rxrpc_abort_reason why) { @@ -16,80 +27,68 @@ static void rxrpc_proto_abort(struct rxrpc_call *call, rxrpc_seq_t seq, } /* - * Do TCP-style congestion management [RFC 5681]. + * Do TCP-style congestion management [RFC5681]. */ static void rxrpc_congestion_management(struct rxrpc_call *call, - struct sk_buff *skb, - struct rxrpc_ack_summary *summary, - rxrpc_serial_t acked_serial) + struct rxrpc_ack_summary *summary) { - enum rxrpc_congest_change change = rxrpc_cong_no_change; - unsigned int cumulative_acks = call->cong_cumul_acks; - unsigned int cwnd = call->cong_cwnd; - bool resend = false; - - summary->flight_size = - (call->tx_top - call->acks_hard_ack) - summary->nr_acks; + summary->change = rxrpc_cong_no_change; + summary->in_flight = rxrpc_tx_in_flight(call); if (test_and_clear_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags)) { summary->retrans_timeo = true; - call->cong_ssthresh = max_t(unsigned int, - summary->flight_size / 2, 2); - cwnd = 1; - if (cwnd >= call->cong_ssthresh && - call->cong_mode == RXRPC_CALL_SLOW_START) { - call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; - call->cong_tstamp = skb->tstamp; - cumulative_acks = 0; + call->cong_ssthresh = umax(summary->in_flight / 2, 2); + call->cong_cwnd = 1; + if (call->cong_cwnd >= call->cong_ssthresh && + call->cong_ca_state == RXRPC_CA_SLOW_START) { + call->cong_ca_state = RXRPC_CA_CONGEST_AVOIDANCE; + call->cong_tstamp = call->acks_latest_ts; + call->cong_cumul_acks = 0; } } - cumulative_acks += summary->nr_new_acks; - if (cumulative_acks > 255) - cumulative_acks = 255; + call->cong_cumul_acks += summary->nr_new_sacks; + call->cong_cumul_acks += summary->nr_new_hacks; + if (call->cong_cumul_acks > 255) + call->cong_cumul_acks = 255; - summary->cwnd = call->cong_cwnd; - summary->ssthresh = call->cong_ssthresh; - summary->cumulative_acks = cumulative_acks; - summary->dup_acks = call->cong_dup_acks; - - switch (call->cong_mode) { - case RXRPC_CALL_SLOW_START: - if (summary->saw_nacks) + switch (call->cong_ca_state) { + case RXRPC_CA_SLOW_START: + if (call->acks_nr_snacks > 0) goto packet_loss_detected; - if (summary->cumulative_acks > 0) - cwnd += 1; - if (cwnd >= call->cong_ssthresh) { - call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; - call->cong_tstamp = skb->tstamp; + if (call->cong_cumul_acks > 0) + call->cong_cwnd += 1; + if (call->cong_cwnd >= call->cong_ssthresh) { + call->cong_ca_state = RXRPC_CA_CONGEST_AVOIDANCE; + call->cong_tstamp = call->acks_latest_ts; } goto out; - case RXRPC_CALL_CONGEST_AVOIDANCE: - if (summary->saw_nacks) + case RXRPC_CA_CONGEST_AVOIDANCE: + if (call->acks_nr_snacks > 0) goto packet_loss_detected; /* We analyse the number of packets that get ACK'd per RTT * period and increase the window if we managed to fill it. */ - if (call->peer->rtt_count == 0) + if (call->rtt_count == 0) goto out; - if (ktime_before(skb->tstamp, + if (ktime_before(call->acks_latest_ts, ktime_add_us(call->cong_tstamp, - call->peer->srtt_us >> 3))) + call->srtt_us >> 3))) goto out_no_clear_ca; - change = rxrpc_cong_rtt_window_end; - call->cong_tstamp = skb->tstamp; - if (cumulative_acks >= cwnd) - cwnd++; + summary->change = rxrpc_cong_rtt_window_end; + call->cong_tstamp = call->acks_latest_ts; + if (call->cong_cumul_acks >= call->cong_cwnd) + call->cong_cwnd++; goto out; - case RXRPC_CALL_PACKET_LOSS: - if (!summary->saw_nacks) + case RXRPC_CA_PACKET_LOSS: + if (call->acks_nr_snacks == 0) goto resume_normality; - if (summary->new_low_nack) { - change = rxrpc_cong_new_low_nack; + if (summary->new_low_snack) { + summary->change = rxrpc_cong_new_low_nack; call->cong_dup_acks = 1; if (call->cong_extra > 1) call->cong_extra = 1; @@ -100,31 +99,35 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, if (call->cong_dup_acks < 3) goto send_extra_data; - change = rxrpc_cong_begin_retransmission; - call->cong_mode = RXRPC_CALL_FAST_RETRANSMIT; - call->cong_ssthresh = max_t(unsigned int, - summary->flight_size / 2, 2); - cwnd = call->cong_ssthresh + 3; + summary->change = rxrpc_cong_begin_retransmission; + call->cong_ca_state = RXRPC_CA_FAST_RETRANSMIT; + call->cong_ssthresh = umax(summary->in_flight / 2, 2); + call->cong_cwnd = call->cong_ssthresh + 3; call->cong_extra = 0; call->cong_dup_acks = 0; - resend = true; + summary->need_retransmit = true; + summary->in_fast_or_rto_recovery = true; goto out; - case RXRPC_CALL_FAST_RETRANSMIT: - if (!summary->new_low_nack) { - if (summary->nr_new_acks == 0) - cwnd += 1; + case RXRPC_CA_FAST_RETRANSMIT: + rxrpc_tlp_init(call); + summary->in_fast_or_rto_recovery = true; + if (!summary->new_low_snack) { + if (summary->nr_new_sacks == 0) + call->cong_cwnd += 1; call->cong_dup_acks++; if (call->cong_dup_acks == 2) { - change = rxrpc_cong_retransmit_again; + summary->change = rxrpc_cong_retransmit_again; call->cong_dup_acks = 0; - resend = true; + summary->need_retransmit = true; } } else { - change = rxrpc_cong_progress; - cwnd = call->cong_ssthresh; - if (!summary->saw_nacks) + summary->change = rxrpc_cong_progress; + call->cong_cwnd = call->cong_ssthresh; + if (call->acks_nr_snacks == 0) { + summary->exiting_fast_or_rto_recovery = true; goto resume_normality; + } } goto out; @@ -134,30 +137,25 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, } resume_normality: - change = rxrpc_cong_cleared_nacks; + summary->change = rxrpc_cong_cleared_nacks; call->cong_dup_acks = 0; call->cong_extra = 0; - call->cong_tstamp = skb->tstamp; - if (cwnd < call->cong_ssthresh) - call->cong_mode = RXRPC_CALL_SLOW_START; + call->cong_tstamp = call->acks_latest_ts; + if (call->cong_cwnd < call->cong_ssthresh) + call->cong_ca_state = RXRPC_CA_SLOW_START; else - call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; + call->cong_ca_state = RXRPC_CA_CONGEST_AVOIDANCE; out: - cumulative_acks = 0; + call->cong_cumul_acks = 0; out_no_clear_ca: - if (cwnd >= RXRPC_TX_MAX_WINDOW) - cwnd = RXRPC_TX_MAX_WINDOW; - call->cong_cwnd = cwnd; - call->cong_cumul_acks = cumulative_acks; - summary->mode = call->cong_mode; - trace_rxrpc_congest(call, summary, acked_serial, change); - if (resend) - rxrpc_resend(call, skb); + if (call->cong_cwnd >= RXRPC_TX_MAX_WINDOW) + call->cong_cwnd = RXRPC_TX_MAX_WINDOW; + trace_rxrpc_congest(call, summary); return; packet_loss_detected: - change = rxrpc_cong_saw_nack; - call->cong_mode = RXRPC_CALL_PACKET_LOSS; + summary->change = rxrpc_cong_saw_nack; + call->cong_ca_state = RXRPC_CA_PACKET_LOSS; call->cong_dup_acks = 0; goto send_extra_data; @@ -166,7 +164,7 @@ send_extra_data: * state. */ if (test_bit(RXRPC_CALL_TX_LAST, &call->flags) || - summary->nr_acks != call->tx_top - call->acks_hard_ack) { + call->acks_nr_sacks != call->tx_top - call->tx_bottom) { call->cong_extra++; wake_up(&call->waitq); } @@ -178,26 +176,42 @@ send_extra_data: */ void rxrpc_congestion_degrade(struct rxrpc_call *call) { - ktime_t rtt, now; + ktime_t rtt, now, time_since; - if (call->cong_mode != RXRPC_CALL_SLOW_START && - call->cong_mode != RXRPC_CALL_CONGEST_AVOIDANCE) + if (call->cong_ca_state != RXRPC_CA_SLOW_START && + call->cong_ca_state != RXRPC_CA_CONGEST_AVOIDANCE) return; if (__rxrpc_call_state(call) == RXRPC_CALL_CLIENT_AWAIT_REPLY) return; - rtt = ns_to_ktime(call->peer->srtt_us * (1000 / 8)); + rtt = ns_to_ktime(call->srtt_us * (NSEC_PER_USEC / 8)); now = ktime_get_real(); - if (!ktime_before(ktime_add(call->tx_last_sent, rtt), now)) + time_since = ktime_sub(now, call->tx_last_sent); + if (ktime_before(time_since, rtt)) return; - trace_rxrpc_reset_cwnd(call, now); + trace_rxrpc_reset_cwnd(call, time_since, rtt); rxrpc_inc_stat(call->rxnet, stat_tx_data_cwnd_reset); call->tx_last_sent = now; - call->cong_mode = RXRPC_CALL_SLOW_START; - call->cong_ssthresh = max_t(unsigned int, call->cong_ssthresh, - call->cong_cwnd * 3 / 4); - call->cong_cwnd = max_t(unsigned int, call->cong_cwnd / 2, RXRPC_MIN_CWND); + call->cong_ca_state = RXRPC_CA_SLOW_START; + call->cong_ssthresh = umax(call->cong_ssthresh, call->cong_cwnd * 3 / 4); + call->cong_cwnd = umax(call->cong_cwnd / 2, RXRPC_MIN_CWND); +} + +/* + * Add an RTT sample derived from an ACK'd DATA packet. + */ +static void rxrpc_add_data_rtt_sample(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + int ix) +{ + ktime_t xmit_ts = ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[ix]); + + rxrpc_call_add_rtt(call, rxrpc_rtt_rx_data_ack, -1, + summary->acked_serial, summary->ack_serial, + xmit_ts, call->acks_latest_ts); + __clear_bit(ix, &tq->rtt_samples); /* Prevent repeat RTT sample */ } /* @@ -206,37 +220,120 @@ void rxrpc_congestion_degrade(struct rxrpc_call *call) static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, struct rxrpc_ack_summary *summary) { - struct rxrpc_txbuf *txb; - bool rot_last = false; + struct rxrpc_txqueue *tq = call->tx_queue; + rxrpc_seq_t seq = call->tx_bottom + 1; + bool rot_last = false, trace = false; - list_for_each_entry_rcu(txb, &call->tx_buffer, call_link, false) { - if (before_eq(txb->seq, call->acks_hard_ack)) - continue; - if (txb->flags & RXRPC_LAST_PACKET) { + _enter("%x,%x", call->tx_bottom, to); + + trace_rxrpc_tx_rotate(call, seq, to); + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate); + + if (call->acks_lowest_nak == call->tx_bottom) { + call->acks_lowest_nak = to; + } else if (after(to, call->acks_lowest_nak)) { + summary->new_low_snack = true; + call->acks_lowest_nak = to; + } + + /* We may have a left over fully-consumed buffer at the front that we + * couldn't drop before (rotate_and_keep below). + */ + if (seq == call->tx_qbase + RXRPC_NR_TXQUEUE) { + call->tx_qbase += RXRPC_NR_TXQUEUE; + call->tx_queue = tq->next; + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate_and_free); + kfree(tq); + tq = call->tx_queue; + } + + do { + unsigned int ix = seq - call->tx_qbase; + + _debug("tq=%x seq=%x i=%d f=%x", tq->qbase, seq, ix, tq->bufs[ix]->flags); + if (tq->bufs[ix]->flags & RXRPC_LAST_PACKET) { set_bit(RXRPC_CALL_TX_LAST, &call->flags); rot_last = true; } - if (txb->seq == to) - break; - } - if (rot_last) - set_bit(RXRPC_CALL_TX_ALL_ACKED, &call->flags); + if (summary->acked_serial == tq->segment_serial[ix] && + test_bit(ix, &tq->rtt_samples)) + rxrpc_add_data_rtt_sample(call, summary, tq, ix); + + if (ix == tq->nr_reported_acks) { + /* Packet directly hard ACK'd. */ + tq->nr_reported_acks++; + rxrpc_input_rack_one(call, summary, tq, ix); + if (seq == call->tlp_seq) + summary->tlp_probe_acked = true; + summary->nr_new_hacks++; + __set_bit(ix, &tq->segment_acked); + trace_rxrpc_rotate(call, tq, summary, seq, rxrpc_rotate_trace_hack); + } else if (test_bit(ix, &tq->segment_acked)) { + /* Soft ACK -> hard ACK. */ + call->acks_nr_sacks--; + trace_rxrpc_rotate(call, tq, summary, seq, rxrpc_rotate_trace_sack); + } else { + /* Soft NAK -> hard ACK. */ + call->acks_nr_snacks--; + rxrpc_input_rack_one(call, summary, tq, ix); + if (seq == call->tlp_seq) + summary->tlp_probe_acked = true; + summary->nr_new_hacks++; + __set_bit(ix, &tq->segment_acked); + trace_rxrpc_rotate(call, tq, summary, seq, rxrpc_rotate_trace_snak); + } - _enter("%x,%x,%x,%d", to, call->acks_hard_ack, call->tx_top, rot_last); + call->tx_nr_sent--; + if (__test_and_clear_bit(ix, &tq->segment_lost)) + call->tx_nr_lost--; + if (__test_and_clear_bit(ix, &tq->segment_retransmitted)) + call->tx_nr_resent--; + __clear_bit(ix, &tq->ever_retransmitted); - if (call->acks_lowest_nak == call->acks_hard_ack) { - call->acks_lowest_nak = to; - } else if (after(to, call->acks_lowest_nak)) { - summary->new_low_nack = true; - call->acks_lowest_nak = to; + rxrpc_put_txbuf(tq->bufs[ix], rxrpc_txbuf_put_rotated); + tq->bufs[ix] = NULL; + + WRITE_ONCE(call->tx_bottom, seq); + trace_rxrpc_txqueue(call, (rot_last ? + rxrpc_txqueue_rotate_last : + rxrpc_txqueue_rotate)); + + seq++; + trace = true; + if (!(seq & RXRPC_TXQ_MASK)) { + trace_rxrpc_rack_update(call, summary); + trace = false; + prefetch(tq->next); + if (tq != call->tx_qtail) { + call->tx_qbase += RXRPC_NR_TXQUEUE; + call->tx_queue = tq->next; + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate_and_free); + kfree(tq); + tq = call->tx_queue; + } else { + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate_and_keep); + tq = NULL; + break; + } + } + + } while (before_eq(seq, to)); + + if (trace) + trace_rxrpc_rack_update(call, summary); + + if (rot_last) { + set_bit(RXRPC_CALL_TX_ALL_ACKED, &call->flags); + if (tq) { + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate_and_free); + kfree(tq); + call->tx_queue = NULL; + } } - smp_store_release(&call->acks_hard_ack, to); + _debug("%x,%x,%x,%d", to, call->tx_bottom, call->tx_top, rot_last); - trace_rxrpc_txqueue(call, (rot_last ? - rxrpc_txqueue_rotate_last : - rxrpc_txqueue_rotate)); wake_up(&call->waitq); return rot_last; } @@ -252,13 +349,10 @@ static void rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun, { ASSERT(test_bit(RXRPC_CALL_TX_LAST, &call->flags)); - call->resend_at = KTIME_MAX; - trace_rxrpc_timer_can(call, rxrpc_timer_trace_resend); - - if (unlikely(call->cong_last_nack)) { - rxrpc_free_skb(call->cong_last_nack, rxrpc_skb_put_last_nack); - call->cong_last_nack = NULL; - } + call->rack_timer_mode = RXRPC_CALL_RACKTIMER_OFF; + call->rack_timo_at = KTIME_MAX; + trace_rxrpc_rack_timer(call, 0, false); + trace_rxrpc_timer_can(call, rxrpc_timer_trace_rack_off + call->rack_timer_mode); switch (__rxrpc_call_state(call)) { case RXRPC_CALL_CLIENT_SEND_REQUEST: @@ -354,18 +448,26 @@ static void rxrpc_input_queue_data(struct rxrpc_call *call, struct sk_buff *skb, struct rxrpc_skb_priv *sp = rxrpc_skb(skb); bool last = sp->hdr.flags & RXRPC_LAST_PACKET; + spin_lock_irq(&call->recvmsg_queue.lock); + __skb_queue_tail(&call->recvmsg_queue, skb); rxrpc_input_update_ack_window(call, window, wtop); trace_rxrpc_receive(call, last ? why + 1 : why, sp->hdr.serial, sp->hdr.seq); if (last) + /* Change the state inside the lock so that recvmsg syncs + * correctly with it and using sendmsg() to send a reply + * doesn't race. + */ rxrpc_end_rx_phase(call, sp->hdr.serial); + + spin_unlock_irq(&call->recvmsg_queue.lock); } /* * Process a DATA packet. */ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb, - bool *_notify) + bool *_notify, rxrpc_serial_t *_ack_serial, int *_ack_reason) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct sk_buff *oos; @@ -418,8 +520,6 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb, /* Send an immediate ACK if we fill in a hole */ else if (!skb_queue_empty(&call->rx_oos_queue)) ack_reason = RXRPC_ACK_DELAY; - else - call->ackr_nr_unacked++; window++; if (after(window, wtop)) { @@ -433,7 +533,6 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb, rxrpc_get_skb(skb, rxrpc_skb_get_to_recvmsg); - spin_lock(&call->recvmsg_queue.lock); rxrpc_input_queue_data(call, skb, window, wtop, rxrpc_receive_queue); *_notify = true; @@ -455,8 +554,6 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb, rxrpc_receive_queue_oos); } - spin_unlock(&call->recvmsg_queue.lock); - call->ackr_sack_base = sack; } else { unsigned int slot; @@ -497,12 +594,16 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb, } send_ack: - if (ack_reason >= 0) - rxrpc_send_ACK(call, ack_reason, serial, - rxrpc_propose_ack_input_data); - else - rxrpc_propose_delay_ACK(call, serial, - rxrpc_propose_ack_input_data); + if (ack_reason >= 0) { + if (rxrpc_ack_priority[ack_reason] > rxrpc_ack_priority[*_ack_reason]) { + *_ack_serial = serial; + *_ack_reason = ack_reason; + } else if (rxrpc_ack_priority[ack_reason] == rxrpc_ack_priority[*_ack_reason] && + ack_reason == RXRPC_ACK_REQUESTED) { + *_ack_serial = serial; + *_ack_reason = ack_reason; + } + } } /* @@ -513,9 +614,11 @@ static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb struct rxrpc_jumbo_header jhdr; struct rxrpc_skb_priv *sp = rxrpc_skb(skb), *jsp; struct sk_buff *jskb; + rxrpc_serial_t ack_serial = 0; unsigned int offset = sizeof(struct rxrpc_wire_header); unsigned int len = skb->len - offset; bool notify = false; + int ack_reason = 0, count = 1, stat_ix; while (sp->hdr.flags & RXRPC_JUMBO_PACKET) { if (len < RXRPC_JUMBO_SUBPKTLEN) @@ -535,7 +638,7 @@ static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb jsp = rxrpc_skb(jskb); jsp->offset = offset; jsp->len = RXRPC_JUMBO_DATALEN; - rxrpc_input_data_one(call, jskb, ¬ify); + rxrpc_input_data_one(call, jskb, ¬ify, &ack_serial, &ack_reason); rxrpc_free_skb(jskb, rxrpc_skb_put_jumbo_subpacket); sp->hdr.flags = jhdr.flags; @@ -544,12 +647,25 @@ static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb sp->hdr.serial++; offset += RXRPC_JUMBO_SUBPKTLEN; len -= RXRPC_JUMBO_SUBPKTLEN; + count++; } sp->offset = offset; sp->len = len; - rxrpc_input_data_one(call, skb, ¬ify); - if (notify) { + rxrpc_input_data_one(call, skb, ¬ify, &ack_serial, &ack_reason); + + stat_ix = umin(count, ARRAY_SIZE(call->rxnet->stat_rx_jumbo)) - 1; + atomic_inc(&call->rxnet->stat_rx_jumbo[stat_ix]); + + if (ack_reason > 0) { + rxrpc_send_ACK(call, ack_reason, ack_serial, + rxrpc_propose_ack_input_data); + } else { + call->ackr_nr_unacked++; + rxrpc_propose_delay_ACK(call, sp->hdr.serial, + rxrpc_propose_ack_input_data); + } + if (notify && !test_bit(RXRPC_CALL_CONN_CHALLENGING, &call->flags)) { trace_rxrpc_notify_socket(call->debug_id, sp->hdr.serial); rxrpc_notify_socket(call); } @@ -643,7 +759,7 @@ static void rxrpc_complete_rtt_probe(struct rxrpc_call *call, clear_bit(i + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail); smp_mb(); /* Read data before setting avail bit */ set_bit(i, &call->rtt_avail); - rxrpc_peer_add_rtt(call, type, i, acked_serial, ack_serial, + rxrpc_call_add_rtt(call, type, i, acked_serial, ack_serial, sent_at, resp_time); matched = true; } @@ -653,7 +769,7 @@ static void rxrpc_complete_rtt_probe(struct rxrpc_call *call, */ if (after(acked_serial, orig_serial)) { trace_rxrpc_rtt_rx(call, rxrpc_rtt_rx_obsolete, i, - orig_serial, acked_serial, 0, 0); + orig_serial, acked_serial, 0, 0, 0); clear_bit(i + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail); smp_wmb(); set_bit(i, &call->rtt_avail); @@ -661,7 +777,7 @@ static void rxrpc_complete_rtt_probe(struct rxrpc_call *call, } if (!matched) - trace_rxrpc_rtt_rx(call, rxrpc_rtt_rx_lost, 9, 0, acked_serial, 0, 0); + trace_rxrpc_rtt_rx(call, rxrpc_rtt_rx_lost, 9, 0, acked_serial, 0, 0, 0); } /* @@ -671,10 +787,13 @@ static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb struct rxrpc_acktrailer *trailer) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct rxrpc_peer *peer; - unsigned int mtu; + struct rxrpc_peer *peer = call->peer; + unsigned int max_data, capacity; bool wake = false; - u32 rwind = ntohl(trailer->rwind); + u32 max_mtu = ntohl(trailer->maxMTU); + //u32 if_mtu = ntohl(trailer->ifMTU); + u32 rwind = ntohl(trailer->rwind); + u32 jumbo_max = ntohl(trailer->jumbo_max); if (rwind > RXRPC_TX_MAX_WINDOW) rwind = RXRPC_TX_MAX_WINDOW; @@ -685,57 +804,147 @@ static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb call->tx_winsize = rwind; } - if (call->cong_ssthresh > rwind) - call->cong_ssthresh = rwind; + max_mtu = clamp(max_mtu, 500, 65535); + peer->ackr_max_data = max_mtu; + + if (max_mtu < peer->max_data) { + trace_rxrpc_pmtud_reduce(peer, sp->hdr.serial, max_mtu, + rxrpc_pmtud_reduce_ack); + peer->max_data = max_mtu; + } - mtu = min(ntohl(trailer->maxMTU), ntohl(trailer->ifMTU)); + max_data = umin(max_mtu, peer->max_data); + capacity = max_data; + capacity += sizeof(struct rxrpc_jumbo_header); /* First subpacket has main hdr, not jumbo */ + capacity /= sizeof(struct rxrpc_jumbo_header) + RXRPC_JUMBO_DATALEN; - peer = call->peer; - if (mtu < peer->maxdata) { - spin_lock(&peer->lock); - peer->maxdata = mtu; - peer->mtu = mtu + peer->hdrsize; - spin_unlock(&peer->lock); + if (jumbo_max == 0) { + /* The peer says it supports pmtu discovery */ + peer->ackr_adv_pmtud = true; + } else { + peer->ackr_adv_pmtud = false; + capacity = clamp(capacity, 1, jumbo_max); } + call->tx_jumbo_max = capacity; + if (wake) wake_up(&call->waitq); } +#if defined(CONFIG_X86) && __GNUC__ && !defined(__clang__) +/* Clang doesn't support the %z constraint modifier */ +#define shiftr_adv_rotr(shift_from, rotate_into) ({ \ + asm(" shr%z1 %1\n" \ + " inc %0\n" \ + " rcr%z2 %2\n" \ + : "+d"(shift_from), "+m"(*(shift_from)), "+rm"(rotate_into) \ + ); \ + }) +#else +#define shiftr_adv_rotr(shift_from, rotate_into) ({ \ + typeof(rotate_into) __bit0 = *(shift_from) & 1; \ + *(shift_from) >>= 1; \ + shift_from++; \ + rotate_into >>= 1; \ + rotate_into |= __bit0 << (sizeof(rotate_into) * 8 - 1); \ + }) +#endif + /* - * Determine how many nacks from the previous ACK have now been satisfied. + * Deal with RTT samples from soft ACKs. */ -static rxrpc_seq_t rxrpc_input_check_prev_ack(struct rxrpc_call *call, - struct rxrpc_ack_summary *summary, - rxrpc_seq_t seq) +static void rxrpc_input_soft_rtt(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq) { - struct sk_buff *skb = call->cong_last_nack; - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - unsigned int i, new_acks = 0, retained_nacks = 0; - rxrpc_seq_t old_seq = sp->ack.first_ack; - u8 *acks = skb->data + sizeof(struct rxrpc_wire_header) + sizeof(struct rxrpc_ackpacket); + for (int ix = 0; ix < RXRPC_NR_TXQUEUE; ix++) + if (summary->acked_serial == tq->segment_serial[ix]) + return rxrpc_add_data_rtt_sample(call, summary, tq, ix); +} - if (after_eq(seq, old_seq + sp->ack.nr_acks)) { - summary->nr_new_acks += sp->ack.nr_nacks; - summary->nr_new_acks += seq - (old_seq + sp->ack.nr_acks); - summary->nr_retained_nacks = 0; - } else if (seq == old_seq) { - summary->nr_retained_nacks = sp->ack.nr_nacks; - } else { - for (i = 0; i < sp->ack.nr_acks; i++) { - if (acks[i] == RXRPC_ACK_TYPE_NACK) { - if (before(old_seq + i, seq)) - new_acks++; - else - retained_nacks++; - } +/* + * Process a batch of soft ACKs specific to a transmission queue segment. + */ +static void rxrpc_input_soft_ack_tq(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + unsigned long extracted_acks, + int nr_reported, + rxrpc_seq_t seq, + rxrpc_seq_t *lowest_nak) +{ + unsigned long old_reported = 0, flipped, new_acks = 0; + unsigned long a_to_n, n_to_a = 0; + int new, a, n; + + if (tq->nr_reported_acks > 0) + old_reported = ~0UL >> (RXRPC_NR_TXQUEUE - tq->nr_reported_acks); + + _enter("{%x,%lx,%d},%lx,%d,%x", + tq->qbase, tq->segment_acked, tq->nr_reported_acks, + extracted_acks, nr_reported, seq); + + _debug("[%x]", tq->qbase); + _debug("tq %16lx %u", tq->segment_acked, tq->nr_reported_acks); + _debug("sack %16lx %u", extracted_acks, nr_reported); + + /* See how many previously logged ACKs/NAKs have flipped. */ + flipped = (tq->segment_acked ^ extracted_acks) & old_reported; + if (flipped) { + n_to_a = ~tq->segment_acked & flipped; /* Old NAK -> ACK */ + a_to_n = tq->segment_acked & flipped; /* Old ACK -> NAK */ + a = hweight_long(n_to_a); + n = hweight_long(a_to_n); + _debug("flip %16lx", flipped); + _debug("ntoa %16lx %d", n_to_a, a); + _debug("aton %16lx %d", a_to_n, n); + call->acks_nr_sacks += a - n; + call->acks_nr_snacks += n - a; + summary->nr_new_sacks += a; + summary->nr_new_snacks += n; + } + + /* See how many new ACKs/NAKs have been acquired. */ + new = nr_reported - tq->nr_reported_acks; + if (new > 0) { + new_acks = extracted_acks & ~old_reported; + if (new_acks) { + a = hweight_long(new_acks); + n = new - a; + _debug("new_a %16lx new=%d a=%d n=%d", new_acks, new, a, n); + call->acks_nr_sacks += a; + call->acks_nr_snacks += n; + summary->nr_new_sacks += a; + summary->nr_new_snacks += n; + } else { + call->acks_nr_snacks += new; + summary->nr_new_snacks += new; } + } + + tq->nr_reported_acks = nr_reported; + tq->segment_acked = extracted_acks; + trace_rxrpc_apply_acks(call, tq); - summary->nr_new_acks += new_acks; - summary->nr_retained_nacks = retained_nacks; + if (extracted_acks != ~0UL) { + rxrpc_seq_t lowest = seq + ffz(extracted_acks); + + if (before(lowest, *lowest_nak)) + *lowest_nak = lowest; } - return old_seq + sp->ack.nr_acks; + if (summary->acked_serial) + rxrpc_input_soft_rtt(call, summary, tq); + + new_acks |= n_to_a; + if (new_acks) + rxrpc_input_rack(call, summary, tq, new_acks); + + if (call->tlp_serial && + rxrpc_seq_in_txq(tq, call->tlp_seq) && + test_bit(call->tlp_seq - tq->qbase, &new_acks)) + summary->tlp_probe_acked = true; } /* @@ -749,39 +958,50 @@ static rxrpc_seq_t rxrpc_input_check_prev_ack(struct rxrpc_call *call, */ static void rxrpc_input_soft_acks(struct rxrpc_call *call, struct rxrpc_ack_summary *summary, - struct sk_buff *skb, - rxrpc_seq_t seq, - rxrpc_seq_t since) + struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - unsigned int i, old_nacks = 0; + struct rxrpc_txqueue *tq = call->tx_queue; + unsigned long extracted = ~0UL; + unsigned int nr = 0; + rxrpc_seq_t seq = call->acks_hard_ack + 1; rxrpc_seq_t lowest_nak = seq + sp->ack.nr_acks; u8 *acks = skb->data + sizeof(struct rxrpc_wire_header) + sizeof(struct rxrpc_ackpacket); - for (i = 0; i < sp->ack.nr_acks; i++) { - if (acks[i] == RXRPC_ACK_TYPE_ACK) { - summary->nr_acks++; - if (after_eq(seq, since)) - summary->nr_new_acks++; - } else { - summary->saw_nacks = true; - if (before(seq, since)) { - /* Overlap with previous ACK */ - old_nacks++; - } else { - summary->nr_new_nacks++; - sp->ack.nr_nacks++; - } + _enter("%x,%x,%u", tq->qbase, seq, sp->ack.nr_acks); - if (before(seq, lowest_nak)) - lowest_nak = seq; + while (after(seq, tq->qbase + RXRPC_NR_TXQUEUE - 1)) + tq = tq->next; + + for (unsigned int i = 0; i < sp->ack.nr_acks; i++) { + /* Decant ACKs until we hit a txqueue boundary. */ + shiftr_adv_rotr(acks, extracted); + if (i == 256) { + acks -= i; + i = 0; } seq++; + nr++; + if ((seq & RXRPC_TXQ_MASK) != 0) + continue; + + _debug("bound %16lx %u", extracted, nr); + + rxrpc_input_soft_ack_tq(call, summary, tq, extracted, RXRPC_NR_TXQUEUE, + seq - RXRPC_NR_TXQUEUE, &lowest_nak); + extracted = ~0UL; + nr = 0; + tq = tq->next; + prefetch(tq); } - if (lowest_nak != call->acks_lowest_nak) { - call->acks_lowest_nak = lowest_nak; - summary->new_low_nack = true; + if (nr) { + unsigned int nr_reported = seq & RXRPC_TXQ_MASK; + + extracted >>= RXRPC_NR_TXQUEUE - nr_reported; + _debug("tail %16lx %u", extracted, nr_reported); + rxrpc_input_soft_ack_tq(call, summary, tq, extracted, nr_reported, + seq & ~RXRPC_TXQ_MASK, &lowest_nak); } /* We *can* have more nacks than we did - the peer is permitted to drop @@ -789,9 +1009,14 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, * possible for the nack distribution to change whilst the number of * nacks stays the same or goes down. */ - if (old_nacks < summary->nr_retained_nacks) - summary->nr_new_acks += summary->nr_retained_nacks - old_nacks; - summary->nr_retained_nacks = old_nacks; + if (lowest_nak != call->acks_lowest_nak) { + call->acks_lowest_nak = lowest_nak; + summary->new_low_snack = true; + } + + _debug("summary A=%d+%d N=%d+%d", + call->acks_nr_sacks, summary->nr_new_sacks, + call->acks_nr_snacks, summary->nr_new_snacks); } /* @@ -799,21 +1024,21 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, * with respect to the ack state conveyed by preceding ACKs. */ static bool rxrpc_is_ack_valid(struct rxrpc_call *call, - rxrpc_seq_t first_pkt, rxrpc_seq_t prev_pkt) + rxrpc_seq_t hard_ack, rxrpc_seq_t prev_pkt) { - rxrpc_seq_t base = READ_ONCE(call->acks_first_seq); + rxrpc_seq_t base = READ_ONCE(call->acks_hard_ack); - if (after(first_pkt, base)) + if (after(hard_ack, base)) return true; /* The window advanced */ - if (before(first_pkt, base)) + if (before(hard_ack, base)) return false; /* firstPacket regressed */ if (after_eq(prev_pkt, call->acks_prev_seq)) return true; /* previousPacket hasn't regressed. */ /* Some rx implementations put a serial number in previousPacket. */ - if (after_eq(prev_pkt, base + call->tx_winsize)) + if (after(prev_pkt, base + call->tx_winsize)) return false; return true; } @@ -831,53 +1056,34 @@ static bool rxrpc_is_ack_valid(struct rxrpc_call *call, static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_ack_summary summary = { 0 }; - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_acktrailer trailer; - rxrpc_serial_t ack_serial, acked_serial; - rxrpc_seq_t first_soft_ack, hard_ack, prev_pkt, since; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + rxrpc_seq_t first_soft_ack, hard_ack, prev_pkt; int nr_acks, offset, ioffset; _enter(""); offset = sizeof(struct rxrpc_wire_header) + sizeof(struct rxrpc_ackpacket); - ack_serial = sp->hdr.serial; - acked_serial = sp->ack.acked_serial; - first_soft_ack = sp->ack.first_ack; - prev_pkt = sp->ack.prev_ack; - nr_acks = sp->ack.nr_acks; - hard_ack = first_soft_ack - 1; - summary.ack_reason = (sp->ack.reason < RXRPC_ACK__INVALID ? - sp->ack.reason : RXRPC_ACK__INVALID); - - trace_rxrpc_rx_ack(call, ack_serial, acked_serial, - first_soft_ack, prev_pkt, - summary.ack_reason, nr_acks); - rxrpc_inc_stat(call->rxnet, stat_rx_acks[summary.ack_reason]); + summary.ack_serial = sp->hdr.serial; + first_soft_ack = sp->ack.first_ack; + prev_pkt = sp->ack.prev_ack; + nr_acks = sp->ack.nr_acks; + hard_ack = first_soft_ack - 1; + summary.acked_serial = sp->ack.acked_serial; + summary.ack_reason = (sp->ack.reason < RXRPC_ACK__INVALID ? + sp->ack.reason : RXRPC_ACK__INVALID); - if (acked_serial != 0) { - switch (summary.ack_reason) { - case RXRPC_ACK_PING_RESPONSE: - rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial, - rxrpc_rtt_rx_ping_response); - break; - case RXRPC_ACK_REQUESTED: - rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial, - rxrpc_rtt_rx_requested_ack); - break; - default: - rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial, - rxrpc_rtt_rx_other_ack); - break; - } - } + trace_rxrpc_rx_ack(call, sp); + rxrpc_inc_stat(call->rxnet, stat_rx_acks[summary.ack_reason]); + prefetch(call->tx_queue); /* If we get an EXCEEDS_WINDOW ACK from the server, it probably * indicates that the client address changed due to NAT. The server * lost the call because it switched to a different peer. */ if (unlikely(summary.ack_reason == RXRPC_ACK_EXCEEDS_WINDOW) && - first_soft_ack == 1 && + hard_ack == 0 && prev_pkt == 0 && rxrpc_is_client_call(call)) { rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, @@ -890,9 +1096,9 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) * if we still have it buffered to the beginning. */ if (unlikely(summary.ack_reason == RXRPC_ACK_OUT_OF_SEQUENCE) && - first_soft_ack == 1 && + hard_ack == 0 && prev_pkt == 0 && - call->acks_hard_ack == 0 && + call->tx_bottom == 0 && rxrpc_is_client_call(call)) { rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, 0, -ENETRESET); @@ -900,11 +1106,9 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) } /* Discard any out-of-order or duplicate ACKs (outside lock). */ - if (!rxrpc_is_ack_valid(call, first_soft_ack, prev_pkt)) { - trace_rxrpc_rx_discard_ack(call->debug_id, ack_serial, - first_soft_ack, call->acks_first_seq, - prev_pkt, call->acks_prev_seq); - goto send_response; + if (!rxrpc_is_ack_valid(call, hard_ack, prev_pkt)) { + trace_rxrpc_rx_discard_ack(call, summary.ack_serial, hard_ack, prev_pkt); + goto send_response; /* Still respond if requested. */ } trailer.maxMTU = 0; @@ -916,34 +1120,30 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) if (nr_acks > 0) skb_condense(skb); - if (call->cong_last_nack) { - since = rxrpc_input_check_prev_ack(call, &summary, first_soft_ack); - rxrpc_free_skb(call->cong_last_nack, rxrpc_skb_put_last_nack); - call->cong_last_nack = NULL; - } else { - summary.nr_new_acks = first_soft_ack - call->acks_first_seq; - call->acks_lowest_nak = first_soft_ack + nr_acks; - since = first_soft_ack; - } - - call->acks_latest_ts = skb->tstamp; - call->acks_first_seq = first_soft_ack; + call->acks_latest_ts = ktime_get_real(); + call->acks_hard_ack = hard_ack; call->acks_prev_seq = prev_pkt; - switch (summary.ack_reason) { - case RXRPC_ACK_PING: - break; - default: - if (acked_serial && after(acked_serial, call->acks_highest_serial)) - call->acks_highest_serial = acked_serial; - break; + if (summary.acked_serial) { + switch (summary.ack_reason) { + case RXRPC_ACK_PING_RESPONSE: + rxrpc_complete_rtt_probe(call, call->acks_latest_ts, + summary.acked_serial, summary.ack_serial, + rxrpc_rtt_rx_ping_response); + break; + default: + if (after(summary.acked_serial, call->acks_highest_serial)) + call->acks_highest_serial = summary.acked_serial; + summary.rtt_sample_avail = true; + break; + } } /* Parse rwind and mtu sizes if provided. */ if (trailer.maxMTU) rxrpc_input_ack_trailer(call, skb, &trailer); - if (first_soft_ack == 0) + if (hard_ack + 1 == 0) return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_zero); /* Ignore ACKs unless we are or have just been transmitting. */ @@ -957,13 +1157,13 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) goto send_response; } - if (before(hard_ack, call->acks_hard_ack) || + if (before(hard_ack, call->tx_bottom) || after(hard_ack, call->tx_top)) return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_outside_window); if (nr_acks > call->tx_top - hard_ack) return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_sack_overflow); - if (after(hard_ack, call->acks_hard_ack)) { + if (after(hard_ack, call->tx_bottom)) { if (rxrpc_rotate_tx_window(call, hard_ack, &summary)) { rxrpc_end_tx_phase(call, false, rxrpc_eproto_unexpected_ack); goto send_response; @@ -973,25 +1173,30 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) if (nr_acks > 0) { if (offset > (int)skb->len - nr_acks) return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_short_sack); - rxrpc_input_soft_acks(call, &summary, skb, first_soft_ack, since); - rxrpc_get_skb(skb, rxrpc_skb_get_last_nack); - call->cong_last_nack = skb; + rxrpc_input_soft_acks(call, &summary, skb); } if (test_bit(RXRPC_CALL_TX_LAST, &call->flags) && - summary.nr_acks == call->tx_top - hard_ack && + call->acks_nr_sacks == call->tx_top - hard_ack && rxrpc_is_client_call(call)) - rxrpc_propose_ping(call, ack_serial, + rxrpc_propose_ping(call, summary.ack_serial, rxrpc_propose_ack_ping_for_lost_reply); - rxrpc_congestion_management(call, skb, &summary, acked_serial); + /* Drive the congestion management algorithm first and then RACK-TLP as + * the latter depends on the state/change in state in the former. + */ + rxrpc_congestion_management(call, &summary); + rxrpc_rack_detect_loss_and_arm_timer(call, &summary); + rxrpc_tlp_process_ack(call, &summary); + if (call->tlp_serial && after_eq(summary.acked_serial, call->tlp_serial)) + call->tlp_serial = 0; send_response: if (summary.ack_reason == RXRPC_ACK_PING) - rxrpc_send_ACK(call, RXRPC_ACK_PING_RESPONSE, ack_serial, + rxrpc_send_ACK(call, RXRPC_ACK_PING_RESPONSE, summary.ack_serial, rxrpc_propose_ack_respond_to_ping); else if (sp->hdr.flags & RXRPC_REQUEST_ACK) - rxrpc_send_ACK(call, RXRPC_ACK_REQUESTED, ack_serial, + rxrpc_send_ACK(call, RXRPC_ACK_REQUESTED, summary.ack_serial, rxrpc_propose_ack_respond_to_ack); } @@ -1090,5 +1295,5 @@ void rxrpc_implicit_end_call(struct rxrpc_call *call, struct sk_buff *skb) break; } - rxrpc_input_call_event(call, skb); + rxrpc_input_call_event(call); } diff --git a/net/rxrpc/input_rack.c b/net/rxrpc/input_rack.c new file mode 100644 index 000000000000..13c371261e0a --- /dev/null +++ b/net/rxrpc/input_rack.c @@ -0,0 +1,418 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* RACK-TLP [RFC8958] Implementation + * + * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include "ar-internal.h" + +static bool rxrpc_rack_sent_after(ktime_t t1, rxrpc_seq_t seq1, + ktime_t t2, rxrpc_seq_t seq2) +{ + if (ktime_after(t1, t2)) + return true; + return t1 == t2 && after(seq1, seq2); +} + +/* + * Mark a packet lost. + */ +static void rxrpc_rack_mark_lost(struct rxrpc_call *call, + struct rxrpc_txqueue *tq, unsigned int ix) +{ + if (__test_and_set_bit(ix, &tq->segment_lost)) { + if (__test_and_clear_bit(ix, &tq->segment_retransmitted)) + call->tx_nr_resent--; + } else { + call->tx_nr_lost++; + } + tq->segment_xmit_ts[ix] = UINT_MAX; +} + +/* + * Get the transmission time of a packet in the Tx queue. + */ +static ktime_t rxrpc_get_xmit_ts(const struct rxrpc_txqueue *tq, unsigned int ix) +{ + if (tq->segment_xmit_ts[ix] == UINT_MAX) + return KTIME_MAX; + return ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[ix]); +} + +/* + * Get a bitmask of nack bits for a queue segment and mask off any that aren't + * yet reported. + */ +static unsigned long rxrpc_tq_nacks(const struct rxrpc_txqueue *tq) +{ + unsigned long nacks = ~tq->segment_acked; + + if (tq->nr_reported_acks < RXRPC_NR_TXQUEUE) + nacks &= (1UL << tq->nr_reported_acks) - 1; + return nacks; +} + +/* + * Update the RACK state for the most recently sent packet that has been + * delivered [RFC8958 6.2 Step 2]. + */ +static void rxrpc_rack_update(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + unsigned int ix) +{ + rxrpc_seq_t seq = tq->qbase + ix; + ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix); + ktime_t rtt = ktime_sub(call->acks_latest_ts, xmit_ts); + + if (__test_and_clear_bit(ix, &tq->segment_lost)) + call->tx_nr_lost--; + + if (test_bit(ix, &tq->segment_retransmitted)) { + /* Use Rx.serial instead of TCP.ACK.ts_option.echo_reply. */ + if (before(call->acks_highest_serial, tq->segment_serial[ix])) + return; + if (rtt < minmax_get(&call->min_rtt)) + return; + } + + /* The RACK algorithm requires the segment ACKs to be traversed in + * order of segment transmission - but the only thing this seems to + * matter for is that RACK.rtt is set to the rtt of the most recently + * transmitted segment. We should be able to achieve the same by only + * setting RACK.rtt if the xmit time is greater. + */ + if (ktime_after(xmit_ts, call->rack_rtt_ts)) { + call->rack_rtt = rtt; + call->rack_rtt_ts = xmit_ts; + } + + if (rxrpc_rack_sent_after(xmit_ts, seq, call->rack_xmit_ts, call->rack_end_seq)) { + call->rack_rtt = rtt; + call->rack_xmit_ts = xmit_ts; + call->rack_end_seq = seq; + } +} + +/* + * Detect data segment reordering [RFC8958 6.2 Step 3]. + */ +static void rxrpc_rack_detect_reordering(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + unsigned int ix) +{ + rxrpc_seq_t seq = tq->qbase + ix; + + /* Track the highest sequence number so far ACK'd. This is not + * necessarily the same as ack.firstPacket + ack.nAcks - 1 as the peer + * could put a NACK in the last SACK slot. + */ + if (after(seq, call->rack_fack)) + call->rack_fack = seq; + else if (before(seq, call->rack_fack) && + test_bit(ix, &tq->segment_retransmitted)) + call->rack_reordering_seen = true; +} + +void rxrpc_input_rack_one(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + unsigned int ix) +{ + rxrpc_rack_update(call, summary, tq, ix); + rxrpc_rack_detect_reordering(call, summary, tq, ix); +} + +void rxrpc_input_rack(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + unsigned long new_acks) +{ + while (new_acks) { + unsigned int ix = __ffs(new_acks); + + __clear_bit(ix, &new_acks); + rxrpc_input_rack_one(call, summary, tq, ix); + } + + trace_rxrpc_rack_update(call, summary); +} + +/* + * Update the reordering window [RFC8958 6.2 Step 4]. Returns the updated + * duration of the reordering window. + * + * Note that the Rx protocol doesn't have a 'DSACK option' per se, but ACKs can + * be given a 'DUPLICATE' reason with the serial number referring to the + * duplicated DATA packet. Rx does not inform as to whether this was a + * reception of the same packet twice or of a retransmission of a packet we + * already received (though this could be determined by the transmitter based + * on the serial number). + */ +static ktime_t rxrpc_rack_update_reo_wnd(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary) +{ + rxrpc_seq_t snd_una = call->acks_lowest_nak; /* Lowest unack'd seq */ + rxrpc_seq_t snd_nxt = call->tx_transmitted + 1; /* Next seq to be sent */ + bool have_dsack_option = summary->ack_reason == RXRPC_ACK_DUPLICATE; + int dup_thresh = 3; + + /* DSACK-based reordering window adaptation */ + if (!call->rack_dsack_round_none && + after_eq(snd_una, call->rack_dsack_round)) + call->rack_dsack_round_none = true; + + /* Grow the reordering window per round that sees DSACK. Reset the + * window after 16 DSACK-free recoveries. + */ + if (call->rack_dsack_round_none && have_dsack_option) { + call->rack_dsack_round_none = false; + call->rack_dsack_round = snd_nxt; + call->rack_reo_wnd_mult++; + call->rack_reo_wnd_persist = 16; + } else if (summary->exiting_fast_or_rto_recovery) { + call->rack_reo_wnd_persist--; + if (call->rack_reo_wnd_persist <= 0) + call->rack_reo_wnd_mult = 1; + } + + if (!call->rack_reordering_seen) { + if (summary->in_fast_or_rto_recovery) + return 0; + if (call->acks_nr_sacks >= dup_thresh) + return 0; + } + + return us_to_ktime(umin(call->rack_reo_wnd_mult * minmax_get(&call->min_rtt) / 4, + call->srtt_us >> 3)); +} + +/* + * Detect losses [RFC8958 6.2 Step 5]. + */ +static ktime_t rxrpc_rack_detect_loss(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary) +{ + struct rxrpc_txqueue *tq; + ktime_t timeout = 0, lost_after, now = ktime_get_real(); + + call->rack_reo_wnd = rxrpc_rack_update_reo_wnd(call, summary); + lost_after = ktime_add(call->rack_rtt, call->rack_reo_wnd); + trace_rxrpc_rack_scan_loss(call); + + for (tq = call->tx_queue; tq; tq = tq->next) { + unsigned long nacks = rxrpc_tq_nacks(tq); + + if (after(tq->qbase, call->tx_transmitted)) + break; + trace_rxrpc_rack_scan_loss_tq(call, tq, nacks); + + /* Skip ones marked lost but not yet retransmitted */ + nacks &= ~tq->segment_lost | tq->segment_retransmitted; + + while (nacks) { + unsigned int ix = __ffs(nacks); + rxrpc_seq_t seq = tq->qbase + ix; + ktime_t remaining; + ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix); + + __clear_bit(ix, &nacks); + + if (rxrpc_rack_sent_after(call->rack_xmit_ts, call->rack_end_seq, + xmit_ts, seq)) { + remaining = ktime_sub(ktime_add(xmit_ts, lost_after), now); + if (remaining <= 0) { + rxrpc_rack_mark_lost(call, tq, ix); + trace_rxrpc_rack_detect_loss(call, summary, seq); + } else { + timeout = max(remaining, timeout); + } + } + } + } + + return timeout; +} + +/* + * Detect losses and set a timer to retry the detection [RFC8958 6.2 Step 5]. + */ +void rxrpc_rack_detect_loss_and_arm_timer(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary) +{ + ktime_t timeout = rxrpc_rack_detect_loss(call, summary); + + if (timeout) { + call->rack_timer_mode = RXRPC_CALL_RACKTIMER_RACK_REORDER; + call->rack_timo_at = ktime_add(ktime_get_real(), timeout); + trace_rxrpc_rack_timer(call, timeout, false); + trace_rxrpc_timer_set(call, timeout, rxrpc_timer_trace_rack_reo); + } +} + +/* + * Handle RACK-TLP RTO expiration [RFC8958 6.3]. + */ +static void rxrpc_rack_mark_losses_on_rto(struct rxrpc_call *call) +{ + struct rxrpc_txqueue *tq; + rxrpc_seq_t snd_una = call->acks_lowest_nak; /* Lowest unack'd seq */ + ktime_t lost_after = ktime_add(call->rack_rtt, call->rack_reo_wnd); + ktime_t deadline = ktime_sub(ktime_get_real(), lost_after); + + for (tq = call->tx_queue; tq; tq = tq->next) { + unsigned long unacked = ~tq->segment_acked; + + trace_rxrpc_rack_mark_loss_tq(call, tq); + while (unacked) { + unsigned int ix = __ffs(unacked); + rxrpc_seq_t seq = tq->qbase + ix; + ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix); + + if (after(seq, call->tx_transmitted)) + return; + __clear_bit(ix, &unacked); + + if (seq == snd_una || + ktime_before(xmit_ts, deadline)) + rxrpc_rack_mark_lost(call, tq, ix); + } + } +} + +/* + * Calculate the TLP loss probe timeout (PTO) [RFC8958 7.2]. + */ +ktime_t rxrpc_tlp_calc_pto(struct rxrpc_call *call, ktime_t now) +{ + unsigned int flight_size = rxrpc_tx_in_flight(call); + ktime_t rto_at = ktime_add(call->tx_last_sent, + rxrpc_get_rto_backoff(call, false)); + ktime_t pto; + + if (call->rtt_count > 0) { + /* Use 2*SRTT as the timeout. */ + pto = ns_to_ktime(call->srtt_us * NSEC_PER_USEC / 4); + if (flight_size) + pto = ktime_add(pto, call->tlp_max_ack_delay); + } else { + pto = NSEC_PER_SEC; + } + + if (ktime_after(ktime_add(now, pto), rto_at)) + pto = ktime_sub(rto_at, now); + return pto; +} + +/* + * Send a TLP loss probe on PTO expiration [RFC8958 7.3]. + */ +void rxrpc_tlp_send_probe(struct rxrpc_call *call) +{ + unsigned int in_flight = rxrpc_tx_in_flight(call); + + if (after_eq(call->acks_hard_ack, call->tx_transmitted)) + return; /* Everything we transmitted has been acked. */ + + /* There must be no other loss probe still in flight and we need to + * have taken a new RTT sample since last probe or the start of + * connection. + */ + if (!call->tlp_serial && + call->tlp_rtt_taken != call->rtt_taken) { + call->tlp_is_retrans = false; + if (after(call->send_top, call->tx_transmitted) && + rxrpc_tx_window_space(call) > 0) { + /* Transmit the lowest-sequence unsent DATA */ + call->tx_last_serial = 0; + rxrpc_transmit_some_data(call, 1, rxrpc_txdata_tlp_new_data); + call->tlp_serial = call->tx_last_serial; + call->tlp_seq = call->tx_transmitted; + trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_transmit_new); + in_flight = rxrpc_tx_in_flight(call); + } else { + /* Retransmit the highest-sequence DATA sent */ + call->tx_last_serial = 0; + rxrpc_resend_tlp(call); + call->tlp_is_retrans = true; + trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_retransmit); + } + } else { + trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_busy); + } + + if (in_flight != 0) { + ktime_t rto = rxrpc_get_rto_backoff(call, false); + + call->rack_timer_mode = RXRPC_CALL_RACKTIMER_RTO; + call->rack_timo_at = ktime_add(ktime_get_real(), rto); + trace_rxrpc_rack_timer(call, rto, false); + trace_rxrpc_timer_set(call, rto, rxrpc_timer_trace_rack_rto); + } +} + +/* + * Detect losses using the ACK of a TLP loss probe [RFC8958 7.4]. + */ +void rxrpc_tlp_process_ack(struct rxrpc_call *call, struct rxrpc_ack_summary *summary) +{ + if (!call->tlp_serial || after(call->tlp_seq, call->acks_hard_ack)) + return; + + if (!call->tlp_is_retrans) { + /* TLP of new data delivered */ + trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_new_data); + call->tlp_serial = 0; + } else if (summary->ack_reason == RXRPC_ACK_DUPLICATE && + summary->acked_serial == call->tlp_serial) { + /* General Case: Detected packet losses using RACK [7.4.1] */ + trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_dup_acked); + call->tlp_serial = 0; + } else if (after(call->acks_hard_ack, call->tlp_seq)) { + /* Repaired the single loss */ + trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_hard_beyond); + call->tlp_serial = 0; + // TODO: Invoke congestion control to react to the loss + // event the probe has repaired + } else if (summary->tlp_probe_acked) { + trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_acked); + /* Special Case: Detected a single loss repaired by the loss + * probe [7.4.2] + */ + call->tlp_serial = 0; + } else { + trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_incomplete); + } +} + +/* + * Handle RACK timer expiration; returns true to request a resend. + */ +void rxrpc_rack_timer_expired(struct rxrpc_call *call, ktime_t overran_by) +{ + struct rxrpc_ack_summary summary = {}; + enum rxrpc_rack_timer_mode mode = call->rack_timer_mode; + + trace_rxrpc_rack_timer(call, overran_by, true); + call->rack_timer_mode = RXRPC_CALL_RACKTIMER_OFF; + + switch (mode) { + case RXRPC_CALL_RACKTIMER_RACK_REORDER: + rxrpc_rack_detect_loss_and_arm_timer(call, &summary); + break; + case RXRPC_CALL_RACKTIMER_TLP_PTO: + rxrpc_tlp_send_probe(call); + break; + case RXRPC_CALL_RACKTIMER_RTO: + // Might need to poke the congestion algo in some way + rxrpc_rack_mark_losses_on_rto(call); + break; + //case RXRPC_CALL_RACKTIMER_ZEROWIN: + default: + pr_warn("Unexpected rack timer %u", call->rack_timer_mode); + } +} diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index f2701068ed9e..0a260df45d25 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -19,11 +19,14 @@ static int none_init_connection_security(struct rxrpc_connection *conn, */ static struct rxrpc_txbuf *none_alloc_txbuf(struct rxrpc_call *call, size_t remain, gfp_t gfp) { - return rxrpc_alloc_data_txbuf(call, min_t(size_t, remain, RXRPC_JUMBO_DATALEN), 0, gfp); + return rxrpc_alloc_data_txbuf(call, umin(remain, RXRPC_JUMBO_DATALEN), 1, gfp); } static int none_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) { + txb->pkt_len = txb->len; + if (txb->len == RXRPC_JUMBO_DATALEN) + txb->jumboable = true; return 0; } @@ -39,11 +42,18 @@ static void none_free_call_crypto(struct rxrpc_call *call) { } -static int none_respond_to_challenge(struct rxrpc_connection *conn, - struct sk_buff *skb) +static bool none_validate_challenge(struct rxrpc_connection *conn, + struct sk_buff *skb) { - return rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, - rxrpc_eproto_rxnull_challenge); + rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, + rxrpc_eproto_rxnull_challenge); + return true; +} + +static int none_sendmsg_respond_to_challenge(struct sk_buff *challenge, + struct msghdr *msg) +{ + return -EINVAL; } static int none_verify_response(struct rxrpc_connection *conn, @@ -79,7 +89,8 @@ const struct rxrpc_security rxrpc_no_security = { .alloc_txbuf = none_alloc_txbuf, .secure_packet = none_secure_packet, .verify_packet = none_verify_packet, - .respond_to_challenge = none_respond_to_challenge, + .validate_challenge = none_validate_challenge, + .sendmsg_respond_to_challenge = none_sendmsg_respond_to_challenge, .verify_response = none_verify_response, .clear = none_clear, }; diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index 0300baa9afcd..27b650d30f4d 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -27,11 +27,17 @@ int rxrpc_encap_rcv(struct sock *udp_sk, struct sk_buff *skb) { struct sk_buff_head *rx_queue; struct rxrpc_local *local = rcu_dereference_sk_user_data(udp_sk); + struct task_struct *io_thread; if (unlikely(!local)) { kfree_skb(skb); return 0; } + io_thread = READ_ONCE(local->io_thread); + if (!io_thread) { + kfree_skb(skb); + return 0; + } if (skb->tstamp == 0) skb->tstamp = ktime_get_real(); @@ -47,7 +53,7 @@ int rxrpc_encap_rcv(struct sock *udp_sk, struct sk_buff *skb) #endif skb_queue_tail(rx_queue, skb); - rxrpc_wake_up_io_thread(local); + wake_up_process(io_thread); return 0; } @@ -332,7 +338,6 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, struct rxrpc_channel *chan; struct rxrpc_call *call = NULL; unsigned int channel; - bool ret; if (sp->hdr.securityIndex != conn->security_ix) return rxrpc_direct_abort(skb, rxrpc_eproto_wrong_security, @@ -358,6 +363,12 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, if (sp->hdr.callNumber == 0) return rxrpc_input_conn_packet(conn, skb); + /* Deal with path MTU discovery probing. */ + if (sp->hdr.type == RXRPC_PACKET_TYPE_ACK && + conn->pmtud_probe && + after_eq(sp->ack.acked_serial, conn->pmtud_probe)) + rxrpc_input_probe_for_pmtud(conn, sp->ack.acked_serial, false); + /* Call-bound packets are routed by connection channel. */ channel = sp->hdr.cid & RXRPC_CHANNELMASK; chan = &conn->channels[channel]; @@ -413,9 +424,9 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, peer_srx, skb); } - ret = rxrpc_input_call_event(call, skb); + rxrpc_queue_rx_call_packet(call, skb); rxrpc_put_call(call, rxrpc_call_put_input); - return ret; + return true; } /* @@ -432,6 +443,8 @@ int rxrpc_io_thread(void *data) ktime_t now; #endif bool should_stop; + LIST_HEAD(conn_attend_q); + LIST_HEAD(call_attend_q); complete(&local->io_thread_ready); @@ -442,43 +455,26 @@ int rxrpc_io_thread(void *data) for (;;) { rxrpc_inc_stat(local->rxnet, stat_io_loop); - /* Deal with connections that want immediate attention. */ - conn = list_first_entry_or_null(&local->conn_attend_q, - struct rxrpc_connection, - attend_link); - if (conn) { - spin_lock_bh(&local->lock); - list_del_init(&conn->attend_link); - spin_unlock_bh(&local->lock); - - rxrpc_input_conn_event(conn, NULL); - rxrpc_put_connection(conn, rxrpc_conn_put_poke); - continue; + /* Inject a delay into packets if requested. */ +#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY + now = ktime_get_real(); + while ((skb = skb_peek(&local->rx_delay_queue))) { + if (ktime_before(now, skb->tstamp)) + break; + skb = skb_dequeue(&local->rx_delay_queue); + skb_queue_tail(&local->rx_queue, skb); } +#endif - if (test_and_clear_bit(RXRPC_CLIENT_CONN_REAP_TIMER, - &local->client_conn_flags)) - rxrpc_discard_expired_client_conns(local); - - /* Deal with calls that want immediate attention. */ - if ((call = list_first_entry_or_null(&local->call_attend_q, - struct rxrpc_call, - attend_link))) { - spin_lock_bh(&local->lock); - list_del_init(&call->attend_link); - spin_unlock_bh(&local->lock); - - trace_rxrpc_call_poked(call); - rxrpc_input_call_event(call, NULL); - rxrpc_put_call(call, rxrpc_call_put_poke); - continue; + if (!skb_queue_empty(&local->rx_queue)) { + spin_lock_irq(&local->rx_queue.lock); + skb_queue_splice_tail_init(&local->rx_queue, &rx_queue); + spin_unlock_irq(&local->rx_queue.lock); + trace_rxrpc_iothread_rx(local, skb_queue_len(&rx_queue)); } - if (!list_empty(&local->new_client_calls)) - rxrpc_connect_client_calls(local); - - /* Process received packets and errors. */ - if ((skb = __skb_dequeue(&rx_queue))) { + /* Distribute packets and errors. */ + while ((skb = __skb_dequeue(&rx_queue))) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); switch (skb->mark) { case RXRPC_SKB_MARK_PACKET: @@ -493,8 +489,8 @@ int rxrpc_io_thread(void *data) rxrpc_free_skb(skb, rxrpc_skb_put_error_report); break; case RXRPC_SKB_MARK_SERVICE_CONN_SECURED: - rxrpc_input_conn_event(sp->conn, skb); - rxrpc_put_connection(sp->conn, rxrpc_conn_put_poke); + rxrpc_input_conn_event(sp->poke_conn, skb); + rxrpc_put_connection(sp->poke_conn, rxrpc_conn_put_poke); rxrpc_free_skb(skb, rxrpc_skb_put_conn_secured); break; default: @@ -502,27 +498,48 @@ int rxrpc_io_thread(void *data) rxrpc_free_skb(skb, rxrpc_skb_put_unknown); break; } - continue; } - /* Inject a delay into packets if requested. */ -#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY - now = ktime_get_real(); - while ((skb = skb_peek(&local->rx_delay_queue))) { - if (ktime_before(now, skb->tstamp)) - break; - skb = skb_dequeue(&local->rx_delay_queue); - skb_queue_tail(&local->rx_queue, skb); + /* Deal with connections that want immediate attention. */ + if (!list_empty_careful(&local->conn_attend_q)) { + spin_lock_irq(&local->lock); + list_splice_tail_init(&local->conn_attend_q, &conn_attend_q); + spin_unlock_irq(&local->lock); } -#endif - if (!skb_queue_empty(&local->rx_queue)) { - spin_lock_irq(&local->rx_queue.lock); - skb_queue_splice_tail_init(&local->rx_queue, &rx_queue); - spin_unlock_irq(&local->rx_queue.lock); - continue; + while ((conn = list_first_entry_or_null(&conn_attend_q, + struct rxrpc_connection, + attend_link))) { + spin_lock_irq(&local->lock); + list_del_init(&conn->attend_link); + spin_unlock_irq(&local->lock); + rxrpc_input_conn_event(conn, NULL); + rxrpc_put_connection(conn, rxrpc_conn_put_poke); } + if (test_and_clear_bit(RXRPC_CLIENT_CONN_REAP_TIMER, + &local->client_conn_flags)) + rxrpc_discard_expired_client_conns(local); + + /* Deal with calls that want immediate attention. */ + spin_lock_irq(&local->lock); + list_splice_tail_init(&local->call_attend_q, &call_attend_q); + spin_unlock_irq(&local->lock); + + while ((call = list_first_entry_or_null(&call_attend_q, + struct rxrpc_call, + attend_link))) { + spin_lock_irq(&local->lock); + list_del_init(&call->attend_link); + spin_unlock_irq(&local->lock); + trace_rxrpc_call_poked(call); + rxrpc_input_call_event(call); + rxrpc_put_call(call, rxrpc_call_put_poke); + } + + if (!list_empty(&local->new_client_calls)) + rxrpc_connect_client_calls(local); + set_current_state(TASK_INTERRUPTIBLE); should_stop = kthread_should_stop(); if (!skb_queue_empty(&local->rx_queue) || @@ -552,7 +569,7 @@ int rxrpc_io_thread(void *data) } timeout = nsecs_to_jiffies(delay_ns); - timeout = max(timeout, 1UL); + timeout = umax(timeout, 1); schedule_timeout(timeout); __set_current_state(TASK_RUNNING); continue; @@ -565,7 +582,7 @@ int rxrpc_io_thread(void *data) __set_current_state(TASK_RUNNING); rxrpc_see_local(local, rxrpc_local_stop); rxrpc_destroy_local(local); - local->io_thread = NULL; + WRITE_ONCE(local->io_thread, NULL); rxrpc_see_local(local, rxrpc_local_stopped); return 0; } diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c index 33e8302a79e3..9fdc1f031c9d 100644 --- a/net/rxrpc/key.c +++ b/net/rxrpc/key.c @@ -129,6 +129,160 @@ static int rxrpc_preparse_xdr_rxkad(struct key_preparsed_payload *prep, return 0; } +static u64 xdr_dec64(const __be32 *xdr) +{ + return (u64)ntohl(xdr[0]) << 32 | (u64)ntohl(xdr[1]); +} + +static time64_t rxrpc_s64_to_time64(s64 time_in_100ns) +{ + bool neg = false; + u64 tmp = time_in_100ns; + + if (time_in_100ns < 0) { + tmp = -time_in_100ns; + neg = true; + } + do_div(tmp, 10000000); + return neg ? -tmp : tmp; +} + +/* + * Parse a YFS-RxGK type XDR format token + * - the caller guarantees we have at least 4 words + * + * struct token_rxgk { + * opr_time begintime; + * opr_time endtime; + * afs_int64 level; + * afs_int64 lifetime; + * afs_int64 bytelife; + * afs_int64 enctype; + * opaque key<>; + * opaque ticket<>; + * }; + */ +static int rxrpc_preparse_xdr_yfs_rxgk(struct key_preparsed_payload *prep, + size_t datalen, + const __be32 *xdr, unsigned int toklen) +{ + struct rxrpc_key_token *token, **pptoken; + time64_t expiry; + size_t plen; + const __be32 *ticket, *key; + s64 tmp; + u32 tktlen, keylen; + + _enter(",{%x,%x,%x,%x},%x", + ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]), + toklen); + + if (6 * 2 + 2 > toklen / 4) + goto reject; + + key = xdr + (6 * 2 + 1); + keylen = ntohl(key[-1]); + _debug("keylen: %x", keylen); + keylen = round_up(keylen, 4); + if ((6 * 2 + 2) * 4 + keylen > toklen) + goto reject; + + ticket = xdr + (6 * 2 + 1 + (keylen / 4) + 1); + tktlen = ntohl(ticket[-1]); + _debug("tktlen: %x", tktlen); + tktlen = round_up(tktlen, 4); + if ((6 * 2 + 2) * 4 + keylen + tktlen != toklen) { + kleave(" = -EKEYREJECTED [%x!=%x, %x,%x]", + (6 * 2 + 2) * 4 + keylen + tktlen, toklen, + keylen, tktlen); + goto reject; + } + + plen = sizeof(*token) + sizeof(*token->rxgk) + tktlen + keylen; + prep->quotalen = datalen + plen; + + plen -= sizeof(*token); + token = kzalloc(sizeof(*token), GFP_KERNEL); + if (!token) + goto nomem; + + token->rxgk = kzalloc(sizeof(*token->rxgk) + keylen, GFP_KERNEL); + if (!token->rxgk) + goto nomem_token; + + token->security_index = RXRPC_SECURITY_YFS_RXGK; + token->rxgk->begintime = xdr_dec64(xdr + 0 * 2); + token->rxgk->endtime = xdr_dec64(xdr + 1 * 2); + token->rxgk->level = tmp = xdr_dec64(xdr + 2 * 2); + if (tmp < -1LL || tmp > RXRPC_SECURITY_ENCRYPT) + goto reject_token; + token->rxgk->lifetime = xdr_dec64(xdr + 3 * 2); + token->rxgk->bytelife = xdr_dec64(xdr + 4 * 2); + token->rxgk->enctype = tmp = xdr_dec64(xdr + 5 * 2); + if (tmp < 0 || tmp > UINT_MAX) + goto reject_token; + token->rxgk->key.len = ntohl(key[-1]); + token->rxgk->key.data = token->rxgk->_key; + token->rxgk->ticket.len = ntohl(ticket[-1]); + + if (token->rxgk->endtime != 0) { + expiry = rxrpc_s64_to_time64(token->rxgk->endtime); + if (expiry < 0) + goto expired; + if (expiry < prep->expiry) + prep->expiry = expiry; + } + + memcpy(token->rxgk->key.data, key, token->rxgk->key.len); + + /* Pad the ticket so that we can use it directly in XDR */ + token->rxgk->ticket.data = kzalloc(round_up(token->rxgk->ticket.len, 4), + GFP_KERNEL); + if (!token->rxgk->ticket.data) + goto nomem_yrxgk; + memcpy(token->rxgk->ticket.data, ticket, token->rxgk->ticket.len); + + _debug("SCIX: %u", token->security_index); + _debug("EXPY: %llx", token->rxgk->endtime); + _debug("LIFE: %llx", token->rxgk->lifetime); + _debug("BYTE: %llx", token->rxgk->bytelife); + _debug("ENC : %u", token->rxgk->enctype); + _debug("LEVL: %u", token->rxgk->level); + _debug("KLEN: %u", token->rxgk->key.len); + _debug("TLEN: %u", token->rxgk->ticket.len); + _debug("KEY0: %*phN", token->rxgk->key.len, token->rxgk->key.data); + _debug("TICK: %*phN", + min_t(u32, token->rxgk->ticket.len, 32), token->rxgk->ticket.data); + + /* count the number of tokens attached */ + prep->payload.data[1] = (void *)((unsigned long)prep->payload.data[1] + 1); + + /* attach the data */ + for (pptoken = (struct rxrpc_key_token **)&prep->payload.data[0]; + *pptoken; + pptoken = &(*pptoken)->next) + continue; + *pptoken = token; + + _leave(" = 0"); + return 0; + +nomem_yrxgk: + kfree(token->rxgk); +nomem_token: + kfree(token); +nomem: + return -ENOMEM; +reject_token: + kfree(token); +reject: + return -EKEYREJECTED; +expired: + kfree(token->rxgk); + kfree(token); + return -EKEYEXPIRED; +} + /* * attempt to parse the data as the XDR format * - the caller guarantees we have more than 7 words @@ -228,6 +382,9 @@ static int rxrpc_preparse_xdr(struct key_preparsed_payload *prep) case RXRPC_SECURITY_RXKAD: ret2 = rxrpc_preparse_xdr_rxkad(prep, datalen, token, toklen); break; + case RXRPC_SECURITY_YFS_RXGK: + ret2 = rxrpc_preparse_xdr_yfs_rxgk(prep, datalen, token, toklen); + break; default: ret2 = -EPROTONOSUPPORT; break; @@ -390,6 +547,10 @@ static void rxrpc_free_token_list(struct rxrpc_key_token *token) case RXRPC_SECURITY_RXKAD: kfree(token->kad); break; + case RXRPC_SECURITY_YFS_RXGK: + kfree(token->rxgk->ticket.data); + kfree(token->rxgk); + break; default: pr_err("Unknown token type %x on rxrpc key\n", token->security_index); @@ -433,6 +594,9 @@ static void rxrpc_describe(const struct key *key, struct seq_file *m) case RXRPC_SECURITY_RXKAD: seq_puts(m, "ka"); break; + case RXRPC_SECURITY_YFS_RXGK: + seq_puts(m, "ygk"); + break; default: /* we have a ticket we can't encode */ seq_printf(m, "%u", token->security_index); break; @@ -531,6 +695,8 @@ EXPORT_SYMBOL(rxrpc_get_server_data_key); * * Generate a null RxRPC key that can be used to indicate anonymous security is * required for a particular domain. + * + * Return: The new key or a negative error code. */ struct key *rxrpc_get_null_key(const char *keyname) { @@ -595,6 +761,13 @@ static long rxrpc_read(const struct key *key, toksize += RND(token->kad->ticket_len); break; + case RXRPC_SECURITY_YFS_RXGK: + toksize += 6 * 8 + 2 * 4; + if (!token->no_leak_key) + toksize += RND(token->rxgk->key.len); + toksize += RND(token->rxgk->ticket.len); + break; + default: /* we have a ticket we can't encode */ pr_err("Unsupported key token type (%u)\n", token->security_index); @@ -674,6 +847,20 @@ static long rxrpc_read(const struct key *key, ENCODE_DATA(token->kad->ticket_len, token->kad->ticket); break; + case RXRPC_SECURITY_YFS_RXGK: + ENCODE64(token->rxgk->begintime); + ENCODE64(token->rxgk->endtime); + ENCODE64(token->rxgk->level); + ENCODE64(token->rxgk->lifetime); + ENCODE64(token->rxgk->bytelife); + ENCODE64(token->rxgk->enctype); + if (token->no_leak_key) + ENCODE(0); + else + ENCODE_DATA(token->rxgk->key.len, token->rxgk->key.data); + ENCODE_DATA(token->rxgk->ticket.len, token->rxgk->ticket.data); + break; + default: pr_err("Unsupported key token type (%u)\n", token->security_index); diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 504453c688d7..a74a4b43904f 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -215,9 +215,6 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) /* we want to set the don't fragment bit */ rxrpc_local_dont_fragment(local, true); - - /* We want receive timestamps. */ - sock_enable_timestamps(usk); break; default: @@ -232,7 +229,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) } wait_for_completion(&local->io_thread_ready); - local->io_thread = io_thread; + WRITE_ONCE(local->io_thread, io_thread); _leave(" = 0"); return 0; @@ -452,9 +449,7 @@ void rxrpc_destroy_local(struct rxrpc_local *local) #endif rxrpc_purge_queue(&local->rx_queue); rxrpc_purge_client_connections(local); - if (local->tx_alloc.va) - __page_frag_cache_drain(virt_to_page(local->tx_alloc.va), - local->tx_alloc.pagecnt_bias); + page_frag_cache_drain(&local->tx_alloc); } /* diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 657cf35089a6..8fcc8139d771 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -46,13 +46,13 @@ unsigned int rxrpc_rx_window_size = 255; * Maximum Rx MTU size. This indicates to the sender the size of jumbo packet * made by gluing normal packets together that we're willing to handle. */ -unsigned int rxrpc_rx_mtu = 5692; +unsigned int rxrpc_rx_mtu = RXRPC_JUMBO(46); /* * The maximum number of fragments in a received jumbo packet that we tell the * sender that we're willing to handle. */ -unsigned int rxrpc_rx_jumbo_max = 4; +unsigned int rxrpc_rx_jumbo_max = 46; #ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY /* diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c index a4c135d0fbcc..9a9834145e81 100644 --- a/net/rxrpc/net_ns.c +++ b/net/rxrpc/net_ns.c @@ -105,10 +105,10 @@ static __net_exit void rxrpc_exit_net(struct net *net) struct rxrpc_net *rxnet = rxrpc_net(net); rxnet->live = false; - del_timer_sync(&rxnet->peer_keepalive_timer); + timer_delete_sync(&rxnet->peer_keepalive_timer); cancel_work_sync(&rxnet->peer_keepalive_work); /* Remove the timer again as the worker may have restarted it. */ - del_timer_sync(&rxnet->peer_keepalive_timer); + timer_delete_sync(&rxnet->peer_keepalive_timer); rxrpc_destroy_all_calls(rxnet); rxrpc_destroy_all_connections(rxnet); rxrpc_destroy_all_peers(rxnet); diff --git a/net/rxrpc/oob.c b/net/rxrpc/oob.c new file mode 100644 index 000000000000..05ca9c1faa57 --- /dev/null +++ b/net/rxrpc/oob.c @@ -0,0 +1,379 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Out of band message handling (e.g. challenge-response) + * + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/net.h> +#include <linux/gfp.h> +#include <linux/skbuff.h> +#include <linux/export.h> +#include <linux/sched/signal.h> +#include <net/sock.h> +#include <net/af_rxrpc.h> +#include "ar-internal.h" + +enum rxrpc_oob_command { + RXRPC_OOB_CMD_UNSET, + RXRPC_OOB_CMD_RESPOND, +} __mode(byte); + +struct rxrpc_oob_params { + u64 oob_id; /* ID number of message if reply */ + s32 abort_code; + enum rxrpc_oob_command command; + bool have_oob_id:1; +}; + +/* + * Post an out-of-band message for attention by the socket or kernel service + * associated with a reference call. + */ +void rxrpc_notify_socket_oob(struct rxrpc_call *call, struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_sock *rx; + struct sock *sk; + + rcu_read_lock(); + + rx = rcu_dereference(call->socket); + if (rx) { + sk = &rx->sk; + spin_lock_irq(&rx->recvmsg_lock); + + if (sk->sk_state < RXRPC_CLOSE) { + skb->skb_mstamp_ns = rx->oob_id_counter++; + rxrpc_get_skb(skb, rxrpc_skb_get_post_oob); + skb_queue_tail(&rx->recvmsg_oobq, skb); + + trace_rxrpc_notify_socket(call->debug_id, sp->hdr.serial); + if (rx->app_ops) + rx->app_ops->notify_oob(sk, skb); + } + + spin_unlock_irq(&rx->recvmsg_lock); + if (!rx->app_ops && !sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk); + } + + rcu_read_unlock(); +} + +/* + * Locate the OOB message to respond to by its ID. + */ +static struct sk_buff *rxrpc_find_pending_oob(struct rxrpc_sock *rx, u64 oob_id) +{ + struct rb_node *p; + struct sk_buff *skb; + + p = rx->pending_oobq.rb_node; + while (p) { + skb = rb_entry(p, struct sk_buff, rbnode); + + if (oob_id < skb->skb_mstamp_ns) + p = p->rb_left; + else if (oob_id > skb->skb_mstamp_ns) + p = p->rb_right; + else + return skb; + } + + return NULL; +} + +/* + * Add an OOB message into the pending-response set. We always assign the next + * value from a 64-bit counter to the oob_id, so just assume we're always going + * to be on the right-hand edge of the tree and that the counter won't wrap. + * The tree is also given a ref to the message. + */ +void rxrpc_add_pending_oob(struct rxrpc_sock *rx, struct sk_buff *skb) +{ + struct rb_node **pp = &rx->pending_oobq.rb_node, *p = NULL; + + while (*pp) { + p = *pp; + pp = &(*pp)->rb_right; + } + + rb_link_node(&skb->rbnode, p, pp); + rb_insert_color(&skb->rbnode, &rx->pending_oobq); +} + +/* + * Extract control messages from the sendmsg() control buffer. + */ +static int rxrpc_sendmsg_oob_cmsg(struct msghdr *msg, struct rxrpc_oob_params *p) +{ + struct cmsghdr *cmsg; + int len; + + if (msg->msg_controllen == 0) + return -EINVAL; + + for_each_cmsghdr(cmsg, msg) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + len = cmsg->cmsg_len - sizeof(struct cmsghdr); + _debug("CMSG %d, %d, %d", + cmsg->cmsg_level, cmsg->cmsg_type, len); + + if (cmsg->cmsg_level != SOL_RXRPC) + continue; + + switch (cmsg->cmsg_type) { + case RXRPC_OOB_ID: + if (len != sizeof(p->oob_id) || p->have_oob_id) + return -EINVAL; + memcpy(&p->oob_id, CMSG_DATA(cmsg), sizeof(p->oob_id)); + p->have_oob_id = true; + break; + case RXRPC_RESPOND: + if (p->command != RXRPC_OOB_CMD_UNSET) + return -EINVAL; + p->command = RXRPC_OOB_CMD_RESPOND; + break; + case RXRPC_ABORT: + if (len != sizeof(p->abort_code) || p->abort_code) + return -EINVAL; + memcpy(&p->abort_code, CMSG_DATA(cmsg), sizeof(p->abort_code)); + if (p->abort_code == 0) + return -EINVAL; + break; + case RXRPC_RESP_RXGK_APPDATA: + if (p->command != RXRPC_OOB_CMD_RESPOND) + return -EINVAL; + break; + default: + return -EINVAL; + } + } + + switch (p->command) { + case RXRPC_OOB_CMD_RESPOND: + if (!p->have_oob_id) + return -EBADSLT; + break; + default: + return -EINVAL; + } + + return 0; +} + +/* + * Allow userspace to respond to an OOB using sendmsg(). + */ +static int rxrpc_respond_to_oob(struct rxrpc_sock *rx, + struct rxrpc_oob_params *p, + struct msghdr *msg) +{ + struct rxrpc_connection *conn; + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + int ret; + + skb = rxrpc_find_pending_oob(rx, p->oob_id); + if (skb) + rb_erase(&skb->rbnode, &rx->pending_oobq); + release_sock(&rx->sk); + if (!skb) + return -EBADSLT; + + sp = rxrpc_skb(skb); + + switch (p->command) { + case RXRPC_OOB_CMD_RESPOND: + ret = -EPROTO; + if (skb->mark != RXRPC_OOB_CHALLENGE) + break; + conn = sp->chall.conn; + ret = -EOPNOTSUPP; + if (!conn->security->sendmsg_respond_to_challenge) + break; + if (p->abort_code) { + rxrpc_abort_conn(conn, NULL, p->abort_code, -ECONNABORTED, + rxrpc_abort_response_sendmsg); + ret = 0; + } else { + ret = conn->security->sendmsg_respond_to_challenge(skb, msg); + } + break; + default: + ret = -EINVAL; + break; + } + + rxrpc_free_skb(skb, rxrpc_skb_put_oob); + return ret; +} + +/* + * Send an out-of-band message or respond to a received out-of-band message. + * - caller gives us the socket lock + * - the socket may be either a client socket or a server socket + */ +int rxrpc_sendmsg_oob(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) +{ + struct rxrpc_oob_params p = {}; + int ret; + + _enter(""); + + ret = rxrpc_sendmsg_oob_cmsg(msg, &p); + if (ret < 0) + goto error_release_sock; + + if (p.have_oob_id) + return rxrpc_respond_to_oob(rx, &p, msg); + + release_sock(&rx->sk); + + switch (p.command) { + default: + ret = -EINVAL; + break; + } + + _leave(" = %d", ret); + return ret; + +error_release_sock: + release_sock(&rx->sk); + return ret; +} + +/** + * rxrpc_kernel_query_oob - Query the parameters of an out-of-band message + * @oob: The message to query + * @_peer: Where to return the peer record + * @_peer_appdata: The application data attached to a peer record + * + * Extract useful parameters from an out-of-band message. The source peer + * parameters are returned through the argument list and the message type is + * returned. + * + * Return: + * * %RXRPC_OOB_CHALLENGE - Challenge wanting a response. + */ +enum rxrpc_oob_type rxrpc_kernel_query_oob(struct sk_buff *oob, + struct rxrpc_peer **_peer, + unsigned long *_peer_appdata) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(oob); + enum rxrpc_oob_type type = oob->mark; + + switch (type) { + case RXRPC_OOB_CHALLENGE: + *_peer = sp->chall.conn->peer; + *_peer_appdata = sp->chall.conn->peer->app_data; + break; + default: + WARN_ON_ONCE(1); + *_peer = NULL; + *_peer_appdata = 0; + break; + } + + return type; +} +EXPORT_SYMBOL(rxrpc_kernel_query_oob); + +/** + * rxrpc_kernel_dequeue_oob - Dequeue and return the front OOB message + * @sock: The socket to query + * @_type: Where to return the message type + * + * Dequeue the front OOB message, if there is one, and return it and + * its type. + * + * Return: The sk_buff representing the OOB message or %NULL if the queue was + * empty. + */ +struct sk_buff *rxrpc_kernel_dequeue_oob(struct socket *sock, + enum rxrpc_oob_type *_type) +{ + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + struct sk_buff *oob; + + oob = skb_dequeue(&rx->recvmsg_oobq); + if (oob) + *_type = oob->mark; + return oob; +} +EXPORT_SYMBOL(rxrpc_kernel_dequeue_oob); + +/** + * rxrpc_kernel_free_oob - Free an out-of-band message + * @oob: The OOB message to free + * + * Free an OOB message along with any resources it holds. + */ +void rxrpc_kernel_free_oob(struct sk_buff *oob) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(oob); + + switch (oob->mark) { + case RXRPC_OOB_CHALLENGE: + rxrpc_put_connection(sp->chall.conn, rxrpc_conn_put_oob); + break; + } + + rxrpc_free_skb(oob, rxrpc_skb_put_purge_oob); +} +EXPORT_SYMBOL(rxrpc_kernel_free_oob); + +/** + * rxrpc_kernel_query_challenge - Query the parameters of a challenge + * @challenge: The challenge to query + * @_peer: Where to return the peer record + * @_peer_appdata: The application data attached to a peer record + * @_service_id: Where to return the connection service ID + * @_security_index: Where to return the connection security index + * + * Extract useful parameters from a CHALLENGE message. + */ +void rxrpc_kernel_query_challenge(struct sk_buff *challenge, + struct rxrpc_peer **_peer, + unsigned long *_peer_appdata, + u16 *_service_id, u8 *_security_index) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(challenge); + + *_peer = sp->chall.conn->peer; + *_peer_appdata = sp->chall.conn->peer->app_data; + *_service_id = sp->hdr.serviceId; + *_security_index = sp->hdr.securityIndex; +} +EXPORT_SYMBOL(rxrpc_kernel_query_challenge); + +/** + * rxrpc_kernel_reject_challenge - Allow a kernel service to reject a challenge + * @challenge: The challenge to be rejected + * @abort_code: The abort code to stick into the ABORT packet + * @error: Local error value + * @why: Indication as to why. + * + * Allow a kernel service to reject a challenge by aborting the connection if + * it's still in an abortable state. The error is returned so this function + * can be used with a return statement. + * + * Return: The %error parameter. + */ +int rxrpc_kernel_reject_challenge(struct sk_buff *challenge, u32 abort_code, + int error, enum rxrpc_abort_reason why) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(challenge); + + _enter("{%x},%d,%d,%u", sp->hdr.serial, abort_code, error, why); + + rxrpc_abort_conn(sp->chall.conn, NULL, abort_code, error, why); + return error; +} +EXPORT_SYMBOL(rxrpc_kernel_reject_challenge); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 5ea9601efd05..ef7b3096c95e 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -18,7 +18,7 @@ extern int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); -static ssize_t do_udp_sendmsg(struct socket *socket, struct msghdr *msg, size_t len) +ssize_t do_udp_sendmsg(struct socket *socket, struct msghdr *msg, size_t len) { struct sockaddr *sa = msg->msg_name; struct sock *sk = socket->sk; @@ -72,22 +72,96 @@ static void rxrpc_set_keepalive(struct rxrpc_call *call, ktime_t now) } /* + * Allocate transmission buffers for an ACK and attach them to local->kv[]. + */ +static int rxrpc_alloc_ack(struct rxrpc_call *call, size_t sack_size) +{ + struct rxrpc_wire_header *whdr; + struct rxrpc_acktrailer *trailer; + struct rxrpc_ackpacket *ack; + struct kvec *kv = call->local->kvec; + gfp_t gfp = rcu_read_lock_held() ? GFP_ATOMIC | __GFP_NOWARN : GFP_NOFS; + void *buf, *buf2 = NULL; + u8 *filler; + + buf = page_frag_alloc(&call->local->tx_alloc, + sizeof(*whdr) + sizeof(*ack) + 1 + 3 + sizeof(*trailer), gfp); + if (!buf) + return -ENOMEM; + + if (sack_size) { + buf2 = page_frag_alloc(&call->local->tx_alloc, sack_size, gfp); + if (!buf2) { + page_frag_free(buf); + return -ENOMEM; + } + } + + whdr = buf; + ack = buf + sizeof(*whdr); + filler = buf + sizeof(*whdr) + sizeof(*ack) + 1; + trailer = buf + sizeof(*whdr) + sizeof(*ack) + 1 + 3; + + kv[0].iov_base = whdr; + kv[0].iov_len = sizeof(*whdr) + sizeof(*ack); + kv[1].iov_base = buf2; + kv[1].iov_len = sack_size; + kv[2].iov_base = filler; + kv[2].iov_len = 3 + sizeof(*trailer); + return 3; /* Number of kvec[] used. */ +} + +static void rxrpc_free_ack(struct rxrpc_call *call) +{ + page_frag_free(call->local->kvec[0].iov_base); + if (call->local->kvec[1].iov_base) + page_frag_free(call->local->kvec[1].iov_base); +} + +/* + * Record the beginning of an RTT probe. + */ +static void rxrpc_begin_rtt_probe(struct rxrpc_call *call, rxrpc_serial_t serial, + ktime_t now, enum rxrpc_rtt_tx_trace why) +{ + unsigned long avail = call->rtt_avail; + int rtt_slot = 9; + + if (!(avail & RXRPC_CALL_RTT_AVAIL_MASK)) + goto no_slot; + + rtt_slot = __ffs(avail & RXRPC_CALL_RTT_AVAIL_MASK); + if (!test_and_clear_bit(rtt_slot, &call->rtt_avail)) + goto no_slot; + + call->rtt_serial[rtt_slot] = serial; + call->rtt_sent_at[rtt_slot] = now; + smp_wmb(); /* Write data before avail bit */ + set_bit(rtt_slot + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail); + + trace_rxrpc_rtt_tx(call, why, rtt_slot, serial); + return; + +no_slot: + trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_no_slot, rtt_slot, serial); +} + +/* * Fill out an ACK packet. */ -static void rxrpc_fill_out_ack(struct rxrpc_call *call, - struct rxrpc_txbuf *txb, - u8 ack_reason, - rxrpc_serial_t serial) +static int rxrpc_fill_out_ack(struct rxrpc_call *call, int nr_kv, u8 ack_reason, + rxrpc_serial_t serial_to_ack, rxrpc_serial_t *_ack_serial) { - struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; - struct rxrpc_acktrailer *trailer = txb->kvec[2].iov_base + 3; + struct kvec *kv = call->local->kvec; + struct rxrpc_wire_header *whdr = kv[0].iov_base; + struct rxrpc_acktrailer *trailer = kv[2].iov_base + 3; struct rxrpc_ackpacket *ack = (struct rxrpc_ackpacket *)(whdr + 1); - unsigned int qsize, sack, wrap, to; + unsigned int qsize, sack, wrap, to, max_mtu, if_mtu; rxrpc_seq_t window, wtop; + ktime_t now = ktime_get_real(); int rsize; - u32 mtu, jmax; - u8 *filler = txb->kvec[2].iov_base; - u8 *sackp = txb->kvec[1].iov_base; + u8 *filler = kv[2].iov_base; + u8 *sackp = kv[1].iov_base; rxrpc_inc_stat(call->rxnet, stat_tx_ack_fill); @@ -95,14 +169,25 @@ static void rxrpc_fill_out_ack(struct rxrpc_call *call, wtop = call->ackr_wtop; sack = call->ackr_sack_base % RXRPC_SACK_SIZE; + *_ack_serial = rxrpc_get_next_serial(call->conn); + + whdr->epoch = htonl(call->conn->proto.epoch); + whdr->cid = htonl(call->cid); + whdr->callNumber = htonl(call->call_id); + whdr->serial = htonl(*_ack_serial); whdr->seq = 0; whdr->type = RXRPC_PACKET_TYPE_ACK; - txb->flags |= RXRPC_SLOW_START_OK; + whdr->flags = call->conn->out_clientflag | RXRPC_SLOW_START_OK; + whdr->userStatus = 0; + whdr->securityIndex = call->security_ix; + whdr->_rsvd = 0; + whdr->serviceId = htons(call->dest_srx.srx_service); + ack->bufferSpace = 0; ack->maxSkew = 0; ack->firstPacket = htonl(window); ack->previousPacket = htonl(call->rx_highest_seq); - ack->serial = htonl(serial); + ack->serial = htonl(serial_to_ack); ack->reason = ack_reason; ack->nAcks = wtop - window; filler[0] = 0; @@ -110,15 +195,13 @@ static void rxrpc_fill_out_ack(struct rxrpc_call *call, filler[2] = 0; if (ack_reason == RXRPC_ACK_PING) - txb->flags |= RXRPC_REQUEST_ACK; + whdr->flags |= RXRPC_REQUEST_ACK; if (after(wtop, window)) { - txb->len += ack->nAcks; - txb->kvec[1].iov_base = sackp; - txb->kvec[1].iov_len = ack->nAcks; + kv[1].iov_len = ack->nAcks; wrap = RXRPC_SACK_SIZE - sack; - to = min_t(unsigned int, ack->nAcks, RXRPC_SACK_SIZE); + to = umin(ack->nAcks, RXRPC_SACK_SIZE); if (sack + ack->nAcks <= RXRPC_SACK_SIZE) { memcpy(sackp, call->ackr_sack_table + sack, ack->nAcks); @@ -132,56 +215,42 @@ static void rxrpc_fill_out_ack(struct rxrpc_call *call, ack->reason = RXRPC_ACK_IDLE; } - mtu = call->peer->if_mtu; - mtu -= call->peer->hdrsize; - jmax = rxrpc_rx_jumbo_max; qsize = (window - 1) - call->rx_consumed; rsize = max_t(int, call->rx_winsize - qsize, 0); - txb->ack_rwind = rsize; - trailer->maxMTU = htonl(rxrpc_rx_mtu); - trailer->ifMTU = htonl(mtu); - trailer->rwind = htonl(rsize); - trailer->jumbo_max = htonl(jmax); -} - -/* - * Record the beginning of an RTT probe. - */ -static void rxrpc_begin_rtt_probe(struct rxrpc_call *call, rxrpc_serial_t serial, - ktime_t now, enum rxrpc_rtt_tx_trace why) -{ - unsigned long avail = call->rtt_avail; - int rtt_slot = 9; - - if (!(avail & RXRPC_CALL_RTT_AVAIL_MASK)) - goto no_slot; - - rtt_slot = __ffs(avail & RXRPC_CALL_RTT_AVAIL_MASK); - if (!test_and_clear_bit(rtt_slot, &call->rtt_avail)) - goto no_slot; - call->rtt_serial[rtt_slot] = serial; - call->rtt_sent_at[rtt_slot] = now; - smp_wmb(); /* Write data before avail bit */ - set_bit(rtt_slot + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail); + if_mtu = call->peer->if_mtu - call->peer->hdrsize; + if (call->peer->ackr_adv_pmtud) { + max_mtu = umax(call->peer->max_data, rxrpc_rx_mtu); + } else { + if_mtu = umin(if_mtu, 1444); + max_mtu = if_mtu; + } - trace_rxrpc_rtt_tx(call, why, rtt_slot, serial); - return; + trailer->maxMTU = htonl(max_mtu); + trailer->ifMTU = htonl(if_mtu); + trailer->rwind = htonl(rsize); + trailer->jumbo_max = 0; /* Advertise pmtu discovery */ -no_slot: - trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_no_slot, rtt_slot, serial); + if (ack_reason == RXRPC_ACK_PING) + rxrpc_begin_rtt_probe(call, *_ack_serial, now, rxrpc_rtt_tx_ping); + if (whdr->flags & RXRPC_REQUEST_ACK) + call->rtt_last_req = now; + rxrpc_set_keepalive(call, now); + return nr_kv; } /* * Transmit an ACK packet. */ -static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) +static void rxrpc_send_ack_packet(struct rxrpc_call *call, int nr_kv, size_t len, + rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why) { - struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; + struct kvec *kv = call->local->kvec; + struct rxrpc_wire_header *whdr = kv[0].iov_base; + struct rxrpc_acktrailer *trailer = kv[2].iov_base + 3; struct rxrpc_connection *conn; struct rxrpc_ackpacket *ack = (struct rxrpc_ackpacket *)(whdr + 1); struct msghdr msg; - ktime_t now; int ret; if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) @@ -195,33 +264,34 @@ static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t msg.msg_controllen = 0; msg.msg_flags = MSG_SPLICE_PAGES; - whdr->flags = txb->flags & RXRPC_TXBUF_WIRE_FLAGS; - - txb->serial = rxrpc_get_next_serial(conn); - whdr->serial = htonl(txb->serial); - trace_rxrpc_tx_ack(call->debug_id, txb->serial, + trace_rxrpc_tx_ack(call->debug_id, serial, ntohl(ack->firstPacket), ntohl(ack->serial), ack->reason, ack->nAcks, - txb->ack_rwind); + ntohl(trailer->rwind), why); rxrpc_inc_stat(call->rxnet, stat_tx_ack_send); - iov_iter_kvec(&msg.msg_iter, WRITE, txb->kvec, txb->nr_kvec, txb->len); - rxrpc_local_dont_fragment(conn->local, false); - ret = do_udp_sendmsg(conn->local->socket, &msg, txb->len); + iov_iter_kvec(&msg.msg_iter, WRITE, kv, nr_kv, len); + rxrpc_local_dont_fragment(conn->local, why == rxrpc_propose_ack_ping_for_mtu_probe); + + ret = do_udp_sendmsg(conn->local->socket, &msg, len); call->peer->last_tx_at = ktime_get_seconds(); if (ret < 0) { - trace_rxrpc_tx_fail(call->debug_id, txb->serial, ret, + trace_rxrpc_tx_fail(call->debug_id, serial, ret, rxrpc_tx_point_call_ack); + if (why == rxrpc_propose_ack_ping_for_mtu_probe && + ret == -EMSGSIZE) + rxrpc_input_probe_for_pmtud(conn, serial, true); } else { trace_rxrpc_tx_packet(call->debug_id, whdr, rxrpc_tx_point_call_ack); - now = ktime_get_real(); - if (ack->reason == RXRPC_ACK_PING) - rxrpc_begin_rtt_probe(call, txb->serial, now, rxrpc_rtt_tx_ping); - if (txb->flags & RXRPC_REQUEST_ACK) - call->peer->rtt_last_req = now; - rxrpc_set_keepalive(call, now); + if (why == rxrpc_propose_ack_ping_for_mtu_probe) { + call->peer->pmtud_pending = false; + call->peer->pmtud_probing = true; + call->conn->pmtud_probe = serial; + call->conn->pmtud_call = call->debug_id; + trace_rxrpc_pmtud_tx(call); + } } rxrpc_tx_backoff(call, ret); } @@ -230,31 +300,62 @@ static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t * Queue an ACK for immediate transmission. */ void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, - rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why) + rxrpc_serial_t serial_to_ack, enum rxrpc_propose_ack_trace why) { - struct rxrpc_txbuf *txb; + struct kvec *kv = call->local->kvec; + rxrpc_serial_t ack_serial; + size_t len; + int nr_kv; if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) return; rxrpc_inc_stat(call->rxnet, stat_tx_acks[ack_reason]); - txb = rxrpc_alloc_ack_txbuf(call, call->ackr_wtop - call->ackr_window); - if (!txb) { + nr_kv = rxrpc_alloc_ack(call, call->ackr_wtop - call->ackr_window); + if (nr_kv < 0) { kleave(" = -ENOMEM"); return; } - txb->ack_why = why; + nr_kv = rxrpc_fill_out_ack(call, nr_kv, ack_reason, serial_to_ack, &ack_serial); + len = kv[0].iov_len; + len += kv[1].iov_len; + len += kv[2].iov_len; + + /* Extend a path MTU probe ACK. */ + if (why == rxrpc_propose_ack_ping_for_mtu_probe) { + size_t probe_mtu = call->peer->pmtud_trial + sizeof(struct rxrpc_wire_header); + + if (len > probe_mtu) + goto skip; + while (len < probe_mtu) { + size_t part = umin(probe_mtu - len, PAGE_SIZE); + + kv[nr_kv].iov_base = page_address(ZERO_PAGE(0)); + kv[nr_kv].iov_len = part; + len += part; + nr_kv++; + } + } - rxrpc_fill_out_ack(call, txb, ack_reason, serial); call->ackr_nr_unacked = 0; atomic_set(&call->ackr_nr_consumed, 0); clear_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags); - trace_rxrpc_send_ack(call, why, ack_reason, serial); - rxrpc_send_ack_packet(call, txb); - rxrpc_put_txbuf(txb, rxrpc_txbuf_put_ack_tx); + trace_rxrpc_send_ack(call, why, ack_reason, ack_serial); + rxrpc_send_ack_packet(call, nr_kv, len, ack_serial, why); +skip: + rxrpc_free_ack(call); +} + +/* + * Send an ACK probe for path MTU discovery. + */ +void rxrpc_send_probe_for_pmtud(struct rxrpc_call *call) +{ + rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, + rxrpc_propose_ack_ping_for_mtu_probe); } /* @@ -324,14 +425,21 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call) /* * Prepare a (sub)packet for transmission. */ -static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_txbuf *txb, - rxrpc_serial_t serial) +static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, + struct rxrpc_send_data_req *req, + struct rxrpc_txbuf *txb, + struct rxrpc_wire_header *whdr, + rxrpc_serial_t serial, int subpkt) { - struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; + struct rxrpc_jumbo_header *jumbo = txb->data - sizeof(*jumbo); enum rxrpc_req_ack_trace why; struct rxrpc_connection *conn = call->conn; + struct kvec *kv = &call->local->kvec[1 + subpkt]; + size_t len = txb->pkt_len; + bool last; + u8 flags; - _enter("%x,{%d}", txb->seq, txb->len); + _enter("%x,%zd", txb->seq, len); txb->serial = serial; @@ -339,6 +447,15 @@ static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_t txb->seq == 1) whdr->userStatus = RXRPC_USERSTATUS_SERVICE_UPGRADE; + txb->flags &= ~RXRPC_REQUEST_ACK; + flags = txb->flags & RXRPC_TXBUF_WIRE_FLAGS; + last = txb->flags & RXRPC_LAST_PACKET; + + if (subpkt < req->n - 1) { + len = RXRPC_JUMBO_DATALEN; + goto dont_set_request_ack; + } + /* If our RTT cache needs working on, request an ACK. Also request * ACKs if a DATA packet appears to have been lost. * @@ -346,113 +463,208 @@ static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_t * service call, lest OpenAFS incorrectly send us an ACK with some * soft-ACKs in it and then never follow up with a proper hard ACK. */ - if (txb->flags & RXRPC_REQUEST_ACK) - why = rxrpc_reqack_already_on; - else if ((txb->flags & RXRPC_LAST_PACKET) && rxrpc_sending_to_client(txb)) + if (last && rxrpc_sending_to_client(txb)) why = rxrpc_reqack_no_srv_last; else if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events)) why = rxrpc_reqack_ack_lost; else if (txb->flags & RXRPC_TXBUF_RESENT) why = rxrpc_reqack_retrans; - else if (call->cong_mode == RXRPC_CALL_SLOW_START && call->cong_cwnd <= 2) + else if (call->cong_ca_state == RXRPC_CA_SLOW_START && call->cong_cwnd <= RXRPC_MIN_CWND) why = rxrpc_reqack_slow_start; else if (call->tx_winsize <= 2) why = rxrpc_reqack_small_txwin; - else if (call->peer->rtt_count < 3 && txb->seq & 1) + else if (call->rtt_count < 3) why = rxrpc_reqack_more_rtt; - else if (ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), ktime_get_real())) + else if (ktime_before(ktime_add_ms(call->rtt_last_req, 1000), ktime_get_real())) why = rxrpc_reqack_old_rtt; + else if (!last && !after(READ_ONCE(call->send_top), txb->seq)) + why = rxrpc_reqack_app_stall; else goto dont_set_request_ack; rxrpc_inc_stat(call->rxnet, stat_why_req_ack[why]); trace_rxrpc_req_ack(call->debug_id, txb->seq, why); - if (why != rxrpc_reqack_no_srv_last) - txb->flags |= RXRPC_REQUEST_ACK; + if (why != rxrpc_reqack_no_srv_last) { + flags |= RXRPC_REQUEST_ACK; + trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_data, -1, serial); + call->rtt_last_req = req->now; + } dont_set_request_ack: - whdr->flags = txb->flags & RXRPC_TXBUF_WIRE_FLAGS; - whdr->serial = htonl(txb->serial); - whdr->cksum = txb->cksum; + /* There's a jumbo header prepended to the data if we need it. */ + if (subpkt < req->n - 1) + flags |= RXRPC_JUMBO_PACKET; + else + flags &= ~RXRPC_JUMBO_PACKET; + if (subpkt == 0) { + whdr->flags = flags; + whdr->cksum = txb->cksum; + kv->iov_base = txb->data; + } else { + jumbo->flags = flags; + jumbo->pad = 0; + jumbo->cksum = txb->cksum; + kv->iov_base = jumbo; + len += sizeof(*jumbo); + } - trace_rxrpc_tx_data(call, txb->seq, txb->serial, txb->flags, false); + trace_rxrpc_tx_data(call, txb->seq, txb->serial, flags, req->trace); + kv->iov_len = len; + return len; } /* - * Prepare a packet for transmission. + * Prepare a transmission queue object for initial transmission. Returns the + * number of microseconds since the transmission queue base timestamp. */ -static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) +static unsigned int rxrpc_prepare_txqueue(struct rxrpc_txqueue *tq, + struct rxrpc_send_data_req *req) { - rxrpc_serial_t serial; - - /* Each transmission of a Tx packet needs a new serial number */ - serial = rxrpc_get_next_serial(call->conn); - - rxrpc_prepare_data_subpacket(call, txb, serial); - - return txb->len; + if (!tq) + return 0; + if (tq->xmit_ts_base == KTIME_MIN) { + tq->xmit_ts_base = req->now; + return 0; + } + return ktime_to_us(ktime_sub(req->now, tq->xmit_ts_base)); } /* - * Set timeouts after transmitting a packet. + * Prepare a (jumbo) packet for transmission. */ -static void rxrpc_tstamp_data_packets(struct rxrpc_call *call, struct rxrpc_txbuf *txb) +static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, + struct rxrpc_send_data_req *req, + struct rxrpc_wire_header *whdr) { - ktime_t now = ktime_get_real(); - bool ack_requested = txb->flags & RXRPC_REQUEST_ACK; + struct rxrpc_txqueue *tq = req->tq; + rxrpc_serial_t serial; + unsigned int xmit_ts; + rxrpc_seq_t seq = req->seq; + size_t len = 0; + bool start_tlp = false; - call->tx_last_sent = now; - txb->last_sent = now; + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_transmit); - if (ack_requested) { - rxrpc_begin_rtt_probe(call, txb->serial, now, rxrpc_rtt_tx_data); + /* Each transmission of a Tx packet needs a new serial number */ + serial = rxrpc_get_next_serials(call->conn, req->n); + + whdr->epoch = htonl(call->conn->proto.epoch); + whdr->cid = htonl(call->cid); + whdr->callNumber = htonl(call->call_id); + whdr->seq = htonl(seq); + whdr->serial = htonl(serial); + whdr->type = RXRPC_PACKET_TYPE_DATA; + whdr->flags = 0; + whdr->userStatus = 0; + whdr->securityIndex = call->security_ix; + whdr->_rsvd = 0; + whdr->serviceId = htons(call->conn->service_id); + + call->tx_last_serial = serial + req->n - 1; + call->tx_last_sent = req->now; + xmit_ts = rxrpc_prepare_txqueue(tq, req); + prefetch(tq->next); + + for (int i = 0;;) { + int ix = seq & RXRPC_TXQ_MASK; + struct rxrpc_txbuf *txb = tq->bufs[seq & RXRPC_TXQ_MASK]; + + _debug("prep[%u] tq=%x q=%x", i, tq->qbase, seq); + + /* Record (re-)transmission for RACK [RFC8985 6.1]. */ + if (__test_and_clear_bit(ix, &tq->segment_lost)) + call->tx_nr_lost--; + if (req->retrans) { + __set_bit(ix, &tq->ever_retransmitted); + __set_bit(ix, &tq->segment_retransmitted); + call->tx_nr_resent++; + } else { + call->tx_nr_sent++; + start_tlp = true; + } + tq->segment_xmit_ts[ix] = xmit_ts; + tq->segment_serial[ix] = serial; + if (i + 1 == req->n) + /* Only sample the last subpacket in a jumbo. */ + __set_bit(ix, &tq->rtt_samples); + len += rxrpc_prepare_data_subpacket(call, req, txb, whdr, serial, i); + serial++; + seq++; + i++; + if (i >= req->n) + break; + if (!(seq & RXRPC_TXQ_MASK)) { + tq = tq->next; + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_transmit_advance); + xmit_ts = rxrpc_prepare_txqueue(tq, req); + } + } - call->peer->rtt_last_req = now; - if (call->peer->rtt_count > 1) { - ktime_t delay = rxrpc_get_rto_backoff(call->peer, false); + /* Set timeouts */ + if (req->tlp_probe) { + /* Sending TLP loss probe [RFC8985 7.3]. */ + call->tlp_serial = serial - 1; + call->tlp_seq = seq - 1; + } else if (start_tlp) { + /* Schedule TLP loss probe [RFC8985 7.2]. */ + ktime_t pto; + + if (!test_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags)) + /* The first packet may take longer to elicit a response. */ + pto = NSEC_PER_SEC; + else + pto = rxrpc_tlp_calc_pto(call, req->now); - call->ack_lost_at = ktime_add(now, delay); - trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_lost_ack); - } + call->rack_timer_mode = RXRPC_CALL_RACKTIMER_TLP_PTO; + call->rack_timo_at = ktime_add(req->now, pto); + trace_rxrpc_rack_timer(call, pto, false); + trace_rxrpc_timer_set(call, pto, rxrpc_timer_trace_rack_tlp_pto); } if (!test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags)) { ktime_t delay = ms_to_ktime(READ_ONCE(call->next_rx_timo)); - call->expect_rx_by = ktime_add(now, delay); + call->expect_rx_by = ktime_add(req->now, delay); trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_expect_rx); } - rxrpc_set_keepalive(call, now); + rxrpc_set_keepalive(call, req->now); + page_frag_free(whdr); + return len; } /* - * send a packet through the transport endpoint + * Send one or more packets through the transport endpoint */ -static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) +void rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req *req) { - struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; + struct rxrpc_wire_header *whdr; struct rxrpc_connection *conn = call->conn; enum rxrpc_tx_point frag; + struct rxrpc_txqueue *tq = req->tq; + struct rxrpc_txbuf *txb; struct msghdr msg; - size_t len; - int ret; + rxrpc_seq_t seq = req->seq; + size_t len = sizeof(*whdr); + bool new_call = test_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags); + int ret, stat_ix; - _enter("%x,{%d}", txb->seq, txb->len); + _enter("%x,%x-%x", tq->qbase, seq, seq + req->n - 1); - len = rxrpc_prepare_data_packet(call, txb); + whdr = page_frag_alloc(&call->local->tx_alloc, sizeof(*whdr), GFP_NOFS); + if (!whdr) + return; /* Drop the packet if no memory. */ - if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { - static int lose; - if ((lose++ & 7) == 7) { - ret = 0; - trace_rxrpc_tx_data(call, txb->seq, txb->serial, - txb->flags, true); - goto done; - } - } + call->local->kvec[0].iov_base = whdr; + call->local->kvec[0].iov_len = sizeof(*whdr); + + stat_ix = umin(req->n, ARRAY_SIZE(call->rxnet->stat_tx_jumbo)) - 1; + atomic_inc(&call->rxnet->stat_tx_jumbo[stat_ix]); - iov_iter_kvec(&msg.msg_iter, WRITE, txb->kvec, txb->nr_kvec, len); + len += rxrpc_prepare_data_packet(call, req, whdr); + txb = tq->bufs[seq & RXRPC_TXQ_MASK]; + + iov_iter_kvec(&msg.msg_iter, WRITE, call->local->kvec, 1 + req->n, len); msg.msg_name = &call->peer->srx.transport; msg.msg_namelen = call->peer->srx.transport_len; @@ -460,16 +672,11 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t msg.msg_controllen = 0; msg.msg_flags = MSG_SPLICE_PAGES; - /* Track what we've attempted to transmit at least once so that the - * retransmission algorithm doesn't try to resend what we haven't sent - * yet. + /* Send the packet with the don't fragment bit set unless we think it's + * too big or if this is a retransmission. */ - if (txb->seq == call->tx_transmitted + 1) - call->tx_transmitted = txb->seq; - - /* send the packet with the don't fragment bit set if we currently - * think it's small enough */ - if (txb->len >= call->peer->maxdata) { + if (seq == call->tx_transmitted + 1 && + len >= sizeof(struct rxrpc_wire_header) + call->peer->max_data) { rxrpc_local_dont_fragment(conn->local, false); frag = rxrpc_tx_point_call_data_frag; } else { @@ -477,7 +684,25 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t frag = rxrpc_tx_point_call_data_nofrag; } -retry: + /* Track what we've attempted to transmit at least once so that the + * retransmission algorithm doesn't try to resend what we haven't sent + * yet. + */ + if (seq == call->tx_transmitted + 1) + call->tx_transmitted = seq + req->n - 1; + + if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { + static int lose; + + if ((lose++ & 7) == 7) { + ret = 0; + trace_rxrpc_tx_data(call, txb->seq, txb->serial, txb->flags, + rxrpc_txdata_inject_loss); + conn->peer->last_tx_at = ktime_get_seconds(); + goto done; + } + } + /* send the packet by UDP * - returns -EMSGSIZE if UDP would have to fragment the packet * to go out of the interface @@ -488,7 +713,11 @@ retry: ret = do_udp_sendmsg(conn->local->socket, &msg, len); conn->peer->last_tx_at = ktime_get_seconds(); - if (ret < 0) { + if (ret == -EMSGSIZE) { + rxrpc_inc_stat(call->rxnet, stat_tx_data_send_msgsize); + trace_rxrpc_tx_packet(call->debug_id, whdr, frag); + ret = 0; + } else if (ret < 0) { rxrpc_inc_stat(call->rxnet, stat_tx_data_send_fail); trace_rxrpc_tx_fail(call->debug_id, txb->serial, ret, frag); } else { @@ -496,28 +725,23 @@ retry: } rxrpc_tx_backoff(call, ret); - if (ret == -EMSGSIZE && frag == rxrpc_tx_point_call_data_frag) { - rxrpc_local_dont_fragment(conn->local, false); - frag = rxrpc_tx_point_call_data_frag; - goto retry; - } -done: - if (ret >= 0) { - rxrpc_tstamp_data_packets(call, txb); - } else { - /* Cancel the call if the initial transmission fails, - * particularly if that's due to network routing issues that - * aren't going away anytime soon. The layer above can arrange - * the retransmission. + if (ret < 0) { + /* Cancel the call if the initial transmission fails or if we + * hit due to network routing issues that aren't going away + * anytime soon. The layer above can arrange the + * retransmission. */ - if (!test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags)) + if (new_call || + ret == -ENETUNREACH || + ret == -EHOSTUNREACH || + ret == -ECONNREFUSED) rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, RX_USER_ABORT, ret); } - _leave(" = %d [%u]", ret, call->peer->maxdata); - return ret; +done: + _leave(" = %d [%u]", ret, call->peer->max_data); } /* @@ -694,39 +918,62 @@ void rxrpc_send_keepalive(struct rxrpc_peer *peer) } /* - * Schedule an instant Tx resend. + * Send a RESPONSE message. */ -static inline void rxrpc_instant_resend(struct rxrpc_call *call, - struct rxrpc_txbuf *txb) +void rxrpc_send_response(struct rxrpc_connection *conn, struct sk_buff *response) { - if (!__rxrpc_call_is_complete(call)) - kdebug("resend"); -} + struct rxrpc_skb_priv *sp = rxrpc_skb(response); + struct scatterlist sg[16]; + struct bio_vec *bvec = conn->local->bvec; + struct msghdr msg; + size_t len = sp->resp.len; + __be32 wserial; + u32 serial = 0; + int ret, nr_sg; -/* - * Transmit one packet. - */ -void rxrpc_transmit_one(struct rxrpc_call *call, struct rxrpc_txbuf *txb) -{ - int ret; + _enter("C=%x,%x", conn->debug_id, sp->resp.challenge_serial); - ret = rxrpc_send_data_packet(call, txb); - if (ret < 0) { - switch (ret) { - case -ENETUNREACH: - case -EHOSTUNREACH: - case -ECONNREFUSED: - rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, - 0, ret); - break; - default: - _debug("need instant resend %d", ret); - rxrpc_instant_resend(call, txb); - } - } else { - ktime_t delay = ns_to_ktime(call->peer->rto_us * NSEC_PER_USEC); + sg_init_table(sg, ARRAY_SIZE(sg)); + ret = skb_to_sgvec(response, sg, 0, len); + if (ret < 0) + goto fail; + nr_sg = ret; + ret = -EIO; + if (WARN_ON_ONCE(nr_sg > ARRAY_SIZE(conn->local->bvec))) + goto fail; - call->resend_at = ktime_add(ktime_get_real(), delay); - trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_resend_tx); - } + for (int i = 0; i < nr_sg; i++) + bvec_set_page(&bvec[i], sg_page(&sg[i]), sg[i].length, sg[i].offset); + + iov_iter_bvec(&msg.msg_iter, WRITE, bvec, nr_sg, len); + + msg.msg_name = &conn->peer->srx.transport; + msg.msg_namelen = conn->peer->srx.transport_len; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = MSG_SPLICE_PAGES; + + serial = rxrpc_get_next_serials(conn, 1); + wserial = htonl(serial); + + trace_rxrpc_tx_response(conn, serial, sp); + + ret = skb_store_bits(response, offsetof(struct rxrpc_wire_header, serial), + &wserial, sizeof(wserial)); + if (ret < 0) + goto fail; + + rxrpc_local_dont_fragment(conn->local, false); + + ret = do_udp_sendmsg(conn->local->socket, &msg, len); + if (ret < 0) + goto fail; + + conn->peer->last_tx_at = ktime_get_seconds(); + return; + +fail: + trace_rxrpc_tx_fail(conn->debug_id, serial, ret, + rxrpc_tx_point_response); + kleave(" = %d", ret); } diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 552ba84a255c..7f4729234957 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -102,6 +102,8 @@ static struct rxrpc_peer *rxrpc_lookup_peer_local_rcu(struct rxrpc_local *local, */ static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu) { + unsigned int max_data; + /* wind down the local interface MTU */ if (mtu > 0 && peer->if_mtu == 65535 && mtu < peer->if_mtu) peer->if_mtu = mtu; @@ -120,11 +122,15 @@ static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu) } } - if (mtu < peer->mtu) { - spin_lock(&peer->lock); - peer->mtu = mtu; - peer->maxdata = peer->mtu - peer->hdrsize; - spin_unlock(&peer->lock); + max_data = max_t(int, mtu - peer->hdrsize, 500); + if (max_data < peer->max_data) { + if (peer->pmtud_good > max_data) + peer->pmtud_good = max_data; + if (peer->pmtud_bad > max_data + 1) + peer->pmtud_bad = max_data + 1; + + trace_rxrpc_pmtud_reduce(peer, 0, max_data, rxrpc_pmtud_reduce_icmp); + peer->max_data = max_data; } } @@ -161,6 +167,13 @@ void rxrpc_input_error(struct rxrpc_local *local, struct sk_buff *skb) goto out; } + if ((serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6 && + serr->ee.ee_type == ICMPV6_PKT_TOOBIG && + serr->ee.ee_code == 0)) { + rxrpc_adjust_mtu(peer, serr->ee.ee_info); + goto out; + } + rxrpc_store_error(peer, skb); out: rxrpc_put_peer(peer, rxrpc_peer_put_input_error); @@ -205,23 +218,23 @@ static void rxrpc_distribute_error(struct rxrpc_peer *peer, struct sk_buff *skb, struct rxrpc_call *call; HLIST_HEAD(error_targets); - spin_lock(&peer->lock); + spin_lock_irq(&peer->lock); hlist_move_list(&peer->error_targets, &error_targets); while (!hlist_empty(&error_targets)) { call = hlist_entry(error_targets.first, struct rxrpc_call, error_link); hlist_del_init(&call->error_link); - spin_unlock(&peer->lock); + spin_unlock_irq(&peer->lock); rxrpc_see_call(call, rxrpc_call_see_distribute_error); rxrpc_set_call_completion(call, compl, 0, -err); - rxrpc_input_call_event(call, skb); + rxrpc_input_call_event(call); - spin_lock(&peer->lock); + spin_lock_irq(&peer->lock); } - spin_unlock(&peer->lock); + spin_unlock_irq(&peer->lock); } /* @@ -238,7 +251,7 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, bool use; int slot; - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); while (!list_empty(collector)) { peer = list_entry(collector->next, @@ -249,7 +262,7 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, continue; use = __rxrpc_use_local(peer->local, rxrpc_local_use_peer_keepalive); - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); if (use) { keepalive_at = peer->last_tx_at + RXRPC_KEEPALIVE_TIME; @@ -269,17 +282,17 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, */ slot += cursor; slot &= mask; - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); list_add_tail(&peer->keepalive_link, &rxnet->peer_keepalive[slot & mask]); - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); rxrpc_unuse_local(peer->local, rxrpc_local_unuse_peer_keepalive); } rxrpc_put_peer(peer, rxrpc_peer_put_keepalive); - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); } - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); } /* @@ -309,7 +322,7 @@ void rxrpc_peer_keepalive_worker(struct work_struct *work) * second; the bucket at cursor + 1 goes at now + 1s and so * on... */ - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); list_splice_init(&rxnet->peer_keepalive_new, &collector); stop = cursor + ARRAY_SIZE(rxnet->peer_keepalive); @@ -321,7 +334,7 @@ void rxrpc_peer_keepalive_worker(struct work_struct *work) } base = now; - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); rxnet->peer_keepalive_base = base; rxnet->peer_keepalive_cursor = cursor; @@ -347,3 +360,84 @@ void rxrpc_peer_keepalive_worker(struct work_struct *work) _leave(""); } + +/* + * Do path MTU probing. + */ +void rxrpc_input_probe_for_pmtud(struct rxrpc_connection *conn, rxrpc_serial_t acked_serial, + bool sendmsg_fail) +{ + struct rxrpc_peer *peer = conn->peer; + unsigned int max_data = peer->max_data; + int good, trial, bad, jumbo; + + good = peer->pmtud_good; + trial = peer->pmtud_trial; + bad = peer->pmtud_bad; + if (good >= bad - 1) { + conn->pmtud_probe = 0; + peer->pmtud_lost = false; + return; + } + + if (!peer->pmtud_probing) + goto send_probe; + + if (sendmsg_fail || after(acked_serial, conn->pmtud_probe)) { + /* Retry a lost probe. */ + if (!peer->pmtud_lost) { + trace_rxrpc_pmtud_lost(conn, acked_serial); + conn->pmtud_probe = 0; + peer->pmtud_lost = true; + goto send_probe; + } + + /* The probed size didn't seem to get through. */ + bad = trial; + peer->pmtud_bad = bad; + if (bad <= max_data) + max_data = bad - 1; + } else { + /* It did get through. */ + good = trial; + peer->pmtud_good = good; + if (good > max_data) + max_data = good; + } + + max_data = umin(max_data, peer->ackr_max_data); + if (max_data != peer->max_data) + peer->max_data = max_data; + + jumbo = max_data + sizeof(struct rxrpc_jumbo_header); + jumbo /= RXRPC_JUMBO_SUBPKTLEN; + peer->pmtud_jumbo = jumbo; + + trace_rxrpc_pmtud_rx(conn, acked_serial); + conn->pmtud_probe = 0; + peer->pmtud_lost = false; + + if (good < RXRPC_JUMBO(2) && bad > RXRPC_JUMBO(2)) + trial = RXRPC_JUMBO(2); + else if (good < RXRPC_JUMBO(4) && bad > RXRPC_JUMBO(4)) + trial = RXRPC_JUMBO(4); + else if (good < RXRPC_JUMBO(3) && bad > RXRPC_JUMBO(3)) + trial = RXRPC_JUMBO(3); + else if (good < RXRPC_JUMBO(6) && bad > RXRPC_JUMBO(6)) + trial = RXRPC_JUMBO(6); + else if (good < RXRPC_JUMBO(5) && bad > RXRPC_JUMBO(5)) + trial = RXRPC_JUMBO(5); + else if (good < RXRPC_JUMBO(8) && bad > RXRPC_JUMBO(8)) + trial = RXRPC_JUMBO(8); + else if (good < RXRPC_JUMBO(7) && bad > RXRPC_JUMBO(7)) + trial = RXRPC_JUMBO(7); + else + trial = (good + bad) / 2; + peer->pmtud_trial = trial; + + if (good >= bad) + return; + +send_probe: + peer->pmtud_pending = true; +} diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 49dcda67a0d5..e2f35e6c04d6 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -162,6 +162,11 @@ static void rxrpc_assess_MTU_size(struct rxrpc_local *local, #endif peer->if_mtu = 1500; + if (peer->max_data < peer->if_mtu - peer->hdrsize) { + trace_rxrpc_pmtud_reduce(peer, 0, peer->if_mtu - peer->hdrsize, + rxrpc_pmtud_reduce_route); + peer->max_data = peer->if_mtu - peer->hdrsize; + } memset(&fl, 0, sizeof(fl)); switch (peer->srx.transport.family) { @@ -199,8 +204,16 @@ static void rxrpc_assess_MTU_size(struct rxrpc_local *local, } peer->if_mtu = dst_mtu(dst); + peer->hdrsize += dst->header_len + dst->trailer_len; + peer->tx_seg_max = dst->dev->gso_max_segs; dst_release(dst); + peer->max_data = umin(RXRPC_JUMBO(1), peer->if_mtu - peer->hdrsize); + peer->pmtud_good = 500; + peer->pmtud_bad = peer->if_mtu - peer->hdrsize + 1; + peer->pmtud_trial = umin(peer->max_data, peer->pmtud_bad - 1); + peer->pmtud_pending = true; + _leave(" [if_mtu %u]", peer->if_mtu); } @@ -222,11 +235,8 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp, peer->service_conns = RB_ROOT; seqlock_init(&peer->service_conn_lock); spin_lock_init(&peer->lock); - spin_lock_init(&peer->rtt_input_lock); peer->debug_id = atomic_inc_return(&rxrpc_debug_id); - - rxrpc_peer_init_rtt(peer); - + peer->recent_srtt_us = UINT_MAX; peer->cong_ssthresh = RXRPC_TX_MAX_WINDOW; trace_rxrpc_peer(peer->debug_id, 1, why); } @@ -242,9 +252,7 @@ static void rxrpc_init_peer(struct rxrpc_local *local, struct rxrpc_peer *peer, unsigned long hash_key) { peer->hash_key = hash_key; - rxrpc_assess_MTU_size(local, peer); - peer->mtu = peer->if_mtu; - peer->rtt_last_req = ktime_get_real(); + switch (peer->srx.transport.family) { case AF_INET: @@ -268,7 +276,9 @@ static void rxrpc_init_peer(struct rxrpc_local *local, struct rxrpc_peer *peer, } peer->hdrsize += sizeof(struct rxrpc_wire_header); - peer->maxdata = peer->mtu - peer->hdrsize; + peer->max_data = peer->if_mtu - peer->hdrsize; + + rxrpc_assess_MTU_size(local, peer); } /* @@ -304,6 +314,7 @@ static void rxrpc_free_peer(struct rxrpc_peer *peer) * Set up a new incoming peer. There shouldn't be any other matching peers * since we've already done a search in the list from the non-reentrant context * (the data_ready handler) that is the only place we can add new peers. + * Called with interrupts disabled. */ void rxrpc_new_incoming_peer(struct rxrpc_local *local, struct rxrpc_peer *peer) { @@ -348,7 +359,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, return NULL; } - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); /* Need to check that we aren't racing with someone else */ peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); @@ -361,7 +372,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, &rxnet->peer_keepalive_new); } - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); if (peer) rxrpc_free_peer(candidate); @@ -411,10 +422,10 @@ static void __rxrpc_put_peer(struct rxrpc_peer *peer) ASSERT(hlist_empty(&peer->error_targets)); - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); hash_del_rcu(&peer->hash_link); list_del_init(&peer->keepalive_link); - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); rxrpc_free_peer(peer); } @@ -450,7 +461,7 @@ void rxrpc_destroy_all_peers(struct rxrpc_net *rxnet) continue; hlist_for_each_entry(peer, &rxnet->peer_hash[i], hash_link) { - pr_err("Leaked peer %u {%u} %pISp\n", + pr_err("Leaked peer %x {%u} %pISp\n", peer->debug_id, refcount_read(&peer->ref), &peer->srx.transport); @@ -464,10 +475,12 @@ void rxrpc_destroy_all_peers(struct rxrpc_net *rxnet) * @call: The call to query * * Get a record for the remote peer in a call. + * + * Return: The call's peer record. */ struct rxrpc_peer *rxrpc_kernel_get_call_peer(struct socket *sock, struct rxrpc_call *call) { - return call->peer; + return rxrpc_get_peer(call->peer, rxrpc_peer_get_application); } EXPORT_SYMBOL(rxrpc_kernel_get_call_peer); @@ -475,11 +488,13 @@ EXPORT_SYMBOL(rxrpc_kernel_get_call_peer); * rxrpc_kernel_get_srtt - Get a call's peer smoothed RTT * @peer: The peer to query * - * Get the call's peer smoothed RTT in uS or UINT_MAX if we have no samples. + * Get the call's peer smoothed RTT. + * + * Return: The RTT in uS or %UINT_MAX if we have no samples. */ unsigned int rxrpc_kernel_get_srtt(const struct rxrpc_peer *peer) { - return peer->rtt_count > 0 ? peer->srtt_us >> 3 : UINT_MAX; + return READ_ONCE(peer->recent_srtt_us); } EXPORT_SYMBOL(rxrpc_kernel_get_srtt); @@ -488,7 +503,10 @@ EXPORT_SYMBOL(rxrpc_kernel_get_srtt); * @peer: The peer to query * * Get a pointer to the address from a peer record. The caller is responsible - * for making sure that the address is not deallocated. + * for making sure that the address is not deallocated. A fake address will be + * substituted if %peer in NULL. + * + * Return: The rxrpc address record or a fake record. */ const struct sockaddr_rxrpc *rxrpc_kernel_remote_srx(const struct rxrpc_peer *peer) { @@ -501,7 +519,10 @@ EXPORT_SYMBOL(rxrpc_kernel_remote_srx); * @peer: The peer to query * * Get a pointer to the transport address from a peer record. The caller is - * responsible for making sure that the address is not deallocated. + * responsible for making sure that the address is not deallocated. A fake + * address will be substituted if %peer in NULL. + * + * Return: The transport address record or a fake record. */ const struct sockaddr *rxrpc_kernel_remote_addr(const struct rxrpc_peer *peer) { @@ -509,3 +530,33 @@ const struct sockaddr *rxrpc_kernel_remote_addr(const struct rxrpc_peer *peer) (peer ? &peer->srx.transport : &rxrpc_null_addr.transport); } EXPORT_SYMBOL(rxrpc_kernel_remote_addr); + +/** + * rxrpc_kernel_set_peer_data - Set app-specific data on a peer. + * @peer: The peer to alter + * @app_data: The data to set + * + * Set the app-specific data on a peer. AF_RXRPC makes no effort to retain + * anything the data might refer to. + * + * Return: The previous app_data. + */ +unsigned long rxrpc_kernel_set_peer_data(struct rxrpc_peer *peer, unsigned long app_data) +{ + return xchg(&peer->app_data, app_data); +} +EXPORT_SYMBOL(rxrpc_kernel_set_peer_data); + +/** + * rxrpc_kernel_get_peer_data - Get app-specific data from a peer. + * @peer: The peer to query + * + * Retrieve the app-specific data from a peer. + * + * Return: The peer's app data. + */ +unsigned long rxrpc_kernel_get_peer_data(const struct rxrpc_peer *peer) +{ + return peer->app_data; +} +EXPORT_SYMBOL(rxrpc_kernel_get_peer_data); diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 263a2251e3d2..d803562ca0ac 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -52,7 +52,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) struct rxrpc_call *call; struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); enum rxrpc_call_state state; - rxrpc_seq_t acks_hard_ack; + rxrpc_seq_t tx_bottom; char lbuff[50], rbuff[50]; long timeout = 0; @@ -79,7 +79,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) if (state != RXRPC_CALL_SERVER_PREALLOC) timeout = ktime_ms_delta(READ_ONCE(call->expect_rx_by), ktime_get_real()); - acks_hard_ack = READ_ONCE(call->acks_hard_ack); + tx_bottom = READ_ONCE(call->tx_bottom); seq_printf(seq, "UDP %-47.47s %-47.47s %4x %08x %08x %s %3u" " %-8.8s %08x %08x %08x %02x %08x %02x %08x %02x %06lx\n", @@ -93,7 +93,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) rxrpc_call_states[state], call->abort_code, call->debug_id, - acks_hard_ack, READ_ONCE(call->tx_top) - acks_hard_ack, + tx_bottom, READ_ONCE(call->tx_top) - tx_bottom, call->ackr_window, call->ackr_wtop - call->ackr_window, call->rx_serial, call->cong_cwnd, @@ -283,9 +283,7 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v) if (v == SEQ_START_TOKEN) { seq_puts(seq, - "Proto Local " - " Remote " - " Use SST MTU LastUse RTT RTO\n" + "Proto Local Remote Use SST Maxd LastUse RTT RTO\n" ); return 0; } @@ -298,16 +296,15 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v) now = ktime_get_seconds(); seq_printf(seq, - "UDP %-47.47s %-47.47s %3u" - " %3u %5u %6llus %8u %8u\n", + "UDP %-47.47s %-47.47s %3u %4u %5u %6llus %8d %8d\n", lbuff, rbuff, refcount_read(&peer->ref), peer->cong_ssthresh, - peer->mtu, + peer->max_data, now - peer->last_tx_at, - peer->srtt_us >> 3, - peer->rto_us); + READ_ONCE(peer->recent_srtt_us), + READ_ONCE(peer->recent_rto_us)); return 0; } @@ -476,10 +473,11 @@ int rxrpc_stats_show(struct seq_file *seq, void *v) struct rxrpc_net *rxnet = rxrpc_net(seq_file_single_net(seq)); seq_printf(seq, - "Data : send=%u sendf=%u fail=%u\n", + "Data : send=%u sendf=%u fail=%u emsz=%u\n", atomic_read(&rxnet->stat_tx_data_send), atomic_read(&rxnet->stat_tx_data_send_frag), - atomic_read(&rxnet->stat_tx_data_send_fail)); + atomic_read(&rxnet->stat_tx_data_send_fail), + atomic_read(&rxnet->stat_tx_data_send_msgsize)); seq_printf(seq, "Data-Tx : nr=%u retrans=%u uf=%u cwr=%u\n", atomic_read(&rxnet->stat_tx_data), @@ -508,7 +506,7 @@ int rxrpc_stats_show(struct seq_file *seq, void *v) atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_DELAY]), atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_IDLE])); seq_printf(seq, - "Ack-Rx : req=%u dup=%u oos=%u exw=%u nos=%u png=%u prs=%u dly=%u idl=%u\n", + "Ack-Rx : req=%u dup=%u oos=%u exw=%u nos=%u png=%u prs=%u dly=%u idl=%u z=%u\n", atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_REQUESTED]), atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_DUPLICATE]), atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_OUT_OF_SEQUENCE]), @@ -517,13 +515,14 @@ int rxrpc_stats_show(struct seq_file *seq, void *v) atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_PING]), atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_PING_RESPONSE]), atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_DELAY]), - atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_IDLE])); + atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_IDLE]), + atomic_read(&rxnet->stat_rx_acks[0])); seq_printf(seq, - "Why-Req-A: acklost=%u already=%u mrtt=%u ortt=%u\n", + "Why-Req-A: acklost=%u mrtt=%u ortt=%u stall=%u\n", atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_ack_lost]), - atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_already_on]), atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_more_rtt]), - atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_old_rtt])); + atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_old_rtt]), + atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_app_stall])); seq_printf(seq, "Why-Req-A: nolast=%u retx=%u slows=%u smtxw=%u\n", atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_no_srv_last]), @@ -531,6 +530,30 @@ int rxrpc_stats_show(struct seq_file *seq, void *v) atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_slow_start]), atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_small_txwin])); seq_printf(seq, + "Jumbo-Tx : %u,%u,%u,%u,%u,%u,%u,%u,%u,%u\n", + atomic_read(&rxnet->stat_tx_jumbo[0]), + atomic_read(&rxnet->stat_tx_jumbo[1]), + atomic_read(&rxnet->stat_tx_jumbo[2]), + atomic_read(&rxnet->stat_tx_jumbo[3]), + atomic_read(&rxnet->stat_tx_jumbo[4]), + atomic_read(&rxnet->stat_tx_jumbo[5]), + atomic_read(&rxnet->stat_tx_jumbo[6]), + atomic_read(&rxnet->stat_tx_jumbo[7]), + atomic_read(&rxnet->stat_tx_jumbo[8]), + atomic_read(&rxnet->stat_tx_jumbo[9])); + seq_printf(seq, + "Jumbo-Rx : %u,%u,%u,%u,%u,%u,%u,%u,%u,%u\n", + atomic_read(&rxnet->stat_rx_jumbo[0]), + atomic_read(&rxnet->stat_rx_jumbo[1]), + atomic_read(&rxnet->stat_rx_jumbo[2]), + atomic_read(&rxnet->stat_rx_jumbo[3]), + atomic_read(&rxnet->stat_rx_jumbo[4]), + atomic_read(&rxnet->stat_rx_jumbo[5]), + atomic_read(&rxnet->stat_rx_jumbo[6]), + atomic_read(&rxnet->stat_rx_jumbo[7]), + atomic_read(&rxnet->stat_rx_jumbo[8]), + atomic_read(&rxnet->stat_rx_jumbo[9])); + seq_printf(seq, "Buffers : txb=%u rxb=%u\n", atomic_read(&rxrpc_nr_txbuf), atomic_read(&rxrpc_n_rx_skbs)); @@ -567,6 +590,8 @@ int rxrpc_stats_clear(struct file *file, char *buf, size_t size) atomic_set(&rxnet->stat_tx_ack_skip, 0); memset(&rxnet->stat_tx_acks, 0, sizeof(rxnet->stat_tx_acks)); memset(&rxnet->stat_rx_acks, 0, sizeof(rxnet->stat_rx_acks)); + memset(&rxnet->stat_tx_jumbo, 0, sizeof(rxnet->stat_tx_jumbo)); + memset(&rxnet->stat_rx_jumbo, 0, sizeof(rxnet->stat_rx_jumbo)); memset(&rxnet->stat_why_req_ack, 0, sizeof(rxnet->stat_why_req_ack)); diff --git a/net/rxrpc/protocol.h b/net/rxrpc/protocol.h index 4fe6b4d20ada..f8bfec12bc7e 100644 --- a/net/rxrpc/protocol.h +++ b/net/rxrpc/protocol.h @@ -92,11 +92,16 @@ struct rxrpc_jumbo_header { /* * The maximum number of subpackets that can possibly fit in a UDP packet is: * - * ((max_IP - IP_hdr - UDP_hdr) / RXRPC_JUMBO_SUBPKTLEN) + 1 - * = ((65535 - 28 - 28) / 1416) + 1 - * = 46 non-terminal packets and 1 terminal packet. + * (max_UDP - wirehdr + jumbohdr) / (jumbohdr + 1412) + * = ((65535 - 28 + 4) / 1416) + * = 45 non-terminal packets and 1 terminal packet. */ -#define RXRPC_MAX_NR_JUMBO 47 +#define RXRPC_MAX_NR_JUMBO 46 + +/* Size of a jumbo packet with N subpackets, excluding UDP+IP */ +#define RXRPC_JUMBO(N) ((int)sizeof(struct rxrpc_wire_header) + \ + RXRPC_JUMBO_DATALEN + \ + ((N) - 1) * RXRPC_JUMBO_SUBPKTLEN) /*****************************************************************************/ /* @@ -176,4 +181,24 @@ struct rxkad_response { __be32 ticket_len; /* Kerberos ticket length */ } __packed; +/* + * GSSAPI security type-4 and type-6 data header. + */ +struct rxgk_header { + __be32 epoch; + __be32 cid; + __be32 call_number; + __be32 seq; + __be32 sec_index; + __be32 data_len; +} __packed; + +/* + * GSSAPI security type-4 and type-6 response packet header. + */ +struct rxgk_response { + __be64 start_time; + __be32 token_len; +} __packed; + #endif /* _LINUX_RXRPC_PACKET_H */ diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index a482f88c5fc5..86a27fb55a1c 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -36,16 +36,16 @@ void rxrpc_notify_socket(struct rxrpc_call *call) sk = &rx->sk; if (rx && sk->sk_state < RXRPC_CLOSE) { if (call->notify_rx) { - spin_lock(&call->notify_lock); + spin_lock_irq(&call->notify_lock); call->notify_rx(sk, call, call->user_call_ID); - spin_unlock(&call->notify_lock); + spin_unlock_irq(&call->notify_lock); } else { - spin_lock(&rx->recvmsg_lock); + spin_lock_irq(&rx->recvmsg_lock); if (list_empty(&call->recvmsg_link)) { rxrpc_get_call(call, rxrpc_call_get_notify_socket); list_add_tail(&call->recvmsg_link, &rx->recvmsg_q); } - spin_unlock(&rx->recvmsg_lock); + spin_unlock_irq(&rx->recvmsg_lock); if (!sock_flag(sk, SOCK_DEAD)) { _debug("call %ps", sk->sk_data_ready); @@ -155,6 +155,82 @@ static int rxrpc_verify_data(struct rxrpc_call *call, struct sk_buff *skb) } /* + * Transcribe a call's user ID to a control message. + */ +static int rxrpc_recvmsg_user_id(struct rxrpc_call *call, struct msghdr *msg, + int flags) +{ + if (!test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) + return 0; + + if (flags & MSG_CMSG_COMPAT) { + unsigned int id32 = call->user_call_ID; + + return put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, + sizeof(unsigned int), &id32); + } else { + unsigned long idl = call->user_call_ID; + + return put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, + sizeof(unsigned long), &idl); + } +} + +/* + * Deal with a CHALLENGE packet. + */ +static int rxrpc_recvmsg_challenge(struct socket *sock, struct msghdr *msg, + struct sk_buff *challenge, unsigned int flags) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(challenge); + struct rxrpc_connection *conn = sp->chall.conn; + + return conn->security->challenge_to_recvmsg(conn, challenge, msg); +} + +/* + * Process OOB packets. Called with the socket locked. + */ +static int rxrpc_recvmsg_oob(struct socket *sock, struct msghdr *msg, + unsigned int flags) +{ + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + struct sk_buff *skb; + bool need_response = false; + int ret; + + skb = skb_peek(&rx->recvmsg_oobq); + if (!skb) + return -EAGAIN; + rxrpc_see_skb(skb, rxrpc_skb_see_recvmsg); + + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_OOB_ID, sizeof(u64), + &skb->skb_mstamp_ns); + if (ret < 0) + return ret; + + switch ((enum rxrpc_oob_type)skb->mark) { + case RXRPC_OOB_CHALLENGE: + need_response = true; + ret = rxrpc_recvmsg_challenge(sock, msg, skb, flags); + break; + default: + WARN_ONCE(1, "recvmsg() can't process unknown OOB type %u\n", + skb->mark); + ret = -EIO; + break; + } + + if (!(flags & MSG_PEEK)) + skb_unlink(skb, &rx->recvmsg_oobq); + if (need_response) + rxrpc_add_pending_oob(rx, skb); + else + rxrpc_free_skb(skb, rxrpc_skb_put_oob); + return ret; +} + +/* * Deliver messages to a call. This keeps processing packets until the buffer * is filled and we find either more DATA (returns 0) or the end of the DATA * (returns 1). If more packets are required, it returns -EAGAIN and if the @@ -165,6 +241,7 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, size_t len, int flags, size_t *_offset) { struct rxrpc_skb_priv *sp; + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); struct sk_buff *skb; rxrpc_seq_t seq = 0; size_t remain; @@ -207,7 +284,6 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, trace_rxrpc_recvdata(call, rxrpc_recvmsg_next, seq, sp->offset, sp->len, ret2); if (ret2 < 0) { - kdebug("verify = %d", ret2); ret = ret2; goto out; } @@ -255,6 +331,13 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, if (!(flags & MSG_PEEK)) rxrpc_rotate_rx_window(call); + + if (!rx->app_ops && + !skb_queue_empty_lockless(&rx->recvmsg_oobq)) { + trace_rxrpc_recvdata(call, rxrpc_recvmsg_oobq, seq, + rx_pkt_offset, rx_pkt_len, ret); + break; + } } out: @@ -262,6 +345,7 @@ out: call->rx_pkt_offset = rx_pkt_offset; call->rx_pkt_len = rx_pkt_len; } + done: trace_rxrpc_recvdata(call, rxrpc_recvmsg_data_return, seq, rx_pkt_offset, rx_pkt_len, ret); @@ -301,6 +385,7 @@ try_again: /* Return immediately if a client socket has no outstanding calls */ if (RB_EMPTY_ROOT(&rx->calls) && list_empty(&rx->recvmsg_q) && + skb_queue_empty_lockless(&rx->recvmsg_oobq) && rx->sk.sk_state != RXRPC_SERVER_LISTENING) { release_sock(&rx->sk); return -EAGAIN; @@ -322,7 +407,8 @@ try_again: if (ret) goto wait_error; - if (list_empty(&rx->recvmsg_q)) { + if (list_empty(&rx->recvmsg_q) && + skb_queue_empty_lockless(&rx->recvmsg_oobq)) { if (signal_pending(current)) goto wait_interrupted; trace_rxrpc_recvmsg(0, rxrpc_recvmsg_wait, 0); @@ -332,19 +418,29 @@ try_again: goto try_again; } + /* Deal with OOB messages before we consider getting normal data. */ + if (!skb_queue_empty_lockless(&rx->recvmsg_oobq)) { + ret = rxrpc_recvmsg_oob(sock, msg, flags); + release_sock(&rx->sk); + if (ret == -EAGAIN) + goto try_again; + goto error_no_call; + } + /* Find the next call and dequeue it if we're not just peeking. If we * do dequeue it, that comes with a ref that we will need to release. * We also want to weed out calls that got requeued whilst we were * shovelling data out. */ - spin_lock(&rx->recvmsg_lock); + spin_lock_irq(&rx->recvmsg_lock); l = rx->recvmsg_q.next; call = list_entry(l, struct rxrpc_call, recvmsg_link); if (!rxrpc_call_is_complete(call) && - skb_queue_empty(&call->recvmsg_queue)) { + skb_queue_empty(&call->recvmsg_queue) && + skb_queue_empty(&rx->recvmsg_oobq)) { list_del_init(&call->recvmsg_link); - spin_unlock(&rx->recvmsg_lock); + spin_unlock_irq(&rx->recvmsg_lock); release_sock(&rx->sk); trace_rxrpc_recvmsg(call->debug_id, rxrpc_recvmsg_unqueue, 0); rxrpc_put_call(call, rxrpc_call_put_recvmsg); @@ -355,7 +451,7 @@ try_again: list_del_init(&call->recvmsg_link); else rxrpc_get_call(call, rxrpc_call_get_recvmsg); - spin_unlock(&rx->recvmsg_lock); + spin_unlock_irq(&rx->recvmsg_lock); call_debug_id = call->debug_id; trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_dequeue, 0); @@ -377,21 +473,9 @@ try_again: if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) BUG(); - if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { - if (flags & MSG_CMSG_COMPAT) { - unsigned int id32 = call->user_call_ID; - - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, - sizeof(unsigned int), &id32); - } else { - unsigned long idl = call->user_call_ID; - - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, - sizeof(unsigned long), &idl); - } - if (ret < 0) - goto error_unlock_call; - } + ret = rxrpc_recvmsg_user_id(call, msg, flags); + if (ret < 0) + goto error_unlock_call; if (msg->msg_name && call->peer) { size_t len = sizeof(call->dest_srx); @@ -445,9 +529,9 @@ error_unlock_call: error_requeue_call: if (!(flags & MSG_PEEK)) { - spin_lock(&rx->recvmsg_lock); + spin_lock_irq(&rx->recvmsg_lock); list_add(&call->recvmsg_link, &rx->recvmsg_q); - spin_unlock(&rx->recvmsg_lock); + spin_unlock_irq(&rx->recvmsg_lock); trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_requeue, 0); } else { rxrpc_put_call(call, rxrpc_call_put_recvmsg); @@ -477,14 +561,14 @@ wait_error: * @_service: Where to store the actual service ID (may be upgraded) * * Allow a kernel service to receive data and pick up information about the - * state of a call. Returns 0 if got what was asked for and there's more - * available, 1 if we got what was asked for and we're at the end of the data - * and -EAGAIN if we need more data. + * state of a call. Note that *@_abort should also be initialised to %0. * - * Note that we may return -EAGAIN to drain empty packets at the end of the - * data, even if we've already copied over the requested data. + * Note that we may return %-EAGAIN to drain empty packets at the end + * of the data, even if we've already copied over the requested data. * - * *_abort should also be initialised to 0. + * Return: %0 if got what was asked for and there's more available, %1 + * if we got what was asked for and we're at the end of the data and + * %-EAGAIN if we need more data. */ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, struct iov_iter *iter, size_t *_len, diff --git a/net/rxrpc/rtt.c b/net/rxrpc/rtt.c index cdab7b7d08a0..7474f88d7b18 100644 --- a/net/rxrpc/rtt.c +++ b/net/rxrpc/rtt.c @@ -12,22 +12,22 @@ #include "ar-internal.h" #define RXRPC_RTO_MAX (120 * USEC_PER_SEC) -#define RXRPC_TIMEOUT_INIT ((unsigned int)(1 * MSEC_PER_SEC)) /* RFC6298 2.1 initial RTO value */ +#define RXRPC_TIMEOUT_INIT ((unsigned int)(1 * USEC_PER_SEC)) /* RFC6298 2.1 initial RTO value */ #define rxrpc_jiffies32 ((u32)jiffies) /* As rxrpc_jiffies32 */ -static u32 rxrpc_rto_min_us(struct rxrpc_peer *peer) +static u32 rxrpc_rto_min_us(struct rxrpc_call *call) { return 200; } -static u32 __rxrpc_set_rto(const struct rxrpc_peer *peer) +static u32 __rxrpc_set_rto(const struct rxrpc_call *call) { - return (peer->srtt_us >> 3) + peer->rttvar_us; + return (call->srtt_us >> 3) + call->rttvar_us; } static u32 rxrpc_bound_rto(u32 rto) { - return min(rto, RXRPC_RTO_MAX); + return clamp(200000, rto + 100000, RXRPC_RTO_MAX); } /* @@ -40,10 +40,10 @@ static u32 rxrpc_bound_rto(u32 rto) * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ -static void rxrpc_rtt_estimator(struct rxrpc_peer *peer, long sample_rtt_us) +static void rxrpc_rtt_estimator(struct rxrpc_call *call, long sample_rtt_us) { long m = sample_rtt_us; /* RTT */ - u32 srtt = peer->srtt_us; + u32 srtt = call->srtt_us; /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev @@ -66,7 +66,7 @@ static void rxrpc_rtt_estimator(struct rxrpc_peer *peer, long sample_rtt_us) srtt += m; /* rtt = 7/8 rtt + 1/8 new */ if (m < 0) { m = -m; /* m is now abs(error) */ - m -= (peer->mdev_us >> 2); /* similar update on mdev */ + m -= (call->mdev_us >> 2); /* similar update on mdev */ /* This is similar to one of Eifel findings. * Eifel blocks mdev updates when rtt decreases. * This solution is a bit different: we use finer gain @@ -78,31 +78,31 @@ static void rxrpc_rtt_estimator(struct rxrpc_peer *peer, long sample_rtt_us) if (m > 0) m >>= 3; } else { - m -= (peer->mdev_us >> 2); /* similar update on mdev */ + m -= (call->mdev_us >> 2); /* similar update on mdev */ } - peer->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */ - if (peer->mdev_us > peer->mdev_max_us) { - peer->mdev_max_us = peer->mdev_us; - if (peer->mdev_max_us > peer->rttvar_us) - peer->rttvar_us = peer->mdev_max_us; + call->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */ + if (call->mdev_us > call->mdev_max_us) { + call->mdev_max_us = call->mdev_us; + if (call->mdev_max_us > call->rttvar_us) + call->rttvar_us = call->mdev_max_us; } } else { /* no previous measure. */ srtt = m << 3; /* take the measured time to be rtt */ - peer->mdev_us = m << 1; /* make sure rto = 3*rtt */ - peer->rttvar_us = max(peer->mdev_us, rxrpc_rto_min_us(peer)); - peer->mdev_max_us = peer->rttvar_us; + call->mdev_us = m << 1; /* make sure rto = 3*rtt */ + call->rttvar_us = umax(call->mdev_us, rxrpc_rto_min_us(call)); + call->mdev_max_us = call->rttvar_us; } - peer->srtt_us = max(1U, srtt); + call->srtt_us = umax(srtt, 1); } /* * Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ -static void rxrpc_set_rto(struct rxrpc_peer *peer) +static void rxrpc_set_rto(struct rxrpc_call *call) { u32 rto; @@ -113,7 +113,7 @@ static void rxrpc_set_rto(struct rxrpc_peer *peer) * is invisible. Actually, Linux-2.4 also generates erratic * ACKs in some circumstances. */ - rto = __rxrpc_set_rto(peer); + rto = __rxrpc_set_rto(call); /* 2. Fixups made earlier cannot be right. * If we do not estimate RTO correctly without them, @@ -124,61 +124,73 @@ static void rxrpc_set_rto(struct rxrpc_peer *peer) /* NOTE: clamping at RXRPC_RTO_MIN is not required, current algo * guarantees that rto is higher. */ - peer->rto_us = rxrpc_bound_rto(rto); + call->rto_us = rxrpc_bound_rto(rto); } -static void rxrpc_ack_update_rtt(struct rxrpc_peer *peer, long rtt_us) +static void rxrpc_update_rtt_min(struct rxrpc_call *call, ktime_t resp_time, long rtt_us) +{ + /* Window size 5mins in approx usec (ipv4.sysctl_tcp_min_rtt_wlen) */ + u32 wlen_us = 5ULL * NSEC_PER_SEC / 1024; + + minmax_running_min(&call->min_rtt, wlen_us, resp_time / 1024, + (u32)rtt_us ? : jiffies_to_usecs(1)); +} + +static void rxrpc_ack_update_rtt(struct rxrpc_call *call, ktime_t resp_time, long rtt_us) { if (rtt_us < 0) return; - //rxrpc_update_rtt_min(peer, rtt_us); - rxrpc_rtt_estimator(peer, rtt_us); - rxrpc_set_rto(peer); + /* Update RACK min RTT [RFC8985 6.1 Step 1]. */ + rxrpc_update_rtt_min(call, resp_time, rtt_us); + + rxrpc_rtt_estimator(call, rtt_us); + rxrpc_set_rto(call); - /* RFC6298: only reset backoff on valid RTT measurement. */ - peer->backoff = 0; + /* Only reset backoff on valid RTT measurement [RFC6298]. */ + call->backoff = 0; } /* * Add RTT information to cache. This is called in softirq mode and has - * exclusive access to the peer RTT data. + * exclusive access to the call RTT data. */ -void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, +void rxrpc_call_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, int rtt_slot, rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, ktime_t send_time, ktime_t resp_time) { - struct rxrpc_peer *peer = call->peer; s64 rtt_us; rtt_us = ktime_to_us(ktime_sub(resp_time, send_time)); if (rtt_us < 0) return; - spin_lock(&peer->rtt_input_lock); - rxrpc_ack_update_rtt(peer, rtt_us); - if (peer->rtt_count < 3) - peer->rtt_count++; - spin_unlock(&peer->rtt_input_lock); + rxrpc_ack_update_rtt(call, resp_time, rtt_us); + if (call->rtt_count < 3) + call->rtt_count++; + call->rtt_taken++; + + WRITE_ONCE(call->peer->recent_srtt_us, call->srtt_us / 8); + WRITE_ONCE(call->peer->recent_rto_us, call->rto_us); trace_rxrpc_rtt_rx(call, why, rtt_slot, send_serial, resp_serial, - peer->srtt_us >> 3, peer->rto_us); + rtt_us, call->srtt_us, call->rto_us); } /* * Get the retransmission timeout to set in nanoseconds, backing it off each * time we retransmit. */ -ktime_t rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans) +ktime_t rxrpc_get_rto_backoff(struct rxrpc_call *call, bool retrans) { u64 timo_us; - u32 backoff = READ_ONCE(peer->backoff); + u32 backoff = READ_ONCE(call->backoff); - timo_us = peer->rto_us; + timo_us = call->rto_us; timo_us <<= backoff; if (retrans && timo_us * 2 <= RXRPC_RTO_MAX) - WRITE_ONCE(peer->backoff, backoff + 1); + WRITE_ONCE(call->backoff, backoff + 1); if (timo_us < 1) timo_us = 1; @@ -186,10 +198,11 @@ ktime_t rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans) return ns_to_ktime(timo_us * NSEC_PER_USEC); } -void rxrpc_peer_init_rtt(struct rxrpc_peer *peer) +void rxrpc_call_init_rtt(struct rxrpc_call *call) { - peer->rto_us = RXRPC_TIMEOUT_INIT; - peer->mdev_us = RXRPC_TIMEOUT_INIT; - peer->backoff = 0; - //minmax_reset(&peer->rtt_min, rxrpc_jiffies32, ~0U); + call->rtt_last_req = KTIME_MIN; + call->rto_us = RXRPC_TIMEOUT_INIT; + call->mdev_us = RXRPC_TIMEOUT_INIT; + call->backoff = 0; + //minmax_reset(&call->rtt_min, rxrpc_jiffies32, ~0U); } diff --git a/net/rxrpc/rxgk.c b/net/rxrpc/rxgk.c new file mode 100644 index 000000000000..1e19c605bcc8 --- /dev/null +++ b/net/rxrpc/rxgk.c @@ -0,0 +1,1371 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* GSSAPI-based RxRPC security + * + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/net.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/key-type.h> +#include "ar-internal.h" +#include "rxgk_common.h" + +/* + * Parse the information from a server key + */ +static int rxgk_preparse_server_key(struct key_preparsed_payload *prep) +{ + const struct krb5_enctype *krb5; + struct krb5_buffer *server_key = (void *)&prep->payload.data[2]; + unsigned int service, sec_class, kvno, enctype; + int n = 0; + + _enter("%zu", prep->datalen); + + if (sscanf(prep->orig_description, "%u:%u:%u:%u%n", + &service, &sec_class, &kvno, &enctype, &n) != 4) + return -EINVAL; + + if (prep->orig_description[n]) + return -EINVAL; + + krb5 = crypto_krb5_find_enctype(enctype); + if (!krb5) + return -ENOPKG; + + prep->payload.data[0] = (struct krb5_enctype *)krb5; + + if (prep->datalen != krb5->key_len) + return -EKEYREJECTED; + + server_key->len = prep->datalen; + server_key->data = kmemdup(prep->data, prep->datalen, GFP_KERNEL); + if (!server_key->data) + return -ENOMEM; + + _leave(" = 0"); + return 0; +} + +static void rxgk_free_server_key(union key_payload *payload) +{ + struct krb5_buffer *server_key = (void *)&payload->data[2]; + + kfree_sensitive(server_key->data); +} + +static void rxgk_free_preparse_server_key(struct key_preparsed_payload *prep) +{ + rxgk_free_server_key(&prep->payload); +} + +static void rxgk_destroy_server_key(struct key *key) +{ + rxgk_free_server_key(&key->payload); +} + +static void rxgk_describe_server_key(const struct key *key, struct seq_file *m) +{ + const struct krb5_enctype *krb5 = key->payload.data[0]; + + if (krb5) + seq_printf(m, ": %s", krb5->name); +} + +/* + * Handle rekeying the connection when we see our limits overrun or when the + * far side decided to rekey. + * + * Returns a ref on the context if successful or -ESTALE if the key is out of + * date. + */ +static struct rxgk_context *rxgk_rekey(struct rxrpc_connection *conn, + const u16 *specific_key_number) +{ + struct rxgk_context *gk, *dead = NULL; + unsigned int key_number, current_key, mask = ARRAY_SIZE(conn->rxgk.keys) - 1; + bool crank = false; + + _enter("%d", specific_key_number ? *specific_key_number : -1); + + mutex_lock(&conn->security_lock); + + current_key = conn->rxgk.key_number; + if (!specific_key_number) { + key_number = current_key; + } else { + if (*specific_key_number == (u16)current_key) + key_number = current_key; + else if (*specific_key_number == (u16)(current_key - 1)) + key_number = current_key - 1; + else if (*specific_key_number == (u16)(current_key + 1)) + goto crank_window; + else + goto bad_key; + } + + gk = conn->rxgk.keys[key_number & mask]; + if (!gk) + goto generate_key; + if (!specific_key_number && + test_bit(RXGK_TK_NEEDS_REKEY, &gk->flags)) + goto crank_window; + +grab: + refcount_inc(&gk->usage); + mutex_unlock(&conn->security_lock); + rxgk_put(dead); + return gk; + +crank_window: + trace_rxrpc_rxgk_rekey(conn, current_key, + specific_key_number ? *specific_key_number : -1); + if (current_key == UINT_MAX) + goto bad_key; + if (current_key + 1 == UINT_MAX) + set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); + + key_number = current_key + 1; + if (WARN_ON(conn->rxgk.keys[key_number & mask])) + goto bad_key; + crank = true; + +generate_key: + gk = conn->rxgk.keys[current_key & mask]; + gk = rxgk_generate_transport_key(conn, gk->key, key_number, GFP_NOFS); + if (IS_ERR(gk)) { + mutex_unlock(&conn->security_lock); + return gk; + } + + write_lock(&conn->security_use_lock); + if (crank) { + current_key++; + conn->rxgk.key_number = current_key; + dead = conn->rxgk.keys[(current_key - 2) & mask]; + conn->rxgk.keys[(current_key - 2) & mask] = NULL; + } + conn->rxgk.keys[current_key & mask] = gk; + write_unlock(&conn->security_use_lock); + goto grab; + +bad_key: + mutex_unlock(&conn->security_lock); + return ERR_PTR(-ESTALE); +} + +/* + * Get the specified keying context. + * + * Returns a ref on the context if successful or -ESTALE if the key is out of + * date. + */ +static struct rxgk_context *rxgk_get_key(struct rxrpc_connection *conn, + const u16 *specific_key_number) +{ + struct rxgk_context *gk; + unsigned int key_number, current_key, mask = ARRAY_SIZE(conn->rxgk.keys) - 1; + + _enter("{%u},%d", + conn->rxgk.key_number, specific_key_number ? *specific_key_number : -1); + + read_lock(&conn->security_use_lock); + + current_key = conn->rxgk.key_number; + if (!specific_key_number) { + key_number = current_key; + } else { + /* Only the bottom 16 bits of the key number are exposed in the + * header, so we try and keep the upper 16 bits in step. The + * whole 32 bits are used to generate the TK. + */ + if (*specific_key_number == (u16)current_key) + key_number = current_key; + else if (*specific_key_number == (u16)(current_key - 1)) + key_number = current_key - 1; + else if (*specific_key_number == (u16)(current_key + 1)) + goto rekey; + else + goto bad_key; + } + + gk = conn->rxgk.keys[key_number & mask]; + if (!gk) + goto slow_path; + if (!specific_key_number && + key_number < UINT_MAX) { + if (time_after(jiffies, gk->expiry) || + gk->bytes_remaining < 0) { + set_bit(RXGK_TK_NEEDS_REKEY, &gk->flags); + goto slow_path; + } + + if (test_bit(RXGK_TK_NEEDS_REKEY, &gk->flags)) + goto slow_path; + } + + refcount_inc(&gk->usage); + read_unlock(&conn->security_use_lock); + return gk; + +rekey: + _debug("rekey"); + if (current_key == UINT_MAX) + goto bad_key; + gk = conn->rxgk.keys[current_key & mask]; + if (gk) + set_bit(RXGK_TK_NEEDS_REKEY, &gk->flags); +slow_path: + read_unlock(&conn->security_use_lock); + return rxgk_rekey(conn, specific_key_number); +bad_key: + read_unlock(&conn->security_use_lock); + return ERR_PTR(-ESTALE); +} + +/* + * initialise connection security + */ +static int rxgk_init_connection_security(struct rxrpc_connection *conn, + struct rxrpc_key_token *token) +{ + struct rxgk_context *gk; + int ret; + + _enter("{%d,%u},{%x}", + conn->debug_id, conn->rxgk.key_number, key_serial(conn->key)); + + conn->security_ix = token->security_index; + conn->security_level = token->rxgk->level; + + if (rxrpc_conn_is_client(conn)) { + conn->rxgk.start_time = ktime_get(); + do_div(conn->rxgk.start_time, 100); + } + + gk = rxgk_generate_transport_key(conn, token->rxgk, conn->rxgk.key_number, + GFP_NOFS); + if (IS_ERR(gk)) + return PTR_ERR(gk); + conn->rxgk.enctype = gk->krb5->etype; + conn->rxgk.keys[gk->key_number & 3] = gk; + + switch (conn->security_level) { + case RXRPC_SECURITY_PLAIN: + case RXRPC_SECURITY_AUTH: + case RXRPC_SECURITY_ENCRYPT: + break; + default: + ret = -EKEYREJECTED; + goto error; + } + + ret = 0; +error: + _leave(" = %d", ret); + return ret; +} + +/* + * Clean up the crypto on a call. + */ +static void rxgk_free_call_crypto(struct rxrpc_call *call) +{ +} + +/* + * Work out how much data we can put in a packet. + */ +static struct rxrpc_txbuf *rxgk_alloc_txbuf(struct rxrpc_call *call, size_t remain, gfp_t gfp) +{ + enum krb5_crypto_mode mode; + struct rxgk_context *gk; + struct rxrpc_txbuf *txb; + size_t shdr, alloc, limit, part, offset, gap; + + switch (call->conn->security_level) { + default: + alloc = umin(remain, RXRPC_JUMBO_DATALEN); + return rxrpc_alloc_data_txbuf(call, alloc, 1, gfp); + case RXRPC_SECURITY_AUTH: + shdr = 0; + mode = KRB5_CHECKSUM_MODE; + break; + case RXRPC_SECURITY_ENCRYPT: + shdr = sizeof(struct rxgk_header); + mode = KRB5_ENCRYPT_MODE; + break; + } + + gk = rxgk_get_key(call->conn, NULL); + if (IS_ERR(gk)) + return NULL; + + /* Work out the maximum amount of data that will fit. */ + alloc = RXRPC_JUMBO_DATALEN; + limit = crypto_krb5_how_much_data(gk->krb5, mode, &alloc, &offset); + + if (remain < limit - shdr) { + part = remain; + alloc = crypto_krb5_how_much_buffer(gk->krb5, mode, + shdr + part, &offset); + gap = 0; + } else { + part = limit - shdr; + gap = RXRPC_JUMBO_DATALEN - alloc; + alloc = RXRPC_JUMBO_DATALEN; + } + + rxgk_put(gk); + + txb = rxrpc_alloc_data_txbuf(call, alloc, 16, gfp); + if (!txb) + return NULL; + + txb->crypto_header = offset; + txb->sec_header = shdr; + txb->offset += offset + shdr; + txb->space = part; + + /* Clear excess space in the packet */ + if (gap) + memset(txb->data + alloc - gap, 0, gap); + return txb; +} + +/* + * Integrity mode (sign a packet - level 1 security) + */ +static int rxgk_secure_packet_integrity(const struct rxrpc_call *call, + struct rxgk_context *gk, + struct rxrpc_txbuf *txb) +{ + struct rxgk_header *hdr; + struct scatterlist sg[1]; + struct krb5_buffer metadata; + int ret = -ENOMEM; + + _enter(""); + + hdr = kzalloc(sizeof(*hdr), GFP_NOFS); + if (!hdr) + goto error_gk; + + hdr->epoch = htonl(call->conn->proto.epoch); + hdr->cid = htonl(call->cid); + hdr->call_number = htonl(call->call_id); + hdr->seq = htonl(txb->seq); + hdr->sec_index = htonl(call->security_ix); + hdr->data_len = htonl(txb->len); + metadata.len = sizeof(*hdr); + metadata.data = hdr; + + sg_init_table(sg, 1); + sg_set_buf(&sg[0], txb->data, txb->alloc_size); + + ret = crypto_krb5_get_mic(gk->krb5, gk->tx_Kc, &metadata, + sg, 1, txb->alloc_size, + txb->crypto_header, txb->sec_header + txb->len); + if (ret >= 0) { + txb->pkt_len = ret; + if (txb->alloc_size == RXRPC_JUMBO_DATALEN) + txb->jumboable = true; + gk->bytes_remaining -= ret; + } + kfree(hdr); +error_gk: + rxgk_put(gk); + _leave(" = %d", ret); + return ret; +} + +/* + * wholly encrypt a packet (level 2 security) + */ +static int rxgk_secure_packet_encrypted(const struct rxrpc_call *call, + struct rxgk_context *gk, + struct rxrpc_txbuf *txb) +{ + struct rxgk_header *hdr; + struct scatterlist sg[1]; + int ret; + + _enter("%x", txb->len); + + /* Insert the header into the buffer. */ + hdr = txb->data + txb->crypto_header; + hdr->epoch = htonl(call->conn->proto.epoch); + hdr->cid = htonl(call->cid); + hdr->call_number = htonl(call->call_id); + hdr->seq = htonl(txb->seq); + hdr->sec_index = htonl(call->security_ix); + hdr->data_len = htonl(txb->len); + + sg_init_table(sg, 1); + sg_set_buf(&sg[0], txb->data, txb->alloc_size); + + ret = crypto_krb5_encrypt(gk->krb5, gk->tx_enc, + sg, 1, txb->alloc_size, + txb->crypto_header, txb->sec_header + txb->len, + false); + if (ret >= 0) { + txb->pkt_len = ret; + if (txb->alloc_size == RXRPC_JUMBO_DATALEN) + txb->jumboable = true; + gk->bytes_remaining -= ret; + } + + rxgk_put(gk); + _leave(" = %d", ret); + return ret; +} + +/* + * checksum an RxRPC packet header + */ +static int rxgk_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) +{ + struct rxgk_context *gk; + int ret; + + _enter("{%d{%x}},{#%u},%u,", + call->debug_id, key_serial(call->conn->key), txb->seq, txb->len); + + gk = rxgk_get_key(call->conn, NULL); + if (IS_ERR(gk)) + return PTR_ERR(gk) == -ESTALE ? -EKEYREJECTED : PTR_ERR(gk); + + ret = key_validate(call->conn->key); + if (ret < 0) { + rxgk_put(gk); + return ret; + } + + call->security_enctype = gk->krb5->etype; + txb->cksum = htons(gk->key_number); + + switch (call->conn->security_level) { + case RXRPC_SECURITY_PLAIN: + rxgk_put(gk); + txb->pkt_len = txb->len; + return 0; + case RXRPC_SECURITY_AUTH: + return rxgk_secure_packet_integrity(call, gk, txb); + case RXRPC_SECURITY_ENCRYPT: + return rxgk_secure_packet_encrypted(call, gk, txb); + default: + rxgk_put(gk); + return -EPERM; + } +} + +/* + * Integrity mode (check the signature on a packet - level 1 security) + */ +static int rxgk_verify_packet_integrity(struct rxrpc_call *call, + struct rxgk_context *gk, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxgk_header *hdr; + struct krb5_buffer metadata; + unsigned int offset = sp->offset, len = sp->len; + size_t data_offset = 0, data_len = len; + u32 ac; + int ret = -ENOMEM; + + _enter(""); + + crypto_krb5_where_is_the_data(gk->krb5, KRB5_CHECKSUM_MODE, + &data_offset, &data_len); + + hdr = kzalloc(sizeof(*hdr), GFP_NOFS); + if (!hdr) + goto put_gk; + + hdr->epoch = htonl(call->conn->proto.epoch); + hdr->cid = htonl(call->cid); + hdr->call_number = htonl(call->call_id); + hdr->seq = htonl(sp->hdr.seq); + hdr->sec_index = htonl(call->security_ix); + hdr->data_len = htonl(data_len); + + metadata.len = sizeof(*hdr); + metadata.data = hdr; + ret = rxgk_verify_mic_skb(gk->krb5, gk->rx_Kc, &metadata, + skb, &offset, &len, &ac); + kfree(hdr); + if (ret == -EPROTO) { + rxrpc_abort_eproto(call, skb, ac, + rxgk_abort_1_verify_mic_eproto); + } else { + sp->offset = offset; + sp->len = len; + } + +put_gk: + rxgk_put(gk); + _leave(" = %d", ret); + return ret; +} + +/* + * Decrypt an encrypted packet (level 2 security). + */ +static int rxgk_verify_packet_encrypted(struct rxrpc_call *call, + struct rxgk_context *gk, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxgk_header hdr; + unsigned int offset = sp->offset, len = sp->len; + int ret; + u32 ac; + + _enter(""); + + ret = rxgk_decrypt_skb(gk->krb5, gk->rx_enc, skb, &offset, &len, &ac); + if (ret == -EPROTO) + rxrpc_abort_eproto(call, skb, ac, rxgk_abort_2_decrypt_eproto); + if (ret < 0) + goto error; + + if (len < sizeof(hdr)) { + ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT, + rxgk_abort_2_short_header); + goto error; + } + + /* Extract the header from the skb */ + ret = skb_copy_bits(skb, offset, &hdr, sizeof(hdr)); + if (ret < 0) { + ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT, + rxgk_abort_2_short_encdata); + goto error; + } + offset += sizeof(hdr); + len -= sizeof(hdr); + + if (ntohl(hdr.epoch) != call->conn->proto.epoch || + ntohl(hdr.cid) != call->cid || + ntohl(hdr.call_number) != call->call_id || + ntohl(hdr.seq) != sp->hdr.seq || + ntohl(hdr.sec_index) != call->security_ix || + ntohl(hdr.data_len) > len) { + ret = rxrpc_abort_eproto(call, skb, RXGK_SEALEDINCON, + rxgk_abort_2_short_data); + goto error; + } + + sp->offset = offset; + sp->len = ntohl(hdr.data_len); + ret = 0; +error: + rxgk_put(gk); + _leave(" = %d", ret); + return ret; +} + +/* + * Verify the security on a received packet or subpacket (if part of a + * jumbo packet). + */ +static int rxgk_verify_packet(struct rxrpc_call *call, struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxgk_context *gk; + u16 key_number = sp->hdr.cksum; + + _enter("{%d{%x}},{#%u}", + call->debug_id, key_serial(call->conn->key), sp->hdr.seq); + + gk = rxgk_get_key(call->conn, &key_number); + if (IS_ERR(gk)) { + switch (PTR_ERR(gk)) { + case -ESTALE: + return rxrpc_abort_eproto(call, skb, RXGK_BADKEYNO, + rxgk_abort_bad_key_number); + default: + return PTR_ERR(gk); + } + } + + call->security_enctype = gk->krb5->etype; + switch (call->conn->security_level) { + case RXRPC_SECURITY_PLAIN: + rxgk_put(gk); + return 0; + case RXRPC_SECURITY_AUTH: + return rxgk_verify_packet_integrity(call, gk, skb); + case RXRPC_SECURITY_ENCRYPT: + return rxgk_verify_packet_encrypted(call, gk, skb); + default: + rxgk_put(gk); + return -ENOANO; + } +} + +/* + * Allocate memory to hold a challenge or a response packet. We're not running + * in the io_thread, so we can't use ->tx_alloc. + */ +static struct page *rxgk_alloc_packet(size_t total_len) +{ + gfp_t gfp = GFP_NOFS; + int order; + + order = get_order(total_len); + if (order > 0) + gfp |= __GFP_COMP; + return alloc_pages(gfp, order); +} + +/* + * Issue a challenge. + */ +static int rxgk_issue_challenge(struct rxrpc_connection *conn) +{ + struct rxrpc_wire_header *whdr; + struct bio_vec bvec[1]; + struct msghdr msg; + struct page *page; + size_t len = sizeof(*whdr) + sizeof(conn->rxgk.nonce); + u32 serial; + int ret; + + _enter("{%d}", conn->debug_id); + + get_random_bytes(&conn->rxgk.nonce, sizeof(conn->rxgk.nonce)); + + /* We can't use conn->tx_alloc without a lock */ + page = rxgk_alloc_packet(sizeof(*whdr) + sizeof(conn->rxgk.nonce)); + if (!page) + return -ENOMEM; + + bvec_set_page(&bvec[0], page, len, 0); + iov_iter_bvec(&msg.msg_iter, WRITE, bvec, 1, len); + + msg.msg_name = &conn->peer->srx.transport; + msg.msg_namelen = conn->peer->srx.transport_len; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = MSG_SPLICE_PAGES; + + whdr = page_address(page); + whdr->epoch = htonl(conn->proto.epoch); + whdr->cid = htonl(conn->proto.cid); + whdr->callNumber = 0; + whdr->seq = 0; + whdr->type = RXRPC_PACKET_TYPE_CHALLENGE; + whdr->flags = conn->out_clientflag; + whdr->userStatus = 0; + whdr->securityIndex = conn->security_ix; + whdr->_rsvd = 0; + whdr->serviceId = htons(conn->service_id); + + memcpy(whdr + 1, conn->rxgk.nonce, sizeof(conn->rxgk.nonce)); + + serial = rxrpc_get_next_serials(conn, 1); + whdr->serial = htonl(serial); + + trace_rxrpc_tx_challenge(conn, serial, 0, *(u32 *)&conn->rxgk.nonce); + + ret = do_udp_sendmsg(conn->local->socket, &msg, len); + if (ret > 0) + conn->peer->last_tx_at = ktime_get_seconds(); + __free_page(page); + + if (ret < 0) { + trace_rxrpc_tx_fail(conn->debug_id, serial, ret, + rxrpc_tx_point_rxgk_challenge); + return -EAGAIN; + } + + trace_rxrpc_tx_packet(conn->debug_id, whdr, + rxrpc_tx_point_rxgk_challenge); + _leave(" = 0"); + return 0; +} + +/* + * Validate a challenge packet. + */ +static bool rxgk_validate_challenge(struct rxrpc_connection *conn, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + u8 nonce[20]; + + if (!conn->key) { + rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, + rxgk_abort_chall_no_key); + return false; + } + + if (key_validate(conn->key) < 0) { + rxrpc_abort_conn(conn, skb, RXGK_EXPIRED, -EPROTO, + rxgk_abort_chall_key_expired); + return false; + } + + if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), + nonce, sizeof(nonce)) < 0) { + rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_chall_short); + return false; + } + + trace_rxrpc_rx_challenge(conn, sp->hdr.serial, 0, *(u32 *)nonce, 0); + return true; +} + +/** + * rxgk_kernel_query_challenge - Query RxGK-specific challenge parameters + * @challenge: The challenge packet to query + * + * Return: The Kerberos 5 encoding type for the challenged connection. + */ +u32 rxgk_kernel_query_challenge(struct sk_buff *challenge) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(challenge); + + return sp->chall.conn->rxgk.enctype; +} +EXPORT_SYMBOL(rxgk_kernel_query_challenge); + +/* + * Fill out the control message to pass to userspace to inform about the + * challenge. + */ +static int rxgk_challenge_to_recvmsg(struct rxrpc_connection *conn, + struct sk_buff *challenge, + struct msghdr *msg) +{ + struct rxgk_challenge chall; + + chall.base.service_id = conn->service_id; + chall.base.security_index = conn->security_ix; + chall.enctype = conn->rxgk.enctype; + + return put_cmsg(msg, SOL_RXRPC, RXRPC_CHALLENGED, sizeof(chall), &chall); +} + +/* + * Insert the requisite amount of XDR padding for the length given. + */ +static int rxgk_pad_out(struct sk_buff *response, size_t len, size_t offset) +{ + __be32 zero = 0; + size_t pad = xdr_round_up(len) - len; + int ret; + + if (!pad) + return 0; + + ret = skb_store_bits(response, offset, &zero, pad); + if (ret < 0) + return ret; + return pad; +} + +/* + * Insert the header into the response. + */ +static noinline ssize_t rxgk_insert_response_header(struct rxrpc_connection *conn, + struct rxgk_context *gk, + struct sk_buff *response, + size_t offset) +{ + struct rxrpc_skb_priv *rsp = rxrpc_skb(response); + + struct { + struct rxrpc_wire_header whdr; + __be32 start_time_msw; + __be32 start_time_lsw; + __be32 ticket_len; + } h; + int ret; + + rsp->resp.kvno = gk->key_number; + rsp->resp.version = gk->krb5->etype; + + h.whdr.epoch = htonl(conn->proto.epoch); + h.whdr.cid = htonl(conn->proto.cid); + h.whdr.callNumber = 0; + h.whdr.serial = 0; + h.whdr.seq = 0; + h.whdr.type = RXRPC_PACKET_TYPE_RESPONSE; + h.whdr.flags = conn->out_clientflag; + h.whdr.userStatus = 0; + h.whdr.securityIndex = conn->security_ix; + h.whdr.cksum = htons(gk->key_number); + h.whdr.serviceId = htons(conn->service_id); + h.start_time_msw = htonl(upper_32_bits(conn->rxgk.start_time)); + h.start_time_lsw = htonl(lower_32_bits(conn->rxgk.start_time)); + h.ticket_len = htonl(gk->key->ticket.len); + + ret = skb_store_bits(response, offset, &h, sizeof(h)); + return ret < 0 ? ret : sizeof(h); +} + +/* + * Construct the authenticator to go in the response packet + * + * struct RXGK_Authenticator { + * opaque nonce[20]; + * opaque appdata<>; + * RXGK_Level level; + * unsigned int epoch; + * unsigned int cid; + * unsigned int call_numbers<>; + * }; + */ +static ssize_t rxgk_construct_authenticator(struct rxrpc_connection *conn, + struct sk_buff *challenge, + const struct krb5_buffer *appdata, + struct sk_buff *response, + size_t offset) +{ + struct { + u8 nonce[20]; + __be32 appdata_len; + } a; + struct { + __be32 level; + __be32 epoch; + __be32 cid; + __be32 call_numbers_count; + __be32 call_numbers[4]; + } b; + int ret; + + ret = skb_copy_bits(challenge, sizeof(struct rxrpc_wire_header), + a.nonce, sizeof(a.nonce)); + if (ret < 0) + return -EPROTO; + + a.appdata_len = htonl(appdata->len); + + ret = skb_store_bits(response, offset, &a, sizeof(a)); + if (ret < 0) + return ret; + offset += sizeof(a); + + if (appdata->len) { + ret = skb_store_bits(response, offset, appdata->data, appdata->len); + if (ret < 0) + return ret; + offset += appdata->len; + + ret = rxgk_pad_out(response, appdata->len, offset); + if (ret < 0) + return ret; + offset += ret; + } + + b.level = htonl(conn->security_level); + b.epoch = htonl(conn->proto.epoch); + b.cid = htonl(conn->proto.cid); + b.call_numbers_count = htonl(4); + b.call_numbers[0] = htonl(conn->channels[0].call_counter); + b.call_numbers[1] = htonl(conn->channels[1].call_counter); + b.call_numbers[2] = htonl(conn->channels[2].call_counter); + b.call_numbers[3] = htonl(conn->channels[3].call_counter); + + ret = skb_store_bits(response, offset, &b, sizeof(b)); + if (ret < 0) + return ret; + return sizeof(a) + xdr_round_up(appdata->len) + sizeof(b); +} + +static ssize_t rxgk_encrypt_authenticator(struct rxrpc_connection *conn, + struct rxgk_context *gk, + struct sk_buff *response, + size_t offset, + size_t alloc_len, + size_t auth_offset, + size_t auth_len) +{ + struct scatterlist sg[16]; + int nr_sg; + + sg_init_table(sg, ARRAY_SIZE(sg)); + nr_sg = skb_to_sgvec(response, sg, offset, alloc_len); + if (unlikely(nr_sg < 0)) + return nr_sg; + return crypto_krb5_encrypt(gk->krb5, gk->resp_enc, sg, nr_sg, alloc_len, + auth_offset, auth_len, false); +} + +/* + * Construct the response. + * + * struct RXGK_Response { + * rxgkTime start_time; + * RXGK_Data token; + * opaque authenticator<RXGK_MAXAUTHENTICATOR> + * }; + */ +static int rxgk_construct_response(struct rxrpc_connection *conn, + struct sk_buff *challenge, + struct krb5_buffer *appdata) +{ + struct rxrpc_skb_priv *csp, *rsp; + struct rxgk_context *gk; + struct sk_buff *response; + size_t len, auth_len, authx_len, offset, auth_offset, authx_offset; + __be32 tmp; + int ret; + + gk = rxgk_get_key(conn, NULL); + if (IS_ERR(gk)) + return PTR_ERR(gk); + + auth_len = 20 + (4 + appdata->len) + 12 + (1 + 4) * 4; + authx_len = crypto_krb5_how_much_buffer(gk->krb5, KRB5_ENCRYPT_MODE, + auth_len, &auth_offset); + len = sizeof(struct rxrpc_wire_header) + + 8 + (4 + xdr_round_up(gk->key->ticket.len)) + (4 + authx_len); + + response = alloc_skb_with_frags(0, len, 0, &ret, GFP_NOFS); + if (!response) + goto error; + rxrpc_new_skb(response, rxrpc_skb_new_response_rxgk); + response->len = len; + response->data_len = len; + + ret = rxgk_insert_response_header(conn, gk, response, 0); + if (ret < 0) + goto error; + offset = ret; + + ret = skb_store_bits(response, offset, gk->key->ticket.data, gk->key->ticket.len); + if (ret < 0) + goto error; + offset += gk->key->ticket.len; + ret = rxgk_pad_out(response, gk->key->ticket.len, offset); + if (ret < 0) + goto error; + + authx_offset = offset + ret + 4; /* Leave a gap for the length. */ + + ret = rxgk_construct_authenticator(conn, challenge, appdata, response, + authx_offset + auth_offset); + if (ret < 0) + goto error; + auth_len = ret; + + ret = rxgk_encrypt_authenticator(conn, gk, response, + authx_offset, authx_len, + auth_offset, auth_len); + if (ret < 0) + goto error; + authx_len = ret; + + tmp = htonl(authx_len); + ret = skb_store_bits(response, authx_offset - 4, &tmp, 4); + if (ret < 0) + goto error; + + ret = rxgk_pad_out(response, authx_len, authx_offset + authx_len); + if (ret < 0) + goto error; + len = authx_offset + authx_len + ret; + + if (len != response->len) { + response->len = len; + response->data_len = len; + } + + csp = rxrpc_skb(challenge); + rsp = rxrpc_skb(response); + rsp->resp.len = len; + rsp->resp.challenge_serial = csp->hdr.serial; + rxrpc_post_response(conn, response); + response = NULL; + ret = 0; + +error: + rxrpc_free_skb(response, rxrpc_skb_put_response); + rxgk_put(gk); + _leave(" = %d", ret); + return ret; +} + +/* + * Respond to a challenge packet. + */ +static int rxgk_respond_to_challenge(struct rxrpc_connection *conn, + struct sk_buff *challenge, + struct krb5_buffer *appdata) +{ + _enter("{%d,%x}", conn->debug_id, key_serial(conn->key)); + + if (key_validate(conn->key) < 0) + return rxrpc_abort_conn(conn, NULL, RXGK_EXPIRED, -EPROTO, + rxgk_abort_chall_key_expired); + + return rxgk_construct_response(conn, challenge, appdata); +} + +static int rxgk_respond_to_challenge_no_appdata(struct rxrpc_connection *conn, + struct sk_buff *challenge) +{ + struct krb5_buffer appdata = {}; + + return rxgk_respond_to_challenge(conn, challenge, &appdata); +} + +/** + * rxgk_kernel_respond_to_challenge - Respond to a challenge with appdata + * @challenge: The challenge to respond to + * @appdata: The application data to include in the RESPONSE authenticator + * + * Allow a kernel application to respond to a CHALLENGE with application data + * to be included in the RxGK RESPONSE Authenticator. + * + * Return: %0 if successful and a negative error code otherwise. + */ +int rxgk_kernel_respond_to_challenge(struct sk_buff *challenge, + struct krb5_buffer *appdata) +{ + struct rxrpc_skb_priv *csp = rxrpc_skb(challenge); + + return rxgk_respond_to_challenge(csp->chall.conn, challenge, appdata); +} +EXPORT_SYMBOL(rxgk_kernel_respond_to_challenge); + +/* + * Parse sendmsg() control message and respond to challenge. We need to see if + * there's an appdata to fish out. + */ +static int rxgk_sendmsg_respond_to_challenge(struct sk_buff *challenge, + struct msghdr *msg) +{ + struct krb5_buffer appdata = {}; + struct cmsghdr *cmsg; + + for_each_cmsghdr(cmsg, msg) { + if (cmsg->cmsg_level != SOL_RXRPC || + cmsg->cmsg_type != RXRPC_RESP_RXGK_APPDATA) + continue; + if (appdata.data) + return -EINVAL; + appdata.data = CMSG_DATA(cmsg); + appdata.len = cmsg->cmsg_len - sizeof(struct cmsghdr); + } + + return rxgk_kernel_respond_to_challenge(challenge, &appdata); +} + +/* + * Verify the authenticator. + * + * struct RXGK_Authenticator { + * opaque nonce[20]; + * opaque appdata<>; + * RXGK_Level level; + * unsigned int epoch; + * unsigned int cid; + * unsigned int call_numbers<>; + * }; + */ +static int rxgk_do_verify_authenticator(struct rxrpc_connection *conn, + const struct krb5_enctype *krb5, + struct sk_buff *skb, + __be32 *p, __be32 *end) +{ + u32 app_len, call_count, level, epoch, cid, i; + + _enter(""); + + if (memcmp(p, conn->rxgk.nonce, 20) != 0) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_bad_nonce); + p += 20 / sizeof(__be32); + + app_len = ntohl(*p++); + if (app_len > (end - p) * sizeof(__be32)) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_short_applen); + + p += xdr_round_up(app_len) / sizeof(__be32); + if (end - p < 4) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_short_applen); + + level = ntohl(*p++); + epoch = ntohl(*p++); + cid = ntohl(*p++); + call_count = ntohl(*p++); + + if (level != conn->security_level || + epoch != conn->proto.epoch || + cid != conn->proto.cid || + call_count > 4) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_bad_param); + + if (end - p < call_count) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_short_call_list); + + for (i = 0; i < call_count; i++) { + u32 call_id = ntohl(*p++); + + if (call_id > INT_MAX) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_bad_callid); + + if (call_id < conn->channels[i].call_counter) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_call_ctr); + + if (call_id > conn->channels[i].call_counter) { + if (conn->channels[i].call) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_call_state); + + conn->channels[i].call_counter = call_id; + } + } + + _leave(" = 0"); + return 0; +} + +/* + * Extract the authenticator and verify it. + */ +static int rxgk_verify_authenticator(struct rxrpc_connection *conn, + const struct krb5_enctype *krb5, + struct sk_buff *skb, + unsigned int auth_offset, unsigned int auth_len) +{ + void *auth; + __be32 *p; + int ret; + + auth = kmalloc(auth_len, GFP_NOFS); + if (!auth) + return -ENOMEM; + + ret = skb_copy_bits(skb, auth_offset, auth, auth_len); + if (ret < 0) { + ret = rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_short_auth); + goto error; + } + + p = auth; + ret = rxgk_do_verify_authenticator(conn, krb5, skb, p, p + auth_len); +error: + kfree(auth); + return ret; +} + +/* + * Verify a response. + * + * struct RXGK_Response { + * rxgkTime start_time; + * RXGK_Data token; + * opaque authenticator<RXGK_MAXAUTHENTICATOR> + * }; + */ +static int rxgk_verify_response(struct rxrpc_connection *conn, + struct sk_buff *skb) +{ + const struct krb5_enctype *krb5; + struct rxrpc_key_token *token; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxgk_response rhdr; + struct rxgk_context *gk; + struct key *key = NULL; + unsigned int offset = sizeof(struct rxrpc_wire_header); + unsigned int len = skb->len - sizeof(struct rxrpc_wire_header); + unsigned int token_offset, token_len; + unsigned int auth_offset, auth_len; + __be32 xauth_len; + int ret, ec; + + _enter("{%d}", conn->debug_id); + + /* Parse the RXGK_Response object */ + if (sizeof(rhdr) + sizeof(__be32) > len) + goto short_packet; + + if (skb_copy_bits(skb, offset, &rhdr, sizeof(rhdr)) < 0) + goto short_packet; + offset += sizeof(rhdr); + len -= sizeof(rhdr); + + token_offset = offset; + token_len = ntohl(rhdr.token_len); + if (xdr_round_up(token_len) + sizeof(__be32) > len) + goto short_packet; + + trace_rxrpc_rx_response(conn, sp->hdr.serial, 0, sp->hdr.cksum, token_len); + + offset += xdr_round_up(token_len); + len -= xdr_round_up(token_len); + + if (skb_copy_bits(skb, offset, &xauth_len, sizeof(xauth_len)) < 0) + goto short_packet; + offset += sizeof(xauth_len); + len -= sizeof(xauth_len); + + auth_offset = offset; + auth_len = ntohl(xauth_len); + if (auth_len < len) + goto short_packet; + if (auth_len & 3) + goto inconsistent; + if (auth_len < 20 + 9 * 4) + goto auth_too_short; + + /* We need to extract and decrypt the token and instantiate a session + * key for it. This bit, however, is application-specific. If + * possible, we use a default parser, but we might end up bumping this + * to the app to deal with - which might mean a round trip to + * userspace. + */ + ret = rxgk_extract_token(conn, skb, token_offset, token_len, &key); + if (ret < 0) + goto out; + + /* We now have a key instantiated from the decrypted ticket. We can + * pass this to the application so that they can parse the ticket + * content and we can use the session key it contains to derive the + * keys we need. + * + * Note that we have to switch enctype at this point as the enctype of + * the ticket doesn't necessarily match that of the transport. + */ + token = key->payload.data[0]; + conn->security_level = token->rxgk->level; + conn->rxgk.start_time = __be64_to_cpu(rhdr.start_time); + + gk = rxgk_generate_transport_key(conn, token->rxgk, sp->hdr.cksum, GFP_NOFS); + if (IS_ERR(gk)) { + ret = PTR_ERR(gk); + goto cant_get_token; + } + + krb5 = gk->krb5; + + trace_rxrpc_rx_response(conn, sp->hdr.serial, krb5->etype, sp->hdr.cksum, token_len); + + /* Decrypt, parse and verify the authenticator. */ + ret = rxgk_decrypt_skb(krb5, gk->resp_enc, skb, + &auth_offset, &auth_len, &ec); + if (ret < 0) { + rxrpc_abort_conn(conn, skb, RXGK_SEALEDINCON, ret, + rxgk_abort_resp_auth_dec); + goto out; + } + + ret = rxgk_verify_authenticator(conn, krb5, skb, auth_offset, auth_len); + if (ret < 0) + goto out; + + conn->key = key; + key = NULL; + ret = 0; +out: + key_put(key); + _leave(" = %d", ret); + return ret; + +inconsistent: + ret = rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO, + rxgk_abort_resp_xdr_align); + goto out; +auth_too_short: + ret = rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_resp_short_auth); + goto out; +short_packet: + ret = rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_resp_short_packet); + goto out; + +cant_get_token: + switch (ret) { + case -ENOMEM: + goto temporary_error; + case -EINVAL: + ret = rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EKEYREJECTED, + rxgk_abort_resp_internal_error); + goto out; + case -ENOPKG: + ret = rxrpc_abort_conn(conn, skb, KRB5_PROG_KEYTYPE_NOSUPP, + -EKEYREJECTED, rxgk_abort_resp_nopkg); + goto out; + } + +temporary_error: + /* Ignore the response packet if we got a temporary error such as + * ENOMEM. We just want to send the challenge again. Note that we + * also come out this way if the ticket decryption fails. + */ + goto out; +} + +/* + * clear the connection security + */ +static void rxgk_clear(struct rxrpc_connection *conn) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(conn->rxgk.keys); i++) + rxgk_put(conn->rxgk.keys[i]); +} + +/* + * Initialise the RxGK security service. + */ +static int rxgk_init(void) +{ + return 0; +} + +/* + * Clean up the RxGK security service. + */ +static void rxgk_exit(void) +{ +} + +/* + * RxRPC YFS GSSAPI-based security + */ +const struct rxrpc_security rxgk_yfs = { + .name = "yfs-rxgk", + .security_index = RXRPC_SECURITY_YFS_RXGK, + .no_key_abort = RXGK_NOTAUTH, + .init = rxgk_init, + .exit = rxgk_exit, + .preparse_server_key = rxgk_preparse_server_key, + .free_preparse_server_key = rxgk_free_preparse_server_key, + .destroy_server_key = rxgk_destroy_server_key, + .describe_server_key = rxgk_describe_server_key, + .init_connection_security = rxgk_init_connection_security, + .alloc_txbuf = rxgk_alloc_txbuf, + .secure_packet = rxgk_secure_packet, + .verify_packet = rxgk_verify_packet, + .free_call_crypto = rxgk_free_call_crypto, + .issue_challenge = rxgk_issue_challenge, + .validate_challenge = rxgk_validate_challenge, + .challenge_to_recvmsg = rxgk_challenge_to_recvmsg, + .sendmsg_respond_to_challenge = rxgk_sendmsg_respond_to_challenge, + .respond_to_challenge = rxgk_respond_to_challenge_no_appdata, + .verify_response = rxgk_verify_response, + .clear = rxgk_clear, + .default_decode_ticket = rxgk_yfs_decode_ticket, +}; diff --git a/net/rxrpc/rxgk_app.c b/net/rxrpc/rxgk_app.c new file mode 100644 index 000000000000..b94b77a1c317 --- /dev/null +++ b/net/rxrpc/rxgk_app.c @@ -0,0 +1,286 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Application-specific bits for GSSAPI-based RxRPC security + * + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/net.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/key-type.h> +#include "ar-internal.h" +#include "rxgk_common.h" + +/* + * Decode a default-style YFS ticket in a response and turn it into an + * rxrpc-type key. + * + * struct rxgk_key { + * afs_uint32 enctype; + * opaque key<>; + * }; + * + * struct RXGK_AuthName { + * afs_int32 kind; + * opaque data<AUTHDATAMAX>; + * opaque display<AUTHPRINTABLEMAX>; + * }; + * + * struct RXGK_Token { + * rxgk_key K0; + * RXGK_Level level; + * rxgkTime starttime; + * afs_int32 lifetime; + * afs_int32 bytelife; + * rxgkTime expirationtime; + * struct RXGK_AuthName identities<>; + * }; + */ +int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb, + unsigned int ticket_offset, unsigned int ticket_len, + struct key **_key) +{ + struct rxrpc_key_token *token; + const struct cred *cred = current_cred(); // TODO - use socket creds + struct key *key; + size_t pre_ticket_len, payload_len; + unsigned int klen, enctype; + void *payload, *ticket; + __be32 *t, *p, *q, tmp[2]; + int ret; + + _enter(""); + + /* Get the session key length */ + ret = skb_copy_bits(skb, ticket_offset, tmp, sizeof(tmp)); + if (ret < 0) + return rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO, + rxgk_abort_resp_short_yfs_klen); + enctype = ntohl(tmp[0]); + klen = ntohl(tmp[1]); + + if (klen > ticket_len - 10 * sizeof(__be32)) + return rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO, + rxgk_abort_resp_short_yfs_key); + + pre_ticket_len = ((5 + 14) * sizeof(__be32) + + xdr_round_up(klen) + + sizeof(__be32)); + payload_len = pre_ticket_len + xdr_round_up(ticket_len); + + payload = kzalloc(payload_len, GFP_NOFS); + if (!payload) + return -ENOMEM; + + /* We need to fill out the XDR form for a key payload that we can pass + * to add_key(). Start by copying in the ticket so that we can parse + * it. + */ + ticket = payload + pre_ticket_len; + ret = skb_copy_bits(skb, ticket_offset, ticket, ticket_len); + if (ret < 0) { + ret = rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO, + rxgk_abort_resp_short_yfs_tkt); + goto error; + } + + /* Fill out the form header. */ + p = payload; + p[0] = htonl(0); /* Flags */ + p[1] = htonl(1); /* len(cellname) */ + p[2] = htonl(0x20000000); /* Cellname " " */ + p[3] = htonl(1); /* #tokens */ + p[4] = htonl(15 * sizeof(__be32) + xdr_round_up(klen) + + xdr_round_up(ticket_len)); /* Token len */ + + /* Now fill in the body. Most of this we can just scrape directly from + * the ticket. + */ + t = ticket + sizeof(__be32) * 2 + xdr_round_up(klen); + q = payload + 5 * sizeof(__be32); + q[0] = htonl(RXRPC_SECURITY_YFS_RXGK); + q[1] = t[1]; /* begintime - msw */ + q[2] = t[2]; /* - lsw */ + q[3] = t[5]; /* endtime - msw */ + q[4] = t[6]; /* - lsw */ + q[5] = 0; /* level - msw */ + q[6] = t[0]; /* - lsw */ + q[7] = 0; /* lifetime - msw */ + q[8] = t[3]; /* - lsw */ + q[9] = 0; /* bytelife - msw */ + q[10] = t[4]; /* - lsw */ + q[11] = 0; /* enctype - msw */ + q[12] = htonl(enctype); /* - lsw */ + q[13] = htonl(klen); /* Key length */ + + q += 14; + + memcpy(q, ticket + sizeof(__be32) * 2, klen); + q += xdr_round_up(klen) / 4; + q[0] = htonl(ticket_len); + q++; + if (WARN_ON((unsigned long)q != (unsigned long)ticket)) { + ret = -EIO; + goto error; + } + + /* Ticket read in with skb_copy_bits above */ + q += xdr_round_up(ticket_len) / 4; + if (WARN_ON((unsigned long)q - (unsigned long)payload != payload_len)) { + ret = -EIO; + goto error; + } + + /* Now turn that into a key. */ + key = key_alloc(&key_type_rxrpc, "x", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, // TODO: Use socket owner + KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA, NULL); + if (IS_ERR(key)) { + _leave(" = -ENOMEM [alloc %ld]", PTR_ERR(key)); + ret = PTR_ERR(key); + goto error; + } + + _debug("key %d", key_serial(key)); + + ret = key_instantiate_and_link(key, payload, payload_len, NULL, NULL); + if (ret < 0) + goto error_key; + + token = key->payload.data[0]; + token->no_leak_key = true; + *_key = key; + key = NULL; + ret = 0; + goto error; + +error_key: + key_put(key); +error: + kfree_sensitive(payload); + _leave(" = %d", ret); + return ret; +} + +/* + * Extract the token and set up a session key from the details. + * + * struct RXGK_TokenContainer { + * afs_int32 kvno; + * afs_int32 enctype; + * opaque encrypted_token<>; + * }; + * + * [tools.ietf.org/html/draft-wilkinson-afs3-rxgk-afs-08 sec 6.1] + */ +int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, + unsigned int token_offset, unsigned int token_len, + struct key **_key) +{ + const struct krb5_enctype *krb5; + const struct krb5_buffer *server_secret; + struct crypto_aead *token_enc = NULL; + struct key *server_key; + unsigned int ticket_offset, ticket_len; + u32 kvno, enctype; + int ret, ec; + + struct { + __be32 kvno; + __be32 enctype; + __be32 token_len; + } container; + + /* Decode the RXGK_TokenContainer object. This tells us which server + * key we should be using. We can then fetch the key, get the secret + * and set up the crypto to extract the token. + */ + if (skb_copy_bits(skb, token_offset, &container, sizeof(container)) < 0) + return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_resp_tok_short); + + kvno = ntohl(container.kvno); + enctype = ntohl(container.enctype); + ticket_len = ntohl(container.token_len); + ticket_offset = token_offset + sizeof(container); + + if (xdr_round_up(ticket_len) > token_len - 3 * 4) + return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_resp_tok_short); + + _debug("KVNO %u", kvno); + _debug("ENC %u", enctype); + _debug("TLEN %u", ticket_len); + + server_key = rxrpc_look_up_server_security(conn, skb, kvno, enctype); + if (IS_ERR(server_key)) + goto cant_get_server_key; + + down_read(&server_key->sem); + server_secret = (const void *)&server_key->payload.data[2]; + ret = rxgk_set_up_token_cipher(server_secret, &token_enc, enctype, &krb5, GFP_NOFS); + up_read(&server_key->sem); + key_put(server_key); + if (ret < 0) + goto cant_get_token; + + /* We can now decrypt and parse the token/ticket. This allows us to + * gain access to K0, from which we can derive the transport key and + * thence decode the authenticator. + */ + ret = rxgk_decrypt_skb(krb5, token_enc, skb, + &ticket_offset, &ticket_len, &ec); + crypto_free_aead(token_enc); + token_enc = NULL; + if (ret < 0) + return rxrpc_abort_conn(conn, skb, ec, ret, + rxgk_abort_resp_tok_dec); + + ret = conn->security->default_decode_ticket(conn, skb, ticket_offset, + ticket_len, _key); + if (ret < 0) + goto cant_get_token; + + _leave(" = 0"); + return ret; + +cant_get_server_key: + ret = PTR_ERR(server_key); + switch (ret) { + case -ENOMEM: + goto temporary_error; + case -ENOKEY: + case -EKEYREJECTED: + case -EKEYEXPIRED: + case -EKEYREVOKED: + case -EPERM: + return rxrpc_abort_conn(conn, skb, RXGK_BADKEYNO, -EKEYREJECTED, + rxgk_abort_resp_tok_nokey); + default: + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EKEYREJECTED, + rxgk_abort_resp_tok_keyerr); + } + +cant_get_token: + switch (ret) { + case -ENOMEM: + goto temporary_error; + case -EINVAL: + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EKEYREJECTED, + rxgk_abort_resp_tok_internal_error); + case -ENOPKG: + return rxrpc_abort_conn(conn, skb, KRB5_PROG_KEYTYPE_NOSUPP, + -EKEYREJECTED, rxgk_abort_resp_tok_nopkg); + } + +temporary_error: + /* Ignore the response packet if we got a temporary error such as + * ENOMEM. We just want to send the challenge again. Note that we + * also come out this way if the ticket decryption fails. + */ + return ret; +} diff --git a/net/rxrpc/rxgk_common.h b/net/rxrpc/rxgk_common.h new file mode 100644 index 000000000000..7370a5655985 --- /dev/null +++ b/net/rxrpc/rxgk_common.h @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Common bits for GSSAPI-based RxRPC security. + * + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <crypto/krb5.h> +#include <crypto/skcipher.h> +#include <crypto/hash.h> + +/* + * Per-key number context. This is replaced when the connection is rekeyed. + */ +struct rxgk_context { + refcount_t usage; + unsigned int key_number; /* Rekeying number (goes in the rx header) */ + unsigned long flags; +#define RXGK_TK_NEEDS_REKEY 0 /* Set if this needs rekeying */ + unsigned long expiry; /* Expiration time of this key */ + long long bytes_remaining; /* Remaining Tx lifetime of this key */ + const struct krb5_enctype *krb5; /* RxGK encryption type */ + const struct rxgk_key *key; + + /* We need up to 7 keys derived from the transport key, but we don't + * actually need the transport key. Each key is derived by + * DK(TK,constant). + */ + struct crypto_aead *tx_enc; /* Transmission key */ + struct crypto_aead *rx_enc; /* Reception key */ + struct crypto_shash *tx_Kc; /* Transmission checksum key */ + struct crypto_shash *rx_Kc; /* Reception checksum key */ + struct crypto_aead *resp_enc; /* Response packet enc key */ +}; + +#define xdr_round_up(x) (round_up((x), sizeof(__be32))) +#define xdr_object_len(x) (4 + xdr_round_up(x)) + +/* + * rxgk_app.c + */ +int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb, + unsigned int ticket_offset, unsigned int ticket_len, + struct key **_key); +int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, + unsigned int token_offset, unsigned int token_len, + struct key **_key); + +/* + * rxgk_kdf.c + */ +void rxgk_put(struct rxgk_context *gk); +struct rxgk_context *rxgk_generate_transport_key(struct rxrpc_connection *conn, + const struct rxgk_key *key, + unsigned int key_number, + gfp_t gfp); +int rxgk_set_up_token_cipher(const struct krb5_buffer *server_key, + struct crypto_aead **token_key, + unsigned int enctype, + const struct krb5_enctype **_krb5, + gfp_t gfp); + +/* + * Apply decryption and checksumming functions to part of an skbuff. The + * offset and length are updated to reflect the actual content of the encrypted + * region. + */ +static inline +int rxgk_decrypt_skb(const struct krb5_enctype *krb5, + struct crypto_aead *aead, + struct sk_buff *skb, + unsigned int *_offset, unsigned int *_len, + int *_error_code) +{ + struct scatterlist sg[16]; + size_t offset = 0, len = *_len; + int nr_sg, ret; + + sg_init_table(sg, ARRAY_SIZE(sg)); + nr_sg = skb_to_sgvec(skb, sg, *_offset, len); + if (unlikely(nr_sg < 0)) + return nr_sg; + + ret = crypto_krb5_decrypt(krb5, aead, sg, nr_sg, + &offset, &len); + switch (ret) { + case 0: + *_offset += offset; + *_len = len; + break; + case -EPROTO: + case -EBADMSG: + *_error_code = RXGK_SEALEDINCON; + break; + default: + break; + } + + return ret; +} + +/* + * Check the MIC on a region of an skbuff. The offset and length are updated + * to reflect the actual content of the secure region. + */ +static inline +int rxgk_verify_mic_skb(const struct krb5_enctype *krb5, + struct crypto_shash *shash, + const struct krb5_buffer *metadata, + struct sk_buff *skb, + unsigned int *_offset, unsigned int *_len, + u32 *_error_code) +{ + struct scatterlist sg[16]; + size_t offset = 0, len = *_len; + int nr_sg, ret; + + sg_init_table(sg, ARRAY_SIZE(sg)); + nr_sg = skb_to_sgvec(skb, sg, *_offset, len); + if (unlikely(nr_sg < 0)) + return nr_sg; + + ret = crypto_krb5_verify_mic(krb5, shash, metadata, sg, nr_sg, + &offset, &len); + switch (ret) { + case 0: + *_offset += offset; + *_len = len; + break; + case -EPROTO: + case -EBADMSG: + *_error_code = RXGK_SEALEDINCON; + break; + default: + break; + } + + return ret; +} diff --git a/net/rxrpc/rxgk_kdf.c b/net/rxrpc/rxgk_kdf.c new file mode 100644 index 000000000000..b4db5aa30e5b --- /dev/null +++ b/net/rxrpc/rxgk_kdf.c @@ -0,0 +1,288 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* RxGK transport key derivation. + * + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/key-type.h> +#include <linux/slab.h> +#include <keys/rxrpc-type.h> +#include "ar-internal.h" +#include "rxgk_common.h" + +#define round16(x) (((x) + 15) & ~15) + +/* + * Constants used to derive the keys and hmacs actually used for doing stuff. + */ +#define RXGK_CLIENT_ENC_PACKET 1026U // 0x402 +#define RXGK_CLIENT_MIC_PACKET 1027U // 0x403 +#define RXGK_SERVER_ENC_PACKET 1028U // 0x404 +#define RXGK_SERVER_MIC_PACKET 1029U // 0x405 +#define RXGK_CLIENT_ENC_RESPONSE 1030U // 0x406 +#define RXGK_SERVER_ENC_TOKEN 1036U // 0x40c + +static void rxgk_free(struct rxgk_context *gk) +{ + if (gk->tx_Kc) + crypto_free_shash(gk->tx_Kc); + if (gk->rx_Kc) + crypto_free_shash(gk->rx_Kc); + if (gk->tx_enc) + crypto_free_aead(gk->tx_enc); + if (gk->rx_enc) + crypto_free_aead(gk->rx_enc); + if (gk->resp_enc) + crypto_free_aead(gk->resp_enc); + kfree(gk); +} + +void rxgk_put(struct rxgk_context *gk) +{ + if (gk && refcount_dec_and_test(&gk->usage)) + rxgk_free(gk); +} + +/* + * Transport key derivation function. + * + * TK = random-to-key(PRF+(K0, L, + * epoch || cid || start_time || key_number)) + * [tools.ietf.org/html/draft-wilkinson-afs3-rxgk-11 sec 8.3] + */ +static int rxgk_derive_transport_key(struct rxrpc_connection *conn, + struct rxgk_context *gk, + const struct rxgk_key *rxgk, + struct krb5_buffer *TK, + gfp_t gfp) +{ + const struct krb5_enctype *krb5 = gk->krb5; + struct krb5_buffer conn_info; + unsigned int L = krb5->key_bytes; + __be32 *info; + u8 *buffer; + int ret; + + _enter(""); + + conn_info.len = sizeof(__be32) * 5; + + buffer = kzalloc(round16(conn_info.len), gfp); + if (!buffer) + return -ENOMEM; + + conn_info.data = buffer; + + info = (__be32 *)conn_info.data; + info[0] = htonl(conn->proto.epoch); + info[1] = htonl(conn->proto.cid); + info[2] = htonl(conn->rxgk.start_time >> 32); + info[3] = htonl(conn->rxgk.start_time >> 0); + info[4] = htonl(gk->key_number); + + ret = crypto_krb5_calc_PRFplus(krb5, &rxgk->key, L, &conn_info, TK, gfp); + kfree_sensitive(buffer); + _leave(" = %d", ret); + return ret; +} + +/* + * Set up the ciphers for the usage keys. + */ +static int rxgk_set_up_ciphers(struct rxrpc_connection *conn, + struct rxgk_context *gk, + const struct rxgk_key *rxgk, + gfp_t gfp) +{ + const struct krb5_enctype *krb5 = gk->krb5; + struct crypto_shash *shash; + struct crypto_aead *aead; + struct krb5_buffer TK; + bool service = rxrpc_conn_is_service(conn); + int ret; + u8 *buffer; + + buffer = kzalloc(krb5->key_bytes, gfp); + if (!buffer) + return -ENOMEM; + + TK.len = krb5->key_bytes; + TK.data = buffer; + + ret = rxgk_derive_transport_key(conn, gk, rxgk, &TK, gfp); + if (ret < 0) + goto out; + + aead = crypto_krb5_prepare_encryption(krb5, &TK, RXGK_CLIENT_ENC_RESPONSE, gfp); + if (IS_ERR(aead)) + goto aead_error; + gk->resp_enc = aead; + + if (crypto_aead_blocksize(gk->resp_enc) != krb5->block_len || + crypto_aead_authsize(gk->resp_enc) != krb5->cksum_len) { + pr_notice("algo inconsistent with krb5 table %u!=%u or %u!=%u\n", + crypto_aead_blocksize(gk->resp_enc), krb5->block_len, + crypto_aead_authsize(gk->resp_enc), krb5->cksum_len); + ret = -EINVAL; + goto out; + } + + if (service) { + switch (conn->security_level) { + case RXRPC_SECURITY_AUTH: + shash = crypto_krb5_prepare_checksum( + krb5, &TK, RXGK_SERVER_MIC_PACKET, gfp); + if (IS_ERR(shash)) + goto hash_error; + gk->tx_Kc = shash; + shash = crypto_krb5_prepare_checksum( + krb5, &TK, RXGK_CLIENT_MIC_PACKET, gfp); + if (IS_ERR(shash)) + goto hash_error; + gk->rx_Kc = shash; + break; + case RXRPC_SECURITY_ENCRYPT: + aead = crypto_krb5_prepare_encryption( + krb5, &TK, RXGK_SERVER_ENC_PACKET, gfp); + if (IS_ERR(aead)) + goto aead_error; + gk->tx_enc = aead; + aead = crypto_krb5_prepare_encryption( + krb5, &TK, RXGK_CLIENT_ENC_PACKET, gfp); + if (IS_ERR(aead)) + goto aead_error; + gk->rx_enc = aead; + break; + } + } else { + switch (conn->security_level) { + case RXRPC_SECURITY_AUTH: + shash = crypto_krb5_prepare_checksum( + krb5, &TK, RXGK_CLIENT_MIC_PACKET, gfp); + if (IS_ERR(shash)) + goto hash_error; + gk->tx_Kc = shash; + shash = crypto_krb5_prepare_checksum( + krb5, &TK, RXGK_SERVER_MIC_PACKET, gfp); + if (IS_ERR(shash)) + goto hash_error; + gk->rx_Kc = shash; + break; + case RXRPC_SECURITY_ENCRYPT: + aead = crypto_krb5_prepare_encryption( + krb5, &TK, RXGK_CLIENT_ENC_PACKET, gfp); + if (IS_ERR(aead)) + goto aead_error; + gk->tx_enc = aead; + aead = crypto_krb5_prepare_encryption( + krb5, &TK, RXGK_SERVER_ENC_PACKET, gfp); + if (IS_ERR(aead)) + goto aead_error; + gk->rx_enc = aead; + break; + } + } + + ret = 0; +out: + kfree_sensitive(buffer); + return ret; +aead_error: + ret = PTR_ERR(aead); + goto out; +hash_error: + ret = PTR_ERR(shash); + goto out; +} + +/* + * Derive a transport key for a connection and then derive a bunch of usage + * keys from it and set up ciphers using them. + */ +struct rxgk_context *rxgk_generate_transport_key(struct rxrpc_connection *conn, + const struct rxgk_key *key, + unsigned int key_number, + gfp_t gfp) +{ + struct rxgk_context *gk; + unsigned long lifetime; + int ret = -ENOPKG; + + _enter(""); + + gk = kzalloc(sizeof(*gk), GFP_KERNEL); + if (!gk) + return ERR_PTR(-ENOMEM); + refcount_set(&gk->usage, 1); + gk->key = key; + gk->key_number = key_number; + + gk->krb5 = crypto_krb5_find_enctype(key->enctype); + if (!gk->krb5) + goto err_tk; + + ret = rxgk_set_up_ciphers(conn, gk, key, gfp); + if (ret) + goto err_tk; + + /* Set the remaining number of bytes encrypted with this key that may + * be transmitted before rekeying. Note that the spec has been + * interpreted differently on this point... + */ + switch (key->bytelife) { + case 0: + case 63: + gk->bytes_remaining = LLONG_MAX; + break; + case 1 ... 62: + gk->bytes_remaining = 1LL << key->bytelife; + break; + default: + gk->bytes_remaining = key->bytelife; + break; + } + + /* Set the time after which rekeying must occur */ + if (key->lifetime) { + lifetime = min_t(u64, key->lifetime, INT_MAX / HZ); + lifetime *= HZ; + } else { + lifetime = MAX_JIFFY_OFFSET; + } + gk->expiry = jiffies + lifetime; + return gk; + +err_tk: + rxgk_put(gk); + _leave(" = %d", ret); + return ERR_PTR(ret); +} + +/* + * Use the server secret key to set up the ciphers that will be used to extract + * the token from a response packet. + */ +int rxgk_set_up_token_cipher(const struct krb5_buffer *server_key, + struct crypto_aead **token_aead, + unsigned int enctype, + const struct krb5_enctype **_krb5, + gfp_t gfp) +{ + const struct krb5_enctype *krb5; + struct crypto_aead *aead; + + krb5 = crypto_krb5_find_enctype(enctype); + if (!krb5) + return -ENOPKG; + + aead = crypto_krb5_prepare_encryption(krb5, server_key, RXGK_SERVER_ENC_TOKEN, gfp); + if (IS_ERR(aead)) + return PTR_ERR(aead); + + *_krb5 = krb5; + *token_aead = aead; + return 0; +} diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index f1a68270862d..3657c0661cdc 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -148,14 +148,14 @@ error: static struct rxrpc_txbuf *rxkad_alloc_txbuf(struct rxrpc_call *call, size_t remain, gfp_t gfp) { struct rxrpc_txbuf *txb; - size_t shdr, space; + size_t shdr, alloc, limit, part; - remain = min(remain, 65535 - sizeof(struct rxrpc_wire_header)); + remain = umin(remain, 65535 - sizeof(struct rxrpc_wire_header)); switch (call->conn->security_level) { default: - space = min_t(size_t, remain, RXRPC_JUMBO_DATALEN); - return rxrpc_alloc_data_txbuf(call, space, 0, gfp); + alloc = umin(remain, RXRPC_JUMBO_DATALEN); + return rxrpc_alloc_data_txbuf(call, alloc, 1, gfp); case RXRPC_SECURITY_AUTH: shdr = sizeof(struct rxkad_level1_hdr); break; @@ -164,15 +164,23 @@ static struct rxrpc_txbuf *rxkad_alloc_txbuf(struct rxrpc_call *call, size_t rem break; } - space = min_t(size_t, round_down(RXRPC_JUMBO_DATALEN, RXKAD_ALIGN), remain + shdr); - space = round_up(space, RXKAD_ALIGN); + limit = round_down(RXRPC_JUMBO_DATALEN, RXKAD_ALIGN) - shdr; + if (remain < limit) { + part = remain; + alloc = round_up(shdr + part, RXKAD_ALIGN); + } else { + part = limit; + alloc = RXRPC_JUMBO_DATALEN; + } - txb = rxrpc_alloc_data_txbuf(call, space, RXKAD_ALIGN, gfp); + txb = rxrpc_alloc_data_txbuf(call, alloc, RXKAD_ALIGN, gfp); if (!txb) return NULL; - txb->offset += shdr; - txb->space -= shdr; + txb->crypto_header = 0; + txb->sec_header = shdr; + txb->offset += shdr; + txb->space = part; return txb; } @@ -251,8 +259,7 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, struct rxrpc_txbuf *txb, struct skcipher_request *req) { - struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; - struct rxkad_level1_hdr *hdr = (void *)(whdr + 1); + struct rxkad_level1_hdr *hdr = txb->data; struct rxrpc_crypt iv; struct scatterlist sg; size_t pad; @@ -263,13 +270,13 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, check = txb->seq ^ call->call_id; hdr->data_size = htonl((u32)check << 16 | txb->len); - txb->len += sizeof(struct rxkad_level1_hdr); - pad = txb->len; + txb->pkt_len = sizeof(struct rxkad_level1_hdr) + txb->len; + pad = txb->pkt_len; pad = RXKAD_ALIGN - pad; pad &= RXKAD_ALIGN - 1; if (pad) { - memset(txb->kvec[0].iov_base + txb->offset, 0, pad); - txb->len += pad; + memset(txb->data + txb->offset, 0, pad); + txb->pkt_len += pad; } /* start the encryption afresh */ @@ -294,11 +301,10 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, struct skcipher_request *req) { const struct rxrpc_key_token *token; - struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; - struct rxkad_level2_hdr *rxkhdr = (void *)(whdr + 1); + struct rxkad_level2_hdr *rxkhdr = txb->data; struct rxrpc_crypt iv; struct scatterlist sg; - size_t pad; + size_t content, pad; u16 check; int ret; @@ -309,23 +315,20 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, rxkhdr->data_size = htonl(txb->len | (u32)check << 16); rxkhdr->checksum = 0; - txb->len += sizeof(struct rxkad_level2_hdr); - pad = txb->len; - pad = RXKAD_ALIGN - pad; - pad &= RXKAD_ALIGN - 1; - if (pad) { - memset(txb->kvec[0].iov_base + txb->offset, 0, pad); - txb->len += pad; - } + content = sizeof(struct rxkad_level2_hdr) + txb->len; + txb->pkt_len = round_up(content, RXKAD_ALIGN); + pad = txb->pkt_len - content; + if (pad) + memset(txb->data + txb->offset, 0, pad); /* encrypt from the session key */ token = call->conn->key->payload.data[0]; memcpy(&iv, token->kad->session_key, sizeof(iv)); - sg_init_one(&sg, rxkhdr, txb->len); + sg_init_one(&sg, rxkhdr, txb->pkt_len); skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, &sg, &sg, txb->len, iv.x); + skcipher_request_set_crypt(req, &sg, &sg, txb->pkt_len, iv.x); ret = crypto_skcipher_encrypt(req); skcipher_request_zero(req); return ret; @@ -384,19 +387,32 @@ static int rxkad_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) switch (call->conn->security_level) { case RXRPC_SECURITY_PLAIN: + txb->pkt_len = txb->len; ret = 0; break; case RXRPC_SECURITY_AUTH: ret = rxkad_secure_packet_auth(call, txb, req); + if (txb->alloc_size == RXRPC_JUMBO_DATALEN) + txb->jumboable = true; break; case RXRPC_SECURITY_ENCRYPT: ret = rxkad_secure_packet_encrypt(call, txb, req); + if (txb->alloc_size == RXRPC_JUMBO_DATALEN) + txb->jumboable = true; break; default: ret = -EPERM; break; } + /* Clear excess space in the packet */ + if (txb->pkt_len < txb->alloc_size) { + size_t gap = txb->alloc_size - txb->pkt_len; + void *p = txb->data; + + memset(p + txb->pkt_len, 0, gap); + } + skcipher_request_free(req); _leave(" = %d [set %x]", ret, y); return ret; @@ -669,6 +685,8 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) serial = rxrpc_get_next_serial(conn); whdr.serial = htonl(serial); + trace_rxrpc_tx_challenge(conn, serial, 0, conn->rxkad.nonce); + ret = kernel_sendmsg(conn->local->socket, &msg, iov, 2, len); if (ret < 0) { trace_rxrpc_tx_fail(conn->debug_id, serial, ret, @@ -684,62 +702,6 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) } /* - * send a Kerberos security response - */ -static int rxkad_send_response(struct rxrpc_connection *conn, - struct rxrpc_host_header *hdr, - struct rxkad_response *resp, - const struct rxkad_key *s2) -{ - struct rxrpc_wire_header whdr; - struct msghdr msg; - struct kvec iov[3]; - size_t len; - u32 serial; - int ret; - - _enter(""); - - msg.msg_name = &conn->peer->srx.transport; - msg.msg_namelen = conn->peer->srx.transport_len; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_flags = 0; - - memset(&whdr, 0, sizeof(whdr)); - whdr.epoch = htonl(hdr->epoch); - whdr.cid = htonl(hdr->cid); - whdr.type = RXRPC_PACKET_TYPE_RESPONSE; - whdr.flags = conn->out_clientflag; - whdr.securityIndex = hdr->securityIndex; - whdr.serviceId = htons(hdr->serviceId); - - iov[0].iov_base = &whdr; - iov[0].iov_len = sizeof(whdr); - iov[1].iov_base = resp; - iov[1].iov_len = sizeof(*resp); - iov[2].iov_base = (void *)s2->ticket; - iov[2].iov_len = s2->ticket_len; - - len = iov[0].iov_len + iov[1].iov_len + iov[2].iov_len; - - serial = rxrpc_get_next_serial(conn); - whdr.serial = htonl(serial); - - rxrpc_local_dont_fragment(conn->local, false); - ret = kernel_sendmsg(conn->local->socket, &msg, iov, 3, len); - if (ret < 0) { - trace_rxrpc_tx_fail(conn->debug_id, serial, ret, - rxrpc_tx_point_rxkad_response); - return -EAGAIN; - } - - conn->peer->last_tx_at = ktime_get_seconds(); - _leave(" = 0"); - return 0; -} - -/* * calculate the response checksum */ static void rxkad_calc_response_checksum(struct rxkad_response *response) @@ -758,12 +720,21 @@ static void rxkad_calc_response_checksum(struct rxkad_response *response) * encrypt the response packet */ static int rxkad_encrypt_response(struct rxrpc_connection *conn, - struct rxkad_response *resp, + struct sk_buff *response, const struct rxkad_key *s2) { struct skcipher_request *req; struct rxrpc_crypt iv; struct scatterlist sg[1]; + size_t encsize = sizeof(((struct rxkad_response *)0)->encrypted); + int ret; + + sg_init_table(sg, ARRAY_SIZE(sg)); + ret = skb_to_sgvec(response, sg, + sizeof(struct rxrpc_wire_header) + + offsetof(struct rxkad_response, encrypted), encsize); + if (ret < 0) + return ret; req = skcipher_request_alloc(&conn->rxkad.cipher->base, GFP_NOFS); if (!req) @@ -772,89 +743,206 @@ static int rxkad_encrypt_response(struct rxrpc_connection *conn, /* continue encrypting from where we left off */ memcpy(&iv, s2->session_key, sizeof(iv)); - sg_init_table(sg, 1); - sg_set_buf(sg, &resp->encrypted, sizeof(resp->encrypted)); skcipher_request_set_sync_tfm(req, conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, sg, sg, sizeof(resp->encrypted), iv.x); - crypto_skcipher_encrypt(req); + skcipher_request_set_crypt(req, sg, sg, encsize, iv.x); + ret = crypto_skcipher_encrypt(req); skcipher_request_free(req); - return 0; + return ret; } /* - * respond to a challenge packet + * Validate a challenge packet. */ -static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, - struct sk_buff *skb) +static bool rxkad_validate_challenge(struct rxrpc_connection *conn, + struct sk_buff *skb) { - const struct rxrpc_key_token *token; struct rxkad_challenge challenge; - struct rxkad_response *resp; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - u32 version, nonce, min_level; - int ret = -EPROTO; + u32 version, min_level; + int ret; _enter("{%d,%x}", conn->debug_id, key_serial(conn->key)); - if (!conn->key) - return rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, - rxkad_abort_chall_no_key); + if (!conn->key) { + rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, + rxkad_abort_chall_no_key); + return false; + } ret = key_validate(conn->key); - if (ret < 0) - return rxrpc_abort_conn(conn, skb, RXKADEXPIRED, ret, - rxkad_abort_chall_key_expired); + if (ret < 0) { + rxrpc_abort_conn(conn, skb, RXKADEXPIRED, ret, + rxkad_abort_chall_key_expired); + return false; + } if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), - &challenge, sizeof(challenge)) < 0) - return rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO, - rxkad_abort_chall_short); + &challenge, sizeof(challenge)) < 0) { + rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO, + rxkad_abort_chall_short); + return false; + } version = ntohl(challenge.version); - nonce = ntohl(challenge.nonce); + sp->chall.rxkad_nonce = ntohl(challenge.nonce); min_level = ntohl(challenge.min_level); - trace_rxrpc_rx_challenge(conn, sp->hdr.serial, version, nonce, min_level); + trace_rxrpc_rx_challenge(conn, sp->hdr.serial, version, + sp->chall.rxkad_nonce, min_level); + + if (version != RXKAD_VERSION) { + rxrpc_abort_conn(conn, skb, RXKADINCONSISTENCY, -EPROTO, + rxkad_abort_chall_version); + return false; + } + + if (conn->security_level < min_level) { + rxrpc_abort_conn(conn, skb, RXKADLEVELFAIL, -EACCES, + rxkad_abort_chall_level); + return false; + } + return true; +} + +/* + * Insert the header into the response. + */ +static noinline +int rxkad_insert_response_header(struct rxrpc_connection *conn, + const struct rxrpc_key_token *token, + struct sk_buff *challenge, + struct sk_buff *response, + size_t *offset) +{ + struct rxrpc_skb_priv *csp = rxrpc_skb(challenge); + struct { + struct rxrpc_wire_header whdr; + struct rxkad_response resp; + } h; + int ret; + + h.whdr.epoch = htonl(conn->proto.epoch); + h.whdr.cid = htonl(conn->proto.cid); + h.whdr.callNumber = 0; + h.whdr.serial = 0; + h.whdr.seq = 0; + h.whdr.type = RXRPC_PACKET_TYPE_RESPONSE; + h.whdr.flags = conn->out_clientflag; + h.whdr.userStatus = 0; + h.whdr.securityIndex = conn->security_ix; + h.whdr.cksum = 0; + h.whdr.serviceId = htons(conn->service_id); + h.resp.version = htonl(RXKAD_VERSION); + h.resp.__pad = 0; + h.resp.encrypted.epoch = htonl(conn->proto.epoch); + h.resp.encrypted.cid = htonl(conn->proto.cid); + h.resp.encrypted.checksum = 0; + h.resp.encrypted.securityIndex = htonl(conn->security_ix); + h.resp.encrypted.call_id[0] = htonl(conn->channels[0].call_counter); + h.resp.encrypted.call_id[1] = htonl(conn->channels[1].call_counter); + h.resp.encrypted.call_id[2] = htonl(conn->channels[2].call_counter); + h.resp.encrypted.call_id[3] = htonl(conn->channels[3].call_counter); + h.resp.encrypted.inc_nonce = htonl(csp->chall.rxkad_nonce + 1); + h.resp.encrypted.level = htonl(conn->security_level); + h.resp.kvno = htonl(token->kad->kvno); + h.resp.ticket_len = htonl(token->kad->ticket_len); + + rxkad_calc_response_checksum(&h.resp); + + ret = skb_store_bits(response, *offset, &h, sizeof(h)); + *offset += sizeof(h); + return ret; +} - if (version != RXKAD_VERSION) - return rxrpc_abort_conn(conn, skb, RXKADINCONSISTENCY, -EPROTO, - rxkad_abort_chall_version); +/* + * respond to a challenge packet + */ +static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, + struct sk_buff *challenge) +{ + const struct rxrpc_key_token *token; + struct rxrpc_skb_priv *csp, *rsp; + struct sk_buff *response; + size_t len, offset = 0; + int ret = -EPROTO; - if (conn->security_level < min_level) - return rxrpc_abort_conn(conn, skb, RXKADLEVELFAIL, -EACCES, - rxkad_abort_chall_level); + _enter("{%d,%x}", conn->debug_id, key_serial(conn->key)); + + ret = key_validate(conn->key); + if (ret < 0) + return rxrpc_abort_conn(conn, challenge, RXKADEXPIRED, ret, + rxkad_abort_chall_key_expired); token = conn->key->payload.data[0]; /* build the response packet */ - resp = kzalloc(sizeof(struct rxkad_response), GFP_NOFS); - if (!resp) - return -ENOMEM; + len = sizeof(struct rxrpc_wire_header) + + sizeof(struct rxkad_response) + + token->kad->ticket_len; + + response = alloc_skb_with_frags(0, len, 0, &ret, GFP_NOFS); + if (!response) + goto error; + rxrpc_new_skb(response, rxrpc_skb_new_response_rxkad); + response->len = len; + response->data_len = len; - resp->version = htonl(RXKAD_VERSION); - resp->encrypted.epoch = htonl(conn->proto.epoch); - resp->encrypted.cid = htonl(conn->proto.cid); - resp->encrypted.securityIndex = htonl(conn->security_ix); - resp->encrypted.inc_nonce = htonl(nonce + 1); - resp->encrypted.level = htonl(conn->security_level); - resp->kvno = htonl(token->kad->kvno); - resp->ticket_len = htonl(token->kad->ticket_len); - resp->encrypted.call_id[0] = htonl(conn->channels[0].call_counter); - resp->encrypted.call_id[1] = htonl(conn->channels[1].call_counter); - resp->encrypted.call_id[2] = htonl(conn->channels[2].call_counter); - resp->encrypted.call_id[3] = htonl(conn->channels[3].call_counter); - - /* calculate the response checksum and then do the encryption */ - rxkad_calc_response_checksum(resp); - ret = rxkad_encrypt_response(conn, resp, token->kad); - if (ret == 0) - ret = rxkad_send_response(conn, &sp->hdr, resp, token->kad); - kfree(resp); + offset = 0; + ret = rxkad_insert_response_header(conn, token, challenge, response, + &offset); + if (ret < 0) + goto error; + + ret = rxkad_encrypt_response(conn, response, token->kad); + if (ret < 0) + goto error; + + ret = skb_store_bits(response, offset, token->kad->ticket, + token->kad->ticket_len); + if (ret < 0) + goto error; + + csp = rxrpc_skb(challenge); + rsp = rxrpc_skb(response); + rsp->resp.len = len; + rsp->resp.challenge_serial = csp->hdr.serial; + rxrpc_post_response(conn, response); + response = NULL; + ret = 0; + +error: + rxrpc_free_skb(response, rxrpc_skb_put_response); return ret; } /* + * RxKAD does automatic response only as there's nothing to manage that isn't + * already in the key. + */ +static int rxkad_sendmsg_respond_to_challenge(struct sk_buff *challenge, + struct msghdr *msg) +{ + return -EINVAL; +} + +/** + * rxkad_kernel_respond_to_challenge - Respond to a challenge with appdata + * @challenge: The challenge to respond to + * + * Allow a kernel application to respond to a CHALLENGE. + * + * Return: %0 if successful and a negative error code otherwise. + */ +int rxkad_kernel_respond_to_challenge(struct sk_buff *challenge) +{ + struct rxrpc_skb_priv *csp = rxrpc_skb(challenge); + + return rxkad_respond_to_challenge(csp->chall.conn, challenge); +} +EXPORT_SYMBOL(rxkad_kernel_respond_to_challenge); + +/* * decrypt the kerberos IV ticket in the response */ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, @@ -1262,6 +1350,8 @@ const struct rxrpc_security rxkad = { .verify_packet = rxkad_verify_packet, .free_call_crypto = rxkad_free_call_crypto, .issue_challenge = rxkad_issue_challenge, + .validate_challenge = rxkad_validate_challenge, + .sendmsg_respond_to_challenge = rxkad_sendmsg_respond_to_challenge, .respond_to_challenge = rxkad_respond_to_challenge, .verify_response = rxkad_verify_response, .clear = rxkad_clear, diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c index 085e7892d310..0377301156b0 100644 --- a/net/rxrpc/rxperf.c +++ b/net/rxrpc/rxperf.c @@ -8,6 +8,7 @@ #define pr_fmt(fmt) "rxperf: " fmt #include <linux/module.h> #include <linux/slab.h> +#include <crypto/krb5.h> #include <net/sock.h> #include <net/af_rxrpc.h> #define RXRPC_TRACE_ONLY_DEFINE_ENUMS @@ -136,6 +137,12 @@ static void rxperf_notify_end_reply_tx(struct sock *sock, RXPERF_CALL_SV_AWAIT_ACK); } +static const struct rxrpc_kernel_ops rxperf_rxrpc_callback_ops = { + .notify_new_call = rxperf_rx_new_call, + .discard_new_call = rxperf_rx_discard_new_call, + .user_attach_call = rxperf_rx_attach, +}; + /* * Charge the incoming call preallocation. */ @@ -161,7 +168,6 @@ static void rxperf_charge_preallocation(struct work_struct *work) if (rxrpc_kernel_charge_accept(rxperf_socket, rxperf_notify_rx, - rxperf_rx_attach, (unsigned long)call, GFP_KERNEL, call->debug_id) < 0) @@ -209,8 +215,7 @@ static int rxperf_open_socket(void) if (ret < 0) goto error_2; - rxrpc_kernel_new_call_notification(socket, rxperf_rx_new_call, - rxperf_rx_discard_new_call); + rxrpc_kernel_set_notifications(socket, &rxperf_rxrpc_callback_ops); ret = kernel_listen(socket, INT_MAX); if (ret < 0) @@ -478,6 +483,18 @@ static int rxperf_deliver_request(struct rxperf_call *call) call->unmarshal++; fallthrough; case 2: + ret = rxperf_extract_data(call, true); + if (ret < 0) + return ret; + + /* Deal with the terminal magic cookie. */ + call->iov_len = 4; + call->kvec[0].iov_len = call->iov_len; + call->kvec[0].iov_base = call->tmp; + iov_iter_kvec(&call->iter, READ, call->kvec, 1, call->iov_len); + call->unmarshal++; + fallthrough; + case 3: ret = rxperf_extract_data(call, false); if (ret < 0) return ret; @@ -503,7 +520,7 @@ static int rxperf_process_call(struct rxperf_call *call) reply_len + sizeof(rxperf_magic_cookie)); while (reply_len > 0) { - len = min_t(size_t, reply_len, PAGE_SIZE); + len = umin(reply_len, PAGE_SIZE); bvec_set_page(&bv, ZERO_PAGE(0), len, 0); iov_iter_bvec(&msg.msg_iter, WRITE, &bv, 1, len); msg.msg_flags = MSG_MORE; @@ -534,9 +551,9 @@ static int rxperf_process_call(struct rxperf_call *call) } /* - * Add a key to the security keyring. + * Add an rxkad key to the security keyring. */ -static int rxperf_add_key(struct key *keyring) +static int rxperf_add_rxkad_key(struct key *keyring) { key_ref_t kref; int ret; @@ -562,6 +579,47 @@ static int rxperf_add_key(struct key *keyring) return ret; } +#ifdef CONFIG_RXGK +/* + * Add a yfs-rxgk key to the security keyring. + */ +static int rxperf_add_yfs_rxgk_key(struct key *keyring, u32 enctype) +{ + const struct krb5_enctype *krb5 = crypto_krb5_find_enctype(enctype); + key_ref_t kref; + char name[64]; + int ret; + u8 key[32]; + + if (!krb5 || krb5->key_len > sizeof(key)) + return 0; + + /* The key is just { 0, 1, 2, 3, 4, ... } */ + for (int i = 0; i < krb5->key_len; i++) + key[i] = i; + + sprintf(name, "%u:6:1:%u", RX_PERF_SERVICE, enctype); + + kref = key_create_or_update(make_key_ref(keyring, true), + "rxrpc_s", name, + key, krb5->key_len, + KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH | + KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA); + + if (IS_ERR(kref)) { + pr_err("Can't allocate rxperf server key: %ld\n", PTR_ERR(kref)); + return PTR_ERR(kref); + } + + ret = key_link(keyring, key_ref_to_ptr(kref)); + if (ret < 0) + pr_err("Can't link rxperf server key: %d\n", ret); + key_ref_put(kref); + return ret; +} +#endif + /* * Initialise the rxperf server. */ @@ -591,9 +649,29 @@ static int __init rxperf_init(void) goto error_keyring; } rxperf_sec_keyring = keyring; - ret = rxperf_add_key(keyring); + ret = rxperf_add_rxkad_key(keyring); + if (ret < 0) + goto error_key; +#ifdef CONFIG_RXGK + ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_AES128_CTS_HMAC_SHA1_96); + if (ret < 0) + goto error_key; + ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_AES256_CTS_HMAC_SHA1_96); + if (ret < 0) + goto error_key; + ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_AES128_CTS_HMAC_SHA256_128); + if (ret < 0) + goto error_key; + ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_AES256_CTS_HMAC_SHA384_192); + if (ret < 0) + goto error_key; + ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_CAMELLIA128_CTS_CMAC); + if (ret < 0) + goto error_key; + ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_CAMELLIA256_CTS_CMAC); if (ret < 0) goto error_key; +#endif ret = rxperf_open_socket(); if (ret < 0) diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c index cb8dd1d3b1d4..078d91a6b77f 100644 --- a/net/rxrpc/security.c +++ b/net/rxrpc/security.c @@ -20,6 +20,9 @@ static const struct rxrpc_security *rxrpc_security_types[] = { #ifdef CONFIG_RXKAD [RXRPC_SECURITY_RXKAD] = &rxkad, #endif +#ifdef CONFIG_RXGK + [RXRPC_SECURITY_YFS_RXGK] = &rxgk_yfs, +#endif }; int __init rxrpc_init_security(void) @@ -114,10 +117,10 @@ found: if (conn->state == RXRPC_CONN_CLIENT_UNSECURED) { ret = conn->security->init_connection_security(conn, token); if (ret == 0) { - spin_lock(&conn->state_lock); + spin_lock_irq(&conn->state_lock); if (conn->state == RXRPC_CONN_CLIENT_UNSECURED) conn->state = RXRPC_CONN_CLIENT; - spin_unlock(&conn->state_lock); + spin_unlock_irq(&conn->state_lock); } } mutex_unlock(&conn->security_lock); diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 894b8fa68e5e..ebbb78b842de 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -29,6 +29,7 @@ bool rxrpc_propose_abort(struct rxrpc_call *call, s32 abort_code, int error, call->send_abort_why = why; call->send_abort_err = error; call->send_abort_seq = 0; + trace_rxrpc_abort_call(call, abort_code); /* Request abort locklessly vs rxrpc_input_call_event(). */ smp_store_release(&call->send_abort, abort_code); rxrpc_poke_call(call, rxrpc_call_poke_abort); @@ -93,9 +94,11 @@ no_wait: */ static bool rxrpc_check_tx_space(struct rxrpc_call *call, rxrpc_seq_t *_tx_win) { + rxrpc_seq_t tx_bottom = READ_ONCE(call->tx_bottom); + if (_tx_win) - *_tx_win = call->tx_bottom; - return call->tx_prepared - call->tx_bottom < 256; + *_tx_win = tx_bottom; + return call->send_top - tx_bottom < 256; } /* @@ -131,13 +134,13 @@ static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx, rxrpc_seq_t tx_start, tx_win; signed long rtt, timeout; - rtt = READ_ONCE(call->peer->srtt_us) >> 3; + rtt = READ_ONCE(call->srtt_us) >> 3; rtt = usecs_to_jiffies(rtt) * 2; if (rtt < 2) rtt = 2; timeout = rtt; - tx_start = smp_load_acquire(&call->acks_hard_ack); + tx_start = READ_ONCE(call->tx_bottom); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); @@ -194,8 +197,8 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, DECLARE_WAITQUEUE(myself, current); int ret; - _enter(",{%u,%u,%u,%u}", - call->tx_bottom, call->acks_hard_ack, call->tx_top, call->tx_winsize); + _enter(",{%u,%u,%u}", + call->tx_bottom, call->tx_top, call->tx_winsize); add_wait_queue(&call->waitq, &myself); @@ -239,37 +242,77 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, struct rxrpc_txbuf *txb, rxrpc_notify_end_tx_t notify_end_tx) { + struct rxrpc_txqueue *sq = call->send_queue; rxrpc_seq_t seq = txb->seq; bool poke, last = txb->flags & RXRPC_LAST_PACKET; - + int ix = seq & RXRPC_TXQ_MASK; rxrpc_inc_stat(call->rxnet, stat_tx_data); - ASSERTCMP(txb->seq, ==, call->tx_prepared + 1); - - /* We have to set the timestamp before queueing as the retransmit - * algorithm can see the packet as soon as we queue it. - */ - txb->last_sent = ktime_get_real(); + ASSERTCMP(txb->seq, ==, call->send_top + 1); if (last) trace_rxrpc_txqueue(call, rxrpc_txqueue_queue_last); else trace_rxrpc_txqueue(call, rxrpc_txqueue_queue); + if (WARN_ON_ONCE(sq->bufs[ix])) + trace_rxrpc_tq(call, sq, seq, rxrpc_tq_queue_dup); + else + trace_rxrpc_tq(call, sq, seq, rxrpc_tq_queue); + /* Add the packet to the call's output buffer */ - spin_lock(&call->tx_lock); - poke = list_empty(&call->tx_sendmsg); - list_add_tail(&txb->call_link, &call->tx_sendmsg); - call->tx_prepared = seq; - if (last) + poke = (READ_ONCE(call->tx_bottom) == call->send_top); + sq->bufs[ix] = txb; + /* Order send_top after the queue->next pointer and txb content. */ + smp_store_release(&call->send_top, seq); + if (last) { + set_bit(RXRPC_CALL_TX_NO_MORE, &call->flags); rxrpc_notify_end_tx(rx, call, notify_end_tx); - spin_unlock(&call->tx_lock); + call->send_queue = NULL; + } if (poke) rxrpc_poke_call(call, rxrpc_call_poke_start); } /* + * Allocate a new txqueue unit and add it to the transmission queue. + */ +static int rxrpc_alloc_txqueue(struct sock *sk, struct rxrpc_call *call) +{ + struct rxrpc_txqueue *tq; + + tq = kzalloc(sizeof(*tq), sk->sk_allocation); + if (!tq) + return -ENOMEM; + + tq->xmit_ts_base = KTIME_MIN; + for (int i = 0; i < RXRPC_NR_TXQUEUE; i++) + tq->segment_xmit_ts[i] = UINT_MAX; + + if (call->send_queue) { + tq->qbase = call->send_top + 1; + call->send_queue->next = tq; + call->send_queue = tq; + } else if (WARN_ON(call->tx_queue)) { + kfree(tq); + return -ENOMEM; + } else { + /* We start at seq 1, so pretend seq 0 is hard-acked. */ + tq->nr_reported_acks = 1; + tq->segment_acked = 1UL; + tq->qbase = 0; + call->tx_qbase = 0; + call->send_queue = tq; + call->tx_qtail = tq; + call->tx_queue = tq; + } + + trace_rxrpc_tq(call, tq, call->send_top, rxrpc_tq_alloc); + return 0; +} + +/* * send data through a socket * - must be called in process context * - The caller holds the call user access mutex, but not the socket lock. @@ -287,6 +330,13 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, bool more = msg->msg_flags & MSG_MORE; int ret, copied = 0; + if (test_bit(RXRPC_CALL_TX_NO_MORE, &call->flags)) { + trace_rxrpc_abort(call->debug_id, rxrpc_sendmsg_late_send, + call->cid, call->call_id, call->rx_consumed, + 0, -EPROTO); + return -EPROTO; + } + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); ret = rxrpc_wait_to_be_connected(call, &timeo); @@ -303,6 +353,11 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); reload: + txb = call->tx_pending; + call->tx_pending = NULL; + if (txb) + rxrpc_see_txbuf(txb, rxrpc_txbuf_see_send_more); + ret = -EPIPE; if (sk->sk_shutdown & SEND_SHUTDOWN) goto maybe_error; @@ -329,11 +384,6 @@ reload: goto maybe_error; } - txb = call->tx_pending; - call->tx_pending = NULL; - if (txb) - rxrpc_see_txbuf(txb, rxrpc_txbuf_see_send_more); - do { if (!txb) { size_t remain; @@ -343,6 +393,13 @@ reload: if (!rxrpc_check_tx_space(call, NULL)) goto wait_for_space; + /* See if we need to begin/extend the Tx queue. */ + if (!call->send_queue || !((call->send_top + 1) & RXRPC_TXQ_MASK)) { + ret = rxrpc_alloc_txqueue(sk, call); + if (ret < 0) + goto maybe_error; + } + /* Work out the maximum size of a packet. Assume that * the security header is going to be in the padded * region (enc blocksize), but the trailer is not. @@ -359,10 +416,10 @@ reload: /* append next segment of data to the current buffer */ if (msg_data_left(msg) > 0) { - size_t copy = min_t(size_t, txb->space, msg_data_left(msg)); + size_t copy = umin(txb->space, msg_data_left(msg)); _debug("add %zu", copy); - if (!copy_from_iter_full(txb->kvec[0].iov_base + txb->offset, + if (!copy_from_iter_full(txb->data + txb->offset, copy, &msg->msg_iter)) goto efault; _debug("added"); @@ -384,16 +441,10 @@ reload: (msg_data_left(msg) == 0 && !more)) { if (msg_data_left(msg) == 0 && !more) txb->flags |= RXRPC_LAST_PACKET; - else if (call->tx_top - call->acks_hard_ack < - call->tx_winsize) - txb->flags |= RXRPC_MORE_PACKETS; ret = call->security->secure_packet(call, txb); if (ret < 0) goto out; - - txb->kvec[0].iov_len += txb->len; - txb->len = txb->kvec[0].iov_len; rxrpc_queue_packet(rx, call, txb, notify_end_tx); txb = NULL; } @@ -556,7 +607,7 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, struct rxrpc_send_params *p) static struct rxrpc_call * rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, struct rxrpc_send_params *p) - __releases(&rx->sk.sk_lock.slock) + __releases(&rx->sk.sk_lock) __acquires(&call->user_mutex) { struct rxrpc_conn_parameters cp; @@ -606,7 +657,6 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, * - the socket may be either a client socket or a server socket */ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) - __releases(&rx->sk.sk_lock.slock) { struct rxrpc_call *call; bool dropped_lock = false; @@ -654,7 +704,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) } else { switch (rxrpc_call_state(call)) { case RXRPC_CALL_CLIENT_AWAIT_CONN: - case RXRPC_CALL_SERVER_SECURING: + case RXRPC_CALL_SERVER_RECV_REQUEST: if (p.command == RXRPC_CMD_SEND_ABORT) break; fallthrough; @@ -708,14 +758,21 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) if (rxrpc_call_is_complete(call)) { /* it's too late for this call */ ret = -ESHUTDOWN; - } else if (p.command == RXRPC_CMD_SEND_ABORT) { + goto out_put_unlock; + } + + switch (p.command) { + case RXRPC_CMD_SEND_ABORT: rxrpc_propose_abort(call, p.abort_code, -ECONNABORTED, rxrpc_abort_call_sendmsg); ret = 0; - } else if (p.command != RXRPC_CMD_SEND_DATA) { - ret = -EINVAL; - } else { + break; + case RXRPC_CMD_SEND_DATA: ret = rxrpc_send_data(rx, call, msg, len, NULL, &dropped_lock); + break; + default: + ret = -EINVAL; + break; } out_put_unlock: @@ -743,6 +800,8 @@ error_release_sock: * appropriate to sending data. No control data should be supplied in @msg, * nor should an address be supplied. MSG_MORE should be flagged if there's * more data to come, otherwise this data will end the transmission phase. + * + * Return: %0 if successful and a negative error code otherwise. */ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, struct msghdr *msg, size_t len, @@ -778,8 +837,9 @@ EXPORT_SYMBOL(rxrpc_kernel_send_data); * @error: Local error value * @why: Indication as to why. * - * Allow a kernel service to abort a call, if it's still in an abortable state - * and return true if the call was aborted, false if it was already complete. + * Allow a kernel service to abort a call if it's still in an abortable state. + * + * Return: %true if the call was aborted, %false if it was already complete. */ bool rxrpc_kernel_abort_call(struct socket *sock, struct rxrpc_call *call, u32 abort_code, int error, enum rxrpc_abort_reason why) diff --git a/net/rxrpc/server_key.c b/net/rxrpc/server_key.c index e51940589ee5..36b05fd842a7 100644 --- a/net/rxrpc/server_key.c +++ b/net/rxrpc/server_key.c @@ -152,6 +152,8 @@ int rxrpc_server_keyring(struct rxrpc_sock *rx, sockptr_t optval, int optlen) * * Set the server security keyring on an rxrpc socket. This is used to provide * the encryption keys for a kernel service. + * + * Return: %0 if successful and a negative error code otherwise. */ int rxrpc_sock_set_security_keyring(struct sock *sk, struct key *keyring) { @@ -169,3 +171,43 @@ int rxrpc_sock_set_security_keyring(struct sock *sk, struct key *keyring) return ret; } EXPORT_SYMBOL(rxrpc_sock_set_security_keyring); + +/** + * rxrpc_sock_set_manage_response - Set the manage-response flag for a kernel service + * @sk: The socket to set the keyring on + * @set: True to set, false to clear the flag + * + * Set the flag on an rxrpc socket to say that the caller wants to manage the + * RESPONSE packet and the user-defined data it may contain. Setting this + * means that recvmsg() will return messages with RXRPC_CHALLENGED in the + * control message buffer containing information about the challenge. + * + * The user should respond to the challenge by passing RXRPC_RESPOND or + * RXRPC_RESPOND_ABORT control messages with sendmsg() to the same call. + * Supplementary control messages, such as RXRPC_RESP_RXGK_APPDATA, may be + * included to indicate the parts the user wants to supply. + * + * The server will be passed the response data with a RXRPC_RESPONDED control + * message when it gets the first data from each call. + * + * Note that this is only honoured by security classes that need auxiliary data + * (e.g. RxGK). Those that don't offer the facility (e.g. RxKAD) respond + * without consulting userspace. + * + * Return: The previous setting. + */ +int rxrpc_sock_set_manage_response(struct sock *sk, bool set) +{ + struct rxrpc_sock *rx = rxrpc_sk(sk); + int ret; + + lock_sock(sk); + ret = !!test_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); + if (set) + set_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); + else + clear_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); + release_sock(sk); + return ret; +} +EXPORT_SYMBOL(rxrpc_sock_set_manage_response); diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index c9bedd0e2d86..46a20cf4c402 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -11,6 +11,8 @@ #include "ar-internal.h" static struct ctl_table_header *rxrpc_sysctl_reg_table; +static const unsigned int rxrpc_rx_mtu_min = 500; +static const unsigned int rxrpc_jumbo_max = RXRPC_MAX_NR_JUMBO; static const unsigned int four = 4; static const unsigned int max_backlog = RXRPC_BACKLOG_MAX - 1; static const unsigned int n_65535 = 65535; @@ -115,7 +117,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)SYSCTL_ONE, + .extra1 = (void *)&rxrpc_rx_mtu_min, .extra2 = (void *)&n_65535, }, { @@ -125,9 +127,8 @@ static struct ctl_table rxrpc_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = (void *)SYSCTL_ONE, - .extra2 = (void *)&four, + .extra2 = (void *)&rxrpc_jumbo_max, }, - { } }; int __init rxrpc_sysctl_init(void) diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c index e0679658d9de..29767038691a 100644 --- a/net/rxrpc/txbuf.c +++ b/net/rxrpc/txbuf.c @@ -19,58 +19,39 @@ atomic_t rxrpc_nr_txbuf; struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_size, size_t data_align, gfp_t gfp) { - struct rxrpc_wire_header *whdr; struct rxrpc_txbuf *txb; - size_t total, hoff = 0; + size_t total, doff, jsize = sizeof(struct rxrpc_jumbo_header); void *buf; - txb = kmalloc(sizeof(*txb), gfp); + txb = kzalloc(sizeof(*txb), gfp); if (!txb) return NULL; - if (data_align) - hoff = round_up(sizeof(*whdr), data_align) - sizeof(*whdr); - total = hoff + sizeof(*whdr) + data_size; + /* We put a jumbo header in the buffer, but not a full wire header to + * avoid delayed-corruption problems with zerocopy. + */ + doff = round_up(jsize, data_align); + total = doff + data_size; + data_align = umax(data_align, L1_CACHE_BYTES); mutex_lock(&call->conn->tx_data_alloc_lock); - buf = __page_frag_alloc_align(&call->conn->tx_data_alloc, total, gfp, - ~(data_align - 1) & ~(L1_CACHE_BYTES - 1)); + buf = page_frag_alloc_align(&call->conn->tx_data_alloc, total, gfp, + data_align); mutex_unlock(&call->conn->tx_data_alloc_lock); if (!buf) { kfree(txb); return NULL; } - whdr = buf + hoff; - - INIT_LIST_HEAD(&txb->call_link); - INIT_LIST_HEAD(&txb->tx_link); refcount_set(&txb->ref, 1); - txb->last_sent = KTIME_MIN; txb->call_debug_id = call->debug_id; txb->debug_id = atomic_inc_return(&rxrpc_txbuf_debug_ids); + txb->alloc_size = data_size; txb->space = data_size; - txb->len = 0; - txb->offset = sizeof(*whdr); + txb->offset = 0; txb->flags = call->conn->out_clientflag; - txb->ack_why = 0; - txb->seq = call->tx_prepared + 1; - txb->serial = 0; - txb->cksum = 0; - txb->nr_kvec = 1; - txb->kvec[0].iov_base = whdr; - txb->kvec[0].iov_len = sizeof(*whdr); - - whdr->epoch = htonl(call->conn->proto.epoch); - whdr->cid = htonl(call->cid); - whdr->callNumber = htonl(call->call_id); - whdr->seq = htonl(txb->seq); - whdr->type = RXRPC_PACKET_TYPE_DATA; - whdr->flags = 0; - whdr->userStatus = 0; - whdr->securityIndex = call->security_ix; - whdr->_rsvd = 0; - whdr->serviceId = htons(call->dest_srx.srx_service); + txb->seq = call->send_top + 1; + txb->data = buf + doff; trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 1, rxrpc_txbuf_alloc_data); @@ -79,92 +60,6 @@ struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_ return txb; } -/* - * Allocate and partially initialise an ACK packet. - */ -struct rxrpc_txbuf *rxrpc_alloc_ack_txbuf(struct rxrpc_call *call, size_t sack_size) -{ - struct rxrpc_wire_header *whdr; - struct rxrpc_acktrailer *trailer; - struct rxrpc_ackpacket *ack; - struct rxrpc_txbuf *txb; - gfp_t gfp = rcu_read_lock_held() ? GFP_ATOMIC | __GFP_NOWARN : GFP_NOFS; - void *buf, *buf2 = NULL; - u8 *filler; - - txb = kmalloc(sizeof(*txb), gfp); - if (!txb) - return NULL; - - buf = page_frag_alloc(&call->local->tx_alloc, - sizeof(*whdr) + sizeof(*ack) + 1 + 3 + sizeof(*trailer), gfp); - if (!buf) { - kfree(txb); - return NULL; - } - - if (sack_size) { - buf2 = page_frag_alloc(&call->local->tx_alloc, sack_size, gfp); - if (!buf2) { - page_frag_free(buf); - kfree(txb); - return NULL; - } - } - - whdr = buf; - ack = buf + sizeof(*whdr); - filler = buf + sizeof(*whdr) + sizeof(*ack) + 1; - trailer = buf + sizeof(*whdr) + sizeof(*ack) + 1 + 3; - - INIT_LIST_HEAD(&txb->call_link); - INIT_LIST_HEAD(&txb->tx_link); - refcount_set(&txb->ref, 1); - txb->call_debug_id = call->debug_id; - txb->debug_id = atomic_inc_return(&rxrpc_txbuf_debug_ids); - txb->space = 0; - txb->len = sizeof(*whdr) + sizeof(*ack) + 3 + sizeof(*trailer); - txb->offset = 0; - txb->flags = call->conn->out_clientflag; - txb->ack_rwind = 0; - txb->seq = 0; - txb->serial = 0; - txb->cksum = 0; - txb->nr_kvec = 3; - txb->kvec[0].iov_base = whdr; - txb->kvec[0].iov_len = sizeof(*whdr) + sizeof(*ack); - txb->kvec[1].iov_base = buf2; - txb->kvec[1].iov_len = sack_size; - txb->kvec[2].iov_base = filler; - txb->kvec[2].iov_len = 3 + sizeof(*trailer); - - whdr->epoch = htonl(call->conn->proto.epoch); - whdr->cid = htonl(call->cid); - whdr->callNumber = htonl(call->call_id); - whdr->seq = 0; - whdr->type = RXRPC_PACKET_TYPE_ACK; - whdr->flags = 0; - whdr->userStatus = 0; - whdr->securityIndex = call->security_ix; - whdr->_rsvd = 0; - whdr->serviceId = htons(call->dest_srx.srx_service); - - get_page(virt_to_head_page(trailer)); - - trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 1, - rxrpc_txbuf_alloc_ack); - atomic_inc(&rxrpc_nr_txbuf); - return txb; -} - -void rxrpc_get_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what) -{ - int r; - - __refcount_inc(&txb->ref, &r); - trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, r + 1, what); -} - void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what) { int r = refcount_read(&txb->ref); @@ -174,13 +69,10 @@ void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what) static void rxrpc_free_txbuf(struct rxrpc_txbuf *txb) { - int i; - trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 0, rxrpc_txbuf_free); - for (i = 0; i < txb->nr_kvec; i++) - if (txb->kvec[i].iov_base) - page_frag_free(txb->kvec[i].iov_base); + if (txb->data) + page_frag_free(txb->data); kfree(txb); atomic_dec(&rxrpc_nr_txbuf); } @@ -202,37 +94,3 @@ void rxrpc_put_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what) rxrpc_free_txbuf(txb); } } - -/* - * Shrink the transmit buffer. - */ -void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *call) -{ - struct rxrpc_txbuf *txb; - rxrpc_seq_t hard_ack = smp_load_acquire(&call->acks_hard_ack); - bool wake = false; - - _enter("%x/%x/%x", call->tx_bottom, call->acks_hard_ack, call->tx_top); - - while ((txb = list_first_entry_or_null(&call->tx_buffer, - struct rxrpc_txbuf, call_link))) { - hard_ack = smp_load_acquire(&call->acks_hard_ack); - if (before(hard_ack, txb->seq)) - break; - - if (txb->seq != call->tx_bottom + 1) - rxrpc_see_txbuf(txb, rxrpc_txbuf_see_out_of_step); - ASSERTCMP(txb->seq, ==, call->tx_bottom + 1); - smp_store_release(&call->tx_bottom, call->tx_bottom + 1); - list_del_rcu(&txb->call_link); - - trace_rxrpc_txqueue(call, rxrpc_txqueue_dequeue); - - rxrpc_put_txbuf(txb, rxrpc_txbuf_put_rotated); - if (after(call->acks_hard_ack, call->tx_bottom + 128)) - wake = true; - } - - if (wake) - wake_up(&call->waitq); -} |