summaryrefslogtreecommitdiff
path: root/net/rxrpc
diff options
context:
space:
mode:
Diffstat (limited to 'net/rxrpc')
-rw-r--r--net/rxrpc/Kconfig23
-rw-r--r--net/rxrpc/Makefile7
-rw-r--r--net/rxrpc/af_rxrpc.c143
-rw-r--r--net/rxrpc/ar-internal.h437
-rw-r--r--net/rxrpc/call_accept.c60
-rw-r--r--net/rxrpc/call_event.c387
-rw-r--r--net/rxrpc/call_object.c107
-rw-r--r--net/rxrpc/conn_client.c28
-rw-r--r--net/rxrpc/conn_event.c191
-rw-r--r--net/rxrpc/conn_object.c38
-rw-r--r--net/rxrpc/input.c757
-rw-r--r--net/rxrpc/input_rack.c418
-rw-r--r--net/rxrpc/insecure.c23
-rw-r--r--net/rxrpc/io_thread.c129
-rw-r--r--net/rxrpc/key.c187
-rw-r--r--net/rxrpc/local_object.c9
-rw-r--r--net/rxrpc/misc.c4
-rw-r--r--net/rxrpc/net_ns.c4
-rw-r--r--net/rxrpc/oob.c379
-rw-r--r--net/rxrpc/output.c637
-rw-r--r--net/rxrpc/peer_event.c130
-rw-r--r--net/rxrpc/peer_object.c87
-rw-r--r--net/rxrpc/proc.c61
-rw-r--r--net/rxrpc/protocol.h33
-rw-r--r--net/rxrpc/recvmsg.c150
-rw-r--r--net/rxrpc/rtt.c103
-rw-r--r--net/rxrpc/rxgk.c1371
-rw-r--r--net/rxrpc/rxgk_app.c286
-rw-r--r--net/rxrpc/rxgk_common.h139
-rw-r--r--net/rxrpc/rxgk_kdf.c288
-rw-r--r--net/rxrpc/rxkad.c362
-rw-r--r--net/rxrpc/rxperf.c92
-rw-r--r--net/rxrpc/security.c7
-rw-r--r--net/rxrpc/sendmsg.c142
-rw-r--r--net/rxrpc/server_key.c42
-rw-r--r--net/rxrpc/sysctl.c7
-rw-r--r--net/rxrpc/txbuf.c174
37 files changed, 5935 insertions, 1507 deletions
diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig
index a20986806fea..f60b81c66078 100644
--- a/net/rxrpc/Kconfig
+++ b/net/rxrpc/Kconfig
@@ -67,6 +67,29 @@ config RXKAD
See Documentation/networking/rxrpc.rst.
+config RXGK
+ bool "RxRPC GSSAPI security"
+ select CRYPTO_KRB5
+ select CRYPTO_MANAGER
+ select CRYPTO_KRB5ENC
+ select CRYPTO_AUTHENC
+ select CRYPTO_SKCIPHER
+ select CRYPTO_HASH_INFO
+ select CRYPTO_HMAC
+ select CRYPTO_CMAC
+ select CRYPTO_SHA1
+ select CRYPTO_SHA256
+ select CRYPTO_SHA512
+ select CRYPTO_CBC
+ select CRYPTO_CTS
+ select CRYPTO_AES
+ select CRYPTO_CAMELLIA
+ help
+ Provide the GSSAPI-based RxGK security class for AFS. Keys are added
+ with add_key().
+
+ See Documentation/networking/rxrpc.rst.
+
config RXPERF
tristate "RxRPC test service"
help
diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile
index ac5caf5a48e1..c0542bae719e 100644
--- a/net/rxrpc/Makefile
+++ b/net/rxrpc/Makefile
@@ -16,6 +16,7 @@ rxrpc-y := \
conn_object.o \
conn_service.o \
input.o \
+ input_rack.o \
insecure.o \
io_thread.o \
key.o \
@@ -23,6 +24,7 @@ rxrpc-y := \
local_object.o \
misc.o \
net_ns.o \
+ oob.o \
output.o \
peer_event.o \
peer_object.o \
@@ -38,6 +40,9 @@ rxrpc-y := \
rxrpc-$(CONFIG_PROC_FS) += proc.o
rxrpc-$(CONFIG_RXKAD) += rxkad.o
rxrpc-$(CONFIG_SYSCTL) += sysctl.o
-
+rxrpc-$(CONFIG_RXGK) += \
+ rxgk.o \
+ rxgk_app.o \
+ rxgk_kdf.o
obj-$(CONFIG_RXPERF) += rxperf.o
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 5222bc97d192..36df0274d7b7 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -65,7 +65,7 @@ static void rxrpc_write_space(struct sock *sk)
if (skwq_has_sleeper(wq))
wake_up_interruptible(&wq->wait);
- sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+ sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
}
rcu_read_unlock();
}
@@ -265,7 +265,10 @@ static int rxrpc_listen(struct socket *sock, int backlog)
* @gfp: Allocation flags
*
* Lookup or create a remote transport endpoint record for the specified
- * address and return it with a ref held.
+ * address.
+ *
+ * Return: The peer record found with a reference, %NULL if no record is found
+ * or a negative error code if the address is invalid or unsupported.
*/
struct rxrpc_peer *rxrpc_kernel_lookup_peer(struct socket *sock,
struct sockaddr_rxrpc *srx, gfp_t gfp)
@@ -283,9 +286,11 @@ EXPORT_SYMBOL(rxrpc_kernel_lookup_peer);
/**
* rxrpc_kernel_get_peer - Get a reference on a peer
- * @peer: The peer to get a reference on.
+ * @peer: The peer to get a reference on (may be NULL).
+ *
+ * Get a reference for a remote peer record (if not NULL).
*
- * Get a record for the remote peer in a call.
+ * Return: The @peer argument.
*/
struct rxrpc_peer *rxrpc_kernel_get_peer(struct rxrpc_peer *peer)
{
@@ -296,6 +301,8 @@ EXPORT_SYMBOL(rxrpc_kernel_get_peer);
/**
* rxrpc_kernel_put_peer - Allow a kernel app to drop a peer reference
* @peer: The peer to drop a ref on
+ *
+ * Drop a reference on a peer record.
*/
void rxrpc_kernel_put_peer(struct rxrpc_peer *peer)
{
@@ -320,10 +327,12 @@ EXPORT_SYMBOL(rxrpc_kernel_put_peer);
*
* Allow a kernel service to begin a call on the nominated socket. This just
* sets up all the internal tracking structures and allocates connection and
- * call IDs as appropriate. The call to be used is returned.
+ * call IDs as appropriate.
*
* The default socket destination address and security may be overridden by
* supplying @srx and @key.
+ *
+ * Return: The new call or an error code.
*/
struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
struct rxrpc_peer *peer,
@@ -408,9 +417,9 @@ void rxrpc_kernel_shutdown_call(struct socket *sock, struct rxrpc_call *call)
/* Make sure we're not going to call back into a kernel service */
if (call->notify_rx) {
- spin_lock(&call->notify_lock);
+ spin_lock_irq(&call->notify_lock);
call->notify_rx = rxrpc_dummy_notify_rx;
- spin_unlock(&call->notify_lock);
+ spin_unlock_irq(&call->notify_lock);
}
}
mutex_unlock(&call->user_mutex);
@@ -437,6 +446,8 @@ EXPORT_SYMBOL(rxrpc_kernel_put_call);
*
* Allow a kernel service to find out whether a call is still alive - whether
* it has completed successfully and all received data has been consumed.
+ *
+ * Return: %true if the call is still ongoing and %false if it has completed.
*/
bool rxrpc_kernel_check_life(const struct socket *sock,
const struct rxrpc_call *call)
@@ -450,63 +461,20 @@ bool rxrpc_kernel_check_life(const struct socket *sock,
EXPORT_SYMBOL(rxrpc_kernel_check_life);
/**
- * rxrpc_kernel_get_epoch - Retrieve the epoch value from a call.
- * @sock: The socket the call is on
- * @call: The call to query
- *
- * Allow a kernel service to retrieve the epoch value from a service call to
- * see if the client at the other end rebooted.
- */
-u32 rxrpc_kernel_get_epoch(struct socket *sock, struct rxrpc_call *call)
-{
- return call->conn->proto.epoch;
-}
-EXPORT_SYMBOL(rxrpc_kernel_get_epoch);
-
-/**
- * rxrpc_kernel_new_call_notification - Get notifications of new calls
- * @sock: The socket to intercept received messages on
- * @notify_new_call: Function to be called when new calls appear
- * @discard_new_call: Function to discard preallocated calls
+ * rxrpc_kernel_set_notifications - Set table of callback operations
+ * @sock: The socket to install table upon
+ * @app_ops: Callback operation table to set
*
- * Allow a kernel service to be given notifications about new calls.
+ * Allow a kernel service to set a table of event notifications on a socket.
*/
-void rxrpc_kernel_new_call_notification(
- struct socket *sock,
- rxrpc_notify_new_call_t notify_new_call,
- rxrpc_discard_new_call_t discard_new_call)
+void rxrpc_kernel_set_notifications(struct socket *sock,
+ const struct rxrpc_kernel_ops *app_ops)
{
struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
- rx->notify_new_call = notify_new_call;
- rx->discard_new_call = discard_new_call;
+ rx->app_ops = app_ops;
}
-EXPORT_SYMBOL(rxrpc_kernel_new_call_notification);
-
-/**
- * rxrpc_kernel_set_max_life - Set maximum lifespan on a call
- * @sock: The socket the call is on
- * @call: The call to configure
- * @hard_timeout: The maximum lifespan of the call in ms
- *
- * Set the maximum lifespan of a call. The call will end with ETIME or
- * ETIMEDOUT if it takes longer than this.
- */
-void rxrpc_kernel_set_max_life(struct socket *sock, struct rxrpc_call *call,
- unsigned long hard_timeout)
-{
- ktime_t delay = ms_to_ktime(hard_timeout), expect_term_by;
-
- mutex_lock(&call->user_mutex);
-
- expect_term_by = ktime_add(ktime_get_real(), delay);
- WRITE_ONCE(call->expect_term_by, expect_term_by);
- trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_hard);
- rxrpc_poke_call(call, rxrpc_call_poke_set_timeout);
-
- mutex_unlock(&call->user_mutex);
-}
-EXPORT_SYMBOL(rxrpc_kernel_set_max_life);
+EXPORT_SYMBOL(rxrpc_kernel_set_notifications);
/*
* connect an RxRPC socket
@@ -624,7 +592,10 @@ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
fallthrough;
case RXRPC_SERVER_BOUND:
case RXRPC_SERVER_LISTENING:
- ret = rxrpc_do_sendmsg(rx, m, len);
+ if (m->msg_flags & MSG_OOB)
+ ret = rxrpc_sendmsg_oob(rx, m, len);
+ else
+ ret = rxrpc_do_sendmsg(rx, m, len);
/* The socket has been unlocked */
goto out;
default:
@@ -659,7 +630,7 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname,
sockptr_t optval, unsigned int optlen)
{
struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
- unsigned int min_sec_level;
+ unsigned int min_sec_level, val;
u16 service_upgrade[2];
int ret;
@@ -707,9 +678,10 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname,
ret = -EISCONN;
if (rx->sk.sk_state != RXRPC_UNBOUND)
goto error;
- ret = copy_from_sockptr(&min_sec_level, optval,
- sizeof(unsigned int));
- if (ret < 0)
+ ret = copy_safe_from_sockptr(&min_sec_level,
+ sizeof(min_sec_level),
+ optval, optlen);
+ if (ret)
goto error;
ret = -EINVAL;
if (min_sec_level > RXRPC_SECURITY_MAX)
@@ -739,6 +711,26 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname,
rx->service_upgrade.to = service_upgrade[1];
goto success;
+ case RXRPC_MANAGE_RESPONSE:
+ ret = -EINVAL;
+ if (optlen != sizeof(unsigned int))
+ goto error;
+ ret = -EISCONN;
+ if (rx->sk.sk_state != RXRPC_UNBOUND)
+ goto error;
+ ret = copy_safe_from_sockptr(&val, sizeof(val),
+ optval, optlen);
+ if (ret)
+ goto error;
+ ret = -EINVAL;
+ if (val > 1)
+ goto error;
+ if (val)
+ set_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags);
+ else
+ clear_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags);
+ goto success;
+
default:
break;
}
@@ -845,6 +837,8 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol,
rx->calls = RB_ROOT;
spin_lock_init(&rx->incoming_lock);
+ skb_queue_head_init(&rx->recvmsg_oobq);
+ rx->pending_oobq = RB_ROOT;
INIT_LIST_HEAD(&rx->sock_calls);
INIT_LIST_HEAD(&rx->to_be_accepted);
INIT_LIST_HEAD(&rx->recvmsg_q);
@@ -878,8 +872,10 @@ static int rxrpc_shutdown(struct socket *sock, int flags)
lock_sock(sk);
if (sk->sk_state < RXRPC_CLOSE) {
+ spin_lock_irq(&rx->recvmsg_lock);
sk->sk_state = RXRPC_CLOSE;
sk->sk_shutdown = SHUTDOWN_MASK;
+ spin_unlock_irq(&rx->recvmsg_lock);
} else {
ret = -ESHUTDOWN;
}
@@ -891,12 +887,30 @@ static int rxrpc_shutdown(struct socket *sock, int flags)
}
/*
+ * Purge the out-of-band queue.
+ */
+static void rxrpc_purge_oob_queue(struct sock *sk)
+{
+ struct rxrpc_sock *rx = rxrpc_sk(sk);
+ struct sk_buff *skb;
+
+ while ((skb = skb_dequeue(&rx->recvmsg_oobq)))
+ rxrpc_kernel_free_oob(skb);
+ while (!RB_EMPTY_ROOT(&rx->pending_oobq)) {
+ skb = rb_entry(rx->pending_oobq.rb_node, struct sk_buff, rbnode);
+ rb_erase(&skb->rbnode, &rx->pending_oobq);
+ rxrpc_kernel_free_oob(skb);
+ }
+}
+
+/*
* RxRPC socket destructor
*/
static void rxrpc_sock_destructor(struct sock *sk)
{
_enter("%p", sk);
+ rxrpc_purge_oob_queue(sk);
rxrpc_purge_queue(&sk->sk_receive_queue);
WARN_ON(refcount_read(&sk->sk_wmem_alloc));
@@ -935,7 +949,9 @@ static int rxrpc_release_sock(struct sock *sk)
break;
}
+ spin_lock_irq(&rx->recvmsg_lock);
sk->sk_state = RXRPC_CLOSE;
+ spin_unlock_irq(&rx->recvmsg_lock);
if (rx->local && rx->local->service == rx) {
write_lock(&rx->local->services_lock);
@@ -947,6 +963,7 @@ static int rxrpc_release_sock(struct sock *sk)
rxrpc_discard_prealloc(rx);
rxrpc_release_calls_on_socket(rx);
flush_workqueue(rxrpc_workqueue);
+ rxrpc_purge_oob_queue(sk);
rxrpc_purge_queue(&sk->sk_receive_queue);
rxrpc_unuse_local(rx->local, rxrpc_local_unuse_release_sock);
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 08c0a32db8c7..376e33dce8c1 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -30,6 +30,8 @@ struct rxrpc_crypt {
struct key_preparsed_payload;
struct rxrpc_connection;
struct rxrpc_txbuf;
+struct rxrpc_txqueue;
+struct rxgk_context;
/*
* Mark applied to socket buffers in skb->mark. skb->priority is used
@@ -38,6 +40,7 @@ struct rxrpc_txbuf;
enum rxrpc_skb_mark {
RXRPC_SKB_MARK_PACKET, /* Received packet */
RXRPC_SKB_MARK_ERROR, /* Error notification */
+ RXRPC_SKB_MARK_CHALLENGE, /* Challenge notification */
RXRPC_SKB_MARK_SERVICE_CONN_SECURED, /* Service connection response has been verified */
RXRPC_SKB_MARK_REJECT_BUSY, /* Reject with BUSY */
RXRPC_SKB_MARK_REJECT_ABORT, /* Reject with ABORT (code in skb->priority) */
@@ -98,6 +101,7 @@ struct rxrpc_net {
atomic_t stat_tx_data_send;
atomic_t stat_tx_data_send_frag;
atomic_t stat_tx_data_send_fail;
+ atomic_t stat_tx_data_send_msgsize;
atomic_t stat_tx_data_underflow;
atomic_t stat_tx_data_cwnd_reset;
atomic_t stat_rx_data;
@@ -109,6 +113,8 @@ struct rxrpc_net {
atomic_t stat_tx_ack_skip;
atomic_t stat_tx_acks[256];
atomic_t stat_rx_acks[256];
+ atomic_t stat_tx_jumbo[10];
+ atomic_t stat_rx_jumbo[10];
atomic_t stat_why_req_ack[8];
@@ -142,10 +148,12 @@ struct rxrpc_backlog {
struct rxrpc_sock {
/* WARNING: sk has to be the first member */
struct sock sk;
- rxrpc_notify_new_call_t notify_new_call; /* Func to notify of new call */
- rxrpc_discard_new_call_t discard_new_call; /* Func to discard a new call */
+ const struct rxrpc_kernel_ops *app_ops; /* Table of kernel app notification funcs */
struct rxrpc_local *local; /* local endpoint */
struct rxrpc_backlog *backlog; /* Preallocation for services */
+ struct sk_buff_head recvmsg_oobq; /* OOB messages for recvmsg to pick up */
+ struct rb_root pending_oobq; /* OOB messages awaiting userspace to respond to */
+ u64 oob_id_counter; /* OOB message ID counter */
spinlock_t incoming_lock; /* Incoming call vs service shutdown lock */
struct list_head sock_calls; /* List of calls owned by this socket */
struct list_head to_be_accepted; /* calls awaiting acceptance */
@@ -156,6 +164,7 @@ struct rxrpc_sock {
struct rb_root calls; /* User ID -> call mapping */
unsigned long flags;
#define RXRPC_SOCK_CONNECTED 0 /* connect_srx is set */
+#define RXRPC_SOCK_MANAGE_RESPONSE 1 /* User wants to manage RESPONSE packets */
rwlock_t call_lock; /* lock for calls */
u32 min_sec_level; /* minimum security level */
#define RXRPC_SECURITY_MAX RXRPC_SECURITY_ENCRYPT
@@ -199,7 +208,7 @@ struct rxrpc_host_header {
*/
struct rxrpc_skb_priv {
union {
- struct rxrpc_connection *conn; /* Connection referred to (poke packet) */
+ struct rxrpc_connection *poke_conn; /* Conn referred to (poke packet) */
struct {
u16 offset; /* Offset of data */
u16 len; /* Length of data */
@@ -210,10 +219,22 @@ struct rxrpc_skb_priv {
rxrpc_seq_t first_ack; /* First packet in acks table */
rxrpc_seq_t prev_ack; /* Highest seq seen */
rxrpc_serial_t acked_serial; /* Packet in response to (or 0) */
+ u16 nr_acks; /* Number of acks+nacks */
u8 reason; /* Reason for ack */
- u8 nr_acks; /* Number of acks+nacks */
- u8 nr_nacks; /* Number of nacks */
} ack;
+ struct {
+ struct rxrpc_connection *conn; /* Connection referred to */
+ union {
+ u32 rxkad_nonce;
+ };
+ } chall;
+ struct {
+ rxrpc_serial_t challenge_serial;
+ u32 kvno;
+ u32 version;
+ u16 len;
+ u16 ticket_len;
+ } resp;
};
struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */
};
@@ -267,9 +288,24 @@ struct rxrpc_security {
/* issue a challenge */
int (*issue_challenge)(struct rxrpc_connection *);
+ /* Validate a challenge packet */
+ bool (*validate_challenge)(struct rxrpc_connection *conn,
+ struct sk_buff *skb);
+
+ /* Fill out the cmsg for recvmsg() to pass on a challenge to userspace.
+ * The security class gets to add additional information.
+ */
+ int (*challenge_to_recvmsg)(struct rxrpc_connection *conn,
+ struct sk_buff *challenge,
+ struct msghdr *msg);
+
+ /* Parse sendmsg() control message and respond to challenge. */
+ int (*sendmsg_respond_to_challenge)(struct sk_buff *challenge,
+ struct msghdr *msg);
+
/* respond to a challenge */
- int (*respond_to_challenge)(struct rxrpc_connection *,
- struct sk_buff *);
+ int (*respond_to_challenge)(struct rxrpc_connection *conn,
+ struct sk_buff *challenge);
/* verify a response */
int (*verify_response)(struct rxrpc_connection *,
@@ -277,6 +313,11 @@ struct rxrpc_security {
/* clear connection security */
void (*clear)(struct rxrpc_connection *);
+
+ /* Default ticket -> key decoder */
+ int (*default_decode_ticket)(struct rxrpc_connection *conn, struct sk_buff *skb,
+ unsigned int ticket_offset, unsigned int ticket_len,
+ struct key **_key);
};
/*
@@ -320,6 +361,15 @@ struct rxrpc_local {
struct list_head new_client_calls; /* Newly created client calls need connection */
spinlock_t client_call_lock; /* Lock for ->new_client_calls */
struct sockaddr_rxrpc srx; /* local address */
+ union {
+ /* Provide a kvec table sufficiently large to manage either a
+ * DATA packet with a maximum set of jumbo subpackets or a PING
+ * ACK padded out to 64K with zeropages for PMTUD.
+ */
+ struct kvec kvec[1 + RXRPC_MAX_NR_JUMBO > 3 + 16 ?
+ 1 + RXRPC_MAX_NR_JUMBO : 3 + 16];
+ struct bio_vec bvec[3 + 16];
+ };
};
/*
@@ -335,28 +385,31 @@ struct rxrpc_peer {
struct hlist_head error_targets; /* targets for net error distribution */
struct rb_root service_conns; /* Service connections */
struct list_head keepalive_link; /* Link in net->peer_keepalive[] */
+ unsigned long app_data; /* Application data (e.g. afs_server) */
time64_t last_tx_at; /* Last time packet sent here */
seqlock_t service_conn_lock;
spinlock_t lock; /* access lock */
- unsigned int if_mtu; /* interface MTU for this peer */
- unsigned int mtu; /* network MTU for this peer */
- unsigned int maxdata; /* data size (MTU - hdrsize) */
- unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */
int debug_id; /* debug ID for printks */
struct sockaddr_rxrpc srx; /* remote address */
- /* calculated RTT cache */
-#define RXRPC_RTT_CACHE_SIZE 32
- spinlock_t rtt_input_lock; /* RTT lock for input routine */
- ktime_t rtt_last_req; /* Time of last RTT request */
- unsigned int rtt_count; /* Number of samples we've got */
+ /* Path MTU discovery [RFC8899] */
+ unsigned int pmtud_trial; /* Current MTU probe size */
+ unsigned int pmtud_good; /* Largest working MTU probe we've tried */
+ unsigned int pmtud_bad; /* Smallest non-working MTU probe we've tried */
+ bool pmtud_lost; /* T if MTU probe was lost */
+ bool pmtud_probing; /* T if we have an active probe outstanding */
+ bool pmtud_pending; /* T if a call to this peer should send a probe */
+ u8 pmtud_jumbo; /* Max jumbo packets for the MTU */
+ bool ackr_adv_pmtud; /* T if the peer advertises path-MTU */
+ unsigned int ackr_max_data; /* Maximum data advertised by peer */
+ unsigned int if_mtu; /* Local interface MTU (- hdrsize) for this peer */
+ unsigned int max_data; /* Maximum packet data capacity for this peer */
+ unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */
+ unsigned short tx_seg_max; /* Maximum number of transmissable segments */
- u32 srtt_us; /* smoothed round trip time << 3 in usecs */
- u32 mdev_us; /* medium deviation */
- u32 mdev_max_us; /* maximal mdev for the last rtt period */
- u32 rttvar_us; /* smoothed mdev_max */
- u32 rto_us; /* Retransmission timeout in usec */
- u8 backoff; /* Backoff timeout (as shift) */
+ /* Calculated RTT cache */
+ unsigned int recent_srtt_us;
+ unsigned int recent_rto_us;
u8 cong_ssthresh; /* Congestion slow-start threshold */
};
@@ -514,7 +567,17 @@ struct rxrpc_connection {
struct rxrpc_crypt csum_iv; /* packet checksum base */
u32 nonce; /* response re-use preventer */
} rxkad;
+ struct {
+ struct rxgk_context *keys[4]; /* (Re-)keying buffer */
+ u64 start_time; /* The start time for TK derivation */
+ u8 nonce[20]; /* Response re-use preventer */
+ u32 enctype; /* Kerberos 5 encoding type */
+ u32 key_number; /* Current key number */
+ } rxgk;
};
+ rwlock_t security_use_lock; /* Security use/modification lock */
+ struct sk_buff *tx_response; /* Response packet to be transmitted */
+
unsigned long flags;
unsigned long events;
unsigned long idle_timestamp; /* Time at which last became idle */
@@ -525,6 +588,8 @@ struct rxrpc_connection {
int debug_id; /* debug ID for printks */
rxrpc_serial_t tx_serial; /* Outgoing packet serial number counter */
unsigned int hi_serial; /* highest serial number received */
+ rxrpc_serial_t pmtud_probe; /* Serial of MTU probe (or 0) */
+ unsigned int pmtud_call; /* ID of call used for probe */
u32 service_id; /* Service ID, possibly upgraded */
u32 security_level; /* Security level selected */
u8 security_ix; /* security type */
@@ -557,6 +622,7 @@ enum rxrpc_call_flag {
RXRPC_CALL_RX_LAST, /* Received the last packet (at rxtx_top) */
RXRPC_CALL_TX_LAST, /* Last packet in Tx buffer (at rxtx_top) */
RXRPC_CALL_TX_ALL_ACKED, /* Last packet has been hard-acked */
+ RXRPC_CALL_TX_NO_MORE, /* No more data to transmit (MSG_MORE deasserted) */
RXRPC_CALL_SEND_PING, /* A ping will need to be sent */
RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */
RXRPC_CALL_BEGAN_RX_TIMER, /* We began the expect_rx_by timer */
@@ -567,6 +633,7 @@ enum rxrpc_call_flag {
RXRPC_CALL_EXCLUSIVE, /* The call uses a once-only connection */
RXRPC_CALL_RX_IS_IDLE, /* recvmsg() is idle - send an ACK */
RXRPC_CALL_RECVMSG_READ_ALL, /* recvmsg() read all of the received data */
+ RXRPC_CALL_CONN_CHALLENGING, /* The connection is being challenged */
};
/*
@@ -587,7 +654,6 @@ enum rxrpc_call_state {
RXRPC_CALL_CLIENT_AWAIT_REPLY, /* - client awaiting reply */
RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */
RXRPC_CALL_SERVER_PREALLOC, /* - service preallocation */
- RXRPC_CALL_SERVER_SECURING, /* - server securing request connection */
RXRPC_CALL_SERVER_RECV_REQUEST, /* - server receiving request */
RXRPC_CALL_SERVER_ACK_REQUEST, /* - server pending ACK of request */
RXRPC_CALL_SERVER_SEND_REPLY, /* - server sending reply */
@@ -599,13 +665,25 @@ enum rxrpc_call_state {
/*
* Call Tx congestion management modes.
*/
-enum rxrpc_congest_mode {
- RXRPC_CALL_SLOW_START,
- RXRPC_CALL_CONGEST_AVOIDANCE,
- RXRPC_CALL_PACKET_LOSS,
- RXRPC_CALL_FAST_RETRANSMIT,
- NR__RXRPC_CONGEST_MODES
-};
+enum rxrpc_ca_state {
+ RXRPC_CA_SLOW_START,
+ RXRPC_CA_CONGEST_AVOIDANCE,
+ RXRPC_CA_PACKET_LOSS,
+ RXRPC_CA_FAST_RETRANSMIT,
+ NR__RXRPC_CA_STATES
+} __mode(byte);
+
+/*
+ * Current purpose of call RACK timer. According to the RACK-TLP protocol
+ * [RFC8985], the transmission timer (call->rack_timo_at) may only be used for
+ * one of these at once.
+ */
+enum rxrpc_rack_timer_mode {
+ RXRPC_CALL_RACKTIMER_OFF, /* Timer not running */
+ RXRPC_CALL_RACKTIMER_RACK_REORDER, /* RACK reordering timer */
+ RXRPC_CALL_RACKTIMER_TLP_PTO, /* TLP timeout */
+ RXRPC_CALL_RACKTIMER_RTO, /* Retransmission timeout */
+} __mode(byte);
/*
* RxRPC call definition
@@ -624,8 +702,7 @@ struct rxrpc_call {
struct mutex user_mutex; /* User access mutex */
struct sockaddr_rxrpc dest_srx; /* Destination address */
ktime_t delay_ack_at; /* When DELAY ACK needs to happen */
- ktime_t ack_lost_at; /* When ACK is figured as lost */
- ktime_t resend_at; /* When next resend needs to happen */
+ ktime_t rack_timo_at; /* When ACK is figured as lost */
ktime_t ping_at; /* When next to send a ping */
ktime_t keepalive_at; /* When next to send a keepalive ping */
ktime_t expect_rx_by; /* When we expect to get a packet by */
@@ -666,25 +743,35 @@ struct rxrpc_call {
u32 call_id; /* call ID on connection */
u32 cid; /* connection ID plus channel index */
u32 security_level; /* Security level selected */
+ u32 security_enctype; /* Security-specific encoding type (or 0) */
int debug_id; /* debug ID for printks */
unsigned short rx_pkt_offset; /* Current recvmsg packet offset */
unsigned short rx_pkt_len; /* Current recvmsg packet len */
+ /* Sendmsg data tracking. */
+ rxrpc_seq_t send_top; /* Highest Tx slot filled by sendmsg. */
+ struct rxrpc_txqueue *send_queue; /* Queue that sendmsg is writing into */
+
/* Transmitted data tracking. */
- spinlock_t tx_lock; /* Transmit queue lock */
- struct list_head tx_sendmsg; /* Sendmsg prepared packets */
- struct list_head tx_buffer; /* Buffer of transmissible packets */
+ struct rxrpc_txqueue *tx_queue; /* Start of transmission buffers */
+ struct rxrpc_txqueue *tx_qtail; /* End of transmission buffers */
+ rxrpc_seq_t tx_qbase; /* First slot in tx_queue */
rxrpc_seq_t tx_bottom; /* First packet in buffer */
rxrpc_seq_t tx_transmitted; /* Highest packet transmitted */
- rxrpc_seq_t tx_prepared; /* Highest Tx slot prepared. */
rxrpc_seq_t tx_top; /* Highest Tx slot allocated. */
+ rxrpc_serial_t tx_last_serial; /* Serial of last DATA transmitted */
u16 tx_backoff; /* Delay to insert due to Tx failure (ms) */
- u8 tx_winsize; /* Maximum size of Tx window */
+ u16 tx_nr_sent; /* Number of packets sent, but unacked */
+ u16 tx_nr_lost; /* Number of packets marked lost */
+ u16 tx_nr_resent; /* Number of packets resent, but unacked */
+ u16 tx_winsize; /* Maximum size of Tx window */
#define RXRPC_TX_MAX_WINDOW 128
+ u8 tx_jumbo_max; /* Maximum subpkts peer will accept */
ktime_t tx_last_sent; /* Last time a transmission occurred */
/* Received data tracking */
struct sk_buff_head recvmsg_queue; /* Queue of packets ready for recvmsg() */
+ struct sk_buff_head rx_queue; /* Queue of packets for this call to receive */
struct sk_buff_head rx_oos_queue; /* Queue of out of sequence packets */
rxrpc_seq_t rx_highest_seq; /* Higest sequence number received */
@@ -697,15 +784,33 @@ struct rxrpc_call {
* packets) rather than bytes.
*/
#define RXRPC_TX_SMSS RXRPC_JUMBO_DATALEN
-#define RXRPC_MIN_CWND (RXRPC_TX_SMSS > 2190 ? 2 : RXRPC_TX_SMSS > 1095 ? 3 : 4)
- u8 cong_cwnd; /* Congestion window size */
+#define RXRPC_MIN_CWND 4
+ enum rxrpc_ca_state cong_ca_state; /* Congestion control state */
u8 cong_extra; /* Extra to send for congestion management */
- u8 cong_ssthresh; /* Slow-start threshold */
- enum rxrpc_congest_mode cong_mode:8; /* Congestion management mode */
- u8 cong_dup_acks; /* Count of ACKs showing missing packets */
- u8 cong_cumul_acks; /* Cumulative ACK count */
+ u16 cong_cwnd; /* Congestion window size */
+ u16 cong_ssthresh; /* Slow-start threshold */
+ u16 cong_dup_acks; /* Count of ACKs showing missing packets */
+ u16 cong_cumul_acks; /* Cumulative ACK count */
ktime_t cong_tstamp; /* Last time cwnd was changed */
- struct sk_buff *cong_last_nack; /* Last ACK with nacks received */
+
+ /* RACK-TLP [RFC8985] state. */
+ ktime_t rack_xmit_ts; /* Latest transmission timestamp */
+ ktime_t rack_rtt; /* RTT of most recently ACK'd segment */
+ ktime_t rack_rtt_ts; /* Timestamp of rack_rtt */
+ ktime_t rack_reo_wnd; /* Reordering window */
+ unsigned int rack_reo_wnd_mult; /* Multiplier applied to rack_reo_wnd */
+ int rack_reo_wnd_persist; /* Num loss recoveries before reset reo_wnd */
+ rxrpc_seq_t rack_fack; /* Highest sequence so far ACK'd */
+ rxrpc_seq_t rack_end_seq; /* Highest sequence seen */
+ rxrpc_seq_t rack_dsack_round; /* DSACK opt recv'd in latest roundtrip */
+ bool rack_dsack_round_none; /* T if dsack_round is "None" */
+ bool rack_reordering_seen; /* T if detected reordering event */
+ enum rxrpc_rack_timer_mode rack_timer_mode; /* Current mode of RACK timer */
+ bool tlp_is_retrans; /* T if unacked TLP retransmission */
+ rxrpc_serial_t tlp_serial; /* Serial of TLP probe (or 0 if none in progress) */
+ rxrpc_seq_t tlp_seq; /* Sequence of TLP probe */
+ unsigned int tlp_rtt_taken; /* Last time RTT taken */
+ ktime_t tlp_max_ack_delay; /* Sender budget for max delayed ACK interval */
/* Receive-phase ACK management (ACKs we send). */
u8 ackr_reason; /* reason to ACK */
@@ -730,32 +835,45 @@ struct rxrpc_call {
/* Transmission-phase ACK management (ACKs we've received). */
ktime_t acks_latest_ts; /* Timestamp of latest ACK received */
- rxrpc_seq_t acks_first_seq; /* first sequence number received */
+ rxrpc_seq_t acks_hard_ack; /* Highest sequence hard acked */
rxrpc_seq_t acks_prev_seq; /* Highest previousPacket received */
- rxrpc_seq_t acks_hard_ack; /* Latest hard-ack point */
rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */
rxrpc_serial_t acks_highest_serial; /* Highest serial number ACK'd */
+ unsigned short acks_nr_sacks; /* Number of soft acks recorded */
+ unsigned short acks_nr_snacks; /* Number of soft nacks recorded */
+
+ /* Calculated RTT cache */
+ ktime_t rtt_last_req; /* Time of last RTT request */
+ unsigned int rtt_count; /* Number of samples we've got */
+ unsigned int rtt_taken; /* Number of samples taken (wrapping) */
+ struct minmax min_rtt; /* Estimated minimum RTT */
+ u32 srtt_us; /* smoothed round trip time << 3 in usecs */
+ u32 mdev_us; /* medium deviation */
+ u32 mdev_max_us; /* maximal mdev for the last rtt period */
+ u32 rttvar_us; /* smoothed mdev_max */
+ u32 rto_us; /* Retransmission timeout in usec */
+ u8 backoff; /* Backoff timeout (as shift) */
};
/*
* Summary of a new ACK and the changes it made to the Tx buffer packet states.
*/
struct rxrpc_ack_summary {
- u16 nr_acks; /* Number of ACKs in packet */
- u16 nr_new_acks; /* Number of new ACKs in packet */
- u16 nr_new_nacks; /* Number of new nacks in packet */
- u16 nr_retained_nacks; /* Number of nacks retained between ACKs */
- u8 ack_reason;
- bool saw_nacks; /* Saw NACKs in packet */
- bool new_low_nack; /* T if new low NACK found */
- bool retrans_timeo; /* T if reTx due to timeout happened */
- u8 flight_size; /* Number of unreceived transmissions */
- /* Place to stash values for tracing */
- enum rxrpc_congest_mode mode:8;
- u8 cwnd;
- u8 ssthresh;
- u8 dup_acks;
- u8 cumulative_acks;
+ rxrpc_serial_t ack_serial; /* Serial number of ACK */
+ rxrpc_serial_t acked_serial; /* Serial number ACK'd */
+ u16 in_flight; /* Number of unreceived transmissions */
+ u16 nr_new_hacks; /* Number of rotated new ACKs */
+ u16 nr_new_sacks; /* Number of new soft ACKs in packet */
+ u16 nr_new_snacks; /* Number of new soft nacks in packet */
+ u8 ack_reason;
+ bool new_low_snack:1; /* T if new low soft NACK found */
+ bool retrans_timeo:1; /* T if reTx due to timeout happened */
+ bool need_retransmit:1; /* T if we need transmission */
+ bool rtt_sample_avail:1; /* T if RTT sample available */
+ bool in_fast_or_rto_recovery:1;
+ bool exiting_fast_or_rto_recovery:1;
+ bool tlp_probe_acked:1; /* T if the TLP probe seq was acked */
+ u8 /*enum rxrpc_congest_change*/ change;
};
/*
@@ -793,25 +911,24 @@ struct rxrpc_send_params {
* Buffer of data to be output as a packet.
*/
struct rxrpc_txbuf {
- struct list_head call_link; /* Link in call->tx_sendmsg/tx_buffer */
- struct list_head tx_link; /* Link in live Enc queue or Tx queue */
- ktime_t last_sent; /* Time at which last transmitted */
refcount_t ref;
rxrpc_seq_t seq; /* Sequence number of this packet */
rxrpc_serial_t serial; /* Last serial number transmitted with */
unsigned int call_debug_id;
unsigned int debug_id;
- unsigned int len; /* Amount of data in buffer */
- unsigned int space; /* Remaining data space */
- unsigned int offset; /* Offset of fill point */
+ unsigned short len; /* Amount of data in buffer */
+ unsigned short space; /* Remaining data space */
+ unsigned short offset; /* Offset of fill point */
+ unsigned short crypto_header; /* Size of crypto header */
+ unsigned short sec_header; /* Size of security header */
+ unsigned short pkt_len; /* Size of packet content */
+ unsigned short alloc_size; /* Amount of bufferage allocated */
unsigned int flags;
#define RXRPC_TXBUF_WIRE_FLAGS 0xff /* The wire protocol flags */
#define RXRPC_TXBUF_RESENT 0x100 /* Set if has been resent */
__be16 cksum; /* Checksum to go in header */
- unsigned short ack_rwind; /* ACK receive window */
- u8 /*enum rxrpc_propose_ack_trace*/ ack_why; /* If ack, why */
- u8 nr_kvec; /* Amount of kvec[] used */
- struct kvec kvec[3];
+ bool jumboable; /* Can be non-terminal jumbo subpacket */
+ void *data; /* Data with preceding jumbo header */
};
static inline bool rxrpc_sending_to_server(const struct rxrpc_txbuf *txb)
@@ -824,6 +941,46 @@ static inline bool rxrpc_sending_to_client(const struct rxrpc_txbuf *txb)
return !rxrpc_sending_to_server(txb);
}
+/*
+ * Transmit queue element, including RACK [RFC8985] per-segment metadata. The
+ * transmission timestamp is in usec from the base.
+ */
+struct rxrpc_txqueue {
+ /* Start with the members we want to prefetch. */
+ struct rxrpc_txqueue *next;
+ ktime_t xmit_ts_base;
+ rxrpc_seq_t qbase;
+ u8 nr_reported_acks; /* Number of segments explicitly acked/nacked */
+ unsigned long segment_acked; /* Bit-per-buf: Set if ACK'd */
+ unsigned long segment_lost; /* Bit-per-buf: Set if declared lost */
+ unsigned long segment_retransmitted; /* Bit-per-buf: Set if retransmitted */
+ unsigned long rtt_samples; /* Bit-per-buf: Set if available for RTT */
+ unsigned long ever_retransmitted; /* Bit-per-buf: Set if ever retransmitted */
+
+ /* The arrays we want to pack into as few cache lines as possible. */
+ struct {
+#define RXRPC_NR_TXQUEUE BITS_PER_LONG
+#define RXRPC_TXQ_MASK (RXRPC_NR_TXQUEUE - 1)
+ struct rxrpc_txbuf *bufs[RXRPC_NR_TXQUEUE];
+ unsigned int segment_serial[RXRPC_NR_TXQUEUE];
+ unsigned int segment_xmit_ts[RXRPC_NR_TXQUEUE];
+ } ____cacheline_aligned;
+};
+
+/*
+ * Data transmission request.
+ */
+struct rxrpc_send_data_req {
+ ktime_t now; /* Current time */
+ struct rxrpc_txqueue *tq; /* Tx queue segment holding first DATA */
+ rxrpc_seq_t seq; /* Sequence of first data */
+ int n; /* Number of DATA packets to glue into jumbo */
+ bool retrans; /* T if this is a retransmission */
+ bool did_send; /* T if did actually send */
+ bool tlp_probe; /* T if this is a TLP probe */
+ int /* enum rxrpc_txdata_trace */ trace;
+};
+
#include <trace/events/rxrpc.h>
/*
@@ -841,6 +998,21 @@ static inline rxrpc_serial_t rxrpc_get_next_serial(struct rxrpc_connection *conn
}
/*
+ * Allocate the next serial n numbers on a connection. 0 must be skipped.
+ */
+static inline rxrpc_serial_t rxrpc_get_next_serials(struct rxrpc_connection *conn,
+ unsigned int n)
+{
+ rxrpc_serial_t serial;
+
+ serial = conn->tx_serial;
+ if (serial + n <= n)
+ serial = 1;
+ conn->tx_serial = serial + n;
+ return serial;
+}
+
+/*
* af_rxrpc.c
*/
extern atomic_t rxrpc_n_rx_skbs;
@@ -856,7 +1028,6 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local,
struct rxrpc_connection *conn,
struct sockaddr_rxrpc *peer_srx,
struct sk_buff *skb);
-void rxrpc_accept_incoming_calls(struct rxrpc_local *);
int rxrpc_user_charge_accept(struct rxrpc_sock *, unsigned long);
/*
@@ -866,10 +1037,10 @@ void rxrpc_propose_ping(struct rxrpc_call *call, u32 serial,
enum rxrpc_propose_ack_trace why);
void rxrpc_propose_delay_ACK(struct rxrpc_call *, rxrpc_serial_t,
enum rxrpc_propose_ack_trace);
-void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *);
-void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb);
-
-bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb);
+void rxrpc_resend_tlp(struct rxrpc_call *call);
+void rxrpc_transmit_some_data(struct rxrpc_call *call, unsigned int limit,
+ enum rxrpc_txdata_trace trace);
+bool rxrpc_input_call_event(struct rxrpc_call *call);
/*
* call_object.c
@@ -884,7 +1055,9 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *, gfp_t, unsigned int);
struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *,
struct rxrpc_conn_parameters *,
struct rxrpc_call_params *, gfp_t,
- unsigned int);
+ unsigned int)
+ __releases(&rx->sk.sk_lock)
+ __acquires(&call->user_mutex);
void rxrpc_start_call_timer(struct rxrpc_call *call);
void rxrpc_incoming_call(struct rxrpc_sock *, struct rxrpc_call *,
struct sk_buff *);
@@ -969,7 +1142,6 @@ void rxrpc_connect_client_calls(struct rxrpc_local *local);
void rxrpc_expose_client_call(struct rxrpc_call *);
void rxrpc_disconnect_client_call(struct rxrpc_bundle *, struct rxrpc_call *);
void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle);
-void rxrpc_put_client_conn(struct rxrpc_connection *, enum rxrpc_conn_trace);
void rxrpc_discard_expired_client_conns(struct rxrpc_local *local);
void rxrpc_clean_up_local_conns(struct rxrpc_local *);
@@ -1049,6 +1221,32 @@ void rxrpc_input_call_packet(struct rxrpc_call *, struct sk_buff *);
void rxrpc_implicit_end_call(struct rxrpc_call *, struct sk_buff *);
/*
+ * input_rack.c
+ */
+void rxrpc_input_rack_one(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary,
+ struct rxrpc_txqueue *tq,
+ unsigned int ix);
+void rxrpc_input_rack(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary,
+ struct rxrpc_txqueue *tq,
+ unsigned long new_acks);
+void rxrpc_rack_detect_loss_and_arm_timer(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary);
+ktime_t rxrpc_tlp_calc_pto(struct rxrpc_call *call, ktime_t now);
+void rxrpc_tlp_send_probe(struct rxrpc_call *call);
+void rxrpc_tlp_process_ack(struct rxrpc_call *call, struct rxrpc_ack_summary *summary);
+void rxrpc_rack_timer_expired(struct rxrpc_call *call, ktime_t overran_by);
+
+/* Initialise TLP state [RFC8958 7.1]. */
+static inline void rxrpc_tlp_init(struct rxrpc_call *call)
+{
+ call->tlp_serial = 0;
+ call->tlp_seq = call->acks_hard_ack;
+ call->tlp_is_retrans = false;
+}
+
+/*
* io_thread.c
*/
int rxrpc_encap_rcv(struct sock *, struct sk_buff *);
@@ -1056,9 +1254,12 @@ void rxrpc_error_report(struct sock *);
bool rxrpc_direct_abort(struct sk_buff *skb, enum rxrpc_abort_reason why,
s32 abort_code, int err);
int rxrpc_io_thread(void *data);
+void rxrpc_post_response(struct rxrpc_connection *conn, struct sk_buff *skb);
static inline void rxrpc_wake_up_io_thread(struct rxrpc_local *local)
{
- wake_up_process(local->io_thread);
+ if (!local->io_thread)
+ return;
+ wake_up_process(READ_ONCE(local->io_thread));
}
static inline bool rxrpc_protocol_error(struct sk_buff *skb, enum rxrpc_abort_reason why)
@@ -1147,21 +1348,33 @@ static inline struct rxrpc_net *rxrpc_net(struct net *net)
}
/*
+ * out_of_band.c
+ */
+void rxrpc_notify_socket_oob(struct rxrpc_call *call, struct sk_buff *skb);
+void rxrpc_add_pending_oob(struct rxrpc_sock *rx, struct sk_buff *skb);
+int rxrpc_sendmsg_oob(struct rxrpc_sock *rx, struct msghdr *msg, size_t len);
+
+/*
* output.c
*/
+ssize_t do_udp_sendmsg(struct socket *socket, struct msghdr *msg, size_t len);
void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason,
rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why);
+void rxrpc_send_probe_for_pmtud(struct rxrpc_call *call);
int rxrpc_send_abort_packet(struct rxrpc_call *);
+void rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req *req);
void rxrpc_send_conn_abort(struct rxrpc_connection *conn);
void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb);
void rxrpc_send_keepalive(struct rxrpc_peer *);
-void rxrpc_transmit_one(struct rxrpc_call *call, struct rxrpc_txbuf *txb);
+void rxrpc_send_response(struct rxrpc_connection *conn, struct sk_buff *skb);
/*
* peer_event.c
*/
void rxrpc_input_error(struct rxrpc_local *, struct sk_buff *);
void rxrpc_peer_keepalive_worker(struct work_struct *);
+void rxrpc_input_probe_for_pmtud(struct rxrpc_connection *conn, rxrpc_serial_t acked_serial,
+ bool sendmsg_fail);
/*
* peer_object.c
@@ -1210,10 +1423,17 @@ static inline int rxrpc_abort_eproto(struct rxrpc_call *call,
/*
* rtt.c
*/
-void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace, int,
- rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t);
-ktime_t rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans);
-void rxrpc_peer_init_rtt(struct rxrpc_peer *);
+void rxrpc_call_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why,
+ int rtt_slot,
+ rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial,
+ ktime_t send_time, ktime_t resp_time);
+ktime_t rxrpc_get_rto_backoff(struct rxrpc_call *call, bool retrans);
+void rxrpc_call_init_rtt(struct rxrpc_call *call);
+
+/*
+ * rxgk.c
+ */
+extern const struct rxrpc_security rxgk_yfs;
/*
* rxkad.c
@@ -1286,8 +1506,6 @@ static inline void rxrpc_sysctl_exit(void) {}
extern atomic_t rxrpc_nr_txbuf;
struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_size,
size_t data_align, gfp_t gfp);
-struct rxrpc_txbuf *rxrpc_alloc_ack_txbuf(struct rxrpc_call *call, size_t sack_size);
-void rxrpc_get_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what);
void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what);
void rxrpc_put_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what);
@@ -1313,6 +1531,53 @@ static inline bool after_eq(u32 seq1, u32 seq2)
return (s32)(seq1 - seq2) >= 0;
}
+static inline u32 earliest(u32 seq1, u32 seq2)
+{
+ return before(seq1, seq2) ? seq1 : seq2;
+}
+
+static inline u32 latest(u32 seq1, u32 seq2)
+{
+ return after(seq1, seq2) ? seq1 : seq2;
+}
+
+static inline bool rxrpc_seq_in_txq(const struct rxrpc_txqueue *tq, rxrpc_seq_t seq)
+{
+ return (seq & (RXRPC_NR_TXQUEUE - 1)) == tq->qbase;
+}
+
+static inline void rxrpc_queue_rx_call_packet(struct rxrpc_call *call, struct sk_buff *skb)
+{
+ rxrpc_get_skb(skb, rxrpc_skb_get_call_rx);
+ __skb_queue_tail(&call->rx_queue, skb);
+ rxrpc_poke_call(call, rxrpc_call_poke_rx_packet);
+}
+
+/*
+ * Calculate how much space there is for transmitting more DATA packets.
+ */
+static inline unsigned int rxrpc_tx_window_space(const struct rxrpc_call *call)
+{
+ int winsize = umin(call->tx_winsize, call->cong_cwnd + call->cong_extra);
+ int transmitted = call->tx_top - call->tx_bottom;
+
+ return max(winsize - transmitted, 0);
+}
+
+static inline unsigned int rxrpc_left_out(const struct rxrpc_call *call)
+{
+ return call->acks_nr_sacks + call->tx_nr_lost;
+}
+
+/*
+ * Calculate the number of transmitted DATA packets assumed to be in flight
+ * [approx RFC6675].
+ */
+static inline unsigned int rxrpc_tx_in_flight(const struct rxrpc_call *call)
+{
+ return call->tx_nr_sent - rxrpc_left_out(call) + call->tx_nr_resent;
+}
+
/*
* debug tracing
*/
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index 0f5a1d77b890..49fccee1a726 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -34,7 +34,6 @@ static void rxrpc_dummy_notify(struct sock *sk, struct rxrpc_call *call,
static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx,
struct rxrpc_backlog *b,
rxrpc_notify_rx_t notify_rx,
- rxrpc_user_attach_call_t user_attach_call,
unsigned long user_call_ID, gfp_t gfp,
unsigned int debug_id)
{
@@ -123,9 +122,10 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx,
call->user_call_ID = user_call_ID;
call->notify_rx = notify_rx;
- if (user_attach_call) {
+ if (rx->app_ops &&
+ rx->app_ops->user_attach_call) {
rxrpc_get_call(call, rxrpc_call_get_kernel_service);
- user_attach_call(call, user_call_ID);
+ rx->app_ops->user_attach_call(call, user_call_ID);
}
rxrpc_get_call(call, rxrpc_call_get_userid);
@@ -149,6 +149,7 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx,
id_in_use:
write_unlock(&rx->call_lock);
+ rxrpc_prefail_call(call, RXRPC_CALL_LOCAL_ERROR, -EBADSLT);
rxrpc_cleanup_call(call);
_leave(" = -EBADSLT");
return -EBADSLT;
@@ -188,8 +189,8 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx)
/* Make sure that there aren't any incoming calls in progress before we
* clear the preallocation buffers.
*/
- spin_lock(&rx->incoming_lock);
- spin_unlock(&rx->incoming_lock);
+ spin_lock_irq(&rx->incoming_lock);
+ spin_unlock_irq(&rx->incoming_lock);
head = b->peer_backlog_head;
tail = b->peer_backlog_tail;
@@ -219,9 +220,10 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx)
while (CIRC_CNT(head, tail, size) > 0) {
struct rxrpc_call *call = b->call_backlog[tail];
rcu_assign_pointer(call->socket, rx);
- if (rx->discard_new_call) {
+ if (rx->app_ops &&
+ rx->app_ops->discard_new_call) {
_debug("discard %lx", call->user_call_ID);
- rx->discard_new_call(call, call->user_call_ID);
+ rx->app_ops->discard_new_call(call, call->user_call_ID);
if (call->notify_rx)
call->notify_rx = rxrpc_dummy_notify;
rxrpc_put_call(call, rxrpc_call_put_kernel);
@@ -253,6 +255,9 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx,
unsigned short call_tail, conn_tail, peer_tail;
unsigned short call_count, conn_count;
+ if (!b)
+ return NULL;
+
/* #calls >= #conns >= #peers must hold true. */
call_head = smp_load_acquire(&b->call_backlog_head);
call_tail = b->call_backlog_tail;
@@ -343,7 +348,7 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local,
if (sp->hdr.type != RXRPC_PACKET_TYPE_DATA)
return rxrpc_protocol_error(skb, rxrpc_eproto_no_service_call);
- read_lock(&local->services_lock);
+ read_lock_irq(&local->services_lock);
/* Weed out packets to services we're not offering. Packets that would
* begin a call are explicitly rejected and the rest are just
@@ -387,8 +392,9 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local,
rxrpc_incoming_call(rx, call, skb);
conn = call->conn;
- if (rx->notify_new_call)
- rx->notify_new_call(&rx->sk, call, call->user_call_ID);
+ if (rx->app_ops &&
+ rx->app_ops->notify_new_call)
+ rx->app_ops->notify_new_call(&rx->sk, call, call->user_call_ID);
spin_lock(&conn->state_lock);
if (conn->state == RXRPC_CONN_SERVICE_UNSECURED) {
@@ -399,34 +405,34 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local,
spin_unlock(&conn->state_lock);
spin_unlock(&rx->incoming_lock);
- read_unlock(&local->services_lock);
+ read_unlock_irq(&local->services_lock);
if (hlist_unhashed(&call->error_link)) {
- spin_lock(&call->peer->lock);
+ spin_lock_irq(&call->peer->lock);
hlist_add_head(&call->error_link, &call->peer->error_targets);
- spin_unlock(&call->peer->lock);
+ spin_unlock_irq(&call->peer->lock);
}
_leave(" = %p{%d}", call, call->debug_id);
- rxrpc_input_call_event(call, skb);
+ rxrpc_queue_rx_call_packet(call, skb);
rxrpc_put_call(call, rxrpc_call_put_input);
return true;
unsupported_service:
- read_unlock(&local->services_lock);
+ read_unlock_irq(&local->services_lock);
return rxrpc_direct_abort(skb, rxrpc_abort_service_not_offered,
RX_INVALID_OPERATION, -EOPNOTSUPP);
unsupported_security:
- read_unlock(&local->services_lock);
+ read_unlock_irq(&local->services_lock);
return rxrpc_direct_abort(skb, rxrpc_abort_service_not_offered,
RX_INVALID_OPERATION, -EKEYREJECTED);
no_call:
spin_unlock(&rx->incoming_lock);
- read_unlock(&local->services_lock);
+ read_unlock_irq(&local->services_lock);
_leave(" = f [%u]", skb->mark);
return false;
discard:
- read_unlock(&local->services_lock);
+ read_unlock_irq(&local->services_lock);
return true;
}
@@ -440,8 +446,7 @@ int rxrpc_user_charge_accept(struct rxrpc_sock *rx, unsigned long user_call_ID)
if (rx->sk.sk_state == RXRPC_CLOSE)
return -ESHUTDOWN;
- return rxrpc_service_prealloc_one(rx, b, NULL, NULL, user_call_ID,
- GFP_KERNEL,
+ return rxrpc_service_prealloc_one(rx, b, NULL, user_call_ID, GFP_KERNEL,
atomic_inc_return(&rxrpc_debug_id));
}
@@ -449,20 +454,18 @@ int rxrpc_user_charge_accept(struct rxrpc_sock *rx, unsigned long user_call_ID)
* rxrpc_kernel_charge_accept - Charge up socket with preallocated calls
* @sock: The socket on which to preallocate
* @notify_rx: Event notification function for the call
- * @user_attach_call: Func to attach call to user_call_ID
* @user_call_ID: The tag to attach to the preallocated call
* @gfp: The allocation conditions.
* @debug_id: The tracing debug ID.
*
- * Charge up the socket with preallocated calls, each with a user ID. A
- * function should be provided to effect the attachment from the user's side.
- * The user is given a ref to hold on the call.
+ * Charge up the socket with preallocated calls, each with a user ID. The
+ * ->user_attach_call() callback function should be provided to effect the
+ * attachment from the user's side. The user is given a ref to hold on the
+ * call.
*
* Note that the call may be come connected before this function returns.
*/
-int rxrpc_kernel_charge_accept(struct socket *sock,
- rxrpc_notify_rx_t notify_rx,
- rxrpc_user_attach_call_t user_attach_call,
+int rxrpc_kernel_charge_accept(struct socket *sock, rxrpc_notify_rx_t notify_rx,
unsigned long user_call_ID, gfp_t gfp,
unsigned int debug_id)
{
@@ -472,8 +475,7 @@ int rxrpc_kernel_charge_accept(struct socket *sock,
if (sock->sk->sk_state == RXRPC_CLOSE)
return -ESHUTDOWN;
- return rxrpc_service_prealloc_one(rx, b, notify_rx,
- user_attach_call, user_call_ID,
+ return rxrpc_service_prealloc_one(rx, b, notify_rx, user_call_ID,
gfp, debug_id);
}
EXPORT_SYMBOL(rxrpc_kernel_charge_accept);
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index 7bbb68504766..fec59d9338b9 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -44,8 +44,8 @@ void rxrpc_propose_delay_ACK(struct rxrpc_call *call, rxrpc_serial_t serial,
trace_rxrpc_propose_ack(call, why, RXRPC_ACK_DELAY, serial);
- if (call->peer->srtt_us)
- delay = (call->peer->srtt_us >> 3) * NSEC_PER_USEC;
+ if (call->srtt_us)
+ delay = (call->srtt_us >> 3) * NSEC_PER_USEC;
else
delay = ms_to_ktime(READ_ONCE(rxrpc_soft_ack_delay));
ktime_add_ms(delay, call->tx_backoff);
@@ -55,147 +55,104 @@ void rxrpc_propose_delay_ACK(struct rxrpc_call *call, rxrpc_serial_t serial,
}
/*
- * Handle congestion being detected by the retransmit timeout.
+ * Retransmit one or more packets.
*/
-static void rxrpc_congestion_timeout(struct rxrpc_call *call)
+static bool rxrpc_retransmit_data(struct rxrpc_call *call,
+ struct rxrpc_send_data_req *req)
{
- set_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags);
+ struct rxrpc_txqueue *tq = req->tq;
+ unsigned int ix = req->seq & RXRPC_TXQ_MASK;
+ struct rxrpc_txbuf *txb = tq->bufs[ix];
+
+ _enter("%x,%x,%x,%x", tq->qbase, req->seq, ix, txb->debug_id);
+
+ req->retrans = true;
+ trace_rxrpc_retransmit(call, req, txb);
+
+ txb->flags |= RXRPC_TXBUF_RESENT;
+ rxrpc_send_data_packet(call, req);
+ rxrpc_inc_stat(call->rxnet, stat_tx_data_retrans);
+
+ req->tq = NULL;
+ req->n = 0;
+ req->did_send = true;
+ req->now = ktime_get_real();
+ return true;
}
/*
* Perform retransmission of NAK'd and unack'd packets.
*/
-void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb)
+static void rxrpc_resend(struct rxrpc_call *call)
{
- struct rxrpc_ackpacket *ack = NULL;
- struct rxrpc_skb_priv *sp;
- struct rxrpc_txbuf *txb;
- rxrpc_seq_t transmitted = call->tx_transmitted;
- ktime_t next_resend = KTIME_MAX, rto = ns_to_ktime(call->peer->rto_us * NSEC_PER_USEC);
- ktime_t resend_at = KTIME_MAX, now, delay;
- bool unacked = false, did_send = false;
- unsigned int i;
-
- _enter("{%d,%d}", call->acks_hard_ack, call->tx_top);
-
- now = ktime_get_real();
-
- if (list_empty(&call->tx_buffer))
- goto no_resend;
+ struct rxrpc_send_data_req req = {
+ .now = ktime_get_real(),
+ .trace = rxrpc_txdata_retransmit,
+ };
+ struct rxrpc_txqueue *tq;
- trace_rxrpc_resend(call, ack_skb);
- txb = list_first_entry(&call->tx_buffer, struct rxrpc_txbuf, call_link);
+ _enter("{%d,%d}", call->tx_bottom, call->tx_top);
- /* Scan the soft ACK table without dropping the lock and resend any
- * explicitly NAK'd packets.
- */
- if (ack_skb) {
- sp = rxrpc_skb(ack_skb);
- ack = (void *)ack_skb->data + sizeof(struct rxrpc_wire_header);
+ trace_rxrpc_resend(call, call->acks_highest_serial);
- for (i = 0; i < sp->ack.nr_acks; i++) {
- rxrpc_seq_t seq;
+ /* Scan the transmission queue, looking for lost packets. */
+ for (tq = call->tx_queue; tq; tq = tq->next) {
+ unsigned long lost = tq->segment_lost;
- if (ack->acks[i] & 1)
- continue;
- seq = sp->ack.first_ack + i;
- if (after(txb->seq, transmitted))
- break;
- if (after(txb->seq, seq))
- continue; /* A new hard ACK probably came in */
- list_for_each_entry_from(txb, &call->tx_buffer, call_link) {
- if (txb->seq == seq)
- goto found_txb;
- }
- goto no_further_resend;
-
- found_txb:
- resend_at = ktime_add(txb->last_sent, rto);
- if (after(txb->serial, call->acks_highest_serial)) {
- if (ktime_after(resend_at, now) &&
- ktime_before(resend_at, next_resend))
- next_resend = resend_at;
- continue; /* Ack point not yet reached */
- }
+ if (after(tq->qbase, call->tx_transmitted))
+ break;
- rxrpc_see_txbuf(txb, rxrpc_txbuf_see_unacked);
+ _debug("retr %16lx %u c=%08x [%x]",
+ tq->segment_acked, tq->nr_reported_acks, call->debug_id, tq->qbase);
+ _debug("lost %16lx", lost);
- trace_rxrpc_retransmit(call, txb->seq, txb->serial,
- ktime_sub(resend_at, now));
+ trace_rxrpc_resend_lost(call, tq, lost);
+ while (lost) {
+ unsigned int ix = __ffs(lost);
+ struct rxrpc_txbuf *txb = tq->bufs[ix];
- txb->flags |= RXRPC_TXBUF_RESENT;
- rxrpc_transmit_one(call, txb);
- did_send = true;
- now = ktime_get_real();
+ __clear_bit(ix, &lost);
+ rxrpc_see_txbuf(txb, rxrpc_txbuf_see_lost);
- if (list_is_last(&txb->call_link, &call->tx_buffer))
- goto no_further_resend;
- txb = list_next_entry(txb, call_link);
+ req.tq = tq;
+ req.seq = tq->qbase + ix;
+ req.n = 1;
+ rxrpc_retransmit_data(call, &req);
}
}
- /* Fast-forward through the Tx queue to the point the peer says it has
- * seen. Anything between the soft-ACK table and that point will get
- * ACK'd or NACK'd in due course, so don't worry about it here; here we
- * need to consider retransmitting anything beyond that point.
- */
- if (after_eq(call->acks_prev_seq, call->tx_transmitted))
- goto no_further_resend;
-
- list_for_each_entry_from(txb, &call->tx_buffer, call_link) {
- resend_at = ktime_add(txb->last_sent, rto);
-
- if (before_eq(txb->seq, call->acks_prev_seq))
- continue;
- if (after(txb->seq, call->tx_transmitted))
- break; /* Not transmitted yet */
-
- if (ack && ack->reason == RXRPC_ACK_PING_RESPONSE &&
- before(txb->serial, ntohl(ack->serial)))
- goto do_resend; /* Wasn't accounted for by a more recent ping. */
-
- if (ktime_after(resend_at, now)) {
- if (ktime_before(resend_at, next_resend))
- next_resend = resend_at;
- continue;
- }
-
- do_resend:
- unacked = true;
-
- txb->flags |= RXRPC_TXBUF_RESENT;
- rxrpc_transmit_one(call, txb);
- did_send = true;
- rxrpc_inc_stat(call->rxnet, stat_tx_data_retrans);
- now = ktime_get_real();
- }
+ rxrpc_get_rto_backoff(call, req.did_send);
+ _leave("");
+}
-no_further_resend:
-no_resend:
- if (resend_at < KTIME_MAX) {
- delay = rxrpc_get_rto_backoff(call->peer, did_send);
- resend_at = ktime_add(resend_at, delay);
- trace_rxrpc_timer_set(call, resend_at - now, rxrpc_timer_trace_resend_reset);
+/*
+ * Resend the highest-seq DATA packet so far transmitted for RACK-TLP [RFC8985 7.3].
+ */
+void rxrpc_resend_tlp(struct rxrpc_call *call)
+{
+ struct rxrpc_send_data_req req = {
+ .now = ktime_get_real(),
+ .seq = call->tx_transmitted,
+ .n = 1,
+ .tlp_probe = true,
+ .trace = rxrpc_txdata_tlp_retransmit,
+ };
+
+ /* There's a chance it'll be on the tail segment of the queue. */
+ req.tq = READ_ONCE(call->tx_qtail);
+ if (req.tq &&
+ before(call->tx_transmitted, req.tq->qbase + RXRPC_NR_TXQUEUE)) {
+ rxrpc_retransmit_data(call, &req);
+ return;
}
- call->resend_at = resend_at;
-
- if (unacked)
- rxrpc_congestion_timeout(call);
-
- /* If there was nothing that needed retransmission then it's likely
- * that an ACK got lost somewhere. Send a ping to find out instead of
- * retransmitting data.
- */
- if (!did_send) {
- ktime_t next_ping = ktime_add_us(call->acks_latest_ts,
- call->peer->srtt_us >> 3);
- if (ktime_sub(next_ping, now) <= 0)
- rxrpc_send_ACK(call, RXRPC_ACK_PING, 0,
- rxrpc_propose_ack_ping_for_0_retrans);
+ for (req.tq = call->tx_queue; req.tq; req.tq = req.tq->next) {
+ if (after_eq(call->tx_transmitted, req.tq->qbase) &&
+ before(call->tx_transmitted, req.tq->qbase + RXRPC_NR_TXQUEUE)) {
+ rxrpc_retransmit_data(call, &req);
+ return;
+ }
}
-
- _leave("");
}
/*
@@ -231,68 +188,93 @@ static void rxrpc_close_tx_phase(struct rxrpc_call *call)
}
}
-static bool rxrpc_tx_window_has_space(struct rxrpc_call *call)
-{
- unsigned int winsize = min_t(unsigned int, call->tx_winsize,
- call->cong_cwnd + call->cong_extra);
- rxrpc_seq_t window = call->acks_hard_ack, wtop = window + winsize;
- rxrpc_seq_t tx_top = call->tx_top;
- int space;
-
- space = wtop - tx_top;
- return space > 0;
-}
-
/*
- * Decant some if the sendmsg prepared queue into the transmission buffer.
+ * Transmit some as-yet untransmitted data, to a maximum of the supplied limit.
*/
-static void rxrpc_decant_prepared_tx(struct rxrpc_call *call)
+static void rxrpc_transmit_fresh_data(struct rxrpc_call *call, unsigned int limit,
+ enum rxrpc_txdata_trace trace)
{
- struct rxrpc_txbuf *txb;
+ int space = rxrpc_tx_window_space(call);
if (!test_bit(RXRPC_CALL_EXPOSED, &call->flags)) {
- if (list_empty(&call->tx_sendmsg))
+ if (call->send_top == call->tx_top)
return;
rxrpc_expose_client_call(call);
}
- while ((txb = list_first_entry_or_null(&call->tx_sendmsg,
- struct rxrpc_txbuf, call_link))) {
- spin_lock(&call->tx_lock);
- list_del(&txb->call_link);
- spin_unlock(&call->tx_lock);
+ while (space > 0) {
+ struct rxrpc_send_data_req req = {
+ .now = ktime_get_real(),
+ .seq = call->tx_transmitted + 1,
+ .n = 0,
+ .trace = trace,
+ };
+ struct rxrpc_txqueue *tq;
+ struct rxrpc_txbuf *txb;
+ rxrpc_seq_t send_top, seq;
+ int limit = min(space, max(call->peer->pmtud_jumbo, 1));
+
+ /* Order send_top before the contents of the new txbufs and
+ * txqueue pointers
+ */
+ send_top = smp_load_acquire(&call->send_top);
+ if (call->tx_top == send_top)
+ break;
- call->tx_top = txb->seq;
- list_add_tail(&txb->call_link, &call->tx_buffer);
+ trace_rxrpc_transmit(call, send_top, space);
- if (txb->flags & RXRPC_LAST_PACKET)
- rxrpc_close_tx_phase(call);
+ tq = call->tx_qtail;
+ seq = call->tx_top;
+ trace_rxrpc_tq(call, tq, seq, rxrpc_tq_decant);
- rxrpc_transmit_one(call, txb);
+ do {
+ int ix;
- if (!rxrpc_tx_window_has_space(call))
- break;
+ seq++;
+ ix = seq & RXRPC_TXQ_MASK;
+ if (!ix) {
+ tq = tq->next;
+ trace_rxrpc_tq(call, tq, seq, rxrpc_tq_decant_advance);
+ }
+ if (!req.tq)
+ req.tq = tq;
+ txb = tq->bufs[ix];
+ req.n++;
+ if (!txb->jumboable)
+ break;
+ } while (req.n < limit && before(seq, send_top));
+
+ if (txb->flags & RXRPC_LAST_PACKET) {
+ rxrpc_close_tx_phase(call);
+ tq = NULL;
+ }
+ call->tx_qtail = tq;
+ call->tx_top = seq;
+
+ space -= req.n;
+ rxrpc_send_data_packet(call, &req);
}
}
-static void rxrpc_transmit_some_data(struct rxrpc_call *call)
+void rxrpc_transmit_some_data(struct rxrpc_call *call, unsigned int limit,
+ enum rxrpc_txdata_trace trace)
{
switch (__rxrpc_call_state(call)) {
case RXRPC_CALL_SERVER_ACK_REQUEST:
- if (list_empty(&call->tx_sendmsg))
+ if (call->tx_bottom == READ_ONCE(call->send_top))
return;
rxrpc_begin_service_reply(call);
fallthrough;
case RXRPC_CALL_SERVER_SEND_REPLY:
case RXRPC_CALL_CLIENT_SEND_REQUEST:
- if (!rxrpc_tx_window_has_space(call))
+ if (!rxrpc_tx_window_space(call))
return;
- if (list_empty(&call->tx_sendmsg)) {
+ if (call->tx_bottom == READ_ONCE(call->send_top)) {
rxrpc_inc_stat(call->rxnet, stat_tx_data_underflow);
return;
}
- rxrpc_decant_prepared_tx(call);
+ rxrpc_transmit_fresh_data(call, limit, trace);
break;
default:
return;
@@ -305,8 +287,8 @@ static void rxrpc_transmit_some_data(struct rxrpc_call *call)
*/
static void rxrpc_send_initial_ping(struct rxrpc_call *call)
{
- if (call->peer->rtt_count < 3 ||
- ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000),
+ if (call->rtt_count < 3 ||
+ ktime_before(ktime_add_ms(call->rtt_last_req, 1000),
ktime_get_real()))
rxrpc_send_ACK(call, RXRPC_ACK_PING, 0,
rxrpc_propose_ack_ping_for_params);
@@ -315,10 +297,11 @@ static void rxrpc_send_initial_ping(struct rxrpc_call *call)
/*
* Handle retransmission and deferred ACK/abort generation.
*/
-bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb)
+bool rxrpc_input_call_event(struct rxrpc_call *call)
{
+ struct sk_buff *skb;
ktime_t now, t;
- bool resend = false;
+ bool did_receive = false, saw_ack = false;
s32 abort_code;
rxrpc_see_call(call, rxrpc_call_see_input);
@@ -328,9 +311,6 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb)
call->debug_id, rxrpc_call_states[__rxrpc_call_state(call)],
call->events);
- if (__rxrpc_call_is_complete(call))
- goto out;
-
/* Handle abort request locklessly, vs rxrpc_propose_abort(). */
abort_code = smp_load_acquire(&call->send_abort);
if (abort_code) {
@@ -339,11 +319,33 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb)
goto out;
}
- if (skb && skb->mark == RXRPC_SKB_MARK_ERROR)
- goto out;
+ do {
+ skb = __skb_dequeue(&call->rx_queue);
+ if (skb) {
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+
+ if (__rxrpc_call_is_complete(call) ||
+ skb->mark == RXRPC_SKB_MARK_ERROR) {
+ rxrpc_free_skb(skb, rxrpc_skb_put_call_rx);
+ goto out;
+ }
+
+ saw_ack |= sp->hdr.type == RXRPC_PACKET_TYPE_ACK;
+
+ rxrpc_input_call_packet(call, skb);
+ rxrpc_free_skb(skb, rxrpc_skb_put_call_rx);
+ did_receive = true;
+ }
- if (skb)
- rxrpc_input_call_packet(call, skb);
+ t = ktime_sub(call->rack_timo_at, ktime_get_real());
+ if (t <= 0) {
+ trace_rxrpc_timer_exp(call, t,
+ rxrpc_timer_trace_rack_off + call->rack_timer_mode);
+ call->rack_timo_at = KTIME_MAX;
+ rxrpc_rack_timer_expired(call, t);
+ }
+
+ } while (!skb_queue_empty(&call->rx_queue));
/* If we see our async-event poke, check for timeout trippage. */
now = ktime_get_real();
@@ -376,13 +378,6 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb)
rxrpc_propose_ack_delayed_ack);
}
- t = ktime_sub(call->ack_lost_at, now);
- if (t <= 0) {
- trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_lost_ack);
- call->ack_lost_at = KTIME_MAX;
- set_bit(RXRPC_CALL_EV_ACK_LOST, &call->events);
- }
-
t = ktime_sub(call->ping_at, now);
if (t <= 0) {
trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_ping);
@@ -391,15 +386,6 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb)
rxrpc_propose_ack_ping_for_keepalive);
}
- t = ktime_sub(call->resend_at, now);
- if (t <= 0) {
- trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_resend);
- call->resend_at = KTIME_MAX;
- resend = true;
- }
-
- rxrpc_transmit_some_data(call);
-
now = ktime_get_real();
t = ktime_sub(call->keepalive_at, now);
if (t <= 0) {
@@ -409,35 +395,40 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb)
rxrpc_propose_ack_ping_for_keepalive);
}
- if (skb) {
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
-
- if (sp->hdr.type == RXRPC_PACKET_TYPE_ACK)
- rxrpc_congestion_degrade(call);
- }
-
if (test_and_clear_bit(RXRPC_CALL_EV_INITIAL_PING, &call->events))
rxrpc_send_initial_ping(call);
+ rxrpc_transmit_some_data(call, UINT_MAX, rxrpc_txdata_new_data);
+
+ if (saw_ack)
+ rxrpc_congestion_degrade(call);
+
+ if (did_receive &&
+ (__rxrpc_call_state(call) == RXRPC_CALL_CLIENT_SEND_REQUEST ||
+ __rxrpc_call_state(call) == RXRPC_CALL_SERVER_SEND_REPLY)) {
+ t = ktime_sub(call->rack_timo_at, ktime_get_real());
+ trace_rxrpc_rack(call, t);
+ }
+
/* Process events */
if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events))
rxrpc_send_ACK(call, RXRPC_ACK_PING, 0,
rxrpc_propose_ack_ping_for_lost_ack);
- if (resend &&
+ if (call->tx_nr_lost > 0 &&
__rxrpc_call_state(call) != RXRPC_CALL_CLIENT_RECV_REPLY &&
!test_bit(RXRPC_CALL_TX_ALL_ACKED, &call->flags))
- rxrpc_resend(call, NULL);
+ rxrpc_resend(call);
if (test_and_clear_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags))
rxrpc_send_ACK(call, RXRPC_ACK_IDLE, 0,
rxrpc_propose_ack_rx_idle);
if (call->ackr_nr_unacked > 2) {
- if (call->peer->rtt_count < 3)
+ if (call->rtt_count < 3)
rxrpc_send_ACK(call, RXRPC_ACK_PING, 0,
rxrpc_propose_ack_ping_for_rtt);
- else if (ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000),
+ else if (ktime_before(ktime_add_ms(call->rtt_last_req, 1000),
ktime_get_real()))
rxrpc_send_ACK(call, RXRPC_ACK_PING, 0,
rxrpc_propose_ack_ping_for_old_rtt);
@@ -455,8 +446,7 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb)
set(call->expect_req_by);
set(call->expect_rx_by);
set(call->delay_ack_at);
- set(call->ack_lost_at);
- set(call->resend_at);
+ set(call->rack_timo_at);
set(call->keepalive_at);
set(call->ping_at);
@@ -467,7 +457,7 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb)
} else {
unsigned long nowj = jiffies, delayj, nextj;
- delayj = max(nsecs_to_jiffies(delay), 1);
+ delayj = umax(nsecs_to_jiffies(delay), 1);
nextj = nowj + delayj;
if (time_before(nextj, call->timer.expires) ||
!timer_pending(&call->timer)) {
@@ -479,14 +469,17 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb)
out:
if (__rxrpc_call_is_complete(call)) {
- del_timer_sync(&call->timer);
+ timer_delete_sync(&call->timer);
if (!test_bit(RXRPC_CALL_DISCONNECTED, &call->flags))
rxrpc_disconnect_call(call);
if (call->security)
call->security->free_call_crypto(call);
+ } else {
+ if (did_receive &&
+ call->peer->ackr_adv_pmtud &&
+ call->peer->pmtud_pending)
+ rxrpc_send_probe_for_pmtud(call);
}
- if (call->acks_hard_ack != call->tx_bottom)
- rxrpc_shrink_call_tx_buffer(call);
_leave("");
return true;
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index 01fa71e8b1f7..15067ff7b1f2 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -22,7 +22,6 @@ const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = {
[RXRPC_CALL_CLIENT_AWAIT_REPLY] = "ClAwtRpl",
[RXRPC_CALL_CLIENT_RECV_REPLY] = "ClRcvRpl",
[RXRPC_CALL_SERVER_PREALLOC] = "SvPrealc",
- [RXRPC_CALL_SERVER_SECURING] = "SvSecure",
[RXRPC_CALL_SERVER_RECV_REQUEST] = "SvRcvReq",
[RXRPC_CALL_SERVER_ACK_REQUEST] = "SvAckReq",
[RXRPC_CALL_SERVER_SEND_REPLY] = "SvSndRpl",
@@ -49,7 +48,7 @@ void rxrpc_poke_call(struct rxrpc_call *call, enum rxrpc_call_poke_trace what)
bool busy;
if (!test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) {
- spin_lock_bh(&local->lock);
+ spin_lock_irq(&local->lock);
busy = !list_empty(&call->attend_link);
trace_rxrpc_poke_call(call, busy, what);
if (!busy && !rxrpc_try_get_call(call, rxrpc_call_get_poke))
@@ -57,7 +56,7 @@ void rxrpc_poke_call(struct rxrpc_call *call, enum rxrpc_call_poke_trace what)
if (!busy) {
list_add_tail(&call->attend_link, &local->call_attend_q);
}
- spin_unlock_bh(&local->lock);
+ spin_unlock_irq(&local->lock);
if (!busy)
rxrpc_wake_up_io_thread(local);
}
@@ -65,7 +64,7 @@ void rxrpc_poke_call(struct rxrpc_call *call, enum rxrpc_call_poke_trace what)
static void rxrpc_call_timer_expired(struct timer_list *t)
{
- struct rxrpc_call *call = from_timer(call, t, timer);
+ struct rxrpc_call *call = timer_container_of(call, t, timer);
_enter("%d", call->debug_id);
@@ -146,23 +145,21 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp,
INIT_LIST_HEAD(&call->recvmsg_link);
INIT_LIST_HEAD(&call->sock_link);
INIT_LIST_HEAD(&call->attend_link);
- INIT_LIST_HEAD(&call->tx_sendmsg);
- INIT_LIST_HEAD(&call->tx_buffer);
skb_queue_head_init(&call->recvmsg_queue);
+ skb_queue_head_init(&call->rx_queue);
skb_queue_head_init(&call->rx_oos_queue);
init_waitqueue_head(&call->waitq);
spin_lock_init(&call->notify_lock);
- spin_lock_init(&call->tx_lock);
refcount_set(&call->ref, 1);
call->debug_id = debug_id;
call->tx_total_len = -1;
+ call->tx_jumbo_max = 1;
call->next_rx_timo = 20 * HZ;
call->next_req_timo = 1 * HZ;
call->ackr_window = 1;
call->ackr_wtop = 1;
call->delay_ack_at = KTIME_MAX;
- call->ack_lost_at = KTIME_MAX;
- call->resend_at = KTIME_MAX;
+ call->rack_timo_at = KTIME_MAX;
call->ping_at = KTIME_MAX;
call->keepalive_at = KTIME_MAX;
call->expect_rx_by = KTIME_MAX;
@@ -174,14 +171,11 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp,
call->rx_winsize = rxrpc_rx_window_size;
call->tx_winsize = 16;
- if (RXRPC_TX_SMSS > 2190)
- call->cong_cwnd = 2;
- else if (RXRPC_TX_SMSS > 1095)
- call->cong_cwnd = 3;
- else
- call->cong_cwnd = 4;
+ call->cong_cwnd = RXRPC_MIN_CWND;
call->cong_ssthresh = RXRPC_TX_MAX_WINDOW;
+ rxrpc_call_init_rtt(call);
+
call->rxnet = rxnet;
call->rtt_avail = RXRPC_CALL_RTT_AVAIL_MASK;
atomic_inc(&rxnet->nr_calls);
@@ -225,9 +219,9 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx,
__set_bit(RXRPC_CALL_EXCLUSIVE, &call->flags);
if (p->timeouts.normal)
- call->next_rx_timo = min(p->timeouts.normal, 1);
+ call->next_rx_timo = umin(p->timeouts.normal, 1);
if (p->timeouts.idle)
- call->next_req_timo = min(p->timeouts.idle, 1);
+ call->next_req_timo = umin(p->timeouts.idle, 1);
if (p->timeouts.hard)
call->hard_timo = p->timeouts.hard;
@@ -307,9 +301,9 @@ static int rxrpc_connect_call(struct rxrpc_call *call, gfp_t gfp)
trace_rxrpc_client(NULL, -1, rxrpc_client_queue_new_call);
rxrpc_get_call(call, rxrpc_call_get_io_thread);
- spin_lock(&local->client_call_lock);
+ spin_lock_irq(&local->client_call_lock);
list_add_tail(&call->wait_link, &local->new_client_calls);
- spin_unlock(&local->client_call_lock);
+ spin_unlock_irq(&local->client_call_lock);
rxrpc_wake_up_io_thread(local);
return 0;
@@ -328,7 +322,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
struct rxrpc_call_params *p,
gfp_t gfp,
unsigned int debug_id)
- __releases(&rx->sk.sk_lock.slock)
+ __releases(&rx->sk.sk_lock)
__acquires(&call->user_mutex)
{
struct rxrpc_call *call, *xcall;
@@ -439,7 +433,7 @@ error_attached_to_socket:
/*
* Set up an incoming call. call->conn points to the connection.
- * This is called in BH context and isn't allowed to fail.
+ * This is called with interrupts disabled and isn't allowed to fail.
*/
void rxrpc_incoming_call(struct rxrpc_sock *rx,
struct rxrpc_call *call,
@@ -458,17 +452,16 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx,
call->cong_tstamp = skb->tstamp;
__set_bit(RXRPC_CALL_EXPOSED, &call->flags);
- rxrpc_set_call_state(call, RXRPC_CALL_SERVER_SECURING);
+ rxrpc_set_call_state(call, RXRPC_CALL_SERVER_RECV_REQUEST);
spin_lock(&conn->state_lock);
switch (conn->state) {
case RXRPC_CONN_SERVICE_UNSECURED:
case RXRPC_CONN_SERVICE_CHALLENGING:
- rxrpc_set_call_state(call, RXRPC_CALL_SERVER_SECURING);
+ __set_bit(RXRPC_CALL_CONN_CHALLENGING, &call->flags);
break;
case RXRPC_CONN_SERVICE:
- rxrpc_set_call_state(call, RXRPC_CALL_SERVER_RECV_REQUEST);
break;
case RXRPC_CONN_ABORTED:
@@ -536,11 +529,29 @@ void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace why)
}
/*
- * Clean up the Rx skb ring.
+ * Clean up the transmission buffers.
*/
-static void rxrpc_cleanup_ring(struct rxrpc_call *call)
+static void rxrpc_cleanup_tx_buffers(struct rxrpc_call *call)
+{
+ struct rxrpc_txqueue *tq, *next;
+
+ for (tq = call->tx_queue; tq; tq = next) {
+ next = tq->next;
+ for (int i = 0; i < RXRPC_NR_TXQUEUE; i++)
+ if (tq->bufs[i])
+ rxrpc_put_txbuf(tq->bufs[i], rxrpc_txbuf_put_cleaned);
+ trace_rxrpc_tq(call, tq, 0, rxrpc_tq_cleaned);
+ kfree(tq);
+ }
+}
+
+/*
+ * Clean up the receive buffers.
+ */
+static void rxrpc_cleanup_rx_buffers(struct rxrpc_call *call)
{
rxrpc_purge_queue(&call->recvmsg_queue);
+ rxrpc_purge_queue(&call->rx_queue);
rxrpc_purge_queue(&call->rx_oos_queue);
}
@@ -563,7 +574,7 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call)
rxrpc_put_call_slot(call);
/* Make sure we don't get any more notifications */
- spin_lock(&rx->recvmsg_lock);
+ spin_lock_irq(&rx->recvmsg_lock);
if (!list_empty(&call->recvmsg_link)) {
_debug("unlinking once-pending call %p { e=%lx f=%lx }",
@@ -576,7 +587,7 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call)
call->recvmsg_link.next = NULL;
call->recvmsg_link.prev = NULL;
- spin_unlock(&rx->recvmsg_lock);
+ spin_unlock_irq(&rx->recvmsg_lock);
if (put)
rxrpc_put_call(call, rxrpc_call_put_unnotify);
@@ -676,23 +687,11 @@ static void rxrpc_rcu_free_call(struct rcu_head *rcu)
static void rxrpc_destroy_call(struct work_struct *work)
{
struct rxrpc_call *call = container_of(work, struct rxrpc_call, destroyer);
- struct rxrpc_txbuf *txb;
- del_timer_sync(&call->timer);
-
- rxrpc_free_skb(call->cong_last_nack, rxrpc_skb_put_last_nack);
- rxrpc_cleanup_ring(call);
- while ((txb = list_first_entry_or_null(&call->tx_sendmsg,
- struct rxrpc_txbuf, call_link))) {
- list_del(&txb->call_link);
- rxrpc_put_txbuf(txb, rxrpc_txbuf_put_cleaned);
- }
- while ((txb = list_first_entry_or_null(&call->tx_buffer,
- struct rxrpc_txbuf, call_link))) {
- list_del(&txb->call_link);
- rxrpc_put_txbuf(txb, rxrpc_txbuf_put_cleaned);
- }
+ timer_delete_sync(&call->timer);
+ rxrpc_cleanup_tx_buffers(call);
+ rxrpc_cleanup_rx_buffers(call);
rxrpc_put_txbuf(call->tx_pending, rxrpc_txbuf_put_cleaned);
rxrpc_put_connection(call->conn, rxrpc_conn_put_call);
rxrpc_deactivate_bundle(call->bundle);
@@ -712,7 +711,7 @@ void rxrpc_cleanup_call(struct rxrpc_call *call)
ASSERTCMP(__rxrpc_call_state(call), ==, RXRPC_CALL_COMPLETE);
ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags));
- del_timer(&call->timer);
+ timer_delete(&call->timer);
if (rcu_read_lock_held())
/* Can't use the rxrpc workqueue as we need to cancel/flush
@@ -761,3 +760,23 @@ void rxrpc_destroy_all_calls(struct rxrpc_net *rxnet)
atomic_dec(&rxnet->nr_calls);
wait_var_event(&rxnet->nr_calls, !atomic_read(&rxnet->nr_calls));
}
+
+/**
+ * rxrpc_kernel_query_call_security - Query call's security parameters
+ * @call: The call to query
+ * @_service_id: Where to return the service ID
+ * @_enctype: Where to return the "encoding type"
+ *
+ * This queries the security parameters of a call, setting *@_service_id and
+ * *@_enctype and returning the security class.
+ *
+ * Return: The security class protocol number.
+ */
+u8 rxrpc_kernel_query_call_security(struct rxrpc_call *call,
+ u16 *_service_id, u32 *_enctype)
+{
+ *_service_id = call->dest_srx.srx_service;
+ *_enctype = call->security_enctype;
+ return call->security_ix;
+}
+EXPORT_SYMBOL(rxrpc_kernel_query_call_security);
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index d25bf1cf3670..63bbcc567f59 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -231,7 +231,7 @@ static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn)
distance = id - id_cursor;
if (distance < 0)
distance = -distance;
- limit = max_t(unsigned long, atomic_read(&rxnet->nr_conns) * 4, 1024);
+ limit = umax(atomic_read(&rxnet->nr_conns) * 4, 1024);
if (distance > limit)
goto mark_dont_reuse;
@@ -437,9 +437,9 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn,
call->dest_srx.srx_service = conn->service_id;
call->cong_ssthresh = call->peer->cong_ssthresh;
if (call->cong_cwnd >= call->cong_ssthresh)
- call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE;
+ call->cong_ca_state = RXRPC_CA_CONGEST_AVOIDANCE;
else
- call->cong_mode = RXRPC_CALL_SLOW_START;
+ call->cong_ca_state = RXRPC_CA_SLOW_START;
chan->call_id = call_id;
chan->call_debug_id = call->debug_id;
@@ -508,15 +508,18 @@ static void rxrpc_activate_channels(struct rxrpc_bundle *bundle)
void rxrpc_connect_client_calls(struct rxrpc_local *local)
{
struct rxrpc_call *call;
+ LIST_HEAD(new_client_calls);
- while ((call = list_first_entry_or_null(&local->new_client_calls,
- struct rxrpc_call, wait_link))
- ) {
+ spin_lock_irq(&local->client_call_lock);
+ list_splice_tail_init(&local->new_client_calls, &new_client_calls);
+ spin_unlock_irq(&local->client_call_lock);
+
+ while ((call = list_first_entry_or_null(&new_client_calls,
+ struct rxrpc_call, wait_link))) {
struct rxrpc_bundle *bundle = call->bundle;
- spin_lock(&local->client_call_lock);
list_move_tail(&call->wait_link, &bundle->waiting_calls);
- spin_unlock(&local->client_call_lock);
+ rxrpc_see_call(call, rxrpc_call_see_waiting_call);
if (rxrpc_bundle_has_space(bundle))
rxrpc_activate_channels(bundle);
@@ -544,9 +547,9 @@ void rxrpc_expose_client_call(struct rxrpc_call *call)
set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags);
trace_rxrpc_client(conn, channel, rxrpc_client_exposed);
- spin_lock(&call->peer->lock);
+ spin_lock_irq(&call->peer->lock);
hlist_add_head(&call->error_link, &call->peer->error_targets);
- spin_unlock(&call->peer->lock);
+ spin_unlock_irq(&call->peer->lock);
}
}
@@ -586,7 +589,10 @@ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call
_debug("call is waiting");
ASSERTCMP(call->call_id, ==, 0);
ASSERT(!test_bit(RXRPC_CALL_EXPOSED, &call->flags));
+ /* May still be on ->new_client_calls. */
+ spin_lock_irq(&local->client_call_lock);
list_del_init(&call->wait_link);
+ spin_unlock_irq(&local->client_call_lock);
return;
}
@@ -812,7 +818,7 @@ void rxrpc_clean_up_local_conns(struct rxrpc_local *local)
local->kill_all_client_conns = true;
- del_timer_sync(&local->client_conn_reap_timer);
+ timer_delete_sync(&local->client_conn_reap_timer);
while ((conn = list_first_entry_or_null(&local->idle_client_conns,
struct rxrpc_connection, cache_link))) {
diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index 598b4ee389fc..232b6986da83 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -19,14 +19,14 @@
/*
* Set the completion state on an aborted connection.
*/
-static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, struct sk_buff *skb,
+static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn,
s32 abort_code, int err,
enum rxrpc_call_completion compl)
{
bool aborted = false;
if (conn->state != RXRPC_CONN_ABORTED) {
- spin_lock(&conn->state_lock);
+ spin_lock_irq(&conn->state_lock);
if (conn->state != RXRPC_CONN_ABORTED) {
conn->abort_code = abort_code;
conn->error = err;
@@ -37,7 +37,7 @@ static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, struct sk_buff
set_bit(RXRPC_CONN_EV_ABORT_CALLS, &conn->events);
aborted = true;
}
- spin_unlock(&conn->state_lock);
+ spin_unlock_irq(&conn->state_lock);
}
return aborted;
@@ -49,12 +49,20 @@ static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, struct sk_buff
int rxrpc_abort_conn(struct rxrpc_connection *conn, struct sk_buff *skb,
s32 abort_code, int err, enum rxrpc_abort_reason why)
{
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- if (rxrpc_set_conn_aborted(conn, skb, abort_code, err,
+ u32 cid = conn->proto.cid, call = 0, seq = 0;
+
+ if (skb) {
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+
+ cid = sp->hdr.cid;
+ call = sp->hdr.callNumber;
+ seq = sp->hdr.seq;
+ }
+
+ if (rxrpc_set_conn_aborted(conn, abort_code, err,
RXRPC_CALL_LOCALLY_ABORTED)) {
- trace_rxrpc_abort(0, why, sp->hdr.cid, sp->hdr.callNumber,
- sp->hdr.seq, abort_code, err);
+ trace_rxrpc_abort(0, why, cid, call, seq, abort_code, err);
rxrpc_poke_conn(conn, rxrpc_conn_get_poke_abort);
}
return -EPROTO;
@@ -63,11 +71,12 @@ int rxrpc_abort_conn(struct rxrpc_connection *conn, struct sk_buff *skb,
/*
* Mark a connection as being remotely aborted.
*/
-static bool rxrpc_input_conn_abort(struct rxrpc_connection *conn,
+static void rxrpc_input_conn_abort(struct rxrpc_connection *conn,
struct sk_buff *skb)
{
- return rxrpc_set_conn_aborted(conn, skb, skb->priority, -ECONNABORTED,
- RXRPC_CALL_REMOTELY_ABORTED);
+ trace_rxrpc_rx_conn_abort(conn, skb);
+ rxrpc_set_conn_aborted(conn, skb->priority, -ECONNABORTED,
+ RXRPC_CALL_REMOTELY_ABORTED);
}
/*
@@ -91,7 +100,7 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
struct rxrpc_acktrailer trailer;
size_t len;
int ret, ioc;
- u32 serial, mtu, call_id, padding;
+ u32 serial, max_mtu, if_mtu, call_id, padding;
_enter("%d", conn->debug_id);
@@ -149,8 +158,13 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
break;
case RXRPC_PACKET_TYPE_ACK:
- mtu = conn->peer->if_mtu;
- mtu -= conn->peer->hdrsize;
+ if_mtu = conn->peer->if_mtu - conn->peer->hdrsize;
+ if (conn->peer->ackr_adv_pmtud) {
+ max_mtu = umax(conn->peer->max_data, rxrpc_rx_mtu);
+ } else {
+ if_mtu = umin(1444, if_mtu);
+ max_mtu = if_mtu;
+ }
pkt.ack.bufferSpace = 0;
pkt.ack.maxSkew = htons(skb ? skb->priority : 0);
pkt.ack.firstPacket = htonl(chan->last_seq + 1);
@@ -158,10 +172,10 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
pkt.ack.serial = htonl(skb ? sp->hdr.serial : 0);
pkt.ack.reason = skb ? RXRPC_ACK_DUPLICATE : RXRPC_ACK_IDLE;
pkt.ack.nAcks = 0;
- trailer.maxMTU = htonl(rxrpc_rx_mtu);
- trailer.ifMTU = htonl(mtu);
+ trailer.maxMTU = htonl(max_mtu);
+ trailer.ifMTU = htonl(if_mtu);
trailer.rwind = htonl(rxrpc_rx_window_size);
- trailer.jumbo_max = htonl(rxrpc_rx_jumbo_max);
+ trailer.jumbo_max = 0;
pkt.whdr.flags |= RXRPC_SLOW_START_OK;
padding = 0;
iov[0].iov_len += sizeof(pkt.ack);
@@ -171,7 +185,8 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
trace_rxrpc_tx_ack(chan->call_debug_id, serial,
ntohl(pkt.ack.firstPacket),
ntohl(pkt.ack.serial),
- pkt.ack.reason, 0, rxrpc_rx_window_size);
+ pkt.ack.reason, 0, rxrpc_rx_window_size,
+ rxrpc_propose_ack_retransmit);
break;
default:
@@ -202,11 +217,14 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn)
for (i = 0; i < RXRPC_MAXCALLS; i++) {
call = conn->channels[i].call;
- if (call)
+ if (call) {
+ rxrpc_see_call(call, rxrpc_call_see_conn_abort);
rxrpc_set_call_completion(call,
conn->completion,
conn->abort_code,
conn->error);
+ rxrpc_poke_call(call, rxrpc_call_poke_conn_abort);
+ }
}
_leave("");
@@ -218,10 +236,8 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn)
*/
static void rxrpc_call_is_secure(struct rxrpc_call *call)
{
- if (call && __rxrpc_call_state(call) == RXRPC_CALL_SERVER_SECURING) {
- rxrpc_set_call_state(call, RXRPC_CALL_SERVER_RECV_REQUEST);
+ if (call && __test_and_clear_bit(RXRPC_CALL_CONN_CHALLENGING, &call->flags))
rxrpc_notify_socket(call);
- }
}
/*
@@ -240,7 +256,10 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
switch (sp->hdr.type) {
case RXRPC_PACKET_TYPE_CHALLENGE:
- return conn->security->respond_to_challenge(conn, skb);
+ ret = conn->security->respond_to_challenge(conn, skb);
+ sp->chall.conn = NULL;
+ rxrpc_put_connection(conn, rxrpc_conn_put_challenge_input);
+ return ret;
case RXRPC_PACKET_TYPE_RESPONSE:
ret = conn->security->verify_response(conn, skb);
@@ -252,16 +271,18 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
if (ret < 0)
return ret;
- spin_lock(&conn->state_lock);
+ spin_lock_irq(&conn->state_lock);
if (conn->state == RXRPC_CONN_SERVICE_CHALLENGING)
conn->state = RXRPC_CONN_SERVICE;
- spin_unlock(&conn->state_lock);
+ spin_unlock_irq(&conn->state_lock);
if (conn->state == RXRPC_CONN_SERVICE) {
/* Offload call state flipping to the I/O thread. As
* we've already received the packet, put it on the
* front of the queue.
*/
+ sp->poke_conn = rxrpc_get_connection(
+ conn, rxrpc_conn_get_poke_secured);
skb->mark = RXRPC_SKB_MARK_SERVICE_CONN_SECURED;
rxrpc_get_skb(skb, rxrpc_skb_get_conn_secured);
skb_queue_head(&conn->local->rx_queue, skb);
@@ -383,6 +404,61 @@ static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn,
}
/*
+ * Post a CHALLENGE packet to the socket of one of a connection's calls so that
+ * it can get application data to include in the packet, possibly querying
+ * userspace.
+ */
+static bool rxrpc_post_challenge(struct rxrpc_connection *conn,
+ struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ struct rxrpc_call *call = NULL;
+ struct rxrpc_sock *rx;
+ bool respond = false;
+
+ sp->chall.conn =
+ rxrpc_get_connection(conn, rxrpc_conn_get_challenge_input);
+
+ if (!conn->security->challenge_to_recvmsg) {
+ rxrpc_post_packet_to_conn(conn, skb);
+ return true;
+ }
+
+ rcu_read_lock();
+
+ for (int i = 0; i < ARRAY_SIZE(conn->channels); i++) {
+ if (conn->channels[i].call) {
+ call = conn->channels[i].call;
+ rx = rcu_dereference(call->socket);
+ if (!rx) {
+ call = NULL;
+ continue;
+ }
+
+ respond = true;
+ if (test_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags))
+ break;
+ call = NULL;
+ }
+ }
+
+ if (!respond) {
+ rcu_read_unlock();
+ rxrpc_put_connection(conn, rxrpc_conn_put_challenge_input);
+ sp->chall.conn = NULL;
+ return false;
+ }
+
+ if (call)
+ rxrpc_notify_socket_oob(call, skb);
+ rcu_read_unlock();
+
+ if (!call)
+ rxrpc_post_packet_to_conn(conn, skb);
+ return true;
+}
+
+/*
* Input a connection-level packet.
*/
bool rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb)
@@ -402,6 +478,16 @@ bool rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb)
return true;
case RXRPC_PACKET_TYPE_CHALLENGE:
+ rxrpc_see_skb(skb, rxrpc_skb_see_oob_challenge);
+ if (rxrpc_is_conn_aborted(conn)) {
+ if (conn->completion == RXRPC_CALL_LOCALLY_ABORTED)
+ rxrpc_send_conn_abort(conn);
+ return true;
+ }
+ if (!conn->security->validate_challenge(conn, skb))
+ return false;
+ return rxrpc_post_challenge(conn, skb);
+
case RXRPC_PACKET_TYPE_RESPONSE:
if (rxrpc_is_conn_aborted(conn)) {
if (conn->completion == RXRPC_CALL_LOCALLY_ABORTED)
@@ -427,17 +513,60 @@ void rxrpc_input_conn_event(struct rxrpc_connection *conn, struct sk_buff *skb)
if (test_and_clear_bit(RXRPC_CONN_EV_ABORT_CALLS, &conn->events))
rxrpc_abort_calls(conn);
- switch (skb->mark) {
- case RXRPC_SKB_MARK_SERVICE_CONN_SECURED:
- if (conn->state != RXRPC_CONN_SERVICE)
- break;
+ if (conn->tx_response) {
+ struct sk_buff *skb;
- for (loop = 0; loop < RXRPC_MAXCALLS; loop++)
- rxrpc_call_is_secure(conn->channels[loop].call);
- break;
+ spin_lock_irq(&conn->local->lock);
+ skb = conn->tx_response;
+ conn->tx_response = NULL;
+ spin_unlock_irq(&conn->local->lock);
+
+ if (conn->state != RXRPC_CONN_ABORTED)
+ rxrpc_send_response(conn, skb);
+ rxrpc_free_skb(skb, rxrpc_skb_put_response);
+ }
+
+ if (skb) {
+ switch (skb->mark) {
+ case RXRPC_SKB_MARK_SERVICE_CONN_SECURED:
+ if (conn->state != RXRPC_CONN_SERVICE)
+ break;
+
+ for (loop = 0; loop < RXRPC_MAXCALLS; loop++)
+ rxrpc_call_is_secure(conn->channels[loop].call);
+ break;
+ }
}
/* Process delayed ACKs whose time has come. */
if (conn->flags & RXRPC_CONN_FINAL_ACK_MASK)
rxrpc_process_delayed_final_acks(conn, false);
}
+
+/*
+ * Post a RESPONSE message to the I/O thread for transmission.
+ */
+void rxrpc_post_response(struct rxrpc_connection *conn, struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ struct rxrpc_local *local = conn->local;
+ struct sk_buff *old;
+
+ _enter("%x", sp->resp.challenge_serial);
+
+ spin_lock_irq(&local->lock);
+ old = conn->tx_response;
+ if (old) {
+ struct rxrpc_skb_priv *osp = rxrpc_skb(skb);
+
+ /* Always go with the response to the most recent challenge. */
+ if (after(sp->resp.challenge_serial, osp->resp.challenge_serial))
+ conn->tx_response = old;
+ else
+ old = skb;
+ } else {
+ conn->tx_response = skb;
+ }
+ spin_unlock_irq(&local->lock);
+ rxrpc_poke_conn(conn, rxrpc_conn_get_poke_response);
+}
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 0af4642aeec4..37340becb224 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -31,13 +31,13 @@ void rxrpc_poke_conn(struct rxrpc_connection *conn, enum rxrpc_conn_trace why)
if (WARN_ON_ONCE(!local))
return;
- spin_lock_bh(&local->lock);
+ spin_lock_irq(&local->lock);
busy = !list_empty(&conn->attend_link);
if (!busy) {
rxrpc_get_connection(conn, why);
list_add_tail(&conn->attend_link, &local->conn_attend_q);
}
- spin_unlock_bh(&local->lock);
+ spin_unlock_irq(&local->lock);
rxrpc_wake_up_io_thread(local);
}
@@ -67,11 +67,13 @@ struct rxrpc_connection *rxrpc_alloc_connection(struct rxrpc_net *rxnet,
INIT_WORK(&conn->destructor, rxrpc_clean_up_connection);
INIT_LIST_HEAD(&conn->proc_link);
INIT_LIST_HEAD(&conn->link);
+ INIT_LIST_HEAD(&conn->attend_link);
mutex_init(&conn->security_lock);
mutex_init(&conn->tx_data_alloc_lock);
skb_queue_head_init(&conn->rx_queue);
conn->rxnet = rxnet;
conn->security = &rxrpc_no_security;
+ rwlock_init(&conn->security_use_lock);
spin_lock_init(&conn->state_lock);
conn->debug_id = atomic_inc_return(&rxrpc_debug_id);
conn->idle_timestamp = jiffies;
@@ -119,18 +121,13 @@ struct rxrpc_connection *rxrpc_find_client_connection_rcu(struct rxrpc_local *lo
switch (srx->transport.family) {
case AF_INET:
if (peer->srx.transport.sin.sin_port !=
- srx->transport.sin.sin_port ||
- peer->srx.transport.sin.sin_addr.s_addr !=
- srx->transport.sin.sin_addr.s_addr)
+ srx->transport.sin.sin_port)
goto not_found;
break;
#ifdef CONFIG_AF_RXRPC_IPV6
case AF_INET6:
if (peer->srx.transport.sin6.sin6_port !=
- srx->transport.sin6.sin6_port ||
- memcmp(&peer->srx.transport.sin6.sin6_addr,
- &srx->transport.sin6.sin6_addr,
- sizeof(struct in6_addr)) != 0)
+ srx->transport.sin6.sin6_port)
goto not_found;
break;
#endif
@@ -201,9 +198,9 @@ void rxrpc_disconnect_call(struct rxrpc_call *call)
call->peer->cong_ssthresh = call->cong_ssthresh;
if (!hlist_unhashed(&call->error_link)) {
- spin_lock(&call->peer->lock);
+ spin_lock_irq(&call->peer->lock);
hlist_del_init(&call->error_link);
- spin_unlock(&call->peer->lock);
+ spin_unlock_irq(&call->peer->lock);
}
if (rxrpc_is_client_call(call)) {
@@ -318,15 +315,22 @@ static void rxrpc_clean_up_connection(struct work_struct *work)
!conn->channels[3].call);
ASSERT(list_empty(&conn->cache_link));
- del_timer_sync(&conn->timer);
+ timer_delete_sync(&conn->timer);
cancel_work_sync(&conn->processor); /* Processing may restart the timer */
- del_timer_sync(&conn->timer);
+ timer_delete_sync(&conn->timer);
write_lock(&rxnet->conn_lock);
list_del_init(&conn->proc_link);
write_unlock(&rxnet->conn_lock);
+ if (conn->pmtud_probe) {
+ trace_rxrpc_pmtud_lost(conn, 0);
+ conn->peer->pmtud_probing = false;
+ conn->peer->pmtud_pending = true;
+ }
+
rxrpc_purge_queue(&conn->rx_queue);
+ rxrpc_free_skb(conn->tx_response, rxrpc_skb_put_response);
rxrpc_kill_client_conn(conn);
@@ -342,9 +346,7 @@ static void rxrpc_clean_up_connection(struct work_struct *work)
*/
rxrpc_purge_queue(&conn->rx_queue);
- if (conn->tx_data_alloc.va)
- __page_frag_cache_drain(virt_to_page(conn->tx_data_alloc.va),
- conn->tx_data_alloc.pagecnt_bias);
+ page_frag_cache_drain(&conn->tx_data_alloc);
call_rcu(&conn->rcu, rxrpc_rcu_free_connection);
}
@@ -365,7 +367,7 @@ void rxrpc_put_connection(struct rxrpc_connection *conn,
dead = __refcount_dec_and_test(&conn->ref, &r);
trace_rxrpc_conn(debug_id, r - 1, why);
if (dead) {
- del_timer(&conn->timer);
+ timer_delete(&conn->timer);
cancel_work(&conn->processor);
if (in_softirq() || work_busy(&conn->processor) ||
@@ -470,7 +472,7 @@ void rxrpc_destroy_all_connections(struct rxrpc_net *rxnet)
atomic_dec(&rxnet->nr_conns);
- del_timer_sync(&rxnet->service_conn_reap_timer);
+ timer_delete_sync(&rxnet->service_conn_reap_timer);
rxrpc_queue_work(&rxnet->service_conn_reaper);
flush_workqueue(rxrpc_workqueue);
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 3dedb8c0618c..24aceb183c2c 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -9,6 +9,17 @@
#include "ar-internal.h"
+/* Override priority when generating ACKs for received DATA */
+static const u8 rxrpc_ack_priority[RXRPC_ACK__INVALID] = {
+ [RXRPC_ACK_IDLE] = 1,
+ [RXRPC_ACK_DELAY] = 2,
+ [RXRPC_ACK_REQUESTED] = 3,
+ [RXRPC_ACK_DUPLICATE] = 4,
+ [RXRPC_ACK_EXCEEDS_WINDOW] = 5,
+ [RXRPC_ACK_NOSPACE] = 6,
+ [RXRPC_ACK_OUT_OF_SEQUENCE] = 7,
+};
+
static void rxrpc_proto_abort(struct rxrpc_call *call, rxrpc_seq_t seq,
enum rxrpc_abort_reason why)
{
@@ -16,80 +27,68 @@ static void rxrpc_proto_abort(struct rxrpc_call *call, rxrpc_seq_t seq,
}
/*
- * Do TCP-style congestion management [RFC 5681].
+ * Do TCP-style congestion management [RFC5681].
*/
static void rxrpc_congestion_management(struct rxrpc_call *call,
- struct sk_buff *skb,
- struct rxrpc_ack_summary *summary,
- rxrpc_serial_t acked_serial)
+ struct rxrpc_ack_summary *summary)
{
- enum rxrpc_congest_change change = rxrpc_cong_no_change;
- unsigned int cumulative_acks = call->cong_cumul_acks;
- unsigned int cwnd = call->cong_cwnd;
- bool resend = false;
-
- summary->flight_size =
- (call->tx_top - call->acks_hard_ack) - summary->nr_acks;
+ summary->change = rxrpc_cong_no_change;
+ summary->in_flight = rxrpc_tx_in_flight(call);
if (test_and_clear_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags)) {
summary->retrans_timeo = true;
- call->cong_ssthresh = max_t(unsigned int,
- summary->flight_size / 2, 2);
- cwnd = 1;
- if (cwnd >= call->cong_ssthresh &&
- call->cong_mode == RXRPC_CALL_SLOW_START) {
- call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE;
- call->cong_tstamp = skb->tstamp;
- cumulative_acks = 0;
+ call->cong_ssthresh = umax(summary->in_flight / 2, 2);
+ call->cong_cwnd = 1;
+ if (call->cong_cwnd >= call->cong_ssthresh &&
+ call->cong_ca_state == RXRPC_CA_SLOW_START) {
+ call->cong_ca_state = RXRPC_CA_CONGEST_AVOIDANCE;
+ call->cong_tstamp = call->acks_latest_ts;
+ call->cong_cumul_acks = 0;
}
}
- cumulative_acks += summary->nr_new_acks;
- if (cumulative_acks > 255)
- cumulative_acks = 255;
+ call->cong_cumul_acks += summary->nr_new_sacks;
+ call->cong_cumul_acks += summary->nr_new_hacks;
+ if (call->cong_cumul_acks > 255)
+ call->cong_cumul_acks = 255;
- summary->cwnd = call->cong_cwnd;
- summary->ssthresh = call->cong_ssthresh;
- summary->cumulative_acks = cumulative_acks;
- summary->dup_acks = call->cong_dup_acks;
-
- switch (call->cong_mode) {
- case RXRPC_CALL_SLOW_START:
- if (summary->saw_nacks)
+ switch (call->cong_ca_state) {
+ case RXRPC_CA_SLOW_START:
+ if (call->acks_nr_snacks > 0)
goto packet_loss_detected;
- if (summary->cumulative_acks > 0)
- cwnd += 1;
- if (cwnd >= call->cong_ssthresh) {
- call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE;
- call->cong_tstamp = skb->tstamp;
+ if (call->cong_cumul_acks > 0)
+ call->cong_cwnd += 1;
+ if (call->cong_cwnd >= call->cong_ssthresh) {
+ call->cong_ca_state = RXRPC_CA_CONGEST_AVOIDANCE;
+ call->cong_tstamp = call->acks_latest_ts;
}
goto out;
- case RXRPC_CALL_CONGEST_AVOIDANCE:
- if (summary->saw_nacks)
+ case RXRPC_CA_CONGEST_AVOIDANCE:
+ if (call->acks_nr_snacks > 0)
goto packet_loss_detected;
/* We analyse the number of packets that get ACK'd per RTT
* period and increase the window if we managed to fill it.
*/
- if (call->peer->rtt_count == 0)
+ if (call->rtt_count == 0)
goto out;
- if (ktime_before(skb->tstamp,
+ if (ktime_before(call->acks_latest_ts,
ktime_add_us(call->cong_tstamp,
- call->peer->srtt_us >> 3)))
+ call->srtt_us >> 3)))
goto out_no_clear_ca;
- change = rxrpc_cong_rtt_window_end;
- call->cong_tstamp = skb->tstamp;
- if (cumulative_acks >= cwnd)
- cwnd++;
+ summary->change = rxrpc_cong_rtt_window_end;
+ call->cong_tstamp = call->acks_latest_ts;
+ if (call->cong_cumul_acks >= call->cong_cwnd)
+ call->cong_cwnd++;
goto out;
- case RXRPC_CALL_PACKET_LOSS:
- if (!summary->saw_nacks)
+ case RXRPC_CA_PACKET_LOSS:
+ if (call->acks_nr_snacks == 0)
goto resume_normality;
- if (summary->new_low_nack) {
- change = rxrpc_cong_new_low_nack;
+ if (summary->new_low_snack) {
+ summary->change = rxrpc_cong_new_low_nack;
call->cong_dup_acks = 1;
if (call->cong_extra > 1)
call->cong_extra = 1;
@@ -100,31 +99,35 @@ static void rxrpc_congestion_management(struct rxrpc_call *call,
if (call->cong_dup_acks < 3)
goto send_extra_data;
- change = rxrpc_cong_begin_retransmission;
- call->cong_mode = RXRPC_CALL_FAST_RETRANSMIT;
- call->cong_ssthresh = max_t(unsigned int,
- summary->flight_size / 2, 2);
- cwnd = call->cong_ssthresh + 3;
+ summary->change = rxrpc_cong_begin_retransmission;
+ call->cong_ca_state = RXRPC_CA_FAST_RETRANSMIT;
+ call->cong_ssthresh = umax(summary->in_flight / 2, 2);
+ call->cong_cwnd = call->cong_ssthresh + 3;
call->cong_extra = 0;
call->cong_dup_acks = 0;
- resend = true;
+ summary->need_retransmit = true;
+ summary->in_fast_or_rto_recovery = true;
goto out;
- case RXRPC_CALL_FAST_RETRANSMIT:
- if (!summary->new_low_nack) {
- if (summary->nr_new_acks == 0)
- cwnd += 1;
+ case RXRPC_CA_FAST_RETRANSMIT:
+ rxrpc_tlp_init(call);
+ summary->in_fast_or_rto_recovery = true;
+ if (!summary->new_low_snack) {
+ if (summary->nr_new_sacks == 0)
+ call->cong_cwnd += 1;
call->cong_dup_acks++;
if (call->cong_dup_acks == 2) {
- change = rxrpc_cong_retransmit_again;
+ summary->change = rxrpc_cong_retransmit_again;
call->cong_dup_acks = 0;
- resend = true;
+ summary->need_retransmit = true;
}
} else {
- change = rxrpc_cong_progress;
- cwnd = call->cong_ssthresh;
- if (!summary->saw_nacks)
+ summary->change = rxrpc_cong_progress;
+ call->cong_cwnd = call->cong_ssthresh;
+ if (call->acks_nr_snacks == 0) {
+ summary->exiting_fast_or_rto_recovery = true;
goto resume_normality;
+ }
}
goto out;
@@ -134,30 +137,25 @@ static void rxrpc_congestion_management(struct rxrpc_call *call,
}
resume_normality:
- change = rxrpc_cong_cleared_nacks;
+ summary->change = rxrpc_cong_cleared_nacks;
call->cong_dup_acks = 0;
call->cong_extra = 0;
- call->cong_tstamp = skb->tstamp;
- if (cwnd < call->cong_ssthresh)
- call->cong_mode = RXRPC_CALL_SLOW_START;
+ call->cong_tstamp = call->acks_latest_ts;
+ if (call->cong_cwnd < call->cong_ssthresh)
+ call->cong_ca_state = RXRPC_CA_SLOW_START;
else
- call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE;
+ call->cong_ca_state = RXRPC_CA_CONGEST_AVOIDANCE;
out:
- cumulative_acks = 0;
+ call->cong_cumul_acks = 0;
out_no_clear_ca:
- if (cwnd >= RXRPC_TX_MAX_WINDOW)
- cwnd = RXRPC_TX_MAX_WINDOW;
- call->cong_cwnd = cwnd;
- call->cong_cumul_acks = cumulative_acks;
- summary->mode = call->cong_mode;
- trace_rxrpc_congest(call, summary, acked_serial, change);
- if (resend)
- rxrpc_resend(call, skb);
+ if (call->cong_cwnd >= RXRPC_TX_MAX_WINDOW)
+ call->cong_cwnd = RXRPC_TX_MAX_WINDOW;
+ trace_rxrpc_congest(call, summary);
return;
packet_loss_detected:
- change = rxrpc_cong_saw_nack;
- call->cong_mode = RXRPC_CALL_PACKET_LOSS;
+ summary->change = rxrpc_cong_saw_nack;
+ call->cong_ca_state = RXRPC_CA_PACKET_LOSS;
call->cong_dup_acks = 0;
goto send_extra_data;
@@ -166,7 +164,7 @@ send_extra_data:
* state.
*/
if (test_bit(RXRPC_CALL_TX_LAST, &call->flags) ||
- summary->nr_acks != call->tx_top - call->acks_hard_ack) {
+ call->acks_nr_sacks != call->tx_top - call->tx_bottom) {
call->cong_extra++;
wake_up(&call->waitq);
}
@@ -178,26 +176,42 @@ send_extra_data:
*/
void rxrpc_congestion_degrade(struct rxrpc_call *call)
{
- ktime_t rtt, now;
+ ktime_t rtt, now, time_since;
- if (call->cong_mode != RXRPC_CALL_SLOW_START &&
- call->cong_mode != RXRPC_CALL_CONGEST_AVOIDANCE)
+ if (call->cong_ca_state != RXRPC_CA_SLOW_START &&
+ call->cong_ca_state != RXRPC_CA_CONGEST_AVOIDANCE)
return;
if (__rxrpc_call_state(call) == RXRPC_CALL_CLIENT_AWAIT_REPLY)
return;
- rtt = ns_to_ktime(call->peer->srtt_us * (1000 / 8));
+ rtt = ns_to_ktime(call->srtt_us * (NSEC_PER_USEC / 8));
now = ktime_get_real();
- if (!ktime_before(ktime_add(call->tx_last_sent, rtt), now))
+ time_since = ktime_sub(now, call->tx_last_sent);
+ if (ktime_before(time_since, rtt))
return;
- trace_rxrpc_reset_cwnd(call, now);
+ trace_rxrpc_reset_cwnd(call, time_since, rtt);
rxrpc_inc_stat(call->rxnet, stat_tx_data_cwnd_reset);
call->tx_last_sent = now;
- call->cong_mode = RXRPC_CALL_SLOW_START;
- call->cong_ssthresh = max_t(unsigned int, call->cong_ssthresh,
- call->cong_cwnd * 3 / 4);
- call->cong_cwnd = max_t(unsigned int, call->cong_cwnd / 2, RXRPC_MIN_CWND);
+ call->cong_ca_state = RXRPC_CA_SLOW_START;
+ call->cong_ssthresh = umax(call->cong_ssthresh, call->cong_cwnd * 3 / 4);
+ call->cong_cwnd = umax(call->cong_cwnd / 2, RXRPC_MIN_CWND);
+}
+
+/*
+ * Add an RTT sample derived from an ACK'd DATA packet.
+ */
+static void rxrpc_add_data_rtt_sample(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary,
+ struct rxrpc_txqueue *tq,
+ int ix)
+{
+ ktime_t xmit_ts = ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[ix]);
+
+ rxrpc_call_add_rtt(call, rxrpc_rtt_rx_data_ack, -1,
+ summary->acked_serial, summary->ack_serial,
+ xmit_ts, call->acks_latest_ts);
+ __clear_bit(ix, &tq->rtt_samples); /* Prevent repeat RTT sample */
}
/*
@@ -206,37 +220,120 @@ void rxrpc_congestion_degrade(struct rxrpc_call *call)
static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to,
struct rxrpc_ack_summary *summary)
{
- struct rxrpc_txbuf *txb;
- bool rot_last = false;
+ struct rxrpc_txqueue *tq = call->tx_queue;
+ rxrpc_seq_t seq = call->tx_bottom + 1;
+ bool rot_last = false, trace = false;
- list_for_each_entry_rcu(txb, &call->tx_buffer, call_link, false) {
- if (before_eq(txb->seq, call->acks_hard_ack))
- continue;
- if (txb->flags & RXRPC_LAST_PACKET) {
+ _enter("%x,%x", call->tx_bottom, to);
+
+ trace_rxrpc_tx_rotate(call, seq, to);
+ trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate);
+
+ if (call->acks_lowest_nak == call->tx_bottom) {
+ call->acks_lowest_nak = to;
+ } else if (after(to, call->acks_lowest_nak)) {
+ summary->new_low_snack = true;
+ call->acks_lowest_nak = to;
+ }
+
+ /* We may have a left over fully-consumed buffer at the front that we
+ * couldn't drop before (rotate_and_keep below).
+ */
+ if (seq == call->tx_qbase + RXRPC_NR_TXQUEUE) {
+ call->tx_qbase += RXRPC_NR_TXQUEUE;
+ call->tx_queue = tq->next;
+ trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate_and_free);
+ kfree(tq);
+ tq = call->tx_queue;
+ }
+
+ do {
+ unsigned int ix = seq - call->tx_qbase;
+
+ _debug("tq=%x seq=%x i=%d f=%x", tq->qbase, seq, ix, tq->bufs[ix]->flags);
+ if (tq->bufs[ix]->flags & RXRPC_LAST_PACKET) {
set_bit(RXRPC_CALL_TX_LAST, &call->flags);
rot_last = true;
}
- if (txb->seq == to)
- break;
- }
- if (rot_last)
- set_bit(RXRPC_CALL_TX_ALL_ACKED, &call->flags);
+ if (summary->acked_serial == tq->segment_serial[ix] &&
+ test_bit(ix, &tq->rtt_samples))
+ rxrpc_add_data_rtt_sample(call, summary, tq, ix);
+
+ if (ix == tq->nr_reported_acks) {
+ /* Packet directly hard ACK'd. */
+ tq->nr_reported_acks++;
+ rxrpc_input_rack_one(call, summary, tq, ix);
+ if (seq == call->tlp_seq)
+ summary->tlp_probe_acked = true;
+ summary->nr_new_hacks++;
+ __set_bit(ix, &tq->segment_acked);
+ trace_rxrpc_rotate(call, tq, summary, seq, rxrpc_rotate_trace_hack);
+ } else if (test_bit(ix, &tq->segment_acked)) {
+ /* Soft ACK -> hard ACK. */
+ call->acks_nr_sacks--;
+ trace_rxrpc_rotate(call, tq, summary, seq, rxrpc_rotate_trace_sack);
+ } else {
+ /* Soft NAK -> hard ACK. */
+ call->acks_nr_snacks--;
+ rxrpc_input_rack_one(call, summary, tq, ix);
+ if (seq == call->tlp_seq)
+ summary->tlp_probe_acked = true;
+ summary->nr_new_hacks++;
+ __set_bit(ix, &tq->segment_acked);
+ trace_rxrpc_rotate(call, tq, summary, seq, rxrpc_rotate_trace_snak);
+ }
- _enter("%x,%x,%x,%d", to, call->acks_hard_ack, call->tx_top, rot_last);
+ call->tx_nr_sent--;
+ if (__test_and_clear_bit(ix, &tq->segment_lost))
+ call->tx_nr_lost--;
+ if (__test_and_clear_bit(ix, &tq->segment_retransmitted))
+ call->tx_nr_resent--;
+ __clear_bit(ix, &tq->ever_retransmitted);
- if (call->acks_lowest_nak == call->acks_hard_ack) {
- call->acks_lowest_nak = to;
- } else if (after(to, call->acks_lowest_nak)) {
- summary->new_low_nack = true;
- call->acks_lowest_nak = to;
+ rxrpc_put_txbuf(tq->bufs[ix], rxrpc_txbuf_put_rotated);
+ tq->bufs[ix] = NULL;
+
+ WRITE_ONCE(call->tx_bottom, seq);
+ trace_rxrpc_txqueue(call, (rot_last ?
+ rxrpc_txqueue_rotate_last :
+ rxrpc_txqueue_rotate));
+
+ seq++;
+ trace = true;
+ if (!(seq & RXRPC_TXQ_MASK)) {
+ trace_rxrpc_rack_update(call, summary);
+ trace = false;
+ prefetch(tq->next);
+ if (tq != call->tx_qtail) {
+ call->tx_qbase += RXRPC_NR_TXQUEUE;
+ call->tx_queue = tq->next;
+ trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate_and_free);
+ kfree(tq);
+ tq = call->tx_queue;
+ } else {
+ trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate_and_keep);
+ tq = NULL;
+ break;
+ }
+ }
+
+ } while (before_eq(seq, to));
+
+ if (trace)
+ trace_rxrpc_rack_update(call, summary);
+
+ if (rot_last) {
+ set_bit(RXRPC_CALL_TX_ALL_ACKED, &call->flags);
+ if (tq) {
+ trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate_and_free);
+ kfree(tq);
+ call->tx_queue = NULL;
+ }
}
- smp_store_release(&call->acks_hard_ack, to);
+ _debug("%x,%x,%x,%d", to, call->tx_bottom, call->tx_top, rot_last);
- trace_rxrpc_txqueue(call, (rot_last ?
- rxrpc_txqueue_rotate_last :
- rxrpc_txqueue_rotate));
wake_up(&call->waitq);
return rot_last;
}
@@ -252,13 +349,10 @@ static void rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun,
{
ASSERT(test_bit(RXRPC_CALL_TX_LAST, &call->flags));
- call->resend_at = KTIME_MAX;
- trace_rxrpc_timer_can(call, rxrpc_timer_trace_resend);
-
- if (unlikely(call->cong_last_nack)) {
- rxrpc_free_skb(call->cong_last_nack, rxrpc_skb_put_last_nack);
- call->cong_last_nack = NULL;
- }
+ call->rack_timer_mode = RXRPC_CALL_RACKTIMER_OFF;
+ call->rack_timo_at = KTIME_MAX;
+ trace_rxrpc_rack_timer(call, 0, false);
+ trace_rxrpc_timer_can(call, rxrpc_timer_trace_rack_off + call->rack_timer_mode);
switch (__rxrpc_call_state(call)) {
case RXRPC_CALL_CLIENT_SEND_REQUEST:
@@ -354,18 +448,26 @@ static void rxrpc_input_queue_data(struct rxrpc_call *call, struct sk_buff *skb,
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
bool last = sp->hdr.flags & RXRPC_LAST_PACKET;
+ spin_lock_irq(&call->recvmsg_queue.lock);
+
__skb_queue_tail(&call->recvmsg_queue, skb);
rxrpc_input_update_ack_window(call, window, wtop);
trace_rxrpc_receive(call, last ? why + 1 : why, sp->hdr.serial, sp->hdr.seq);
if (last)
+ /* Change the state inside the lock so that recvmsg syncs
+ * correctly with it and using sendmsg() to send a reply
+ * doesn't race.
+ */
rxrpc_end_rx_phase(call, sp->hdr.serial);
+
+ spin_unlock_irq(&call->recvmsg_queue.lock);
}
/*
* Process a DATA packet.
*/
static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb,
- bool *_notify)
+ bool *_notify, rxrpc_serial_t *_ack_serial, int *_ack_reason)
{
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
struct sk_buff *oos;
@@ -418,8 +520,6 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb,
/* Send an immediate ACK if we fill in a hole */
else if (!skb_queue_empty(&call->rx_oos_queue))
ack_reason = RXRPC_ACK_DELAY;
- else
- call->ackr_nr_unacked++;
window++;
if (after(window, wtop)) {
@@ -433,7 +533,6 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb,
rxrpc_get_skb(skb, rxrpc_skb_get_to_recvmsg);
- spin_lock(&call->recvmsg_queue.lock);
rxrpc_input_queue_data(call, skb, window, wtop, rxrpc_receive_queue);
*_notify = true;
@@ -455,8 +554,6 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb,
rxrpc_receive_queue_oos);
}
- spin_unlock(&call->recvmsg_queue.lock);
-
call->ackr_sack_base = sack;
} else {
unsigned int slot;
@@ -497,12 +594,16 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb,
}
send_ack:
- if (ack_reason >= 0)
- rxrpc_send_ACK(call, ack_reason, serial,
- rxrpc_propose_ack_input_data);
- else
- rxrpc_propose_delay_ACK(call, serial,
- rxrpc_propose_ack_input_data);
+ if (ack_reason >= 0) {
+ if (rxrpc_ack_priority[ack_reason] > rxrpc_ack_priority[*_ack_reason]) {
+ *_ack_serial = serial;
+ *_ack_reason = ack_reason;
+ } else if (rxrpc_ack_priority[ack_reason] == rxrpc_ack_priority[*_ack_reason] &&
+ ack_reason == RXRPC_ACK_REQUESTED) {
+ *_ack_serial = serial;
+ *_ack_reason = ack_reason;
+ }
+ }
}
/*
@@ -513,9 +614,11 @@ static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb
struct rxrpc_jumbo_header jhdr;
struct rxrpc_skb_priv *sp = rxrpc_skb(skb), *jsp;
struct sk_buff *jskb;
+ rxrpc_serial_t ack_serial = 0;
unsigned int offset = sizeof(struct rxrpc_wire_header);
unsigned int len = skb->len - offset;
bool notify = false;
+ int ack_reason = 0, count = 1, stat_ix;
while (sp->hdr.flags & RXRPC_JUMBO_PACKET) {
if (len < RXRPC_JUMBO_SUBPKTLEN)
@@ -535,7 +638,7 @@ static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb
jsp = rxrpc_skb(jskb);
jsp->offset = offset;
jsp->len = RXRPC_JUMBO_DATALEN;
- rxrpc_input_data_one(call, jskb, &notify);
+ rxrpc_input_data_one(call, jskb, &notify, &ack_serial, &ack_reason);
rxrpc_free_skb(jskb, rxrpc_skb_put_jumbo_subpacket);
sp->hdr.flags = jhdr.flags;
@@ -544,12 +647,25 @@ static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb
sp->hdr.serial++;
offset += RXRPC_JUMBO_SUBPKTLEN;
len -= RXRPC_JUMBO_SUBPKTLEN;
+ count++;
}
sp->offset = offset;
sp->len = len;
- rxrpc_input_data_one(call, skb, &notify);
- if (notify) {
+ rxrpc_input_data_one(call, skb, &notify, &ack_serial, &ack_reason);
+
+ stat_ix = umin(count, ARRAY_SIZE(call->rxnet->stat_rx_jumbo)) - 1;
+ atomic_inc(&call->rxnet->stat_rx_jumbo[stat_ix]);
+
+ if (ack_reason > 0) {
+ rxrpc_send_ACK(call, ack_reason, ack_serial,
+ rxrpc_propose_ack_input_data);
+ } else {
+ call->ackr_nr_unacked++;
+ rxrpc_propose_delay_ACK(call, sp->hdr.serial,
+ rxrpc_propose_ack_input_data);
+ }
+ if (notify && !test_bit(RXRPC_CALL_CONN_CHALLENGING, &call->flags)) {
trace_rxrpc_notify_socket(call->debug_id, sp->hdr.serial);
rxrpc_notify_socket(call);
}
@@ -643,7 +759,7 @@ static void rxrpc_complete_rtt_probe(struct rxrpc_call *call,
clear_bit(i + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail);
smp_mb(); /* Read data before setting avail bit */
set_bit(i, &call->rtt_avail);
- rxrpc_peer_add_rtt(call, type, i, acked_serial, ack_serial,
+ rxrpc_call_add_rtt(call, type, i, acked_serial, ack_serial,
sent_at, resp_time);
matched = true;
}
@@ -653,7 +769,7 @@ static void rxrpc_complete_rtt_probe(struct rxrpc_call *call,
*/
if (after(acked_serial, orig_serial)) {
trace_rxrpc_rtt_rx(call, rxrpc_rtt_rx_obsolete, i,
- orig_serial, acked_serial, 0, 0);
+ orig_serial, acked_serial, 0, 0, 0);
clear_bit(i + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail);
smp_wmb();
set_bit(i, &call->rtt_avail);
@@ -661,7 +777,7 @@ static void rxrpc_complete_rtt_probe(struct rxrpc_call *call,
}
if (!matched)
- trace_rxrpc_rtt_rx(call, rxrpc_rtt_rx_lost, 9, 0, acked_serial, 0, 0);
+ trace_rxrpc_rtt_rx(call, rxrpc_rtt_rx_lost, 9, 0, acked_serial, 0, 0, 0);
}
/*
@@ -671,10 +787,13 @@ static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb
struct rxrpc_acktrailer *trailer)
{
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- struct rxrpc_peer *peer;
- unsigned int mtu;
+ struct rxrpc_peer *peer = call->peer;
+ unsigned int max_data, capacity;
bool wake = false;
- u32 rwind = ntohl(trailer->rwind);
+ u32 max_mtu = ntohl(trailer->maxMTU);
+ //u32 if_mtu = ntohl(trailer->ifMTU);
+ u32 rwind = ntohl(trailer->rwind);
+ u32 jumbo_max = ntohl(trailer->jumbo_max);
if (rwind > RXRPC_TX_MAX_WINDOW)
rwind = RXRPC_TX_MAX_WINDOW;
@@ -685,57 +804,147 @@ static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb
call->tx_winsize = rwind;
}
- if (call->cong_ssthresh > rwind)
- call->cong_ssthresh = rwind;
+ max_mtu = clamp(max_mtu, 500, 65535);
+ peer->ackr_max_data = max_mtu;
+
+ if (max_mtu < peer->max_data) {
+ trace_rxrpc_pmtud_reduce(peer, sp->hdr.serial, max_mtu,
+ rxrpc_pmtud_reduce_ack);
+ peer->max_data = max_mtu;
+ }
- mtu = min(ntohl(trailer->maxMTU), ntohl(trailer->ifMTU));
+ max_data = umin(max_mtu, peer->max_data);
+ capacity = max_data;
+ capacity += sizeof(struct rxrpc_jumbo_header); /* First subpacket has main hdr, not jumbo */
+ capacity /= sizeof(struct rxrpc_jumbo_header) + RXRPC_JUMBO_DATALEN;
- peer = call->peer;
- if (mtu < peer->maxdata) {
- spin_lock(&peer->lock);
- peer->maxdata = mtu;
- peer->mtu = mtu + peer->hdrsize;
- spin_unlock(&peer->lock);
+ if (jumbo_max == 0) {
+ /* The peer says it supports pmtu discovery */
+ peer->ackr_adv_pmtud = true;
+ } else {
+ peer->ackr_adv_pmtud = false;
+ capacity = clamp(capacity, 1, jumbo_max);
}
+ call->tx_jumbo_max = capacity;
+
if (wake)
wake_up(&call->waitq);
}
+#if defined(CONFIG_X86) && __GNUC__ && !defined(__clang__)
+/* Clang doesn't support the %z constraint modifier */
+#define shiftr_adv_rotr(shift_from, rotate_into) ({ \
+ asm(" shr%z1 %1\n" \
+ " inc %0\n" \
+ " rcr%z2 %2\n" \
+ : "+d"(shift_from), "+m"(*(shift_from)), "+rm"(rotate_into) \
+ ); \
+ })
+#else
+#define shiftr_adv_rotr(shift_from, rotate_into) ({ \
+ typeof(rotate_into) __bit0 = *(shift_from) & 1; \
+ *(shift_from) >>= 1; \
+ shift_from++; \
+ rotate_into >>= 1; \
+ rotate_into |= __bit0 << (sizeof(rotate_into) * 8 - 1); \
+ })
+#endif
+
/*
- * Determine how many nacks from the previous ACK have now been satisfied.
+ * Deal with RTT samples from soft ACKs.
*/
-static rxrpc_seq_t rxrpc_input_check_prev_ack(struct rxrpc_call *call,
- struct rxrpc_ack_summary *summary,
- rxrpc_seq_t seq)
+static void rxrpc_input_soft_rtt(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary,
+ struct rxrpc_txqueue *tq)
{
- struct sk_buff *skb = call->cong_last_nack;
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- unsigned int i, new_acks = 0, retained_nacks = 0;
- rxrpc_seq_t old_seq = sp->ack.first_ack;
- u8 *acks = skb->data + sizeof(struct rxrpc_wire_header) + sizeof(struct rxrpc_ackpacket);
+ for (int ix = 0; ix < RXRPC_NR_TXQUEUE; ix++)
+ if (summary->acked_serial == tq->segment_serial[ix])
+ return rxrpc_add_data_rtt_sample(call, summary, tq, ix);
+}
- if (after_eq(seq, old_seq + sp->ack.nr_acks)) {
- summary->nr_new_acks += sp->ack.nr_nacks;
- summary->nr_new_acks += seq - (old_seq + sp->ack.nr_acks);
- summary->nr_retained_nacks = 0;
- } else if (seq == old_seq) {
- summary->nr_retained_nacks = sp->ack.nr_nacks;
- } else {
- for (i = 0; i < sp->ack.nr_acks; i++) {
- if (acks[i] == RXRPC_ACK_TYPE_NACK) {
- if (before(old_seq + i, seq))
- new_acks++;
- else
- retained_nacks++;
- }
+/*
+ * Process a batch of soft ACKs specific to a transmission queue segment.
+ */
+static void rxrpc_input_soft_ack_tq(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary,
+ struct rxrpc_txqueue *tq,
+ unsigned long extracted_acks,
+ int nr_reported,
+ rxrpc_seq_t seq,
+ rxrpc_seq_t *lowest_nak)
+{
+ unsigned long old_reported = 0, flipped, new_acks = 0;
+ unsigned long a_to_n, n_to_a = 0;
+ int new, a, n;
+
+ if (tq->nr_reported_acks > 0)
+ old_reported = ~0UL >> (RXRPC_NR_TXQUEUE - tq->nr_reported_acks);
+
+ _enter("{%x,%lx,%d},%lx,%d,%x",
+ tq->qbase, tq->segment_acked, tq->nr_reported_acks,
+ extracted_acks, nr_reported, seq);
+
+ _debug("[%x]", tq->qbase);
+ _debug("tq %16lx %u", tq->segment_acked, tq->nr_reported_acks);
+ _debug("sack %16lx %u", extracted_acks, nr_reported);
+
+ /* See how many previously logged ACKs/NAKs have flipped. */
+ flipped = (tq->segment_acked ^ extracted_acks) & old_reported;
+ if (flipped) {
+ n_to_a = ~tq->segment_acked & flipped; /* Old NAK -> ACK */
+ a_to_n = tq->segment_acked & flipped; /* Old ACK -> NAK */
+ a = hweight_long(n_to_a);
+ n = hweight_long(a_to_n);
+ _debug("flip %16lx", flipped);
+ _debug("ntoa %16lx %d", n_to_a, a);
+ _debug("aton %16lx %d", a_to_n, n);
+ call->acks_nr_sacks += a - n;
+ call->acks_nr_snacks += n - a;
+ summary->nr_new_sacks += a;
+ summary->nr_new_snacks += n;
+ }
+
+ /* See how many new ACKs/NAKs have been acquired. */
+ new = nr_reported - tq->nr_reported_acks;
+ if (new > 0) {
+ new_acks = extracted_acks & ~old_reported;
+ if (new_acks) {
+ a = hweight_long(new_acks);
+ n = new - a;
+ _debug("new_a %16lx new=%d a=%d n=%d", new_acks, new, a, n);
+ call->acks_nr_sacks += a;
+ call->acks_nr_snacks += n;
+ summary->nr_new_sacks += a;
+ summary->nr_new_snacks += n;
+ } else {
+ call->acks_nr_snacks += new;
+ summary->nr_new_snacks += new;
}
+ }
+
+ tq->nr_reported_acks = nr_reported;
+ tq->segment_acked = extracted_acks;
+ trace_rxrpc_apply_acks(call, tq);
- summary->nr_new_acks += new_acks;
- summary->nr_retained_nacks = retained_nacks;
+ if (extracted_acks != ~0UL) {
+ rxrpc_seq_t lowest = seq + ffz(extracted_acks);
+
+ if (before(lowest, *lowest_nak))
+ *lowest_nak = lowest;
}
- return old_seq + sp->ack.nr_acks;
+ if (summary->acked_serial)
+ rxrpc_input_soft_rtt(call, summary, tq);
+
+ new_acks |= n_to_a;
+ if (new_acks)
+ rxrpc_input_rack(call, summary, tq, new_acks);
+
+ if (call->tlp_serial &&
+ rxrpc_seq_in_txq(tq, call->tlp_seq) &&
+ test_bit(call->tlp_seq - tq->qbase, &new_acks))
+ summary->tlp_probe_acked = true;
}
/*
@@ -749,39 +958,50 @@ static rxrpc_seq_t rxrpc_input_check_prev_ack(struct rxrpc_call *call,
*/
static void rxrpc_input_soft_acks(struct rxrpc_call *call,
struct rxrpc_ack_summary *summary,
- struct sk_buff *skb,
- rxrpc_seq_t seq,
- rxrpc_seq_t since)
+ struct sk_buff *skb)
{
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- unsigned int i, old_nacks = 0;
+ struct rxrpc_txqueue *tq = call->tx_queue;
+ unsigned long extracted = ~0UL;
+ unsigned int nr = 0;
+ rxrpc_seq_t seq = call->acks_hard_ack + 1;
rxrpc_seq_t lowest_nak = seq + sp->ack.nr_acks;
u8 *acks = skb->data + sizeof(struct rxrpc_wire_header) + sizeof(struct rxrpc_ackpacket);
- for (i = 0; i < sp->ack.nr_acks; i++) {
- if (acks[i] == RXRPC_ACK_TYPE_ACK) {
- summary->nr_acks++;
- if (after_eq(seq, since))
- summary->nr_new_acks++;
- } else {
- summary->saw_nacks = true;
- if (before(seq, since)) {
- /* Overlap with previous ACK */
- old_nacks++;
- } else {
- summary->nr_new_nacks++;
- sp->ack.nr_nacks++;
- }
+ _enter("%x,%x,%u", tq->qbase, seq, sp->ack.nr_acks);
- if (before(seq, lowest_nak))
- lowest_nak = seq;
+ while (after(seq, tq->qbase + RXRPC_NR_TXQUEUE - 1))
+ tq = tq->next;
+
+ for (unsigned int i = 0; i < sp->ack.nr_acks; i++) {
+ /* Decant ACKs until we hit a txqueue boundary. */
+ shiftr_adv_rotr(acks, extracted);
+ if (i == 256) {
+ acks -= i;
+ i = 0;
}
seq++;
+ nr++;
+ if ((seq & RXRPC_TXQ_MASK) != 0)
+ continue;
+
+ _debug("bound %16lx %u", extracted, nr);
+
+ rxrpc_input_soft_ack_tq(call, summary, tq, extracted, RXRPC_NR_TXQUEUE,
+ seq - RXRPC_NR_TXQUEUE, &lowest_nak);
+ extracted = ~0UL;
+ nr = 0;
+ tq = tq->next;
+ prefetch(tq);
}
- if (lowest_nak != call->acks_lowest_nak) {
- call->acks_lowest_nak = lowest_nak;
- summary->new_low_nack = true;
+ if (nr) {
+ unsigned int nr_reported = seq & RXRPC_TXQ_MASK;
+
+ extracted >>= RXRPC_NR_TXQUEUE - nr_reported;
+ _debug("tail %16lx %u", extracted, nr_reported);
+ rxrpc_input_soft_ack_tq(call, summary, tq, extracted, nr_reported,
+ seq & ~RXRPC_TXQ_MASK, &lowest_nak);
}
/* We *can* have more nacks than we did - the peer is permitted to drop
@@ -789,9 +1009,14 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call,
* possible for the nack distribution to change whilst the number of
* nacks stays the same or goes down.
*/
- if (old_nacks < summary->nr_retained_nacks)
- summary->nr_new_acks += summary->nr_retained_nacks - old_nacks;
- summary->nr_retained_nacks = old_nacks;
+ if (lowest_nak != call->acks_lowest_nak) {
+ call->acks_lowest_nak = lowest_nak;
+ summary->new_low_snack = true;
+ }
+
+ _debug("summary A=%d+%d N=%d+%d",
+ call->acks_nr_sacks, summary->nr_new_sacks,
+ call->acks_nr_snacks, summary->nr_new_snacks);
}
/*
@@ -799,21 +1024,21 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call,
* with respect to the ack state conveyed by preceding ACKs.
*/
static bool rxrpc_is_ack_valid(struct rxrpc_call *call,
- rxrpc_seq_t first_pkt, rxrpc_seq_t prev_pkt)
+ rxrpc_seq_t hard_ack, rxrpc_seq_t prev_pkt)
{
- rxrpc_seq_t base = READ_ONCE(call->acks_first_seq);
+ rxrpc_seq_t base = READ_ONCE(call->acks_hard_ack);
- if (after(first_pkt, base))
+ if (after(hard_ack, base))
return true; /* The window advanced */
- if (before(first_pkt, base))
+ if (before(hard_ack, base))
return false; /* firstPacket regressed */
if (after_eq(prev_pkt, call->acks_prev_seq))
return true; /* previousPacket hasn't regressed. */
/* Some rx implementations put a serial number in previousPacket. */
- if (after_eq(prev_pkt, base + call->tx_winsize))
+ if (after(prev_pkt, base + call->tx_winsize))
return false;
return true;
}
@@ -831,53 +1056,34 @@ static bool rxrpc_is_ack_valid(struct rxrpc_call *call,
static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
{
struct rxrpc_ack_summary summary = { 0 };
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
struct rxrpc_acktrailer trailer;
- rxrpc_serial_t ack_serial, acked_serial;
- rxrpc_seq_t first_soft_ack, hard_ack, prev_pkt, since;
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ rxrpc_seq_t first_soft_ack, hard_ack, prev_pkt;
int nr_acks, offset, ioffset;
_enter("");
offset = sizeof(struct rxrpc_wire_header) + sizeof(struct rxrpc_ackpacket);
- ack_serial = sp->hdr.serial;
- acked_serial = sp->ack.acked_serial;
- first_soft_ack = sp->ack.first_ack;
- prev_pkt = sp->ack.prev_ack;
- nr_acks = sp->ack.nr_acks;
- hard_ack = first_soft_ack - 1;
- summary.ack_reason = (sp->ack.reason < RXRPC_ACK__INVALID ?
- sp->ack.reason : RXRPC_ACK__INVALID);
-
- trace_rxrpc_rx_ack(call, ack_serial, acked_serial,
- first_soft_ack, prev_pkt,
- summary.ack_reason, nr_acks);
- rxrpc_inc_stat(call->rxnet, stat_rx_acks[summary.ack_reason]);
+ summary.ack_serial = sp->hdr.serial;
+ first_soft_ack = sp->ack.first_ack;
+ prev_pkt = sp->ack.prev_ack;
+ nr_acks = sp->ack.nr_acks;
+ hard_ack = first_soft_ack - 1;
+ summary.acked_serial = sp->ack.acked_serial;
+ summary.ack_reason = (sp->ack.reason < RXRPC_ACK__INVALID ?
+ sp->ack.reason : RXRPC_ACK__INVALID);
- if (acked_serial != 0) {
- switch (summary.ack_reason) {
- case RXRPC_ACK_PING_RESPONSE:
- rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial,
- rxrpc_rtt_rx_ping_response);
- break;
- case RXRPC_ACK_REQUESTED:
- rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial,
- rxrpc_rtt_rx_requested_ack);
- break;
- default:
- rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial,
- rxrpc_rtt_rx_other_ack);
- break;
- }
- }
+ trace_rxrpc_rx_ack(call, sp);
+ rxrpc_inc_stat(call->rxnet, stat_rx_acks[summary.ack_reason]);
+ prefetch(call->tx_queue);
/* If we get an EXCEEDS_WINDOW ACK from the server, it probably
* indicates that the client address changed due to NAT. The server
* lost the call because it switched to a different peer.
*/
if (unlikely(summary.ack_reason == RXRPC_ACK_EXCEEDS_WINDOW) &&
- first_soft_ack == 1 &&
+ hard_ack == 0 &&
prev_pkt == 0 &&
rxrpc_is_client_call(call)) {
rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED,
@@ -890,9 +1096,9 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
* if we still have it buffered to the beginning.
*/
if (unlikely(summary.ack_reason == RXRPC_ACK_OUT_OF_SEQUENCE) &&
- first_soft_ack == 1 &&
+ hard_ack == 0 &&
prev_pkt == 0 &&
- call->acks_hard_ack == 0 &&
+ call->tx_bottom == 0 &&
rxrpc_is_client_call(call)) {
rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED,
0, -ENETRESET);
@@ -900,11 +1106,9 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
}
/* Discard any out-of-order or duplicate ACKs (outside lock). */
- if (!rxrpc_is_ack_valid(call, first_soft_ack, prev_pkt)) {
- trace_rxrpc_rx_discard_ack(call->debug_id, ack_serial,
- first_soft_ack, call->acks_first_seq,
- prev_pkt, call->acks_prev_seq);
- goto send_response;
+ if (!rxrpc_is_ack_valid(call, hard_ack, prev_pkt)) {
+ trace_rxrpc_rx_discard_ack(call, summary.ack_serial, hard_ack, prev_pkt);
+ goto send_response; /* Still respond if requested. */
}
trailer.maxMTU = 0;
@@ -916,34 +1120,30 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
if (nr_acks > 0)
skb_condense(skb);
- if (call->cong_last_nack) {
- since = rxrpc_input_check_prev_ack(call, &summary, first_soft_ack);
- rxrpc_free_skb(call->cong_last_nack, rxrpc_skb_put_last_nack);
- call->cong_last_nack = NULL;
- } else {
- summary.nr_new_acks = first_soft_ack - call->acks_first_seq;
- call->acks_lowest_nak = first_soft_ack + nr_acks;
- since = first_soft_ack;
- }
-
- call->acks_latest_ts = skb->tstamp;
- call->acks_first_seq = first_soft_ack;
+ call->acks_latest_ts = ktime_get_real();
+ call->acks_hard_ack = hard_ack;
call->acks_prev_seq = prev_pkt;
- switch (summary.ack_reason) {
- case RXRPC_ACK_PING:
- break;
- default:
- if (acked_serial && after(acked_serial, call->acks_highest_serial))
- call->acks_highest_serial = acked_serial;
- break;
+ if (summary.acked_serial) {
+ switch (summary.ack_reason) {
+ case RXRPC_ACK_PING_RESPONSE:
+ rxrpc_complete_rtt_probe(call, call->acks_latest_ts,
+ summary.acked_serial, summary.ack_serial,
+ rxrpc_rtt_rx_ping_response);
+ break;
+ default:
+ if (after(summary.acked_serial, call->acks_highest_serial))
+ call->acks_highest_serial = summary.acked_serial;
+ summary.rtt_sample_avail = true;
+ break;
+ }
}
/* Parse rwind and mtu sizes if provided. */
if (trailer.maxMTU)
rxrpc_input_ack_trailer(call, skb, &trailer);
- if (first_soft_ack == 0)
+ if (hard_ack + 1 == 0)
return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_zero);
/* Ignore ACKs unless we are or have just been transmitting. */
@@ -957,13 +1157,13 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
goto send_response;
}
- if (before(hard_ack, call->acks_hard_ack) ||
+ if (before(hard_ack, call->tx_bottom) ||
after(hard_ack, call->tx_top))
return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_outside_window);
if (nr_acks > call->tx_top - hard_ack)
return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_sack_overflow);
- if (after(hard_ack, call->acks_hard_ack)) {
+ if (after(hard_ack, call->tx_bottom)) {
if (rxrpc_rotate_tx_window(call, hard_ack, &summary)) {
rxrpc_end_tx_phase(call, false, rxrpc_eproto_unexpected_ack);
goto send_response;
@@ -973,25 +1173,30 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
if (nr_acks > 0) {
if (offset > (int)skb->len - nr_acks)
return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_short_sack);
- rxrpc_input_soft_acks(call, &summary, skb, first_soft_ack, since);
- rxrpc_get_skb(skb, rxrpc_skb_get_last_nack);
- call->cong_last_nack = skb;
+ rxrpc_input_soft_acks(call, &summary, skb);
}
if (test_bit(RXRPC_CALL_TX_LAST, &call->flags) &&
- summary.nr_acks == call->tx_top - hard_ack &&
+ call->acks_nr_sacks == call->tx_top - hard_ack &&
rxrpc_is_client_call(call))
- rxrpc_propose_ping(call, ack_serial,
+ rxrpc_propose_ping(call, summary.ack_serial,
rxrpc_propose_ack_ping_for_lost_reply);
- rxrpc_congestion_management(call, skb, &summary, acked_serial);
+ /* Drive the congestion management algorithm first and then RACK-TLP as
+ * the latter depends on the state/change in state in the former.
+ */
+ rxrpc_congestion_management(call, &summary);
+ rxrpc_rack_detect_loss_and_arm_timer(call, &summary);
+ rxrpc_tlp_process_ack(call, &summary);
+ if (call->tlp_serial && after_eq(summary.acked_serial, call->tlp_serial))
+ call->tlp_serial = 0;
send_response:
if (summary.ack_reason == RXRPC_ACK_PING)
- rxrpc_send_ACK(call, RXRPC_ACK_PING_RESPONSE, ack_serial,
+ rxrpc_send_ACK(call, RXRPC_ACK_PING_RESPONSE, summary.ack_serial,
rxrpc_propose_ack_respond_to_ping);
else if (sp->hdr.flags & RXRPC_REQUEST_ACK)
- rxrpc_send_ACK(call, RXRPC_ACK_REQUESTED, ack_serial,
+ rxrpc_send_ACK(call, RXRPC_ACK_REQUESTED, summary.ack_serial,
rxrpc_propose_ack_respond_to_ack);
}
@@ -1090,5 +1295,5 @@ void rxrpc_implicit_end_call(struct rxrpc_call *call, struct sk_buff *skb)
break;
}
- rxrpc_input_call_event(call, skb);
+ rxrpc_input_call_event(call);
}
diff --git a/net/rxrpc/input_rack.c b/net/rxrpc/input_rack.c
new file mode 100644
index 000000000000..13c371261e0a
--- /dev/null
+++ b/net/rxrpc/input_rack.c
@@ -0,0 +1,418 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* RACK-TLP [RFC8958] Implementation
+ *
+ * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "ar-internal.h"
+
+static bool rxrpc_rack_sent_after(ktime_t t1, rxrpc_seq_t seq1,
+ ktime_t t2, rxrpc_seq_t seq2)
+{
+ if (ktime_after(t1, t2))
+ return true;
+ return t1 == t2 && after(seq1, seq2);
+}
+
+/*
+ * Mark a packet lost.
+ */
+static void rxrpc_rack_mark_lost(struct rxrpc_call *call,
+ struct rxrpc_txqueue *tq, unsigned int ix)
+{
+ if (__test_and_set_bit(ix, &tq->segment_lost)) {
+ if (__test_and_clear_bit(ix, &tq->segment_retransmitted))
+ call->tx_nr_resent--;
+ } else {
+ call->tx_nr_lost++;
+ }
+ tq->segment_xmit_ts[ix] = UINT_MAX;
+}
+
+/*
+ * Get the transmission time of a packet in the Tx queue.
+ */
+static ktime_t rxrpc_get_xmit_ts(const struct rxrpc_txqueue *tq, unsigned int ix)
+{
+ if (tq->segment_xmit_ts[ix] == UINT_MAX)
+ return KTIME_MAX;
+ return ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[ix]);
+}
+
+/*
+ * Get a bitmask of nack bits for a queue segment and mask off any that aren't
+ * yet reported.
+ */
+static unsigned long rxrpc_tq_nacks(const struct rxrpc_txqueue *tq)
+{
+ unsigned long nacks = ~tq->segment_acked;
+
+ if (tq->nr_reported_acks < RXRPC_NR_TXQUEUE)
+ nacks &= (1UL << tq->nr_reported_acks) - 1;
+ return nacks;
+}
+
+/*
+ * Update the RACK state for the most recently sent packet that has been
+ * delivered [RFC8958 6.2 Step 2].
+ */
+static void rxrpc_rack_update(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary,
+ struct rxrpc_txqueue *tq,
+ unsigned int ix)
+{
+ rxrpc_seq_t seq = tq->qbase + ix;
+ ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix);
+ ktime_t rtt = ktime_sub(call->acks_latest_ts, xmit_ts);
+
+ if (__test_and_clear_bit(ix, &tq->segment_lost))
+ call->tx_nr_lost--;
+
+ if (test_bit(ix, &tq->segment_retransmitted)) {
+ /* Use Rx.serial instead of TCP.ACK.ts_option.echo_reply. */
+ if (before(call->acks_highest_serial, tq->segment_serial[ix]))
+ return;
+ if (rtt < minmax_get(&call->min_rtt))
+ return;
+ }
+
+ /* The RACK algorithm requires the segment ACKs to be traversed in
+ * order of segment transmission - but the only thing this seems to
+ * matter for is that RACK.rtt is set to the rtt of the most recently
+ * transmitted segment. We should be able to achieve the same by only
+ * setting RACK.rtt if the xmit time is greater.
+ */
+ if (ktime_after(xmit_ts, call->rack_rtt_ts)) {
+ call->rack_rtt = rtt;
+ call->rack_rtt_ts = xmit_ts;
+ }
+
+ if (rxrpc_rack_sent_after(xmit_ts, seq, call->rack_xmit_ts, call->rack_end_seq)) {
+ call->rack_rtt = rtt;
+ call->rack_xmit_ts = xmit_ts;
+ call->rack_end_seq = seq;
+ }
+}
+
+/*
+ * Detect data segment reordering [RFC8958 6.2 Step 3].
+ */
+static void rxrpc_rack_detect_reordering(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary,
+ struct rxrpc_txqueue *tq,
+ unsigned int ix)
+{
+ rxrpc_seq_t seq = tq->qbase + ix;
+
+ /* Track the highest sequence number so far ACK'd. This is not
+ * necessarily the same as ack.firstPacket + ack.nAcks - 1 as the peer
+ * could put a NACK in the last SACK slot.
+ */
+ if (after(seq, call->rack_fack))
+ call->rack_fack = seq;
+ else if (before(seq, call->rack_fack) &&
+ test_bit(ix, &tq->segment_retransmitted))
+ call->rack_reordering_seen = true;
+}
+
+void rxrpc_input_rack_one(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary,
+ struct rxrpc_txqueue *tq,
+ unsigned int ix)
+{
+ rxrpc_rack_update(call, summary, tq, ix);
+ rxrpc_rack_detect_reordering(call, summary, tq, ix);
+}
+
+void rxrpc_input_rack(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary,
+ struct rxrpc_txqueue *tq,
+ unsigned long new_acks)
+{
+ while (new_acks) {
+ unsigned int ix = __ffs(new_acks);
+
+ __clear_bit(ix, &new_acks);
+ rxrpc_input_rack_one(call, summary, tq, ix);
+ }
+
+ trace_rxrpc_rack_update(call, summary);
+}
+
+/*
+ * Update the reordering window [RFC8958 6.2 Step 4]. Returns the updated
+ * duration of the reordering window.
+ *
+ * Note that the Rx protocol doesn't have a 'DSACK option' per se, but ACKs can
+ * be given a 'DUPLICATE' reason with the serial number referring to the
+ * duplicated DATA packet. Rx does not inform as to whether this was a
+ * reception of the same packet twice or of a retransmission of a packet we
+ * already received (though this could be determined by the transmitter based
+ * on the serial number).
+ */
+static ktime_t rxrpc_rack_update_reo_wnd(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary)
+{
+ rxrpc_seq_t snd_una = call->acks_lowest_nak; /* Lowest unack'd seq */
+ rxrpc_seq_t snd_nxt = call->tx_transmitted + 1; /* Next seq to be sent */
+ bool have_dsack_option = summary->ack_reason == RXRPC_ACK_DUPLICATE;
+ int dup_thresh = 3;
+
+ /* DSACK-based reordering window adaptation */
+ if (!call->rack_dsack_round_none &&
+ after_eq(snd_una, call->rack_dsack_round))
+ call->rack_dsack_round_none = true;
+
+ /* Grow the reordering window per round that sees DSACK. Reset the
+ * window after 16 DSACK-free recoveries.
+ */
+ if (call->rack_dsack_round_none && have_dsack_option) {
+ call->rack_dsack_round_none = false;
+ call->rack_dsack_round = snd_nxt;
+ call->rack_reo_wnd_mult++;
+ call->rack_reo_wnd_persist = 16;
+ } else if (summary->exiting_fast_or_rto_recovery) {
+ call->rack_reo_wnd_persist--;
+ if (call->rack_reo_wnd_persist <= 0)
+ call->rack_reo_wnd_mult = 1;
+ }
+
+ if (!call->rack_reordering_seen) {
+ if (summary->in_fast_or_rto_recovery)
+ return 0;
+ if (call->acks_nr_sacks >= dup_thresh)
+ return 0;
+ }
+
+ return us_to_ktime(umin(call->rack_reo_wnd_mult * minmax_get(&call->min_rtt) / 4,
+ call->srtt_us >> 3));
+}
+
+/*
+ * Detect losses [RFC8958 6.2 Step 5].
+ */
+static ktime_t rxrpc_rack_detect_loss(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary)
+{
+ struct rxrpc_txqueue *tq;
+ ktime_t timeout = 0, lost_after, now = ktime_get_real();
+
+ call->rack_reo_wnd = rxrpc_rack_update_reo_wnd(call, summary);
+ lost_after = ktime_add(call->rack_rtt, call->rack_reo_wnd);
+ trace_rxrpc_rack_scan_loss(call);
+
+ for (tq = call->tx_queue; tq; tq = tq->next) {
+ unsigned long nacks = rxrpc_tq_nacks(tq);
+
+ if (after(tq->qbase, call->tx_transmitted))
+ break;
+ trace_rxrpc_rack_scan_loss_tq(call, tq, nacks);
+
+ /* Skip ones marked lost but not yet retransmitted */
+ nacks &= ~tq->segment_lost | tq->segment_retransmitted;
+
+ while (nacks) {
+ unsigned int ix = __ffs(nacks);
+ rxrpc_seq_t seq = tq->qbase + ix;
+ ktime_t remaining;
+ ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix);
+
+ __clear_bit(ix, &nacks);
+
+ if (rxrpc_rack_sent_after(call->rack_xmit_ts, call->rack_end_seq,
+ xmit_ts, seq)) {
+ remaining = ktime_sub(ktime_add(xmit_ts, lost_after), now);
+ if (remaining <= 0) {
+ rxrpc_rack_mark_lost(call, tq, ix);
+ trace_rxrpc_rack_detect_loss(call, summary, seq);
+ } else {
+ timeout = max(remaining, timeout);
+ }
+ }
+ }
+ }
+
+ return timeout;
+}
+
+/*
+ * Detect losses and set a timer to retry the detection [RFC8958 6.2 Step 5].
+ */
+void rxrpc_rack_detect_loss_and_arm_timer(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary)
+{
+ ktime_t timeout = rxrpc_rack_detect_loss(call, summary);
+
+ if (timeout) {
+ call->rack_timer_mode = RXRPC_CALL_RACKTIMER_RACK_REORDER;
+ call->rack_timo_at = ktime_add(ktime_get_real(), timeout);
+ trace_rxrpc_rack_timer(call, timeout, false);
+ trace_rxrpc_timer_set(call, timeout, rxrpc_timer_trace_rack_reo);
+ }
+}
+
+/*
+ * Handle RACK-TLP RTO expiration [RFC8958 6.3].
+ */
+static void rxrpc_rack_mark_losses_on_rto(struct rxrpc_call *call)
+{
+ struct rxrpc_txqueue *tq;
+ rxrpc_seq_t snd_una = call->acks_lowest_nak; /* Lowest unack'd seq */
+ ktime_t lost_after = ktime_add(call->rack_rtt, call->rack_reo_wnd);
+ ktime_t deadline = ktime_sub(ktime_get_real(), lost_after);
+
+ for (tq = call->tx_queue; tq; tq = tq->next) {
+ unsigned long unacked = ~tq->segment_acked;
+
+ trace_rxrpc_rack_mark_loss_tq(call, tq);
+ while (unacked) {
+ unsigned int ix = __ffs(unacked);
+ rxrpc_seq_t seq = tq->qbase + ix;
+ ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix);
+
+ if (after(seq, call->tx_transmitted))
+ return;
+ __clear_bit(ix, &unacked);
+
+ if (seq == snd_una ||
+ ktime_before(xmit_ts, deadline))
+ rxrpc_rack_mark_lost(call, tq, ix);
+ }
+ }
+}
+
+/*
+ * Calculate the TLP loss probe timeout (PTO) [RFC8958 7.2].
+ */
+ktime_t rxrpc_tlp_calc_pto(struct rxrpc_call *call, ktime_t now)
+{
+ unsigned int flight_size = rxrpc_tx_in_flight(call);
+ ktime_t rto_at = ktime_add(call->tx_last_sent,
+ rxrpc_get_rto_backoff(call, false));
+ ktime_t pto;
+
+ if (call->rtt_count > 0) {
+ /* Use 2*SRTT as the timeout. */
+ pto = ns_to_ktime(call->srtt_us * NSEC_PER_USEC / 4);
+ if (flight_size)
+ pto = ktime_add(pto, call->tlp_max_ack_delay);
+ } else {
+ pto = NSEC_PER_SEC;
+ }
+
+ if (ktime_after(ktime_add(now, pto), rto_at))
+ pto = ktime_sub(rto_at, now);
+ return pto;
+}
+
+/*
+ * Send a TLP loss probe on PTO expiration [RFC8958 7.3].
+ */
+void rxrpc_tlp_send_probe(struct rxrpc_call *call)
+{
+ unsigned int in_flight = rxrpc_tx_in_flight(call);
+
+ if (after_eq(call->acks_hard_ack, call->tx_transmitted))
+ return; /* Everything we transmitted has been acked. */
+
+ /* There must be no other loss probe still in flight and we need to
+ * have taken a new RTT sample since last probe or the start of
+ * connection.
+ */
+ if (!call->tlp_serial &&
+ call->tlp_rtt_taken != call->rtt_taken) {
+ call->tlp_is_retrans = false;
+ if (after(call->send_top, call->tx_transmitted) &&
+ rxrpc_tx_window_space(call) > 0) {
+ /* Transmit the lowest-sequence unsent DATA */
+ call->tx_last_serial = 0;
+ rxrpc_transmit_some_data(call, 1, rxrpc_txdata_tlp_new_data);
+ call->tlp_serial = call->tx_last_serial;
+ call->tlp_seq = call->tx_transmitted;
+ trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_transmit_new);
+ in_flight = rxrpc_tx_in_flight(call);
+ } else {
+ /* Retransmit the highest-sequence DATA sent */
+ call->tx_last_serial = 0;
+ rxrpc_resend_tlp(call);
+ call->tlp_is_retrans = true;
+ trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_retransmit);
+ }
+ } else {
+ trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_busy);
+ }
+
+ if (in_flight != 0) {
+ ktime_t rto = rxrpc_get_rto_backoff(call, false);
+
+ call->rack_timer_mode = RXRPC_CALL_RACKTIMER_RTO;
+ call->rack_timo_at = ktime_add(ktime_get_real(), rto);
+ trace_rxrpc_rack_timer(call, rto, false);
+ trace_rxrpc_timer_set(call, rto, rxrpc_timer_trace_rack_rto);
+ }
+}
+
+/*
+ * Detect losses using the ACK of a TLP loss probe [RFC8958 7.4].
+ */
+void rxrpc_tlp_process_ack(struct rxrpc_call *call, struct rxrpc_ack_summary *summary)
+{
+ if (!call->tlp_serial || after(call->tlp_seq, call->acks_hard_ack))
+ return;
+
+ if (!call->tlp_is_retrans) {
+ /* TLP of new data delivered */
+ trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_new_data);
+ call->tlp_serial = 0;
+ } else if (summary->ack_reason == RXRPC_ACK_DUPLICATE &&
+ summary->acked_serial == call->tlp_serial) {
+ /* General Case: Detected packet losses using RACK [7.4.1] */
+ trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_dup_acked);
+ call->tlp_serial = 0;
+ } else if (after(call->acks_hard_ack, call->tlp_seq)) {
+ /* Repaired the single loss */
+ trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_hard_beyond);
+ call->tlp_serial = 0;
+ // TODO: Invoke congestion control to react to the loss
+ // event the probe has repaired
+ } else if (summary->tlp_probe_acked) {
+ trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_acked);
+ /* Special Case: Detected a single loss repaired by the loss
+ * probe [7.4.2]
+ */
+ call->tlp_serial = 0;
+ } else {
+ trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_incomplete);
+ }
+}
+
+/*
+ * Handle RACK timer expiration; returns true to request a resend.
+ */
+void rxrpc_rack_timer_expired(struct rxrpc_call *call, ktime_t overran_by)
+{
+ struct rxrpc_ack_summary summary = {};
+ enum rxrpc_rack_timer_mode mode = call->rack_timer_mode;
+
+ trace_rxrpc_rack_timer(call, overran_by, true);
+ call->rack_timer_mode = RXRPC_CALL_RACKTIMER_OFF;
+
+ switch (mode) {
+ case RXRPC_CALL_RACKTIMER_RACK_REORDER:
+ rxrpc_rack_detect_loss_and_arm_timer(call, &summary);
+ break;
+ case RXRPC_CALL_RACKTIMER_TLP_PTO:
+ rxrpc_tlp_send_probe(call);
+ break;
+ case RXRPC_CALL_RACKTIMER_RTO:
+ // Might need to poke the congestion algo in some way
+ rxrpc_rack_mark_losses_on_rto(call);
+ break;
+ //case RXRPC_CALL_RACKTIMER_ZEROWIN:
+ default:
+ pr_warn("Unexpected rack timer %u", call->rack_timer_mode);
+ }
+}
diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c
index f2701068ed9e..0a260df45d25 100644
--- a/net/rxrpc/insecure.c
+++ b/net/rxrpc/insecure.c
@@ -19,11 +19,14 @@ static int none_init_connection_security(struct rxrpc_connection *conn,
*/
static struct rxrpc_txbuf *none_alloc_txbuf(struct rxrpc_call *call, size_t remain, gfp_t gfp)
{
- return rxrpc_alloc_data_txbuf(call, min_t(size_t, remain, RXRPC_JUMBO_DATALEN), 0, gfp);
+ return rxrpc_alloc_data_txbuf(call, umin(remain, RXRPC_JUMBO_DATALEN), 1, gfp);
}
static int none_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
{
+ txb->pkt_len = txb->len;
+ if (txb->len == RXRPC_JUMBO_DATALEN)
+ txb->jumboable = true;
return 0;
}
@@ -39,11 +42,18 @@ static void none_free_call_crypto(struct rxrpc_call *call)
{
}
-static int none_respond_to_challenge(struct rxrpc_connection *conn,
- struct sk_buff *skb)
+static bool none_validate_challenge(struct rxrpc_connection *conn,
+ struct sk_buff *skb)
{
- return rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO,
- rxrpc_eproto_rxnull_challenge);
+ rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO,
+ rxrpc_eproto_rxnull_challenge);
+ return true;
+}
+
+static int none_sendmsg_respond_to_challenge(struct sk_buff *challenge,
+ struct msghdr *msg)
+{
+ return -EINVAL;
}
static int none_verify_response(struct rxrpc_connection *conn,
@@ -79,7 +89,8 @@ const struct rxrpc_security rxrpc_no_security = {
.alloc_txbuf = none_alloc_txbuf,
.secure_packet = none_secure_packet,
.verify_packet = none_verify_packet,
- .respond_to_challenge = none_respond_to_challenge,
+ .validate_challenge = none_validate_challenge,
+ .sendmsg_respond_to_challenge = none_sendmsg_respond_to_challenge,
.verify_response = none_verify_response,
.clear = none_clear,
};
diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c
index 0300baa9afcd..27b650d30f4d 100644
--- a/net/rxrpc/io_thread.c
+++ b/net/rxrpc/io_thread.c
@@ -27,11 +27,17 @@ int rxrpc_encap_rcv(struct sock *udp_sk, struct sk_buff *skb)
{
struct sk_buff_head *rx_queue;
struct rxrpc_local *local = rcu_dereference_sk_user_data(udp_sk);
+ struct task_struct *io_thread;
if (unlikely(!local)) {
kfree_skb(skb);
return 0;
}
+ io_thread = READ_ONCE(local->io_thread);
+ if (!io_thread) {
+ kfree_skb(skb);
+ return 0;
+ }
if (skb->tstamp == 0)
skb->tstamp = ktime_get_real();
@@ -47,7 +53,7 @@ int rxrpc_encap_rcv(struct sock *udp_sk, struct sk_buff *skb)
#endif
skb_queue_tail(rx_queue, skb);
- rxrpc_wake_up_io_thread(local);
+ wake_up_process(io_thread);
return 0;
}
@@ -332,7 +338,6 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn,
struct rxrpc_channel *chan;
struct rxrpc_call *call = NULL;
unsigned int channel;
- bool ret;
if (sp->hdr.securityIndex != conn->security_ix)
return rxrpc_direct_abort(skb, rxrpc_eproto_wrong_security,
@@ -358,6 +363,12 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn,
if (sp->hdr.callNumber == 0)
return rxrpc_input_conn_packet(conn, skb);
+ /* Deal with path MTU discovery probing. */
+ if (sp->hdr.type == RXRPC_PACKET_TYPE_ACK &&
+ conn->pmtud_probe &&
+ after_eq(sp->ack.acked_serial, conn->pmtud_probe))
+ rxrpc_input_probe_for_pmtud(conn, sp->ack.acked_serial, false);
+
/* Call-bound packets are routed by connection channel. */
channel = sp->hdr.cid & RXRPC_CHANNELMASK;
chan = &conn->channels[channel];
@@ -413,9 +424,9 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn,
peer_srx, skb);
}
- ret = rxrpc_input_call_event(call, skb);
+ rxrpc_queue_rx_call_packet(call, skb);
rxrpc_put_call(call, rxrpc_call_put_input);
- return ret;
+ return true;
}
/*
@@ -432,6 +443,8 @@ int rxrpc_io_thread(void *data)
ktime_t now;
#endif
bool should_stop;
+ LIST_HEAD(conn_attend_q);
+ LIST_HEAD(call_attend_q);
complete(&local->io_thread_ready);
@@ -442,43 +455,26 @@ int rxrpc_io_thread(void *data)
for (;;) {
rxrpc_inc_stat(local->rxnet, stat_io_loop);
- /* Deal with connections that want immediate attention. */
- conn = list_first_entry_or_null(&local->conn_attend_q,
- struct rxrpc_connection,
- attend_link);
- if (conn) {
- spin_lock_bh(&local->lock);
- list_del_init(&conn->attend_link);
- spin_unlock_bh(&local->lock);
-
- rxrpc_input_conn_event(conn, NULL);
- rxrpc_put_connection(conn, rxrpc_conn_put_poke);
- continue;
+ /* Inject a delay into packets if requested. */
+#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY
+ now = ktime_get_real();
+ while ((skb = skb_peek(&local->rx_delay_queue))) {
+ if (ktime_before(now, skb->tstamp))
+ break;
+ skb = skb_dequeue(&local->rx_delay_queue);
+ skb_queue_tail(&local->rx_queue, skb);
}
+#endif
- if (test_and_clear_bit(RXRPC_CLIENT_CONN_REAP_TIMER,
- &local->client_conn_flags))
- rxrpc_discard_expired_client_conns(local);
-
- /* Deal with calls that want immediate attention. */
- if ((call = list_first_entry_or_null(&local->call_attend_q,
- struct rxrpc_call,
- attend_link))) {
- spin_lock_bh(&local->lock);
- list_del_init(&call->attend_link);
- spin_unlock_bh(&local->lock);
-
- trace_rxrpc_call_poked(call);
- rxrpc_input_call_event(call, NULL);
- rxrpc_put_call(call, rxrpc_call_put_poke);
- continue;
+ if (!skb_queue_empty(&local->rx_queue)) {
+ spin_lock_irq(&local->rx_queue.lock);
+ skb_queue_splice_tail_init(&local->rx_queue, &rx_queue);
+ spin_unlock_irq(&local->rx_queue.lock);
+ trace_rxrpc_iothread_rx(local, skb_queue_len(&rx_queue));
}
- if (!list_empty(&local->new_client_calls))
- rxrpc_connect_client_calls(local);
-
- /* Process received packets and errors. */
- if ((skb = __skb_dequeue(&rx_queue))) {
+ /* Distribute packets and errors. */
+ while ((skb = __skb_dequeue(&rx_queue))) {
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
switch (skb->mark) {
case RXRPC_SKB_MARK_PACKET:
@@ -493,8 +489,8 @@ int rxrpc_io_thread(void *data)
rxrpc_free_skb(skb, rxrpc_skb_put_error_report);
break;
case RXRPC_SKB_MARK_SERVICE_CONN_SECURED:
- rxrpc_input_conn_event(sp->conn, skb);
- rxrpc_put_connection(sp->conn, rxrpc_conn_put_poke);
+ rxrpc_input_conn_event(sp->poke_conn, skb);
+ rxrpc_put_connection(sp->poke_conn, rxrpc_conn_put_poke);
rxrpc_free_skb(skb, rxrpc_skb_put_conn_secured);
break;
default:
@@ -502,27 +498,48 @@ int rxrpc_io_thread(void *data)
rxrpc_free_skb(skb, rxrpc_skb_put_unknown);
break;
}
- continue;
}
- /* Inject a delay into packets if requested. */
-#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY
- now = ktime_get_real();
- while ((skb = skb_peek(&local->rx_delay_queue))) {
- if (ktime_before(now, skb->tstamp))
- break;
- skb = skb_dequeue(&local->rx_delay_queue);
- skb_queue_tail(&local->rx_queue, skb);
+ /* Deal with connections that want immediate attention. */
+ if (!list_empty_careful(&local->conn_attend_q)) {
+ spin_lock_irq(&local->lock);
+ list_splice_tail_init(&local->conn_attend_q, &conn_attend_q);
+ spin_unlock_irq(&local->lock);
}
-#endif
- if (!skb_queue_empty(&local->rx_queue)) {
- spin_lock_irq(&local->rx_queue.lock);
- skb_queue_splice_tail_init(&local->rx_queue, &rx_queue);
- spin_unlock_irq(&local->rx_queue.lock);
- continue;
+ while ((conn = list_first_entry_or_null(&conn_attend_q,
+ struct rxrpc_connection,
+ attend_link))) {
+ spin_lock_irq(&local->lock);
+ list_del_init(&conn->attend_link);
+ spin_unlock_irq(&local->lock);
+ rxrpc_input_conn_event(conn, NULL);
+ rxrpc_put_connection(conn, rxrpc_conn_put_poke);
}
+ if (test_and_clear_bit(RXRPC_CLIENT_CONN_REAP_TIMER,
+ &local->client_conn_flags))
+ rxrpc_discard_expired_client_conns(local);
+
+ /* Deal with calls that want immediate attention. */
+ spin_lock_irq(&local->lock);
+ list_splice_tail_init(&local->call_attend_q, &call_attend_q);
+ spin_unlock_irq(&local->lock);
+
+ while ((call = list_first_entry_or_null(&call_attend_q,
+ struct rxrpc_call,
+ attend_link))) {
+ spin_lock_irq(&local->lock);
+ list_del_init(&call->attend_link);
+ spin_unlock_irq(&local->lock);
+ trace_rxrpc_call_poked(call);
+ rxrpc_input_call_event(call);
+ rxrpc_put_call(call, rxrpc_call_put_poke);
+ }
+
+ if (!list_empty(&local->new_client_calls))
+ rxrpc_connect_client_calls(local);
+
set_current_state(TASK_INTERRUPTIBLE);
should_stop = kthread_should_stop();
if (!skb_queue_empty(&local->rx_queue) ||
@@ -552,7 +569,7 @@ int rxrpc_io_thread(void *data)
}
timeout = nsecs_to_jiffies(delay_ns);
- timeout = max(timeout, 1UL);
+ timeout = umax(timeout, 1);
schedule_timeout(timeout);
__set_current_state(TASK_RUNNING);
continue;
@@ -565,7 +582,7 @@ int rxrpc_io_thread(void *data)
__set_current_state(TASK_RUNNING);
rxrpc_see_local(local, rxrpc_local_stop);
rxrpc_destroy_local(local);
- local->io_thread = NULL;
+ WRITE_ONCE(local->io_thread, NULL);
rxrpc_see_local(local, rxrpc_local_stopped);
return 0;
}
diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c
index 33e8302a79e3..9fdc1f031c9d 100644
--- a/net/rxrpc/key.c
+++ b/net/rxrpc/key.c
@@ -129,6 +129,160 @@ static int rxrpc_preparse_xdr_rxkad(struct key_preparsed_payload *prep,
return 0;
}
+static u64 xdr_dec64(const __be32 *xdr)
+{
+ return (u64)ntohl(xdr[0]) << 32 | (u64)ntohl(xdr[1]);
+}
+
+static time64_t rxrpc_s64_to_time64(s64 time_in_100ns)
+{
+ bool neg = false;
+ u64 tmp = time_in_100ns;
+
+ if (time_in_100ns < 0) {
+ tmp = -time_in_100ns;
+ neg = true;
+ }
+ do_div(tmp, 10000000);
+ return neg ? -tmp : tmp;
+}
+
+/*
+ * Parse a YFS-RxGK type XDR format token
+ * - the caller guarantees we have at least 4 words
+ *
+ * struct token_rxgk {
+ * opr_time begintime;
+ * opr_time endtime;
+ * afs_int64 level;
+ * afs_int64 lifetime;
+ * afs_int64 bytelife;
+ * afs_int64 enctype;
+ * opaque key<>;
+ * opaque ticket<>;
+ * };
+ */
+static int rxrpc_preparse_xdr_yfs_rxgk(struct key_preparsed_payload *prep,
+ size_t datalen,
+ const __be32 *xdr, unsigned int toklen)
+{
+ struct rxrpc_key_token *token, **pptoken;
+ time64_t expiry;
+ size_t plen;
+ const __be32 *ticket, *key;
+ s64 tmp;
+ u32 tktlen, keylen;
+
+ _enter(",{%x,%x,%x,%x},%x",
+ ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]),
+ toklen);
+
+ if (6 * 2 + 2 > toklen / 4)
+ goto reject;
+
+ key = xdr + (6 * 2 + 1);
+ keylen = ntohl(key[-1]);
+ _debug("keylen: %x", keylen);
+ keylen = round_up(keylen, 4);
+ if ((6 * 2 + 2) * 4 + keylen > toklen)
+ goto reject;
+
+ ticket = xdr + (6 * 2 + 1 + (keylen / 4) + 1);
+ tktlen = ntohl(ticket[-1]);
+ _debug("tktlen: %x", tktlen);
+ tktlen = round_up(tktlen, 4);
+ if ((6 * 2 + 2) * 4 + keylen + tktlen != toklen) {
+ kleave(" = -EKEYREJECTED [%x!=%x, %x,%x]",
+ (6 * 2 + 2) * 4 + keylen + tktlen, toklen,
+ keylen, tktlen);
+ goto reject;
+ }
+
+ plen = sizeof(*token) + sizeof(*token->rxgk) + tktlen + keylen;
+ prep->quotalen = datalen + plen;
+
+ plen -= sizeof(*token);
+ token = kzalloc(sizeof(*token), GFP_KERNEL);
+ if (!token)
+ goto nomem;
+
+ token->rxgk = kzalloc(sizeof(*token->rxgk) + keylen, GFP_KERNEL);
+ if (!token->rxgk)
+ goto nomem_token;
+
+ token->security_index = RXRPC_SECURITY_YFS_RXGK;
+ token->rxgk->begintime = xdr_dec64(xdr + 0 * 2);
+ token->rxgk->endtime = xdr_dec64(xdr + 1 * 2);
+ token->rxgk->level = tmp = xdr_dec64(xdr + 2 * 2);
+ if (tmp < -1LL || tmp > RXRPC_SECURITY_ENCRYPT)
+ goto reject_token;
+ token->rxgk->lifetime = xdr_dec64(xdr + 3 * 2);
+ token->rxgk->bytelife = xdr_dec64(xdr + 4 * 2);
+ token->rxgk->enctype = tmp = xdr_dec64(xdr + 5 * 2);
+ if (tmp < 0 || tmp > UINT_MAX)
+ goto reject_token;
+ token->rxgk->key.len = ntohl(key[-1]);
+ token->rxgk->key.data = token->rxgk->_key;
+ token->rxgk->ticket.len = ntohl(ticket[-1]);
+
+ if (token->rxgk->endtime != 0) {
+ expiry = rxrpc_s64_to_time64(token->rxgk->endtime);
+ if (expiry < 0)
+ goto expired;
+ if (expiry < prep->expiry)
+ prep->expiry = expiry;
+ }
+
+ memcpy(token->rxgk->key.data, key, token->rxgk->key.len);
+
+ /* Pad the ticket so that we can use it directly in XDR */
+ token->rxgk->ticket.data = kzalloc(round_up(token->rxgk->ticket.len, 4),
+ GFP_KERNEL);
+ if (!token->rxgk->ticket.data)
+ goto nomem_yrxgk;
+ memcpy(token->rxgk->ticket.data, ticket, token->rxgk->ticket.len);
+
+ _debug("SCIX: %u", token->security_index);
+ _debug("EXPY: %llx", token->rxgk->endtime);
+ _debug("LIFE: %llx", token->rxgk->lifetime);
+ _debug("BYTE: %llx", token->rxgk->bytelife);
+ _debug("ENC : %u", token->rxgk->enctype);
+ _debug("LEVL: %u", token->rxgk->level);
+ _debug("KLEN: %u", token->rxgk->key.len);
+ _debug("TLEN: %u", token->rxgk->ticket.len);
+ _debug("KEY0: %*phN", token->rxgk->key.len, token->rxgk->key.data);
+ _debug("TICK: %*phN",
+ min_t(u32, token->rxgk->ticket.len, 32), token->rxgk->ticket.data);
+
+ /* count the number of tokens attached */
+ prep->payload.data[1] = (void *)((unsigned long)prep->payload.data[1] + 1);
+
+ /* attach the data */
+ for (pptoken = (struct rxrpc_key_token **)&prep->payload.data[0];
+ *pptoken;
+ pptoken = &(*pptoken)->next)
+ continue;
+ *pptoken = token;
+
+ _leave(" = 0");
+ return 0;
+
+nomem_yrxgk:
+ kfree(token->rxgk);
+nomem_token:
+ kfree(token);
+nomem:
+ return -ENOMEM;
+reject_token:
+ kfree(token);
+reject:
+ return -EKEYREJECTED;
+expired:
+ kfree(token->rxgk);
+ kfree(token);
+ return -EKEYEXPIRED;
+}
+
/*
* attempt to parse the data as the XDR format
* - the caller guarantees we have more than 7 words
@@ -228,6 +382,9 @@ static int rxrpc_preparse_xdr(struct key_preparsed_payload *prep)
case RXRPC_SECURITY_RXKAD:
ret2 = rxrpc_preparse_xdr_rxkad(prep, datalen, token, toklen);
break;
+ case RXRPC_SECURITY_YFS_RXGK:
+ ret2 = rxrpc_preparse_xdr_yfs_rxgk(prep, datalen, token, toklen);
+ break;
default:
ret2 = -EPROTONOSUPPORT;
break;
@@ -390,6 +547,10 @@ static void rxrpc_free_token_list(struct rxrpc_key_token *token)
case RXRPC_SECURITY_RXKAD:
kfree(token->kad);
break;
+ case RXRPC_SECURITY_YFS_RXGK:
+ kfree(token->rxgk->ticket.data);
+ kfree(token->rxgk);
+ break;
default:
pr_err("Unknown token type %x on rxrpc key\n",
token->security_index);
@@ -433,6 +594,9 @@ static void rxrpc_describe(const struct key *key, struct seq_file *m)
case RXRPC_SECURITY_RXKAD:
seq_puts(m, "ka");
break;
+ case RXRPC_SECURITY_YFS_RXGK:
+ seq_puts(m, "ygk");
+ break;
default: /* we have a ticket we can't encode */
seq_printf(m, "%u", token->security_index);
break;
@@ -531,6 +695,8 @@ EXPORT_SYMBOL(rxrpc_get_server_data_key);
*
* Generate a null RxRPC key that can be used to indicate anonymous security is
* required for a particular domain.
+ *
+ * Return: The new key or a negative error code.
*/
struct key *rxrpc_get_null_key(const char *keyname)
{
@@ -595,6 +761,13 @@ static long rxrpc_read(const struct key *key,
toksize += RND(token->kad->ticket_len);
break;
+ case RXRPC_SECURITY_YFS_RXGK:
+ toksize += 6 * 8 + 2 * 4;
+ if (!token->no_leak_key)
+ toksize += RND(token->rxgk->key.len);
+ toksize += RND(token->rxgk->ticket.len);
+ break;
+
default: /* we have a ticket we can't encode */
pr_err("Unsupported key token type (%u)\n",
token->security_index);
@@ -674,6 +847,20 @@ static long rxrpc_read(const struct key *key,
ENCODE_DATA(token->kad->ticket_len, token->kad->ticket);
break;
+ case RXRPC_SECURITY_YFS_RXGK:
+ ENCODE64(token->rxgk->begintime);
+ ENCODE64(token->rxgk->endtime);
+ ENCODE64(token->rxgk->level);
+ ENCODE64(token->rxgk->lifetime);
+ ENCODE64(token->rxgk->bytelife);
+ ENCODE64(token->rxgk->enctype);
+ if (token->no_leak_key)
+ ENCODE(0);
+ else
+ ENCODE_DATA(token->rxgk->key.len, token->rxgk->key.data);
+ ENCODE_DATA(token->rxgk->ticket.len, token->rxgk->ticket.data);
+ break;
+
default:
pr_err("Unsupported key token type (%u)\n",
token->security_index);
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 504453c688d7..a74a4b43904f 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -215,9 +215,6 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net)
/* we want to set the don't fragment bit */
rxrpc_local_dont_fragment(local, true);
-
- /* We want receive timestamps. */
- sock_enable_timestamps(usk);
break;
default:
@@ -232,7 +229,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net)
}
wait_for_completion(&local->io_thread_ready);
- local->io_thread = io_thread;
+ WRITE_ONCE(local->io_thread, io_thread);
_leave(" = 0");
return 0;
@@ -452,9 +449,7 @@ void rxrpc_destroy_local(struct rxrpc_local *local)
#endif
rxrpc_purge_queue(&local->rx_queue);
rxrpc_purge_client_connections(local);
- if (local->tx_alloc.va)
- __page_frag_cache_drain(virt_to_page(local->tx_alloc.va),
- local->tx_alloc.pagecnt_bias);
+ page_frag_cache_drain(&local->tx_alloc);
}
/*
diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c
index 657cf35089a6..8fcc8139d771 100644
--- a/net/rxrpc/misc.c
+++ b/net/rxrpc/misc.c
@@ -46,13 +46,13 @@ unsigned int rxrpc_rx_window_size = 255;
* Maximum Rx MTU size. This indicates to the sender the size of jumbo packet
* made by gluing normal packets together that we're willing to handle.
*/
-unsigned int rxrpc_rx_mtu = 5692;
+unsigned int rxrpc_rx_mtu = RXRPC_JUMBO(46);
/*
* The maximum number of fragments in a received jumbo packet that we tell the
* sender that we're willing to handle.
*/
-unsigned int rxrpc_rx_jumbo_max = 4;
+unsigned int rxrpc_rx_jumbo_max = 46;
#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY
/*
diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c
index a4c135d0fbcc..9a9834145e81 100644
--- a/net/rxrpc/net_ns.c
+++ b/net/rxrpc/net_ns.c
@@ -105,10 +105,10 @@ static __net_exit void rxrpc_exit_net(struct net *net)
struct rxrpc_net *rxnet = rxrpc_net(net);
rxnet->live = false;
- del_timer_sync(&rxnet->peer_keepalive_timer);
+ timer_delete_sync(&rxnet->peer_keepalive_timer);
cancel_work_sync(&rxnet->peer_keepalive_work);
/* Remove the timer again as the worker may have restarted it. */
- del_timer_sync(&rxnet->peer_keepalive_timer);
+ timer_delete_sync(&rxnet->peer_keepalive_timer);
rxrpc_destroy_all_calls(rxnet);
rxrpc_destroy_all_connections(rxnet);
rxrpc_destroy_all_peers(rxnet);
diff --git a/net/rxrpc/oob.c b/net/rxrpc/oob.c
new file mode 100644
index 000000000000..05ca9c1faa57
--- /dev/null
+++ b/net/rxrpc/oob.c
@@ -0,0 +1,379 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Out of band message handling (e.g. challenge-response)
+ *
+ * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/net.h>
+#include <linux/gfp.h>
+#include <linux/skbuff.h>
+#include <linux/export.h>
+#include <linux/sched/signal.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+enum rxrpc_oob_command {
+ RXRPC_OOB_CMD_UNSET,
+ RXRPC_OOB_CMD_RESPOND,
+} __mode(byte);
+
+struct rxrpc_oob_params {
+ u64 oob_id; /* ID number of message if reply */
+ s32 abort_code;
+ enum rxrpc_oob_command command;
+ bool have_oob_id:1;
+};
+
+/*
+ * Post an out-of-band message for attention by the socket or kernel service
+ * associated with a reference call.
+ */
+void rxrpc_notify_socket_oob(struct rxrpc_call *call, struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ struct rxrpc_sock *rx;
+ struct sock *sk;
+
+ rcu_read_lock();
+
+ rx = rcu_dereference(call->socket);
+ if (rx) {
+ sk = &rx->sk;
+ spin_lock_irq(&rx->recvmsg_lock);
+
+ if (sk->sk_state < RXRPC_CLOSE) {
+ skb->skb_mstamp_ns = rx->oob_id_counter++;
+ rxrpc_get_skb(skb, rxrpc_skb_get_post_oob);
+ skb_queue_tail(&rx->recvmsg_oobq, skb);
+
+ trace_rxrpc_notify_socket(call->debug_id, sp->hdr.serial);
+ if (rx->app_ops)
+ rx->app_ops->notify_oob(sk, skb);
+ }
+
+ spin_unlock_irq(&rx->recvmsg_lock);
+ if (!rx->app_ops && !sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk);
+ }
+
+ rcu_read_unlock();
+}
+
+/*
+ * Locate the OOB message to respond to by its ID.
+ */
+static struct sk_buff *rxrpc_find_pending_oob(struct rxrpc_sock *rx, u64 oob_id)
+{
+ struct rb_node *p;
+ struct sk_buff *skb;
+
+ p = rx->pending_oobq.rb_node;
+ while (p) {
+ skb = rb_entry(p, struct sk_buff, rbnode);
+
+ if (oob_id < skb->skb_mstamp_ns)
+ p = p->rb_left;
+ else if (oob_id > skb->skb_mstamp_ns)
+ p = p->rb_right;
+ else
+ return skb;
+ }
+
+ return NULL;
+}
+
+/*
+ * Add an OOB message into the pending-response set. We always assign the next
+ * value from a 64-bit counter to the oob_id, so just assume we're always going
+ * to be on the right-hand edge of the tree and that the counter won't wrap.
+ * The tree is also given a ref to the message.
+ */
+void rxrpc_add_pending_oob(struct rxrpc_sock *rx, struct sk_buff *skb)
+{
+ struct rb_node **pp = &rx->pending_oobq.rb_node, *p = NULL;
+
+ while (*pp) {
+ p = *pp;
+ pp = &(*pp)->rb_right;
+ }
+
+ rb_link_node(&skb->rbnode, p, pp);
+ rb_insert_color(&skb->rbnode, &rx->pending_oobq);
+}
+
+/*
+ * Extract control messages from the sendmsg() control buffer.
+ */
+static int rxrpc_sendmsg_oob_cmsg(struct msghdr *msg, struct rxrpc_oob_params *p)
+{
+ struct cmsghdr *cmsg;
+ int len;
+
+ if (msg->msg_controllen == 0)
+ return -EINVAL;
+
+ for_each_cmsghdr(cmsg, msg) {
+ if (!CMSG_OK(msg, cmsg))
+ return -EINVAL;
+
+ len = cmsg->cmsg_len - sizeof(struct cmsghdr);
+ _debug("CMSG %d, %d, %d",
+ cmsg->cmsg_level, cmsg->cmsg_type, len);
+
+ if (cmsg->cmsg_level != SOL_RXRPC)
+ continue;
+
+ switch (cmsg->cmsg_type) {
+ case RXRPC_OOB_ID:
+ if (len != sizeof(p->oob_id) || p->have_oob_id)
+ return -EINVAL;
+ memcpy(&p->oob_id, CMSG_DATA(cmsg), sizeof(p->oob_id));
+ p->have_oob_id = true;
+ break;
+ case RXRPC_RESPOND:
+ if (p->command != RXRPC_OOB_CMD_UNSET)
+ return -EINVAL;
+ p->command = RXRPC_OOB_CMD_RESPOND;
+ break;
+ case RXRPC_ABORT:
+ if (len != sizeof(p->abort_code) || p->abort_code)
+ return -EINVAL;
+ memcpy(&p->abort_code, CMSG_DATA(cmsg), sizeof(p->abort_code));
+ if (p->abort_code == 0)
+ return -EINVAL;
+ break;
+ case RXRPC_RESP_RXGK_APPDATA:
+ if (p->command != RXRPC_OOB_CMD_RESPOND)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ switch (p->command) {
+ case RXRPC_OOB_CMD_RESPOND:
+ if (!p->have_oob_id)
+ return -EBADSLT;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * Allow userspace to respond to an OOB using sendmsg().
+ */
+static int rxrpc_respond_to_oob(struct rxrpc_sock *rx,
+ struct rxrpc_oob_params *p,
+ struct msghdr *msg)
+{
+ struct rxrpc_connection *conn;
+ struct rxrpc_skb_priv *sp;
+ struct sk_buff *skb;
+ int ret;
+
+ skb = rxrpc_find_pending_oob(rx, p->oob_id);
+ if (skb)
+ rb_erase(&skb->rbnode, &rx->pending_oobq);
+ release_sock(&rx->sk);
+ if (!skb)
+ return -EBADSLT;
+
+ sp = rxrpc_skb(skb);
+
+ switch (p->command) {
+ case RXRPC_OOB_CMD_RESPOND:
+ ret = -EPROTO;
+ if (skb->mark != RXRPC_OOB_CHALLENGE)
+ break;
+ conn = sp->chall.conn;
+ ret = -EOPNOTSUPP;
+ if (!conn->security->sendmsg_respond_to_challenge)
+ break;
+ if (p->abort_code) {
+ rxrpc_abort_conn(conn, NULL, p->abort_code, -ECONNABORTED,
+ rxrpc_abort_response_sendmsg);
+ ret = 0;
+ } else {
+ ret = conn->security->sendmsg_respond_to_challenge(skb, msg);
+ }
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ rxrpc_free_skb(skb, rxrpc_skb_put_oob);
+ return ret;
+}
+
+/*
+ * Send an out-of-band message or respond to a received out-of-band message.
+ * - caller gives us the socket lock
+ * - the socket may be either a client socket or a server socket
+ */
+int rxrpc_sendmsg_oob(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
+{
+ struct rxrpc_oob_params p = {};
+ int ret;
+
+ _enter("");
+
+ ret = rxrpc_sendmsg_oob_cmsg(msg, &p);
+ if (ret < 0)
+ goto error_release_sock;
+
+ if (p.have_oob_id)
+ return rxrpc_respond_to_oob(rx, &p, msg);
+
+ release_sock(&rx->sk);
+
+ switch (p.command) {
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ _leave(" = %d", ret);
+ return ret;
+
+error_release_sock:
+ release_sock(&rx->sk);
+ return ret;
+}
+
+/**
+ * rxrpc_kernel_query_oob - Query the parameters of an out-of-band message
+ * @oob: The message to query
+ * @_peer: Where to return the peer record
+ * @_peer_appdata: The application data attached to a peer record
+ *
+ * Extract useful parameters from an out-of-band message. The source peer
+ * parameters are returned through the argument list and the message type is
+ * returned.
+ *
+ * Return:
+ * * %RXRPC_OOB_CHALLENGE - Challenge wanting a response.
+ */
+enum rxrpc_oob_type rxrpc_kernel_query_oob(struct sk_buff *oob,
+ struct rxrpc_peer **_peer,
+ unsigned long *_peer_appdata)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(oob);
+ enum rxrpc_oob_type type = oob->mark;
+
+ switch (type) {
+ case RXRPC_OOB_CHALLENGE:
+ *_peer = sp->chall.conn->peer;
+ *_peer_appdata = sp->chall.conn->peer->app_data;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ *_peer = NULL;
+ *_peer_appdata = 0;
+ break;
+ }
+
+ return type;
+}
+EXPORT_SYMBOL(rxrpc_kernel_query_oob);
+
+/**
+ * rxrpc_kernel_dequeue_oob - Dequeue and return the front OOB message
+ * @sock: The socket to query
+ * @_type: Where to return the message type
+ *
+ * Dequeue the front OOB message, if there is one, and return it and
+ * its type.
+ *
+ * Return: The sk_buff representing the OOB message or %NULL if the queue was
+ * empty.
+ */
+struct sk_buff *rxrpc_kernel_dequeue_oob(struct socket *sock,
+ enum rxrpc_oob_type *_type)
+{
+ struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
+ struct sk_buff *oob;
+
+ oob = skb_dequeue(&rx->recvmsg_oobq);
+ if (oob)
+ *_type = oob->mark;
+ return oob;
+}
+EXPORT_SYMBOL(rxrpc_kernel_dequeue_oob);
+
+/**
+ * rxrpc_kernel_free_oob - Free an out-of-band message
+ * @oob: The OOB message to free
+ *
+ * Free an OOB message along with any resources it holds.
+ */
+void rxrpc_kernel_free_oob(struct sk_buff *oob)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(oob);
+
+ switch (oob->mark) {
+ case RXRPC_OOB_CHALLENGE:
+ rxrpc_put_connection(sp->chall.conn, rxrpc_conn_put_oob);
+ break;
+ }
+
+ rxrpc_free_skb(oob, rxrpc_skb_put_purge_oob);
+}
+EXPORT_SYMBOL(rxrpc_kernel_free_oob);
+
+/**
+ * rxrpc_kernel_query_challenge - Query the parameters of a challenge
+ * @challenge: The challenge to query
+ * @_peer: Where to return the peer record
+ * @_peer_appdata: The application data attached to a peer record
+ * @_service_id: Where to return the connection service ID
+ * @_security_index: Where to return the connection security index
+ *
+ * Extract useful parameters from a CHALLENGE message.
+ */
+void rxrpc_kernel_query_challenge(struct sk_buff *challenge,
+ struct rxrpc_peer **_peer,
+ unsigned long *_peer_appdata,
+ u16 *_service_id, u8 *_security_index)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(challenge);
+
+ *_peer = sp->chall.conn->peer;
+ *_peer_appdata = sp->chall.conn->peer->app_data;
+ *_service_id = sp->hdr.serviceId;
+ *_security_index = sp->hdr.securityIndex;
+}
+EXPORT_SYMBOL(rxrpc_kernel_query_challenge);
+
+/**
+ * rxrpc_kernel_reject_challenge - Allow a kernel service to reject a challenge
+ * @challenge: The challenge to be rejected
+ * @abort_code: The abort code to stick into the ABORT packet
+ * @error: Local error value
+ * @why: Indication as to why.
+ *
+ * Allow a kernel service to reject a challenge by aborting the connection if
+ * it's still in an abortable state. The error is returned so this function
+ * can be used with a return statement.
+ *
+ * Return: The %error parameter.
+ */
+int rxrpc_kernel_reject_challenge(struct sk_buff *challenge, u32 abort_code,
+ int error, enum rxrpc_abort_reason why)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(challenge);
+
+ _enter("{%x},%d,%d,%u", sp->hdr.serial, abort_code, error, why);
+
+ rxrpc_abort_conn(sp->chall.conn, NULL, abort_code, error, why);
+ return error;
+}
+EXPORT_SYMBOL(rxrpc_kernel_reject_challenge);
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index 5ea9601efd05..ef7b3096c95e 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -18,7 +18,7 @@
extern int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
-static ssize_t do_udp_sendmsg(struct socket *socket, struct msghdr *msg, size_t len)
+ssize_t do_udp_sendmsg(struct socket *socket, struct msghdr *msg, size_t len)
{
struct sockaddr *sa = msg->msg_name;
struct sock *sk = socket->sk;
@@ -72,22 +72,96 @@ static void rxrpc_set_keepalive(struct rxrpc_call *call, ktime_t now)
}
/*
+ * Allocate transmission buffers for an ACK and attach them to local->kv[].
+ */
+static int rxrpc_alloc_ack(struct rxrpc_call *call, size_t sack_size)
+{
+ struct rxrpc_wire_header *whdr;
+ struct rxrpc_acktrailer *trailer;
+ struct rxrpc_ackpacket *ack;
+ struct kvec *kv = call->local->kvec;
+ gfp_t gfp = rcu_read_lock_held() ? GFP_ATOMIC | __GFP_NOWARN : GFP_NOFS;
+ void *buf, *buf2 = NULL;
+ u8 *filler;
+
+ buf = page_frag_alloc(&call->local->tx_alloc,
+ sizeof(*whdr) + sizeof(*ack) + 1 + 3 + sizeof(*trailer), gfp);
+ if (!buf)
+ return -ENOMEM;
+
+ if (sack_size) {
+ buf2 = page_frag_alloc(&call->local->tx_alloc, sack_size, gfp);
+ if (!buf2) {
+ page_frag_free(buf);
+ return -ENOMEM;
+ }
+ }
+
+ whdr = buf;
+ ack = buf + sizeof(*whdr);
+ filler = buf + sizeof(*whdr) + sizeof(*ack) + 1;
+ trailer = buf + sizeof(*whdr) + sizeof(*ack) + 1 + 3;
+
+ kv[0].iov_base = whdr;
+ kv[0].iov_len = sizeof(*whdr) + sizeof(*ack);
+ kv[1].iov_base = buf2;
+ kv[1].iov_len = sack_size;
+ kv[2].iov_base = filler;
+ kv[2].iov_len = 3 + sizeof(*trailer);
+ return 3; /* Number of kvec[] used. */
+}
+
+static void rxrpc_free_ack(struct rxrpc_call *call)
+{
+ page_frag_free(call->local->kvec[0].iov_base);
+ if (call->local->kvec[1].iov_base)
+ page_frag_free(call->local->kvec[1].iov_base);
+}
+
+/*
+ * Record the beginning of an RTT probe.
+ */
+static void rxrpc_begin_rtt_probe(struct rxrpc_call *call, rxrpc_serial_t serial,
+ ktime_t now, enum rxrpc_rtt_tx_trace why)
+{
+ unsigned long avail = call->rtt_avail;
+ int rtt_slot = 9;
+
+ if (!(avail & RXRPC_CALL_RTT_AVAIL_MASK))
+ goto no_slot;
+
+ rtt_slot = __ffs(avail & RXRPC_CALL_RTT_AVAIL_MASK);
+ if (!test_and_clear_bit(rtt_slot, &call->rtt_avail))
+ goto no_slot;
+
+ call->rtt_serial[rtt_slot] = serial;
+ call->rtt_sent_at[rtt_slot] = now;
+ smp_wmb(); /* Write data before avail bit */
+ set_bit(rtt_slot + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail);
+
+ trace_rxrpc_rtt_tx(call, why, rtt_slot, serial);
+ return;
+
+no_slot:
+ trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_no_slot, rtt_slot, serial);
+}
+
+/*
* Fill out an ACK packet.
*/
-static void rxrpc_fill_out_ack(struct rxrpc_call *call,
- struct rxrpc_txbuf *txb,
- u8 ack_reason,
- rxrpc_serial_t serial)
+static int rxrpc_fill_out_ack(struct rxrpc_call *call, int nr_kv, u8 ack_reason,
+ rxrpc_serial_t serial_to_ack, rxrpc_serial_t *_ack_serial)
{
- struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base;
- struct rxrpc_acktrailer *trailer = txb->kvec[2].iov_base + 3;
+ struct kvec *kv = call->local->kvec;
+ struct rxrpc_wire_header *whdr = kv[0].iov_base;
+ struct rxrpc_acktrailer *trailer = kv[2].iov_base + 3;
struct rxrpc_ackpacket *ack = (struct rxrpc_ackpacket *)(whdr + 1);
- unsigned int qsize, sack, wrap, to;
+ unsigned int qsize, sack, wrap, to, max_mtu, if_mtu;
rxrpc_seq_t window, wtop;
+ ktime_t now = ktime_get_real();
int rsize;
- u32 mtu, jmax;
- u8 *filler = txb->kvec[2].iov_base;
- u8 *sackp = txb->kvec[1].iov_base;
+ u8 *filler = kv[2].iov_base;
+ u8 *sackp = kv[1].iov_base;
rxrpc_inc_stat(call->rxnet, stat_tx_ack_fill);
@@ -95,14 +169,25 @@ static void rxrpc_fill_out_ack(struct rxrpc_call *call,
wtop = call->ackr_wtop;
sack = call->ackr_sack_base % RXRPC_SACK_SIZE;
+ *_ack_serial = rxrpc_get_next_serial(call->conn);
+
+ whdr->epoch = htonl(call->conn->proto.epoch);
+ whdr->cid = htonl(call->cid);
+ whdr->callNumber = htonl(call->call_id);
+ whdr->serial = htonl(*_ack_serial);
whdr->seq = 0;
whdr->type = RXRPC_PACKET_TYPE_ACK;
- txb->flags |= RXRPC_SLOW_START_OK;
+ whdr->flags = call->conn->out_clientflag | RXRPC_SLOW_START_OK;
+ whdr->userStatus = 0;
+ whdr->securityIndex = call->security_ix;
+ whdr->_rsvd = 0;
+ whdr->serviceId = htons(call->dest_srx.srx_service);
+
ack->bufferSpace = 0;
ack->maxSkew = 0;
ack->firstPacket = htonl(window);
ack->previousPacket = htonl(call->rx_highest_seq);
- ack->serial = htonl(serial);
+ ack->serial = htonl(serial_to_ack);
ack->reason = ack_reason;
ack->nAcks = wtop - window;
filler[0] = 0;
@@ -110,15 +195,13 @@ static void rxrpc_fill_out_ack(struct rxrpc_call *call,
filler[2] = 0;
if (ack_reason == RXRPC_ACK_PING)
- txb->flags |= RXRPC_REQUEST_ACK;
+ whdr->flags |= RXRPC_REQUEST_ACK;
if (after(wtop, window)) {
- txb->len += ack->nAcks;
- txb->kvec[1].iov_base = sackp;
- txb->kvec[1].iov_len = ack->nAcks;
+ kv[1].iov_len = ack->nAcks;
wrap = RXRPC_SACK_SIZE - sack;
- to = min_t(unsigned int, ack->nAcks, RXRPC_SACK_SIZE);
+ to = umin(ack->nAcks, RXRPC_SACK_SIZE);
if (sack + ack->nAcks <= RXRPC_SACK_SIZE) {
memcpy(sackp, call->ackr_sack_table + sack, ack->nAcks);
@@ -132,56 +215,42 @@ static void rxrpc_fill_out_ack(struct rxrpc_call *call,
ack->reason = RXRPC_ACK_IDLE;
}
- mtu = call->peer->if_mtu;
- mtu -= call->peer->hdrsize;
- jmax = rxrpc_rx_jumbo_max;
qsize = (window - 1) - call->rx_consumed;
rsize = max_t(int, call->rx_winsize - qsize, 0);
- txb->ack_rwind = rsize;
- trailer->maxMTU = htonl(rxrpc_rx_mtu);
- trailer->ifMTU = htonl(mtu);
- trailer->rwind = htonl(rsize);
- trailer->jumbo_max = htonl(jmax);
-}
-
-/*
- * Record the beginning of an RTT probe.
- */
-static void rxrpc_begin_rtt_probe(struct rxrpc_call *call, rxrpc_serial_t serial,
- ktime_t now, enum rxrpc_rtt_tx_trace why)
-{
- unsigned long avail = call->rtt_avail;
- int rtt_slot = 9;
-
- if (!(avail & RXRPC_CALL_RTT_AVAIL_MASK))
- goto no_slot;
-
- rtt_slot = __ffs(avail & RXRPC_CALL_RTT_AVAIL_MASK);
- if (!test_and_clear_bit(rtt_slot, &call->rtt_avail))
- goto no_slot;
- call->rtt_serial[rtt_slot] = serial;
- call->rtt_sent_at[rtt_slot] = now;
- smp_wmb(); /* Write data before avail bit */
- set_bit(rtt_slot + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail);
+ if_mtu = call->peer->if_mtu - call->peer->hdrsize;
+ if (call->peer->ackr_adv_pmtud) {
+ max_mtu = umax(call->peer->max_data, rxrpc_rx_mtu);
+ } else {
+ if_mtu = umin(if_mtu, 1444);
+ max_mtu = if_mtu;
+ }
- trace_rxrpc_rtt_tx(call, why, rtt_slot, serial);
- return;
+ trailer->maxMTU = htonl(max_mtu);
+ trailer->ifMTU = htonl(if_mtu);
+ trailer->rwind = htonl(rsize);
+ trailer->jumbo_max = 0; /* Advertise pmtu discovery */
-no_slot:
- trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_no_slot, rtt_slot, serial);
+ if (ack_reason == RXRPC_ACK_PING)
+ rxrpc_begin_rtt_probe(call, *_ack_serial, now, rxrpc_rtt_tx_ping);
+ if (whdr->flags & RXRPC_REQUEST_ACK)
+ call->rtt_last_req = now;
+ rxrpc_set_keepalive(call, now);
+ return nr_kv;
}
/*
* Transmit an ACK packet.
*/
-static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
+static void rxrpc_send_ack_packet(struct rxrpc_call *call, int nr_kv, size_t len,
+ rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why)
{
- struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base;
+ struct kvec *kv = call->local->kvec;
+ struct rxrpc_wire_header *whdr = kv[0].iov_base;
+ struct rxrpc_acktrailer *trailer = kv[2].iov_base + 3;
struct rxrpc_connection *conn;
struct rxrpc_ackpacket *ack = (struct rxrpc_ackpacket *)(whdr + 1);
struct msghdr msg;
- ktime_t now;
int ret;
if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags))
@@ -195,33 +264,34 @@ static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t
msg.msg_controllen = 0;
msg.msg_flags = MSG_SPLICE_PAGES;
- whdr->flags = txb->flags & RXRPC_TXBUF_WIRE_FLAGS;
-
- txb->serial = rxrpc_get_next_serial(conn);
- whdr->serial = htonl(txb->serial);
- trace_rxrpc_tx_ack(call->debug_id, txb->serial,
+ trace_rxrpc_tx_ack(call->debug_id, serial,
ntohl(ack->firstPacket),
ntohl(ack->serial), ack->reason, ack->nAcks,
- txb->ack_rwind);
+ ntohl(trailer->rwind), why);
rxrpc_inc_stat(call->rxnet, stat_tx_ack_send);
- iov_iter_kvec(&msg.msg_iter, WRITE, txb->kvec, txb->nr_kvec, txb->len);
- rxrpc_local_dont_fragment(conn->local, false);
- ret = do_udp_sendmsg(conn->local->socket, &msg, txb->len);
+ iov_iter_kvec(&msg.msg_iter, WRITE, kv, nr_kv, len);
+ rxrpc_local_dont_fragment(conn->local, why == rxrpc_propose_ack_ping_for_mtu_probe);
+
+ ret = do_udp_sendmsg(conn->local->socket, &msg, len);
call->peer->last_tx_at = ktime_get_seconds();
if (ret < 0) {
- trace_rxrpc_tx_fail(call->debug_id, txb->serial, ret,
+ trace_rxrpc_tx_fail(call->debug_id, serial, ret,
rxrpc_tx_point_call_ack);
+ if (why == rxrpc_propose_ack_ping_for_mtu_probe &&
+ ret == -EMSGSIZE)
+ rxrpc_input_probe_for_pmtud(conn, serial, true);
} else {
trace_rxrpc_tx_packet(call->debug_id, whdr,
rxrpc_tx_point_call_ack);
- now = ktime_get_real();
- if (ack->reason == RXRPC_ACK_PING)
- rxrpc_begin_rtt_probe(call, txb->serial, now, rxrpc_rtt_tx_ping);
- if (txb->flags & RXRPC_REQUEST_ACK)
- call->peer->rtt_last_req = now;
- rxrpc_set_keepalive(call, now);
+ if (why == rxrpc_propose_ack_ping_for_mtu_probe) {
+ call->peer->pmtud_pending = false;
+ call->peer->pmtud_probing = true;
+ call->conn->pmtud_probe = serial;
+ call->conn->pmtud_call = call->debug_id;
+ trace_rxrpc_pmtud_tx(call);
+ }
}
rxrpc_tx_backoff(call, ret);
}
@@ -230,31 +300,62 @@ static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t
* Queue an ACK for immediate transmission.
*/
void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason,
- rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why)
+ rxrpc_serial_t serial_to_ack, enum rxrpc_propose_ack_trace why)
{
- struct rxrpc_txbuf *txb;
+ struct kvec *kv = call->local->kvec;
+ rxrpc_serial_t ack_serial;
+ size_t len;
+ int nr_kv;
if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags))
return;
rxrpc_inc_stat(call->rxnet, stat_tx_acks[ack_reason]);
- txb = rxrpc_alloc_ack_txbuf(call, call->ackr_wtop - call->ackr_window);
- if (!txb) {
+ nr_kv = rxrpc_alloc_ack(call, call->ackr_wtop - call->ackr_window);
+ if (nr_kv < 0) {
kleave(" = -ENOMEM");
return;
}
- txb->ack_why = why;
+ nr_kv = rxrpc_fill_out_ack(call, nr_kv, ack_reason, serial_to_ack, &ack_serial);
+ len = kv[0].iov_len;
+ len += kv[1].iov_len;
+ len += kv[2].iov_len;
+
+ /* Extend a path MTU probe ACK. */
+ if (why == rxrpc_propose_ack_ping_for_mtu_probe) {
+ size_t probe_mtu = call->peer->pmtud_trial + sizeof(struct rxrpc_wire_header);
+
+ if (len > probe_mtu)
+ goto skip;
+ while (len < probe_mtu) {
+ size_t part = umin(probe_mtu - len, PAGE_SIZE);
+
+ kv[nr_kv].iov_base = page_address(ZERO_PAGE(0));
+ kv[nr_kv].iov_len = part;
+ len += part;
+ nr_kv++;
+ }
+ }
- rxrpc_fill_out_ack(call, txb, ack_reason, serial);
call->ackr_nr_unacked = 0;
atomic_set(&call->ackr_nr_consumed, 0);
clear_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags);
- trace_rxrpc_send_ack(call, why, ack_reason, serial);
- rxrpc_send_ack_packet(call, txb);
- rxrpc_put_txbuf(txb, rxrpc_txbuf_put_ack_tx);
+ trace_rxrpc_send_ack(call, why, ack_reason, ack_serial);
+ rxrpc_send_ack_packet(call, nr_kv, len, ack_serial, why);
+skip:
+ rxrpc_free_ack(call);
+}
+
+/*
+ * Send an ACK probe for path MTU discovery.
+ */
+void rxrpc_send_probe_for_pmtud(struct rxrpc_call *call)
+{
+ rxrpc_send_ACK(call, RXRPC_ACK_PING, 0,
+ rxrpc_propose_ack_ping_for_mtu_probe);
}
/*
@@ -324,14 +425,21 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call)
/*
* Prepare a (sub)packet for transmission.
*/
-static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_txbuf *txb,
- rxrpc_serial_t serial)
+static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call,
+ struct rxrpc_send_data_req *req,
+ struct rxrpc_txbuf *txb,
+ struct rxrpc_wire_header *whdr,
+ rxrpc_serial_t serial, int subpkt)
{
- struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base;
+ struct rxrpc_jumbo_header *jumbo = txb->data - sizeof(*jumbo);
enum rxrpc_req_ack_trace why;
struct rxrpc_connection *conn = call->conn;
+ struct kvec *kv = &call->local->kvec[1 + subpkt];
+ size_t len = txb->pkt_len;
+ bool last;
+ u8 flags;
- _enter("%x,{%d}", txb->seq, txb->len);
+ _enter("%x,%zd", txb->seq, len);
txb->serial = serial;
@@ -339,6 +447,15 @@ static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_t
txb->seq == 1)
whdr->userStatus = RXRPC_USERSTATUS_SERVICE_UPGRADE;
+ txb->flags &= ~RXRPC_REQUEST_ACK;
+ flags = txb->flags & RXRPC_TXBUF_WIRE_FLAGS;
+ last = txb->flags & RXRPC_LAST_PACKET;
+
+ if (subpkt < req->n - 1) {
+ len = RXRPC_JUMBO_DATALEN;
+ goto dont_set_request_ack;
+ }
+
/* If our RTT cache needs working on, request an ACK. Also request
* ACKs if a DATA packet appears to have been lost.
*
@@ -346,113 +463,208 @@ static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_t
* service call, lest OpenAFS incorrectly send us an ACK with some
* soft-ACKs in it and then never follow up with a proper hard ACK.
*/
- if (txb->flags & RXRPC_REQUEST_ACK)
- why = rxrpc_reqack_already_on;
- else if ((txb->flags & RXRPC_LAST_PACKET) && rxrpc_sending_to_client(txb))
+ if (last && rxrpc_sending_to_client(txb))
why = rxrpc_reqack_no_srv_last;
else if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events))
why = rxrpc_reqack_ack_lost;
else if (txb->flags & RXRPC_TXBUF_RESENT)
why = rxrpc_reqack_retrans;
- else if (call->cong_mode == RXRPC_CALL_SLOW_START && call->cong_cwnd <= 2)
+ else if (call->cong_ca_state == RXRPC_CA_SLOW_START && call->cong_cwnd <= RXRPC_MIN_CWND)
why = rxrpc_reqack_slow_start;
else if (call->tx_winsize <= 2)
why = rxrpc_reqack_small_txwin;
- else if (call->peer->rtt_count < 3 && txb->seq & 1)
+ else if (call->rtt_count < 3)
why = rxrpc_reqack_more_rtt;
- else if (ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), ktime_get_real()))
+ else if (ktime_before(ktime_add_ms(call->rtt_last_req, 1000), ktime_get_real()))
why = rxrpc_reqack_old_rtt;
+ else if (!last && !after(READ_ONCE(call->send_top), txb->seq))
+ why = rxrpc_reqack_app_stall;
else
goto dont_set_request_ack;
rxrpc_inc_stat(call->rxnet, stat_why_req_ack[why]);
trace_rxrpc_req_ack(call->debug_id, txb->seq, why);
- if (why != rxrpc_reqack_no_srv_last)
- txb->flags |= RXRPC_REQUEST_ACK;
+ if (why != rxrpc_reqack_no_srv_last) {
+ flags |= RXRPC_REQUEST_ACK;
+ trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_data, -1, serial);
+ call->rtt_last_req = req->now;
+ }
dont_set_request_ack:
- whdr->flags = txb->flags & RXRPC_TXBUF_WIRE_FLAGS;
- whdr->serial = htonl(txb->serial);
- whdr->cksum = txb->cksum;
+ /* There's a jumbo header prepended to the data if we need it. */
+ if (subpkt < req->n - 1)
+ flags |= RXRPC_JUMBO_PACKET;
+ else
+ flags &= ~RXRPC_JUMBO_PACKET;
+ if (subpkt == 0) {
+ whdr->flags = flags;
+ whdr->cksum = txb->cksum;
+ kv->iov_base = txb->data;
+ } else {
+ jumbo->flags = flags;
+ jumbo->pad = 0;
+ jumbo->cksum = txb->cksum;
+ kv->iov_base = jumbo;
+ len += sizeof(*jumbo);
+ }
- trace_rxrpc_tx_data(call, txb->seq, txb->serial, txb->flags, false);
+ trace_rxrpc_tx_data(call, txb->seq, txb->serial, flags, req->trace);
+ kv->iov_len = len;
+ return len;
}
/*
- * Prepare a packet for transmission.
+ * Prepare a transmission queue object for initial transmission. Returns the
+ * number of microseconds since the transmission queue base timestamp.
*/
-static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
+static unsigned int rxrpc_prepare_txqueue(struct rxrpc_txqueue *tq,
+ struct rxrpc_send_data_req *req)
{
- rxrpc_serial_t serial;
-
- /* Each transmission of a Tx packet needs a new serial number */
- serial = rxrpc_get_next_serial(call->conn);
-
- rxrpc_prepare_data_subpacket(call, txb, serial);
-
- return txb->len;
+ if (!tq)
+ return 0;
+ if (tq->xmit_ts_base == KTIME_MIN) {
+ tq->xmit_ts_base = req->now;
+ return 0;
+ }
+ return ktime_to_us(ktime_sub(req->now, tq->xmit_ts_base));
}
/*
- * Set timeouts after transmitting a packet.
+ * Prepare a (jumbo) packet for transmission.
*/
-static void rxrpc_tstamp_data_packets(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
+static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call,
+ struct rxrpc_send_data_req *req,
+ struct rxrpc_wire_header *whdr)
{
- ktime_t now = ktime_get_real();
- bool ack_requested = txb->flags & RXRPC_REQUEST_ACK;
+ struct rxrpc_txqueue *tq = req->tq;
+ rxrpc_serial_t serial;
+ unsigned int xmit_ts;
+ rxrpc_seq_t seq = req->seq;
+ size_t len = 0;
+ bool start_tlp = false;
- call->tx_last_sent = now;
- txb->last_sent = now;
+ trace_rxrpc_tq(call, tq, seq, rxrpc_tq_transmit);
- if (ack_requested) {
- rxrpc_begin_rtt_probe(call, txb->serial, now, rxrpc_rtt_tx_data);
+ /* Each transmission of a Tx packet needs a new serial number */
+ serial = rxrpc_get_next_serials(call->conn, req->n);
+
+ whdr->epoch = htonl(call->conn->proto.epoch);
+ whdr->cid = htonl(call->cid);
+ whdr->callNumber = htonl(call->call_id);
+ whdr->seq = htonl(seq);
+ whdr->serial = htonl(serial);
+ whdr->type = RXRPC_PACKET_TYPE_DATA;
+ whdr->flags = 0;
+ whdr->userStatus = 0;
+ whdr->securityIndex = call->security_ix;
+ whdr->_rsvd = 0;
+ whdr->serviceId = htons(call->conn->service_id);
+
+ call->tx_last_serial = serial + req->n - 1;
+ call->tx_last_sent = req->now;
+ xmit_ts = rxrpc_prepare_txqueue(tq, req);
+ prefetch(tq->next);
+
+ for (int i = 0;;) {
+ int ix = seq & RXRPC_TXQ_MASK;
+ struct rxrpc_txbuf *txb = tq->bufs[seq & RXRPC_TXQ_MASK];
+
+ _debug("prep[%u] tq=%x q=%x", i, tq->qbase, seq);
+
+ /* Record (re-)transmission for RACK [RFC8985 6.1]. */
+ if (__test_and_clear_bit(ix, &tq->segment_lost))
+ call->tx_nr_lost--;
+ if (req->retrans) {
+ __set_bit(ix, &tq->ever_retransmitted);
+ __set_bit(ix, &tq->segment_retransmitted);
+ call->tx_nr_resent++;
+ } else {
+ call->tx_nr_sent++;
+ start_tlp = true;
+ }
+ tq->segment_xmit_ts[ix] = xmit_ts;
+ tq->segment_serial[ix] = serial;
+ if (i + 1 == req->n)
+ /* Only sample the last subpacket in a jumbo. */
+ __set_bit(ix, &tq->rtt_samples);
+ len += rxrpc_prepare_data_subpacket(call, req, txb, whdr, serial, i);
+ serial++;
+ seq++;
+ i++;
+ if (i >= req->n)
+ break;
+ if (!(seq & RXRPC_TXQ_MASK)) {
+ tq = tq->next;
+ trace_rxrpc_tq(call, tq, seq, rxrpc_tq_transmit_advance);
+ xmit_ts = rxrpc_prepare_txqueue(tq, req);
+ }
+ }
- call->peer->rtt_last_req = now;
- if (call->peer->rtt_count > 1) {
- ktime_t delay = rxrpc_get_rto_backoff(call->peer, false);
+ /* Set timeouts */
+ if (req->tlp_probe) {
+ /* Sending TLP loss probe [RFC8985 7.3]. */
+ call->tlp_serial = serial - 1;
+ call->tlp_seq = seq - 1;
+ } else if (start_tlp) {
+ /* Schedule TLP loss probe [RFC8985 7.2]. */
+ ktime_t pto;
+
+ if (!test_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags))
+ /* The first packet may take longer to elicit a response. */
+ pto = NSEC_PER_SEC;
+ else
+ pto = rxrpc_tlp_calc_pto(call, req->now);
- call->ack_lost_at = ktime_add(now, delay);
- trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_lost_ack);
- }
+ call->rack_timer_mode = RXRPC_CALL_RACKTIMER_TLP_PTO;
+ call->rack_timo_at = ktime_add(req->now, pto);
+ trace_rxrpc_rack_timer(call, pto, false);
+ trace_rxrpc_timer_set(call, pto, rxrpc_timer_trace_rack_tlp_pto);
}
if (!test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags)) {
ktime_t delay = ms_to_ktime(READ_ONCE(call->next_rx_timo));
- call->expect_rx_by = ktime_add(now, delay);
+ call->expect_rx_by = ktime_add(req->now, delay);
trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_expect_rx);
}
- rxrpc_set_keepalive(call, now);
+ rxrpc_set_keepalive(call, req->now);
+ page_frag_free(whdr);
+ return len;
}
/*
- * send a packet through the transport endpoint
+ * Send one or more packets through the transport endpoint
*/
-static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
+void rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req *req)
{
- struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base;
+ struct rxrpc_wire_header *whdr;
struct rxrpc_connection *conn = call->conn;
enum rxrpc_tx_point frag;
+ struct rxrpc_txqueue *tq = req->tq;
+ struct rxrpc_txbuf *txb;
struct msghdr msg;
- size_t len;
- int ret;
+ rxrpc_seq_t seq = req->seq;
+ size_t len = sizeof(*whdr);
+ bool new_call = test_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags);
+ int ret, stat_ix;
- _enter("%x,{%d}", txb->seq, txb->len);
+ _enter("%x,%x-%x", tq->qbase, seq, seq + req->n - 1);
- len = rxrpc_prepare_data_packet(call, txb);
+ whdr = page_frag_alloc(&call->local->tx_alloc, sizeof(*whdr), GFP_NOFS);
+ if (!whdr)
+ return; /* Drop the packet if no memory. */
- if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) {
- static int lose;
- if ((lose++ & 7) == 7) {
- ret = 0;
- trace_rxrpc_tx_data(call, txb->seq, txb->serial,
- txb->flags, true);
- goto done;
- }
- }
+ call->local->kvec[0].iov_base = whdr;
+ call->local->kvec[0].iov_len = sizeof(*whdr);
+
+ stat_ix = umin(req->n, ARRAY_SIZE(call->rxnet->stat_tx_jumbo)) - 1;
+ atomic_inc(&call->rxnet->stat_tx_jumbo[stat_ix]);
- iov_iter_kvec(&msg.msg_iter, WRITE, txb->kvec, txb->nr_kvec, len);
+ len += rxrpc_prepare_data_packet(call, req, whdr);
+ txb = tq->bufs[seq & RXRPC_TXQ_MASK];
+
+ iov_iter_kvec(&msg.msg_iter, WRITE, call->local->kvec, 1 + req->n, len);
msg.msg_name = &call->peer->srx.transport;
msg.msg_namelen = call->peer->srx.transport_len;
@@ -460,16 +672,11 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t
msg.msg_controllen = 0;
msg.msg_flags = MSG_SPLICE_PAGES;
- /* Track what we've attempted to transmit at least once so that the
- * retransmission algorithm doesn't try to resend what we haven't sent
- * yet.
+ /* Send the packet with the don't fragment bit set unless we think it's
+ * too big or if this is a retransmission.
*/
- if (txb->seq == call->tx_transmitted + 1)
- call->tx_transmitted = txb->seq;
-
- /* send the packet with the don't fragment bit set if we currently
- * think it's small enough */
- if (txb->len >= call->peer->maxdata) {
+ if (seq == call->tx_transmitted + 1 &&
+ len >= sizeof(struct rxrpc_wire_header) + call->peer->max_data) {
rxrpc_local_dont_fragment(conn->local, false);
frag = rxrpc_tx_point_call_data_frag;
} else {
@@ -477,7 +684,25 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t
frag = rxrpc_tx_point_call_data_nofrag;
}
-retry:
+ /* Track what we've attempted to transmit at least once so that the
+ * retransmission algorithm doesn't try to resend what we haven't sent
+ * yet.
+ */
+ if (seq == call->tx_transmitted + 1)
+ call->tx_transmitted = seq + req->n - 1;
+
+ if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) {
+ static int lose;
+
+ if ((lose++ & 7) == 7) {
+ ret = 0;
+ trace_rxrpc_tx_data(call, txb->seq, txb->serial, txb->flags,
+ rxrpc_txdata_inject_loss);
+ conn->peer->last_tx_at = ktime_get_seconds();
+ goto done;
+ }
+ }
+
/* send the packet by UDP
* - returns -EMSGSIZE if UDP would have to fragment the packet
* to go out of the interface
@@ -488,7 +713,11 @@ retry:
ret = do_udp_sendmsg(conn->local->socket, &msg, len);
conn->peer->last_tx_at = ktime_get_seconds();
- if (ret < 0) {
+ if (ret == -EMSGSIZE) {
+ rxrpc_inc_stat(call->rxnet, stat_tx_data_send_msgsize);
+ trace_rxrpc_tx_packet(call->debug_id, whdr, frag);
+ ret = 0;
+ } else if (ret < 0) {
rxrpc_inc_stat(call->rxnet, stat_tx_data_send_fail);
trace_rxrpc_tx_fail(call->debug_id, txb->serial, ret, frag);
} else {
@@ -496,28 +725,23 @@ retry:
}
rxrpc_tx_backoff(call, ret);
- if (ret == -EMSGSIZE && frag == rxrpc_tx_point_call_data_frag) {
- rxrpc_local_dont_fragment(conn->local, false);
- frag = rxrpc_tx_point_call_data_frag;
- goto retry;
- }
-done:
- if (ret >= 0) {
- rxrpc_tstamp_data_packets(call, txb);
- } else {
- /* Cancel the call if the initial transmission fails,
- * particularly if that's due to network routing issues that
- * aren't going away anytime soon. The layer above can arrange
- * the retransmission.
+ if (ret < 0) {
+ /* Cancel the call if the initial transmission fails or if we
+ * hit due to network routing issues that aren't going away
+ * anytime soon. The layer above can arrange the
+ * retransmission.
*/
- if (!test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags))
+ if (new_call ||
+ ret == -ENETUNREACH ||
+ ret == -EHOSTUNREACH ||
+ ret == -ECONNREFUSED)
rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR,
RX_USER_ABORT, ret);
}
- _leave(" = %d [%u]", ret, call->peer->maxdata);
- return ret;
+done:
+ _leave(" = %d [%u]", ret, call->peer->max_data);
}
/*
@@ -694,39 +918,62 @@ void rxrpc_send_keepalive(struct rxrpc_peer *peer)
}
/*
- * Schedule an instant Tx resend.
+ * Send a RESPONSE message.
*/
-static inline void rxrpc_instant_resend(struct rxrpc_call *call,
- struct rxrpc_txbuf *txb)
+void rxrpc_send_response(struct rxrpc_connection *conn, struct sk_buff *response)
{
- if (!__rxrpc_call_is_complete(call))
- kdebug("resend");
-}
+ struct rxrpc_skb_priv *sp = rxrpc_skb(response);
+ struct scatterlist sg[16];
+ struct bio_vec *bvec = conn->local->bvec;
+ struct msghdr msg;
+ size_t len = sp->resp.len;
+ __be32 wserial;
+ u32 serial = 0;
+ int ret, nr_sg;
-/*
- * Transmit one packet.
- */
-void rxrpc_transmit_one(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
-{
- int ret;
+ _enter("C=%x,%x", conn->debug_id, sp->resp.challenge_serial);
- ret = rxrpc_send_data_packet(call, txb);
- if (ret < 0) {
- switch (ret) {
- case -ENETUNREACH:
- case -EHOSTUNREACH:
- case -ECONNREFUSED:
- rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR,
- 0, ret);
- break;
- default:
- _debug("need instant resend %d", ret);
- rxrpc_instant_resend(call, txb);
- }
- } else {
- ktime_t delay = ns_to_ktime(call->peer->rto_us * NSEC_PER_USEC);
+ sg_init_table(sg, ARRAY_SIZE(sg));
+ ret = skb_to_sgvec(response, sg, 0, len);
+ if (ret < 0)
+ goto fail;
+ nr_sg = ret;
+ ret = -EIO;
+ if (WARN_ON_ONCE(nr_sg > ARRAY_SIZE(conn->local->bvec)))
+ goto fail;
- call->resend_at = ktime_add(ktime_get_real(), delay);
- trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_resend_tx);
- }
+ for (int i = 0; i < nr_sg; i++)
+ bvec_set_page(&bvec[i], sg_page(&sg[i]), sg[i].length, sg[i].offset);
+
+ iov_iter_bvec(&msg.msg_iter, WRITE, bvec, nr_sg, len);
+
+ msg.msg_name = &conn->peer->srx.transport;
+ msg.msg_namelen = conn->peer->srx.transport_len;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = MSG_SPLICE_PAGES;
+
+ serial = rxrpc_get_next_serials(conn, 1);
+ wserial = htonl(serial);
+
+ trace_rxrpc_tx_response(conn, serial, sp);
+
+ ret = skb_store_bits(response, offsetof(struct rxrpc_wire_header, serial),
+ &wserial, sizeof(wserial));
+ if (ret < 0)
+ goto fail;
+
+ rxrpc_local_dont_fragment(conn->local, false);
+
+ ret = do_udp_sendmsg(conn->local->socket, &msg, len);
+ if (ret < 0)
+ goto fail;
+
+ conn->peer->last_tx_at = ktime_get_seconds();
+ return;
+
+fail:
+ trace_rxrpc_tx_fail(conn->debug_id, serial, ret,
+ rxrpc_tx_point_response);
+ kleave(" = %d", ret);
}
diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c
index 552ba84a255c..7f4729234957 100644
--- a/net/rxrpc/peer_event.c
+++ b/net/rxrpc/peer_event.c
@@ -102,6 +102,8 @@ static struct rxrpc_peer *rxrpc_lookup_peer_local_rcu(struct rxrpc_local *local,
*/
static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu)
{
+ unsigned int max_data;
+
/* wind down the local interface MTU */
if (mtu > 0 && peer->if_mtu == 65535 && mtu < peer->if_mtu)
peer->if_mtu = mtu;
@@ -120,11 +122,15 @@ static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu)
}
}
- if (mtu < peer->mtu) {
- spin_lock(&peer->lock);
- peer->mtu = mtu;
- peer->maxdata = peer->mtu - peer->hdrsize;
- spin_unlock(&peer->lock);
+ max_data = max_t(int, mtu - peer->hdrsize, 500);
+ if (max_data < peer->max_data) {
+ if (peer->pmtud_good > max_data)
+ peer->pmtud_good = max_data;
+ if (peer->pmtud_bad > max_data + 1)
+ peer->pmtud_bad = max_data + 1;
+
+ trace_rxrpc_pmtud_reduce(peer, 0, max_data, rxrpc_pmtud_reduce_icmp);
+ peer->max_data = max_data;
}
}
@@ -161,6 +167,13 @@ void rxrpc_input_error(struct rxrpc_local *local, struct sk_buff *skb)
goto out;
}
+ if ((serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6 &&
+ serr->ee.ee_type == ICMPV6_PKT_TOOBIG &&
+ serr->ee.ee_code == 0)) {
+ rxrpc_adjust_mtu(peer, serr->ee.ee_info);
+ goto out;
+ }
+
rxrpc_store_error(peer, skb);
out:
rxrpc_put_peer(peer, rxrpc_peer_put_input_error);
@@ -205,23 +218,23 @@ static void rxrpc_distribute_error(struct rxrpc_peer *peer, struct sk_buff *skb,
struct rxrpc_call *call;
HLIST_HEAD(error_targets);
- spin_lock(&peer->lock);
+ spin_lock_irq(&peer->lock);
hlist_move_list(&peer->error_targets, &error_targets);
while (!hlist_empty(&error_targets)) {
call = hlist_entry(error_targets.first,
struct rxrpc_call, error_link);
hlist_del_init(&call->error_link);
- spin_unlock(&peer->lock);
+ spin_unlock_irq(&peer->lock);
rxrpc_see_call(call, rxrpc_call_see_distribute_error);
rxrpc_set_call_completion(call, compl, 0, -err);
- rxrpc_input_call_event(call, skb);
+ rxrpc_input_call_event(call);
- spin_lock(&peer->lock);
+ spin_lock_irq(&peer->lock);
}
- spin_unlock(&peer->lock);
+ spin_unlock_irq(&peer->lock);
}
/*
@@ -238,7 +251,7 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet,
bool use;
int slot;
- spin_lock(&rxnet->peer_hash_lock);
+ spin_lock_bh(&rxnet->peer_hash_lock);
while (!list_empty(collector)) {
peer = list_entry(collector->next,
@@ -249,7 +262,7 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet,
continue;
use = __rxrpc_use_local(peer->local, rxrpc_local_use_peer_keepalive);
- spin_unlock(&rxnet->peer_hash_lock);
+ spin_unlock_bh(&rxnet->peer_hash_lock);
if (use) {
keepalive_at = peer->last_tx_at + RXRPC_KEEPALIVE_TIME;
@@ -269,17 +282,17 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet,
*/
slot += cursor;
slot &= mask;
- spin_lock(&rxnet->peer_hash_lock);
+ spin_lock_bh(&rxnet->peer_hash_lock);
list_add_tail(&peer->keepalive_link,
&rxnet->peer_keepalive[slot & mask]);
- spin_unlock(&rxnet->peer_hash_lock);
+ spin_unlock_bh(&rxnet->peer_hash_lock);
rxrpc_unuse_local(peer->local, rxrpc_local_unuse_peer_keepalive);
}
rxrpc_put_peer(peer, rxrpc_peer_put_keepalive);
- spin_lock(&rxnet->peer_hash_lock);
+ spin_lock_bh(&rxnet->peer_hash_lock);
}
- spin_unlock(&rxnet->peer_hash_lock);
+ spin_unlock_bh(&rxnet->peer_hash_lock);
}
/*
@@ -309,7 +322,7 @@ void rxrpc_peer_keepalive_worker(struct work_struct *work)
* second; the bucket at cursor + 1 goes at now + 1s and so
* on...
*/
- spin_lock(&rxnet->peer_hash_lock);
+ spin_lock_bh(&rxnet->peer_hash_lock);
list_splice_init(&rxnet->peer_keepalive_new, &collector);
stop = cursor + ARRAY_SIZE(rxnet->peer_keepalive);
@@ -321,7 +334,7 @@ void rxrpc_peer_keepalive_worker(struct work_struct *work)
}
base = now;
- spin_unlock(&rxnet->peer_hash_lock);
+ spin_unlock_bh(&rxnet->peer_hash_lock);
rxnet->peer_keepalive_base = base;
rxnet->peer_keepalive_cursor = cursor;
@@ -347,3 +360,84 @@ void rxrpc_peer_keepalive_worker(struct work_struct *work)
_leave("");
}
+
+/*
+ * Do path MTU probing.
+ */
+void rxrpc_input_probe_for_pmtud(struct rxrpc_connection *conn, rxrpc_serial_t acked_serial,
+ bool sendmsg_fail)
+{
+ struct rxrpc_peer *peer = conn->peer;
+ unsigned int max_data = peer->max_data;
+ int good, trial, bad, jumbo;
+
+ good = peer->pmtud_good;
+ trial = peer->pmtud_trial;
+ bad = peer->pmtud_bad;
+ if (good >= bad - 1) {
+ conn->pmtud_probe = 0;
+ peer->pmtud_lost = false;
+ return;
+ }
+
+ if (!peer->pmtud_probing)
+ goto send_probe;
+
+ if (sendmsg_fail || after(acked_serial, conn->pmtud_probe)) {
+ /* Retry a lost probe. */
+ if (!peer->pmtud_lost) {
+ trace_rxrpc_pmtud_lost(conn, acked_serial);
+ conn->pmtud_probe = 0;
+ peer->pmtud_lost = true;
+ goto send_probe;
+ }
+
+ /* The probed size didn't seem to get through. */
+ bad = trial;
+ peer->pmtud_bad = bad;
+ if (bad <= max_data)
+ max_data = bad - 1;
+ } else {
+ /* It did get through. */
+ good = trial;
+ peer->pmtud_good = good;
+ if (good > max_data)
+ max_data = good;
+ }
+
+ max_data = umin(max_data, peer->ackr_max_data);
+ if (max_data != peer->max_data)
+ peer->max_data = max_data;
+
+ jumbo = max_data + sizeof(struct rxrpc_jumbo_header);
+ jumbo /= RXRPC_JUMBO_SUBPKTLEN;
+ peer->pmtud_jumbo = jumbo;
+
+ trace_rxrpc_pmtud_rx(conn, acked_serial);
+ conn->pmtud_probe = 0;
+ peer->pmtud_lost = false;
+
+ if (good < RXRPC_JUMBO(2) && bad > RXRPC_JUMBO(2))
+ trial = RXRPC_JUMBO(2);
+ else if (good < RXRPC_JUMBO(4) && bad > RXRPC_JUMBO(4))
+ trial = RXRPC_JUMBO(4);
+ else if (good < RXRPC_JUMBO(3) && bad > RXRPC_JUMBO(3))
+ trial = RXRPC_JUMBO(3);
+ else if (good < RXRPC_JUMBO(6) && bad > RXRPC_JUMBO(6))
+ trial = RXRPC_JUMBO(6);
+ else if (good < RXRPC_JUMBO(5) && bad > RXRPC_JUMBO(5))
+ trial = RXRPC_JUMBO(5);
+ else if (good < RXRPC_JUMBO(8) && bad > RXRPC_JUMBO(8))
+ trial = RXRPC_JUMBO(8);
+ else if (good < RXRPC_JUMBO(7) && bad > RXRPC_JUMBO(7))
+ trial = RXRPC_JUMBO(7);
+ else
+ trial = (good + bad) / 2;
+ peer->pmtud_trial = trial;
+
+ if (good >= bad)
+ return;
+
+send_probe:
+ peer->pmtud_pending = true;
+}
diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c
index 49dcda67a0d5..e2f35e6c04d6 100644
--- a/net/rxrpc/peer_object.c
+++ b/net/rxrpc/peer_object.c
@@ -162,6 +162,11 @@ static void rxrpc_assess_MTU_size(struct rxrpc_local *local,
#endif
peer->if_mtu = 1500;
+ if (peer->max_data < peer->if_mtu - peer->hdrsize) {
+ trace_rxrpc_pmtud_reduce(peer, 0, peer->if_mtu - peer->hdrsize,
+ rxrpc_pmtud_reduce_route);
+ peer->max_data = peer->if_mtu - peer->hdrsize;
+ }
memset(&fl, 0, sizeof(fl));
switch (peer->srx.transport.family) {
@@ -199,8 +204,16 @@ static void rxrpc_assess_MTU_size(struct rxrpc_local *local,
}
peer->if_mtu = dst_mtu(dst);
+ peer->hdrsize += dst->header_len + dst->trailer_len;
+ peer->tx_seg_max = dst->dev->gso_max_segs;
dst_release(dst);
+ peer->max_data = umin(RXRPC_JUMBO(1), peer->if_mtu - peer->hdrsize);
+ peer->pmtud_good = 500;
+ peer->pmtud_bad = peer->if_mtu - peer->hdrsize + 1;
+ peer->pmtud_trial = umin(peer->max_data, peer->pmtud_bad - 1);
+ peer->pmtud_pending = true;
+
_leave(" [if_mtu %u]", peer->if_mtu);
}
@@ -222,11 +235,8 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp,
peer->service_conns = RB_ROOT;
seqlock_init(&peer->service_conn_lock);
spin_lock_init(&peer->lock);
- spin_lock_init(&peer->rtt_input_lock);
peer->debug_id = atomic_inc_return(&rxrpc_debug_id);
-
- rxrpc_peer_init_rtt(peer);
-
+ peer->recent_srtt_us = UINT_MAX;
peer->cong_ssthresh = RXRPC_TX_MAX_WINDOW;
trace_rxrpc_peer(peer->debug_id, 1, why);
}
@@ -242,9 +252,7 @@ static void rxrpc_init_peer(struct rxrpc_local *local, struct rxrpc_peer *peer,
unsigned long hash_key)
{
peer->hash_key = hash_key;
- rxrpc_assess_MTU_size(local, peer);
- peer->mtu = peer->if_mtu;
- peer->rtt_last_req = ktime_get_real();
+
switch (peer->srx.transport.family) {
case AF_INET:
@@ -268,7 +276,9 @@ static void rxrpc_init_peer(struct rxrpc_local *local, struct rxrpc_peer *peer,
}
peer->hdrsize += sizeof(struct rxrpc_wire_header);
- peer->maxdata = peer->mtu - peer->hdrsize;
+ peer->max_data = peer->if_mtu - peer->hdrsize;
+
+ rxrpc_assess_MTU_size(local, peer);
}
/*
@@ -304,6 +314,7 @@ static void rxrpc_free_peer(struct rxrpc_peer *peer)
* Set up a new incoming peer. There shouldn't be any other matching peers
* since we've already done a search in the list from the non-reentrant context
* (the data_ready handler) that is the only place we can add new peers.
+ * Called with interrupts disabled.
*/
void rxrpc_new_incoming_peer(struct rxrpc_local *local, struct rxrpc_peer *peer)
{
@@ -348,7 +359,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local,
return NULL;
}
- spin_lock(&rxnet->peer_hash_lock);
+ spin_lock_bh(&rxnet->peer_hash_lock);
/* Need to check that we aren't racing with someone else */
peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key);
@@ -361,7 +372,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local,
&rxnet->peer_keepalive_new);
}
- spin_unlock(&rxnet->peer_hash_lock);
+ spin_unlock_bh(&rxnet->peer_hash_lock);
if (peer)
rxrpc_free_peer(candidate);
@@ -411,10 +422,10 @@ static void __rxrpc_put_peer(struct rxrpc_peer *peer)
ASSERT(hlist_empty(&peer->error_targets));
- spin_lock(&rxnet->peer_hash_lock);
+ spin_lock_bh(&rxnet->peer_hash_lock);
hash_del_rcu(&peer->hash_link);
list_del_init(&peer->keepalive_link);
- spin_unlock(&rxnet->peer_hash_lock);
+ spin_unlock_bh(&rxnet->peer_hash_lock);
rxrpc_free_peer(peer);
}
@@ -450,7 +461,7 @@ void rxrpc_destroy_all_peers(struct rxrpc_net *rxnet)
continue;
hlist_for_each_entry(peer, &rxnet->peer_hash[i], hash_link) {
- pr_err("Leaked peer %u {%u} %pISp\n",
+ pr_err("Leaked peer %x {%u} %pISp\n",
peer->debug_id,
refcount_read(&peer->ref),
&peer->srx.transport);
@@ -464,10 +475,12 @@ void rxrpc_destroy_all_peers(struct rxrpc_net *rxnet)
* @call: The call to query
*
* Get a record for the remote peer in a call.
+ *
+ * Return: The call's peer record.
*/
struct rxrpc_peer *rxrpc_kernel_get_call_peer(struct socket *sock, struct rxrpc_call *call)
{
- return call->peer;
+ return rxrpc_get_peer(call->peer, rxrpc_peer_get_application);
}
EXPORT_SYMBOL(rxrpc_kernel_get_call_peer);
@@ -475,11 +488,13 @@ EXPORT_SYMBOL(rxrpc_kernel_get_call_peer);
* rxrpc_kernel_get_srtt - Get a call's peer smoothed RTT
* @peer: The peer to query
*
- * Get the call's peer smoothed RTT in uS or UINT_MAX if we have no samples.
+ * Get the call's peer smoothed RTT.
+ *
+ * Return: The RTT in uS or %UINT_MAX if we have no samples.
*/
unsigned int rxrpc_kernel_get_srtt(const struct rxrpc_peer *peer)
{
- return peer->rtt_count > 0 ? peer->srtt_us >> 3 : UINT_MAX;
+ return READ_ONCE(peer->recent_srtt_us);
}
EXPORT_SYMBOL(rxrpc_kernel_get_srtt);
@@ -488,7 +503,10 @@ EXPORT_SYMBOL(rxrpc_kernel_get_srtt);
* @peer: The peer to query
*
* Get a pointer to the address from a peer record. The caller is responsible
- * for making sure that the address is not deallocated.
+ * for making sure that the address is not deallocated. A fake address will be
+ * substituted if %peer in NULL.
+ *
+ * Return: The rxrpc address record or a fake record.
*/
const struct sockaddr_rxrpc *rxrpc_kernel_remote_srx(const struct rxrpc_peer *peer)
{
@@ -501,7 +519,10 @@ EXPORT_SYMBOL(rxrpc_kernel_remote_srx);
* @peer: The peer to query
*
* Get a pointer to the transport address from a peer record. The caller is
- * responsible for making sure that the address is not deallocated.
+ * responsible for making sure that the address is not deallocated. A fake
+ * address will be substituted if %peer in NULL.
+ *
+ * Return: The transport address record or a fake record.
*/
const struct sockaddr *rxrpc_kernel_remote_addr(const struct rxrpc_peer *peer)
{
@@ -509,3 +530,33 @@ const struct sockaddr *rxrpc_kernel_remote_addr(const struct rxrpc_peer *peer)
(peer ? &peer->srx.transport : &rxrpc_null_addr.transport);
}
EXPORT_SYMBOL(rxrpc_kernel_remote_addr);
+
+/**
+ * rxrpc_kernel_set_peer_data - Set app-specific data on a peer.
+ * @peer: The peer to alter
+ * @app_data: The data to set
+ *
+ * Set the app-specific data on a peer. AF_RXRPC makes no effort to retain
+ * anything the data might refer to.
+ *
+ * Return: The previous app_data.
+ */
+unsigned long rxrpc_kernel_set_peer_data(struct rxrpc_peer *peer, unsigned long app_data)
+{
+ return xchg(&peer->app_data, app_data);
+}
+EXPORT_SYMBOL(rxrpc_kernel_set_peer_data);
+
+/**
+ * rxrpc_kernel_get_peer_data - Get app-specific data from a peer.
+ * @peer: The peer to query
+ *
+ * Retrieve the app-specific data from a peer.
+ *
+ * Return: The peer's app data.
+ */
+unsigned long rxrpc_kernel_get_peer_data(const struct rxrpc_peer *peer)
+{
+ return peer->app_data;
+}
+EXPORT_SYMBOL(rxrpc_kernel_get_peer_data);
diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c
index 263a2251e3d2..d803562ca0ac 100644
--- a/net/rxrpc/proc.c
+++ b/net/rxrpc/proc.c
@@ -52,7 +52,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
struct rxrpc_call *call;
struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq));
enum rxrpc_call_state state;
- rxrpc_seq_t acks_hard_ack;
+ rxrpc_seq_t tx_bottom;
char lbuff[50], rbuff[50];
long timeout = 0;
@@ -79,7 +79,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
if (state != RXRPC_CALL_SERVER_PREALLOC)
timeout = ktime_ms_delta(READ_ONCE(call->expect_rx_by), ktime_get_real());
- acks_hard_ack = READ_ONCE(call->acks_hard_ack);
+ tx_bottom = READ_ONCE(call->tx_bottom);
seq_printf(seq,
"UDP %-47.47s %-47.47s %4x %08x %08x %s %3u"
" %-8.8s %08x %08x %08x %02x %08x %02x %08x %02x %06lx\n",
@@ -93,7 +93,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
rxrpc_call_states[state],
call->abort_code,
call->debug_id,
- acks_hard_ack, READ_ONCE(call->tx_top) - acks_hard_ack,
+ tx_bottom, READ_ONCE(call->tx_top) - tx_bottom,
call->ackr_window, call->ackr_wtop - call->ackr_window,
call->rx_serial,
call->cong_cwnd,
@@ -283,9 +283,7 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v)
if (v == SEQ_START_TOKEN) {
seq_puts(seq,
- "Proto Local "
- " Remote "
- " Use SST MTU LastUse RTT RTO\n"
+ "Proto Local Remote Use SST Maxd LastUse RTT RTO\n"
);
return 0;
}
@@ -298,16 +296,15 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v)
now = ktime_get_seconds();
seq_printf(seq,
- "UDP %-47.47s %-47.47s %3u"
- " %3u %5u %6llus %8u %8u\n",
+ "UDP %-47.47s %-47.47s %3u %4u %5u %6llus %8d %8d\n",
lbuff,
rbuff,
refcount_read(&peer->ref),
peer->cong_ssthresh,
- peer->mtu,
+ peer->max_data,
now - peer->last_tx_at,
- peer->srtt_us >> 3,
- peer->rto_us);
+ READ_ONCE(peer->recent_srtt_us),
+ READ_ONCE(peer->recent_rto_us));
return 0;
}
@@ -476,10 +473,11 @@ int rxrpc_stats_show(struct seq_file *seq, void *v)
struct rxrpc_net *rxnet = rxrpc_net(seq_file_single_net(seq));
seq_printf(seq,
- "Data : send=%u sendf=%u fail=%u\n",
+ "Data : send=%u sendf=%u fail=%u emsz=%u\n",
atomic_read(&rxnet->stat_tx_data_send),
atomic_read(&rxnet->stat_tx_data_send_frag),
- atomic_read(&rxnet->stat_tx_data_send_fail));
+ atomic_read(&rxnet->stat_tx_data_send_fail),
+ atomic_read(&rxnet->stat_tx_data_send_msgsize));
seq_printf(seq,
"Data-Tx : nr=%u retrans=%u uf=%u cwr=%u\n",
atomic_read(&rxnet->stat_tx_data),
@@ -508,7 +506,7 @@ int rxrpc_stats_show(struct seq_file *seq, void *v)
atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_DELAY]),
atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_IDLE]));
seq_printf(seq,
- "Ack-Rx : req=%u dup=%u oos=%u exw=%u nos=%u png=%u prs=%u dly=%u idl=%u\n",
+ "Ack-Rx : req=%u dup=%u oos=%u exw=%u nos=%u png=%u prs=%u dly=%u idl=%u z=%u\n",
atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_REQUESTED]),
atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_DUPLICATE]),
atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_OUT_OF_SEQUENCE]),
@@ -517,13 +515,14 @@ int rxrpc_stats_show(struct seq_file *seq, void *v)
atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_PING]),
atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_PING_RESPONSE]),
atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_DELAY]),
- atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_IDLE]));
+ atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_IDLE]),
+ atomic_read(&rxnet->stat_rx_acks[0]));
seq_printf(seq,
- "Why-Req-A: acklost=%u already=%u mrtt=%u ortt=%u\n",
+ "Why-Req-A: acklost=%u mrtt=%u ortt=%u stall=%u\n",
atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_ack_lost]),
- atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_already_on]),
atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_more_rtt]),
- atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_old_rtt]));
+ atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_old_rtt]),
+ atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_app_stall]));
seq_printf(seq,
"Why-Req-A: nolast=%u retx=%u slows=%u smtxw=%u\n",
atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_no_srv_last]),
@@ -531,6 +530,30 @@ int rxrpc_stats_show(struct seq_file *seq, void *v)
atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_slow_start]),
atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_small_txwin]));
seq_printf(seq,
+ "Jumbo-Tx : %u,%u,%u,%u,%u,%u,%u,%u,%u,%u\n",
+ atomic_read(&rxnet->stat_tx_jumbo[0]),
+ atomic_read(&rxnet->stat_tx_jumbo[1]),
+ atomic_read(&rxnet->stat_tx_jumbo[2]),
+ atomic_read(&rxnet->stat_tx_jumbo[3]),
+ atomic_read(&rxnet->stat_tx_jumbo[4]),
+ atomic_read(&rxnet->stat_tx_jumbo[5]),
+ atomic_read(&rxnet->stat_tx_jumbo[6]),
+ atomic_read(&rxnet->stat_tx_jumbo[7]),
+ atomic_read(&rxnet->stat_tx_jumbo[8]),
+ atomic_read(&rxnet->stat_tx_jumbo[9]));
+ seq_printf(seq,
+ "Jumbo-Rx : %u,%u,%u,%u,%u,%u,%u,%u,%u,%u\n",
+ atomic_read(&rxnet->stat_rx_jumbo[0]),
+ atomic_read(&rxnet->stat_rx_jumbo[1]),
+ atomic_read(&rxnet->stat_rx_jumbo[2]),
+ atomic_read(&rxnet->stat_rx_jumbo[3]),
+ atomic_read(&rxnet->stat_rx_jumbo[4]),
+ atomic_read(&rxnet->stat_rx_jumbo[5]),
+ atomic_read(&rxnet->stat_rx_jumbo[6]),
+ atomic_read(&rxnet->stat_rx_jumbo[7]),
+ atomic_read(&rxnet->stat_rx_jumbo[8]),
+ atomic_read(&rxnet->stat_rx_jumbo[9]));
+ seq_printf(seq,
"Buffers : txb=%u rxb=%u\n",
atomic_read(&rxrpc_nr_txbuf),
atomic_read(&rxrpc_n_rx_skbs));
@@ -567,6 +590,8 @@ int rxrpc_stats_clear(struct file *file, char *buf, size_t size)
atomic_set(&rxnet->stat_tx_ack_skip, 0);
memset(&rxnet->stat_tx_acks, 0, sizeof(rxnet->stat_tx_acks));
memset(&rxnet->stat_rx_acks, 0, sizeof(rxnet->stat_rx_acks));
+ memset(&rxnet->stat_tx_jumbo, 0, sizeof(rxnet->stat_tx_jumbo));
+ memset(&rxnet->stat_rx_jumbo, 0, sizeof(rxnet->stat_rx_jumbo));
memset(&rxnet->stat_why_req_ack, 0, sizeof(rxnet->stat_why_req_ack));
diff --git a/net/rxrpc/protocol.h b/net/rxrpc/protocol.h
index 4fe6b4d20ada..f8bfec12bc7e 100644
--- a/net/rxrpc/protocol.h
+++ b/net/rxrpc/protocol.h
@@ -92,11 +92,16 @@ struct rxrpc_jumbo_header {
/*
* The maximum number of subpackets that can possibly fit in a UDP packet is:
*
- * ((max_IP - IP_hdr - UDP_hdr) / RXRPC_JUMBO_SUBPKTLEN) + 1
- * = ((65535 - 28 - 28) / 1416) + 1
- * = 46 non-terminal packets and 1 terminal packet.
+ * (max_UDP - wirehdr + jumbohdr) / (jumbohdr + 1412)
+ * = ((65535 - 28 + 4) / 1416)
+ * = 45 non-terminal packets and 1 terminal packet.
*/
-#define RXRPC_MAX_NR_JUMBO 47
+#define RXRPC_MAX_NR_JUMBO 46
+
+/* Size of a jumbo packet with N subpackets, excluding UDP+IP */
+#define RXRPC_JUMBO(N) ((int)sizeof(struct rxrpc_wire_header) + \
+ RXRPC_JUMBO_DATALEN + \
+ ((N) - 1) * RXRPC_JUMBO_SUBPKTLEN)
/*****************************************************************************/
/*
@@ -176,4 +181,24 @@ struct rxkad_response {
__be32 ticket_len; /* Kerberos ticket length */
} __packed;
+/*
+ * GSSAPI security type-4 and type-6 data header.
+ */
+struct rxgk_header {
+ __be32 epoch;
+ __be32 cid;
+ __be32 call_number;
+ __be32 seq;
+ __be32 sec_index;
+ __be32 data_len;
+} __packed;
+
+/*
+ * GSSAPI security type-4 and type-6 response packet header.
+ */
+struct rxgk_response {
+ __be64 start_time;
+ __be32 token_len;
+} __packed;
+
#endif /* _LINUX_RXRPC_PACKET_H */
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index a482f88c5fc5..86a27fb55a1c 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -36,16 +36,16 @@ void rxrpc_notify_socket(struct rxrpc_call *call)
sk = &rx->sk;
if (rx && sk->sk_state < RXRPC_CLOSE) {
if (call->notify_rx) {
- spin_lock(&call->notify_lock);
+ spin_lock_irq(&call->notify_lock);
call->notify_rx(sk, call, call->user_call_ID);
- spin_unlock(&call->notify_lock);
+ spin_unlock_irq(&call->notify_lock);
} else {
- spin_lock(&rx->recvmsg_lock);
+ spin_lock_irq(&rx->recvmsg_lock);
if (list_empty(&call->recvmsg_link)) {
rxrpc_get_call(call, rxrpc_call_get_notify_socket);
list_add_tail(&call->recvmsg_link, &rx->recvmsg_q);
}
- spin_unlock(&rx->recvmsg_lock);
+ spin_unlock_irq(&rx->recvmsg_lock);
if (!sock_flag(sk, SOCK_DEAD)) {
_debug("call %ps", sk->sk_data_ready);
@@ -155,6 +155,82 @@ static int rxrpc_verify_data(struct rxrpc_call *call, struct sk_buff *skb)
}
/*
+ * Transcribe a call's user ID to a control message.
+ */
+static int rxrpc_recvmsg_user_id(struct rxrpc_call *call, struct msghdr *msg,
+ int flags)
+{
+ if (!test_bit(RXRPC_CALL_HAS_USERID, &call->flags))
+ return 0;
+
+ if (flags & MSG_CMSG_COMPAT) {
+ unsigned int id32 = call->user_call_ID;
+
+ return put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID,
+ sizeof(unsigned int), &id32);
+ } else {
+ unsigned long idl = call->user_call_ID;
+
+ return put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID,
+ sizeof(unsigned long), &idl);
+ }
+}
+
+/*
+ * Deal with a CHALLENGE packet.
+ */
+static int rxrpc_recvmsg_challenge(struct socket *sock, struct msghdr *msg,
+ struct sk_buff *challenge, unsigned int flags)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(challenge);
+ struct rxrpc_connection *conn = sp->chall.conn;
+
+ return conn->security->challenge_to_recvmsg(conn, challenge, msg);
+}
+
+/*
+ * Process OOB packets. Called with the socket locked.
+ */
+static int rxrpc_recvmsg_oob(struct socket *sock, struct msghdr *msg,
+ unsigned int flags)
+{
+ struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
+ struct sk_buff *skb;
+ bool need_response = false;
+ int ret;
+
+ skb = skb_peek(&rx->recvmsg_oobq);
+ if (!skb)
+ return -EAGAIN;
+ rxrpc_see_skb(skb, rxrpc_skb_see_recvmsg);
+
+ ret = put_cmsg(msg, SOL_RXRPC, RXRPC_OOB_ID, sizeof(u64),
+ &skb->skb_mstamp_ns);
+ if (ret < 0)
+ return ret;
+
+ switch ((enum rxrpc_oob_type)skb->mark) {
+ case RXRPC_OOB_CHALLENGE:
+ need_response = true;
+ ret = rxrpc_recvmsg_challenge(sock, msg, skb, flags);
+ break;
+ default:
+ WARN_ONCE(1, "recvmsg() can't process unknown OOB type %u\n",
+ skb->mark);
+ ret = -EIO;
+ break;
+ }
+
+ if (!(flags & MSG_PEEK))
+ skb_unlink(skb, &rx->recvmsg_oobq);
+ if (need_response)
+ rxrpc_add_pending_oob(rx, skb);
+ else
+ rxrpc_free_skb(skb, rxrpc_skb_put_oob);
+ return ret;
+}
+
+/*
* Deliver messages to a call. This keeps processing packets until the buffer
* is filled and we find either more DATA (returns 0) or the end of the DATA
* (returns 1). If more packets are required, it returns -EAGAIN and if the
@@ -165,6 +241,7 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
size_t len, int flags, size_t *_offset)
{
struct rxrpc_skb_priv *sp;
+ struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
struct sk_buff *skb;
rxrpc_seq_t seq = 0;
size_t remain;
@@ -207,7 +284,6 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
trace_rxrpc_recvdata(call, rxrpc_recvmsg_next, seq,
sp->offset, sp->len, ret2);
if (ret2 < 0) {
- kdebug("verify = %d", ret2);
ret = ret2;
goto out;
}
@@ -255,6 +331,13 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
if (!(flags & MSG_PEEK))
rxrpc_rotate_rx_window(call);
+
+ if (!rx->app_ops &&
+ !skb_queue_empty_lockless(&rx->recvmsg_oobq)) {
+ trace_rxrpc_recvdata(call, rxrpc_recvmsg_oobq, seq,
+ rx_pkt_offset, rx_pkt_len, ret);
+ break;
+ }
}
out:
@@ -262,6 +345,7 @@ out:
call->rx_pkt_offset = rx_pkt_offset;
call->rx_pkt_len = rx_pkt_len;
}
+
done:
trace_rxrpc_recvdata(call, rxrpc_recvmsg_data_return, seq,
rx_pkt_offset, rx_pkt_len, ret);
@@ -301,6 +385,7 @@ try_again:
/* Return immediately if a client socket has no outstanding calls */
if (RB_EMPTY_ROOT(&rx->calls) &&
list_empty(&rx->recvmsg_q) &&
+ skb_queue_empty_lockless(&rx->recvmsg_oobq) &&
rx->sk.sk_state != RXRPC_SERVER_LISTENING) {
release_sock(&rx->sk);
return -EAGAIN;
@@ -322,7 +407,8 @@ try_again:
if (ret)
goto wait_error;
- if (list_empty(&rx->recvmsg_q)) {
+ if (list_empty(&rx->recvmsg_q) &&
+ skb_queue_empty_lockless(&rx->recvmsg_oobq)) {
if (signal_pending(current))
goto wait_interrupted;
trace_rxrpc_recvmsg(0, rxrpc_recvmsg_wait, 0);
@@ -332,19 +418,29 @@ try_again:
goto try_again;
}
+ /* Deal with OOB messages before we consider getting normal data. */
+ if (!skb_queue_empty_lockless(&rx->recvmsg_oobq)) {
+ ret = rxrpc_recvmsg_oob(sock, msg, flags);
+ release_sock(&rx->sk);
+ if (ret == -EAGAIN)
+ goto try_again;
+ goto error_no_call;
+ }
+
/* Find the next call and dequeue it if we're not just peeking. If we
* do dequeue it, that comes with a ref that we will need to release.
* We also want to weed out calls that got requeued whilst we were
* shovelling data out.
*/
- spin_lock(&rx->recvmsg_lock);
+ spin_lock_irq(&rx->recvmsg_lock);
l = rx->recvmsg_q.next;
call = list_entry(l, struct rxrpc_call, recvmsg_link);
if (!rxrpc_call_is_complete(call) &&
- skb_queue_empty(&call->recvmsg_queue)) {
+ skb_queue_empty(&call->recvmsg_queue) &&
+ skb_queue_empty(&rx->recvmsg_oobq)) {
list_del_init(&call->recvmsg_link);
- spin_unlock(&rx->recvmsg_lock);
+ spin_unlock_irq(&rx->recvmsg_lock);
release_sock(&rx->sk);
trace_rxrpc_recvmsg(call->debug_id, rxrpc_recvmsg_unqueue, 0);
rxrpc_put_call(call, rxrpc_call_put_recvmsg);
@@ -355,7 +451,7 @@ try_again:
list_del_init(&call->recvmsg_link);
else
rxrpc_get_call(call, rxrpc_call_get_recvmsg);
- spin_unlock(&rx->recvmsg_lock);
+ spin_unlock_irq(&rx->recvmsg_lock);
call_debug_id = call->debug_id;
trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_dequeue, 0);
@@ -377,21 +473,9 @@ try_again:
if (test_bit(RXRPC_CALL_RELEASED, &call->flags))
BUG();
- if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) {
- if (flags & MSG_CMSG_COMPAT) {
- unsigned int id32 = call->user_call_ID;
-
- ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID,
- sizeof(unsigned int), &id32);
- } else {
- unsigned long idl = call->user_call_ID;
-
- ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID,
- sizeof(unsigned long), &idl);
- }
- if (ret < 0)
- goto error_unlock_call;
- }
+ ret = rxrpc_recvmsg_user_id(call, msg, flags);
+ if (ret < 0)
+ goto error_unlock_call;
if (msg->msg_name && call->peer) {
size_t len = sizeof(call->dest_srx);
@@ -445,9 +529,9 @@ error_unlock_call:
error_requeue_call:
if (!(flags & MSG_PEEK)) {
- spin_lock(&rx->recvmsg_lock);
+ spin_lock_irq(&rx->recvmsg_lock);
list_add(&call->recvmsg_link, &rx->recvmsg_q);
- spin_unlock(&rx->recvmsg_lock);
+ spin_unlock_irq(&rx->recvmsg_lock);
trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_requeue, 0);
} else {
rxrpc_put_call(call, rxrpc_call_put_recvmsg);
@@ -477,14 +561,14 @@ wait_error:
* @_service: Where to store the actual service ID (may be upgraded)
*
* Allow a kernel service to receive data and pick up information about the
- * state of a call. Returns 0 if got what was asked for and there's more
- * available, 1 if we got what was asked for and we're at the end of the data
- * and -EAGAIN if we need more data.
+ * state of a call. Note that *@_abort should also be initialised to %0.
*
- * Note that we may return -EAGAIN to drain empty packets at the end of the
- * data, even if we've already copied over the requested data.
+ * Note that we may return %-EAGAIN to drain empty packets at the end
+ * of the data, even if we've already copied over the requested data.
*
- * *_abort should also be initialised to 0.
+ * Return: %0 if got what was asked for and there's more available, %1
+ * if we got what was asked for and we're at the end of the data and
+ * %-EAGAIN if we need more data.
*/
int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call,
struct iov_iter *iter, size_t *_len,
diff --git a/net/rxrpc/rtt.c b/net/rxrpc/rtt.c
index cdab7b7d08a0..7474f88d7b18 100644
--- a/net/rxrpc/rtt.c
+++ b/net/rxrpc/rtt.c
@@ -12,22 +12,22 @@
#include "ar-internal.h"
#define RXRPC_RTO_MAX (120 * USEC_PER_SEC)
-#define RXRPC_TIMEOUT_INIT ((unsigned int)(1 * MSEC_PER_SEC)) /* RFC6298 2.1 initial RTO value */
+#define RXRPC_TIMEOUT_INIT ((unsigned int)(1 * USEC_PER_SEC)) /* RFC6298 2.1 initial RTO value */
#define rxrpc_jiffies32 ((u32)jiffies) /* As rxrpc_jiffies32 */
-static u32 rxrpc_rto_min_us(struct rxrpc_peer *peer)
+static u32 rxrpc_rto_min_us(struct rxrpc_call *call)
{
return 200;
}
-static u32 __rxrpc_set_rto(const struct rxrpc_peer *peer)
+static u32 __rxrpc_set_rto(const struct rxrpc_call *call)
{
- return (peer->srtt_us >> 3) + peer->rttvar_us;
+ return (call->srtt_us >> 3) + call->rttvar_us;
}
static u32 rxrpc_bound_rto(u32 rto)
{
- return min(rto, RXRPC_RTO_MAX);
+ return clamp(200000, rto + 100000, RXRPC_RTO_MAX);
}
/*
@@ -40,10 +40,10 @@ static u32 rxrpc_bound_rto(u32 rto)
* To save cycles in the RFC 1323 implementation it was better to break
* it up into three procedures. -- erics
*/
-static void rxrpc_rtt_estimator(struct rxrpc_peer *peer, long sample_rtt_us)
+static void rxrpc_rtt_estimator(struct rxrpc_call *call, long sample_rtt_us)
{
long m = sample_rtt_us; /* RTT */
- u32 srtt = peer->srtt_us;
+ u32 srtt = call->srtt_us;
/* The following amusing code comes from Jacobson's
* article in SIGCOMM '88. Note that rtt and mdev
@@ -66,7 +66,7 @@ static void rxrpc_rtt_estimator(struct rxrpc_peer *peer, long sample_rtt_us)
srtt += m; /* rtt = 7/8 rtt + 1/8 new */
if (m < 0) {
m = -m; /* m is now abs(error) */
- m -= (peer->mdev_us >> 2); /* similar update on mdev */
+ m -= (call->mdev_us >> 2); /* similar update on mdev */
/* This is similar to one of Eifel findings.
* Eifel blocks mdev updates when rtt decreases.
* This solution is a bit different: we use finer gain
@@ -78,31 +78,31 @@ static void rxrpc_rtt_estimator(struct rxrpc_peer *peer, long sample_rtt_us)
if (m > 0)
m >>= 3;
} else {
- m -= (peer->mdev_us >> 2); /* similar update on mdev */
+ m -= (call->mdev_us >> 2); /* similar update on mdev */
}
- peer->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */
- if (peer->mdev_us > peer->mdev_max_us) {
- peer->mdev_max_us = peer->mdev_us;
- if (peer->mdev_max_us > peer->rttvar_us)
- peer->rttvar_us = peer->mdev_max_us;
+ call->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */
+ if (call->mdev_us > call->mdev_max_us) {
+ call->mdev_max_us = call->mdev_us;
+ if (call->mdev_max_us > call->rttvar_us)
+ call->rttvar_us = call->mdev_max_us;
}
} else {
/* no previous measure. */
srtt = m << 3; /* take the measured time to be rtt */
- peer->mdev_us = m << 1; /* make sure rto = 3*rtt */
- peer->rttvar_us = max(peer->mdev_us, rxrpc_rto_min_us(peer));
- peer->mdev_max_us = peer->rttvar_us;
+ call->mdev_us = m << 1; /* make sure rto = 3*rtt */
+ call->rttvar_us = umax(call->mdev_us, rxrpc_rto_min_us(call));
+ call->mdev_max_us = call->rttvar_us;
}
- peer->srtt_us = max(1U, srtt);
+ call->srtt_us = umax(srtt, 1);
}
/*
* Calculate rto without backoff. This is the second half of Van Jacobson's
* routine referred to above.
*/
-static void rxrpc_set_rto(struct rxrpc_peer *peer)
+static void rxrpc_set_rto(struct rxrpc_call *call)
{
u32 rto;
@@ -113,7 +113,7 @@ static void rxrpc_set_rto(struct rxrpc_peer *peer)
* is invisible. Actually, Linux-2.4 also generates erratic
* ACKs in some circumstances.
*/
- rto = __rxrpc_set_rto(peer);
+ rto = __rxrpc_set_rto(call);
/* 2. Fixups made earlier cannot be right.
* If we do not estimate RTO correctly without them,
@@ -124,61 +124,73 @@ static void rxrpc_set_rto(struct rxrpc_peer *peer)
/* NOTE: clamping at RXRPC_RTO_MIN is not required, current algo
* guarantees that rto is higher.
*/
- peer->rto_us = rxrpc_bound_rto(rto);
+ call->rto_us = rxrpc_bound_rto(rto);
}
-static void rxrpc_ack_update_rtt(struct rxrpc_peer *peer, long rtt_us)
+static void rxrpc_update_rtt_min(struct rxrpc_call *call, ktime_t resp_time, long rtt_us)
+{
+ /* Window size 5mins in approx usec (ipv4.sysctl_tcp_min_rtt_wlen) */
+ u32 wlen_us = 5ULL * NSEC_PER_SEC / 1024;
+
+ minmax_running_min(&call->min_rtt, wlen_us, resp_time / 1024,
+ (u32)rtt_us ? : jiffies_to_usecs(1));
+}
+
+static void rxrpc_ack_update_rtt(struct rxrpc_call *call, ktime_t resp_time, long rtt_us)
{
if (rtt_us < 0)
return;
- //rxrpc_update_rtt_min(peer, rtt_us);
- rxrpc_rtt_estimator(peer, rtt_us);
- rxrpc_set_rto(peer);
+ /* Update RACK min RTT [RFC8985 6.1 Step 1]. */
+ rxrpc_update_rtt_min(call, resp_time, rtt_us);
+
+ rxrpc_rtt_estimator(call, rtt_us);
+ rxrpc_set_rto(call);
- /* RFC6298: only reset backoff on valid RTT measurement. */
- peer->backoff = 0;
+ /* Only reset backoff on valid RTT measurement [RFC6298]. */
+ call->backoff = 0;
}
/*
* Add RTT information to cache. This is called in softirq mode and has
- * exclusive access to the peer RTT data.
+ * exclusive access to the call RTT data.
*/
-void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why,
+void rxrpc_call_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why,
int rtt_slot,
rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial,
ktime_t send_time, ktime_t resp_time)
{
- struct rxrpc_peer *peer = call->peer;
s64 rtt_us;
rtt_us = ktime_to_us(ktime_sub(resp_time, send_time));
if (rtt_us < 0)
return;
- spin_lock(&peer->rtt_input_lock);
- rxrpc_ack_update_rtt(peer, rtt_us);
- if (peer->rtt_count < 3)
- peer->rtt_count++;
- spin_unlock(&peer->rtt_input_lock);
+ rxrpc_ack_update_rtt(call, resp_time, rtt_us);
+ if (call->rtt_count < 3)
+ call->rtt_count++;
+ call->rtt_taken++;
+
+ WRITE_ONCE(call->peer->recent_srtt_us, call->srtt_us / 8);
+ WRITE_ONCE(call->peer->recent_rto_us, call->rto_us);
trace_rxrpc_rtt_rx(call, why, rtt_slot, send_serial, resp_serial,
- peer->srtt_us >> 3, peer->rto_us);
+ rtt_us, call->srtt_us, call->rto_us);
}
/*
* Get the retransmission timeout to set in nanoseconds, backing it off each
* time we retransmit.
*/
-ktime_t rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans)
+ktime_t rxrpc_get_rto_backoff(struct rxrpc_call *call, bool retrans)
{
u64 timo_us;
- u32 backoff = READ_ONCE(peer->backoff);
+ u32 backoff = READ_ONCE(call->backoff);
- timo_us = peer->rto_us;
+ timo_us = call->rto_us;
timo_us <<= backoff;
if (retrans && timo_us * 2 <= RXRPC_RTO_MAX)
- WRITE_ONCE(peer->backoff, backoff + 1);
+ WRITE_ONCE(call->backoff, backoff + 1);
if (timo_us < 1)
timo_us = 1;
@@ -186,10 +198,11 @@ ktime_t rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans)
return ns_to_ktime(timo_us * NSEC_PER_USEC);
}
-void rxrpc_peer_init_rtt(struct rxrpc_peer *peer)
+void rxrpc_call_init_rtt(struct rxrpc_call *call)
{
- peer->rto_us = RXRPC_TIMEOUT_INIT;
- peer->mdev_us = RXRPC_TIMEOUT_INIT;
- peer->backoff = 0;
- //minmax_reset(&peer->rtt_min, rxrpc_jiffies32, ~0U);
+ call->rtt_last_req = KTIME_MIN;
+ call->rto_us = RXRPC_TIMEOUT_INIT;
+ call->mdev_us = RXRPC_TIMEOUT_INIT;
+ call->backoff = 0;
+ //minmax_reset(&call->rtt_min, rxrpc_jiffies32, ~0U);
}
diff --git a/net/rxrpc/rxgk.c b/net/rxrpc/rxgk.c
new file mode 100644
index 000000000000..1e19c605bcc8
--- /dev/null
+++ b/net/rxrpc/rxgk.c
@@ -0,0 +1,1371 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* GSSAPI-based RxRPC security
+ *
+ * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/key-type.h>
+#include "ar-internal.h"
+#include "rxgk_common.h"
+
+/*
+ * Parse the information from a server key
+ */
+static int rxgk_preparse_server_key(struct key_preparsed_payload *prep)
+{
+ const struct krb5_enctype *krb5;
+ struct krb5_buffer *server_key = (void *)&prep->payload.data[2];
+ unsigned int service, sec_class, kvno, enctype;
+ int n = 0;
+
+ _enter("%zu", prep->datalen);
+
+ if (sscanf(prep->orig_description, "%u:%u:%u:%u%n",
+ &service, &sec_class, &kvno, &enctype, &n) != 4)
+ return -EINVAL;
+
+ if (prep->orig_description[n])
+ return -EINVAL;
+
+ krb5 = crypto_krb5_find_enctype(enctype);
+ if (!krb5)
+ return -ENOPKG;
+
+ prep->payload.data[0] = (struct krb5_enctype *)krb5;
+
+ if (prep->datalen != krb5->key_len)
+ return -EKEYREJECTED;
+
+ server_key->len = prep->datalen;
+ server_key->data = kmemdup(prep->data, prep->datalen, GFP_KERNEL);
+ if (!server_key->data)
+ return -ENOMEM;
+
+ _leave(" = 0");
+ return 0;
+}
+
+static void rxgk_free_server_key(union key_payload *payload)
+{
+ struct krb5_buffer *server_key = (void *)&payload->data[2];
+
+ kfree_sensitive(server_key->data);
+}
+
+static void rxgk_free_preparse_server_key(struct key_preparsed_payload *prep)
+{
+ rxgk_free_server_key(&prep->payload);
+}
+
+static void rxgk_destroy_server_key(struct key *key)
+{
+ rxgk_free_server_key(&key->payload);
+}
+
+static void rxgk_describe_server_key(const struct key *key, struct seq_file *m)
+{
+ const struct krb5_enctype *krb5 = key->payload.data[0];
+
+ if (krb5)
+ seq_printf(m, ": %s", krb5->name);
+}
+
+/*
+ * Handle rekeying the connection when we see our limits overrun or when the
+ * far side decided to rekey.
+ *
+ * Returns a ref on the context if successful or -ESTALE if the key is out of
+ * date.
+ */
+static struct rxgk_context *rxgk_rekey(struct rxrpc_connection *conn,
+ const u16 *specific_key_number)
+{
+ struct rxgk_context *gk, *dead = NULL;
+ unsigned int key_number, current_key, mask = ARRAY_SIZE(conn->rxgk.keys) - 1;
+ bool crank = false;
+
+ _enter("%d", specific_key_number ? *specific_key_number : -1);
+
+ mutex_lock(&conn->security_lock);
+
+ current_key = conn->rxgk.key_number;
+ if (!specific_key_number) {
+ key_number = current_key;
+ } else {
+ if (*specific_key_number == (u16)current_key)
+ key_number = current_key;
+ else if (*specific_key_number == (u16)(current_key - 1))
+ key_number = current_key - 1;
+ else if (*specific_key_number == (u16)(current_key + 1))
+ goto crank_window;
+ else
+ goto bad_key;
+ }
+
+ gk = conn->rxgk.keys[key_number & mask];
+ if (!gk)
+ goto generate_key;
+ if (!specific_key_number &&
+ test_bit(RXGK_TK_NEEDS_REKEY, &gk->flags))
+ goto crank_window;
+
+grab:
+ refcount_inc(&gk->usage);
+ mutex_unlock(&conn->security_lock);
+ rxgk_put(dead);
+ return gk;
+
+crank_window:
+ trace_rxrpc_rxgk_rekey(conn, current_key,
+ specific_key_number ? *specific_key_number : -1);
+ if (current_key == UINT_MAX)
+ goto bad_key;
+ if (current_key + 1 == UINT_MAX)
+ set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags);
+
+ key_number = current_key + 1;
+ if (WARN_ON(conn->rxgk.keys[key_number & mask]))
+ goto bad_key;
+ crank = true;
+
+generate_key:
+ gk = conn->rxgk.keys[current_key & mask];
+ gk = rxgk_generate_transport_key(conn, gk->key, key_number, GFP_NOFS);
+ if (IS_ERR(gk)) {
+ mutex_unlock(&conn->security_lock);
+ return gk;
+ }
+
+ write_lock(&conn->security_use_lock);
+ if (crank) {
+ current_key++;
+ conn->rxgk.key_number = current_key;
+ dead = conn->rxgk.keys[(current_key - 2) & mask];
+ conn->rxgk.keys[(current_key - 2) & mask] = NULL;
+ }
+ conn->rxgk.keys[current_key & mask] = gk;
+ write_unlock(&conn->security_use_lock);
+ goto grab;
+
+bad_key:
+ mutex_unlock(&conn->security_lock);
+ return ERR_PTR(-ESTALE);
+}
+
+/*
+ * Get the specified keying context.
+ *
+ * Returns a ref on the context if successful or -ESTALE if the key is out of
+ * date.
+ */
+static struct rxgk_context *rxgk_get_key(struct rxrpc_connection *conn,
+ const u16 *specific_key_number)
+{
+ struct rxgk_context *gk;
+ unsigned int key_number, current_key, mask = ARRAY_SIZE(conn->rxgk.keys) - 1;
+
+ _enter("{%u},%d",
+ conn->rxgk.key_number, specific_key_number ? *specific_key_number : -1);
+
+ read_lock(&conn->security_use_lock);
+
+ current_key = conn->rxgk.key_number;
+ if (!specific_key_number) {
+ key_number = current_key;
+ } else {
+ /* Only the bottom 16 bits of the key number are exposed in the
+ * header, so we try and keep the upper 16 bits in step. The
+ * whole 32 bits are used to generate the TK.
+ */
+ if (*specific_key_number == (u16)current_key)
+ key_number = current_key;
+ else if (*specific_key_number == (u16)(current_key - 1))
+ key_number = current_key - 1;
+ else if (*specific_key_number == (u16)(current_key + 1))
+ goto rekey;
+ else
+ goto bad_key;
+ }
+
+ gk = conn->rxgk.keys[key_number & mask];
+ if (!gk)
+ goto slow_path;
+ if (!specific_key_number &&
+ key_number < UINT_MAX) {
+ if (time_after(jiffies, gk->expiry) ||
+ gk->bytes_remaining < 0) {
+ set_bit(RXGK_TK_NEEDS_REKEY, &gk->flags);
+ goto slow_path;
+ }
+
+ if (test_bit(RXGK_TK_NEEDS_REKEY, &gk->flags))
+ goto slow_path;
+ }
+
+ refcount_inc(&gk->usage);
+ read_unlock(&conn->security_use_lock);
+ return gk;
+
+rekey:
+ _debug("rekey");
+ if (current_key == UINT_MAX)
+ goto bad_key;
+ gk = conn->rxgk.keys[current_key & mask];
+ if (gk)
+ set_bit(RXGK_TK_NEEDS_REKEY, &gk->flags);
+slow_path:
+ read_unlock(&conn->security_use_lock);
+ return rxgk_rekey(conn, specific_key_number);
+bad_key:
+ read_unlock(&conn->security_use_lock);
+ return ERR_PTR(-ESTALE);
+}
+
+/*
+ * initialise connection security
+ */
+static int rxgk_init_connection_security(struct rxrpc_connection *conn,
+ struct rxrpc_key_token *token)
+{
+ struct rxgk_context *gk;
+ int ret;
+
+ _enter("{%d,%u},{%x}",
+ conn->debug_id, conn->rxgk.key_number, key_serial(conn->key));
+
+ conn->security_ix = token->security_index;
+ conn->security_level = token->rxgk->level;
+
+ if (rxrpc_conn_is_client(conn)) {
+ conn->rxgk.start_time = ktime_get();
+ do_div(conn->rxgk.start_time, 100);
+ }
+
+ gk = rxgk_generate_transport_key(conn, token->rxgk, conn->rxgk.key_number,
+ GFP_NOFS);
+ if (IS_ERR(gk))
+ return PTR_ERR(gk);
+ conn->rxgk.enctype = gk->krb5->etype;
+ conn->rxgk.keys[gk->key_number & 3] = gk;
+
+ switch (conn->security_level) {
+ case RXRPC_SECURITY_PLAIN:
+ case RXRPC_SECURITY_AUTH:
+ case RXRPC_SECURITY_ENCRYPT:
+ break;
+ default:
+ ret = -EKEYREJECTED;
+ goto error;
+ }
+
+ ret = 0;
+error:
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * Clean up the crypto on a call.
+ */
+static void rxgk_free_call_crypto(struct rxrpc_call *call)
+{
+}
+
+/*
+ * Work out how much data we can put in a packet.
+ */
+static struct rxrpc_txbuf *rxgk_alloc_txbuf(struct rxrpc_call *call, size_t remain, gfp_t gfp)
+{
+ enum krb5_crypto_mode mode;
+ struct rxgk_context *gk;
+ struct rxrpc_txbuf *txb;
+ size_t shdr, alloc, limit, part, offset, gap;
+
+ switch (call->conn->security_level) {
+ default:
+ alloc = umin(remain, RXRPC_JUMBO_DATALEN);
+ return rxrpc_alloc_data_txbuf(call, alloc, 1, gfp);
+ case RXRPC_SECURITY_AUTH:
+ shdr = 0;
+ mode = KRB5_CHECKSUM_MODE;
+ break;
+ case RXRPC_SECURITY_ENCRYPT:
+ shdr = sizeof(struct rxgk_header);
+ mode = KRB5_ENCRYPT_MODE;
+ break;
+ }
+
+ gk = rxgk_get_key(call->conn, NULL);
+ if (IS_ERR(gk))
+ return NULL;
+
+ /* Work out the maximum amount of data that will fit. */
+ alloc = RXRPC_JUMBO_DATALEN;
+ limit = crypto_krb5_how_much_data(gk->krb5, mode, &alloc, &offset);
+
+ if (remain < limit - shdr) {
+ part = remain;
+ alloc = crypto_krb5_how_much_buffer(gk->krb5, mode,
+ shdr + part, &offset);
+ gap = 0;
+ } else {
+ part = limit - shdr;
+ gap = RXRPC_JUMBO_DATALEN - alloc;
+ alloc = RXRPC_JUMBO_DATALEN;
+ }
+
+ rxgk_put(gk);
+
+ txb = rxrpc_alloc_data_txbuf(call, alloc, 16, gfp);
+ if (!txb)
+ return NULL;
+
+ txb->crypto_header = offset;
+ txb->sec_header = shdr;
+ txb->offset += offset + shdr;
+ txb->space = part;
+
+ /* Clear excess space in the packet */
+ if (gap)
+ memset(txb->data + alloc - gap, 0, gap);
+ return txb;
+}
+
+/*
+ * Integrity mode (sign a packet - level 1 security)
+ */
+static int rxgk_secure_packet_integrity(const struct rxrpc_call *call,
+ struct rxgk_context *gk,
+ struct rxrpc_txbuf *txb)
+{
+ struct rxgk_header *hdr;
+ struct scatterlist sg[1];
+ struct krb5_buffer metadata;
+ int ret = -ENOMEM;
+
+ _enter("");
+
+ hdr = kzalloc(sizeof(*hdr), GFP_NOFS);
+ if (!hdr)
+ goto error_gk;
+
+ hdr->epoch = htonl(call->conn->proto.epoch);
+ hdr->cid = htonl(call->cid);
+ hdr->call_number = htonl(call->call_id);
+ hdr->seq = htonl(txb->seq);
+ hdr->sec_index = htonl(call->security_ix);
+ hdr->data_len = htonl(txb->len);
+ metadata.len = sizeof(*hdr);
+ metadata.data = hdr;
+
+ sg_init_table(sg, 1);
+ sg_set_buf(&sg[0], txb->data, txb->alloc_size);
+
+ ret = crypto_krb5_get_mic(gk->krb5, gk->tx_Kc, &metadata,
+ sg, 1, txb->alloc_size,
+ txb->crypto_header, txb->sec_header + txb->len);
+ if (ret >= 0) {
+ txb->pkt_len = ret;
+ if (txb->alloc_size == RXRPC_JUMBO_DATALEN)
+ txb->jumboable = true;
+ gk->bytes_remaining -= ret;
+ }
+ kfree(hdr);
+error_gk:
+ rxgk_put(gk);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * wholly encrypt a packet (level 2 security)
+ */
+static int rxgk_secure_packet_encrypted(const struct rxrpc_call *call,
+ struct rxgk_context *gk,
+ struct rxrpc_txbuf *txb)
+{
+ struct rxgk_header *hdr;
+ struct scatterlist sg[1];
+ int ret;
+
+ _enter("%x", txb->len);
+
+ /* Insert the header into the buffer. */
+ hdr = txb->data + txb->crypto_header;
+ hdr->epoch = htonl(call->conn->proto.epoch);
+ hdr->cid = htonl(call->cid);
+ hdr->call_number = htonl(call->call_id);
+ hdr->seq = htonl(txb->seq);
+ hdr->sec_index = htonl(call->security_ix);
+ hdr->data_len = htonl(txb->len);
+
+ sg_init_table(sg, 1);
+ sg_set_buf(&sg[0], txb->data, txb->alloc_size);
+
+ ret = crypto_krb5_encrypt(gk->krb5, gk->tx_enc,
+ sg, 1, txb->alloc_size,
+ txb->crypto_header, txb->sec_header + txb->len,
+ false);
+ if (ret >= 0) {
+ txb->pkt_len = ret;
+ if (txb->alloc_size == RXRPC_JUMBO_DATALEN)
+ txb->jumboable = true;
+ gk->bytes_remaining -= ret;
+ }
+
+ rxgk_put(gk);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * checksum an RxRPC packet header
+ */
+static int rxgk_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
+{
+ struct rxgk_context *gk;
+ int ret;
+
+ _enter("{%d{%x}},{#%u},%u,",
+ call->debug_id, key_serial(call->conn->key), txb->seq, txb->len);
+
+ gk = rxgk_get_key(call->conn, NULL);
+ if (IS_ERR(gk))
+ return PTR_ERR(gk) == -ESTALE ? -EKEYREJECTED : PTR_ERR(gk);
+
+ ret = key_validate(call->conn->key);
+ if (ret < 0) {
+ rxgk_put(gk);
+ return ret;
+ }
+
+ call->security_enctype = gk->krb5->etype;
+ txb->cksum = htons(gk->key_number);
+
+ switch (call->conn->security_level) {
+ case RXRPC_SECURITY_PLAIN:
+ rxgk_put(gk);
+ txb->pkt_len = txb->len;
+ return 0;
+ case RXRPC_SECURITY_AUTH:
+ return rxgk_secure_packet_integrity(call, gk, txb);
+ case RXRPC_SECURITY_ENCRYPT:
+ return rxgk_secure_packet_encrypted(call, gk, txb);
+ default:
+ rxgk_put(gk);
+ return -EPERM;
+ }
+}
+
+/*
+ * Integrity mode (check the signature on a packet - level 1 security)
+ */
+static int rxgk_verify_packet_integrity(struct rxrpc_call *call,
+ struct rxgk_context *gk,
+ struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ struct rxgk_header *hdr;
+ struct krb5_buffer metadata;
+ unsigned int offset = sp->offset, len = sp->len;
+ size_t data_offset = 0, data_len = len;
+ u32 ac;
+ int ret = -ENOMEM;
+
+ _enter("");
+
+ crypto_krb5_where_is_the_data(gk->krb5, KRB5_CHECKSUM_MODE,
+ &data_offset, &data_len);
+
+ hdr = kzalloc(sizeof(*hdr), GFP_NOFS);
+ if (!hdr)
+ goto put_gk;
+
+ hdr->epoch = htonl(call->conn->proto.epoch);
+ hdr->cid = htonl(call->cid);
+ hdr->call_number = htonl(call->call_id);
+ hdr->seq = htonl(sp->hdr.seq);
+ hdr->sec_index = htonl(call->security_ix);
+ hdr->data_len = htonl(data_len);
+
+ metadata.len = sizeof(*hdr);
+ metadata.data = hdr;
+ ret = rxgk_verify_mic_skb(gk->krb5, gk->rx_Kc, &metadata,
+ skb, &offset, &len, &ac);
+ kfree(hdr);
+ if (ret == -EPROTO) {
+ rxrpc_abort_eproto(call, skb, ac,
+ rxgk_abort_1_verify_mic_eproto);
+ } else {
+ sp->offset = offset;
+ sp->len = len;
+ }
+
+put_gk:
+ rxgk_put(gk);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * Decrypt an encrypted packet (level 2 security).
+ */
+static int rxgk_verify_packet_encrypted(struct rxrpc_call *call,
+ struct rxgk_context *gk,
+ struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ struct rxgk_header hdr;
+ unsigned int offset = sp->offset, len = sp->len;
+ int ret;
+ u32 ac;
+
+ _enter("");
+
+ ret = rxgk_decrypt_skb(gk->krb5, gk->rx_enc, skb, &offset, &len, &ac);
+ if (ret == -EPROTO)
+ rxrpc_abort_eproto(call, skb, ac, rxgk_abort_2_decrypt_eproto);
+ if (ret < 0)
+ goto error;
+
+ if (len < sizeof(hdr)) {
+ ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT,
+ rxgk_abort_2_short_header);
+ goto error;
+ }
+
+ /* Extract the header from the skb */
+ ret = skb_copy_bits(skb, offset, &hdr, sizeof(hdr));
+ if (ret < 0) {
+ ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT,
+ rxgk_abort_2_short_encdata);
+ goto error;
+ }
+ offset += sizeof(hdr);
+ len -= sizeof(hdr);
+
+ if (ntohl(hdr.epoch) != call->conn->proto.epoch ||
+ ntohl(hdr.cid) != call->cid ||
+ ntohl(hdr.call_number) != call->call_id ||
+ ntohl(hdr.seq) != sp->hdr.seq ||
+ ntohl(hdr.sec_index) != call->security_ix ||
+ ntohl(hdr.data_len) > len) {
+ ret = rxrpc_abort_eproto(call, skb, RXGK_SEALEDINCON,
+ rxgk_abort_2_short_data);
+ goto error;
+ }
+
+ sp->offset = offset;
+ sp->len = ntohl(hdr.data_len);
+ ret = 0;
+error:
+ rxgk_put(gk);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * Verify the security on a received packet or subpacket (if part of a
+ * jumbo packet).
+ */
+static int rxgk_verify_packet(struct rxrpc_call *call, struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ struct rxgk_context *gk;
+ u16 key_number = sp->hdr.cksum;
+
+ _enter("{%d{%x}},{#%u}",
+ call->debug_id, key_serial(call->conn->key), sp->hdr.seq);
+
+ gk = rxgk_get_key(call->conn, &key_number);
+ if (IS_ERR(gk)) {
+ switch (PTR_ERR(gk)) {
+ case -ESTALE:
+ return rxrpc_abort_eproto(call, skb, RXGK_BADKEYNO,
+ rxgk_abort_bad_key_number);
+ default:
+ return PTR_ERR(gk);
+ }
+ }
+
+ call->security_enctype = gk->krb5->etype;
+ switch (call->conn->security_level) {
+ case RXRPC_SECURITY_PLAIN:
+ rxgk_put(gk);
+ return 0;
+ case RXRPC_SECURITY_AUTH:
+ return rxgk_verify_packet_integrity(call, gk, skb);
+ case RXRPC_SECURITY_ENCRYPT:
+ return rxgk_verify_packet_encrypted(call, gk, skb);
+ default:
+ rxgk_put(gk);
+ return -ENOANO;
+ }
+}
+
+/*
+ * Allocate memory to hold a challenge or a response packet. We're not running
+ * in the io_thread, so we can't use ->tx_alloc.
+ */
+static struct page *rxgk_alloc_packet(size_t total_len)
+{
+ gfp_t gfp = GFP_NOFS;
+ int order;
+
+ order = get_order(total_len);
+ if (order > 0)
+ gfp |= __GFP_COMP;
+ return alloc_pages(gfp, order);
+}
+
+/*
+ * Issue a challenge.
+ */
+static int rxgk_issue_challenge(struct rxrpc_connection *conn)
+{
+ struct rxrpc_wire_header *whdr;
+ struct bio_vec bvec[1];
+ struct msghdr msg;
+ struct page *page;
+ size_t len = sizeof(*whdr) + sizeof(conn->rxgk.nonce);
+ u32 serial;
+ int ret;
+
+ _enter("{%d}", conn->debug_id);
+
+ get_random_bytes(&conn->rxgk.nonce, sizeof(conn->rxgk.nonce));
+
+ /* We can't use conn->tx_alloc without a lock */
+ page = rxgk_alloc_packet(sizeof(*whdr) + sizeof(conn->rxgk.nonce));
+ if (!page)
+ return -ENOMEM;
+
+ bvec_set_page(&bvec[0], page, len, 0);
+ iov_iter_bvec(&msg.msg_iter, WRITE, bvec, 1, len);
+
+ msg.msg_name = &conn->peer->srx.transport;
+ msg.msg_namelen = conn->peer->srx.transport_len;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = MSG_SPLICE_PAGES;
+
+ whdr = page_address(page);
+ whdr->epoch = htonl(conn->proto.epoch);
+ whdr->cid = htonl(conn->proto.cid);
+ whdr->callNumber = 0;
+ whdr->seq = 0;
+ whdr->type = RXRPC_PACKET_TYPE_CHALLENGE;
+ whdr->flags = conn->out_clientflag;
+ whdr->userStatus = 0;
+ whdr->securityIndex = conn->security_ix;
+ whdr->_rsvd = 0;
+ whdr->serviceId = htons(conn->service_id);
+
+ memcpy(whdr + 1, conn->rxgk.nonce, sizeof(conn->rxgk.nonce));
+
+ serial = rxrpc_get_next_serials(conn, 1);
+ whdr->serial = htonl(serial);
+
+ trace_rxrpc_tx_challenge(conn, serial, 0, *(u32 *)&conn->rxgk.nonce);
+
+ ret = do_udp_sendmsg(conn->local->socket, &msg, len);
+ if (ret > 0)
+ conn->peer->last_tx_at = ktime_get_seconds();
+ __free_page(page);
+
+ if (ret < 0) {
+ trace_rxrpc_tx_fail(conn->debug_id, serial, ret,
+ rxrpc_tx_point_rxgk_challenge);
+ return -EAGAIN;
+ }
+
+ trace_rxrpc_tx_packet(conn->debug_id, whdr,
+ rxrpc_tx_point_rxgk_challenge);
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * Validate a challenge packet.
+ */
+static bool rxgk_validate_challenge(struct rxrpc_connection *conn,
+ struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ u8 nonce[20];
+
+ if (!conn->key) {
+ rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO,
+ rxgk_abort_chall_no_key);
+ return false;
+ }
+
+ if (key_validate(conn->key) < 0) {
+ rxrpc_abort_conn(conn, skb, RXGK_EXPIRED, -EPROTO,
+ rxgk_abort_chall_key_expired);
+ return false;
+ }
+
+ if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header),
+ nonce, sizeof(nonce)) < 0) {
+ rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO,
+ rxgk_abort_chall_short);
+ return false;
+ }
+
+ trace_rxrpc_rx_challenge(conn, sp->hdr.serial, 0, *(u32 *)nonce, 0);
+ return true;
+}
+
+/**
+ * rxgk_kernel_query_challenge - Query RxGK-specific challenge parameters
+ * @challenge: The challenge packet to query
+ *
+ * Return: The Kerberos 5 encoding type for the challenged connection.
+ */
+u32 rxgk_kernel_query_challenge(struct sk_buff *challenge)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(challenge);
+
+ return sp->chall.conn->rxgk.enctype;
+}
+EXPORT_SYMBOL(rxgk_kernel_query_challenge);
+
+/*
+ * Fill out the control message to pass to userspace to inform about the
+ * challenge.
+ */
+static int rxgk_challenge_to_recvmsg(struct rxrpc_connection *conn,
+ struct sk_buff *challenge,
+ struct msghdr *msg)
+{
+ struct rxgk_challenge chall;
+
+ chall.base.service_id = conn->service_id;
+ chall.base.security_index = conn->security_ix;
+ chall.enctype = conn->rxgk.enctype;
+
+ return put_cmsg(msg, SOL_RXRPC, RXRPC_CHALLENGED, sizeof(chall), &chall);
+}
+
+/*
+ * Insert the requisite amount of XDR padding for the length given.
+ */
+static int rxgk_pad_out(struct sk_buff *response, size_t len, size_t offset)
+{
+ __be32 zero = 0;
+ size_t pad = xdr_round_up(len) - len;
+ int ret;
+
+ if (!pad)
+ return 0;
+
+ ret = skb_store_bits(response, offset, &zero, pad);
+ if (ret < 0)
+ return ret;
+ return pad;
+}
+
+/*
+ * Insert the header into the response.
+ */
+static noinline ssize_t rxgk_insert_response_header(struct rxrpc_connection *conn,
+ struct rxgk_context *gk,
+ struct sk_buff *response,
+ size_t offset)
+{
+ struct rxrpc_skb_priv *rsp = rxrpc_skb(response);
+
+ struct {
+ struct rxrpc_wire_header whdr;
+ __be32 start_time_msw;
+ __be32 start_time_lsw;
+ __be32 ticket_len;
+ } h;
+ int ret;
+
+ rsp->resp.kvno = gk->key_number;
+ rsp->resp.version = gk->krb5->etype;
+
+ h.whdr.epoch = htonl(conn->proto.epoch);
+ h.whdr.cid = htonl(conn->proto.cid);
+ h.whdr.callNumber = 0;
+ h.whdr.serial = 0;
+ h.whdr.seq = 0;
+ h.whdr.type = RXRPC_PACKET_TYPE_RESPONSE;
+ h.whdr.flags = conn->out_clientflag;
+ h.whdr.userStatus = 0;
+ h.whdr.securityIndex = conn->security_ix;
+ h.whdr.cksum = htons(gk->key_number);
+ h.whdr.serviceId = htons(conn->service_id);
+ h.start_time_msw = htonl(upper_32_bits(conn->rxgk.start_time));
+ h.start_time_lsw = htonl(lower_32_bits(conn->rxgk.start_time));
+ h.ticket_len = htonl(gk->key->ticket.len);
+
+ ret = skb_store_bits(response, offset, &h, sizeof(h));
+ return ret < 0 ? ret : sizeof(h);
+}
+
+/*
+ * Construct the authenticator to go in the response packet
+ *
+ * struct RXGK_Authenticator {
+ * opaque nonce[20];
+ * opaque appdata<>;
+ * RXGK_Level level;
+ * unsigned int epoch;
+ * unsigned int cid;
+ * unsigned int call_numbers<>;
+ * };
+ */
+static ssize_t rxgk_construct_authenticator(struct rxrpc_connection *conn,
+ struct sk_buff *challenge,
+ const struct krb5_buffer *appdata,
+ struct sk_buff *response,
+ size_t offset)
+{
+ struct {
+ u8 nonce[20];
+ __be32 appdata_len;
+ } a;
+ struct {
+ __be32 level;
+ __be32 epoch;
+ __be32 cid;
+ __be32 call_numbers_count;
+ __be32 call_numbers[4];
+ } b;
+ int ret;
+
+ ret = skb_copy_bits(challenge, sizeof(struct rxrpc_wire_header),
+ a.nonce, sizeof(a.nonce));
+ if (ret < 0)
+ return -EPROTO;
+
+ a.appdata_len = htonl(appdata->len);
+
+ ret = skb_store_bits(response, offset, &a, sizeof(a));
+ if (ret < 0)
+ return ret;
+ offset += sizeof(a);
+
+ if (appdata->len) {
+ ret = skb_store_bits(response, offset, appdata->data, appdata->len);
+ if (ret < 0)
+ return ret;
+ offset += appdata->len;
+
+ ret = rxgk_pad_out(response, appdata->len, offset);
+ if (ret < 0)
+ return ret;
+ offset += ret;
+ }
+
+ b.level = htonl(conn->security_level);
+ b.epoch = htonl(conn->proto.epoch);
+ b.cid = htonl(conn->proto.cid);
+ b.call_numbers_count = htonl(4);
+ b.call_numbers[0] = htonl(conn->channels[0].call_counter);
+ b.call_numbers[1] = htonl(conn->channels[1].call_counter);
+ b.call_numbers[2] = htonl(conn->channels[2].call_counter);
+ b.call_numbers[3] = htonl(conn->channels[3].call_counter);
+
+ ret = skb_store_bits(response, offset, &b, sizeof(b));
+ if (ret < 0)
+ return ret;
+ return sizeof(a) + xdr_round_up(appdata->len) + sizeof(b);
+}
+
+static ssize_t rxgk_encrypt_authenticator(struct rxrpc_connection *conn,
+ struct rxgk_context *gk,
+ struct sk_buff *response,
+ size_t offset,
+ size_t alloc_len,
+ size_t auth_offset,
+ size_t auth_len)
+{
+ struct scatterlist sg[16];
+ int nr_sg;
+
+ sg_init_table(sg, ARRAY_SIZE(sg));
+ nr_sg = skb_to_sgvec(response, sg, offset, alloc_len);
+ if (unlikely(nr_sg < 0))
+ return nr_sg;
+ return crypto_krb5_encrypt(gk->krb5, gk->resp_enc, sg, nr_sg, alloc_len,
+ auth_offset, auth_len, false);
+}
+
+/*
+ * Construct the response.
+ *
+ * struct RXGK_Response {
+ * rxgkTime start_time;
+ * RXGK_Data token;
+ * opaque authenticator<RXGK_MAXAUTHENTICATOR>
+ * };
+ */
+static int rxgk_construct_response(struct rxrpc_connection *conn,
+ struct sk_buff *challenge,
+ struct krb5_buffer *appdata)
+{
+ struct rxrpc_skb_priv *csp, *rsp;
+ struct rxgk_context *gk;
+ struct sk_buff *response;
+ size_t len, auth_len, authx_len, offset, auth_offset, authx_offset;
+ __be32 tmp;
+ int ret;
+
+ gk = rxgk_get_key(conn, NULL);
+ if (IS_ERR(gk))
+ return PTR_ERR(gk);
+
+ auth_len = 20 + (4 + appdata->len) + 12 + (1 + 4) * 4;
+ authx_len = crypto_krb5_how_much_buffer(gk->krb5, KRB5_ENCRYPT_MODE,
+ auth_len, &auth_offset);
+ len = sizeof(struct rxrpc_wire_header) +
+ 8 + (4 + xdr_round_up(gk->key->ticket.len)) + (4 + authx_len);
+
+ response = alloc_skb_with_frags(0, len, 0, &ret, GFP_NOFS);
+ if (!response)
+ goto error;
+ rxrpc_new_skb(response, rxrpc_skb_new_response_rxgk);
+ response->len = len;
+ response->data_len = len;
+
+ ret = rxgk_insert_response_header(conn, gk, response, 0);
+ if (ret < 0)
+ goto error;
+ offset = ret;
+
+ ret = skb_store_bits(response, offset, gk->key->ticket.data, gk->key->ticket.len);
+ if (ret < 0)
+ goto error;
+ offset += gk->key->ticket.len;
+ ret = rxgk_pad_out(response, gk->key->ticket.len, offset);
+ if (ret < 0)
+ goto error;
+
+ authx_offset = offset + ret + 4; /* Leave a gap for the length. */
+
+ ret = rxgk_construct_authenticator(conn, challenge, appdata, response,
+ authx_offset + auth_offset);
+ if (ret < 0)
+ goto error;
+ auth_len = ret;
+
+ ret = rxgk_encrypt_authenticator(conn, gk, response,
+ authx_offset, authx_len,
+ auth_offset, auth_len);
+ if (ret < 0)
+ goto error;
+ authx_len = ret;
+
+ tmp = htonl(authx_len);
+ ret = skb_store_bits(response, authx_offset - 4, &tmp, 4);
+ if (ret < 0)
+ goto error;
+
+ ret = rxgk_pad_out(response, authx_len, authx_offset + authx_len);
+ if (ret < 0)
+ goto error;
+ len = authx_offset + authx_len + ret;
+
+ if (len != response->len) {
+ response->len = len;
+ response->data_len = len;
+ }
+
+ csp = rxrpc_skb(challenge);
+ rsp = rxrpc_skb(response);
+ rsp->resp.len = len;
+ rsp->resp.challenge_serial = csp->hdr.serial;
+ rxrpc_post_response(conn, response);
+ response = NULL;
+ ret = 0;
+
+error:
+ rxrpc_free_skb(response, rxrpc_skb_put_response);
+ rxgk_put(gk);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * Respond to a challenge packet.
+ */
+static int rxgk_respond_to_challenge(struct rxrpc_connection *conn,
+ struct sk_buff *challenge,
+ struct krb5_buffer *appdata)
+{
+ _enter("{%d,%x}", conn->debug_id, key_serial(conn->key));
+
+ if (key_validate(conn->key) < 0)
+ return rxrpc_abort_conn(conn, NULL, RXGK_EXPIRED, -EPROTO,
+ rxgk_abort_chall_key_expired);
+
+ return rxgk_construct_response(conn, challenge, appdata);
+}
+
+static int rxgk_respond_to_challenge_no_appdata(struct rxrpc_connection *conn,
+ struct sk_buff *challenge)
+{
+ struct krb5_buffer appdata = {};
+
+ return rxgk_respond_to_challenge(conn, challenge, &appdata);
+}
+
+/**
+ * rxgk_kernel_respond_to_challenge - Respond to a challenge with appdata
+ * @challenge: The challenge to respond to
+ * @appdata: The application data to include in the RESPONSE authenticator
+ *
+ * Allow a kernel application to respond to a CHALLENGE with application data
+ * to be included in the RxGK RESPONSE Authenticator.
+ *
+ * Return: %0 if successful and a negative error code otherwise.
+ */
+int rxgk_kernel_respond_to_challenge(struct sk_buff *challenge,
+ struct krb5_buffer *appdata)
+{
+ struct rxrpc_skb_priv *csp = rxrpc_skb(challenge);
+
+ return rxgk_respond_to_challenge(csp->chall.conn, challenge, appdata);
+}
+EXPORT_SYMBOL(rxgk_kernel_respond_to_challenge);
+
+/*
+ * Parse sendmsg() control message and respond to challenge. We need to see if
+ * there's an appdata to fish out.
+ */
+static int rxgk_sendmsg_respond_to_challenge(struct sk_buff *challenge,
+ struct msghdr *msg)
+{
+ struct krb5_buffer appdata = {};
+ struct cmsghdr *cmsg;
+
+ for_each_cmsghdr(cmsg, msg) {
+ if (cmsg->cmsg_level != SOL_RXRPC ||
+ cmsg->cmsg_type != RXRPC_RESP_RXGK_APPDATA)
+ continue;
+ if (appdata.data)
+ return -EINVAL;
+ appdata.data = CMSG_DATA(cmsg);
+ appdata.len = cmsg->cmsg_len - sizeof(struct cmsghdr);
+ }
+
+ return rxgk_kernel_respond_to_challenge(challenge, &appdata);
+}
+
+/*
+ * Verify the authenticator.
+ *
+ * struct RXGK_Authenticator {
+ * opaque nonce[20];
+ * opaque appdata<>;
+ * RXGK_Level level;
+ * unsigned int epoch;
+ * unsigned int cid;
+ * unsigned int call_numbers<>;
+ * };
+ */
+static int rxgk_do_verify_authenticator(struct rxrpc_connection *conn,
+ const struct krb5_enctype *krb5,
+ struct sk_buff *skb,
+ __be32 *p, __be32 *end)
+{
+ u32 app_len, call_count, level, epoch, cid, i;
+
+ _enter("");
+
+ if (memcmp(p, conn->rxgk.nonce, 20) != 0)
+ return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO,
+ rxgk_abort_resp_bad_nonce);
+ p += 20 / sizeof(__be32);
+
+ app_len = ntohl(*p++);
+ if (app_len > (end - p) * sizeof(__be32))
+ return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO,
+ rxgk_abort_resp_short_applen);
+
+ p += xdr_round_up(app_len) / sizeof(__be32);
+ if (end - p < 4)
+ return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO,
+ rxgk_abort_resp_short_applen);
+
+ level = ntohl(*p++);
+ epoch = ntohl(*p++);
+ cid = ntohl(*p++);
+ call_count = ntohl(*p++);
+
+ if (level != conn->security_level ||
+ epoch != conn->proto.epoch ||
+ cid != conn->proto.cid ||
+ call_count > 4)
+ return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO,
+ rxgk_abort_resp_bad_param);
+
+ if (end - p < call_count)
+ return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO,
+ rxgk_abort_resp_short_call_list);
+
+ for (i = 0; i < call_count; i++) {
+ u32 call_id = ntohl(*p++);
+
+ if (call_id > INT_MAX)
+ return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO,
+ rxgk_abort_resp_bad_callid);
+
+ if (call_id < conn->channels[i].call_counter)
+ return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO,
+ rxgk_abort_resp_call_ctr);
+
+ if (call_id > conn->channels[i].call_counter) {
+ if (conn->channels[i].call)
+ return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO,
+ rxgk_abort_resp_call_state);
+
+ conn->channels[i].call_counter = call_id;
+ }
+ }
+
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * Extract the authenticator and verify it.
+ */
+static int rxgk_verify_authenticator(struct rxrpc_connection *conn,
+ const struct krb5_enctype *krb5,
+ struct sk_buff *skb,
+ unsigned int auth_offset, unsigned int auth_len)
+{
+ void *auth;
+ __be32 *p;
+ int ret;
+
+ auth = kmalloc(auth_len, GFP_NOFS);
+ if (!auth)
+ return -ENOMEM;
+
+ ret = skb_copy_bits(skb, auth_offset, auth, auth_len);
+ if (ret < 0) {
+ ret = rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO,
+ rxgk_abort_resp_short_auth);
+ goto error;
+ }
+
+ p = auth;
+ ret = rxgk_do_verify_authenticator(conn, krb5, skb, p, p + auth_len);
+error:
+ kfree(auth);
+ return ret;
+}
+
+/*
+ * Verify a response.
+ *
+ * struct RXGK_Response {
+ * rxgkTime start_time;
+ * RXGK_Data token;
+ * opaque authenticator<RXGK_MAXAUTHENTICATOR>
+ * };
+ */
+static int rxgk_verify_response(struct rxrpc_connection *conn,
+ struct sk_buff *skb)
+{
+ const struct krb5_enctype *krb5;
+ struct rxrpc_key_token *token;
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ struct rxgk_response rhdr;
+ struct rxgk_context *gk;
+ struct key *key = NULL;
+ unsigned int offset = sizeof(struct rxrpc_wire_header);
+ unsigned int len = skb->len - sizeof(struct rxrpc_wire_header);
+ unsigned int token_offset, token_len;
+ unsigned int auth_offset, auth_len;
+ __be32 xauth_len;
+ int ret, ec;
+
+ _enter("{%d}", conn->debug_id);
+
+ /* Parse the RXGK_Response object */
+ if (sizeof(rhdr) + sizeof(__be32) > len)
+ goto short_packet;
+
+ if (skb_copy_bits(skb, offset, &rhdr, sizeof(rhdr)) < 0)
+ goto short_packet;
+ offset += sizeof(rhdr);
+ len -= sizeof(rhdr);
+
+ token_offset = offset;
+ token_len = ntohl(rhdr.token_len);
+ if (xdr_round_up(token_len) + sizeof(__be32) > len)
+ goto short_packet;
+
+ trace_rxrpc_rx_response(conn, sp->hdr.serial, 0, sp->hdr.cksum, token_len);
+
+ offset += xdr_round_up(token_len);
+ len -= xdr_round_up(token_len);
+
+ if (skb_copy_bits(skb, offset, &xauth_len, sizeof(xauth_len)) < 0)
+ goto short_packet;
+ offset += sizeof(xauth_len);
+ len -= sizeof(xauth_len);
+
+ auth_offset = offset;
+ auth_len = ntohl(xauth_len);
+ if (auth_len < len)
+ goto short_packet;
+ if (auth_len & 3)
+ goto inconsistent;
+ if (auth_len < 20 + 9 * 4)
+ goto auth_too_short;
+
+ /* We need to extract and decrypt the token and instantiate a session
+ * key for it. This bit, however, is application-specific. If
+ * possible, we use a default parser, but we might end up bumping this
+ * to the app to deal with - which might mean a round trip to
+ * userspace.
+ */
+ ret = rxgk_extract_token(conn, skb, token_offset, token_len, &key);
+ if (ret < 0)
+ goto out;
+
+ /* We now have a key instantiated from the decrypted ticket. We can
+ * pass this to the application so that they can parse the ticket
+ * content and we can use the session key it contains to derive the
+ * keys we need.
+ *
+ * Note that we have to switch enctype at this point as the enctype of
+ * the ticket doesn't necessarily match that of the transport.
+ */
+ token = key->payload.data[0];
+ conn->security_level = token->rxgk->level;
+ conn->rxgk.start_time = __be64_to_cpu(rhdr.start_time);
+
+ gk = rxgk_generate_transport_key(conn, token->rxgk, sp->hdr.cksum, GFP_NOFS);
+ if (IS_ERR(gk)) {
+ ret = PTR_ERR(gk);
+ goto cant_get_token;
+ }
+
+ krb5 = gk->krb5;
+
+ trace_rxrpc_rx_response(conn, sp->hdr.serial, krb5->etype, sp->hdr.cksum, token_len);
+
+ /* Decrypt, parse and verify the authenticator. */
+ ret = rxgk_decrypt_skb(krb5, gk->resp_enc, skb,
+ &auth_offset, &auth_len, &ec);
+ if (ret < 0) {
+ rxrpc_abort_conn(conn, skb, RXGK_SEALEDINCON, ret,
+ rxgk_abort_resp_auth_dec);
+ goto out;
+ }
+
+ ret = rxgk_verify_authenticator(conn, krb5, skb, auth_offset, auth_len);
+ if (ret < 0)
+ goto out;
+
+ conn->key = key;
+ key = NULL;
+ ret = 0;
+out:
+ key_put(key);
+ _leave(" = %d", ret);
+ return ret;
+
+inconsistent:
+ ret = rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO,
+ rxgk_abort_resp_xdr_align);
+ goto out;
+auth_too_short:
+ ret = rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO,
+ rxgk_abort_resp_short_auth);
+ goto out;
+short_packet:
+ ret = rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO,
+ rxgk_abort_resp_short_packet);
+ goto out;
+
+cant_get_token:
+ switch (ret) {
+ case -ENOMEM:
+ goto temporary_error;
+ case -EINVAL:
+ ret = rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EKEYREJECTED,
+ rxgk_abort_resp_internal_error);
+ goto out;
+ case -ENOPKG:
+ ret = rxrpc_abort_conn(conn, skb, KRB5_PROG_KEYTYPE_NOSUPP,
+ -EKEYREJECTED, rxgk_abort_resp_nopkg);
+ goto out;
+ }
+
+temporary_error:
+ /* Ignore the response packet if we got a temporary error such as
+ * ENOMEM. We just want to send the challenge again. Note that we
+ * also come out this way if the ticket decryption fails.
+ */
+ goto out;
+}
+
+/*
+ * clear the connection security
+ */
+static void rxgk_clear(struct rxrpc_connection *conn)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(conn->rxgk.keys); i++)
+ rxgk_put(conn->rxgk.keys[i]);
+}
+
+/*
+ * Initialise the RxGK security service.
+ */
+static int rxgk_init(void)
+{
+ return 0;
+}
+
+/*
+ * Clean up the RxGK security service.
+ */
+static void rxgk_exit(void)
+{
+}
+
+/*
+ * RxRPC YFS GSSAPI-based security
+ */
+const struct rxrpc_security rxgk_yfs = {
+ .name = "yfs-rxgk",
+ .security_index = RXRPC_SECURITY_YFS_RXGK,
+ .no_key_abort = RXGK_NOTAUTH,
+ .init = rxgk_init,
+ .exit = rxgk_exit,
+ .preparse_server_key = rxgk_preparse_server_key,
+ .free_preparse_server_key = rxgk_free_preparse_server_key,
+ .destroy_server_key = rxgk_destroy_server_key,
+ .describe_server_key = rxgk_describe_server_key,
+ .init_connection_security = rxgk_init_connection_security,
+ .alloc_txbuf = rxgk_alloc_txbuf,
+ .secure_packet = rxgk_secure_packet,
+ .verify_packet = rxgk_verify_packet,
+ .free_call_crypto = rxgk_free_call_crypto,
+ .issue_challenge = rxgk_issue_challenge,
+ .validate_challenge = rxgk_validate_challenge,
+ .challenge_to_recvmsg = rxgk_challenge_to_recvmsg,
+ .sendmsg_respond_to_challenge = rxgk_sendmsg_respond_to_challenge,
+ .respond_to_challenge = rxgk_respond_to_challenge_no_appdata,
+ .verify_response = rxgk_verify_response,
+ .clear = rxgk_clear,
+ .default_decode_ticket = rxgk_yfs_decode_ticket,
+};
diff --git a/net/rxrpc/rxgk_app.c b/net/rxrpc/rxgk_app.c
new file mode 100644
index 000000000000..b94b77a1c317
--- /dev/null
+++ b/net/rxrpc/rxgk_app.c
@@ -0,0 +1,286 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Application-specific bits for GSSAPI-based RxRPC security
+ *
+ * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/key-type.h>
+#include "ar-internal.h"
+#include "rxgk_common.h"
+
+/*
+ * Decode a default-style YFS ticket in a response and turn it into an
+ * rxrpc-type key.
+ *
+ * struct rxgk_key {
+ * afs_uint32 enctype;
+ * opaque key<>;
+ * };
+ *
+ * struct RXGK_AuthName {
+ * afs_int32 kind;
+ * opaque data<AUTHDATAMAX>;
+ * opaque display<AUTHPRINTABLEMAX>;
+ * };
+ *
+ * struct RXGK_Token {
+ * rxgk_key K0;
+ * RXGK_Level level;
+ * rxgkTime starttime;
+ * afs_int32 lifetime;
+ * afs_int32 bytelife;
+ * rxgkTime expirationtime;
+ * struct RXGK_AuthName identities<>;
+ * };
+ */
+int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb,
+ unsigned int ticket_offset, unsigned int ticket_len,
+ struct key **_key)
+{
+ struct rxrpc_key_token *token;
+ const struct cred *cred = current_cred(); // TODO - use socket creds
+ struct key *key;
+ size_t pre_ticket_len, payload_len;
+ unsigned int klen, enctype;
+ void *payload, *ticket;
+ __be32 *t, *p, *q, tmp[2];
+ int ret;
+
+ _enter("");
+
+ /* Get the session key length */
+ ret = skb_copy_bits(skb, ticket_offset, tmp, sizeof(tmp));
+ if (ret < 0)
+ return rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO,
+ rxgk_abort_resp_short_yfs_klen);
+ enctype = ntohl(tmp[0]);
+ klen = ntohl(tmp[1]);
+
+ if (klen > ticket_len - 10 * sizeof(__be32))
+ return rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO,
+ rxgk_abort_resp_short_yfs_key);
+
+ pre_ticket_len = ((5 + 14) * sizeof(__be32) +
+ xdr_round_up(klen) +
+ sizeof(__be32));
+ payload_len = pre_ticket_len + xdr_round_up(ticket_len);
+
+ payload = kzalloc(payload_len, GFP_NOFS);
+ if (!payload)
+ return -ENOMEM;
+
+ /* We need to fill out the XDR form for a key payload that we can pass
+ * to add_key(). Start by copying in the ticket so that we can parse
+ * it.
+ */
+ ticket = payload + pre_ticket_len;
+ ret = skb_copy_bits(skb, ticket_offset, ticket, ticket_len);
+ if (ret < 0) {
+ ret = rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO,
+ rxgk_abort_resp_short_yfs_tkt);
+ goto error;
+ }
+
+ /* Fill out the form header. */
+ p = payload;
+ p[0] = htonl(0); /* Flags */
+ p[1] = htonl(1); /* len(cellname) */
+ p[2] = htonl(0x20000000); /* Cellname " " */
+ p[3] = htonl(1); /* #tokens */
+ p[4] = htonl(15 * sizeof(__be32) + xdr_round_up(klen) +
+ xdr_round_up(ticket_len)); /* Token len */
+
+ /* Now fill in the body. Most of this we can just scrape directly from
+ * the ticket.
+ */
+ t = ticket + sizeof(__be32) * 2 + xdr_round_up(klen);
+ q = payload + 5 * sizeof(__be32);
+ q[0] = htonl(RXRPC_SECURITY_YFS_RXGK);
+ q[1] = t[1]; /* begintime - msw */
+ q[2] = t[2]; /* - lsw */
+ q[3] = t[5]; /* endtime - msw */
+ q[4] = t[6]; /* - lsw */
+ q[5] = 0; /* level - msw */
+ q[6] = t[0]; /* - lsw */
+ q[7] = 0; /* lifetime - msw */
+ q[8] = t[3]; /* - lsw */
+ q[9] = 0; /* bytelife - msw */
+ q[10] = t[4]; /* - lsw */
+ q[11] = 0; /* enctype - msw */
+ q[12] = htonl(enctype); /* - lsw */
+ q[13] = htonl(klen); /* Key length */
+
+ q += 14;
+
+ memcpy(q, ticket + sizeof(__be32) * 2, klen);
+ q += xdr_round_up(klen) / 4;
+ q[0] = htonl(ticket_len);
+ q++;
+ if (WARN_ON((unsigned long)q != (unsigned long)ticket)) {
+ ret = -EIO;
+ goto error;
+ }
+
+ /* Ticket read in with skb_copy_bits above */
+ q += xdr_round_up(ticket_len) / 4;
+ if (WARN_ON((unsigned long)q - (unsigned long)payload != payload_len)) {
+ ret = -EIO;
+ goto error;
+ }
+
+ /* Now turn that into a key. */
+ key = key_alloc(&key_type_rxrpc, "x",
+ GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, // TODO: Use socket owner
+ KEY_USR_VIEW,
+ KEY_ALLOC_NOT_IN_QUOTA, NULL);
+ if (IS_ERR(key)) {
+ _leave(" = -ENOMEM [alloc %ld]", PTR_ERR(key));
+ ret = PTR_ERR(key);
+ goto error;
+ }
+
+ _debug("key %d", key_serial(key));
+
+ ret = key_instantiate_and_link(key, payload, payload_len, NULL, NULL);
+ if (ret < 0)
+ goto error_key;
+
+ token = key->payload.data[0];
+ token->no_leak_key = true;
+ *_key = key;
+ key = NULL;
+ ret = 0;
+ goto error;
+
+error_key:
+ key_put(key);
+error:
+ kfree_sensitive(payload);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * Extract the token and set up a session key from the details.
+ *
+ * struct RXGK_TokenContainer {
+ * afs_int32 kvno;
+ * afs_int32 enctype;
+ * opaque encrypted_token<>;
+ * };
+ *
+ * [tools.ietf.org/html/draft-wilkinson-afs3-rxgk-afs-08 sec 6.1]
+ */
+int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb,
+ unsigned int token_offset, unsigned int token_len,
+ struct key **_key)
+{
+ const struct krb5_enctype *krb5;
+ const struct krb5_buffer *server_secret;
+ struct crypto_aead *token_enc = NULL;
+ struct key *server_key;
+ unsigned int ticket_offset, ticket_len;
+ u32 kvno, enctype;
+ int ret, ec;
+
+ struct {
+ __be32 kvno;
+ __be32 enctype;
+ __be32 token_len;
+ } container;
+
+ /* Decode the RXGK_TokenContainer object. This tells us which server
+ * key we should be using. We can then fetch the key, get the secret
+ * and set up the crypto to extract the token.
+ */
+ if (skb_copy_bits(skb, token_offset, &container, sizeof(container)) < 0)
+ return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO,
+ rxgk_abort_resp_tok_short);
+
+ kvno = ntohl(container.kvno);
+ enctype = ntohl(container.enctype);
+ ticket_len = ntohl(container.token_len);
+ ticket_offset = token_offset + sizeof(container);
+
+ if (xdr_round_up(ticket_len) > token_len - 3 * 4)
+ return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO,
+ rxgk_abort_resp_tok_short);
+
+ _debug("KVNO %u", kvno);
+ _debug("ENC %u", enctype);
+ _debug("TLEN %u", ticket_len);
+
+ server_key = rxrpc_look_up_server_security(conn, skb, kvno, enctype);
+ if (IS_ERR(server_key))
+ goto cant_get_server_key;
+
+ down_read(&server_key->sem);
+ server_secret = (const void *)&server_key->payload.data[2];
+ ret = rxgk_set_up_token_cipher(server_secret, &token_enc, enctype, &krb5, GFP_NOFS);
+ up_read(&server_key->sem);
+ key_put(server_key);
+ if (ret < 0)
+ goto cant_get_token;
+
+ /* We can now decrypt and parse the token/ticket. This allows us to
+ * gain access to K0, from which we can derive the transport key and
+ * thence decode the authenticator.
+ */
+ ret = rxgk_decrypt_skb(krb5, token_enc, skb,
+ &ticket_offset, &ticket_len, &ec);
+ crypto_free_aead(token_enc);
+ token_enc = NULL;
+ if (ret < 0)
+ return rxrpc_abort_conn(conn, skb, ec, ret,
+ rxgk_abort_resp_tok_dec);
+
+ ret = conn->security->default_decode_ticket(conn, skb, ticket_offset,
+ ticket_len, _key);
+ if (ret < 0)
+ goto cant_get_token;
+
+ _leave(" = 0");
+ return ret;
+
+cant_get_server_key:
+ ret = PTR_ERR(server_key);
+ switch (ret) {
+ case -ENOMEM:
+ goto temporary_error;
+ case -ENOKEY:
+ case -EKEYREJECTED:
+ case -EKEYEXPIRED:
+ case -EKEYREVOKED:
+ case -EPERM:
+ return rxrpc_abort_conn(conn, skb, RXGK_BADKEYNO, -EKEYREJECTED,
+ rxgk_abort_resp_tok_nokey);
+ default:
+ return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EKEYREJECTED,
+ rxgk_abort_resp_tok_keyerr);
+ }
+
+cant_get_token:
+ switch (ret) {
+ case -ENOMEM:
+ goto temporary_error;
+ case -EINVAL:
+ return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EKEYREJECTED,
+ rxgk_abort_resp_tok_internal_error);
+ case -ENOPKG:
+ return rxrpc_abort_conn(conn, skb, KRB5_PROG_KEYTYPE_NOSUPP,
+ -EKEYREJECTED, rxgk_abort_resp_tok_nopkg);
+ }
+
+temporary_error:
+ /* Ignore the response packet if we got a temporary error such as
+ * ENOMEM. We just want to send the challenge again. Note that we
+ * also come out this way if the ticket decryption fails.
+ */
+ return ret;
+}
diff --git a/net/rxrpc/rxgk_common.h b/net/rxrpc/rxgk_common.h
new file mode 100644
index 000000000000..7370a5655985
--- /dev/null
+++ b/net/rxrpc/rxgk_common.h
@@ -0,0 +1,139 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Common bits for GSSAPI-based RxRPC security.
+ *
+ * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <crypto/krb5.h>
+#include <crypto/skcipher.h>
+#include <crypto/hash.h>
+
+/*
+ * Per-key number context. This is replaced when the connection is rekeyed.
+ */
+struct rxgk_context {
+ refcount_t usage;
+ unsigned int key_number; /* Rekeying number (goes in the rx header) */
+ unsigned long flags;
+#define RXGK_TK_NEEDS_REKEY 0 /* Set if this needs rekeying */
+ unsigned long expiry; /* Expiration time of this key */
+ long long bytes_remaining; /* Remaining Tx lifetime of this key */
+ const struct krb5_enctype *krb5; /* RxGK encryption type */
+ const struct rxgk_key *key;
+
+ /* We need up to 7 keys derived from the transport key, but we don't
+ * actually need the transport key. Each key is derived by
+ * DK(TK,constant).
+ */
+ struct crypto_aead *tx_enc; /* Transmission key */
+ struct crypto_aead *rx_enc; /* Reception key */
+ struct crypto_shash *tx_Kc; /* Transmission checksum key */
+ struct crypto_shash *rx_Kc; /* Reception checksum key */
+ struct crypto_aead *resp_enc; /* Response packet enc key */
+};
+
+#define xdr_round_up(x) (round_up((x), sizeof(__be32)))
+#define xdr_object_len(x) (4 + xdr_round_up(x))
+
+/*
+ * rxgk_app.c
+ */
+int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb,
+ unsigned int ticket_offset, unsigned int ticket_len,
+ struct key **_key);
+int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb,
+ unsigned int token_offset, unsigned int token_len,
+ struct key **_key);
+
+/*
+ * rxgk_kdf.c
+ */
+void rxgk_put(struct rxgk_context *gk);
+struct rxgk_context *rxgk_generate_transport_key(struct rxrpc_connection *conn,
+ const struct rxgk_key *key,
+ unsigned int key_number,
+ gfp_t gfp);
+int rxgk_set_up_token_cipher(const struct krb5_buffer *server_key,
+ struct crypto_aead **token_key,
+ unsigned int enctype,
+ const struct krb5_enctype **_krb5,
+ gfp_t gfp);
+
+/*
+ * Apply decryption and checksumming functions to part of an skbuff. The
+ * offset and length are updated to reflect the actual content of the encrypted
+ * region.
+ */
+static inline
+int rxgk_decrypt_skb(const struct krb5_enctype *krb5,
+ struct crypto_aead *aead,
+ struct sk_buff *skb,
+ unsigned int *_offset, unsigned int *_len,
+ int *_error_code)
+{
+ struct scatterlist sg[16];
+ size_t offset = 0, len = *_len;
+ int nr_sg, ret;
+
+ sg_init_table(sg, ARRAY_SIZE(sg));
+ nr_sg = skb_to_sgvec(skb, sg, *_offset, len);
+ if (unlikely(nr_sg < 0))
+ return nr_sg;
+
+ ret = crypto_krb5_decrypt(krb5, aead, sg, nr_sg,
+ &offset, &len);
+ switch (ret) {
+ case 0:
+ *_offset += offset;
+ *_len = len;
+ break;
+ case -EPROTO:
+ case -EBADMSG:
+ *_error_code = RXGK_SEALEDINCON;
+ break;
+ default:
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * Check the MIC on a region of an skbuff. The offset and length are updated
+ * to reflect the actual content of the secure region.
+ */
+static inline
+int rxgk_verify_mic_skb(const struct krb5_enctype *krb5,
+ struct crypto_shash *shash,
+ const struct krb5_buffer *metadata,
+ struct sk_buff *skb,
+ unsigned int *_offset, unsigned int *_len,
+ u32 *_error_code)
+{
+ struct scatterlist sg[16];
+ size_t offset = 0, len = *_len;
+ int nr_sg, ret;
+
+ sg_init_table(sg, ARRAY_SIZE(sg));
+ nr_sg = skb_to_sgvec(skb, sg, *_offset, len);
+ if (unlikely(nr_sg < 0))
+ return nr_sg;
+
+ ret = crypto_krb5_verify_mic(krb5, shash, metadata, sg, nr_sg,
+ &offset, &len);
+ switch (ret) {
+ case 0:
+ *_offset += offset;
+ *_len = len;
+ break;
+ case -EPROTO:
+ case -EBADMSG:
+ *_error_code = RXGK_SEALEDINCON;
+ break;
+ default:
+ break;
+ }
+
+ return ret;
+}
diff --git a/net/rxrpc/rxgk_kdf.c b/net/rxrpc/rxgk_kdf.c
new file mode 100644
index 000000000000..b4db5aa30e5b
--- /dev/null
+++ b/net/rxrpc/rxgk_kdf.c
@@ -0,0 +1,288 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* RxGK transport key derivation.
+ *
+ * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/key-type.h>
+#include <linux/slab.h>
+#include <keys/rxrpc-type.h>
+#include "ar-internal.h"
+#include "rxgk_common.h"
+
+#define round16(x) (((x) + 15) & ~15)
+
+/*
+ * Constants used to derive the keys and hmacs actually used for doing stuff.
+ */
+#define RXGK_CLIENT_ENC_PACKET 1026U // 0x402
+#define RXGK_CLIENT_MIC_PACKET 1027U // 0x403
+#define RXGK_SERVER_ENC_PACKET 1028U // 0x404
+#define RXGK_SERVER_MIC_PACKET 1029U // 0x405
+#define RXGK_CLIENT_ENC_RESPONSE 1030U // 0x406
+#define RXGK_SERVER_ENC_TOKEN 1036U // 0x40c
+
+static void rxgk_free(struct rxgk_context *gk)
+{
+ if (gk->tx_Kc)
+ crypto_free_shash(gk->tx_Kc);
+ if (gk->rx_Kc)
+ crypto_free_shash(gk->rx_Kc);
+ if (gk->tx_enc)
+ crypto_free_aead(gk->tx_enc);
+ if (gk->rx_enc)
+ crypto_free_aead(gk->rx_enc);
+ if (gk->resp_enc)
+ crypto_free_aead(gk->resp_enc);
+ kfree(gk);
+}
+
+void rxgk_put(struct rxgk_context *gk)
+{
+ if (gk && refcount_dec_and_test(&gk->usage))
+ rxgk_free(gk);
+}
+
+/*
+ * Transport key derivation function.
+ *
+ * TK = random-to-key(PRF+(K0, L,
+ * epoch || cid || start_time || key_number))
+ * [tools.ietf.org/html/draft-wilkinson-afs3-rxgk-11 sec 8.3]
+ */
+static int rxgk_derive_transport_key(struct rxrpc_connection *conn,
+ struct rxgk_context *gk,
+ const struct rxgk_key *rxgk,
+ struct krb5_buffer *TK,
+ gfp_t gfp)
+{
+ const struct krb5_enctype *krb5 = gk->krb5;
+ struct krb5_buffer conn_info;
+ unsigned int L = krb5->key_bytes;
+ __be32 *info;
+ u8 *buffer;
+ int ret;
+
+ _enter("");
+
+ conn_info.len = sizeof(__be32) * 5;
+
+ buffer = kzalloc(round16(conn_info.len), gfp);
+ if (!buffer)
+ return -ENOMEM;
+
+ conn_info.data = buffer;
+
+ info = (__be32 *)conn_info.data;
+ info[0] = htonl(conn->proto.epoch);
+ info[1] = htonl(conn->proto.cid);
+ info[2] = htonl(conn->rxgk.start_time >> 32);
+ info[3] = htonl(conn->rxgk.start_time >> 0);
+ info[4] = htonl(gk->key_number);
+
+ ret = crypto_krb5_calc_PRFplus(krb5, &rxgk->key, L, &conn_info, TK, gfp);
+ kfree_sensitive(buffer);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * Set up the ciphers for the usage keys.
+ */
+static int rxgk_set_up_ciphers(struct rxrpc_connection *conn,
+ struct rxgk_context *gk,
+ const struct rxgk_key *rxgk,
+ gfp_t gfp)
+{
+ const struct krb5_enctype *krb5 = gk->krb5;
+ struct crypto_shash *shash;
+ struct crypto_aead *aead;
+ struct krb5_buffer TK;
+ bool service = rxrpc_conn_is_service(conn);
+ int ret;
+ u8 *buffer;
+
+ buffer = kzalloc(krb5->key_bytes, gfp);
+ if (!buffer)
+ return -ENOMEM;
+
+ TK.len = krb5->key_bytes;
+ TK.data = buffer;
+
+ ret = rxgk_derive_transport_key(conn, gk, rxgk, &TK, gfp);
+ if (ret < 0)
+ goto out;
+
+ aead = crypto_krb5_prepare_encryption(krb5, &TK, RXGK_CLIENT_ENC_RESPONSE, gfp);
+ if (IS_ERR(aead))
+ goto aead_error;
+ gk->resp_enc = aead;
+
+ if (crypto_aead_blocksize(gk->resp_enc) != krb5->block_len ||
+ crypto_aead_authsize(gk->resp_enc) != krb5->cksum_len) {
+ pr_notice("algo inconsistent with krb5 table %u!=%u or %u!=%u\n",
+ crypto_aead_blocksize(gk->resp_enc), krb5->block_len,
+ crypto_aead_authsize(gk->resp_enc), krb5->cksum_len);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (service) {
+ switch (conn->security_level) {
+ case RXRPC_SECURITY_AUTH:
+ shash = crypto_krb5_prepare_checksum(
+ krb5, &TK, RXGK_SERVER_MIC_PACKET, gfp);
+ if (IS_ERR(shash))
+ goto hash_error;
+ gk->tx_Kc = shash;
+ shash = crypto_krb5_prepare_checksum(
+ krb5, &TK, RXGK_CLIENT_MIC_PACKET, gfp);
+ if (IS_ERR(shash))
+ goto hash_error;
+ gk->rx_Kc = shash;
+ break;
+ case RXRPC_SECURITY_ENCRYPT:
+ aead = crypto_krb5_prepare_encryption(
+ krb5, &TK, RXGK_SERVER_ENC_PACKET, gfp);
+ if (IS_ERR(aead))
+ goto aead_error;
+ gk->tx_enc = aead;
+ aead = crypto_krb5_prepare_encryption(
+ krb5, &TK, RXGK_CLIENT_ENC_PACKET, gfp);
+ if (IS_ERR(aead))
+ goto aead_error;
+ gk->rx_enc = aead;
+ break;
+ }
+ } else {
+ switch (conn->security_level) {
+ case RXRPC_SECURITY_AUTH:
+ shash = crypto_krb5_prepare_checksum(
+ krb5, &TK, RXGK_CLIENT_MIC_PACKET, gfp);
+ if (IS_ERR(shash))
+ goto hash_error;
+ gk->tx_Kc = shash;
+ shash = crypto_krb5_prepare_checksum(
+ krb5, &TK, RXGK_SERVER_MIC_PACKET, gfp);
+ if (IS_ERR(shash))
+ goto hash_error;
+ gk->rx_Kc = shash;
+ break;
+ case RXRPC_SECURITY_ENCRYPT:
+ aead = crypto_krb5_prepare_encryption(
+ krb5, &TK, RXGK_CLIENT_ENC_PACKET, gfp);
+ if (IS_ERR(aead))
+ goto aead_error;
+ gk->tx_enc = aead;
+ aead = crypto_krb5_prepare_encryption(
+ krb5, &TK, RXGK_SERVER_ENC_PACKET, gfp);
+ if (IS_ERR(aead))
+ goto aead_error;
+ gk->rx_enc = aead;
+ break;
+ }
+ }
+
+ ret = 0;
+out:
+ kfree_sensitive(buffer);
+ return ret;
+aead_error:
+ ret = PTR_ERR(aead);
+ goto out;
+hash_error:
+ ret = PTR_ERR(shash);
+ goto out;
+}
+
+/*
+ * Derive a transport key for a connection and then derive a bunch of usage
+ * keys from it and set up ciphers using them.
+ */
+struct rxgk_context *rxgk_generate_transport_key(struct rxrpc_connection *conn,
+ const struct rxgk_key *key,
+ unsigned int key_number,
+ gfp_t gfp)
+{
+ struct rxgk_context *gk;
+ unsigned long lifetime;
+ int ret = -ENOPKG;
+
+ _enter("");
+
+ gk = kzalloc(sizeof(*gk), GFP_KERNEL);
+ if (!gk)
+ return ERR_PTR(-ENOMEM);
+ refcount_set(&gk->usage, 1);
+ gk->key = key;
+ gk->key_number = key_number;
+
+ gk->krb5 = crypto_krb5_find_enctype(key->enctype);
+ if (!gk->krb5)
+ goto err_tk;
+
+ ret = rxgk_set_up_ciphers(conn, gk, key, gfp);
+ if (ret)
+ goto err_tk;
+
+ /* Set the remaining number of bytes encrypted with this key that may
+ * be transmitted before rekeying. Note that the spec has been
+ * interpreted differently on this point...
+ */
+ switch (key->bytelife) {
+ case 0:
+ case 63:
+ gk->bytes_remaining = LLONG_MAX;
+ break;
+ case 1 ... 62:
+ gk->bytes_remaining = 1LL << key->bytelife;
+ break;
+ default:
+ gk->bytes_remaining = key->bytelife;
+ break;
+ }
+
+ /* Set the time after which rekeying must occur */
+ if (key->lifetime) {
+ lifetime = min_t(u64, key->lifetime, INT_MAX / HZ);
+ lifetime *= HZ;
+ } else {
+ lifetime = MAX_JIFFY_OFFSET;
+ }
+ gk->expiry = jiffies + lifetime;
+ return gk;
+
+err_tk:
+ rxgk_put(gk);
+ _leave(" = %d", ret);
+ return ERR_PTR(ret);
+}
+
+/*
+ * Use the server secret key to set up the ciphers that will be used to extract
+ * the token from a response packet.
+ */
+int rxgk_set_up_token_cipher(const struct krb5_buffer *server_key,
+ struct crypto_aead **token_aead,
+ unsigned int enctype,
+ const struct krb5_enctype **_krb5,
+ gfp_t gfp)
+{
+ const struct krb5_enctype *krb5;
+ struct crypto_aead *aead;
+
+ krb5 = crypto_krb5_find_enctype(enctype);
+ if (!krb5)
+ return -ENOPKG;
+
+ aead = crypto_krb5_prepare_encryption(krb5, server_key, RXGK_SERVER_ENC_TOKEN, gfp);
+ if (IS_ERR(aead))
+ return PTR_ERR(aead);
+
+ *_krb5 = krb5;
+ *token_aead = aead;
+ return 0;
+}
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index f1a68270862d..3657c0661cdc 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -148,14 +148,14 @@ error:
static struct rxrpc_txbuf *rxkad_alloc_txbuf(struct rxrpc_call *call, size_t remain, gfp_t gfp)
{
struct rxrpc_txbuf *txb;
- size_t shdr, space;
+ size_t shdr, alloc, limit, part;
- remain = min(remain, 65535 - sizeof(struct rxrpc_wire_header));
+ remain = umin(remain, 65535 - sizeof(struct rxrpc_wire_header));
switch (call->conn->security_level) {
default:
- space = min_t(size_t, remain, RXRPC_JUMBO_DATALEN);
- return rxrpc_alloc_data_txbuf(call, space, 0, gfp);
+ alloc = umin(remain, RXRPC_JUMBO_DATALEN);
+ return rxrpc_alloc_data_txbuf(call, alloc, 1, gfp);
case RXRPC_SECURITY_AUTH:
shdr = sizeof(struct rxkad_level1_hdr);
break;
@@ -164,15 +164,23 @@ static struct rxrpc_txbuf *rxkad_alloc_txbuf(struct rxrpc_call *call, size_t rem
break;
}
- space = min_t(size_t, round_down(RXRPC_JUMBO_DATALEN, RXKAD_ALIGN), remain + shdr);
- space = round_up(space, RXKAD_ALIGN);
+ limit = round_down(RXRPC_JUMBO_DATALEN, RXKAD_ALIGN) - shdr;
+ if (remain < limit) {
+ part = remain;
+ alloc = round_up(shdr + part, RXKAD_ALIGN);
+ } else {
+ part = limit;
+ alloc = RXRPC_JUMBO_DATALEN;
+ }
- txb = rxrpc_alloc_data_txbuf(call, space, RXKAD_ALIGN, gfp);
+ txb = rxrpc_alloc_data_txbuf(call, alloc, RXKAD_ALIGN, gfp);
if (!txb)
return NULL;
- txb->offset += shdr;
- txb->space -= shdr;
+ txb->crypto_header = 0;
+ txb->sec_header = shdr;
+ txb->offset += shdr;
+ txb->space = part;
return txb;
}
@@ -251,8 +259,7 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
struct rxrpc_txbuf *txb,
struct skcipher_request *req)
{
- struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base;
- struct rxkad_level1_hdr *hdr = (void *)(whdr + 1);
+ struct rxkad_level1_hdr *hdr = txb->data;
struct rxrpc_crypt iv;
struct scatterlist sg;
size_t pad;
@@ -263,13 +270,13 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
check = txb->seq ^ call->call_id;
hdr->data_size = htonl((u32)check << 16 | txb->len);
- txb->len += sizeof(struct rxkad_level1_hdr);
- pad = txb->len;
+ txb->pkt_len = sizeof(struct rxkad_level1_hdr) + txb->len;
+ pad = txb->pkt_len;
pad = RXKAD_ALIGN - pad;
pad &= RXKAD_ALIGN - 1;
if (pad) {
- memset(txb->kvec[0].iov_base + txb->offset, 0, pad);
- txb->len += pad;
+ memset(txb->data + txb->offset, 0, pad);
+ txb->pkt_len += pad;
}
/* start the encryption afresh */
@@ -294,11 +301,10 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
struct skcipher_request *req)
{
const struct rxrpc_key_token *token;
- struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base;
- struct rxkad_level2_hdr *rxkhdr = (void *)(whdr + 1);
+ struct rxkad_level2_hdr *rxkhdr = txb->data;
struct rxrpc_crypt iv;
struct scatterlist sg;
- size_t pad;
+ size_t content, pad;
u16 check;
int ret;
@@ -309,23 +315,20 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
rxkhdr->data_size = htonl(txb->len | (u32)check << 16);
rxkhdr->checksum = 0;
- txb->len += sizeof(struct rxkad_level2_hdr);
- pad = txb->len;
- pad = RXKAD_ALIGN - pad;
- pad &= RXKAD_ALIGN - 1;
- if (pad) {
- memset(txb->kvec[0].iov_base + txb->offset, 0, pad);
- txb->len += pad;
- }
+ content = sizeof(struct rxkad_level2_hdr) + txb->len;
+ txb->pkt_len = round_up(content, RXKAD_ALIGN);
+ pad = txb->pkt_len - content;
+ if (pad)
+ memset(txb->data + txb->offset, 0, pad);
/* encrypt from the session key */
token = call->conn->key->payload.data[0];
memcpy(&iv, token->kad->session_key, sizeof(iv));
- sg_init_one(&sg, rxkhdr, txb->len);
+ sg_init_one(&sg, rxkhdr, txb->pkt_len);
skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher);
skcipher_request_set_callback(req, 0, NULL, NULL);
- skcipher_request_set_crypt(req, &sg, &sg, txb->len, iv.x);
+ skcipher_request_set_crypt(req, &sg, &sg, txb->pkt_len, iv.x);
ret = crypto_skcipher_encrypt(req);
skcipher_request_zero(req);
return ret;
@@ -384,19 +387,32 @@ static int rxkad_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
switch (call->conn->security_level) {
case RXRPC_SECURITY_PLAIN:
+ txb->pkt_len = txb->len;
ret = 0;
break;
case RXRPC_SECURITY_AUTH:
ret = rxkad_secure_packet_auth(call, txb, req);
+ if (txb->alloc_size == RXRPC_JUMBO_DATALEN)
+ txb->jumboable = true;
break;
case RXRPC_SECURITY_ENCRYPT:
ret = rxkad_secure_packet_encrypt(call, txb, req);
+ if (txb->alloc_size == RXRPC_JUMBO_DATALEN)
+ txb->jumboable = true;
break;
default:
ret = -EPERM;
break;
}
+ /* Clear excess space in the packet */
+ if (txb->pkt_len < txb->alloc_size) {
+ size_t gap = txb->alloc_size - txb->pkt_len;
+ void *p = txb->data;
+
+ memset(p + txb->pkt_len, 0, gap);
+ }
+
skcipher_request_free(req);
_leave(" = %d [set %x]", ret, y);
return ret;
@@ -669,6 +685,8 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn)
serial = rxrpc_get_next_serial(conn);
whdr.serial = htonl(serial);
+ trace_rxrpc_tx_challenge(conn, serial, 0, conn->rxkad.nonce);
+
ret = kernel_sendmsg(conn->local->socket, &msg, iov, 2, len);
if (ret < 0) {
trace_rxrpc_tx_fail(conn->debug_id, serial, ret,
@@ -684,62 +702,6 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn)
}
/*
- * send a Kerberos security response
- */
-static int rxkad_send_response(struct rxrpc_connection *conn,
- struct rxrpc_host_header *hdr,
- struct rxkad_response *resp,
- const struct rxkad_key *s2)
-{
- struct rxrpc_wire_header whdr;
- struct msghdr msg;
- struct kvec iov[3];
- size_t len;
- u32 serial;
- int ret;
-
- _enter("");
-
- msg.msg_name = &conn->peer->srx.transport;
- msg.msg_namelen = conn->peer->srx.transport_len;
- msg.msg_control = NULL;
- msg.msg_controllen = 0;
- msg.msg_flags = 0;
-
- memset(&whdr, 0, sizeof(whdr));
- whdr.epoch = htonl(hdr->epoch);
- whdr.cid = htonl(hdr->cid);
- whdr.type = RXRPC_PACKET_TYPE_RESPONSE;
- whdr.flags = conn->out_clientflag;
- whdr.securityIndex = hdr->securityIndex;
- whdr.serviceId = htons(hdr->serviceId);
-
- iov[0].iov_base = &whdr;
- iov[0].iov_len = sizeof(whdr);
- iov[1].iov_base = resp;
- iov[1].iov_len = sizeof(*resp);
- iov[2].iov_base = (void *)s2->ticket;
- iov[2].iov_len = s2->ticket_len;
-
- len = iov[0].iov_len + iov[1].iov_len + iov[2].iov_len;
-
- serial = rxrpc_get_next_serial(conn);
- whdr.serial = htonl(serial);
-
- rxrpc_local_dont_fragment(conn->local, false);
- ret = kernel_sendmsg(conn->local->socket, &msg, iov, 3, len);
- if (ret < 0) {
- trace_rxrpc_tx_fail(conn->debug_id, serial, ret,
- rxrpc_tx_point_rxkad_response);
- return -EAGAIN;
- }
-
- conn->peer->last_tx_at = ktime_get_seconds();
- _leave(" = 0");
- return 0;
-}
-
-/*
* calculate the response checksum
*/
static void rxkad_calc_response_checksum(struct rxkad_response *response)
@@ -758,12 +720,21 @@ static void rxkad_calc_response_checksum(struct rxkad_response *response)
* encrypt the response packet
*/
static int rxkad_encrypt_response(struct rxrpc_connection *conn,
- struct rxkad_response *resp,
+ struct sk_buff *response,
const struct rxkad_key *s2)
{
struct skcipher_request *req;
struct rxrpc_crypt iv;
struct scatterlist sg[1];
+ size_t encsize = sizeof(((struct rxkad_response *)0)->encrypted);
+ int ret;
+
+ sg_init_table(sg, ARRAY_SIZE(sg));
+ ret = skb_to_sgvec(response, sg,
+ sizeof(struct rxrpc_wire_header) +
+ offsetof(struct rxkad_response, encrypted), encsize);
+ if (ret < 0)
+ return ret;
req = skcipher_request_alloc(&conn->rxkad.cipher->base, GFP_NOFS);
if (!req)
@@ -772,89 +743,206 @@ static int rxkad_encrypt_response(struct rxrpc_connection *conn,
/* continue encrypting from where we left off */
memcpy(&iv, s2->session_key, sizeof(iv));
- sg_init_table(sg, 1);
- sg_set_buf(sg, &resp->encrypted, sizeof(resp->encrypted));
skcipher_request_set_sync_tfm(req, conn->rxkad.cipher);
skcipher_request_set_callback(req, 0, NULL, NULL);
- skcipher_request_set_crypt(req, sg, sg, sizeof(resp->encrypted), iv.x);
- crypto_skcipher_encrypt(req);
+ skcipher_request_set_crypt(req, sg, sg, encsize, iv.x);
+ ret = crypto_skcipher_encrypt(req);
skcipher_request_free(req);
- return 0;
+ return ret;
}
/*
- * respond to a challenge packet
+ * Validate a challenge packet.
*/
-static int rxkad_respond_to_challenge(struct rxrpc_connection *conn,
- struct sk_buff *skb)
+static bool rxkad_validate_challenge(struct rxrpc_connection *conn,
+ struct sk_buff *skb)
{
- const struct rxrpc_key_token *token;
struct rxkad_challenge challenge;
- struct rxkad_response *resp;
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- u32 version, nonce, min_level;
- int ret = -EPROTO;
+ u32 version, min_level;
+ int ret;
_enter("{%d,%x}", conn->debug_id, key_serial(conn->key));
- if (!conn->key)
- return rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO,
- rxkad_abort_chall_no_key);
+ if (!conn->key) {
+ rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO,
+ rxkad_abort_chall_no_key);
+ return false;
+ }
ret = key_validate(conn->key);
- if (ret < 0)
- return rxrpc_abort_conn(conn, skb, RXKADEXPIRED, ret,
- rxkad_abort_chall_key_expired);
+ if (ret < 0) {
+ rxrpc_abort_conn(conn, skb, RXKADEXPIRED, ret,
+ rxkad_abort_chall_key_expired);
+ return false;
+ }
if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header),
- &challenge, sizeof(challenge)) < 0)
- return rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO,
- rxkad_abort_chall_short);
+ &challenge, sizeof(challenge)) < 0) {
+ rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO,
+ rxkad_abort_chall_short);
+ return false;
+ }
version = ntohl(challenge.version);
- nonce = ntohl(challenge.nonce);
+ sp->chall.rxkad_nonce = ntohl(challenge.nonce);
min_level = ntohl(challenge.min_level);
- trace_rxrpc_rx_challenge(conn, sp->hdr.serial, version, nonce, min_level);
+ trace_rxrpc_rx_challenge(conn, sp->hdr.serial, version,
+ sp->chall.rxkad_nonce, min_level);
+
+ if (version != RXKAD_VERSION) {
+ rxrpc_abort_conn(conn, skb, RXKADINCONSISTENCY, -EPROTO,
+ rxkad_abort_chall_version);
+ return false;
+ }
+
+ if (conn->security_level < min_level) {
+ rxrpc_abort_conn(conn, skb, RXKADLEVELFAIL, -EACCES,
+ rxkad_abort_chall_level);
+ return false;
+ }
+ return true;
+}
+
+/*
+ * Insert the header into the response.
+ */
+static noinline
+int rxkad_insert_response_header(struct rxrpc_connection *conn,
+ const struct rxrpc_key_token *token,
+ struct sk_buff *challenge,
+ struct sk_buff *response,
+ size_t *offset)
+{
+ struct rxrpc_skb_priv *csp = rxrpc_skb(challenge);
+ struct {
+ struct rxrpc_wire_header whdr;
+ struct rxkad_response resp;
+ } h;
+ int ret;
+
+ h.whdr.epoch = htonl(conn->proto.epoch);
+ h.whdr.cid = htonl(conn->proto.cid);
+ h.whdr.callNumber = 0;
+ h.whdr.serial = 0;
+ h.whdr.seq = 0;
+ h.whdr.type = RXRPC_PACKET_TYPE_RESPONSE;
+ h.whdr.flags = conn->out_clientflag;
+ h.whdr.userStatus = 0;
+ h.whdr.securityIndex = conn->security_ix;
+ h.whdr.cksum = 0;
+ h.whdr.serviceId = htons(conn->service_id);
+ h.resp.version = htonl(RXKAD_VERSION);
+ h.resp.__pad = 0;
+ h.resp.encrypted.epoch = htonl(conn->proto.epoch);
+ h.resp.encrypted.cid = htonl(conn->proto.cid);
+ h.resp.encrypted.checksum = 0;
+ h.resp.encrypted.securityIndex = htonl(conn->security_ix);
+ h.resp.encrypted.call_id[0] = htonl(conn->channels[0].call_counter);
+ h.resp.encrypted.call_id[1] = htonl(conn->channels[1].call_counter);
+ h.resp.encrypted.call_id[2] = htonl(conn->channels[2].call_counter);
+ h.resp.encrypted.call_id[3] = htonl(conn->channels[3].call_counter);
+ h.resp.encrypted.inc_nonce = htonl(csp->chall.rxkad_nonce + 1);
+ h.resp.encrypted.level = htonl(conn->security_level);
+ h.resp.kvno = htonl(token->kad->kvno);
+ h.resp.ticket_len = htonl(token->kad->ticket_len);
+
+ rxkad_calc_response_checksum(&h.resp);
+
+ ret = skb_store_bits(response, *offset, &h, sizeof(h));
+ *offset += sizeof(h);
+ return ret;
+}
- if (version != RXKAD_VERSION)
- return rxrpc_abort_conn(conn, skb, RXKADINCONSISTENCY, -EPROTO,
- rxkad_abort_chall_version);
+/*
+ * respond to a challenge packet
+ */
+static int rxkad_respond_to_challenge(struct rxrpc_connection *conn,
+ struct sk_buff *challenge)
+{
+ const struct rxrpc_key_token *token;
+ struct rxrpc_skb_priv *csp, *rsp;
+ struct sk_buff *response;
+ size_t len, offset = 0;
+ int ret = -EPROTO;
- if (conn->security_level < min_level)
- return rxrpc_abort_conn(conn, skb, RXKADLEVELFAIL, -EACCES,
- rxkad_abort_chall_level);
+ _enter("{%d,%x}", conn->debug_id, key_serial(conn->key));
+
+ ret = key_validate(conn->key);
+ if (ret < 0)
+ return rxrpc_abort_conn(conn, challenge, RXKADEXPIRED, ret,
+ rxkad_abort_chall_key_expired);
token = conn->key->payload.data[0];
/* build the response packet */
- resp = kzalloc(sizeof(struct rxkad_response), GFP_NOFS);
- if (!resp)
- return -ENOMEM;
+ len = sizeof(struct rxrpc_wire_header) +
+ sizeof(struct rxkad_response) +
+ token->kad->ticket_len;
+
+ response = alloc_skb_with_frags(0, len, 0, &ret, GFP_NOFS);
+ if (!response)
+ goto error;
+ rxrpc_new_skb(response, rxrpc_skb_new_response_rxkad);
+ response->len = len;
+ response->data_len = len;
- resp->version = htonl(RXKAD_VERSION);
- resp->encrypted.epoch = htonl(conn->proto.epoch);
- resp->encrypted.cid = htonl(conn->proto.cid);
- resp->encrypted.securityIndex = htonl(conn->security_ix);
- resp->encrypted.inc_nonce = htonl(nonce + 1);
- resp->encrypted.level = htonl(conn->security_level);
- resp->kvno = htonl(token->kad->kvno);
- resp->ticket_len = htonl(token->kad->ticket_len);
- resp->encrypted.call_id[0] = htonl(conn->channels[0].call_counter);
- resp->encrypted.call_id[1] = htonl(conn->channels[1].call_counter);
- resp->encrypted.call_id[2] = htonl(conn->channels[2].call_counter);
- resp->encrypted.call_id[3] = htonl(conn->channels[3].call_counter);
-
- /* calculate the response checksum and then do the encryption */
- rxkad_calc_response_checksum(resp);
- ret = rxkad_encrypt_response(conn, resp, token->kad);
- if (ret == 0)
- ret = rxkad_send_response(conn, &sp->hdr, resp, token->kad);
- kfree(resp);
+ offset = 0;
+ ret = rxkad_insert_response_header(conn, token, challenge, response,
+ &offset);
+ if (ret < 0)
+ goto error;
+
+ ret = rxkad_encrypt_response(conn, response, token->kad);
+ if (ret < 0)
+ goto error;
+
+ ret = skb_store_bits(response, offset, token->kad->ticket,
+ token->kad->ticket_len);
+ if (ret < 0)
+ goto error;
+
+ csp = rxrpc_skb(challenge);
+ rsp = rxrpc_skb(response);
+ rsp->resp.len = len;
+ rsp->resp.challenge_serial = csp->hdr.serial;
+ rxrpc_post_response(conn, response);
+ response = NULL;
+ ret = 0;
+
+error:
+ rxrpc_free_skb(response, rxrpc_skb_put_response);
return ret;
}
/*
+ * RxKAD does automatic response only as there's nothing to manage that isn't
+ * already in the key.
+ */
+static int rxkad_sendmsg_respond_to_challenge(struct sk_buff *challenge,
+ struct msghdr *msg)
+{
+ return -EINVAL;
+}
+
+/**
+ * rxkad_kernel_respond_to_challenge - Respond to a challenge with appdata
+ * @challenge: The challenge to respond to
+ *
+ * Allow a kernel application to respond to a CHALLENGE.
+ *
+ * Return: %0 if successful and a negative error code otherwise.
+ */
+int rxkad_kernel_respond_to_challenge(struct sk_buff *challenge)
+{
+ struct rxrpc_skb_priv *csp = rxrpc_skb(challenge);
+
+ return rxkad_respond_to_challenge(csp->chall.conn, challenge);
+}
+EXPORT_SYMBOL(rxkad_kernel_respond_to_challenge);
+
+/*
* decrypt the kerberos IV ticket in the response
*/
static int rxkad_decrypt_ticket(struct rxrpc_connection *conn,
@@ -1262,6 +1350,8 @@ const struct rxrpc_security rxkad = {
.verify_packet = rxkad_verify_packet,
.free_call_crypto = rxkad_free_call_crypto,
.issue_challenge = rxkad_issue_challenge,
+ .validate_challenge = rxkad_validate_challenge,
+ .sendmsg_respond_to_challenge = rxkad_sendmsg_respond_to_challenge,
.respond_to_challenge = rxkad_respond_to_challenge,
.verify_response = rxkad_verify_response,
.clear = rxkad_clear,
diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c
index 085e7892d310..0377301156b0 100644
--- a/net/rxrpc/rxperf.c
+++ b/net/rxrpc/rxperf.c
@@ -8,6 +8,7 @@
#define pr_fmt(fmt) "rxperf: " fmt
#include <linux/module.h>
#include <linux/slab.h>
+#include <crypto/krb5.h>
#include <net/sock.h>
#include <net/af_rxrpc.h>
#define RXRPC_TRACE_ONLY_DEFINE_ENUMS
@@ -136,6 +137,12 @@ static void rxperf_notify_end_reply_tx(struct sock *sock,
RXPERF_CALL_SV_AWAIT_ACK);
}
+static const struct rxrpc_kernel_ops rxperf_rxrpc_callback_ops = {
+ .notify_new_call = rxperf_rx_new_call,
+ .discard_new_call = rxperf_rx_discard_new_call,
+ .user_attach_call = rxperf_rx_attach,
+};
+
/*
* Charge the incoming call preallocation.
*/
@@ -161,7 +168,6 @@ static void rxperf_charge_preallocation(struct work_struct *work)
if (rxrpc_kernel_charge_accept(rxperf_socket,
rxperf_notify_rx,
- rxperf_rx_attach,
(unsigned long)call,
GFP_KERNEL,
call->debug_id) < 0)
@@ -209,8 +215,7 @@ static int rxperf_open_socket(void)
if (ret < 0)
goto error_2;
- rxrpc_kernel_new_call_notification(socket, rxperf_rx_new_call,
- rxperf_rx_discard_new_call);
+ rxrpc_kernel_set_notifications(socket, &rxperf_rxrpc_callback_ops);
ret = kernel_listen(socket, INT_MAX);
if (ret < 0)
@@ -478,6 +483,18 @@ static int rxperf_deliver_request(struct rxperf_call *call)
call->unmarshal++;
fallthrough;
case 2:
+ ret = rxperf_extract_data(call, true);
+ if (ret < 0)
+ return ret;
+
+ /* Deal with the terminal magic cookie. */
+ call->iov_len = 4;
+ call->kvec[0].iov_len = call->iov_len;
+ call->kvec[0].iov_base = call->tmp;
+ iov_iter_kvec(&call->iter, READ, call->kvec, 1, call->iov_len);
+ call->unmarshal++;
+ fallthrough;
+ case 3:
ret = rxperf_extract_data(call, false);
if (ret < 0)
return ret;
@@ -503,7 +520,7 @@ static int rxperf_process_call(struct rxperf_call *call)
reply_len + sizeof(rxperf_magic_cookie));
while (reply_len > 0) {
- len = min_t(size_t, reply_len, PAGE_SIZE);
+ len = umin(reply_len, PAGE_SIZE);
bvec_set_page(&bv, ZERO_PAGE(0), len, 0);
iov_iter_bvec(&msg.msg_iter, WRITE, &bv, 1, len);
msg.msg_flags = MSG_MORE;
@@ -534,9 +551,9 @@ static int rxperf_process_call(struct rxperf_call *call)
}
/*
- * Add a key to the security keyring.
+ * Add an rxkad key to the security keyring.
*/
-static int rxperf_add_key(struct key *keyring)
+static int rxperf_add_rxkad_key(struct key *keyring)
{
key_ref_t kref;
int ret;
@@ -562,6 +579,47 @@ static int rxperf_add_key(struct key *keyring)
return ret;
}
+#ifdef CONFIG_RXGK
+/*
+ * Add a yfs-rxgk key to the security keyring.
+ */
+static int rxperf_add_yfs_rxgk_key(struct key *keyring, u32 enctype)
+{
+ const struct krb5_enctype *krb5 = crypto_krb5_find_enctype(enctype);
+ key_ref_t kref;
+ char name[64];
+ int ret;
+ u8 key[32];
+
+ if (!krb5 || krb5->key_len > sizeof(key))
+ return 0;
+
+ /* The key is just { 0, 1, 2, 3, 4, ... } */
+ for (int i = 0; i < krb5->key_len; i++)
+ key[i] = i;
+
+ sprintf(name, "%u:6:1:%u", RX_PERF_SERVICE, enctype);
+
+ kref = key_create_or_update(make_key_ref(keyring, true),
+ "rxrpc_s", name,
+ key, krb5->key_len,
+ KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH |
+ KEY_USR_VIEW,
+ KEY_ALLOC_NOT_IN_QUOTA);
+
+ if (IS_ERR(kref)) {
+ pr_err("Can't allocate rxperf server key: %ld\n", PTR_ERR(kref));
+ return PTR_ERR(kref);
+ }
+
+ ret = key_link(keyring, key_ref_to_ptr(kref));
+ if (ret < 0)
+ pr_err("Can't link rxperf server key: %d\n", ret);
+ key_ref_put(kref);
+ return ret;
+}
+#endif
+
/*
* Initialise the rxperf server.
*/
@@ -591,9 +649,29 @@ static int __init rxperf_init(void)
goto error_keyring;
}
rxperf_sec_keyring = keyring;
- ret = rxperf_add_key(keyring);
+ ret = rxperf_add_rxkad_key(keyring);
+ if (ret < 0)
+ goto error_key;
+#ifdef CONFIG_RXGK
+ ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_AES128_CTS_HMAC_SHA1_96);
+ if (ret < 0)
+ goto error_key;
+ ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_AES256_CTS_HMAC_SHA1_96);
+ if (ret < 0)
+ goto error_key;
+ ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_AES128_CTS_HMAC_SHA256_128);
+ if (ret < 0)
+ goto error_key;
+ ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_AES256_CTS_HMAC_SHA384_192);
+ if (ret < 0)
+ goto error_key;
+ ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_CAMELLIA128_CTS_CMAC);
+ if (ret < 0)
+ goto error_key;
+ ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_CAMELLIA256_CTS_CMAC);
if (ret < 0)
goto error_key;
+#endif
ret = rxperf_open_socket();
if (ret < 0)
diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c
index cb8dd1d3b1d4..078d91a6b77f 100644
--- a/net/rxrpc/security.c
+++ b/net/rxrpc/security.c
@@ -20,6 +20,9 @@ static const struct rxrpc_security *rxrpc_security_types[] = {
#ifdef CONFIG_RXKAD
[RXRPC_SECURITY_RXKAD] = &rxkad,
#endif
+#ifdef CONFIG_RXGK
+ [RXRPC_SECURITY_YFS_RXGK] = &rxgk_yfs,
+#endif
};
int __init rxrpc_init_security(void)
@@ -114,10 +117,10 @@ found:
if (conn->state == RXRPC_CONN_CLIENT_UNSECURED) {
ret = conn->security->init_connection_security(conn, token);
if (ret == 0) {
- spin_lock(&conn->state_lock);
+ spin_lock_irq(&conn->state_lock);
if (conn->state == RXRPC_CONN_CLIENT_UNSECURED)
conn->state = RXRPC_CONN_CLIENT;
- spin_unlock(&conn->state_lock);
+ spin_unlock_irq(&conn->state_lock);
}
}
mutex_unlock(&conn->security_lock);
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index 894b8fa68e5e..ebbb78b842de 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -29,6 +29,7 @@ bool rxrpc_propose_abort(struct rxrpc_call *call, s32 abort_code, int error,
call->send_abort_why = why;
call->send_abort_err = error;
call->send_abort_seq = 0;
+ trace_rxrpc_abort_call(call, abort_code);
/* Request abort locklessly vs rxrpc_input_call_event(). */
smp_store_release(&call->send_abort, abort_code);
rxrpc_poke_call(call, rxrpc_call_poke_abort);
@@ -93,9 +94,11 @@ no_wait:
*/
static bool rxrpc_check_tx_space(struct rxrpc_call *call, rxrpc_seq_t *_tx_win)
{
+ rxrpc_seq_t tx_bottom = READ_ONCE(call->tx_bottom);
+
if (_tx_win)
- *_tx_win = call->tx_bottom;
- return call->tx_prepared - call->tx_bottom < 256;
+ *_tx_win = tx_bottom;
+ return call->send_top - tx_bottom < 256;
}
/*
@@ -131,13 +134,13 @@ static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx,
rxrpc_seq_t tx_start, tx_win;
signed long rtt, timeout;
- rtt = READ_ONCE(call->peer->srtt_us) >> 3;
+ rtt = READ_ONCE(call->srtt_us) >> 3;
rtt = usecs_to_jiffies(rtt) * 2;
if (rtt < 2)
rtt = 2;
timeout = rtt;
- tx_start = smp_load_acquire(&call->acks_hard_ack);
+ tx_start = READ_ONCE(call->tx_bottom);
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
@@ -194,8 +197,8 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx,
DECLARE_WAITQUEUE(myself, current);
int ret;
- _enter(",{%u,%u,%u,%u}",
- call->tx_bottom, call->acks_hard_ack, call->tx_top, call->tx_winsize);
+ _enter(",{%u,%u,%u}",
+ call->tx_bottom, call->tx_top, call->tx_winsize);
add_wait_queue(&call->waitq, &myself);
@@ -239,37 +242,77 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call,
struct rxrpc_txbuf *txb,
rxrpc_notify_end_tx_t notify_end_tx)
{
+ struct rxrpc_txqueue *sq = call->send_queue;
rxrpc_seq_t seq = txb->seq;
bool poke, last = txb->flags & RXRPC_LAST_PACKET;
-
+ int ix = seq & RXRPC_TXQ_MASK;
rxrpc_inc_stat(call->rxnet, stat_tx_data);
- ASSERTCMP(txb->seq, ==, call->tx_prepared + 1);
-
- /* We have to set the timestamp before queueing as the retransmit
- * algorithm can see the packet as soon as we queue it.
- */
- txb->last_sent = ktime_get_real();
+ ASSERTCMP(txb->seq, ==, call->send_top + 1);
if (last)
trace_rxrpc_txqueue(call, rxrpc_txqueue_queue_last);
else
trace_rxrpc_txqueue(call, rxrpc_txqueue_queue);
+ if (WARN_ON_ONCE(sq->bufs[ix]))
+ trace_rxrpc_tq(call, sq, seq, rxrpc_tq_queue_dup);
+ else
+ trace_rxrpc_tq(call, sq, seq, rxrpc_tq_queue);
+
/* Add the packet to the call's output buffer */
- spin_lock(&call->tx_lock);
- poke = list_empty(&call->tx_sendmsg);
- list_add_tail(&txb->call_link, &call->tx_sendmsg);
- call->tx_prepared = seq;
- if (last)
+ poke = (READ_ONCE(call->tx_bottom) == call->send_top);
+ sq->bufs[ix] = txb;
+ /* Order send_top after the queue->next pointer and txb content. */
+ smp_store_release(&call->send_top, seq);
+ if (last) {
+ set_bit(RXRPC_CALL_TX_NO_MORE, &call->flags);
rxrpc_notify_end_tx(rx, call, notify_end_tx);
- spin_unlock(&call->tx_lock);
+ call->send_queue = NULL;
+ }
if (poke)
rxrpc_poke_call(call, rxrpc_call_poke_start);
}
/*
+ * Allocate a new txqueue unit and add it to the transmission queue.
+ */
+static int rxrpc_alloc_txqueue(struct sock *sk, struct rxrpc_call *call)
+{
+ struct rxrpc_txqueue *tq;
+
+ tq = kzalloc(sizeof(*tq), sk->sk_allocation);
+ if (!tq)
+ return -ENOMEM;
+
+ tq->xmit_ts_base = KTIME_MIN;
+ for (int i = 0; i < RXRPC_NR_TXQUEUE; i++)
+ tq->segment_xmit_ts[i] = UINT_MAX;
+
+ if (call->send_queue) {
+ tq->qbase = call->send_top + 1;
+ call->send_queue->next = tq;
+ call->send_queue = tq;
+ } else if (WARN_ON(call->tx_queue)) {
+ kfree(tq);
+ return -ENOMEM;
+ } else {
+ /* We start at seq 1, so pretend seq 0 is hard-acked. */
+ tq->nr_reported_acks = 1;
+ tq->segment_acked = 1UL;
+ tq->qbase = 0;
+ call->tx_qbase = 0;
+ call->send_queue = tq;
+ call->tx_qtail = tq;
+ call->tx_queue = tq;
+ }
+
+ trace_rxrpc_tq(call, tq, call->send_top, rxrpc_tq_alloc);
+ return 0;
+}
+
+/*
* send data through a socket
* - must be called in process context
* - The caller holds the call user access mutex, but not the socket lock.
@@ -287,6 +330,13 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
bool more = msg->msg_flags & MSG_MORE;
int ret, copied = 0;
+ if (test_bit(RXRPC_CALL_TX_NO_MORE, &call->flags)) {
+ trace_rxrpc_abort(call->debug_id, rxrpc_sendmsg_late_send,
+ call->cid, call->call_id, call->rx_consumed,
+ 0, -EPROTO);
+ return -EPROTO;
+ }
+
timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
ret = rxrpc_wait_to_be_connected(call, &timeo);
@@ -303,6 +353,11 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
reload:
+ txb = call->tx_pending;
+ call->tx_pending = NULL;
+ if (txb)
+ rxrpc_see_txbuf(txb, rxrpc_txbuf_see_send_more);
+
ret = -EPIPE;
if (sk->sk_shutdown & SEND_SHUTDOWN)
goto maybe_error;
@@ -329,11 +384,6 @@ reload:
goto maybe_error;
}
- txb = call->tx_pending;
- call->tx_pending = NULL;
- if (txb)
- rxrpc_see_txbuf(txb, rxrpc_txbuf_see_send_more);
-
do {
if (!txb) {
size_t remain;
@@ -343,6 +393,13 @@ reload:
if (!rxrpc_check_tx_space(call, NULL))
goto wait_for_space;
+ /* See if we need to begin/extend the Tx queue. */
+ if (!call->send_queue || !((call->send_top + 1) & RXRPC_TXQ_MASK)) {
+ ret = rxrpc_alloc_txqueue(sk, call);
+ if (ret < 0)
+ goto maybe_error;
+ }
+
/* Work out the maximum size of a packet. Assume that
* the security header is going to be in the padded
* region (enc blocksize), but the trailer is not.
@@ -359,10 +416,10 @@ reload:
/* append next segment of data to the current buffer */
if (msg_data_left(msg) > 0) {
- size_t copy = min_t(size_t, txb->space, msg_data_left(msg));
+ size_t copy = umin(txb->space, msg_data_left(msg));
_debug("add %zu", copy);
- if (!copy_from_iter_full(txb->kvec[0].iov_base + txb->offset,
+ if (!copy_from_iter_full(txb->data + txb->offset,
copy, &msg->msg_iter))
goto efault;
_debug("added");
@@ -384,16 +441,10 @@ reload:
(msg_data_left(msg) == 0 && !more)) {
if (msg_data_left(msg) == 0 && !more)
txb->flags |= RXRPC_LAST_PACKET;
- else if (call->tx_top - call->acks_hard_ack <
- call->tx_winsize)
- txb->flags |= RXRPC_MORE_PACKETS;
ret = call->security->secure_packet(call, txb);
if (ret < 0)
goto out;
-
- txb->kvec[0].iov_len += txb->len;
- txb->len = txb->kvec[0].iov_len;
rxrpc_queue_packet(rx, call, txb, notify_end_tx);
txb = NULL;
}
@@ -556,7 +607,7 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, struct rxrpc_send_params *p)
static struct rxrpc_call *
rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg,
struct rxrpc_send_params *p)
- __releases(&rx->sk.sk_lock.slock)
+ __releases(&rx->sk.sk_lock)
__acquires(&call->user_mutex)
{
struct rxrpc_conn_parameters cp;
@@ -606,7 +657,6 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg,
* - the socket may be either a client socket or a server socket
*/
int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
- __releases(&rx->sk.sk_lock.slock)
{
struct rxrpc_call *call;
bool dropped_lock = false;
@@ -654,7 +704,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
} else {
switch (rxrpc_call_state(call)) {
case RXRPC_CALL_CLIENT_AWAIT_CONN:
- case RXRPC_CALL_SERVER_SECURING:
+ case RXRPC_CALL_SERVER_RECV_REQUEST:
if (p.command == RXRPC_CMD_SEND_ABORT)
break;
fallthrough;
@@ -708,14 +758,21 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
if (rxrpc_call_is_complete(call)) {
/* it's too late for this call */
ret = -ESHUTDOWN;
- } else if (p.command == RXRPC_CMD_SEND_ABORT) {
+ goto out_put_unlock;
+ }
+
+ switch (p.command) {
+ case RXRPC_CMD_SEND_ABORT:
rxrpc_propose_abort(call, p.abort_code, -ECONNABORTED,
rxrpc_abort_call_sendmsg);
ret = 0;
- } else if (p.command != RXRPC_CMD_SEND_DATA) {
- ret = -EINVAL;
- } else {
+ break;
+ case RXRPC_CMD_SEND_DATA:
ret = rxrpc_send_data(rx, call, msg, len, NULL, &dropped_lock);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
}
out_put_unlock:
@@ -743,6 +800,8 @@ error_release_sock:
* appropriate to sending data. No control data should be supplied in @msg,
* nor should an address be supplied. MSG_MORE should be flagged if there's
* more data to come, otherwise this data will end the transmission phase.
+ *
+ * Return: %0 if successful and a negative error code otherwise.
*/
int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call,
struct msghdr *msg, size_t len,
@@ -778,8 +837,9 @@ EXPORT_SYMBOL(rxrpc_kernel_send_data);
* @error: Local error value
* @why: Indication as to why.
*
- * Allow a kernel service to abort a call, if it's still in an abortable state
- * and return true if the call was aborted, false if it was already complete.
+ * Allow a kernel service to abort a call if it's still in an abortable state.
+ *
+ * Return: %true if the call was aborted, %false if it was already complete.
*/
bool rxrpc_kernel_abort_call(struct socket *sock, struct rxrpc_call *call,
u32 abort_code, int error, enum rxrpc_abort_reason why)
diff --git a/net/rxrpc/server_key.c b/net/rxrpc/server_key.c
index e51940589ee5..36b05fd842a7 100644
--- a/net/rxrpc/server_key.c
+++ b/net/rxrpc/server_key.c
@@ -152,6 +152,8 @@ int rxrpc_server_keyring(struct rxrpc_sock *rx, sockptr_t optval, int optlen)
*
* Set the server security keyring on an rxrpc socket. This is used to provide
* the encryption keys for a kernel service.
+ *
+ * Return: %0 if successful and a negative error code otherwise.
*/
int rxrpc_sock_set_security_keyring(struct sock *sk, struct key *keyring)
{
@@ -169,3 +171,43 @@ int rxrpc_sock_set_security_keyring(struct sock *sk, struct key *keyring)
return ret;
}
EXPORT_SYMBOL(rxrpc_sock_set_security_keyring);
+
+/**
+ * rxrpc_sock_set_manage_response - Set the manage-response flag for a kernel service
+ * @sk: The socket to set the keyring on
+ * @set: True to set, false to clear the flag
+ *
+ * Set the flag on an rxrpc socket to say that the caller wants to manage the
+ * RESPONSE packet and the user-defined data it may contain. Setting this
+ * means that recvmsg() will return messages with RXRPC_CHALLENGED in the
+ * control message buffer containing information about the challenge.
+ *
+ * The user should respond to the challenge by passing RXRPC_RESPOND or
+ * RXRPC_RESPOND_ABORT control messages with sendmsg() to the same call.
+ * Supplementary control messages, such as RXRPC_RESP_RXGK_APPDATA, may be
+ * included to indicate the parts the user wants to supply.
+ *
+ * The server will be passed the response data with a RXRPC_RESPONDED control
+ * message when it gets the first data from each call.
+ *
+ * Note that this is only honoured by security classes that need auxiliary data
+ * (e.g. RxGK). Those that don't offer the facility (e.g. RxKAD) respond
+ * without consulting userspace.
+ *
+ * Return: The previous setting.
+ */
+int rxrpc_sock_set_manage_response(struct sock *sk, bool set)
+{
+ struct rxrpc_sock *rx = rxrpc_sk(sk);
+ int ret;
+
+ lock_sock(sk);
+ ret = !!test_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags);
+ if (set)
+ set_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags);
+ else
+ clear_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags);
+ release_sock(sk);
+ return ret;
+}
+EXPORT_SYMBOL(rxrpc_sock_set_manage_response);
diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c
index c9bedd0e2d86..46a20cf4c402 100644
--- a/net/rxrpc/sysctl.c
+++ b/net/rxrpc/sysctl.c
@@ -11,6 +11,8 @@
#include "ar-internal.h"
static struct ctl_table_header *rxrpc_sysctl_reg_table;
+static const unsigned int rxrpc_rx_mtu_min = 500;
+static const unsigned int rxrpc_jumbo_max = RXRPC_MAX_NR_JUMBO;
static const unsigned int four = 4;
static const unsigned int max_backlog = RXRPC_BACKLOG_MAX - 1;
static const unsigned int n_65535 = 65535;
@@ -115,7 +117,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = (void *)SYSCTL_ONE,
+ .extra1 = (void *)&rxrpc_rx_mtu_min,
.extra2 = (void *)&n_65535,
},
{
@@ -125,9 +127,8 @@ static struct ctl_table rxrpc_sysctl_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = (void *)SYSCTL_ONE,
- .extra2 = (void *)&four,
+ .extra2 = (void *)&rxrpc_jumbo_max,
},
- { }
};
int __init rxrpc_sysctl_init(void)
diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c
index e0679658d9de..29767038691a 100644
--- a/net/rxrpc/txbuf.c
+++ b/net/rxrpc/txbuf.c
@@ -19,58 +19,39 @@ atomic_t rxrpc_nr_txbuf;
struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_size,
size_t data_align, gfp_t gfp)
{
- struct rxrpc_wire_header *whdr;
struct rxrpc_txbuf *txb;
- size_t total, hoff = 0;
+ size_t total, doff, jsize = sizeof(struct rxrpc_jumbo_header);
void *buf;
- txb = kmalloc(sizeof(*txb), gfp);
+ txb = kzalloc(sizeof(*txb), gfp);
if (!txb)
return NULL;
- if (data_align)
- hoff = round_up(sizeof(*whdr), data_align) - sizeof(*whdr);
- total = hoff + sizeof(*whdr) + data_size;
+ /* We put a jumbo header in the buffer, but not a full wire header to
+ * avoid delayed-corruption problems with zerocopy.
+ */
+ doff = round_up(jsize, data_align);
+ total = doff + data_size;
+ data_align = umax(data_align, L1_CACHE_BYTES);
mutex_lock(&call->conn->tx_data_alloc_lock);
- buf = __page_frag_alloc_align(&call->conn->tx_data_alloc, total, gfp,
- ~(data_align - 1) & ~(L1_CACHE_BYTES - 1));
+ buf = page_frag_alloc_align(&call->conn->tx_data_alloc, total, gfp,
+ data_align);
mutex_unlock(&call->conn->tx_data_alloc_lock);
if (!buf) {
kfree(txb);
return NULL;
}
- whdr = buf + hoff;
-
- INIT_LIST_HEAD(&txb->call_link);
- INIT_LIST_HEAD(&txb->tx_link);
refcount_set(&txb->ref, 1);
- txb->last_sent = KTIME_MIN;
txb->call_debug_id = call->debug_id;
txb->debug_id = atomic_inc_return(&rxrpc_txbuf_debug_ids);
+ txb->alloc_size = data_size;
txb->space = data_size;
- txb->len = 0;
- txb->offset = sizeof(*whdr);
+ txb->offset = 0;
txb->flags = call->conn->out_clientflag;
- txb->ack_why = 0;
- txb->seq = call->tx_prepared + 1;
- txb->serial = 0;
- txb->cksum = 0;
- txb->nr_kvec = 1;
- txb->kvec[0].iov_base = whdr;
- txb->kvec[0].iov_len = sizeof(*whdr);
-
- whdr->epoch = htonl(call->conn->proto.epoch);
- whdr->cid = htonl(call->cid);
- whdr->callNumber = htonl(call->call_id);
- whdr->seq = htonl(txb->seq);
- whdr->type = RXRPC_PACKET_TYPE_DATA;
- whdr->flags = 0;
- whdr->userStatus = 0;
- whdr->securityIndex = call->security_ix;
- whdr->_rsvd = 0;
- whdr->serviceId = htons(call->dest_srx.srx_service);
+ txb->seq = call->send_top + 1;
+ txb->data = buf + doff;
trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 1,
rxrpc_txbuf_alloc_data);
@@ -79,92 +60,6 @@ struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_
return txb;
}
-/*
- * Allocate and partially initialise an ACK packet.
- */
-struct rxrpc_txbuf *rxrpc_alloc_ack_txbuf(struct rxrpc_call *call, size_t sack_size)
-{
- struct rxrpc_wire_header *whdr;
- struct rxrpc_acktrailer *trailer;
- struct rxrpc_ackpacket *ack;
- struct rxrpc_txbuf *txb;
- gfp_t gfp = rcu_read_lock_held() ? GFP_ATOMIC | __GFP_NOWARN : GFP_NOFS;
- void *buf, *buf2 = NULL;
- u8 *filler;
-
- txb = kmalloc(sizeof(*txb), gfp);
- if (!txb)
- return NULL;
-
- buf = page_frag_alloc(&call->local->tx_alloc,
- sizeof(*whdr) + sizeof(*ack) + 1 + 3 + sizeof(*trailer), gfp);
- if (!buf) {
- kfree(txb);
- return NULL;
- }
-
- if (sack_size) {
- buf2 = page_frag_alloc(&call->local->tx_alloc, sack_size, gfp);
- if (!buf2) {
- page_frag_free(buf);
- kfree(txb);
- return NULL;
- }
- }
-
- whdr = buf;
- ack = buf + sizeof(*whdr);
- filler = buf + sizeof(*whdr) + sizeof(*ack) + 1;
- trailer = buf + sizeof(*whdr) + sizeof(*ack) + 1 + 3;
-
- INIT_LIST_HEAD(&txb->call_link);
- INIT_LIST_HEAD(&txb->tx_link);
- refcount_set(&txb->ref, 1);
- txb->call_debug_id = call->debug_id;
- txb->debug_id = atomic_inc_return(&rxrpc_txbuf_debug_ids);
- txb->space = 0;
- txb->len = sizeof(*whdr) + sizeof(*ack) + 3 + sizeof(*trailer);
- txb->offset = 0;
- txb->flags = call->conn->out_clientflag;
- txb->ack_rwind = 0;
- txb->seq = 0;
- txb->serial = 0;
- txb->cksum = 0;
- txb->nr_kvec = 3;
- txb->kvec[0].iov_base = whdr;
- txb->kvec[0].iov_len = sizeof(*whdr) + sizeof(*ack);
- txb->kvec[1].iov_base = buf2;
- txb->kvec[1].iov_len = sack_size;
- txb->kvec[2].iov_base = filler;
- txb->kvec[2].iov_len = 3 + sizeof(*trailer);
-
- whdr->epoch = htonl(call->conn->proto.epoch);
- whdr->cid = htonl(call->cid);
- whdr->callNumber = htonl(call->call_id);
- whdr->seq = 0;
- whdr->type = RXRPC_PACKET_TYPE_ACK;
- whdr->flags = 0;
- whdr->userStatus = 0;
- whdr->securityIndex = call->security_ix;
- whdr->_rsvd = 0;
- whdr->serviceId = htons(call->dest_srx.srx_service);
-
- get_page(virt_to_head_page(trailer));
-
- trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 1,
- rxrpc_txbuf_alloc_ack);
- atomic_inc(&rxrpc_nr_txbuf);
- return txb;
-}
-
-void rxrpc_get_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what)
-{
- int r;
-
- __refcount_inc(&txb->ref, &r);
- trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, r + 1, what);
-}
-
void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what)
{
int r = refcount_read(&txb->ref);
@@ -174,13 +69,10 @@ void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what)
static void rxrpc_free_txbuf(struct rxrpc_txbuf *txb)
{
- int i;
-
trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 0,
rxrpc_txbuf_free);
- for (i = 0; i < txb->nr_kvec; i++)
- if (txb->kvec[i].iov_base)
- page_frag_free(txb->kvec[i].iov_base);
+ if (txb->data)
+ page_frag_free(txb->data);
kfree(txb);
atomic_dec(&rxrpc_nr_txbuf);
}
@@ -202,37 +94,3 @@ void rxrpc_put_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what)
rxrpc_free_txbuf(txb);
}
}
-
-/*
- * Shrink the transmit buffer.
- */
-void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *call)
-{
- struct rxrpc_txbuf *txb;
- rxrpc_seq_t hard_ack = smp_load_acquire(&call->acks_hard_ack);
- bool wake = false;
-
- _enter("%x/%x/%x", call->tx_bottom, call->acks_hard_ack, call->tx_top);
-
- while ((txb = list_first_entry_or_null(&call->tx_buffer,
- struct rxrpc_txbuf, call_link))) {
- hard_ack = smp_load_acquire(&call->acks_hard_ack);
- if (before(hard_ack, txb->seq))
- break;
-
- if (txb->seq != call->tx_bottom + 1)
- rxrpc_see_txbuf(txb, rxrpc_txbuf_see_out_of_step);
- ASSERTCMP(txb->seq, ==, call->tx_bottom + 1);
- smp_store_release(&call->tx_bottom, call->tx_bottom + 1);
- list_del_rcu(&txb->call_link);
-
- trace_rxrpc_txqueue(call, rxrpc_txqueue_dequeue);
-
- rxrpc_put_txbuf(txb, rxrpc_txbuf_put_rotated);
- if (after(call->acks_hard_ack, call->tx_bottom + 128))
- wake = true;
- }
-
- if (wake)
- wake_up(&call->waitq);
-}