diff options
Diffstat (limited to 'net/ipv4/tcp_cong.c')
| -rw-r--r-- | net/ipv4/tcp_cong.c | 184 |
1 files changed, 123 insertions, 61 deletions
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index bc6c02f16243..df758adbb445 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Pluggable TCP congestion control support and newReno * congestion control. @@ -15,12 +16,13 @@ #include <linux/gfp.h> #include <linux/jhash.h> #include <net/tcp.h> +#include <trace/events/tcp.h> static DEFINE_SPINLOCK(tcp_cong_list_lock); static LIST_HEAD(tcp_cong_list); /* Simple linear search, don't expect many entries! */ -static struct tcp_congestion_ops *tcp_ca_find(const char *name) +struct tcp_congestion_ops *tcp_ca_find(const char *name) { struct tcp_congestion_ops *e; @@ -32,9 +34,19 @@ static struct tcp_congestion_ops *tcp_ca_find(const char *name) return NULL; } +void tcp_set_ca_state(struct sock *sk, const u8 ca_state) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + trace_tcp_cong_state_set(sk, ca_state); + + if (icsk->icsk_ca_ops->set_state) + icsk->icsk_ca_ops->set_state(sk, ca_state); + icsk->icsk_ca_state = ca_state; +} + /* Must be called with rcu lock held */ -static struct tcp_congestion_ops *tcp_ca_find_autoload(struct net *net, - const char *name) +static struct tcp_congestion_ops *tcp_ca_find_autoload(const char *name) { struct tcp_congestion_ops *ca = tcp_ca_find(name); @@ -62,14 +74,8 @@ struct tcp_congestion_ops *tcp_ca_find_key(u32 key) return NULL; } -/* - * Attach new congestion control algorithm to the list - * of available options. - */ -int tcp_register_congestion_control(struct tcp_congestion_ops *ca) +int tcp_validate_congestion_control(struct tcp_congestion_ops *ca) { - int ret = 0; - /* all algorithms must implement these */ if (!ca->ssthresh || !ca->undo_cwnd || !(ca->cong_avoid || ca->cong_control)) { @@ -77,6 +83,20 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca) return -EINVAL; } + return 0; +} + +/* Attach new congestion control algorithm to the list + * of available options. + */ +int tcp_register_congestion_control(struct tcp_congestion_ops *ca) +{ + int ret; + + ret = tcp_validate_congestion_control(ca); + if (ret) + return ret; + ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name)); spin_lock(&tcp_cong_list_lock); @@ -117,7 +137,47 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) } EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); -u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca) +/* Replace a registered old ca with a new one. + * + * The new ca must have the same name as the old one, that has been + * registered. + */ +int tcp_update_congestion_control(struct tcp_congestion_ops *ca, struct tcp_congestion_ops *old_ca) +{ + struct tcp_congestion_ops *existing; + int ret = 0; + + ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name)); + + spin_lock(&tcp_cong_list_lock); + existing = tcp_ca_find_key(old_ca->key); + if (ca->key == TCP_CA_UNSPEC || !existing || strcmp(existing->name, ca->name)) { + pr_notice("%s not registered or non-unique key\n", + ca->name); + ret = -EINVAL; + } else if (existing != old_ca) { + pr_notice("invalid old congestion control algorithm to replace\n"); + ret = -EINVAL; + } else { + /* Add the new one before removing the old one to keep + * one implementation available all the time. + */ + list_add_tail_rcu(&ca->list, &tcp_cong_list); + list_del_rcu(&existing->list); + pr_debug("%s updated\n", ca->name); + } + spin_unlock(&tcp_cong_list_lock); + + /* Wait for outstanding readers to complete before the + * module or struct_ops gets removed entirely. + */ + if (!ret) + synchronize_rcu(); + + return ret; +} + +u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca) { const struct tcp_congestion_ops *ca; u32 key = TCP_CA_UNSPEC; @@ -125,7 +185,7 @@ u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca) might_sleep(); rcu_read_lock(); - ca = tcp_ca_find_autoload(net, name); + ca = tcp_ca_find_autoload(name); if (ca) { key = ca->key; *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN; @@ -134,7 +194,6 @@ u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca) return key; } -EXPORT_SYMBOL_GPL(tcp_ca_get_key_by_name); char *tcp_ca_get_name_by_key(u32 key, char *buffer) { @@ -143,14 +202,14 @@ char *tcp_ca_get_name_by_key(u32 key, char *buffer) rcu_read_lock(); ca = tcp_ca_find_key(key); - if (ca) - ret = strncpy(buffer, ca->name, - TCP_CA_NAME_MAX); + if (ca) { + strscpy(buffer, ca->name, TCP_CA_NAME_MAX); + ret = buffer; + } rcu_read_unlock(); return ret; } -EXPORT_SYMBOL_GPL(tcp_ca_get_name_by_key); /* Assign choice of congestion control. */ void tcp_assign_congestion_control(struct sock *sk) @@ -161,7 +220,7 @@ void tcp_assign_congestion_control(struct sock *sk) rcu_read_lock(); ca = rcu_dereference(net->ipv4.tcp_congestion_control); - if (unlikely(!try_module_get(ca->owner))) + if (unlikely(!bpf_try_module_get(ca, ca->owner))) ca = &tcp_reno; icsk->icsk_ca_ops = ca; rcu_read_unlock(); @@ -175,7 +234,7 @@ void tcp_assign_congestion_control(struct sock *sk) void tcp_init_congestion_control(struct sock *sk) { - const struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); tcp_sk(sk)->prior_ssthresh = 0; if (icsk->icsk_ca_ops->init) @@ -184,6 +243,7 @@ void tcp_init_congestion_control(struct sock *sk) INET_ECN_xmit(sk); else INET_ECN_dontxmit(sk); + icsk->icsk_ca_initialized = 1; } static void tcp_reinit_congestion_control(struct sock *sk, @@ -196,7 +256,12 @@ static void tcp_reinit_congestion_control(struct sock *sk, icsk->icsk_ca_setsockopt = 1; memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); - if (sk->sk_state != TCP_CLOSE) + if (ca->flags & TCP_CONG_NEEDS_ECN) + INET_ECN_xmit(sk); + else + INET_ECN_dontxmit(sk); + + if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) tcp_init_congestion_control(sk); } @@ -205,9 +270,10 @@ void tcp_cleanup_congestion_control(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); - if (icsk->icsk_ca_ops->release) + if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); - module_put(icsk->icsk_ca_ops->owner); + icsk->icsk_ca_initialized = 0; + bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner); } /* Used by sysctl to change default congestion control */ @@ -218,15 +284,19 @@ int tcp_set_default_congestion_control(struct net *net, const char *name) int ret; rcu_read_lock(); - ca = tcp_ca_find_autoload(net, name); + ca = tcp_ca_find_autoload(name); if (!ca) { ret = -ENOENT; - } else if (!try_module_get(ca->owner)) { + } else if (!bpf_try_module_get(ca, ca->owner)) { ret = -EBUSY; + } else if (!net_eq(net, &init_net) && + !(ca->flags & TCP_CONG_NON_RESTRICTED)) { + /* Only init netns can set default to a restricted algorithm */ + ret = -EPERM; } else { prev = xchg(&net->ipv4.tcp_congestion_control, ca); if (prev) - module_put(prev->owner); + bpf_module_put(prev, prev->owner); ca->flags |= TCP_CONG_NON_RESTRICTED; ret = 0; @@ -255,6 +325,9 @@ void tcp_get_available_congestion_control(char *buf, size_t maxlen) offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", ca->name); + + if (WARN_ON_ONCE(offs >= maxlen)) + break; } rcu_read_unlock(); } @@ -266,7 +339,7 @@ void tcp_get_default_congestion_control(struct net *net, char *name) rcu_read_lock(); ca = rcu_dereference(net->ipv4.tcp_congestion_control); - strncpy(name, ca->name, TCP_CA_NAME_MAX); + strscpy(name, ca->name, TCP_CA_NAME_MAX); rcu_read_unlock(); } @@ -284,6 +357,9 @@ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen) offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", ca->name); + + if (WARN_ON_ONCE(offs >= maxlen)) + break; } rcu_read_unlock(); } @@ -332,7 +408,8 @@ out: * tcp_reinit_congestion_control (if the current congestion control was * already initialized. */ -int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit) +int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, + bool cap_net_admin) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; @@ -345,7 +422,7 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, boo if (!load) ca = tcp_ca_find(name); else - ca = tcp_ca_find_autoload(sock_net(sk), name); + ca = tcp_ca_find_autoload(name); /* No change asking for existing value */ if (ca == icsk->icsk_ca_ops) { @@ -353,29 +430,14 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, boo goto out; } - if (!ca) { + if (!ca) err = -ENOENT; - } else if (!load) { - const struct tcp_congestion_ops *old_ca = icsk->icsk_ca_ops; - - if (try_module_get(ca->owner)) { - if (reinit) { - tcp_reinit_congestion_control(sk, ca); - } else { - icsk->icsk_ca_ops = ca; - module_put(old_ca->owner); - } - } else { - err = -EBUSY; - } - } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || - ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) { + else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) err = -EPERM; - } else if (!try_module_get(ca->owner)) { + else if (!bpf_try_module_get(ca, ca->owner)) err = -EBUSY; - } else { + else tcp_reinit_congestion_control(sk, ca); - } out: rcu_read_unlock(); return err; @@ -390,12 +452,12 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, boo * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and * returns the leftover acks to adjust cwnd in congestion avoidance mode. */ -u32 tcp_slow_start(struct tcp_sock *tp, u32 acked) +__bpf_kfunc u32 tcp_slow_start(struct tcp_sock *tp, u32 acked) { - u32 cwnd = min(tp->snd_cwnd + acked, tp->snd_ssthresh); + u32 cwnd = min(tcp_snd_cwnd(tp) + acked, tp->snd_ssthresh); - acked -= cwnd - tp->snd_cwnd; - tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); + acked -= cwnd - tcp_snd_cwnd(tp); + tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); return acked; } @@ -404,12 +466,12 @@ EXPORT_SYMBOL_GPL(tcp_slow_start); /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w), * for every packet that was ACKed. */ -void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked) +__bpf_kfunc void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked) { /* If credits accumulated at a higher w, apply them gently now. */ if (tp->snd_cwnd_cnt >= w) { tp->snd_cwnd_cnt = 0; - tp->snd_cwnd++; + tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1); } tp->snd_cwnd_cnt += acked; @@ -417,9 +479,9 @@ void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked) u32 delta = tp->snd_cwnd_cnt / w; tp->snd_cwnd_cnt -= delta * w; - tp->snd_cwnd += delta; + tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + delta); } - tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp); + tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), tp->snd_cwnd_clamp)); } EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai); @@ -430,7 +492,7 @@ EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai); /* This is Jacobson's slow start and congestion avoidance. * SIGCOMM '88, p. 328. */ -void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked) +__bpf_kfunc void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); @@ -444,24 +506,24 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked) return; } /* In dangerous area, increase slowly. */ - tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked); + tcp_cong_avoid_ai(tp, tcp_snd_cwnd(tp), acked); } EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); /* Slow start threshold is half the congestion window (min 2) */ -u32 tcp_reno_ssthresh(struct sock *sk) +__bpf_kfunc u32 tcp_reno_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); - return max(tp->snd_cwnd >> 1U, 2U); + return max(tcp_snd_cwnd(tp) >> 1U, 2U); } EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); -u32 tcp_reno_undo_cwnd(struct sock *sk) +__bpf_kfunc u32 tcp_reno_undo_cwnd(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); - return max(tp->snd_cwnd, tp->prior_cwnd); + return max(tcp_snd_cwnd(tp), tp->prior_cwnd); } EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd); |
