diff options
Diffstat (limited to 'net/ipv4/tcp_cong.c')
| -rw-r--r-- | net/ipv4/tcp_cong.c | 457 |
1 files changed, 294 insertions, 163 deletions
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 019c2389a341..df758adbb445 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -1,5 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* - * Plugable TCP congestion control support and newReno + * Pluggable TCP congestion control support and newReno * congestion control. * Based on ideas from I/O scheduler support and Web100. * @@ -13,15 +14,15 @@ #include <linux/types.h> #include <linux/list.h> #include <linux/gfp.h> +#include <linux/jhash.h> #include <net/tcp.h> - -int sysctl_tcp_max_ssthresh = 0; +#include <trace/events/tcp.h> static DEFINE_SPINLOCK(tcp_cong_list_lock); static LIST_HEAD(tcp_cong_list); /* Simple linear search, don't expect many entries! */ -static struct tcp_congestion_ops *tcp_ca_find(const char *name) +struct tcp_congestion_ops *tcp_ca_find(const char *name) { struct tcp_congestion_ops *e; @@ -33,27 +34,79 @@ static struct tcp_congestion_ops *tcp_ca_find(const char *name) return NULL; } -/* - * Attach new congestion control algorithm to the list - * of available options. - */ -int tcp_register_congestion_control(struct tcp_congestion_ops *ca) +void tcp_set_ca_state(struct sock *sk, const u8 ca_state) { - int ret = 0; + struct inet_connection_sock *icsk = inet_csk(sk); + + trace_tcp_cong_state_set(sk, ca_state); - /* all algorithms must implement ssthresh and cong_avoid ops */ - if (!ca->ssthresh || !ca->cong_avoid) { + if (icsk->icsk_ca_ops->set_state) + icsk->icsk_ca_ops->set_state(sk, ca_state); + icsk->icsk_ca_state = ca_state; +} + +/* Must be called with rcu lock held */ +static struct tcp_congestion_ops *tcp_ca_find_autoload(const char *name) +{ + struct tcp_congestion_ops *ca = tcp_ca_find(name); + +#ifdef CONFIG_MODULES + if (!ca && capable(CAP_NET_ADMIN)) { + rcu_read_unlock(); + request_module("tcp_%s", name); + rcu_read_lock(); + ca = tcp_ca_find(name); + } +#endif + return ca; +} + +/* Simple linear search, not much in here. */ +struct tcp_congestion_ops *tcp_ca_find_key(u32 key) +{ + struct tcp_congestion_ops *e; + + list_for_each_entry_rcu(e, &tcp_cong_list, list) { + if (e->key == key) + return e; + } + + return NULL; +} + +int tcp_validate_congestion_control(struct tcp_congestion_ops *ca) +{ + /* all algorithms must implement these */ + if (!ca->ssthresh || !ca->undo_cwnd || + !(ca->cong_avoid || ca->cong_control)) { pr_err("%s does not implement required ops\n", ca->name); return -EINVAL; } + return 0; +} + +/* Attach new congestion control algorithm to the list + * of available options. + */ +int tcp_register_congestion_control(struct tcp_congestion_ops *ca) +{ + int ret; + + ret = tcp_validate_congestion_control(ca); + if (ret) + return ret; + + ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name)); + spin_lock(&tcp_cong_list_lock); - if (tcp_ca_find(ca->name)) { - pr_notice("%s already registered\n", ca->name); + if (ca->key == TCP_CA_UNSPEC || tcp_ca_find_key(ca->key)) { + pr_notice("%s already registered or non-unique key\n", + ca->name); ret = -EEXIST; } else { list_add_tail_rcu(&ca->list, &tcp_cong_list); - pr_info("%s registered\n", ca->name); + pr_debug("%s registered\n", ca->name); } spin_unlock(&tcp_cong_list_lock); @@ -72,31 +125,144 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) spin_lock(&tcp_cong_list_lock); list_del_rcu(&ca->list); spin_unlock(&tcp_cong_list_lock); + + /* Wait for outstanding readers to complete before the + * module gets removed entirely. + * + * A try_module_get() should fail by now as our module is + * in "going" state since no refs are held anymore and + * module_exit() handler being called. + */ + synchronize_rcu(); } EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); +/* Replace a registered old ca with a new one. + * + * The new ca must have the same name as the old one, that has been + * registered. + */ +int tcp_update_congestion_control(struct tcp_congestion_ops *ca, struct tcp_congestion_ops *old_ca) +{ + struct tcp_congestion_ops *existing; + int ret = 0; + + ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name)); + + spin_lock(&tcp_cong_list_lock); + existing = tcp_ca_find_key(old_ca->key); + if (ca->key == TCP_CA_UNSPEC || !existing || strcmp(existing->name, ca->name)) { + pr_notice("%s not registered or non-unique key\n", + ca->name); + ret = -EINVAL; + } else if (existing != old_ca) { + pr_notice("invalid old congestion control algorithm to replace\n"); + ret = -EINVAL; + } else { + /* Add the new one before removing the old one to keep + * one implementation available all the time. + */ + list_add_tail_rcu(&ca->list, &tcp_cong_list); + list_del_rcu(&existing->list); + pr_debug("%s updated\n", ca->name); + } + spin_unlock(&tcp_cong_list_lock); + + /* Wait for outstanding readers to complete before the + * module or struct_ops gets removed entirely. + */ + if (!ret) + synchronize_rcu(); + + return ret; +} + +u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca) +{ + const struct tcp_congestion_ops *ca; + u32 key = TCP_CA_UNSPEC; + + might_sleep(); + + rcu_read_lock(); + ca = tcp_ca_find_autoload(name); + if (ca) { + key = ca->key; + *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN; + } + rcu_read_unlock(); + + return key; +} + +char *tcp_ca_get_name_by_key(u32 key, char *buffer) +{ + const struct tcp_congestion_ops *ca; + char *ret = NULL; + + rcu_read_lock(); + ca = tcp_ca_find_key(key); + if (ca) { + strscpy(buffer, ca->name, TCP_CA_NAME_MAX); + ret = buffer; + } + rcu_read_unlock(); + + return ret; +} + /* Assign choice of congestion control. */ -void tcp_init_congestion_control(struct sock *sk) +void tcp_assign_congestion_control(struct sock *sk) { + struct net *net = sock_net(sk); struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_congestion_ops *ca; + const struct tcp_congestion_ops *ca; - /* if no choice made yet assign the current value set as default */ - if (icsk->icsk_ca_ops == &tcp_init_congestion_ops) { - rcu_read_lock(); - list_for_each_entry_rcu(ca, &tcp_cong_list, list) { - if (try_module_get(ca->owner)) { - icsk->icsk_ca_ops = ca; - break; - } + rcu_read_lock(); + ca = rcu_dereference(net->ipv4.tcp_congestion_control); + if (unlikely(!bpf_try_module_get(ca, ca->owner))) + ca = &tcp_reno; + icsk->icsk_ca_ops = ca; + rcu_read_unlock(); - /* fallback to next available */ - } - rcu_read_unlock(); - } + memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); + if (ca->flags & TCP_CONG_NEEDS_ECN) + INET_ECN_xmit(sk); + else + INET_ECN_dontxmit(sk); +} + +void tcp_init_congestion_control(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + tcp_sk(sk)->prior_ssthresh = 0; if (icsk->icsk_ca_ops->init) icsk->icsk_ca_ops->init(sk); + if (tcp_ca_needs_ecn(sk)) + INET_ECN_xmit(sk); + else + INET_ECN_dontxmit(sk); + icsk->icsk_ca_initialized = 1; +} + +static void tcp_reinit_congestion_control(struct sock *sk, + const struct tcp_congestion_ops *ca) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_cleanup_congestion_control(sk); + icsk->icsk_ca_ops = ca; + icsk->icsk_ca_setsockopt = 1; + memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); + + if (ca->flags & TCP_CONG_NEEDS_ECN) + INET_ECN_xmit(sk); + else + INET_ECN_dontxmit(sk); + + if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) + tcp_init_congestion_control(sk); } /* Manage refcounts on socket close. */ @@ -104,35 +270,38 @@ void tcp_cleanup_congestion_control(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); - if (icsk->icsk_ca_ops->release) + if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); - module_put(icsk->icsk_ca_ops->owner); + icsk->icsk_ca_initialized = 0; + bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner); } /* Used by sysctl to change default congestion control */ -int tcp_set_default_congestion_control(const char *name) +int tcp_set_default_congestion_control(struct net *net, const char *name) { struct tcp_congestion_ops *ca; - int ret = -ENOENT; + const struct tcp_congestion_ops *prev; + int ret; - spin_lock(&tcp_cong_list_lock); - ca = tcp_ca_find(name); -#ifdef CONFIG_MODULES - if (!ca && capable(CAP_NET_ADMIN)) { - spin_unlock(&tcp_cong_list_lock); - - request_module("tcp_%s", name); - spin_lock(&tcp_cong_list_lock); - ca = tcp_ca_find(name); - } -#endif + rcu_read_lock(); + ca = tcp_ca_find_autoload(name); + if (!ca) { + ret = -ENOENT; + } else if (!bpf_try_module_get(ca, ca->owner)) { + ret = -EBUSY; + } else if (!net_eq(net, &init_net) && + !(ca->flags & TCP_CONG_NON_RESTRICTED)) { + /* Only init netns can set default to a restricted algorithm */ + ret = -EPERM; + } else { + prev = xchg(&net->ipv4.tcp_congestion_control, ca); + if (prev) + bpf_module_put(prev, prev->owner); - if (ca) { - ca->flags |= TCP_CONG_NON_RESTRICTED; /* default is always allowed */ - list_move(&ca->list, &tcp_cong_list); + ca->flags |= TCP_CONG_NON_RESTRICTED; ret = 0; } - spin_unlock(&tcp_cong_list_lock); + rcu_read_unlock(); return ret; } @@ -140,11 +309,11 @@ int tcp_set_default_congestion_control(const char *name) /* Set default value from kernel configuration at bootup */ static int __init tcp_congestion_default(void) { - return tcp_set_default_congestion_control(CONFIG_DEFAULT_TCP_CONG); + return tcp_set_default_congestion_control(&init_net, + CONFIG_DEFAULT_TCP_CONG); } late_initcall(tcp_congestion_default); - /* Build string with list of available congestion control values */ void tcp_get_available_congestion_control(char *buf, size_t maxlen) { @@ -157,20 +326,20 @@ void tcp_get_available_congestion_control(char *buf, size_t maxlen) "%s%s", offs == 0 ? "" : " ", ca->name); + if (WARN_ON_ONCE(offs >= maxlen)) + break; } rcu_read_unlock(); } /* Get current default congestion control */ -void tcp_get_default_congestion_control(char *name) +void tcp_get_default_congestion_control(struct net *net, char *name) { - struct tcp_congestion_ops *ca; - /* We will always have reno... */ - BUG_ON(list_empty(&tcp_cong_list)); + const struct tcp_congestion_ops *ca; rcu_read_lock(); - ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list); - strncpy(name, ca->name, TCP_CA_NAME_MAX); + ca = rcu_dereference(net->ipv4.tcp_congestion_control); + strscpy(name, ca->name, TCP_CA_NAME_MAX); rcu_read_unlock(); } @@ -189,6 +358,8 @@ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen) "%s%s", offs == 0 ? "" : " ", ca->name); + if (WARN_ON_ONCE(offs >= maxlen)) + break; } rcu_read_unlock(); } @@ -232,115 +403,85 @@ out: return ret; } - -/* Change congestion control for socket */ -int tcp_set_congestion_control(struct sock *sk, const char *name) +/* Change congestion control for socket. If load is false, then it is the + * responsibility of the caller to call tcp_init_congestion_control or + * tcp_reinit_congestion_control (if the current congestion control was + * already initialized. + */ +int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, + bool cap_net_admin) { struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_congestion_ops *ca; + const struct tcp_congestion_ops *ca; int err = 0; + if (icsk->icsk_ca_dst_locked) + return -EPERM; + rcu_read_lock(); - ca = tcp_ca_find(name); + if (!load) + ca = tcp_ca_find(name); + else + ca = tcp_ca_find_autoload(name); - /* no change asking for existing value */ - if (ca == icsk->icsk_ca_ops) + /* No change asking for existing value */ + if (ca == icsk->icsk_ca_ops) { + icsk->icsk_ca_setsockopt = 1; goto out; - -#ifdef CONFIG_MODULES - /* not found attempt to autoload module */ - if (!ca && capable(CAP_NET_ADMIN)) { - rcu_read_unlock(); - request_module("tcp_%s", name); - rcu_read_lock(); - ca = tcp_ca_find(name); } -#endif + if (!ca) err = -ENOENT; - - else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || - ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) + else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) err = -EPERM; - - else if (!try_module_get(ca->owner)) + else if (!bpf_try_module_get(ca, ca->owner)) err = -EBUSY; - - else { - tcp_cleanup_congestion_control(sk); - icsk->icsk_ca_ops = ca; - - if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init) - icsk->icsk_ca_ops->init(sk); - } + else + tcp_reinit_congestion_control(sk, ca); out: rcu_read_unlock(); return err; } -/* RFC2861 Check whether we are limited by application or congestion window - * This is the inverse of cwnd check in tcp_tso_should_defer +/* Slow start is used when congestion window is no greater than the slow start + * threshold. We base on RFC2581 and also handle stretch ACKs properly. + * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but + * something better;) a packet is only considered (s)acked in its entirety to + * defend the ACK attacks described in the RFC. Slow start processes a stretch + * ACK of degree N as if N acks of degree 1 are received back to back except + * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and + * returns the leftover acks to adjust cwnd in congestion avoidance mode. */ -bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight) +__bpf_kfunc u32 tcp_slow_start(struct tcp_sock *tp, u32 acked) { - const struct tcp_sock *tp = tcp_sk(sk); - u32 left; - - if (in_flight >= tp->snd_cwnd) - return true; - - left = tp->snd_cwnd - in_flight; - if (sk_can_gso(sk) && - left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd && - left * tp->mss_cache < sk->sk_gso_max_size && - left < sk->sk_gso_max_segs) - return true; - return left <= tcp_max_tso_deferred_mss(tp); -} -EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited); + u32 cwnd = min(tcp_snd_cwnd(tp) + acked, tp->snd_ssthresh); -/* - * Slow start is used when congestion window is less than slow start - * threshold. This version implements the basic RFC2581 version - * and optionally supports: - * RFC3742 Limited Slow Start - growth limited to max_ssthresh - * RFC3465 Appropriate Byte Counting - growth limited by bytes acknowledged - */ -void tcp_slow_start(struct tcp_sock *tp) -{ - int cnt; /* increase in packets */ - unsigned int delta = 0; - u32 snd_cwnd = tp->snd_cwnd; + acked -= cwnd - tcp_snd_cwnd(tp); + tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); - if (unlikely(!snd_cwnd)) { - pr_err_once("snd_cwnd is nul, please report this bug.\n"); - snd_cwnd = 1U; - } - - if (sysctl_tcp_max_ssthresh > 0 && tp->snd_cwnd > sysctl_tcp_max_ssthresh) - cnt = sysctl_tcp_max_ssthresh >> 1; /* limited slow start */ - else - cnt = snd_cwnd; /* exponential increase */ - - tp->snd_cwnd_cnt += cnt; - while (tp->snd_cwnd_cnt >= snd_cwnd) { - tp->snd_cwnd_cnt -= snd_cwnd; - delta++; - } - tp->snd_cwnd = min(snd_cwnd + delta, tp->snd_cwnd_clamp); + return acked; } EXPORT_SYMBOL_GPL(tcp_slow_start); -/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w) */ -void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w) +/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w), + * for every packet that was ACKed. + */ +__bpf_kfunc void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked) { + /* If credits accumulated at a higher w, apply them gently now. */ if (tp->snd_cwnd_cnt >= w) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; tp->snd_cwnd_cnt = 0; - } else { - tp->snd_cwnd_cnt++; + tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1); + } + + tp->snd_cwnd_cnt += acked; + if (tp->snd_cwnd_cnt >= w) { + u32 delta = tp->snd_cwnd_cnt / w; + + tp->snd_cwnd_cnt -= delta * w; + tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + delta); } + tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), tp->snd_cwnd_clamp)); } EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai); @@ -351,37 +492,40 @@ EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai); /* This is Jacobson's slow start and congestion avoidance. * SIGCOMM '88, p. 328. */ -void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +__bpf_kfunc void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; /* In "safe" area, increase. */ - if (tp->snd_cwnd <= tp->snd_ssthresh) - tcp_slow_start(tp); + if (tcp_in_slow_start(tp)) { + acked = tcp_slow_start(tp, acked); + if (!acked) + return; + } /* In dangerous area, increase slowly. */ - else - tcp_cong_avoid_ai(tp, tp->snd_cwnd); + tcp_cong_avoid_ai(tp, tcp_snd_cwnd(tp), acked); } EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); /* Slow start threshold is half the congestion window (min 2) */ -u32 tcp_reno_ssthresh(struct sock *sk) +__bpf_kfunc u32 tcp_reno_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); - return max(tp->snd_cwnd >> 1U, 2U); + + return max(tcp_snd_cwnd(tp) >> 1U, 2U); } EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); -/* Lower bound on congestion window with halving. */ -u32 tcp_reno_min_cwnd(const struct sock *sk) +__bpf_kfunc u32 tcp_reno_undo_cwnd(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); - return tp->snd_ssthresh/2; + + return max(tcp_snd_cwnd(tp), tp->prior_cwnd); } -EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); +EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd); struct tcp_congestion_ops tcp_reno = { .flags = TCP_CONG_NON_RESTRICTED, @@ -389,18 +533,5 @@ struct tcp_congestion_ops tcp_reno = { .owner = THIS_MODULE, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, -}; - -/* Initial congestion control used (until SYN) - * really reno under another name so we can tell difference - * during tcp_set_default_congestion_control - */ -struct tcp_congestion_ops tcp_init_congestion_ops = { - .name = "", - .owner = THIS_MODULE, - .ssthresh = tcp_reno_ssthresh, - .cong_avoid = tcp_reno_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, + .undo_cwnd = tcp_reno_undo_cwnd, }; -EXPORT_SYMBOL_GPL(tcp_init_congestion_ops); |
