diff options
Diffstat (limited to 'net/ipv4/tcp_bbr.c')
| -rw-r--r-- | net/ipv4/tcp_bbr.c | 99 |
1 files changed, 63 insertions, 36 deletions
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 56be7d27f208..760941e55153 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -56,6 +56,8 @@ * otherwise TCP stack falls back to an internal pacing using one high * resolution timer per TCP socket and may use more resources. */ +#include <linux/btf.h> +#include <linux/btf_ids.h> #include <linux/module.h> #include <net/tcp.h> #include <linux/inet_diag.h> @@ -256,7 +258,7 @@ static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) u64 rate = bw; rate = bbr_rate_bytes_per_sec(sk, rate, gain); - rate = min_t(u64, rate, sk->sk_max_pacing_rate); + rate = min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate)); return rate; } @@ -274,9 +276,10 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk) } else { /* no RTT sample yet */ rtt_us = USEC_PER_MSEC; /* use nominal default RTT */ } - bw = (u64)tp->snd_cwnd * BW_UNIT; + bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT; do_div(bw, rtt_us); - sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain); + WRITE_ONCE(sk->sk_pacing_rate, + bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain)); } /* Pace using current bw estimate and a gain factor. */ @@ -288,14 +291,14 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) if (unlikely(!bbr->has_seen_rtt && tp->srtt_us)) bbr_init_pacing_rate_from_rtt(sk); - if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate) - sk->sk_pacing_rate = rate; + if (bbr_full_bw_reached(sk) || rate > READ_ONCE(sk->sk_pacing_rate)) + WRITE_ONCE(sk->sk_pacing_rate, rate); } /* override sysctl_tcp_min_tso_segs */ -static u32 bbr_min_tso_segs(struct sock *sk) +__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk) { - return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; + return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2; } static u32 bbr_tso_segs_goal(struct sock *sk) @@ -306,8 +309,9 @@ static u32 bbr_tso_segs_goal(struct sock *sk) /* Sort of tcp_tso_autosize() but ignoring * driver provided sk_gso_max_size. */ - bytes = min_t(unsigned long, sk->sk_pacing_rate >> sk->sk_pacing_shift, - GSO_MAX_SIZE - 1 - MAX_TCP_HEADER); + bytes = min_t(unsigned long, + READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift), + GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER); segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); return min(segs, 0x7FU); @@ -320,12 +324,12 @@ static void bbr_save_cwnd(struct sock *sk) struct bbr *bbr = inet_csk_ca(sk); if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT) - bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */ + bbr->prior_cwnd = tcp_snd_cwnd(tp); /* this cwnd is good enough */ else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */ - bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd); + bbr->prior_cwnd = max(bbr->prior_cwnd, tcp_snd_cwnd(tp)); } -static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) +__bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) { struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); @@ -346,7 +350,7 @@ static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) /* Calculate bdp based on min RTT and the estimated bottleneck bandwidth: * - * bdp = bw * min_rtt * gain + * bdp = ceil(bw * min_rtt * gain) * * The key factor, gain, controls the amount of queue. While a small gain * builds a smaller queue, it becomes more vulnerable to noise in RTT @@ -370,7 +374,9 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) w = (u64)bw * bbr->min_rtt_us; - /* Apply a gain to the given value, then remove the BW_SCALE shift. */ + /* Apply a gain to the given value, remove the BW_SCALE shift, and + * round the value up to avoid a negative feedback loop. + */ bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; return bdp; @@ -386,7 +392,7 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) * which allows 2 outstanding 2-packet sequences, to try to keep pipe * full even with ACK-every-other-packet delayed ACKs. */ -static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd, int gain) +static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) { struct bbr *bbr = inet_csk_ca(sk); @@ -397,7 +403,7 @@ static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd, int gain) cwnd = (cwnd + 1) & ~1U; /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ - if (bbr->mode == BBR_PROBE_BW && gain > BBR_UNIT) + if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0) cwnd += 2; return cwnd; @@ -409,7 +415,7 @@ static u32 bbr_inflight(struct sock *sk, u32 bw, int gain) u32 inflight; inflight = bbr_bdp(sk, bw, gain); - inflight = bbr_quantization_budget(sk, inflight, gain); + inflight = bbr_quantization_budget(sk, inflight); return inflight; } @@ -477,7 +483,7 @@ static bool bbr_set_cwnd_to_recover_or_restore( struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; - u32 cwnd = tp->snd_cwnd; + u32 cwnd = tcp_snd_cwnd(tp); /* An ACK for P pkts should release at most 2*P packets. We do this * in two steps. First, here we deduct the number of lost packets. @@ -515,7 +521,7 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, { struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); - u32 cwnd = tp->snd_cwnd, target_cwnd = 0; + u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0; if (!acked) goto done; /* no packet fully ACKed; just apply caps */ @@ -529,7 +535,7 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, * due to aggregation (of data and/or ACKs) visible in the ACK stream. */ target_cwnd += bbr_ack_aggregation_cwnd(sk); - target_cwnd = bbr_quantization_budget(sk, target_cwnd, gain); + target_cwnd = bbr_quantization_budget(sk, target_cwnd); /* If we're below target cwnd, slow start cwnd toward target cwnd. */ if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ @@ -539,9 +545,9 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, cwnd = max(cwnd, bbr_cwnd_min_target); done: - tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */ + tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* apply global cap */ if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ - tp->snd_cwnd = min(tp->snd_cwnd, bbr_cwnd_min_target); + tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target)); } /* End cycle phase if it's time and/or we hit the phase's in-flight target. */ @@ -613,7 +619,7 @@ static void bbr_reset_probe_bw_mode(struct sock *sk) struct bbr *bbr = inet_csk_ca(sk); bbr->mode = BBR_PROBE_BW; - bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand); + bbr->cycle_idx = CYCLE_LEN - 1 - get_random_u32_below(bbr_cycle_rand); bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ } @@ -776,8 +782,7 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) * bandwidth sample. Delivered is in packets and interval_us in uS and * ratio will be <<1 for most connections. So delivered is first scaled. */ - bw = (u64)rs->delivered * BW_UNIT; - do_div(bw, rs->interval_us); + bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us); /* If this sample is application-limited, it is likely to have a very * low delivered count that represents application behavior rather than @@ -852,7 +857,7 @@ static void bbr_update_ack_aggregation(struct sock *sk, bbr->ack_epoch_acked = min_t(u32, 0xFFFFF, bbr->ack_epoch_acked + rs->acked_sacked); extra_acked = bbr->ack_epoch_acked - expected_acked; - extra_acked = min(extra_acked, tp->snd_cwnd); + extra_acked = min(extra_acked, tcp_snd_cwnd(tp)); if (extra_acked > bbr->extra_acked[bbr->extra_acked_win_idx]) bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; } @@ -910,7 +915,7 @@ static void bbr_check_probe_rtt_done(struct sock *sk) return; bbr->min_rtt_stamp = tcp_jiffies32; /* wait a while until PROBE_RTT */ - tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd); + tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); bbr_reset_mode(sk); } @@ -943,7 +948,7 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) filter_expired = after(tcp_jiffies32, bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); if (rs->rtt_us >= 0 && - (rs->rtt_us <= bbr->min_rtt_us || + (rs->rtt_us < bbr->min_rtt_us || (filter_expired && !rs->is_ack_delayed))) { bbr->min_rtt_us = rs->rtt_us; bbr->min_rtt_stamp = tcp_jiffies32; @@ -1019,7 +1024,7 @@ static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) bbr_update_gains(sk); } -static void bbr_main(struct sock *sk, const struct rate_sample *rs) +__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) { struct bbr *bbr = inet_csk_ca(sk); u32 bw; @@ -1031,7 +1036,7 @@ static void bbr_main(struct sock *sk, const struct rate_sample *rs) bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); } -static void bbr_init(struct sock *sk) +__bpf_kfunc static void bbr_init(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); @@ -1039,7 +1044,7 @@ static void bbr_init(struct sock *sk) bbr->prior_cwnd = 0; tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; bbr->rtt_cnt = 0; - bbr->next_rtt_delivered = 0; + bbr->next_rtt_delivered = tp->delivered; bbr->prev_ca_state = TCP_CA_Open; bbr->packet_conservation = 0; @@ -1073,7 +1078,7 @@ static void bbr_init(struct sock *sk) cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); } -static u32 bbr_sndbuf_expand(struct sock *sk) +__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) { /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ return 3; @@ -1082,18 +1087,18 @@ static u32 bbr_sndbuf_expand(struct sock *sk) /* In theory BBR does not need to undo the cwnd since it does not * always reduce cwnd on losses (see bbr_main()). Keep it for now. */ -static u32 bbr_undo_cwnd(struct sock *sk) +__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) { struct bbr *bbr = inet_csk_ca(sk); bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ bbr->full_bw_cnt = 0; bbr_reset_lt_bw_sampling(sk); - return tcp_sk(sk)->snd_cwnd; + return tcp_snd_cwnd(tcp_sk(sk)); } /* Entering loss recovery, so save cwnd for when we exit or undo recovery. */ -static u32 bbr_ssthresh(struct sock *sk) +__bpf_kfunc static u32 bbr_ssthresh(struct sock *sk) { bbr_save_cwnd(sk); return tcp_sk(sk)->snd_ssthresh; @@ -1121,7 +1126,7 @@ static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, return 0; } -static void bbr_set_state(struct sock *sk, u8 new_state) +__bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state) { struct bbr *bbr = inet_csk_ca(sk); @@ -1150,9 +1155,31 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { .set_state = bbr_set_state, }; +BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids) +BTF_ID_FLAGS(func, bbr_init) +BTF_ID_FLAGS(func, bbr_main) +BTF_ID_FLAGS(func, bbr_sndbuf_expand) +BTF_ID_FLAGS(func, bbr_undo_cwnd) +BTF_ID_FLAGS(func, bbr_cwnd_event) +BTF_ID_FLAGS(func, bbr_ssthresh) +BTF_ID_FLAGS(func, bbr_min_tso_segs) +BTF_ID_FLAGS(func, bbr_set_state) +BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids) + +static const struct btf_kfunc_id_set tcp_bbr_kfunc_set = { + .owner = THIS_MODULE, + .set = &tcp_bbr_check_kfunc_ids, +}; + static int __init bbr_register(void) { + int ret; + BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE); + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_bbr_kfunc_set); + if (ret < 0) + return ret; return tcp_register_congestion_control(&tcp_bbr_cong_ops); } |
