From c3426ba2ed6942fe33c75bf17fc7513ba2c6ac64 Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Thu, 11 Sep 2025 13:06:31 +0200 Subject: tcp: reorganize tcp_sock_write_txrx group for variables later Use the first 3-byte hole at the beginning of the tcp_sock_write_txrx group for 'noneagle'/'rate_app_limited' to fill in the existing hole in later patches. Therefore, the group size of tcp_sock_write_txrx is reduced from 92 + 4 to 91 + 4. In addition, the group size of tcp_sock_write_rx is changed to 96 to fit in the pahole outcome. Below are the trimmed pahole outcomes before and after this patch: [BEFORE THIS PATCH] struct tcp_sock { [...] __cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2521 0 */ /* XXX 3 bytes hole, try to pack */ [...] struct tcp_options_received rx_opt; /* 2588 24 */ u8 nonagle:4; /* 2612: 0 1 */ u8 rate_app_limited:1; /* 2612: 4 1 */ /* XXX 3 bits hole, try to pack */ __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2613 0 */ /* XXX 3 bytes hole, try to pack */ __cacheline_group_begin__tcp_sock_write_rx[0] __attribute__((__aligned__(8))); /* 2616 0 */ [...] __cacheline_group_end__tcp_sock_write_rx[0]; /* 2712 0 */ [...] /* size: 3200, cachelines: 50, members: 161 */ } [AFTER THIS PATCH] struct tcp_sock { [...] __cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2521 0 */ u8 nonagle:4; /* 2521: 0 1 */ u8 rate_app_limited:1; /* 2521: 4 1 */ /* XXX 3 bits hole, try to pack */ /* XXX 2 bytes hole, try to pack */ [...] struct tcp_options_received rx_opt; /* 2588 24 */ __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2612 0 */ /* XXX 4 bytes hole, try to pack */ __cacheline_group_begin__tcp_sock_write_rx[0] __attribute__((__aligned__(8))); /* 2616 0 */ [...] __cacheline_group_end__tcp_sock_write_rx[0]; /* 2712 0 */ [...] /* size: 3200, cachelines: 50, members: 161 */ } Signed-off-by: Chia-Yu Chang Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250911110642.87529-4-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Jakub Kicinski --- include/linux/tcp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/tcp.h') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 57e478bfaef2..d103cc0e7a35 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -285,6 +285,8 @@ struct tcp_sock { * Header prediction flags * 0x5?10 << 16 + snd_wnd in net byte order */ + u8 nonagle : 4,/* Disable Nagle algorithm? */ + rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ __be32 pred_flags; u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ u64 tcp_mstamp; /* most recent packet received/sent */ @@ -303,8 +305,6 @@ struct tcp_sock { * Options received (usually on last packet, some only on SYN packets). */ struct tcp_options_received rx_opt; - u8 nonagle : 4,/* Disable Nagle algorithm? */ - rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ __cacheline_group_end(tcp_sock_write_txrx); /* RX read-write hotpath cache lines */ -- cgit From 542a495cbaa6dc57a310da62b501fdf318657cad Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 16 Sep 2025 10:24:25 +0200 Subject: tcp: AccECN core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change implements Accurate ECN without negotiation and AccECN Option (that will be added by later changes). Based on AccECN specifications: https://tools.ietf.org/id/draft-ietf-tcpm-accurate-ecn-28.txt Accurate ECN allows feeding back the number of CE (congestion experienced) marks accurately to the sender in contrast to RFC3168 ECN that can only signal one marks-seen-yes/no per RTT. Congestion control algorithms can take advantage of the accurate ECN information to fine-tune their congestion response to avoid drastic rate reduction when only mild congestion is encountered. With Accurate ECN, tp->received_ce (r.cep in AccECN spec) keeps track of how many segments have arrived with a CE mark. Accurate ECN uses ACE field (ECE, CWR, AE) to communicate the value back to the sender which updates tp->delivered_ce (s.cep) based on the feedback. This signalling channel is lossy when ACE field overflow occurs. Conservative strategy is selected here to deal with the ACE overflow, however, some strategies using the AccECN option later in the overall patchset mitigate against false overflows detected. The ACE field values on the wire are offset by TCP_ACCECN_CEP_INIT_OFFSET. Delivered_ce/received_ce count the real CE marks rather than forcing all downstream users to adapt to the wire offset. This patch uses the first 1-byte hole and the last 4-byte hole of the tcp_sock_write_txrx for 'received_ce_pending' and 'received_ce'. Also, the group size of tcp_sock_write_txrx is increased from 91 + 4 to 95 + 4 due to the new u32 received_ce member. Below are the trimmed pahole outcomes before and after this patch. [BEFORE THIS PATCH] struct tcp_sock { [...] __cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2521 0 */ u8 nonagle:4; /* 2521: 0 1 */ u8 rate_app_limited:1; /* 2521: 4 1 */ /* XXX 3 bits hole, try to pack */ /* XXX 2 bytes hole, try to pack */ [...] u32 delivered_ce; /* 2576 4 */ u32 app_limited; /* 2580 4 */ u32 rcv_wnd; /* 2684 4 */ struct tcp_options_received rx_opt; /* 2688 24 */ __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2612 0 */ /* XXX 4 bytes hole, try to pack */ [...] /* size: 3200, cachelines: 50, members: 161 */ } [AFTER THIS PATCH] struct tcp_sock { [...] __cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2521 0 */ u8 nonagle:4; /* 2521: 0 1 */ u8 rate_app_limited:1; /* 2521: 4 1 */ /* XXX 3 bits hole, try to pack */ /* Force alignment to the next boundary: */ u8 :0; u8 received_ce_pending:4;/* 2522: 0 1 */ u8 unused2:4; /* 2522: 4 1 */ /* XXX 1 byte hole, try to pack */ [...] u32 delivered_ce; /* 2576 4 */ u32 received_ce; /* 2580 4 */ u32 app_limited; /* 2584 4 */ u32 rcv_wnd; /* 2588 4 */ struct tcp_options_received rx_opt; /* 2592 24 */ __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2616 0 */ [...] /* size: 3200, cachelines: 50, members: 164 */ } Signed-off-by: Ilpo Järvinen Co-developed-by: Olivier Tilmans Signed-off-by: Olivier Tilmans Co-developed-by: Chia-Yu Chang Signed-off-by: Chia-Yu Chang Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-2-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- include/linux/tcp.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/tcp.h') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index d103cc0e7a35..90cee6e53527 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -287,6 +287,8 @@ struct tcp_sock { */ u8 nonagle : 4,/* Disable Nagle algorithm? */ rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ + u8 received_ce_pending:4, /* Not yet transmit cnt of received_ce */ + unused2:4; __be32 pred_flags; u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ u64 tcp_mstamp; /* most recent packet received/sent */ @@ -299,6 +301,7 @@ struct tcp_sock { u32 snd_up; /* Urgent pointer */ u32 delivered; /* Total data packets delivered incl. rexmits */ u32 delivered_ce; /* Like the above but only ECE marked packets */ + u32 received_ce; /* Like the above but for rcvd CE marked pkts */ u32 app_limited; /* limited until "delivered" reaches this val */ u32 rcv_wnd; /* Current receiver window */ /* -- cgit From 3cae34274c79e0c60ccd1c10516973af1aed2a7c Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 16 Sep 2025 10:24:26 +0200 Subject: tcp: accecn: AccECN negotiation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Accurate ECN negotiation parts based on the specification: https://tools.ietf.org/id/draft-ietf-tcpm-accurate-ecn-28.txt Accurate ECN is negotiated using ECE, CWR and AE flags in the TCP header. TCP falls back into using RFC3168 ECN if one of the ends supports only RFC3168-style ECN. The AccECN negotiation includes reflecting IP ECN field value seen in SYN and SYNACK back using the same bits as negotiation to allow responding to SYN CE marks and to detect ECN field mangling. CE marks should not occur currently because SYN=1 segments are sent with Non-ECT in IP ECN field (but proposal exists to remove this restriction). Reflecting SYN IP ECN field in SYNACK is relatively simple. Reflecting SYNACK IP ECN field in the final/third ACK of the handshake is more challenging. Linux TCP code is not well prepared for using the final/third ACK a signalling channel which makes things somewhat complicated here. tcp_ecn sysctl can be used to select the highest ECN variant (Accurate ECN, ECN, No ECN) that is attemped to be negotiated and requested for incoming connection and outgoing connection: TCP_ECN_IN_NOECN_OUT_NOECN, TCP_ECN_IN_ECN_OUT_ECN, TCP_ECN_IN_ECN_OUT_NOECN, TCP_ECN_IN_ACCECN_OUT_ACCECN, TCP_ECN_IN_ACCECN_OUT_ECN, and TCP_ECN_IN_ACCECN_OUT_NOECN. After this patch, the size of tcp_request_sock remains unchanged and no new holes are added. Below are the pahole outcomes before and after this patch: [BEFORE THIS PATCH] struct tcp_request_sock { [...] u32 rcv_nxt; /* 352 4 */ u8 syn_tos; /* 356 1 */ /* size: 360, cachelines: 6, members: 16 */ } [AFTER THIS PATCH] struct tcp_request_sock { [...] u32 rcv_nxt; /* 352 4 */ u8 syn_tos; /* 356 1 */ bool accecn_ok; /* 357 1 */ u8 syn_ect_snt:2; /* 358: 0 1 */ u8 syn_ect_rcv:2; /* 358: 2 1 */ u8 accecn_fail_mode:4; /* 358: 4 1 */ /* size: 360, cachelines: 6, members: 20 */ } After this patch, the size of tcp_sock remains unchanged and no new holes are added. Also, 4 bits of the existing 2-byte hole are exploited. Below are the pahole outcomes before and after this patch: [BEFORE THIS PATCH] struct tcp_sock { [...] u8 dup_ack_counter:2; /* 2761: 0 1 */ u8 tlp_retrans:1; /* 2761: 2 1 */ u8 unused:5; /* 2761: 3 1 */ u8 thin_lto:1; /* 2762: 0 1 */ u8 fastopen_connect:1; /* 2762: 1 1 */ u8 fastopen_no_cookie:1; /* 2762: 2 1 */ u8 fastopen_client_fail:2; /* 2762: 3 1 */ u8 frto:1; /* 2762: 5 1 */ /* XXX 2 bits hole, try to pack */ [...] u8 keepalive_probes; /* 2765 1 */ /* XXX 2 bytes hole, try to pack */ [...] /* size: 3200, cachelines: 50, members: 164 */ } [AFTER THIS PATCH] struct tcp_sock { [...] u8 dup_ack_counter:2; /* 2761: 0 1 */ u8 tlp_retrans:1; /* 2761: 2 1 */ u8 syn_ect_snt:2; /* 2761: 3 1 */ u8 syn_ect_rcv:2; /* 2761: 5 1 */ u8 thin_lto:1; /* 2761: 7 1 */ u8 fastopen_connect:1; /* 2762: 0 1 */ u8 fastopen_no_cookie:1; /* 2762: 1 1 */ u8 fastopen_client_fail:2; /* 2762: 2 1 */ u8 frto:1; /* 2762: 4 1 */ /* XXX 3 bits hole, try to pack */ [...] u8 keepalive_probes; /* 2765 1 */ u8 accecn_fail_mode:4; /* 2766: 0 1 */ /* XXX 4 bits hole, try to pack */ /* XXX 1 byte hole, try to pack */ [...] /* size: 3200, cachelines: 50, members: 166 */ } Signed-off-by: Ilpo Järvinen Co-developed-by: Olivier Tilmans Signed-off-by: Olivier Tilmans Co-developed-by: Chia-Yu Chang Signed-off-by: Chia-Yu Chang Acked-by: Paolo Abeni Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-3-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- include/linux/tcp.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux/tcp.h') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 90cee6e53527..b8432bed546d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -168,6 +168,10 @@ struct tcp_request_sock { * after data-in-SYN. */ u8 syn_tos; + bool accecn_ok; + u8 syn_ect_snt: 2, + syn_ect_rcv: 2, + accecn_fail_mode:4; #ifdef CONFIG_TCP_AO u8 ao_keyid; u8 ao_rcv_next; @@ -375,7 +379,8 @@ struct tcp_sock { u8 compressed_ack; u8 dup_ack_counter:2, tlp_retrans:1, /* TLP is a retransmission */ - unused:5; + syn_ect_snt:2, /* AccECN ECT memory, only */ + syn_ect_rcv:2; /* ... needed during 3WHS + first seqno */ u8 thin_lto : 1,/* Use linear timeouts for thin streams */ fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ @@ -391,6 +396,7 @@ struct tcp_sock { syn_fastopen_child:1; /* created TFO passive child socket */ u8 keepalive_probes; /* num of allowed keep alive probes */ + u8 accecn_fail_mode:4; /* AccECN failure handling */ u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ /* RTT measurement */ -- cgit From 9a011277445583bab002fbf5043fab0ea03dc5dd Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 16 Sep 2025 10:24:27 +0200 Subject: tcp: accecn: add AccECN rx byte counters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These three byte counters track IP ECN field payload byte sums for all arriving (acceptable) packets for ECT0, ECT1, and CE. The AccECN option (added by a later patch in the series) echoes these counters back to sender side; therefore, it is placed within the group of tcp_sock_write_txrx. Below are the pahole outcomes before and after this patch, in which the group size of tcp_sock_write_txrx is increased from 95 + 4 to 107 + 4 and an extra 4-byte hole is created but will be exploited in later patches: [BEFORE THIS PATCH] struct tcp_sock { [...] u32 delivered_ce; /* 2576 4 */ u32 received_ce; /* 2580 4 */ u32 app_limited; /* 2584 4 */ u32 rcv_wnd; /* 2588 4 */ struct tcp_options_received rx_opt; /* 2592 24 */ __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2616 0 */ [...] /* size: 3200, cachelines: 50, members: 166 */ } [AFTER THIS PATCH] struct tcp_sock { [...] u32 delivered_ce; /* 2576 4 */ u32 received_ce; /* 2580 4 */ u32 received_ecn_bytes[3];/* 2584 12 */ u32 app_limited; /* 2596 4 */ u32 rcv_wnd; /* 2600 4 */ struct tcp_options_received rx_opt; /* 2604 24 */ __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2628 0 */ /* XXX 4 bytes hole, try to pack */ [...] /* size: 3200, cachelines: 50, members: 167 */ } Signed-off-by: Ilpo Järvinen Signed-off-by: Neal Cardwell Co-developed-by: Chia-Yu Chang Signed-off-by: Chia-Yu Chang Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-4-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- include/linux/tcp.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux/tcp.h') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index b8432bed546d..012d01347b3c 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -306,6 +306,10 @@ struct tcp_sock { u32 delivered; /* Total data packets delivered incl. rexmits */ u32 delivered_ce; /* Like the above but only ECE marked packets */ u32 received_ce; /* Like the above but for rcvd CE marked pkts */ + u32 received_ecn_bytes[3]; /* received byte counters for three ECN + * types: INET_ECN_ECT_1, INET_ECN_ECT_0, + * and INET_ECN_CE + */ u32 app_limited; /* limited until "delivered" reaches this val */ u32 rcv_wnd; /* Current receiver window */ /* -- cgit From b5e74132dfbe60329b3ff0e5c485039f2e31605c Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 16 Sep 2025 10:24:30 +0200 Subject: tcp: accecn: AccECN option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Accurate ECN allows echoing back the sum of bytes for each IP ECN field value in the received packets using AccECN option. This change implements AccECN option tx & rx side processing without option send control related features that are added by a later change. Based on specification: https://tools.ietf.org/id/draft-ietf-tcpm-accurate-ecn-28.txt (Some features of the spec will be added in the later changes rather than in this one). A full-length AccECN option is always attempted but if it does not fit, the minimum length is selected based on the counters that have changed since the last update. The AccECN option (with 24-bit fields) often ends in odd sizes so the option write code tries to take advantage of some nop used to pad the other TCP options. The delivered_ecn_bytes pairs with received_ecn_bytes similar to how delivered_ce pairs with received_ce. In contrast to ACE field, however, the option is not always available to update delivered_ecn_bytes. For ACK w/o AccECN option, the delivered bytes calculated based on the cumulative ACK+SACK information are assigned to one of the counters using an estimation heuristic to select the most likely ECN byte counter. Any estimation error is corrected when the next AccECN option arrives. It may occur that the heuristic gets too confused when there are enough different byte counter deltas between ACKs with the AccECN option in which case the heuristic just gives up on updating the counters for a while. tcp_ecn_option sysctl can be used to select option sending mode for AccECN: TCP_ECN_OPTION_DISABLED, TCP_ECN_OPTION_MINIMUM, and TCP_ECN_OPTION_FULL. This patch increases the size of tcp_info struct, as there is no existing holes for new u32 variables. Below are the pahole outcomes before and after this patch: [BEFORE THIS PATCH] struct tcp_info { [...] __u32 tcpi_total_rto_time; /* 244 4 */ /* size: 248, cachelines: 4, members: 61 */ } [AFTER THIS PATCH] struct tcp_info { [...] __u32 tcpi_total_rto_time; /* 244 4 */ __u32 tcpi_received_ce; /* 248 4 */ __u32 tcpi_delivered_e1_bytes; /* 252 4 */ __u32 tcpi_delivered_e0_bytes; /* 256 4 */ __u32 tcpi_delivered_ce_bytes; /* 260 4 */ __u32 tcpi_received_e1_bytes; /* 264 4 */ __u32 tcpi_received_e0_bytes; /* 268 4 */ __u32 tcpi_received_ce_bytes; /* 272 4 */ /* size: 280, cachelines: 5, members: 68 */ } This patch uses the existing 1-byte holes in the tcp_sock_write_txrx group for new u8 members, but adds a 4-byte hole in tcp_sock_write_rx group after the new u32 delivered_ecn_bytes[3] member. Therefore, the group size of tcp_sock_write_rx is increased from 96 to 112. Below are the pahole outcomes before and after this patch: [BEFORE THIS PATCH] struct tcp_sock { [...] u8 received_ce_pending:4; /* 2522: 0 1 */ u8 unused2:4; /* 2522: 4 1 */ /* XXX 1 byte hole, try to pack */ [...] u32 rcv_rtt_last_tsecr; /* 2668 4 */ [...] __cacheline_group_end__tcp_sock_write_rx[0]; /* 2728 0 */ [...] /* size: 3200, cachelines: 50, members: 167 */ } [AFTER THIS PATCH] struct tcp_sock { [...] u8 received_ce_pending:4;/* 2522: 0 1 */ u8 unused2:4; /* 2522: 4 1 */ u8 accecn_minlen:2; /* 2523: 0 1 */ u8 est_ecnfield:2; /* 2523: 2 1 */ u8 unused3:4; /* 2523: 4 1 */ [...] u32 rcv_rtt_last_tsecr; /* 2668 4 */ u32 delivered_ecn_bytes[3];/* 2672 12 */ /* XXX 4 bytes hole, try to pack */ [...] __cacheline_group_end__tcp_sock_write_rx[0]; /* 2744 0 */ [...] /* size: 3200, cachelines: 50, members: 171 */ } Signed-off-by: Ilpo Järvinen Signed-off-by: Neal Cardwell Co-developed-by: Chia-Yu Chang Signed-off-by: Chia-Yu Chang Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-7-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- include/linux/tcp.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux/tcp.h') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 012d01347b3c..73557656cb2d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -122,8 +122,9 @@ struct tcp_options_received { smc_ok : 1, /* SMC seen on SYN packet */ snd_wscale : 4, /* Window scaling received from sender */ rcv_wscale : 4; /* Window scaling to send to receiver */ - u8 saw_unknown:1, /* Received unknown option */ - unused:7; + u8 accecn:6, /* AccECN index in header, 0=no options */ + saw_unknown:1, /* Received unknown option */ + unused:1; u8 num_sacks; /* Number of SACK blocks */ u16 user_mss; /* mss requested by user in ioctl */ u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ @@ -293,6 +294,9 @@ struct tcp_sock { rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ u8 received_ce_pending:4, /* Not yet transmit cnt of received_ce */ unused2:4; + u8 accecn_minlen:2,/* Minimum length of AccECN option sent */ + est_ecnfield:2,/* ECN field for AccECN delivered estimates */ + unused3:4; __be32 pred_flags; u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ u64 tcp_mstamp; /* most recent packet received/sent */ @@ -337,6 +341,7 @@ struct tcp_sock { u32 rate_delivered; /* saved rate sample: packets delivered */ u32 rate_interval_us; /* saved rate sample: time elapsed */ u32 rcv_rtt_last_tsecr; + u32 delivered_ecn_bytes[3]; u64 first_tx_mstamp; /* start of window send phase */ u64 delivered_mstamp; /* time we reached "delivered" */ u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked -- cgit From aa55a7dde7ec506bb23448a5005ae3f4f809d022 Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Tue, 16 Sep 2025 10:24:31 +0200 Subject: tcp: accecn: AccECN option send control MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of sending the option in every ACK, limit sending to those ACKs where the option is necessary: - Handshake - "Change-triggered ACK" + the ACK following it. The 2nd ACK is necessary to unambiguously indicate which of the ECN byte counters in increasing. The first ACK has two counters increasing due to the ecnfield edge. - ACKs with CE to allow CEP delta validations to take advantage of the option. - Force option to be sent every at least once per 2^22 bytes. The check is done using the bit edges of the byte counters (avoids need for extra variables). - AccECN option beacon to send a few times per RTT even if nothing in the ECN state requires that. The default is 3 times per RTT, and its period can be set via sysctl_tcp_ecn_option_beacon. Below are the pahole outcomes before and after this patch, in which the group size of tcp_sock_write_tx is increased from 89 to 97 due to the new u64 accecn_opt_tstamp member: [BEFORE THIS PATCH] struct tcp_sock { [...] u64 tcp_wstamp_ns; /* 2488 8 */ struct list_head tsorted_sent_queue; /* 2496 16 */ [...] __cacheline_group_end__tcp_sock_write_tx[0]; /* 2521 0 */ __cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2521 0 */ u8 nonagle:4; /* 2521: 0 1 */ u8 rate_app_limited:1; /* 2521: 4 1 */ /* XXX 3 bits hole, try to pack */ /* Force alignment to the next boundary: */ u8 :0; u8 received_ce_pending:4;/* 2522: 0 1 */ u8 unused2:4; /* 2522: 4 1 */ u8 accecn_minlen:2; /* 2523: 0 1 */ u8 est_ecnfield:2; /* 2523: 2 1 */ u8 unused3:4; /* 2523: 4 1 */ [...] __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2628 0 */ [...] /* size: 3200, cachelines: 50, members: 171 */ } [AFTER THIS PATCH] struct tcp_sock { [...] u64 tcp_wstamp_ns; /* 2488 8 */ u64 accecn_opt_tstamp; /* 2596 8 */ struct list_head tsorted_sent_queue; /* 2504 16 */ [...] __cacheline_group_end__tcp_sock_write_tx[0]; /* 2529 0 */ __cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2529 0 */ u8 nonagle:4; /* 2529: 0 1 */ u8 rate_app_limited:1; /* 2529: 4 1 */ /* XXX 3 bits hole, try to pack */ /* Force alignment to the next boundary: */ u8 :0; u8 received_ce_pending:4;/* 2530: 0 1 */ u8 unused2:4; /* 2530: 4 1 */ u8 accecn_minlen:2; /* 2531: 0 1 */ u8 est_ecnfield:2; /* 2531: 2 1 */ u8 accecn_opt_demand:2; /* 2531: 4 1 */ u8 prev_ecnfield:2; /* 2531: 6 1 */ [...] __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2636 0 */ [...] /* size: 3200, cachelines: 50, members: 173 */ } Signed-off-by: Chia-Yu Chang Co-developed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-8-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- include/linux/tcp.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux/tcp.h') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 73557656cb2d..f637b659b35a 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -275,6 +275,7 @@ struct tcp_sock { u32 mdev_us; /* medium deviation */ u32 rtt_seq; /* sequence number to update rttvar */ u64 tcp_wstamp_ns; /* departure time for next sent data packet */ + u64 accecn_opt_tstamp; /* Last AccECN option sent timestamp */ struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */ struct sk_buff *highest_sack; /* skb just after the highest * skb with SACKed bit set @@ -296,7 +297,8 @@ struct tcp_sock { unused2:4; u8 accecn_minlen:2,/* Minimum length of AccECN option sent */ est_ecnfield:2,/* ECN field for AccECN delivered estimates */ - unused3:4; + accecn_opt_demand:2,/* Demand AccECN option for n next ACKs */ + prev_ecnfield:2; /* ECN bits from the previous segment */ __be32 pred_flags; u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ u64 tcp_mstamp; /* most recent packet received/sent */ -- cgit From b40671b5ee588c8a61b2d0eacbad32ffc57e9a8f Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Tue, 16 Sep 2025 10:24:32 +0200 Subject: tcp: accecn: AccECN option failure handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AccECN option may fail in various way, handle these: - Attempt to negotiate the use of AccECN on the 1st retransmitted SYN - From the 2nd retransmitted SYN, stop AccECN negotiation - Remove option from SYN/ACK rexmits to handle blackholes - If no option arrives in SYN/ACK, assume Option is not usable - If an option arrives later, re-enabled - If option is zeroed, disable AccECN option processing This patch use existing padding bits in tcp_request_sock and holes in tcp_sock without increasing the size. Signed-off-by: Ilpo Järvinen Signed-off-by: Chia-Yu Chang Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-9-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- include/linux/tcp.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux/tcp.h') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index f637b659b35a..3ca5ed02de6d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -173,6 +173,7 @@ struct tcp_request_sock { u8 syn_ect_snt: 2, syn_ect_rcv: 2, accecn_fail_mode:4; + u8 saw_accecn_opt :2; #ifdef CONFIG_TCP_AO u8 ao_keyid; u8 ao_rcv_next; @@ -407,7 +408,8 @@ struct tcp_sock { syn_fastopen_child:1; /* created TFO passive child socket */ u8 keepalive_probes; /* num of allowed keep alive probes */ - u8 accecn_fail_mode:4; /* AccECN failure handling */ + u8 accecn_fail_mode:4, /* AccECN failure handling */ + saw_accecn_opt:2; /* An AccECN option was seen */ u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ /* RTT measurement */ -- cgit From 1b44d700023e77dd92821e7811db825e75a1a394 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Sep 2025 20:48:52 +0000 Subject: tcp: move tcp->rcv_tstamp to tcp_sock_write_txrx group tcp_ack() writes this field, it belongs to tcp_sock_write_txrx. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250919204856.2977245-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/tcp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/tcp.h') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 3ca5ed02de6d..1e6c2ded22c9 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -238,7 +238,6 @@ struct tcp_sock { /* RX read-mostly hotpath cache lines */ __cacheline_group_begin(tcp_sock_read_rx); u32 copied_seq; /* Head of yet unread data */ - u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ u32 snd_wl1; /* Sequence for window update */ u32 tlp_high_seq; /* snd_nxt at the time of TLP */ u32 rttvar_us; /* smoothed mdev_max */ @@ -246,13 +245,13 @@ struct tcp_sock { u16 advmss; /* Advertised MSS */ u16 urg_data; /* Saved octet of OOB data and control flags */ u32 lost; /* Total data packets lost incl. rexmits */ + u32 snd_ssthresh; /* Slow start size threshold */ struct minmax rtt_min; /* OOO segments go in this rbtree. Socket lock must be held. */ struct rb_root out_of_order_queue; #if defined(CONFIG_TLS_DEVICE) void (*tcp_clean_acked)(struct sock *sk, u32 acked_seq); #endif - u32 snd_ssthresh; /* Slow start size threshold */ u8 recvmsg_inq : 1;/* Indicate # of bytes in queue upon recvmsg */ __cacheline_group_end(tcp_sock_read_rx); @@ -319,6 +318,7 @@ struct tcp_sock { */ u32 app_limited; /* limited until "delivered" reaches this val */ u32 rcv_wnd; /* Current receiver window */ + u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ /* * Options received (usually on last packet, some only on SYN packets). */ -- cgit From 969904dcd77dbb0a773d66cddaa59eccc6415d03 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Sep 2025 20:48:53 +0000 Subject: tcp: move recvmsg_inq to tcp_sock_read_txrx Fill a hole in tcp_sock_read_txrx, instead of possibly wasting a cache line. Note that tcp_recvmsg_locked() is also reading tp->repair, so this removes one cache line miss in tcp recvmsg(). Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250919204856.2977245-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/tcp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/tcp.h') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 1e6c2ded22c9..c1d7fce251d7 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -232,7 +232,8 @@ struct tcp_sock { repair : 1, tcp_usec_ts : 1, /* TSval values in usec */ is_sack_reneg:1, /* in recovery from loss with SACK reneg? */ - is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */ + is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */ + recvmsg_inq : 1;/* Indicate # of bytes in queue upon recvmsg */ __cacheline_group_end(tcp_sock_read_txrx); /* RX read-mostly hotpath cache lines */ @@ -252,7 +253,6 @@ struct tcp_sock { #if defined(CONFIG_TLS_DEVICE) void (*tcp_clean_acked)(struct sock *sk, u32 acked_seq); #endif - u8 recvmsg_inq : 1;/* Indicate # of bytes in queue upon recvmsg */ __cacheline_group_end(tcp_sock_read_rx); /* TX read-write hotpath cache lines */ -- cgit From a105ea47a4e855d24ebf65f1c5fb907162e7b8cf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Sep 2025 20:48:54 +0000 Subject: tcp: move tcp_clean_acked to tcp_sock_read_tx group tp->tcp_clean_acked is fetched in tx path when snd_una is updated. This field thus belongs to tcp_sock_read_tx group. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250919204856.2977245-7-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/tcp.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux/tcp.h') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index c1d7fce251d7..3f282130c863 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -215,6 +215,9 @@ struct tcp_sock { u16 gso_segs; /* Max number of segs per GSO packet */ /* from STCP, retrans queue hinting */ struct sk_buff *retransmit_skb_hint; +#if defined(CONFIG_TLS_DEVICE) + void (*tcp_clean_acked)(struct sock *sk, u32 acked_seq); +#endif __cacheline_group_end(tcp_sock_read_tx); /* TXRX read-mostly hotpath cache lines */ @@ -250,9 +253,6 @@ struct tcp_sock { struct minmax rtt_min; /* OOO segments go in this rbtree. Socket lock must be held. */ struct rb_root out_of_order_queue; -#if defined(CONFIG_TLS_DEVICE) - void (*tcp_clean_acked)(struct sock *sk, u32 acked_seq); -#endif __cacheline_group_end(tcp_sock_read_rx); /* TX read-write hotpath cache lines */ -- cgit From 31c4511bbb0c75c525b6e4f4fe4167f2e9d3b05c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Sep 2025 20:48:55 +0000 Subject: tcp: move mtu_info to remove two 32bit holes This removes 8bytes waste on 64bit builds. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250919204856.2977245-8-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/tcp.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux/tcp.h') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 3f282130c863..20b8c6e21fef 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -448,6 +448,9 @@ struct tcp_sock { * the first SYN. */ u32 undo_marker; /* snd_una upon a new recovery episode. */ int undo_retrans; /* number of undoable retransmissions. */ + u32 mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG + * while socket was owned by user. + */ u64 bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans * Total data bytes retransmitted */ @@ -494,9 +497,6 @@ struct tcp_sock { u32 probe_seq_end; } mtu_probe; u32 plb_rehash; /* PLB-triggered rehash attempts */ - u32 mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG - * while socket was owned by user. - */ #if IS_ENABLED(CONFIG_MPTCP) bool is_mptcp; #endif -- cgit