From 44badc908f2c85711cb18e45e13119c10ad3a05f Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 24 Sep 2024 09:05:45 +0100 Subject: tcp: Fix spelling mistake "emtpy" -> "empty" There is a spelling mistake in a WARN_ONCE message. Fix it. Signed-off-by: Colin Ian King Reviewed-by: Jason Xing Link: https://patch.msgid.link/20240924080545.1324962-1-colin.i.king@gmail.com Signed-off-by: Paolo Abeni --- include/net/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index d1948d357dad..739a9fb83d0c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2442,7 +2442,7 @@ static inline s64 tcp_rto_delta_us(const struct sock *sk) return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp; } else { WARN_ONCE(1, - "rtx queue emtpy: " + "rtx queue empty: " "out:%u sacked:%u lost:%u retrans:%u " "tlp_high_seq:%u sk_state:%u ca_state:%u " "advmss:%u mss_cache:%u pmtu:%u\n", -- cgit From e26a0c5d828b225b88f534e2fcf10bf617f85f23 Mon Sep 17 00:00:00 2001 From: Shradha Gupta Date: Sun, 29 Sep 2024 20:44:35 -0700 Subject: net: mana: Increase the DEF_RX_BUFFERS_PER_QUEUE to 1024 Through some experiments, we found out that increasing the default RX buffers count from 512 to 1024, gives slightly better throughput and significantly reduces the no_wqe_rx errs on the receiver side. Along with these, other parameters like cpu usage, retrans seg etc also show some improvement with 1024 value. Following are some snippets from the experiments ntttcp tests with 512 Rx buffers --------------------------------------- connections| throughput| no_wqe errs| --------------------------------------- 1 | 40.93Gbps | 123,211 | 16 | 180.15Gbps | 190,120 | 128 | 180.20Gbps | 173,508 | 256 | 180.27Gbps | 189,884 | ntttcp tests with 1024 Rx buffers --------------------------------------- connections| throughput| no_wqe errs| --------------------------------------- 1 | 44.22Gbps | 19,864 | 16 | 180.19Gbps | 4,430 | 128 | 180.21Gbps | 2,560 | 256 | 180.29Gbps | 1,529 | So, increasing the default RX buffers per queue count to 1024 Signed-off-by: Shradha Gupta Reviewed-by: Haiyang Zhang Reviewed-by: Pavan Chebbi Link: https://patch.msgid.link/1727667875-29908-1-git-send-email-shradhagupta@linux.microsoft.com Signed-off-by: Paolo Abeni --- include/net/mana/mana.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index f2a5200d8a0f..9b0faa24b758 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -43,7 +43,7 @@ enum TRI_STATE { * size beyond this value gets rejected by __alloc_page() call. */ #define MAX_RX_BUFFERS_PER_QUEUE 8192 -#define DEF_RX_BUFFERS_PER_QUEUE 512 +#define DEF_RX_BUFFERS_PER_QUEUE 1024 #define MIN_RX_BUFFERS_PER_QUEUE 128 /* This max value for TX buffers is derived as the maximum allocatable -- cgit From 7e863e5db6185b1add0df4cb01b31a4ed1c4b738 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 1 Oct 2024 21:28:43 +0200 Subject: ipv4: Convert ip_route_input() to dscp_t. Pass a dscp_t variable to ip_route_input(), instead of a plain u8, to prevent accidental setting of ECN bits in ->flowi4_tos. Callers of ip_route_input() to consider are: * input_action_end_dx4_finish() and input_action_end_dt4() in net/ipv6/seg6_local.c. These functions set the tos parameter to 0, which is already a valid dscp_t value, so they don't need to be adjusted for the new prototype. * icmp_route_lookup(), which already has a dscp_t variable to pass as parameter. We just need to remove the inet_dscp_to_dsfield() conversion. * br_nf_pre_routing_finish(), ip_options_rcv_srr() and ip4ip6_err(), which get the DSCP directly from IPv4 headers. Define a helper to read the .tos field of struct iphdr as dscp_t, so that these function don't have to do the conversion manually. While there, declare *iph as const in br_nf_pre_routing_finish(), declare its local variables in reverse-christmas-tree order and move the "err = ip_route_input()" assignment out of the conditional to avoid checkpatch warning. Signed-off-by: Guillaume Nault Reviewed-by: David Ahern Link: https://patch.msgid.link/e9d40781d64d3d69f4c79ac8a008b8d67a033e8d.1727807926.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- include/net/ip.h | 5 +++++ include/net/route.h | 5 +++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/ip.h b/include/net/ip.h index d92d3bc3ec0e..bab084df1567 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -424,6 +424,11 @@ int ip_decrease_ttl(struct iphdr *iph) return --iph->ttl; } +static inline dscp_t ip4h_dscp(const struct iphdr *ip4h) +{ + return inet_dsfield_to_dscp(ip4h->tos); +} + static inline int ip_mtu_locked(const struct dst_entry *dst) { const struct rtable *rt = dst_rtable(dst); diff --git a/include/net/route.h b/include/net/route.h index 1789f1e6640b..03dd28cf4bc4 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -208,12 +208,13 @@ int ip_route_use_hint(struct sk_buff *skb, __be32 dst, __be32 src, const struct sk_buff *hint); static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, - u8 tos, struct net_device *devin) + dscp_t dscp, struct net_device *devin) { int err; rcu_read_lock(); - err = ip_route_input_noref(skb, dst, src, tos, devin); + err = ip_route_input_noref(skb, dst, src, inet_dscp_to_dsfield(dscp), + devin); if (!err) { skb_dst_force(skb); if (!skb_dst(skb)) -- cgit From 66fb6386d358a04edd5c640e38b4a02b323b89d8 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 1 Oct 2024 21:28:49 +0200 Subject: ipv4: Convert ip_route_input_noref() to dscp_t. Pass a dscp_t variable to ip_route_input_noref(), instead of a plain u8, to prevent accidental setting of ECN bits in ->flowi4_tos. Callers of ip_route_input_noref() to consider are: * arp_process() in net/ipv4/arp.c. This function sets the tos parameter to 0, which is already a valid dscp_t value, so it doesn't need to be adjusted for the new prototype. * ip_route_input(), which already has a dscp_t variable to pass as parameter. We just need to remove the inet_dscp_to_dsfield() conversion. * ipvlan_l3_rcv(), bpf_lwt_input_reroute(), ip_expire(), ip_rcv_finish_core(), xfrm4_rcv_encap_finish() and xfrm4_rcv_encap(), which get the DSCP directly from IPv4 headers and can simply use the ip4h_dscp() helper. While there, declare the IPv4 header pointers as const in ipvlan_l3_rcv() and bpf_lwt_input_reroute(). Also, modify the declaration of ip_route_input_noref() in include/net/route.h so that it matches the prototype of its implementation in net/ipv4/route.c. Signed-off-by: Guillaume Nault Reviewed-by: David Ahern Link: https://patch.msgid.link/a8a747bed452519c4d0cc06af32c7e7795d7b627.1727807926.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- include/net/route.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/route.h b/include/net/route.h index 03dd28cf4bc4..5e4374d66927 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -201,8 +201,8 @@ static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev, struct in_device *in_dev, u32 *itag); -int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src, - u8 tos, struct net_device *devin); +int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev); int ip_route_use_hint(struct sk_buff *skb, __be32 dst, __be32 src, u8 tos, struct net_device *devin, const struct sk_buff *hint); @@ -213,8 +213,7 @@ static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, int err; rcu_read_lock(); - err = ip_route_input_noref(skb, dst, src, inet_dscp_to_dsfield(dscp), - devin); + err = ip_route_input_noref(skb, dst, src, dscp, devin); if (!err) { skb_dst_force(skb); if (!skb_dst(skb)) -- cgit From 4aecca4c76808f3736056d18ff510df80424bc9f Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Tue, 1 Oct 2024 05:57:14 -0700 Subject: net_tstamp: add SCM_TS_OPT_ID to provide OPT_ID in control message SOF_TIMESTAMPING_OPT_ID socket option flag gives a way to correlate TX timestamps and packets sent via socket. Unfortunately, there is no way to reliably predict socket timestamp ID value in case of error returned by sendmsg. For UDP sockets it's impossible because of lockless nature of UDP transmit, several threads may send packets in parallel. In case of RAW sockets MSG_MORE option makes things complicated. More details are in the conversation [1]. This patch adds new control message type to give user-space software an opportunity to control the mapping between packets and values by providing ID with each sendmsg for UDP sockets. The documentation is also added in this patch. [1] https://lore.kernel.org/netdev/CALCETrU0jB+kg0mhV6A8mrHfTE1D1pr1SD_B9Eaa9aDPfgHdtA@mail.gmail.com/ Reviewed-by: Willem de Bruijn Reviewed-by: Jason Xing Signed-off-by: Vadim Fedorenko Link: https://patch.msgid.link/20241001125716.2832769-2-vadfed@meta.com Signed-off-by: Jakub Kicinski --- include/net/inet_sock.h | 4 +++- include/net/sock.h | 7 +++++++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 394c3b66065e..f01dd273bea6 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -174,6 +174,7 @@ struct inet_cork { __s16 tos; char priority; __u16 gso_size; + u32 ts_opt_id; u64 transmit_time; u32 mark; }; @@ -241,7 +242,8 @@ struct inet_sock { struct inet_cork_full cork; }; -#define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */ +#define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */ +#define IPCORK_TS_OPT_ID 2 /* ts_opt_id field is valid, overriding sk_tskey */ enum { INET_FLAGS_PKTINFO = 0, diff --git a/include/net/sock.h b/include/net/sock.h index c58ca8dd561b..ccf28c2b70b1 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -954,6 +954,12 @@ enum sock_flags { }; #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) +/* + * The highest bit of sk_tsflags is reserved for kernel-internal + * SOCKCM_FLAG_TS_OPT_ID. There is a check in core/sock.c to control that + * SOF_TIMESTAMPING* values do not reach this reserved area + */ +#define SOCKCM_FLAG_TS_OPT_ID BIT(31) static inline void sock_copy_flags(struct sock *nsk, const struct sock *osk) { @@ -1796,6 +1802,7 @@ struct sockcm_cookie { u64 transmit_time; u32 mark; u32 tsflags; + u32 ts_opt_id; }; static inline void sockcm_init(struct sockcm_cookie *sockc, -- cgit From 822b5bc6db55f1c3ea51659c423784ac6919ddd4 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Tue, 1 Oct 2024 05:57:15 -0700 Subject: net_tstamp: add SCM_TS_OPT_ID for RAW sockets The last type of sockets which supports SOF_TIMESTAMPING_OPT_ID is RAW sockets. To add new option this patch converts all callers (direct and indirect) of _sock_tx_timestamp to provide sockcm_cookie instead of tsflags. And while here fix __sock_tx_timestamp to receive tsflags as __u32 instead of __u16. Reviewed-by: Willem de Bruijn Reviewed-by: Jason Xing Signed-off-by: Vadim Fedorenko Link: https://patch.msgid.link/20241001125716.2832769-3-vadfed@meta.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index ccf28c2b70b1..e282127092ab 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2660,39 +2660,48 @@ static inline void sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, sock_write_timestamp(sk, 0); } -void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags); +void __sock_tx_timestamp(__u32 tsflags, __u8 *tx_flags); /** * _sock_tx_timestamp - checks whether the outgoing packet is to be time stamped * @sk: socket sending this packet - * @tsflags: timestamping flags to use + * @sockc: pointer to socket cmsg cookie to get timestamping info * @tx_flags: completed with instructions for time stamping * @tskey: filled in with next sk_tskey (not for TCP, which uses seqno) * * Note: callers should take care of initial ``*tx_flags`` value (usually 0) */ -static inline void _sock_tx_timestamp(struct sock *sk, __u16 tsflags, +static inline void _sock_tx_timestamp(struct sock *sk, + const struct sockcm_cookie *sockc, __u8 *tx_flags, __u32 *tskey) { + __u32 tsflags = sockc->tsflags; + if (unlikely(tsflags)) { __sock_tx_timestamp(tsflags, tx_flags); if (tsflags & SOF_TIMESTAMPING_OPT_ID && tskey && - tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) - *tskey = atomic_inc_return(&sk->sk_tskey) - 1; + tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) { + if (tsflags & SOCKCM_FLAG_TS_OPT_ID) + *tskey = sockc->ts_opt_id; + else + *tskey = atomic_inc_return(&sk->sk_tskey) - 1; + } } if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS))) *tx_flags |= SKBTX_WIFI_STATUS; } -static inline void sock_tx_timestamp(struct sock *sk, __u16 tsflags, +static inline void sock_tx_timestamp(struct sock *sk, + const struct sockcm_cookie *sockc, __u8 *tx_flags) { - _sock_tx_timestamp(sk, tsflags, tx_flags, NULL); + _sock_tx_timestamp(sk, sockc, tx_flags, NULL); } -static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags) +static inline void skb_setup_tx_timestamp(struct sk_buff *skb, + const struct sockcm_cookie *sockc) { - _sock_tx_timestamp(skb->sk, tsflags, &skb_shinfo(skb)->tx_flags, + _sock_tx_timestamp(skb->sk, sockc, &skb_shinfo(skb)->tx_flags, &skb_shinfo(skb)->tskey); } -- cgit From 5a9071a760a61b00260334ad576fe60debafaafc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Oct 2024 17:30:40 +0000 Subject: tcp: annotate data-races around icsk->icsk_pending icsk->icsk_pending can be read locklessly already. Following patch in the series will add another lockless read. Add smp_load_acquire() and smp_store_release() annotations because following patch will add a test in tcp_write_timer(), and READ_ONCE()/WRITE_ONCE() alone would possibly lead to races. Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20241002173042.917928-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/inet_connection_sock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index c0deaafebfdc..914d19772704 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -197,7 +197,7 @@ static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what) struct inet_connection_sock *icsk = inet_csk(sk); if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0) { - icsk->icsk_pending = 0; + smp_store_release(&icsk->icsk_pending, 0); #ifdef INET_CSK_CLEAR_TIMERS sk_stop_timer(sk, &icsk->icsk_retransmit_timer); #endif @@ -229,7 +229,7 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 || what == ICSK_TIME_LOSS_PROBE || what == ICSK_TIME_REO_TIMEOUT) { - icsk->icsk_pending = what; + smp_store_release(&icsk->icsk_pending, what); icsk->icsk_timeout = jiffies + when; sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); } else if (what == ICSK_TIME_DACK) { -- cgit From 81df4fa94ee8c0800ed42c47357435602ed105ad Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Oct 2024 17:30:42 +0000 Subject: tcp: add a fast path in tcp_delack_timer() delack timer is not stopped from inet_csk_clear_xmit_timer() because we do not define INET_CSK_CLEAR_TIMERS. This is a conscious choice : inet_csk_clear_xmit_timer() is often called from another cpu. Calling del_timer() would cause false sharing and lock contention. This means that very often, tcp_delack_timer() is called at the timer expiration, while there is no ACK to transmit. This can be detected very early, avoiding the socket spinlock. Notes: - test about tp->compressed_ack is racy, but in the unlikely case there is a race, the dedicated compressed_ack_timer hrtimer would close it. - Even if the fast path is not taken, reading icsk->icsk_ack.pending and tp->compressed_ack before acquiring the socket spinlock reduces acquisition time and chances of contention. Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20241002173042.917928-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/inet_connection_sock.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 914d19772704..3c82fad904d4 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -202,7 +202,7 @@ static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what) sk_stop_timer(sk, &icsk->icsk_retransmit_timer); #endif } else if (what == ICSK_TIME_DACK) { - icsk->icsk_ack.pending = 0; + smp_store_release(&icsk->icsk_ack.pending, 0); icsk->icsk_ack.retry = 0; #ifdef INET_CSK_CLEAR_TIMERS sk_stop_timer(sk, &icsk->icsk_delack_timer); @@ -233,7 +233,8 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, icsk->icsk_timeout = jiffies + when; sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); } else if (what == ICSK_TIME_DACK) { - icsk->icsk_ack.pending |= ICSK_ACK_TIMER; + smp_store_release(&icsk->icsk_ack.pending, + icsk->icsk_ack.pending | ICSK_ACK_TIMER); icsk->icsk_ack.timeout = jiffies + when; sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); } else { -- cgit From 539770616521e5b046ca7612eb79ba11b53edb1d Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 3 Oct 2024 12:52:17 +0100 Subject: net: dsa: remove obsolete phylink dsa_switch operations No driver now uses the DSA switch phylink members, so we can now remove the method pointers, but we need to leave empty shim functions to allow those drivers that do not provide phylink MAC operations structure to continue functioning. Signed-off-by: Russell King (oracle) Reviewed-by: Vladimir Oltean Tested-by: Vladimir Oltean # sja1105, felix, dsa_loop Link: https://patch.msgid.link/E1swKNV-0060oN-1b@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index d7a6c2930277..72ae65e7246a 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -885,21 +885,6 @@ struct dsa_switch_ops { */ void (*phylink_get_caps)(struct dsa_switch *ds, int port, struct phylink_config *config); - struct phylink_pcs *(*phylink_mac_select_pcs)(struct dsa_switch *ds, - int port, - phy_interface_t iface); - void (*phylink_mac_config)(struct dsa_switch *ds, int port, - unsigned int mode, - const struct phylink_link_state *state); - void (*phylink_mac_link_down)(struct dsa_switch *ds, int port, - unsigned int mode, - phy_interface_t interface); - void (*phylink_mac_link_up)(struct dsa_switch *ds, int port, - unsigned int mode, - phy_interface_t interface, - struct phy_device *phydev, - int speed, int duplex, - bool tx_pause, bool rx_pause); void (*phylink_fixed_state)(struct dsa_switch *ds, int port, struct phylink_link_state *state); /* -- cgit From 76aed95319da25d6884dff01d5f0149e4b542f96 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 4 Oct 2024 15:10:29 -0700 Subject: rtnetlink: Add per-netns RTNL. The goal is to break RTNL down into per-netns mutex. This patch adds per-netns mutex and its helper functions, rtnl_net_lock() and rtnl_net_unlock(). rtnl_net_lock() acquires the global RTNL and per-netns RTNL mutex, and rtnl_net_unlock() releases them. We will replace 800+ rtnl_lock() with rtnl_net_lock() and finally removes rtnl_lock() in rtnl_net_lock(). When we need to nest per-netns RTNL mutex, we will use __rtnl_net_lock(), and its locking order is defined by rtnl_net_lock_cmp_fn() as follows: 1. init_net is first 2. netns address ascending order Note that the conversion will be done under CONFIG_DEBUG_NET_SMALL_RTNL with LOCKDEP so that we can carefully add the extra mutex without slowing down RTNL operations during conversion. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/net/net_namespace.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index e67b483cc8bb..873c0f9fdac6 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -188,6 +188,10 @@ struct net { #if IS_ENABLED(CONFIG_SMC) struct netns_smc smc; #endif +#ifdef CONFIG_DEBUG_NET_SMALL_RTNL + /* Move to a better place when the config guard is removed. */ + struct mutex rtnl_mutex; +#endif } __randomize_layout; #include -- cgit From 02f220b5267042d0de649614eec84ded8aeecb4f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 7 Oct 2024 20:26:55 +0200 Subject: wifi: ipw2x00/lib80211: move remaining lib80211 into libipw There's already much code in libipw that used to be shared with more drivers, but now with the prior cleanups, those old Intel ipw2x00 drivers are also the only ones using whatever is now left of lib80211. Move lib80211 entirely into libipw. Link: https://patch.msgid.link/20241007202707.915ef7b9e7c7.Ib9876d2fe3c90f11d6df458b16d0b7d4bf551a8d@changeid Signed-off-by: Johannes Berg --- include/net/lib80211.h | 122 ------------------------------------------------- 1 file changed, 122 deletions(-) delete mode 100644 include/net/lib80211.h (limited to 'include/net') diff --git a/include/net/lib80211.h b/include/net/lib80211.h deleted file mode 100644 index fd0f15d87d80..000000000000 --- a/include/net/lib80211.h +++ /dev/null @@ -1,122 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * lib80211.h -- common bits for IEEE802.11 wireless drivers - * - * Copyright (c) 2008, John W. Linville - * - * Some bits copied from old ieee80211 component, w/ original copyright - * notices below: - * - * Original code based on Host AP (software wireless LAN access point) driver - * for Intersil Prism2/2.5/3. - * - * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen - * - * Copyright (c) 2002-2003, Jouni Malinen - * - * Adaption to a generic IEEE 802.11 stack by James Ketrenos - * - * - * Copyright (c) 2004, Intel Corporation - * - */ - -#ifndef LIB80211_H -#define LIB80211_H - -#include -#include -#include -#include -#include -#include -#include -#include - -#define NUM_WEP_KEYS 4 - -enum { - IEEE80211_CRYPTO_TKIP_COUNTERMEASURES = (1 << 0), -}; - -struct module; - -struct lib80211_crypto_ops { - const char *name; - struct list_head list; - - /* init new crypto context (e.g., allocate private data space, - * select IV, etc.); returns NULL on failure or pointer to allocated - * private data on success */ - void *(*init) (int keyidx); - - /* deinitialize crypto context and free allocated private data */ - void (*deinit) (void *priv); - - /* encrypt/decrypt return < 0 on error or >= 0 on success. The return - * value from decrypt_mpdu is passed as the keyidx value for - * decrypt_msdu. skb must have enough head and tail room for the - * encryption; if not, error will be returned; these functions are - * called for all MPDUs (i.e., fragments). - */ - int (*encrypt_mpdu) (struct sk_buff * skb, int hdr_len, void *priv); - int (*decrypt_mpdu) (struct sk_buff * skb, int hdr_len, void *priv); - - /* These functions are called for full MSDUs, i.e. full frames. - * These can be NULL if full MSDU operations are not needed. */ - int (*encrypt_msdu) (struct sk_buff * skb, int hdr_len, void *priv); - int (*decrypt_msdu) (struct sk_buff * skb, int keyidx, int hdr_len, - void *priv); - - int (*set_key) (void *key, int len, u8 * seq, void *priv); - int (*get_key) (void *key, int len, u8 * seq, void *priv); - - /* procfs handler for printing out key information and possible - * statistics */ - void (*print_stats) (struct seq_file *m, void *priv); - - /* Crypto specific flag get/set for configuration settings */ - unsigned long (*get_flags) (void *priv); - unsigned long (*set_flags) (unsigned long flags, void *priv); - - /* maximum number of bytes added by encryption; encrypt buf is - * allocated with extra_prefix_len bytes, copy of in_buf, and - * extra_postfix_len; encrypt need not use all this space, but - * the result must start at the beginning of the buffer and correct - * length must be returned */ - int extra_mpdu_prefix_len, extra_mpdu_postfix_len; - int extra_msdu_prefix_len, extra_msdu_postfix_len; - - struct module *owner; -}; - -struct lib80211_crypt_data { - struct list_head list; /* delayed deletion list */ - const struct lib80211_crypto_ops *ops; - void *priv; - atomic_t refcnt; -}; - -struct lib80211_crypt_info { - char *name; - /* Most clients will already have a lock, - so just point to that. */ - spinlock_t *lock; - - struct lib80211_crypt_data *crypt[NUM_WEP_KEYS]; - int tx_keyidx; /* default TX key index (crypt[tx_keyidx]) */ - struct list_head crypt_deinit_list; - struct timer_list crypt_deinit_timer; - int crypt_quiesced; -}; - -int lib80211_crypt_info_init(struct lib80211_crypt_info *info, char *name, - spinlock_t *lock); -void lib80211_crypt_info_free(struct lib80211_crypt_info *info); -int lib80211_register_crypto_ops(const struct lib80211_crypto_ops *ops); -int lib80211_unregister_crypto_ops(const struct lib80211_crypto_ops *ops); -const struct lib80211_crypto_ops *lib80211_get_crypto_ops(const char *name); -void lib80211_crypt_delayed_deinit(struct lib80211_crypt_info *info, - struct lib80211_crypt_data **crypt); - -#endif /* LIB80211_H */ -- cgit From 3a1d429ebd43bcfdf3590096ca72cbf593d1598b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 7 Oct 2024 21:02:53 +0200 Subject: wifi: wext/libipw: move spy implementation to libipw There's no driver left using this other than ipw2200, so move the data bookkeeping and code into libipw. Link: https://patch.msgid.link/20241007210254.037d864cda7d.Ib2197cb056ff05746d3521a5fba637062acb7314@changeid Signed-off-by: Johannes Berg --- include/net/iw_handler.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'include/net') diff --git a/include/net/iw_handler.h b/include/net/iw_handler.h index 7af1082ea9a0..a7b502958d27 100644 --- a/include/net/iw_handler.h +++ b/include/net/iw_handler.h @@ -418,8 +418,6 @@ struct iw_spy_data { struct libipw_device; /* The struct */ struct iw_public_data { - /* Driver enhanced spy support */ - struct iw_spy_data * spy_data; /* Legacy structure managed by the ipw2x00-specific IEEE 802.11 layer */ struct libipw_device * libipw; }; @@ -443,22 +441,6 @@ static inline void wireless_nlevent_flush(void) {} /* We may need a function to send a stream of events to user space. * More on that later... */ -/* Standard handler for SIOCSIWSPY */ -int iw_handler_set_spy(struct net_device *dev, struct iw_request_info *info, - union iwreq_data *wrqu, char *extra); -/* Standard handler for SIOCGIWSPY */ -int iw_handler_get_spy(struct net_device *dev, struct iw_request_info *info, - union iwreq_data *wrqu, char *extra); -/* Standard handler for SIOCSIWTHRSPY */ -int iw_handler_set_thrspy(struct net_device *dev, struct iw_request_info *info, - union iwreq_data *wrqu, char *extra); -/* Standard handler for SIOCGIWTHRSPY */ -int iw_handler_get_thrspy(struct net_device *dev, struct iw_request_info *info, - union iwreq_data *wrqu, char *extra); -/* Driver call to update spy records */ -void wireless_spy_update(struct net_device *dev, unsigned char *address, - struct iw_quality *wstats); - /************************* INLINE FUNCTIONS *************************/ /* * Function that are so simple that it's more efficient inlining them -- cgit From 836265d31631e28000fc8917ce697fc687a58724 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 7 Oct 2024 21:35:25 +0200 Subject: wifi: remove iw_public_data from struct net_device Given the previous patches, we no longer need the struct iw_public_data etc., it's only used by the old Intel drivers (and ps3_gelic creates it but then doesn't use it). Remove all of that, including the pointer in struct net_device. Link: https://patch.msgid.link/20241007213525.8b2d52b60531.I6a27aaf30bded9a0977f07f47fba2bd31a3b3330@changeid Signed-off-by: Johannes Berg --- include/net/iw_handler.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'include/net') diff --git a/include/net/iw_handler.h b/include/net/iw_handler.h index a7b502958d27..fc44fcca1d5c 100644 --- a/include/net/iw_handler.h +++ b/include/net/iw_handler.h @@ -404,24 +404,6 @@ struct iw_spy_data { u_char spy_thr_under[IW_MAX_SPY]; }; -/* --------------------- DEVICE WIRELESS DATA --------------------- */ -/* - * This is all the wireless data specific to a device instance that - * is managed by the core of Wireless Extensions or the 802.11 layer. - * We only keep pointer to those structures, so that a driver is free - * to share them between instances. - * This structure should be initialised before registering the device. - * Access to this data follow the same rules as any other struct net_device - * data (i.e. valid as long as struct net_device exist, same locking rules). - */ -/* Forward declaration */ -struct libipw_device; -/* The struct */ -struct iw_public_data { - /* Legacy structure managed by the ipw2x00-specific IEEE 802.11 layer */ - struct libipw_device * libipw; -}; - /**************************** PROTOTYPES ****************************/ /* * Functions part of the Wireless Extensions (defined in net/wireless/wext-core.c). -- cgit From aee809aaa2d13bf560fe38d28c4969605e6d9d0e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 7 Oct 2024 21:47:16 +0200 Subject: wifi: cfg80211: unexport wireless_nlevent_flush() This no longer needs to be exported, so don't export it. Link: https://patch.msgid.link/20241007214715.3dd736dc3ac0.I1388536e99c37f28a007dd753c473ad21513d9a9@changeid Signed-off-by: Johannes Berg --- include/net/iw_handler.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/net') diff --git a/include/net/iw_handler.h b/include/net/iw_handler.h index fc44fcca1d5c..804587b7592b 100644 --- a/include/net/iw_handler.h +++ b/include/net/iw_handler.h @@ -413,12 +413,6 @@ struct iw_spy_data { /* Send a single event to user space */ void wireless_send_event(struct net_device *dev, unsigned int cmd, union iwreq_data *wrqu, const char *extra); -#ifdef CONFIG_WEXT_CORE -/* flush all previous wext events - if work is done from netdev notifiers */ -void wireless_nlevent_flush(void); -#else -static inline void wireless_nlevent_flush(void) {} -#endif /* We may need a function to send a stream of events to user space. * More on that later... */ -- cgit From ff919efb5fe8256fba132b5f2c58df3c38bdd0b3 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 7 Oct 2024 22:00:03 +0200 Subject: wireless: wext: shorten struct iw_ioctl_description There's no need for "future" extensions in an internal struct, and we don't need a u32 for flags, use just a u8. Also remove the unused IW_DESCR_FLAG_WAIT flag. Link: https://patch.msgid.link/20241007220003.309bd52fa763.I9a1229fa7f2be53d4f50e63671ed441d0968bb41@changeid Signed-off-by: Johannes Berg --- include/net/iw_handler.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/iw_handler.h b/include/net/iw_handler.h index 804587b7592b..c9b46b996197 100644 --- a/include/net/iw_handler.h +++ b/include/net/iw_handler.h @@ -279,8 +279,6 @@ #define IW_DESCR_FLAG_RESTRICT 0x0004 /* GET : request is ROOT only */ /* SET : Omit payload from generated iwevent */ #define IW_DESCR_FLAG_NOMAX 0x0008 /* GET : no limit on request size */ -/* Driver level flags */ -#define IW_DESCR_FLAG_WAIT 0x0100 /* Wait for driver event */ /****************************** TYPES ******************************/ @@ -373,11 +371,10 @@ struct iw_handler_def { */ struct iw_ioctl_description { __u8 header_type; /* NULL, iw_point or other */ - __u8 token_type; /* Future */ + __u8 flags; /* Special handling of the request */ __u16 token_size; /* Granularity of payload */ __u16 min_tokens; /* Min acceptable token number */ __u16 max_tokens; /* Max acceptable token number */ - __u32 flags; /* Special handling of the request */ }; /* Need to think of short header translation table. Later. */ -- cgit From da5e06dee58ad153a4933fd40fc53d571bfef373 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Sun, 6 Oct 2024 07:26:09 +0900 Subject: net-timestamp: namespacify the sysctl_tstamp_allow_data Let it be tuned in per netns by admins. Signed-off-by: Jason Xing Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20241005222609.94980-1-kerneljasonxing@gmail.com Signed-off-by: Jakub Kicinski --- include/net/netns/core.h | 1 + include/net/sock.h | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/netns/core.h b/include/net/netns/core.h index 78214f1b43a2..9b36f0ff0c20 100644 --- a/include/net/netns/core.h +++ b/include/net/netns/core.h @@ -15,6 +15,7 @@ struct netns_core { int sysctl_somaxconn; int sysctl_optmem_max; u8 sysctl_txrehash; + u8 sysctl_tstamp_allow_data; #ifdef CONFIG_PROC_FS struct prot_inuse __percpu *prot_inuse; diff --git a/include/net/sock.h b/include/net/sock.h index e282127092ab..b32f1424ecc5 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2824,8 +2824,6 @@ void sk_get_meminfo(const struct sock *sk, u32 *meminfo); extern __u32 sysctl_wmem_max; extern __u32 sysctl_rmem_max; -extern int sysctl_tstamp_allow_data; - extern __u32 sysctl_wmem_default; extern __u32 sysctl_rmem_default; -- cgit From 3fe3dbaf26723c473d42a58b636a2500586b821e Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 7 Oct 2024 01:44:56 +0100 Subject: caif: Remove unused cfsrvl_getphyid cfsrvl_getphyid() has been unused since 2011's commit f36214408470 ("caif: Use RCU and lists in cfcnfg.c for managing caif link layers") Remove it. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241007004456.149899-1-linux@treblig.org Signed-off-by: Jakub Kicinski --- include/net/caif/cfsrvl.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/caif/cfsrvl.h b/include/net/caif/cfsrvl.h index 5ee7b322e18b..a000dc45f966 100644 --- a/include/net/caif/cfsrvl.h +++ b/include/net/caif/cfsrvl.h @@ -40,7 +40,6 @@ void cfsrvl_init(struct cfsrvl *service, struct dev_info *dev_info, bool supports_flowctrl); bool cfsrvl_ready(struct cfsrvl *service, int *err); -u8 cfsrvl_getphyid(struct cflayer *layer); static inline void cfsrvl_get(struct cflayer *layr) { -- cgit From db03488897a70367aeafe82d07a78943d2a6068e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 9 Oct 2024 08:33:05 +0200 Subject: Revert "wifi: cfg80211: unexport wireless_nlevent_flush()" Revert this, I neglected to take into account the fact that cfg80211 itself can be a module, but wext is always builtin. Fixes: aee809aaa2d1 ("wifi: cfg80211: unexport wireless_nlevent_flush()") Signed-off-by: Johannes Berg --- include/net/iw_handler.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/iw_handler.h b/include/net/iw_handler.h index c9b46b996197..b80e474cb0aa 100644 --- a/include/net/iw_handler.h +++ b/include/net/iw_handler.h @@ -410,6 +410,12 @@ struct iw_spy_data { /* Send a single event to user space */ void wireless_send_event(struct net_device *dev, unsigned int cmd, union iwreq_data *wrqu, const char *extra); +#ifdef CONFIG_WEXT_CORE +/* flush all previous wext events - if work is done from netdev notifiers */ +void wireless_nlevent_flush(void); +#else +static inline void wireless_nlevent_flush(void) {} +#endif /* We may need a function to send a stream of events to user space. * More on that later... */ -- cgit From 6607c17c6c5e029da03a90085db22daf518232bf Mon Sep 17 00:00:00 2001 From: Shradha Gupta Date: Tue, 8 Oct 2024 00:06:15 -0700 Subject: net: mana: Enable debugfs files for MANA device Implement debugfs in MANA driver to be able to view RX,TX,EQ queue specific attributes and dump their gdma queues. These dumps can be used by other userspace utilities to improve debuggability and troubleshooting Following files are added in debugfs: /sys/kernel/debug/mana/ |-------------- 1 |--------------- EQs | |------- eq0 | | |---head | | |---tail | | |---eq_dump | |------- eq1 | . | . | |--------------- adapter-MTU |--------------- vport0 |------- RX-0 | |---cq_budget | |---cq_dump | |---cq_head | |---cq_tail | |---rq_head | |---rq_nbuf | |---rq_tail | |---rxq_dump |------- RX-1 . . |------- TX-0 | |---cq_budget | |---cq_dump | |---cq_head | |---cq_tail | |---sq_head | |---sq_pend_skb_qlen | |---sq_tail | |---txq_dump |------- TX-1 . . Signed-off-by: Shradha Gupta Reviewed-by: Haiyang Zhang Signed-off-by: David S. Miller --- include/net/mana/gdma.h | 6 +++++- include/net/mana/mana.h | 8 ++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index de47fa533b15..90f56656b572 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -267,7 +267,8 @@ struct gdma_event { struct gdma_queue; struct mana_eq { - struct gdma_queue *eq; + struct gdma_queue *eq; + struct dentry *mana_eq_debugfs; }; typedef void gdma_eq_callback(void *context, struct gdma_queue *q, @@ -365,6 +366,7 @@ struct gdma_irq_context { struct gdma_context { struct device *dev; + struct dentry *mana_pci_debugfs; /* Per-vPort max number of queues */ unsigned int max_num_queues; @@ -878,5 +880,7 @@ int mana_gd_send_request(struct gdma_context *gc, u32 req_len, const void *req, u32 resp_len, void *resp); int mana_gd_destroy_dma_region(struct gdma_context *gc, u64 dma_region_handle); +void mana_register_debugfs(void); +void mana_unregister_debugfs(void); #endif /* _GDMA_H */ diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 9b0faa24b758..0d00b24eacaf 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -350,6 +350,7 @@ struct mana_rxq { int xdp_rc; /* XDP redirect return code */ struct page_pool *page_pool; + struct dentry *mana_rx_debugfs; /* MUST BE THE LAST MEMBER: * Each receive buffer has an associated mana_recv_buf_oob. @@ -363,6 +364,8 @@ struct mana_tx_qp { struct mana_cq tx_cq; mana_handle_t tx_object; + + struct dentry *mana_tx_debugfs; }; struct mana_ethtool_stats { @@ -407,6 +410,7 @@ struct mana_context { u16 num_ports; struct mana_eq *eqs; + struct dentry *mana_eqs_debugfs; struct net_device *ports[MAX_PORTS_IN_MANA_DEV]; }; @@ -468,6 +472,9 @@ struct mana_port_context { bool port_st_save; /* Saved port state */ struct mana_ethtool_stats eth_stats; + + /* Debugfs */ + struct dentry *mana_port_debugfs; }; netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev); @@ -494,6 +501,7 @@ int mana_pre_alloc_rxbufs(struct mana_port_context *apc, int mtu, int num_queues void mana_pre_dealloc_rxbufs(struct mana_port_context *apc); extern const struct ethtool_ops mana_ethtool_ops; +extern struct dentry *mana_debugfs_root; /* A CQ can be created not associated with any EQ */ #define GDMA_CQ_NO_EQ 0xffff -- cgit From 2b78d30620d7f8a9f9ce312ad21200ec7a554bd9 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 7 Oct 2024 20:24:29 +0200 Subject: ipv4: Convert ip_route_use_hint() to dscp_t. Pass a dscp_t variable to ip_route_use_hint(), instead of a plain u8, to prevent accidental setting of ECN bits in ->flowi4_tos. Only ip_rcv_finish_core() actually calls ip_route_use_hint(). Use the ip4h_dscp() helper to get the DSCP from the IPv4 header. While there, modify the declaration of ip_route_use_hint() in include/net/route.h so that it matches the prototype of its implementation in net/ipv4/route.c. Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/c40994fdf804db7a363d04fdee01bf48dddda676.1728302212.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- include/net/route.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/route.h b/include/net/route.h index 5e4374d66927..c219c0fecdcf 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -203,8 +203,8 @@ int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, struct in_device *in_dev, u32 *itag); int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev); -int ip_route_use_hint(struct sk_buff *skb, __be32 dst, __be32 src, - u8 tos, struct net_device *devin, +int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev, const struct sk_buff *hint); static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, -- cgit From d32976408744a589f04b5c939f8f01f7167e5167 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 7 Oct 2024 20:24:54 +0200 Subject: ipv4: Convert ip_mc_validate_source() to dscp_t. Pass a dscp_t variable to ip_mc_validate_source(), instead of a plain u8, to prevent accidental setting of ECN bits in ->flowi4_tos. Callers of ip_mc_validate_source() to consider are: * ip_route_input_mc() which already has a dscp_t variable to pass as parameter. We just need to remove the inet_dscp_to_dsfield() conversion. * udp_v4_early_demux() which gets the DSCP directly from the IPv4 header and can simply use the ip4h_dscp() helper. Also, stop including net/inet_dscp.h in udp.c as we don't use any of its declarations anymore. Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/c91b2cca04718b7ee6cf5b9c1d5b40507d65a8d4.1728302212.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- include/net/route.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/route.h b/include/net/route.h index c219c0fecdcf..586e59f7ed8a 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -198,8 +198,9 @@ static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4 fl4->fl4_gre_key = gre_key; return ip_route_output_key(net, fl4); } + int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev, + dscp_t dscp, struct net_device *dev, struct in_device *in_dev, u32 *itag); int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev); -- cgit From d36236ab52754ef6bd083be945e9c2e93f466022 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 7 Oct 2024 20:25:02 +0200 Subject: ipv4: Convert fib_validate_source() to dscp_t. Pass a dscp_t variable to fib_validate_source(), instead of a plain u8, to prevent accidental setting of ECN bits in ->flowi4_tos. All callers of fib_validate_source() already have a dscp_t variable to pass as parameter. We just need to remove the inet_dscp_to_dsfield() conversions. Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/08612a4519bc5a3578bb493fbaad82437ebb73dc.1728302212.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- include/net/ip_fib.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 967e4dc555fa..06130933542d 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -449,8 +449,9 @@ int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla, __be32 fib_compute_spec_dst(struct sk_buff *skb); bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev); int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, - u8 tos, int oif, struct net_device *dev, + dscp_t dscp, int oif, struct net_device *dev, struct in_device *idev, u32 *itag); + #ifdef CONFIG_IP_ROUTE_CLASSID static inline int fib_num_tclassid_users(struct net *net) { -- cgit From 87173021f1583ee37f4801fcde354729da8db3dc Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 8 Oct 2024 10:29:03 -0700 Subject: ipv4: Link IPv4 address to per-netns hash table. As a prep for per-netns RTNL conversion, we want to namespacify the IPv4 address hash table and the GC work. Let's allocate the per-netns IPv4 address hash table to net->ipv4.inet_addr_lst and link IPv4 addresses into it. The actual users will be converted later. Note that the IPv6 address hash table is already namespacified. Reviewed-by: Eric Dumazet Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241008172906.1326-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/netns/ipv4.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 276f622f3516..29eba2eaaa26 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -270,5 +270,6 @@ struct netns_ipv4 { atomic_t rt_genid; siphash_key_t ip_id_key; + struct hlist_head *inet_addr_lst; }; #endif -- cgit From 1675f385213edc14ed849e079d6866b48e552252 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 8 Oct 2024 10:29:05 -0700 Subject: ipv4: Namespacify IPv4 address GC. Each IPv4 address could have a lifetime, which is useful for DHCP, and GC is periodically executed as check_lifetime_work. check_lifetime() does the actual GC under RTNL. 1. Acquire RTNL 2. Iterate inet_addr_lst 3. Remove IPv4 address if expired 4. Release RTNL Namespacifying the GC is required for per-netns RTNL, but using the per-netns hash table will shorten the time on the hash bucket iteration under RTNL. Let's add per-netns GC work and use the per-netns hash table. Reviewed-by: Eric Dumazet Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241008172906.1326-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/netns/ipv4.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 29eba2eaaa26..66a4cffc44ee 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -271,5 +271,6 @@ struct netns_ipv4 { atomic_t rt_genid; siphash_key_t ip_id_key; struct hlist_head *inet_addr_lst; + struct delayed_work addr_chk_work; }; #endif -- cgit From 9e542ff8b79ae1871074d3a7dca7de9e7374eeda Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 8 Oct 2024 09:32:04 -0700 Subject: net: Remove likely from l3mdev_master_ifindex_by_index The likely() annotation in l3mdev_master_ifindex_by_index() has been found to be incorrect 100% of the time in real-world workloads (e.g., web servers). Annotated branches shows the following in these servers: correct incorrect % Function File Line 0 169053813 100 l3mdev_master_ifindex_by_index l3mdev.h 81 This is happening because l3mdev_master_ifindex_by_index() is called from __inet_check_established(), which calls l3mdev_master_ifindex_by_index() passing the socked bounded interface. l3mdev_master_ifindex_by_index(net, sk->sk_bound_dev_if); Since most sockets are not going to be bound to a network device, the likely() is giving the wrong assumption. Remove the likely() annotation to ensure more accurate branch prediction. Signed-off-by: Breno Leitao Reviewed-by: David Ahern Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241008163205.3939629-1-leitao@debian.org Signed-off-by: Paolo Abeni --- include/net/l3mdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index 031c661aa14d..2d6141f28b53 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -78,7 +78,7 @@ static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex) struct net_device *dev; int rc = 0; - if (likely(ifindex)) { + if (ifindex) { rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); -- cgit From 13d68a16430312fc21990f48326366eb73891202 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 9 Oct 2024 10:09:47 +0200 Subject: genetlink: extend info user-storage to match NL cb ctx This allows a more uniform implementation of non-dump and dump operations, and will be used later in the series to avoid some per-operation allocation. Additionally rename the NL_ASSERT_DUMP_CTX_FITS macro, to fit a more extended usage. Suggested-by: Jakub Kicinski Reviewed-by: Jakub Kicinski Reviewed-by: Jiri Pirko Signed-off-by: Paolo Abeni Link: https://patch.msgid.link/1130cc2896626b84587a2a5f96a5c6829638f4da.1728460186.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/net/genetlink.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 9ab49bfeae78..9d3726e8f90e 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -124,7 +124,8 @@ struct genl_family { * @genlhdr: generic netlink message header * @attrs: netlink attributes * @_net: network namespace - * @user_ptr: user pointers + * @ctx: storage space for the use by the family + * @user_ptr: user pointers (deprecated, use ctx instead) * @extack: extended ACK report struct */ struct genl_info { @@ -135,7 +136,10 @@ struct genl_info { struct genlmsghdr * genlhdr; struct nlattr ** attrs; possible_net_t _net; - void * user_ptr[2]; + union { + u8 ctx[NETLINK_CTX_SIZE]; + void * user_ptr[2]; + }; struct netlink_ext_ack *extack; }; -- cgit From 4b623f9f0f59652ea71fcb27d60b4c3b65126dbb Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 9 Oct 2024 10:09:49 +0200 Subject: net-shapers: implement NL get operation Introduce the basic infrastructure to implement the net-shaper core functionality. Each network devices carries a net-shaper cache, the NL get() operation fetches the data from such cache. The cache is initially empty, will be fill by the set()/group() operation implemented later and is destroyed at device cleanup time. The net_shaper_fill_handle(), net_shaper_ctx_init(), and net_shaper_generic_pre() implementations handle generic index type attributes, despite the current caller always pass a constant value to avoid more noise in later patches using them with different attributes. Reviewed-by: Jakub Kicinski Reviewed-by: Jiri Pirko Signed-off-by: Paolo Abeni Link: https://patch.msgid.link/ddd10fd645a9367803ad02fca4a5664ea5ace170.1728460186.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/net/net_shaper.h | 120 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100644 include/net/net_shaper.h (limited to 'include/net') diff --git a/include/net/net_shaper.h b/include/net/net_shaper.h new file mode 100644 index 000000000000..5c3f49b52fe9 --- /dev/null +++ b/include/net/net_shaper.h @@ -0,0 +1,120 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _NET_SHAPER_H_ +#define _NET_SHAPER_H_ + +#include + +#include + +struct net_device; +struct devlink; +struct netlink_ext_ack; + +enum net_shaper_binding_type { + NET_SHAPER_BINDING_TYPE_NETDEV, + /* NET_SHAPER_BINDING_TYPE_DEVLINK_PORT */ +}; + +struct net_shaper_binding { + enum net_shaper_binding_type type; + union { + struct net_device *netdev; + struct devlink *devlink; + }; +}; + +struct net_shaper_handle { + enum net_shaper_scope scope; + u32 id; +}; + +/** + * struct net_shaper - represents a shaping node on the NIC H/W + * zeroed field are considered not set. + * @parent: Unique identifier for the shaper parent, usually implied + * @handle: Unique identifier for this shaper + * @metric: Specify if the rate limits refers to PPS or BPS + * @bw_min: Minimum guaranteed rate for this shaper + * @bw_max: Maximum peak rate allowed for this shaper + * @burst: Maximum burst for the peek rate of this shaper + * @priority: Scheduling priority for this shaper + * @weight: Scheduling weight for this shaper + */ +struct net_shaper { + struct net_shaper_handle parent; + struct net_shaper_handle handle; + enum net_shaper_metric metric; + u64 bw_min; + u64 bw_max; + u64 burst; + u32 priority; + u32 weight; + + /* private: */ + u32 leaves; /* accounted only for NODE scope */ + struct rcu_head rcu; +}; + +/** + * struct net_shaper_ops - Operations on device H/W shapers + * + * The operations applies to either net_device and devlink objects. + * The initial shaping configuration at device initialization is empty: + * does not constraint the rate in any way. + * The network core keeps track of the applied user-configuration in + * the net_device or devlink structure. + * The operations are serialized via a per device lock. + * + * Device not supporting any kind of nesting should not provide the + * group operation. + * + * Each shaper is uniquely identified within the device with a 'handle' + * comprising the shaper scope and a scope-specific id. + */ +struct net_shaper_ops { + /** + * @group: create the specified shapers scheduling group + * + * Nest the @leaves shapers identified under the * @node shaper. + * All the shapers belong to the device specified by @binding. + * The @leaves arrays size is specified by @leaves_count. + * Create either the @leaves and the @node shaper; or if they already + * exists, links them together in the desired way. + * @leaves scope must be NET_SHAPER_SCOPE_QUEUE. + */ + int (*group)(struct net_shaper_binding *binding, int leaves_count, + const struct net_shaper *leaves, + const struct net_shaper *node, + struct netlink_ext_ack *extack); + + /** + * @set: Updates the specified shaper + * + * Updates or creates the @shaper on the device specified by @binding. + */ + int (*set)(struct net_shaper_binding *binding, + const struct net_shaper *shaper, + struct netlink_ext_ack *extack); + + /** + * @delete: Removes the specified shaper + * + * Removes the shaper configuration as identified by the given @handle + * on the device specified by @binding, restoring the default behavior. + */ + int (*delete)(struct net_shaper_binding *binding, + const struct net_shaper_handle *handle, + struct netlink_ext_ack *extack); + + /** + * @capabilities: get the shaper features supported by the device + * + * Fills the bitmask @cap with the supported capabilities for the + * specified @scope and device specified by @binding. + */ + void (*capabilities)(struct net_shaper_binding *binding, + enum net_shaper_scope scope, unsigned long *cap); +}; + +#endif -- cgit From d677aebd663ddc287f2b2bda098474694a0ca875 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2024 03:41:00 +0000 Subject: tcp: move sysctl_tcp_l3mdev_accept to netns_ipv4_read_rx sysctl_tcp_l3mdev_accept is read from TCP receive fast path from tcp_v6_early_demux(), __inet6_lookup_established, inet_request_bound_dev_if(). Move it to netns_ipv4_read_rx. Remove the '#ifdef CONFIG_NET_L3_MASTER_DEV' that was guarding its definition. Note this adds a hole of three bytes that could be filled later. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Cc: Wei Wang Cc: Coco Li Link: https://patch.msgid.link/20241010034100.320832-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/netns/ipv4.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 66a4cffc44ee..3b1de80b5c25 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -76,6 +76,8 @@ struct netns_ipv4 { __cacheline_group_begin(netns_ipv4_read_rx); u8 sysctl_ip_early_demux; u8 sysctl_tcp_early_demux; + u8 sysctl_tcp_l3mdev_accept; + /* 3 bytes hole, try to pack */ int sysctl_tcp_reordering; int sysctl_tcp_rmem[3]; __cacheline_group_end(netns_ipv4_read_rx); @@ -151,9 +153,6 @@ struct netns_ipv4 { u8 sysctl_fwmark_reflect; u8 sysctl_tcp_fwmark_accept; -#ifdef CONFIG_NET_L3_MASTER_DEV - u8 sysctl_tcp_l3mdev_accept; -#endif u8 sysctl_tcp_mtu_probing; int sysctl_tcp_mtu_probe_floor; int sysctl_tcp_base_mss; -- cgit From a716ff52bebfe806fbcf5227cee4c7bda4e6724b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2024 18:44:01 +0000 Subject: fib: rules: use READ_ONCE()/WRITE_ONCE() on ops->fib_rules_seq Using RTNL to protect ops->fib_rules_seq reads seems a big hammer. Writes are protected by RTNL. We can use READ_ONCE() on readers. Constify 'struct net' argument of fib_rules_seq_read() and lookup_rules_ops(). Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: David Ahern Link: https://patch.msgid.link/20241009184405.3752829-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/fib_rules.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index d17855c52ef9..04383d90a1e3 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -176,7 +176,7 @@ int fib_default_rule_add(struct fib_rules_ops *, u32 pref, u32 table); bool fib_rule_matchall(const struct fib_rule *rule); int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, struct netlink_ext_ack *extack); -unsigned int fib_rules_seq_read(struct net *net, int family); +unsigned int fib_rules_seq_read(const struct net *net, int family); int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack); -- cgit From 16207384d29287a19f81436e1953b41946aa8258 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2024 18:44:02 +0000 Subject: ipv4: use READ_ONCE()/WRITE_ONCE() on net->ipv4.fib_seq Using RTNL to protect ops->fib_rules_seq reads seems a big hammer. Writes are protected by RTNL. We can use READ_ONCE() when reading it. Constify 'struct net' argument of fib4_rules_seq_read() Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: David Ahern Link: https://patch.msgid.link/20241009184405.3752829-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/ip_fib.h | 4 ++-- include/net/netns/ipv4.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 06130933542d..b6e44f4eaa4c 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -347,7 +347,7 @@ static inline int fib4_rules_dump(struct net *net, struct notifier_block *nb, return 0; } -static inline unsigned int fib4_rules_seq_read(struct net *net) +static inline unsigned int fib4_rules_seq_read(const struct net *net) { return 0; } @@ -411,7 +411,7 @@ static inline bool fib4_has_custom_rules(const struct net *net) bool fib4_rule_default(const struct fib_rule *rule); int fib4_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); -unsigned int fib4_rules_seq_read(struct net *net); +unsigned int fib4_rules_seq_read(const struct net *net); static inline bool fib4_rules_early_flow_dissect(struct net *net, struct sk_buff *skb, diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 3b1de80b5c25..3c014170e001 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -262,7 +262,7 @@ struct netns_ipv4 { #endif struct fib_notifier_ops *notifier_ops; - unsigned int fib_seq; /* protected by rtnl_mutex */ + unsigned int fib_seq; /* writes protected by rtnl_mutex */ struct fib_notifier_ops *ipmr_notifier_ops; unsigned int ipmr_seq; /* protected by rtnl_mutex */ -- cgit From e60ea45447768c48309b944596a8a34f6bae50e2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2024 18:44:03 +0000 Subject: ipv6: use READ_ONCE()/WRITE_ONCE() on fib6_table->fib_seq Using RTNL to protect ops->fib_rules_seq reads seems a big hammer. Writes are protected by RTNL. We can use READ_ONCE() when reading it. Constify 'struct net' argument of fib6_tables_seq_read() and fib6_rules_seq_read(). Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: David Ahern Link: https://patch.msgid.link/20241009184405.3752829-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/ip6_fib.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 6cb867ce4878..7c87873ae211 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -394,7 +394,7 @@ struct fib6_table { struct fib6_node tb6_root; struct inet_peer_base tb6_peers; unsigned int flags; - unsigned int fib_seq; + unsigned int fib_seq; /* writes protected by rtnl_mutex */ struct hlist_head tb6_gc_hlist; /* GC candidates */ #define RT6_TABLE_HAS_DFLT_ROUTER BIT(0) }; @@ -563,7 +563,7 @@ int call_fib6_notifiers(struct net *net, enum fib_event_type event_type, int __net_init fib6_notifier_init(struct net *net); void __net_exit fib6_notifier_exit(struct net *net); -unsigned int fib6_tables_seq_read(struct net *net); +unsigned int fib6_tables_seq_read(const struct net *net); int fib6_tables_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); @@ -632,7 +632,7 @@ void fib6_rules_cleanup(void); bool fib6_rule_default(const struct fib_rule *rule); int fib6_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); -unsigned int fib6_rules_seq_read(struct net *net); +unsigned int fib6_rules_seq_read(const struct net *net); static inline bool fib6_rules_early_flow_dissect(struct net *net, struct sk_buff *skb, @@ -676,7 +676,7 @@ static inline int fib6_rules_dump(struct net *net, struct notifier_block *nb, { return 0; } -static inline unsigned int fib6_rules_seq_read(struct net *net) +static inline unsigned int fib6_rules_seq_read(const struct net *net) { return 0; } -- cgit From 2698acd6ea4770809af7e65bb8b3250e0a3a807e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2024 18:44:05 +0000 Subject: net: do not acquire rtnl in fib_seq_sum() After we made sure no fib_seq_read() handlers needs RTNL anymore, we can remove RTNL from fib_seq_sum(). Note that after RTNL was dropped, fib_seq_sum() result was possibly outdated anyway. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: David Ahern Link: https://patch.msgid.link/20241009184405.3752829-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/fib_notifier.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/fib_notifier.h b/include/net/fib_notifier.h index 6d59221ff05a..48aad6128fea 100644 --- a/include/net/fib_notifier.h +++ b/include/net/fib_notifier.h @@ -28,7 +28,7 @@ enum fib_event_type { struct fib_notifier_ops { int family; struct list_head list; - unsigned int (*fib_seq_read)(struct net *net); + unsigned int (*fib_seq_read)(const struct net *net); int (*fib_dump)(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); struct module *owner; -- cgit From 7f20dbd7de7b9b2804e6bf54b0c22f2bc447cd64 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 9 Oct 2024 10:28:20 +0800 Subject: net: tunnel: add pskb_inet_may_pull_reason() helper Introduce the function pskb_inet_may_pull_reason() and make pskb_inet_may_pull a simple inline call to it. The drop reasons of it just come from pskb_may_pull_reason(). Signed-off-by: Menglong Dong Reviewed-by: Simon Horman Reviewed-by: Kalesh AP Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 6194fbb564c6..7fc2f7bf837a 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -439,7 +439,8 @@ int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *op, int ip_tunnel_encap_setup(struct ip_tunnel *t, struct ip_tunnel_encap *ipencap); -static inline bool pskb_inet_may_pull(struct sk_buff *skb) +static inline enum skb_drop_reason +pskb_inet_may_pull_reason(struct sk_buff *skb) { int nhlen; @@ -456,7 +457,12 @@ static inline bool pskb_inet_may_pull(struct sk_buff *skb) nhlen = 0; } - return pskb_network_may_pull(skb, nhlen); + return pskb_network_may_pull_reason(skb, nhlen); +} + +static inline bool pskb_inet_may_pull(struct sk_buff *skb) +{ + return pskb_inet_may_pull_reason(skb) == SKB_NOT_DROPPED_YET; } /* Variant of pskb_inet_may_pull(). -- cgit From 9990ddf47d4168088e2246c3d418bf526e40830d Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 9 Oct 2024 10:28:21 +0800 Subject: net: tunnel: make skb_vlan_inet_prepare() return drop reasons Make skb_vlan_inet_prepare return the skb drop reasons, which is just what pskb_may_pull_reason() returns. Meanwhile, adjust all the call of it. Signed-off-by: Menglong Dong Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 7fc2f7bf837a..4e4f9e24c9c1 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -467,11 +467,12 @@ static inline bool pskb_inet_may_pull(struct sk_buff *skb) /* Variant of pskb_inet_may_pull(). */ -static inline bool skb_vlan_inet_prepare(struct sk_buff *skb, - bool inner_proto_inherit) +static inline enum skb_drop_reason +skb_vlan_inet_prepare(struct sk_buff *skb, bool inner_proto_inherit) { int nhlen = 0, maclen = inner_proto_inherit ? 0 : ETH_HLEN; __be16 type = skb->protocol; + enum skb_drop_reason reason; /* Essentially this is skb_protocol(skb, true) * And we get MAC len. @@ -492,11 +493,13 @@ static inline bool skb_vlan_inet_prepare(struct sk_buff *skb, /* For ETH_P_IPV6/ETH_P_IP we make sure to pull * a base network header in skb->head. */ - if (!pskb_may_pull(skb, maclen + nhlen)) - return false; + reason = pskb_may_pull_reason(skb, maclen + nhlen); + if (reason) + return reason; skb_set_network_header(skb, maclen); - return true; + + return SKB_NOT_DROPPED_YET; } static inline int ip_encap_hlen(struct ip_tunnel_encap *e) -- cgit From 4c06d9daf8e6215447ca8a2ddd59fa09862c9bae Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 9 Oct 2024 10:28:22 +0800 Subject: net: vxlan: add skb drop reasons to vxlan_rcv() Introduce skb drop reasons to the function vxlan_rcv(). Following new drop reasons are added: SKB_DROP_REASON_VXLAN_INVALID_HDR SKB_DROP_REASON_VXLAN_VNI_NOT_FOUND SKB_DROP_REASON_IP_TUNNEL_ECN Signed-off-by: Menglong Dong Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/dropreason-core.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/net') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 4748680e8c88..98259d2b3e92 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -92,6 +92,9 @@ FN(PACKET_SOCK_ERROR) \ FN(TC_CHAIN_NOTFOUND) \ FN(TC_RECLASSIFY_LOOP) \ + FN(VXLAN_INVALID_HDR) \ + FN(VXLAN_VNI_NOT_FOUND) \ + FN(IP_TUNNEL_ECN) \ FNe(MAX) /** @@ -418,6 +421,19 @@ enum skb_drop_reason { * iterations. */ SKB_DROP_REASON_TC_RECLASSIFY_LOOP, + /** + * @SKB_DROP_REASON_VXLAN_INVALID_HDR: VXLAN header is invalid. E.g.: + * 1) reserved fields are not zero + * 2) "I" flag is not set + */ + SKB_DROP_REASON_VXLAN_INVALID_HDR, + /** @SKB_DROP_REASON_VXLAN_VNI_NOT_FOUND: no VXLAN device found for VNI */ + SKB_DROP_REASON_VXLAN_VNI_NOT_FOUND, + /** + * @SKB_DROP_REASON_IP_TUNNEL_ECN: skb is dropped according to + * RFC 6040 4.2, see __INET_ECN_decapsulate() for detail. + */ + SKB_DROP_REASON_IP_TUNNEL_ECN, /** * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which * shouldn't be used as a real 'reason' - only for tracing code gen -- cgit From 289fd4e75219a96f77c5d679166035cd5118d139 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 9 Oct 2024 10:28:24 +0800 Subject: net: vxlan: make vxlan_snoop() return drop reasons Change the return type of vxlan_snoop() from bool to enum skb_drop_reason. In this commit, two drop reasons are introduced: SKB_DROP_REASON_MAC_INVALID_SOURCE SKB_DROP_REASON_VXLAN_ENTRY_EXISTS Signed-off-by: Menglong Dong Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/dropreason-core.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/net') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 98259d2b3e92..1cb8d7c953be 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -94,6 +94,8 @@ FN(TC_RECLASSIFY_LOOP) \ FN(VXLAN_INVALID_HDR) \ FN(VXLAN_VNI_NOT_FOUND) \ + FN(MAC_INVALID_SOURCE) \ + FN(VXLAN_ENTRY_EXISTS) \ FN(IP_TUNNEL_ECN) \ FNe(MAX) @@ -429,6 +431,13 @@ enum skb_drop_reason { SKB_DROP_REASON_VXLAN_INVALID_HDR, /** @SKB_DROP_REASON_VXLAN_VNI_NOT_FOUND: no VXLAN device found for VNI */ SKB_DROP_REASON_VXLAN_VNI_NOT_FOUND, + /** @SKB_DROP_REASON_MAC_INVALID_SOURCE: source mac is invalid */ + SKB_DROP_REASON_MAC_INVALID_SOURCE, + /** + * @SKB_DROP_REASON_VXLAN_ENTRY_EXISTS: trying to migrate a static + * entry or an entry pointing to a nexthop. + */ + SKB_DROP_REASON_VXLAN_ENTRY_EXISTS, /** * @SKB_DROP_REASON_IP_TUNNEL_ECN: skb is dropped according to * RFC 6040 4.2, see __INET_ECN_decapsulate() for detail. -- cgit From d209706f562ee4fa81bdf24cf6b679c3222aa06c Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 9 Oct 2024 10:28:25 +0800 Subject: net: vxlan: make vxlan_set_mac() return drop reasons Change the return type of vxlan_set_mac() from bool to enum skb_drop_reason. In this commit, the drop reason "SKB_DROP_REASON_LOCAL_MAC" is introduced for the case that the source mac of the packet is a local mac. Signed-off-by: Menglong Dong Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/dropreason-core.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 1cb8d7c953be..fbf92d442c1b 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -97,6 +97,7 @@ FN(MAC_INVALID_SOURCE) \ FN(VXLAN_ENTRY_EXISTS) \ FN(IP_TUNNEL_ECN) \ + FN(LOCAL_MAC) \ FNe(MAX) /** @@ -443,6 +444,11 @@ enum skb_drop_reason { * RFC 6040 4.2, see __INET_ECN_decapsulate() for detail. */ SKB_DROP_REASON_IP_TUNNEL_ECN, + /** + * @SKB_DROP_REASON_LOCAL_MAC: the source MAC address is equal to + * the MAC address of the local netdev. + */ + SKB_DROP_REASON_LOCAL_MAC, /** * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which * shouldn't be used as a real 'reason' - only for tracing code gen -- cgit From b71a576e452b800efeac49ecca116d954601d911 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 9 Oct 2024 10:28:26 +0800 Subject: net: vxlan: use kfree_skb_reason() in vxlan_xmit() Replace kfree_skb() with kfree_skb_reason() in vxlan_xmit(). Following new skb drop reasons are introduced for vxlan: /* no remote found for xmit */ SKB_DROP_REASON_VXLAN_NO_REMOTE /* packet without necessary metadata reached a device which is * in "external" mode */ SKB_DROP_REASON_TUNNEL_TXINFO Signed-off-by: Menglong Dong Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/dropreason-core.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/net') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index fbf92d442c1b..d59bb96c5a02 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -96,7 +96,9 @@ FN(VXLAN_VNI_NOT_FOUND) \ FN(MAC_INVALID_SOURCE) \ FN(VXLAN_ENTRY_EXISTS) \ + FN(VXLAN_NO_REMOTE) \ FN(IP_TUNNEL_ECN) \ + FN(TUNNEL_TXINFO) \ FN(LOCAL_MAC) \ FNe(MAX) @@ -439,11 +441,18 @@ enum skb_drop_reason { * entry or an entry pointing to a nexthop. */ SKB_DROP_REASON_VXLAN_ENTRY_EXISTS, + /** @SKB_DROP_REASON_VXLAN_NO_REMOTE: no remote found for xmit */ + SKB_DROP_REASON_VXLAN_NO_REMOTE, /** * @SKB_DROP_REASON_IP_TUNNEL_ECN: skb is dropped according to * RFC 6040 4.2, see __INET_ECN_decapsulate() for detail. */ SKB_DROP_REASON_IP_TUNNEL_ECN, + /** + * @SKB_DROP_REASON_TUNNEL_TXINFO: packet without necessary metadata + * reached a device which is in "external" mode. + */ + SKB_DROP_REASON_TUNNEL_TXINFO, /** * @SKB_DROP_REASON_LOCAL_MAC: the source MAC address is equal to * the MAC address of the local netdev. -- cgit From b692bf9a7543af7ad11a59d182a3757578f0ba53 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Mon, 7 Oct 2024 14:24:53 +0200 Subject: xsk: Get rid of xdp_buff_xsk::xskb_list_node Let's bring xdp_buff_xsk back to occupying 2 cachelines by removing xskb_list_node - for the purpose of gathering the xskb frags free_list_node can be used, head of the list (xsk_buff_pool::xskb_list) stays as-is, just reuse the node ptr. It is safe to do as a single xdp_buff_xsk can never reside in two pool's lists simultaneously. Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20241007122458.282590-2-maciej.fijalkowski@intel.com --- include/net/xdp_sock_drv.h | 14 +++++++------- include/net/xsk_buff_pool.h | 1 - 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 0a5dca2b2b3f..360bc1244c6a 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -126,8 +126,8 @@ static inline void xsk_buff_free(struct xdp_buff *xdp) if (likely(!xdp_buff_has_frags(xdp))) goto out; - list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) { - list_del(&pos->xskb_list_node); + list_for_each_entry_safe(pos, tmp, xskb_list, free_list_node) { + list_del(&pos->free_list_node); xp_free(pos); } @@ -140,7 +140,7 @@ static inline void xsk_buff_add_frag(struct xdp_buff *xdp) { struct xdp_buff_xsk *frag = container_of(xdp, struct xdp_buff_xsk, xdp); - list_add_tail(&frag->xskb_list_node, &frag->pool->xskb_list); + list_add_tail(&frag->free_list_node, &frag->pool->xskb_list); } static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) @@ -150,9 +150,9 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) struct xdp_buff_xsk *frag; frag = list_first_entry_or_null(&xskb->pool->xskb_list, - struct xdp_buff_xsk, xskb_list_node); + struct xdp_buff_xsk, free_list_node); if (frag) { - list_del(&frag->xskb_list_node); + list_del(&frag->free_list_node); ret = &frag->xdp; } @@ -163,7 +163,7 @@ static inline void xsk_buff_del_tail(struct xdp_buff *tail) { struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp); - list_del(&xskb->xskb_list_node); + list_del(&xskb->free_list_node); } static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) @@ -172,7 +172,7 @@ static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) struct xdp_buff_xsk *frag; frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk, - xskb_list_node); + free_list_node); return &frag->xdp; } diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index bacb33f1e3e5..aa7f1d0b3a5e 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -30,7 +30,6 @@ struct xdp_buff_xsk { struct xsk_buff_pool *pool; u64 orig_addr; struct list_head free_list_node; - struct list_head xskb_list_node; }; #define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb)) -- cgit From 30ec2c1baaead43903ad63ff8e3083949059083c Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Mon, 7 Oct 2024 14:24:54 +0200 Subject: xsk: s/free_list_node/list_node/ Now that free_list_node's purpose is two-folded, make it just a 'list_node'. Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20241007122458.282590-3-maciej.fijalkowski@intel.com --- include/net/xdp_sock_drv.h | 14 +++++++------- include/net/xsk_buff_pool.h | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 360bc1244c6a..40085afd9160 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -126,8 +126,8 @@ static inline void xsk_buff_free(struct xdp_buff *xdp) if (likely(!xdp_buff_has_frags(xdp))) goto out; - list_for_each_entry_safe(pos, tmp, xskb_list, free_list_node) { - list_del(&pos->free_list_node); + list_for_each_entry_safe(pos, tmp, xskb_list, list_node) { + list_del(&pos->list_node); xp_free(pos); } @@ -140,7 +140,7 @@ static inline void xsk_buff_add_frag(struct xdp_buff *xdp) { struct xdp_buff_xsk *frag = container_of(xdp, struct xdp_buff_xsk, xdp); - list_add_tail(&frag->free_list_node, &frag->pool->xskb_list); + list_add_tail(&frag->list_node, &frag->pool->xskb_list); } static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) @@ -150,9 +150,9 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) struct xdp_buff_xsk *frag; frag = list_first_entry_or_null(&xskb->pool->xskb_list, - struct xdp_buff_xsk, free_list_node); + struct xdp_buff_xsk, list_node); if (frag) { - list_del(&frag->free_list_node); + list_del(&frag->list_node); ret = &frag->xdp; } @@ -163,7 +163,7 @@ static inline void xsk_buff_del_tail(struct xdp_buff *tail) { struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp); - list_del(&xskb->free_list_node); + list_del(&xskb->list_node); } static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) @@ -172,7 +172,7 @@ static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) struct xdp_buff_xsk *frag; frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk, - free_list_node); + list_node); return &frag->xdp; } diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index aa7f1d0b3a5e..af8b6f776f86 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -29,7 +29,7 @@ struct xdp_buff_xsk { dma_addr_t frame_dma; struct xsk_buff_pool *pool; u64 orig_addr; - struct list_head free_list_node; + struct list_head list_node; }; #define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb)) -- cgit From bea14124bacbe5c9366381e62635eed28ac892ae Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Mon, 7 Oct 2024 14:24:55 +0200 Subject: xsk: Get rid of xdp_buff_xsk::orig_addr Continue the process of dieting xdp_buff_xsk by removing orig_addr member. It can be calculated from xdp->data_hard_start where it was previously used, so it is not anything that has to be carried around in struct used widely in hot path. This has been used for initializing xdp_buff_xsk::frame_dma during pool setup and as a shortcut in xp_get_handle() to retrieve address provided to xsk Rx queue. Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20241007122458.282590-4-maciej.fijalkowski@intel.com --- include/net/xsk_buff_pool.h | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index af8b6f776f86..468a23b1b4c5 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -28,7 +28,6 @@ struct xdp_buff_xsk { dma_addr_t dma; dma_addr_t frame_dma; struct xsk_buff_pool *pool; - u64 orig_addr; struct list_head list_node; }; @@ -119,7 +118,6 @@ void xp_free(struct xdp_buff_xsk *xskb); static inline void xp_init_xskb_addr(struct xdp_buff_xsk *xskb, struct xsk_buff_pool *pool, u64 addr) { - xskb->orig_addr = addr; xskb->xdp.data_hard_start = pool->addrs + addr + pool->headroom; } @@ -221,14 +219,19 @@ static inline void xp_release(struct xdp_buff_xsk *xskb) xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb; } -static inline u64 xp_get_handle(struct xdp_buff_xsk *xskb) +static inline u64 xp_get_handle(struct xdp_buff_xsk *xskb, + struct xsk_buff_pool *pool) { - u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start; + u64 orig_addr = xskb->xdp.data - pool->addrs; + u64 offset; - offset += xskb->pool->headroom; - if (!xskb->pool->unaligned) - return xskb->orig_addr + offset; - return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); + if (!pool->unaligned) + return orig_addr; + + offset = xskb->xdp.data - xskb->xdp.data_hard_start; + orig_addr -= offset; + offset += pool->headroom; + return orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); } static inline bool xp_tx_metadata_enabled(const struct xsk_buff_pool *pool) -- cgit From 6e126872191df946a6fe01b79273119d32d96711 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Mon, 7 Oct 2024 14:24:56 +0200 Subject: xsk: Carry a copy of xdp_zc_max_segs within xsk_buff_pool This so we avoid dereferencing struct net_device within hot path. Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20241007122458.282590-5-maciej.fijalkowski@intel.com --- include/net/xsk_buff_pool.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 468a23b1b4c5..bb03cee716b3 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -76,6 +76,7 @@ struct xsk_buff_pool { u32 chunk_size; u32 chunk_shift; u32 frame_len; + u32 xdp_zc_max_segs; u8 tx_metadata_len; /* inherited from umem */ u8 cached_need_wakeup; bool uses_need_wakeup; -- cgit From 78e2baf3d96edd21c6f26d8afc0e68d02ec2c51c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2024 17:48:13 +0000 Subject: net: add TIME_WAIT logic to sk_to_full_sk() TCP will soon attach TIME_WAIT sockets to some ACK and RST. Make sure sk_to_full_sk() detects this and does not return a non full socket. v3: also changed sk_const_to_full_sk() Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: Martin KaFai Lau Reviewed-by: Brian Vazquez Link: https://patch.msgid.link/20241010174817.1543642-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/inet_sock.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index f01dd273bea6..56d8bc5593d3 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -321,8 +321,10 @@ static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet) static inline struct sock *sk_to_full_sk(struct sock *sk) { #ifdef CONFIG_INET - if (sk && sk->sk_state == TCP_NEW_SYN_RECV) + if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV) sk = inet_reqsk(sk)->rsk_listener; + if (sk && READ_ONCE(sk->sk_state) == TCP_TIME_WAIT) + sk = NULL; #endif return sk; } @@ -331,8 +333,10 @@ static inline struct sock *sk_to_full_sk(struct sock *sk) static inline const struct sock *sk_const_to_full_sk(const struct sock *sk) { #ifdef CONFIG_INET - if (sk && sk->sk_state == TCP_NEW_SYN_RECV) + if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV) sk = ((const struct request_sock *)sk)->rsk_listener; + if (sk && READ_ONCE(sk->sk_state) == TCP_TIME_WAIT) + sk = NULL; #endif return sk; } -- cgit From bc43a3c83cad46a27d6e3bf869acdd926bbe79ad Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2024 17:48:14 +0000 Subject: net_sched: sch_fq: prepare for TIME_WAIT sockets TCP stack is not attaching skb to TIME_WAIT sockets yet, but we would like to allow this in the future. Add sk_listener_or_tw() helper to detect the three states that FQ needs to take care. Like NEW_SYN_RECV, TIME_WAIT are not full sockets and do not contain sk->sk_pacing_status, sk->sk_pacing_rate. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: Brian Vazquez Link: https://patch.msgid.link/20241010174817.1543642-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 6da420ab1ee1..2f200d91ef11 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2802,6 +2802,16 @@ static inline bool sk_listener(const struct sock *sk) return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV); } +/* This helper checks if a socket is a LISTEN or NEW_SYN_RECV or TIME_WAIT + * TCP SYNACK messages can be attached to LISTEN or NEW_SYN_RECV (depending on SYNCOOKIE) + * TCP RST and ACK can be attached to TIME_WAIT. + */ +static inline bool sk_listener_or_tw(const struct sock *sk) +{ + return (1 << READ_ONCE(sk->sk_state)) & + (TCPF_LISTEN | TCPF_NEW_SYN_RECV | TCPF_TIME_WAIT); +} + void sock_enable_timestamp(struct sock *sk, enum sock_flags flag); int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level, int type); -- cgit From 5ced52fa8f0dc23adb067f7b8a009a5ee051efb7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2024 17:48:15 +0000 Subject: net: add skb_set_owner_edemux() helper This can be used to attach a socket to an skb, taking a reference on sk->sk_refcnt. This helper might be a NOP if sk->sk_refcnt is zero. Use it from tcp_make_synack(). Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: Brian Vazquez Link: https://patch.msgid.link/20241010174817.1543642-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 2f200d91ef11..bf7fa3db10ae 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1760,6 +1760,15 @@ void sock_efree(struct sk_buff *skb); #ifdef CONFIG_INET void sock_edemux(struct sk_buff *skb); void sock_pfree(struct sk_buff *skb); + +static inline void skb_set_owner_edemux(struct sk_buff *skb, struct sock *sk) +{ + skb_orphan(skb); + if (refcount_inc_not_zero(&sk->sk_refcnt)) { + skb->sk = sk; + skb->destructor = sock_edemux; + } +} #else #define sock_edemux sock_efree #endif -- cgit From 79636038d37e7bd4d078238f2a3f002cab4423bc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2024 17:48:17 +0000 Subject: ipv4: tcp: give socket pointer to control skbs ip_send_unicast_reply() send orphaned 'control packets'. These are RST packets and also ACK packets sent from TIME_WAIT. Some eBPF programs would prefer to have a meaningful skb->sk pointer as much as possible. This means that TCP can now attach TIME_WAIT sockets to outgoing skbs. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: Brian Vazquez Link: https://patch.msgid.link/20241010174817.1543642-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/ip.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ip.h b/include/net/ip.h index bab084df1567..4be0a6a603b2 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -288,7 +288,8 @@ static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg) return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0; } -void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, +void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk, + struct sk_buff *skb, const struct ip_options *sopt, __be32 daddr, __be32 saddr, const struct ip_reply_arg *arg, -- cgit From 95b3120a485f77a9bb8060bf3398311e3dcb6c65 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 14 Oct 2024 16:52:16 -0700 Subject: neighbour: Remove NEIGH_DN_TABLE. Since commit 1202cdd66531 ("Remove DECnet support from kernel"), NEIGH_DN_TABLE is no longer used. MPLS has implicit dependency on it in nla_put_via(), but nla_get_via() does not support DECnet. Let's remove NEIGH_DN_TABLE. Now, neigh_tables[] has only 2 elements and no extra iteration for DECnet in many places. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241014235216.10785-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index a44f262a7384..3887ed9e5026 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -239,7 +239,6 @@ struct neigh_table { enum { NEIGH_ARP_TABLE = 0, NEIGH_ND_TABLE = 1, - NEIGH_DN_TABLE = 2, NEIGH_NR_TABLES, NEIGH_LINK_TABLE = NEIGH_NR_TABLES /* Pseudo table for neigh_xmit */ }; -- cgit From e1c6c383123ab1caadbfe39b3362ce0cc09dd766 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 14 Oct 2024 13:18:28 -0700 Subject: rtnetlink: Remove rtnl_register() and rtnl_register_module(). No one uses rtnl_register() and rtnl_register_module(). Let's remove them. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241014201828.91221-12-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/rtnetlink.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 2d3eb7cb4dff..bb49c5708ce7 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -29,6 +29,16 @@ static inline enum rtnl_kinds rtnl_msgtype_kind(int msgtype) return msgtype & RTNL_KIND_MASK; } +/** + * struct rtnl_msg_handler - rtnetlink message type and handlers + * + * @owner: NULL for built-in, THIS_MODULE for module + * @protocol: Protocol family or PF_UNSPEC + * @msgtype: rtnetlink message type + * @doit: Function pointer called for each request message + * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message + * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions + */ struct rtnl_msg_handler { struct module *owner; int protocol; @@ -38,11 +48,6 @@ struct rtnl_msg_handler { int flags; }; -void rtnl_register(int protocol, int msgtype, - rtnl_doit_func, rtnl_dumpit_func, unsigned int flags); -int rtnl_register_module(struct module *owner, int protocol, int msgtype, - rtnl_doit_func, rtnl_dumpit_func, unsigned int flags); -int rtnl_unregister(int protocol, int msgtype); void rtnl_unregister_all(int protocol); int __rtnl_register_many(const struct rtnl_msg_handler *handlers, int n); -- cgit From 43c7ce69d28e185f62fe2b8be2c681c5cac0bc6b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Oct 2024 11:53:50 -0700 Subject: rtnetlink: Protect struct rtnl_link_ops with SRCU. Once RTNL is replaced with rtnl_net_lock(), we need a mechanism to guarantee that rtnl_link_ops is alive during inflight RTM_NEWLINK even when its module is being unloaded. Let's use SRCU to protect ops. rtnl_link_ops_get() now iterates link_ops under RCU and returns SRCU-protected ops pointer. The caller must call rtnl_link_ops_put() to release the pointer after the use. Also, __rtnl_link_unregister() unlinks the ops first and calls synchronize_srcu() to wait for inflight RTM_NEWLINK requests to complete. Note that link_ops needs to be protected by its dedicated lock when RTNL is removed. Suggested-by: Eric Dumazet Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- include/net/rtnetlink.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index bb49c5708ce7..1a6aa5ca74f3 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -3,6 +3,7 @@ #define __NET_RTNETLINK_H #include +#include #include typedef int (*rtnl_doit_func)(struct sk_buff *, struct nlmsghdr *, @@ -69,7 +70,8 @@ static inline int rtnl_msg_family(const struct nlmsghdr *nlh) /** * struct rtnl_link_ops - rtnetlink link operations * - * @list: Used internally + * @list: Used internally, protected by RTNL and SRCU + * @srcu: Used internally * @kind: Identifier * @netns_refund: Physical device, move to init_net on netns exit * @maxtype: Highest device specific netlink attribute number @@ -100,6 +102,7 @@ static inline int rtnl_msg_family(const struct nlmsghdr *nlh) */ struct rtnl_link_ops { struct list_head list; + struct srcu_struct srcu; const char *kind; -- cgit From 26eebdc4b005ccd4cf63f4fef4c9c0adf9bfa380 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Oct 2024 11:53:56 -0700 Subject: rtnetlink: Return int from rtnl_af_register(). The next patch will add init_srcu_struct() in rtnl_af_register(), then we need to handle its error. Let's add the error handling in advance to make the following patch cleaner. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Matt Johnston Signed-off-by: Paolo Abeni --- include/net/rtnetlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 1a6aa5ca74f3..969138ae2f4b 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -204,7 +204,7 @@ struct rtnl_af_ops { size_t (*get_stats_af_size)(const struct net_device *dev); }; -void rtnl_af_register(struct rtnl_af_ops *ops); +int rtnl_af_register(struct rtnl_af_ops *ops); void rtnl_af_unregister(struct rtnl_af_ops *ops); struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]); -- cgit From 6ab0f866948323724e95cf14d9e47fd77703c192 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Oct 2024 11:53:57 -0700 Subject: rtnetlink: Protect struct rtnl_af_ops with SRCU. Once RTNL is replaced with rtnl_net_lock(), we need a mechanism to guarantee that rtnl_af_ops is alive during inflight RTM_SETLINK even when its module is being unloaded. Let's use SRCU to protect ops. rtnl_af_lookup() now iterates rtnl_af_ops under RCU and returns SRCU-protected ops pointer. The caller must call rtnl_af_put() to release the pointer after the use. Also, rtnl_af_unregister() unlinks the ops first and calls synchronize_srcu() to wait for inflight RTM_SETLINK requests to complete. Note that rtnl_af_ops needs to be protected by its dedicated lock when RTNL is removed. Note also that BUG_ON() in do_setlink() is changed to the normal error handling as a different af_ops might be found after validate_linkmsg(). Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- include/net/rtnetlink.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 969138ae2f4b..e0d9a8eae6b6 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -172,7 +172,8 @@ void rtnl_link_unregister(struct rtnl_link_ops *ops); /** * struct rtnl_af_ops - rtnetlink address family operations * - * @list: Used internally + * @list: Used internally, protected by RTNL and SRCU + * @srcu: Used internally * @family: Address family * @fill_link_af: Function to fill IFLA_AF_SPEC with address family * specific netlink attributes. @@ -185,6 +186,8 @@ void rtnl_link_unregister(struct rtnl_link_ops *ops); */ struct rtnl_af_ops { struct list_head list; + struct srcu_struct srcu; + int family; int (*fill_link_af)(struct sk_buff *skb, -- cgit From 83c289e81e88d01e55d6d56531502ed7b4886a05 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 17 Oct 2024 19:19:34 +0300 Subject: net/sched: act_api: unexport tcf_action_dump_1() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This isn't used outside act_api.c, but is called by tcf_dump_walker() prior to its definition. So move it upwards and make it static. Simultaneously, reorder the variable declarations so that they follow the networking "reverse Christmas tree" coding style. Signed-off-by: Vladimir Oltean Reviewed-by: Simon Horman Reviewed-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20241017161934.3599046-1-vladimir.oltean@nxp.com Signed-off-by: Paolo Abeni --- include/net/act_api.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 77ee0c657e2c..404df8557f6a 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -219,7 +219,6 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind, int ref, bool terse); int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int); -int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int); static inline void tcf_action_update_bstats(struct tc_action *a, struct sk_buff *skb) -- cgit From c972c1c41d9b20fb38b54e77dcee763e27e715a9 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Oct 2024 18:41:00 -0700 Subject: ipv4: Switch inet_addr_hash() to less predictable hash. Recently, commit 4a0ec2aa0704 ("ipv6: switch inet6_addr_hash() to less predictable hash") and commit 4daf4dc275f1 ("ipv6: switch inet6_acaddr_hash() to less predictable hash") hardened IPv6 address hash functions. inet_addr_hash() is also highly predictable, and a malicious use could abuse a specific bucket. Let's follow the change on IPv4 by using jhash_1word(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241018014100.93776-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- include/net/ip.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/ip.h b/include/net/ip.h index 4be0a6a603b2..0e548c1f2a0e 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -690,6 +690,11 @@ static inline unsigned int ipv4_addr_hash(__be32 ip) return (__force unsigned int) ip; } +static inline u32 __ipv4_addr_hash(const __be32 ip, const u32 initval) +{ + return jhash_1word((__force u32)ip, initval); +} + static inline u32 ipv4_portaddr_hash(const struct net *net, __be32 saddr, unsigned int port) -- cgit From 074a8b54dacc1920f54381f3661ecee6786b0c21 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 7 Oct 2024 15:00:45 +0300 Subject: wifi: mac80211: Add support to indicate that a new interface is to be added Add support to indicate to the driver that an interface is about to be added so that the driver could prepare its resources early if it needs so. Signed-off-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20241007144851.e0e8563e1c30.Ifccc96a46a347eb15752caefc9f4eff31f75ed47@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 333e0fae6796..0b8df8ec5a3b 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -4444,6 +4444,12 @@ struct ieee80211_prep_tx_info { * if the requested TID-To-Link mapping can be accepted or not. * If it's not accepted the driver may suggest a preferred mapping and * modify @ttlm parameter with the suggested TID-to-Link mapping. + * @prep_add_interface: prepare for interface addition. This can be used by + * drivers to prepare for the addition of a new interface, e.g., allocate + * the needed resources etc. This callback doesn't guarantee that an + * interface with the specified type would be added, and thus drivers that + * implement this callback need to handle such cases. The type is the full + * &enum nl80211_iftype. */ struct ieee80211_ops { void (*tx)(struct ieee80211_hw *hw, @@ -4828,6 +4834,8 @@ struct ieee80211_ops { enum ieee80211_neg_ttlm_res (*can_neg_ttlm)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_neg_ttlm *ttlm); + void (*prep_add_interface)(struct ieee80211_hw *hw, + enum nl80211_iftype type); }; /** -- cgit From 62262dd00c319195f2e14022903b7ebbb53119bc Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 7 Oct 2024 15:00:46 +0300 Subject: wifi: cfg80211: disallow SMPS in AP mode In practice, userspace hasn't been able to set this for many years, and mac80211 has already rejected it (which is now no longer needed), so reject SMPS mode (other than "OFF" to be a bit more compatible) in AP mode. Also remove the parameter from the AP settings struct. Signed-off-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20241007144851.fe1fc46484cf.I8676fb52b818a4bedeb9c25b901e1396277ffc0b@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 69ec1eb41a09..c8ce5c2e14f4 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1460,7 +1460,6 @@ struct cfg80211_unsol_bcast_probe_resp { * @crypto: crypto settings * @privacy: the BSS uses privacy * @auth_type: Authentication type (algorithm) - * @smps_mode: SMPS mode * @inactivity_timeout: time in seconds to determine station's inactivity. * @p2p_ctwindow: P2P CT Window * @p2p_opp_ps: P2P opportunistic PS @@ -1498,7 +1497,6 @@ struct cfg80211_ap_settings { struct cfg80211_crypto_settings crypto; bool privacy; enum nl80211_auth_type auth_type; - enum nl80211_smps_mode smps_mode; int inactivity_timeout; u8 p2p_ctwindow; bool p2p_opp_ps; -- cgit From 9c5f2c7eeb585834f8dadb552b4fd811dd2dee6f Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Mon, 7 Oct 2024 15:00:47 +0300 Subject: wifi: mac80211: rename IEEE80211_CHANCTX_CHANGE_MIN_WIDTH The name is misleading, this actually indicates that ieee80211_chanctx_conf::min_def was updated. Rename it to IEEE80211_CHANCTX_CHANGE_MIN_DEF. Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20241007144851.726b5f12ae0c.I3bd9e594c9d2735183ec049a4c7224bd0a9599c9@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 0b8df8ec5a3b..c42ad5a0c303 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -213,7 +213,7 @@ struct ieee80211_low_level_stats { * @IEEE80211_CHANCTX_CHANGE_RADAR: radar detection flag changed * @IEEE80211_CHANCTX_CHANGE_CHANNEL: switched to another operating channel, * this is used only with channel switching with CSA - * @IEEE80211_CHANCTX_CHANGE_MIN_WIDTH: The min required channel width changed + * @IEEE80211_CHANCTX_CHANGE_MIN_DEF: The min chandef changed * @IEEE80211_CHANCTX_CHANGE_AP: The AP channel definition changed, so (wider * bandwidth) OFDMA settings need to be changed * @IEEE80211_CHANCTX_CHANGE_PUNCTURING: The punctured channel(s) bitmap @@ -224,7 +224,7 @@ enum ieee80211_chanctx_change { IEEE80211_CHANCTX_CHANGE_RX_CHAINS = BIT(1), IEEE80211_CHANCTX_CHANGE_RADAR = BIT(2), IEEE80211_CHANCTX_CHANGE_CHANNEL = BIT(3), - IEEE80211_CHANCTX_CHANGE_MIN_WIDTH = BIT(4), + IEEE80211_CHANCTX_CHANGE_MIN_DEF = BIT(4), IEEE80211_CHANCTX_CHANGE_AP = BIT(5), IEEE80211_CHANCTX_CHANGE_PUNCTURING = BIT(6), }; -- cgit From e21dd758cf4c51d508e6665b653c5836103d1027 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Mon, 7 Oct 2024 15:00:48 +0300 Subject: wifi: mac80211: make bss_param_ch_cnt available for the low level driver Drivers may need to track this. Make it available for them, and maintain the value when beacons are received. When link X receives a beacon, iterate the RNR elements and update all the links with their respective data. Track the link id that updated the data so that each link can know whether the update came from its own beacon or from another link. In case, the update came from the link's own beacon, always update the updater link id. The purpose is to let the low level driver know if a link is losing its beacons. If link X is losing its beacons, it can still track the bss_param_ch_cnt and know where the update came from. Signed-off-by: Emmanuel Grumbach Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20241007144851.e2d8d1a722ad.I04b883daba2cd48e5730659eb62ca1614c899cbb@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index c42ad5a0c303..20562676e9cd 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -740,6 +740,19 @@ struct ieee80211_parsed_tpe { * @eht_80mhz_full_bw_ul_mumimo: in AP-mode, does this BSS support the * reception of an EHT TB PPDU on an RU that spans the entire PPDU * bandwidth + * @bss_param_ch_cnt: in BSS-mode, the BSS params change count. This + * information is the latest known value. It can come from this link's + * beacon or from a beacon sent by another link. + * @bss_param_ch_cnt_link_id: in BSS-mode, the link_id to which the beacon + * that updated &bss_param_ch_cnt belongs. E.g. if link 1 doesn't hear + * its beacons, and link 2 sent a beacon with an RNR element that updated + * link 1's BSS params change count, then, link 1's + * bss_param_ch_cnt_link_id will be 2. That means that link 1 knows that + * link 2 was the link that updated its bss_param_ch_cnt value. + * In case link 1 hears its beacon again, bss_param_ch_cnt_link_id will + * be updated to 1, even if bss_param_ch_cnt didn't change. This allows + * the link to know that it heard the latest value from its own beacon + * (as opposed to hearing its value from another link's beacon). */ struct ieee80211_bss_conf { struct ieee80211_vif *vif; @@ -834,6 +847,8 @@ struct ieee80211_bss_conf { bool eht_su_beamformee; bool eht_mu_beamformer; bool eht_80mhz_full_bw_ul_mumimo; + u8 bss_param_ch_cnt; + u8 bss_param_ch_cnt_link_id; }; /** -- cgit From 88b67e91e2928c3311f3658d1270b40708b0de00 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 7 Oct 2024 15:00:54 +0300 Subject: wifi: mac80211: call rate_control_rate_update() for link STA In order to update the right link information, call the update rate_control_rate_update() with the right link_sta, and then pass that through to the driver's sta_rc_update() method. The software rate control still doesn't support it, but that'll be skipped by not having a rate control ref. Since it now operates on a link sta, rename the driver method. Signed-off-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20241007144851.5851b6b5fd41.Ibdf50d96afa4b761dd9b9dfd54a1147e77a75329@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 20562676e9cd..7a6edc04e212 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -4090,8 +4090,8 @@ struct ieee80211_prep_tx_info { * in @sta_state. * The callback can sleep. * - * @sta_rc_update: Notifies the driver of changes to the bitrates that can be - * used to transmit to the station. The changes are advertised with bits + * @link_sta_rc_update: Notifies the driver of changes to the bitrates that can + * be used to transmit to the station. The changes are advertised with bits * from &enum ieee80211_rate_control_changed and the values are reflected * in the station data. This callback should only be used when the driver * uses hardware rate control (%IEEE80211_HW_HAS_RATE_CONTROL) since @@ -4581,10 +4581,10 @@ struct ieee80211_ops { void (*sta_pre_rcu_remove)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); - void (*sta_rc_update)(struct ieee80211_hw *hw, - struct ieee80211_vif *vif, - struct ieee80211_sta *sta, - u32 changed); + void (*link_sta_rc_update)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_link_sta *link_sta, + u32 changed); void (*sta_rate_tbl_update)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); -- cgit From 751e7489c1d74b94ffffbed619d8fd724eeff4ee Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 7 Oct 2024 15:00:58 +0300 Subject: wifi: mac80211: expose ieee80211_chan_width_to_rx_bw() to drivers Drivers might need to also do this calculation, no point in them duplicating the code. Since it's so simple, just make it an inline. Signed-off-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20241007144851.af003cb4a088.I8b5d29504b726caae24af6013c65b3daebe842a2@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 7a6edc04e212..32bdda90a4ac 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -7704,6 +7704,33 @@ void ieee80211_set_active_links_async(struct ieee80211_vif *vif, */ void ieee80211_send_teardown_neg_ttlm(struct ieee80211_vif *vif); +/** + * ieee80211_chan_width_to_rx_bw - convert channel width to STA RX bandwidth + * @width: the channel width value to convert + * Return: the STA RX bandwidth value for the channel width + */ +static inline enum ieee80211_sta_rx_bandwidth +ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width) +{ + switch (width) { + default: + WARN_ON_ONCE(1); + fallthrough; + case NL80211_CHAN_WIDTH_20_NOHT: + case NL80211_CHAN_WIDTH_20: + return IEEE80211_STA_RX_BW_20; + case NL80211_CHAN_WIDTH_40: + return IEEE80211_STA_RX_BW_40; + case NL80211_CHAN_WIDTH_80: + return IEEE80211_STA_RX_BW_80; + case NL80211_CHAN_WIDTH_160: + case NL80211_CHAN_WIDTH_80P80: + return IEEE80211_STA_RX_BW_160; + case NL80211_CHAN_WIDTH_320: + return IEEE80211_STA_RX_BW_320; + } +} + /* for older drivers - let's not document these ... */ int ieee80211_emulate_add_chanctx(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *ctx); -- cgit From 3607798ad9bdef35ad08489a8239390fccaac6b5 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 9 Oct 2024 10:25:42 +0200 Subject: wifi: cfg80211: add option for vif allowed radios This allows users to prevent a vif from affecting radios other than the configured ones. This can be useful in cases where e.g. an AP is running on one radio, and triggering a scan on another radio should not disturb it. Changing the allowed radios list for a vif is supported, but only while it is down. While it is possible to achieve the same by always explicitly specifying a frequency list for scan requests and ensuring that the wrong channel/band is never accidentally set on an unrelated interface, this change makes multi-radio wiphy setups a lot easier to deal with for CLI users. By itself, this patch only enforces the radio mask for scanning requests and remain-on-channel. Follow-up changes build on this to limit configured frequencies. Signed-off-by: Felix Fietkau Link: https://patch.msgid.link/eefcb218780f71a1549875d149f1196486762756.1728462320.git-series.nbd@nbd.name Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index c8ce5c2e14f4..95d05e67e69a 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -6221,6 +6221,7 @@ enum ieee80211_ap_reg_power { * entered. * @links.cac_time_ms: CAC time in ms * @valid_links: bitmap describing what elements of @links are valid + * @radio_mask: Bitmask of radios that this interface is allowed to operate on. */ struct wireless_dev { struct wiphy *wiphy; @@ -6333,6 +6334,8 @@ struct wireless_dev { unsigned int cac_time_ms; } links[IEEE80211_MLD_MAX_NUM_LINKS]; u16 valid_links; + + u32 radio_mask; }; static inline const u8 *wdev_address(struct wireless_dev *wdev) @@ -6518,6 +6521,17 @@ static inline bool cfg80211_channel_is_psc(struct ieee80211_channel *chan) bool cfg80211_radio_chandef_valid(const struct wiphy_radio *radio, const struct cfg80211_chan_def *chandef); +/** + * cfg80211_wdev_channel_allowed - Check if the wdev may use the channel + * + * @wdev: the wireless device + * @chan: channel to check + * + * Return: whether or not the wdev may use the channel + */ +bool cfg80211_wdev_channel_allowed(struct wireless_dev *wdev, + struct ieee80211_channel *chan); + /** * ieee80211_get_response_rate - get basic rate for a given rate * -- cgit From ebda716ea4da03326ac4d0a71604d18aa8a2e695 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 9 Oct 2024 10:25:45 +0200 Subject: wifi: cfg80211: report per wiphy radio antenna mask With multi-radio devices, each radio typically gets a fixed set of antennas. In order to be able to disable specific antennas for some radios, user space needs to know which antenna mask bits are assigned to which radio. Signed-off-by: Felix Fietkau Link: https://patch.msgid.link/e0a26afa2c88eaa188ec96ec6d17ecac4e827641.1728462320.git-series.nbd@nbd.name Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 95d05e67e69a..3100733f3e23 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5434,6 +5434,8 @@ struct wiphy_radio_freq_range { * @iface_combinations: Valid interface combinations array, should not * list single interface types. * @n_iface_combinations: number of entries in @iface_combinations array. + * + * @antenna_mask: bitmask of antennas connected to this radio. */ struct wiphy_radio { const struct wiphy_radio_freq_range *freq_range; @@ -5441,6 +5443,8 @@ struct wiphy_radio { const struct ieee80211_iface_combination *iface_combinations; int n_iface_combinations; + + u32 antenna_mask; }; #define CFG80211_HW_TIMESTAMP_ALL_PEERS 0xffff -- cgit From 006a97ceb6732c861c0e2fa3b6a34512caac9354 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 9 Oct 2024 10:25:46 +0200 Subject: wifi: mac80211: remove status->ampdu_delimiter_crc This was never used by any driver, so remove it to free up some space. Signed-off-by: Felix Fietkau Link: https://patch.msgid.link/e6fee6eed49b105261830db1c74f13841fb9616c.1728462320.git-series.nbd@nbd.name Signed-off-by: Johannes Berg --- include/net/mac80211.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 32bdda90a4ac..d4f23224eff9 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1463,8 +1463,6 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * @RX_FLAG_AMPDU_IS_LAST: this subframe is the last subframe of the A-MPDU * @RX_FLAG_AMPDU_DELIM_CRC_ERROR: A delimiter CRC error has been detected * on this subframe - * @RX_FLAG_AMPDU_DELIM_CRC_KNOWN: The delimiter CRC field is known (the CRC - * is stored in the @ampdu_delimiter_crc field) * @RX_FLAG_MIC_STRIPPED: The mic was stripped of this packet. Decryption was * done by the hardware * @RX_FLAG_ONLY_MONITOR: Report frame only to monitor interfaces without @@ -1536,7 +1534,7 @@ enum mac80211_rx_flags { RX_FLAG_AMPDU_LAST_KNOWN = BIT(12), RX_FLAG_AMPDU_IS_LAST = BIT(13), RX_FLAG_AMPDU_DELIM_CRC_ERROR = BIT(14), - RX_FLAG_AMPDU_DELIM_CRC_KNOWN = BIT(15), + /* one free bit at 15 */ RX_FLAG_MACTIME = BIT(16) | BIT(17), RX_FLAG_MACTIME_PLCP_START = 1 << 16, RX_FLAG_MACTIME_START = 2 << 16, @@ -1633,7 +1631,6 @@ enum mac80211_rx_encoding { * @rx_flags: internal RX flags for mac80211 * @ampdu_reference: A-MPDU reference number, must be a different value for * each A-MPDU but the same for each subframe within one A-MPDU - * @ampdu_delimiter_crc: A-MPDU delimiter CRC * @zero_length_psdu_type: radiotap type of the 0-length PSDU * @link_valid: if the link which is identified by @link_id is valid. This flag * is set only when connection is MLO. @@ -1671,7 +1668,6 @@ struct ieee80211_rx_status { s8 signal; u8 chains; s8 chain_signal[IEEE80211_MAX_CHAINS]; - u8 ampdu_delimiter_crc; u8 zero_length_psdu_type; u8 link_valid:1, link_id:4; }; -- cgit From 9c4f830927750a2bf9fd9426a5257f0fdce3b662 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 9 Oct 2024 10:25:47 +0200 Subject: wifi: cfg80211: pass net_device to .set_monitor_channel Preparation for allowing multiple monitor interfaces with different channels on a multi-radio wiphy. Signed-off-by: Felix Fietkau Link: https://patch.msgid.link/35fa652dbfebf93343f8b9a08fdef0467a2a02dc.1728462320.git-series.nbd@nbd.name Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 3100733f3e23..5feb93ba0400 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4694,6 +4694,7 @@ struct cfg80211_ops { struct ieee80211_channel *chan); int (*set_monitor_channel)(struct wiphy *wiphy, + struct net_device *dev, struct cfg80211_chan_def *chandef); int (*scan)(struct wiphy *wiphy, -- cgit From 9d40f7e32774279be7e5a7a278d7a290872b2f81 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 9 Oct 2024 10:25:48 +0200 Subject: wifi: mac80211: add flag to opt out of virtual monitor support This is useful for multi-radio devices that are capable of monitoring on multiple channels simultanenously. When this flag is set, each monitor interface is passed to the driver individually and can have a configured channel. The vif mac address for non-active monitor interfaces is cleared, in order to allow the driver to tell them apart from active ones. Signed-off-by: Felix Fietkau Link: https://patch.msgid.link/3c55505ee0cf0a5f141fbcb30d1e8be8d9f40373.1728462320.git-series.nbd@nbd.name Signed-off-by: Johannes Berg --- include/net/mac80211.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index d4f23224eff9..b4f246cdcca4 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2694,6 +2694,11 @@ struct ieee80211_txq { * a virtual monitor interface when monitor interfaces are the only * active interfaces. * + * @IEEE80211_HW_NO_VIRTUAL_MONITOR: The driver would like to be informed + * of any monitor interface, as well as their configured channel. + * This is useful for supporting multiple monitor interfaces on different + * channels. + * * @IEEE80211_HW_NO_AUTO_VIF: The driver would like for no wlanX to * be created. It is expected user-space will create vifs as * desired (and thus have them named as desired). @@ -2853,6 +2858,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_SUPPORTS_DYNAMIC_PS, IEEE80211_HW_MFP_CAPABLE, IEEE80211_HW_WANT_MONITOR_VIF, + IEEE80211_HW_NO_VIRTUAL_MONITOR, IEEE80211_HW_NO_AUTO_VIF, IEEE80211_HW_SW_CRYPTO_CONTROL, IEEE80211_HW_SUPPORT_FAST_XMIT, -- cgit From a77e527b470cc38754c730bce1483711f643bb60 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 9 Oct 2024 10:25:49 +0200 Subject: wifi: cfg80211: add monitor SKIP_TX flag This can be used to indicate that the user is not interested in receiving locally sent packets on the monitor interface. Signed-off-by: Felix Fietkau Link: https://patch.msgid.link/f0c20f832eadd36c71fba9a2a16ba57d78389b6c.1728462320.git-series.nbd@nbd.name Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 5feb93ba0400..8f9853b1a5d1 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2267,6 +2267,7 @@ static inline int cfg80211_get_station(struct net_device *dev, * @MONITOR_FLAG_OTHER_BSS: disable BSSID filtering * @MONITOR_FLAG_COOK_FRAMES: report frames after processing * @MONITOR_FLAG_ACTIVE: active monitor, ACKs frames on its MAC address + * @MONITOR_FLAG_SKIP_TX: do not pass locally transmitted frames */ enum monitor_flags { MONITOR_FLAG_CHANGED = BIT(__NL80211_MNTR_FLAG_INVALID), @@ -2276,6 +2277,7 @@ enum monitor_flags { MONITOR_FLAG_OTHER_BSS = BIT(NL80211_MNTR_FLAG_OTHER_BSS), MONITOR_FLAG_COOK_FRAMES = BIT(NL80211_MNTR_FLAG_COOK_FRAMES), MONITOR_FLAG_ACTIVE = BIT(NL80211_MNTR_FLAG_ACTIVE), + MONITOR_FLAG_SKIP_TX = BIT(NL80211_MNTR_FLAG_SKIP_TX), }; /** -- cgit From 68ed5c38b512b734caf3da1f87db4a99fcfe3002 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Oct 2024 11:31:33 -0700 Subject: phonet: Pass net and ifindex to phonet_address_notify(). Currently, phonet_address_notify() fetches netns and ifindex from dev. Once addr_doit() is converted to RCU, phonet_address_notify() will be called outside of RCU due to GFP_KERNEL, and dev will be unavailable there. Let's pass net and ifindex to phonet_address_notify(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/net/phonet/pn_dev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/phonet/pn_dev.h b/include/net/phonet/pn_dev.h index e9dc8dca5817..6b2102b4ece3 100644 --- a/include/net/phonet/pn_dev.h +++ b/include/net/phonet/pn_dev.h @@ -38,7 +38,7 @@ int phonet_address_add(struct net_device *dev, u8 addr); int phonet_address_del(struct net_device *dev, u8 addr); u8 phonet_address_get(struct net_device *dev, u8 addr); int phonet_address_lookup(struct net *net, u8 addr); -void phonet_address_notify(int event, struct net_device *dev, u8 addr); +void phonet_address_notify(struct net *net, int event, u32 ifindex, u8 addr); int phonet_route_add(struct net_device *dev, u8 daddr); int phonet_route_del(struct net_device *dev, u8 daddr); -- cgit From 42f5fe1dc4babad1c49bcc4121983fffccee3cd9 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Oct 2024 11:31:34 -0700 Subject: phonet: Convert phonet_device_list.lock to spinlock_t. addr_doit() calls phonet_address_add() or phonet_address_del() for RTM_NEWADDR or RTM_DELADDR, respectively. Both functions only touch phonet_device_list(dev_net(dev)), which is currently protected by RTNL and its dedicated mutex, phonet_device_list.lock. We will convert addr_doit() to RCU and cannot use mutex inside RCU. Let's convert the mutex to spinlock_t. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/net/phonet/pn_dev.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/phonet/pn_dev.h b/include/net/phonet/pn_dev.h index 6b2102b4ece3..ac0331d83a81 100644 --- a/include/net/phonet/pn_dev.h +++ b/include/net/phonet/pn_dev.h @@ -12,12 +12,13 @@ #include #include +#include struct net; struct phonet_device_list { struct list_head list; - struct mutex lock; + spinlock_t lock; }; struct phonet_device_list *phonet_device_list(struct net *net); -- cgit From de51ad08b1177bbbb8b60cb7dd4c3c5dd50d262f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Oct 2024 11:31:38 -0700 Subject: phonet: Pass net and ifindex to rtm_phonet_notify(). Currently, rtm_phonet_notify() fetches netns and ifindex from dev. Once route_doit() is converted to RCU, rtm_phonet_notify() will be called outside of RCU due to GFP_KERNEL, and dev will be unavailable there. Let's pass net and ifindex to rtm_phonet_notify(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/net/phonet/pn_dev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/phonet/pn_dev.h b/include/net/phonet/pn_dev.h index ac0331d83a81..021e524fd20a 100644 --- a/include/net/phonet/pn_dev.h +++ b/include/net/phonet/pn_dev.h @@ -43,7 +43,7 @@ void phonet_address_notify(struct net *net, int event, u32 ifindex, u8 addr); int phonet_route_add(struct net_device *dev, u8 daddr); int phonet_route_del(struct net_device *dev, u8 daddr); -void rtm_phonet_notify(int event, struct net_device *dev, u8 dst); +void rtm_phonet_notify(struct net *net, int event, u32 ifindex, u8 dst); struct net_device *phonet_route_get_rcu(struct net *net, u8 daddr); struct net_device *phonet_route_output(struct net *net, u8 daddr); -- cgit From 3deec3b4afb4c767007eae1eeedbcf3da599395b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Oct 2024 11:31:39 -0700 Subject: phonet: Convert phonet_routes.lock to spinlock_t. route_doit() calls phonet_route_add() or phonet_route_del() for RTM_NEWROUTE or RTM_DELROUTE, respectively. Both functions only touch phonet_pernet(dev_net(dev))->routes, which is currently protected by RTNL and its dedicated mutex, phonet_routes.lock. We will convert route_doit() to RCU and cannot use mutex inside RCU. Let's convert the mutex to spinlock_t. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/net/phonet/pn_dev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/phonet/pn_dev.h b/include/net/phonet/pn_dev.h index 021e524fd20a..37a3e83531c6 100644 --- a/include/net/phonet/pn_dev.h +++ b/include/net/phonet/pn_dev.h @@ -11,7 +11,6 @@ #define PN_DEV_H #include -#include #include struct net; -- cgit From 26d8db55eeacb7dc78672523f57825916d203de4 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 21 Oct 2024 11:32:29 -0700 Subject: rtnetlink: Define RTNL_FLAG_DOIT_PERNET for per-netns RTNL doit(). We will push RTNL down to each doit() as rtnl_net_lock(). We can use RTNL_FLAG_DOIT_UNLOCKED to call doit() without RTNL, but doit() will still hold RTNL. Let's define RTNL_FLAG_DOIT_PERNET as an alias of RTNL_FLAG_DOIT_UNLOCKED. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/net/rtnetlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index e0d9a8eae6b6..b260c0cc9671 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -12,6 +12,7 @@ typedef int (*rtnl_dumpit_func)(struct sk_buff *, struct netlink_callback *); enum rtnl_link_flags { RTNL_FLAG_DOIT_UNLOCKED = BIT(0), +#define RTNL_FLAG_DOIT_PERNET RTNL_FLAG_DOIT_UNLOCKED RTNL_FLAG_BULK_DEL_SUPPORTED = BIT(1), RTNL_FLAG_DUMP_UNLOCKED = BIT(2), RTNL_FLAG_DUMP_SPLIT_NLM_DONE = BIT(3), /* legacy behavior */ -- cgit From 1ddf9916ac09313128e40d6581cef889c0b4ce84 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 23 Oct 2024 12:53:42 +0200 Subject: xfrm: Add support for per cpu xfrm state handling. Currently all flows for a certain SA must be processed by the same cpu to avoid packet reordering and lock contention of the xfrm state lock. To get rid of this limitation, the IETF standardized per cpu SAs in RFC 9611. This patch implements the xfrm part of it. We add the cpu as a lookup key for xfrm states and a config option to generate acquire messages for each cpu. With that, we can have on each cpu a SA with identical traffic selector so that flows can be processed in parallel on all cpus. Signed-off-by: Steffen Klassert Tested-by: Antony Antony Tested-by: Tobias Brunner --- include/net/xfrm.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index a0bdd58f401c..f5275618e744 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -188,6 +188,7 @@ struct xfrm_state { refcount_t refcnt; spinlock_t lock; + u32 pcpu_num; struct xfrm_id id; struct xfrm_selector sel; struct xfrm_mark mark; @@ -1684,7 +1685,7 @@ struct xfrmk_spdinfo { u32 spdhmcnt; }; -struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq); +struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num); int xfrm_state_delete(struct xfrm_state *x); int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync); int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid); @@ -1796,7 +1797,7 @@ int verify_spi_info(u8 proto, u32 min, u32 max, struct netlink_ext_ack *extack); int xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi, struct netlink_ext_ack *extack); struct xfrm_state *xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, - u8 mode, u32 reqid, u32 if_id, u8 proto, + u8 mode, u32 reqid, u32 if_id, u32 pcpu_num, u8 proto, const xfrm_address_t *daddr, const xfrm_address_t *saddr, int create, unsigned short family); -- cgit From 0045e3d80613cc7174dc15f189ee6fc4e73b9365 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 23 Oct 2024 12:53:43 +0200 Subject: xfrm: Cache used outbound xfrm states at the policy. Now that we can have percpu xfrm states, the number of active states might increase. To get a better lookup performance, we cache the used xfrm states at the policy for outbound IPsec traffic. Signed-off-by: Steffen Klassert Tested-by: Antony Antony Tested-by: Tobias Brunner --- include/net/xfrm.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index f5275618e744..0b394c5fb5f3 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -184,6 +184,7 @@ struct xfrm_state { }; struct hlist_node byspi; struct hlist_node byseq; + struct hlist_node state_cache; refcount_t refcnt; spinlock_t lock; @@ -537,6 +538,7 @@ struct xfrm_policy_queue { * @xp_net: network namespace the policy lives in * @bydst: hlist node for SPD hash table or rbtree list * @byidx: hlist node for index hash table + * @state_cache_list: hlist head for policy cached xfrm states * @lock: serialize changes to policy structure members * @refcnt: reference count, freed once it reaches 0 * @pos: kernel internal tie-breaker to determine age of policy @@ -567,6 +569,8 @@ struct xfrm_policy { struct hlist_node bydst; struct hlist_node byidx; + struct hlist_head state_cache_list; + /* This lock only affects elements except for entry. */ rwlock_t lock; refcount_t refcnt; -- cgit From 81a331a0e72ddc2f75092603d9577bd1a0ca23ad Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 23 Oct 2024 12:53:44 +0200 Subject: xfrm: Add an inbound percpu state cache. Now that we can have percpu xfrm states, the number of active states might increase. To get a better lookup performance, we add a percpu cache to cache the used inbound xfrm states. Signed-off-by: Steffen Klassert Tested-by: Antony Antony Tested-by: Tobias Brunner --- include/net/netns/xfrm.h | 1 + include/net/xfrm.h | 5 +++++ 2 files changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index ae60d6664095..23dd647fe024 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -43,6 +43,7 @@ struct netns_xfrm { struct hlist_head __rcu *state_bysrc; struct hlist_head __rcu *state_byspi; struct hlist_head __rcu *state_byseq; + struct hlist_head __percpu *state_cache_input; unsigned int state_hmask; unsigned int state_num; struct work_struct state_hash_work; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 0b394c5fb5f3..2b87999bd5aa 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -185,6 +185,7 @@ struct xfrm_state { struct hlist_node byspi; struct hlist_node byseq; struct hlist_node state_cache; + struct hlist_node state_cache_input; refcount_t refcnt; spinlock_t lock; @@ -1650,6 +1651,10 @@ int xfrm_state_update(struct xfrm_state *x); struct xfrm_state *xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family); +struct xfrm_state *xfrm_input_state_lookup(struct net *net, u32 mark, + const xfrm_address_t *daddr, + __be32 spi, u8 proto, + unsigned short family); struct xfrm_state *xfrm_state_lookup_byaddr(struct net *net, u32 mark, const xfrm_address_t *daddr, const xfrm_address_t *saddr, -- cgit From 4bbd360a5084d8f890f814327e1d9fbb1f0f6fa1 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 24 Oct 2024 13:14:58 -0700 Subject: socket: Print pf->create() when it does not clear sock->sk on failure. I suggested to put DEBUG_NET_WARN_ON_ONCE() in __sock_create() to catch possible use-after-free. But the warning itself was not useful because our interest is in the callee than the caller. Let's define DEBUG_NET_WARN_ONCE() and print the name of pf->create() and the socket identifier. While at it, we enclose DEBUG_NET_WARN_ON_ONCE() in parentheses too to avoid a checkpatch error. Note that %pf or %pF were obsoleted and will be removed later as per comment in lib/vsprintf.c. Link: https://lore.kernel.org/netdev/202410231427.633734b3-lkp@intel.com/ Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241024201458.49412-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/net_debug.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/net_debug.h b/include/net/net_debug.h index 1e74684cbbdb..9fecb1496be3 100644 --- a/include/net/net_debug.h +++ b/include/net/net_debug.h @@ -149,9 +149,11 @@ do { \ #if defined(CONFIG_DEBUG_NET) -#define DEBUG_NET_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond) +#define DEBUG_NET_WARN_ON_ONCE(cond) ((void)WARN_ON_ONCE(cond)) +#define DEBUG_NET_WARN_ONCE(cond, format...) ((void)WARN_ONCE(cond, format)) #else #define DEBUG_NET_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) +#define DEBUG_NET_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) #endif #endif /* _LINUX_NET_DEBUG_H */ -- cgit From 2a0df10434ddeb8b5b5aded90a5659aff3b106d7 Mon Sep 17 00:00:00 2001 From: Przemek Kitszel Date: Wed, 23 Oct 2024 15:09:06 +0200 Subject: devlink: remove unused devlink_resource_occ_get_register() and _unregister() Remove not used devlink_resource_occ_get_register() and devlink_resource_occ_get_unregister() functions; current devlink resource users are fine with devl_ variants of the two. Reviewed-by: Wojciech Drewek Reviewed-by: Jiri Pirko Reviewed-by: Joe Damato Signed-off-by: Przemek Kitszel Link: https://patch.msgid.link/20241023131248.27192-7-przemyslaw.kitszel@intel.com Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index db5eff6cb60f..fdd6a0f9891d 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1797,15 +1797,8 @@ void devl_resource_occ_get_register(struct devlink *devlink, u64 resource_id, devlink_resource_occ_get_t *occ_get, void *occ_get_priv); -void devlink_resource_occ_get_register(struct devlink *devlink, - u64 resource_id, - devlink_resource_occ_get_t *occ_get, - void *occ_get_priv); void devl_resource_occ_get_unregister(struct devlink *devlink, u64 resource_id); - -void devlink_resource_occ_get_unregister(struct devlink *devlink, - u64 resource_id); int devl_params_register(struct devlink *devlink, const struct devlink_param *params, size_t params_count); -- cgit From e3302f9a503a632c125170dc3c72b2886a910b0a Mon Sep 17 00:00:00 2001 From: Przemek Kitszel Date: Wed, 23 Oct 2024 15:09:07 +0200 Subject: devlink: remove unused devlink_resource_register() Remove unused devlink_resource_register(); all the drivers use devl_resource_register() variant instead. Reviewed-by: Wojciech Drewek Reviewed-by: Jiri Pirko Reviewed-by: Joe Damato Signed-off-by: Przemek Kitszel Link: https://patch.msgid.link/20241023131248.27192-8-przemyslaw.kitszel@intel.com Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index fdd6a0f9891d..fbb9a2668e24 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1779,12 +1779,6 @@ int devl_resource_register(struct devlink *devlink, u64 resource_id, u64 parent_resource_id, const struct devlink_resource_size_params *size_params); -int devlink_resource_register(struct devlink *devlink, - const char *resource_name, - u64 resource_size, - u64 resource_id, - u64 parent_resource_id, - const struct devlink_resource_size_params *size_params); void devl_resources_unregister(struct devlink *devlink); void devlink_resources_unregister(struct devlink *devlink); int devl_resource_size_get(struct devlink *devlink, -- cgit From 386c2b877b97cde53c6e422f9081ae133492fd05 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Wed, 23 Oct 2024 16:14:51 +0800 Subject: tcp: add a common helper to debug the underlying issue Following the commit c8770db2d544 ("tcp: check skb is non-NULL in tcp_rto_delta_us()"), we decided to add a helper so that it's easier to get verbose warning on either cases. Link: https://lore.kernel.org/all/5632e043-bdba-4d75-bc7e-bf58014492fd@redhat.com/ Suggested-by: Paolo Abeni Signed-off-by: Jason Xing Cc: Neal Cardwell Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/tcp.h | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 739a9fb83d0c..8b8d94bb1746 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2430,6 +2430,19 @@ void tcp_plb_update_state(const struct sock *sk, struct tcp_plb_state *plb, void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb); void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb); +static inline void tcp_warn_once(const struct sock *sk, bool cond, const char *str) +{ + WARN_ONCE(cond, + "%sout:%u sacked:%u lost:%u retrans:%u tlp_high_seq:%u sk_state:%u ca_state:%u advmss:%u mss_cache:%u pmtu:%u\n", + str, + tcp_sk(sk)->packets_out, tcp_sk(sk)->sacked_out, + tcp_sk(sk)->lost_out, tcp_sk(sk)->retrans_out, + tcp_sk(sk)->tlp_high_seq, sk->sk_state, + inet_csk(sk)->icsk_ca_state, + tcp_sk(sk)->advmss, tcp_sk(sk)->mss_cache, + inet_csk(sk)->icsk_pmtu_cookie); +} + /* At how many usecs into the future should the RTO fire? */ static inline s64 tcp_rto_delta_us(const struct sock *sk) { @@ -2441,17 +2454,7 @@ static inline s64 tcp_rto_delta_us(const struct sock *sk) return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp; } else { - WARN_ONCE(1, - "rtx queue empty: " - "out:%u sacked:%u lost:%u retrans:%u " - "tlp_high_seq:%u sk_state:%u ca_state:%u " - "advmss:%u mss_cache:%u pmtu:%u\n", - tcp_sk(sk)->packets_out, tcp_sk(sk)->sacked_out, - tcp_sk(sk)->lost_out, tcp_sk(sk)->retrans_out, - tcp_sk(sk)->tlp_high_seq, sk->sk_state, - inet_csk(sk)->icsk_ca_state, - tcp_sk(sk)->advmss, tcp_sk(sk)->mss_cache, - inet_csk(sk)->icsk_pmtu_cookie); + tcp_warn_once(sk, 1, "rtx queue empty: "); return jiffies_to_usecs(rto); } -- cgit From 668d663989c77fcb2a92748645e4c394b03d5988 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Wed, 23 Oct 2024 16:14:52 +0800 Subject: tcp: add more warn of socket in tcp_send_loss_probe() Add two fields to print in the helper which here covers tcp_send_loss_probe(). Link: https://lore.kernel.org/all/5632e043-bdba-4d75-bc7e-bf58014492fd@redhat.com/ Suggested-by: Paolo Abeni Signed-off-by: Jason Xing Cc: Neal Cardwell Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/tcp.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 8b8d94bb1746..e9b37b76e894 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2433,8 +2433,9 @@ void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb); static inline void tcp_warn_once(const struct sock *sk, bool cond, const char *str) { WARN_ONCE(cond, - "%sout:%u sacked:%u lost:%u retrans:%u tlp_high_seq:%u sk_state:%u ca_state:%u advmss:%u mss_cache:%u pmtu:%u\n", + "%scwn:%u out:%u sacked:%u lost:%u retrans:%u tlp_high_seq:%u sk_state:%u ca_state:%u advmss:%u mss_cache:%u pmtu:%u\n", str, + tcp_snd_cwnd(tcp_sk(sk)), tcp_sk(sk)->packets_out, tcp_sk(sk)->sacked_out, tcp_sk(sk)->lost_out, tcp_sk(sk)->retrans_out, tcp_sk(sk)->tlp_high_seq, sk->sk_state, -- cgit From db71aae70e3e646d8ba4cb50e4bd4c281a91c804 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Sat, 26 Oct 2024 12:53:36 +0000 Subject: net: checksum: Move from32to16() to generic header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit from32to16() is used by lib/checksum.c and also by arch/parisc/lib/checksum.c. The next patch will use it in the bpf_csum_diff helper. Move from32to16() to the include/net/checksum.h as csum_from32to16() and remove other implementations. Signed-off-by: Puranjay Mohan Signed-off-by: Daniel Borkmann Reviewed-by: Toke Høiland-Jørgensen Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20241026125339.26459-2-puranjay@kernel.org --- include/net/checksum.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/checksum.h b/include/net/checksum.h index 1338cb92c8e7..243f972267b8 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -151,6 +151,12 @@ static inline void csum_replace(__wsum *csum, __wsum old, __wsum new) *csum = csum_add(csum_sub(*csum, old), new); } +static inline unsigned short csum_from32to16(unsigned int sum) +{ + sum += (sum >> 16) | (sum << 16); + return (unsigned short)(sum >> 16); +} + struct sk_buff; void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, __be32 from, __be32 to, bool pseudohdr); -- cgit From 2748697225c38a19666bba6a83afc6bf46ee16be Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 23 Oct 2024 16:52:46 +0300 Subject: net: sched: propagate "skip_sw" flag to struct flow_cls_common_offload Background: switchdev ports offload the Linux bridge, and most of the packets they handle will never see the CPU. The ports between which there exists no hardware data path are considered 'foreign' to switchdev. These can either be normal physical NICs without switchdev offload, or incompatible switchdev ports, or virtual interfaces like veth/dummy/etc. In some cases, an offloaded filter can only do half the work, and the rest must be handled by software. Redirecting/mirroring from the ingress of a switchdev port towards a foreign interface is one example of combined hardware/software data path. The most that the switchdev port can do is to extract the matching packets from its offloaded data path and send them to the CPU. From there on, the software filter runs (a second time, after the first run in hardware) on the packet and performs the mirred action. It makes sense for switchdev drivers which allow this kind of "half offloading" to sense the "skip_sw" flag of the filter/action pair, and deny attempts from the user to install a filter that does not run in software, because that simply won't work. In fact, a mirred action on a switchdev port towards a dummy interface appears to be a valid way of (selectively) monitoring offloaded traffic that flows through it. IFF_PROMISC was also discussed years ago, but (despite initial disagreement) there seems to be consensus that this flag should not affect the destination taken by packets, but merely whether or not the NIC discards packets with unknown MAC DA for local processing. [1] https://lore.kernel.org/netdev/20190830092637.7f83d162@ceranb/ [2] https://lore.kernel.org/netdev/20191002233750.13566-1-olteanv@gmail.com/ Suggested-by: Ido Schimmel Link: https://lore.kernel.org/netdev/ZxUo0Dc0M5Y6l9qF@shredder.mtl.com/ Signed-off-by: Vladimir Oltean Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20241023135251.1752488-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- include/net/flow_offload.h | 1 + include/net/pkt_cls.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 292cd8f4b762..596ab9791e4d 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -685,6 +685,7 @@ struct flow_cls_common_offload { u32 chain_index; __be16 protocol; u32 prio; + bool skip_sw; struct netlink_ext_ack *extack; }; diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 4880b3a7aced..cf199af85c52 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -755,6 +755,7 @@ tc_cls_common_offload_init(struct flow_cls_common_offload *cls_common, cls_common->chain_index = tp->chain->index; cls_common->protocol = tp->protocol; cls_common->prio = tp->prio >> 16; + cls_common->skip_sw = tc_skip_sw(flags); if (tc_skip_sw(flags) || flags & TCA_CLS_FLAGS_VERBOSE) cls_common->extack = extack; } -- cgit From d86c7a9162ae925dcb6632a4544c62eb0eb5c7cc Mon Sep 17 00:00:00 2001 From: George Guo Date: Mon, 28 Oct 2024 20:34:35 +0800 Subject: netlabel: document doi_remove field of struct netlbl_calipso_ops Add documentation of doi_remove field to Kernel doc for struct netlbl_calipso_ops. Flagged by ./scripts/kernel-doc -none. Signed-off-by: George Guo Acked-by: Paul Moore Link: https://patch.msgid.link/20241028123435.3495916-1-dongtai.guo@linux.dev Signed-off-by: Jakub Kicinski --- include/net/netlabel.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/netlabel.h b/include/net/netlabel.h index 529160f76cac..4afd934b1238 100644 --- a/include/net/netlabel.h +++ b/include/net/netlabel.h @@ -208,6 +208,7 @@ struct netlbl_lsm_secattr { * struct netlbl_calipso_ops - NetLabel CALIPSO operations * @doi_add: add a CALIPSO DOI * @doi_free: free a CALIPSO DOI + * @doi_remove: remove a CALIPSO DOI * @doi_getdef: returns a reference to a DOI * @doi_putdef: releases a reference of a DOI * @doi_walk: enumerate the DOI list -- cgit From 4138e9ec00936c9fe7d0fe961e32f381b1e36926 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Tue, 29 Oct 2024 11:47:14 +0100 Subject: netlink: add NLA_POLICY_MAX_LEN macro Similarly to NLA_POLICY_MIN_LEN, NLA_POLICY_MAX_LEN defines a policy with a maximum length value. The netlink generator for YAML specs has been extended accordingly. Signed-off-by: Antonio Quartulli Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20241029-b4-ovpn-v11-1-de4698c73a25@openvpn.net Signed-off-by: Jakub Kicinski --- include/net/netlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/netlink.h b/include/net/netlink.h index db6af207287c..2dc671c977ff 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -469,6 +469,7 @@ struct nla_policy { .max = _len \ } #define NLA_POLICY_MIN_LEN(_len) NLA_POLICY_MIN(NLA_BINARY, _len) +#define NLA_POLICY_MAX_LEN(_len) NLA_POLICY_MAX(NLA_BINARY, _len) /** * struct nl_info - netlink source information -- cgit From 6b2d11e2d8fc130df4708be0b6b53fd3e6b54cf6 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Wed, 30 Oct 2024 04:22:33 +0000 Subject: net/tcp: Add missing lockdep annotations for TCP-AO hlist traversals Under CONFIG_PROVE_RCU_LIST + CONFIG_RCU_EXPERT hlist_for_each_entry_rcu() provides very helpful splats, which help to find possible issues. I missed CONFIG_RCU_EXPERT=y in my testing config the same as described in a3e4bf7f9675 ("configs/debug: make sure PROVE_RCU_LIST=y takes effect"). The fix itself is trivial: add the very same lockdep annotations as were used to dereference ao_info from the socket. Reported-by: Jakub Kicinski Closes: https://lore.kernel.org/netdev/20241028152645.35a8be66@kernel.org/ Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Link: https://patch.msgid.link/20241030-tcp-ao-hlist-lockdep-annotate-v1-1-bf641a64d7c6@gmail.com Signed-off-by: Jakub Kicinski --- include/net/tcp_ao.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h index 1d46460d0fef..df655ce6987d 100644 --- a/include/net/tcp_ao.h +++ b/include/net/tcp_ao.h @@ -183,7 +183,8 @@ int tcp_ao_hash_skb(unsigned short int family, const u8 *tkey, int hash_offset, u32 sne); int tcp_parse_ao(struct sock *sk, int cmd, unsigned short int family, sockptr_t optval, int optlen); -struct tcp_ao_key *tcp_ao_established_key(struct tcp_ao_info *ao, +struct tcp_ao_key *tcp_ao_established_key(const struct sock *sk, + struct tcp_ao_info *ao, int sndid, int rcvid); int tcp_ao_copy_all_matching(const struct sock *sk, struct sock *newsk, struct request_sock *req, struct sk_buff *skb, -- cgit From b3e8f29d6b45e865bab9a9964709ff7413e33e85 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 Nov 2024 10:41:15 +0100 Subject: netfilter: nf_tables: avoid false-positive lockdep splats with flowtables The transaction mutex prevents concurrent add/delete, its ok to iterate those lists outside of rcu read side critical sections. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 91ae20cb7648..c1513bd14568 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1463,7 +1463,8 @@ struct nft_flowtable { struct nf_flowtable data; }; -struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, +struct nft_flowtable *nft_flowtable_lookup(const struct net *net, + const struct nft_table *table, const struct nlattr *nla, u8 genmask); -- cgit From e57dfaa4b0a72f6a231a8eedb95d260045bbd8db Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Thu, 31 Oct 2024 16:52:57 +0100 Subject: xfrm: Convert struct xfrm_dst_lookup_params -> tos to dscp_t. Add type annotation to the "tos" field of struct xfrm_dst_lookup_params, to ensure that the ECN bits aren't mistakenly taken into account when doing route lookups. Rename that field (tos -> dscp) to make that change explicit. Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 2b87999bd5aa..32c09e85a64c 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -354,7 +355,7 @@ void xfrm_if_unregister_cb(void); struct xfrm_dst_lookup_params { struct net *net; - int tos; + dscp_t dscp; int oif; xfrm_address_t *saddr; xfrm_address_t *daddr; -- cgit From 9907cda95fcbf44141b1292faab89cf8ec542f22 Mon Sep 17 00:00:00 2001 From: Juraj Šarinay Date: Sun, 3 Nov 2024 13:45:25 +0100 Subject: net: nfc: Propagate ISO14443 type A target ATS to userspace via netlink MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a 20-byte field ats to struct nfc_target and expose it as NFC_ATTR_TARGET_ATS via the netlink interface. The payload contains 'historical bytes' that help to distinguish cards from one another. The information is commonly used to assemble an emulated ATR similar to that reported by smart cards with contacts. Add a 20-byte field target_ats to struct nci_dev to hold the payload obtained in nci_rf_intf_activated_ntf_packet() and copy it to over to nfc_target.ats in nci_activate_target(). The approach is similar to the handling of 'general bytes' within ATR_RES. Replace the hard-coded size of rats_res within struct activation_params_nfca_poll_iso_dep by the equal constant NFC_ATS_MAXSIZE now defined in nfc.h Within NCI, the information corresponds to the 'RATS Response' activation parameter that omits the initial length byte TL. This loses no information and is consistent with our handling of SENSB_RES that also drops the first (constant) byte. Tested with nxp_nci_i2c on a few type A targets including an ICAO 9303 compliant passport. I refrain from the corresponding change to digital_in_recv_ats() to have the few drivers based on digital.h fill nfc_target.ats, as I have no way to test it. That class of drivers appear not to set NFC_ATTR_TARGET_SENSB_RES either. Consider a separate patch to propagate (all) the parameters. Signed-off-by: Juraj Šarinay Link: https://patch.msgid.link/20241103124525.8392-1-juraj@sarinay.com Signed-off-by: Paolo Abeni --- include/net/nfc/nci.h | 2 +- include/net/nfc/nci_core.h | 4 ++++ include/net/nfc/nfc.h | 4 ++++ 3 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/nfc/nci.h b/include/net/nfc/nci.h index dc36519d16aa..09efcaed7c3f 100644 --- a/include/net/nfc/nci.h +++ b/include/net/nfc/nci.h @@ -475,7 +475,7 @@ struct nci_rf_discover_ntf { #define NCI_OP_RF_INTF_ACTIVATED_NTF nci_opcode_pack(NCI_GID_RF_MGMT, 0x05) struct activation_params_nfca_poll_iso_dep { __u8 rats_res_len; - __u8 rats_res[20]; + __u8 rats_res[NFC_ATS_MAXSIZE]; }; struct activation_params_nfcb_poll_iso_dep { diff --git a/include/net/nfc/nci_core.h b/include/net/nfc/nci_core.h index ea8595651c38..e180bdf2f82b 100644 --- a/include/net/nfc/nci_core.h +++ b/include/net/nfc/nci_core.h @@ -265,6 +265,10 @@ struct nci_dev { /* stored during intf_activated_ntf */ __u8 remote_gb[NFC_MAX_GT_LEN]; __u8 remote_gb_len; + + /* stored during intf_activated_ntf */ + __u8 target_ats[NFC_ATS_MAXSIZE]; + __u8 target_ats_len; }; /* ----- NCI Devices ----- */ diff --git a/include/net/nfc/nfc.h b/include/net/nfc/nfc.h index 3a3781838c67..127e6c7d910d 100644 --- a/include/net/nfc/nfc.h +++ b/include/net/nfc/nfc.h @@ -86,6 +86,8 @@ struct nfc_ops { * is a type A one. The %sens_res most significant byte must be byte 2 * as described by the NFC Forum digital specification (i.e. the platform * configuration one) while %sens_res least significant byte is byte 1. + * @ats_len: length of Answer To Select in bytes + * @ats: Answer To Select returned by an ISO 14443 Type A target upon activation */ struct nfc_target { u32 idx; @@ -105,6 +107,8 @@ struct nfc_target { u8 is_iso15693; u8 iso15693_dsfid; u8 iso15693_uid[NFC_ISO15693_UID_MAXSIZE]; + u8 ats_len; + u8 ats[NFC_ATS_MAXSIZE]; }; /** -- cgit From df81366b484da9618cec12cfc30ec93f7d4b6ac7 Mon Sep 17 00:00:00 2001 From: Zong-Zhe Yang Date: Fri, 1 Nov 2024 16:21:43 +0800 Subject: wifi: mac80211: fix description of ieee80211_set_active_links() for new sequence The sequence of calls has changed, but the description is inconsistent. So, fix the description. Fixes: 188a1bf89432 ("wifi: mac80211: re-order assigning channel in activate links") Signed-off-by: Zong-Zhe Yang Link: https://patch.msgid.link/20241101082143.11138-1-kevin_yang@realtek.com Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index b4f246cdcca4..a97c9f85ae9a 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -7672,12 +7672,12 @@ static inline bool ieee80211_is_tx_data(struct sk_buff *skb) * * - change_vif_links(0x11) * - unassign_vif_chanctx(link_id=0) + * - assign_vif_chanctx(link_id=4) * - change_sta_links(0x11) for each affected STA (the AP) * (TDLS connections on now inactive links should be torn down) * - remove group keys on the old link (link_id 0) * - add new group keys (GTK/IGTK/BIGTK) on the new link (link_id 4) * - change_sta_links(0x10) for each affected STA (the AP) - * - assign_vif_chanctx(link_id=4) * - change_vif_links(0x10) * * Return: 0 on success. An error code otherwise. -- cgit From 48171c65f61148b0025128b70837280123f1309d Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 6 Nov 2024 22:37:32 +0100 Subject: ipv4: Prepare ip_route_output() to future .flowi4_tos conversion. Convert the "tos" parameter of ip_route_output() to dscp_t. This way we'll have a dscp_t value directly available when .flowi4_tos will eventually be converted to dscp_t. All ip_route_output() callers but one set this "tos" parameter to 0 and therefore don't need to be adapted to the new prototype. Only br_nf_pre_routing_finish() needs conversion. It can just use ip4h_dscp() to get the DSCP field from the IPv4 header. Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/0f10d031dd44c70aae9bc6e19391cb30d5c2fe71.1730928699.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- include/net/route.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/route.h b/include/net/route.h index 586e59f7ed8a..0a690adfdff5 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -156,12 +156,12 @@ static inline struct rtable *ip_route_output_key(struct net *net, struct flowi4 * structure is only partially set, it may bypass some fib-rules. */ static inline struct rtable *ip_route_output(struct net *net, __be32 daddr, - __be32 saddr, u8 tos, int oif, - __u8 scope) + __be32 saddr, dscp_t dscp, + int oif, __u8 scope) { struct flowi4 fl4 = { .flowi4_oif = oif, - .flowi4_tos = tos, + .flowi4_tos = inet_dscp_to_dsfield(dscp), .flowi4_scope = scope, .daddr = daddr, .saddr = saddr, -- cgit From 580db513b4a9d52f306580015a1872eea0a0894e Mon Sep 17 00:00:00 2001 From: Khang Nguyen Date: Tue, 5 Nov 2024 14:19:15 +0700 Subject: net: mctp: Expose transport binding identifier via IFLA attribute MCTP control protocol implementations are transport binding dependent. Endpoint discovery is mandatory based on transport binding. Message timing requirements are specified in each respective transport binding specification. However, we currently have no means to get this information from MCTP links. Add a IFLA_MCTP_PHYS_BINDING netlink link attribute, which represents the transport type using the DMTF DSP0239-defined type numbers, returned as part of RTM_GETLINK data. We get an IFLA_MCTP_PHYS_BINDING attribute for each MCTP link, for example: - 0x00 (unspec) for loopback interface; - 0x01 (SMBus/I2C) for mctpi2c%d interfaces; and - 0x05 (serial) for mctpserial%d interfaces. Signed-off-by: Khang Nguyen Reviewed-by: Matt Johnston Link: https://patch.msgid.link/20241105071915.821871-1-khangng@os.amperecomputing.com Signed-off-by: Jakub Kicinski --- include/net/mctp.h | 18 ++++++++++++++++++ include/net/mctpdevice.h | 4 +++- 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/mctp.h b/include/net/mctp.h index 28d59ae94ca3..1ecbff7116f6 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -298,4 +298,22 @@ void mctp_routes_exit(void); int mctp_device_init(void); void mctp_device_exit(void); +/* MCTP IDs and Codes from DMTF specification + * "DSP0239 Management Component Transport Protocol (MCTP) IDs and Codes" + * https://www.dmtf.org/sites/default/files/standards/documents/DSP0239_1.11.1.pdf + */ +enum mctp_phys_binding { + MCTP_PHYS_BINDING_UNSPEC = 0x00, + MCTP_PHYS_BINDING_SMBUS = 0x01, + MCTP_PHYS_BINDING_PCIE_VDM = 0x02, + MCTP_PHYS_BINDING_USB = 0x03, + MCTP_PHYS_BINDING_KCS = 0x04, + MCTP_PHYS_BINDING_SERIAL = 0x05, + MCTP_PHYS_BINDING_I3C = 0x06, + MCTP_PHYS_BINDING_MMBI = 0x07, + MCTP_PHYS_BINDING_PCC = 0x08, + MCTP_PHYS_BINDING_UCIE = 0x09, + MCTP_PHYS_BINDING_VENDOR = 0xFF, +}; + #endif /* __NET_MCTP_H */ diff --git a/include/net/mctpdevice.h b/include/net/mctpdevice.h index 5c0d04b5c12c..957d9ef924c5 100644 --- a/include/net/mctpdevice.h +++ b/include/net/mctpdevice.h @@ -22,6 +22,7 @@ struct mctp_dev { refcount_t refs; unsigned int net; + enum mctp_phys_binding binding; const struct mctp_netdev_ops *ops; @@ -44,7 +45,8 @@ struct mctp_dev *mctp_dev_get_rtnl(const struct net_device *dev); struct mctp_dev *__mctp_dev_get(const struct net_device *dev); int mctp_register_netdev(struct net_device *dev, - const struct mctp_netdev_ops *ops); + const struct mctp_netdev_ops *ops, + enum mctp_phys_binding binding); void mctp_unregister_netdev(struct net_device *dev); void mctp_dev_hold(struct mctp_dev *mdev); -- cgit From 41b3caa7c0761141aa6d508924b9d23db57a17bc Mon Sep 17 00:00:00 2001 From: Gilad Naaman Date: Thu, 7 Nov 2024 16:04:38 +0000 Subject: neighbour: Add hlist_node to struct neighbour Add a doubly-linked node to neighbours, so that they can be deleted without iterating the entire bucket they're in. Signed-off-by: Gilad Naaman Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241107160444.2913124-2-gnaaman@drivenets.com Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 3887ed9e5026..0402447854c7 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -136,6 +136,7 @@ struct neigh_statistics { struct neighbour { struct neighbour __rcu *next; + struct hlist_node hash; struct neigh_table *tbl; struct neigh_parms *parms; unsigned long confirmed; @@ -191,6 +192,7 @@ struct pneigh_entry { struct neigh_hash_table { struct neighbour __rcu **hash_buckets; + struct hlist_head *hash_heads; unsigned int hash_shift; __u32 hash_rnd[NEIGH_NUM_HASH_RND]; struct rcu_head rcu; -- cgit From d7ddee1a522ddf5b28e2a3f7093cf238c96f492a Mon Sep 17 00:00:00 2001 From: Gilad Naaman Date: Thu, 7 Nov 2024 16:04:39 +0000 Subject: neighbour: Define neigh_for_each_in_bucket Introduce neigh_for_each_in_bucket in neighbour.h, to help iterate over the neighbour table more succinctly. Signed-off-by: Gilad Naaman Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241107160444.2913124-3-gnaaman@drivenets.com Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 0402447854c7..4b9068c5e668 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -277,6 +277,12 @@ static inline void *neighbour_priv(const struct neighbour *n) extern const struct nla_policy nda_policy[]; +#define neigh_for_each_in_bucket(pos, head) hlist_for_each_entry(pos, head, hash) +#define neigh_for_each_in_bucket_rcu(pos, head) \ + hlist_for_each_entry_rcu(pos, head, hash) +#define neigh_for_each_in_bucket_safe(pos, tmp, head) \ + hlist_for_each_entry_safe(pos, tmp, head, hash) + static inline bool neigh_key_eq32(const struct neighbour *n, const void *pkey) { return *(const u32 *)n->primary_key == *(const u32 *)pkey; -- cgit From 0e3bcb0f78a0ca7cfdb7906dc79d922e19ba09b5 Mon Sep 17 00:00:00 2001 From: Gilad Naaman Date: Thu, 7 Nov 2024 16:04:41 +0000 Subject: neighbour: Convert iteration to use hlist+macro Remove all usage of the bare neighbour::next pointer, replacing them with neighbour::hash and its for_each macro. Signed-off-by: Gilad Naaman Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241107160444.2913124-5-gnaaman@drivenets.com Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 4b9068c5e668..94cf4f8c118f 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -311,12 +311,9 @@ static inline struct neighbour *___neigh_lookup_noref( u32 hash_val; hash_val = hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift); - for (n = rcu_dereference(nht->hash_buckets[hash_val]); - n != NULL; - n = rcu_dereference(n->next)) { + neigh_for_each_in_bucket_rcu(n, &nht->hash_heads[hash_val]) if (n->dev == dev && key_eq(n, pkey)) return n; - } return NULL; } -- cgit From a01a67ab2fffa7458354f0a666a6d550fa2b82fc Mon Sep 17 00:00:00 2001 From: Gilad Naaman Date: Thu, 7 Nov 2024 16:04:42 +0000 Subject: neighbour: Remove bare neighbour::next pointer Remove the now-unused neighbour::next pointer, leaving struct neighbour solely with the hlist_node implementation. Signed-off-by: Gilad Naaman Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241107160444.2913124-6-gnaaman@drivenets.com Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 94cf4f8c118f..40aac1e24c68 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -135,7 +135,6 @@ struct neigh_statistics { #define NEIGH_CACHE_STAT_INC(tbl, field) this_cpu_inc((tbl)->stats->field) struct neighbour { - struct neighbour __rcu *next; struct hlist_node hash; struct neigh_table *tbl; struct neigh_parms *parms; @@ -191,7 +190,6 @@ struct pneigh_entry { #define NEIGH_NUM_HASH_RND 4 struct neigh_hash_table { - struct neighbour __rcu **hash_buckets; struct hlist_head *hash_heads; unsigned int hash_shift; __u32 hash_rnd[NEIGH_NUM_HASH_RND]; @@ -354,7 +352,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid); void __neigh_set_probe_once(struct neighbour *neigh); -bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl); +bool neigh_remove_one(struct neighbour *ndel); void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev); int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev); int neigh_carrier_down(struct neigh_table *tbl, struct net_device *dev); -- cgit From f7f52738637f4361c108cad36e23ee98959a9006 Mon Sep 17 00:00:00 2001 From: Gilad Naaman Date: Thu, 7 Nov 2024 16:04:43 +0000 Subject: neighbour: Create netdev->neighbour association Create a mapping between a netdev and its neighoburs, allowing for much cheaper flushes. Signed-off-by: Gilad Naaman Reviewed-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241107160444.2913124-7-gnaaman@drivenets.com Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 9 ++------- include/net/neighbour_tables.h | 12 ++++++++++++ 2 files changed, 14 insertions(+), 7 deletions(-) create mode 100644 include/net/neighbour_tables.h (limited to 'include/net') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 40aac1e24c68..9a832cab5b1d 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -29,6 +29,7 @@ #include #include #include +#include /* * NUD stands for "neighbor unreachability detection" @@ -136,6 +137,7 @@ struct neigh_statistics { struct neighbour { struct hlist_node hash; + struct hlist_node dev_list; struct neigh_table *tbl; struct neigh_parms *parms; unsigned long confirmed; @@ -236,13 +238,6 @@ struct neigh_table { struct pneigh_entry **phash_buckets; }; -enum { - NEIGH_ARP_TABLE = 0, - NEIGH_ND_TABLE = 1, - NEIGH_NR_TABLES, - NEIGH_LINK_TABLE = NEIGH_NR_TABLES /* Pseudo table for neigh_xmit */ -}; - static inline int neigh_parms_family(struct neigh_parms *p) { return p->tbl->family; diff --git a/include/net/neighbour_tables.h b/include/net/neighbour_tables.h new file mode 100644 index 000000000000..bcffbe8f7601 --- /dev/null +++ b/include/net/neighbour_tables.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NET_NEIGHBOUR_TABLES_H +#define _NET_NEIGHBOUR_TABLES_H + +enum { + NEIGH_ARP_TABLE = 0, + NEIGH_ND_TABLE = 1, + NEIGH_NR_TABLES, + NEIGH_LINK_TABLE = NEIGH_NR_TABLES /* Pseudo table for neigh_xmit */ +}; + +#endif -- cgit From 7f4b3960e54faec72132a71da4a84a8e2a0b9037 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 8 Nov 2024 11:41:44 +0100 Subject: net: netlink: add nla_get_*_default() accessors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are quite a number of places that use patterns such as if (attr) val = nla_get_u16(attr); else val = DEFAULT; Add nla_get_u16_default() and friends like that to not have to type this out all the time. Acked-by: Toke Høiland-Jørgensen Acked-by: Jakub Kicinski Signed-off-by: Johannes Berg Link: https://patch.msgid.link/20241108114145.acd2aadb03ac.I3df6aac71d38a5baa1c0a03d0c7e82d4395c030e@changeid Signed-off-by: Jakub Kicinski --- include/net/netlink.h | 262 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 262 insertions(+) (limited to 'include/net') diff --git a/include/net/netlink.h b/include/net/netlink.h index 2dc671c977ff..39eaa6be6ca8 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -142,6 +142,8 @@ * nla_get_flag(nla) return 1 if flag is true * nla_get_msecs(nla) get payload for a msecs attribute * + * The same functions also exist with _default(). + * * Attribute Misc: * nla_memcpy(dest, nla, count) copy attribute into memory * nla_memcmp(nla, data, size) compare attribute with memory area @@ -1695,6 +1697,20 @@ static inline u32 nla_get_u32(const struct nlattr *nla) return *(u32 *) nla_data(nla); } +/** + * nla_get_u32_default - return payload of u32 attribute or default + * @nla: u32 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline u32 nla_get_u32_default(const struct nlattr *nla, u32 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_u32(nla); +} + /** * nla_get_be32 - return payload of __be32 attribute * @nla: __be32 netlink attribute @@ -1704,6 +1720,21 @@ static inline __be32 nla_get_be32(const struct nlattr *nla) return *(__be32 *) nla_data(nla); } +/** + * nla_get_be32_default - return payload of be32 attribute or default + * @nla: __be32 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline __be32 nla_get_be32_default(const struct nlattr *nla, + __be32 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_be32(nla); +} + /** * nla_get_le32 - return payload of __le32 attribute * @nla: __le32 netlink attribute @@ -1713,6 +1744,21 @@ static inline __le32 nla_get_le32(const struct nlattr *nla) return *(__le32 *) nla_data(nla); } +/** + * nla_get_le32_default - return payload of le32 attribute or default + * @nla: __le32 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline __le32 nla_get_le32_default(const struct nlattr *nla, + __le32 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_le32(nla); +} + /** * nla_get_u16 - return payload of u16 attribute * @nla: u16 netlink attribute @@ -1722,6 +1768,20 @@ static inline u16 nla_get_u16(const struct nlattr *nla) return *(u16 *) nla_data(nla); } +/** + * nla_get_u16_default - return payload of u16 attribute or default + * @nla: u16 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline u16 nla_get_u16_default(const struct nlattr *nla, u16 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_u16(nla); +} + /** * nla_get_be16 - return payload of __be16 attribute * @nla: __be16 netlink attribute @@ -1731,6 +1791,21 @@ static inline __be16 nla_get_be16(const struct nlattr *nla) return *(__be16 *) nla_data(nla); } +/** + * nla_get_be16_default - return payload of be16 attribute or default + * @nla: __be16 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline __be16 nla_get_be16_default(const struct nlattr *nla, + __be16 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_be16(nla); +} + /** * nla_get_le16 - return payload of __le16 attribute * @nla: __le16 netlink attribute @@ -1740,6 +1815,21 @@ static inline __le16 nla_get_le16(const struct nlattr *nla) return *(__le16 *) nla_data(nla); } +/** + * nla_get_le16_default - return payload of le16 attribute or default + * @nla: __le16 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline __le16 nla_get_le16_default(const struct nlattr *nla, + __le16 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_le16(nla); +} + /** * nla_get_u8 - return payload of u8 attribute * @nla: u8 netlink attribute @@ -1749,6 +1839,20 @@ static inline u8 nla_get_u8(const struct nlattr *nla) return *(u8 *) nla_data(nla); } +/** + * nla_get_u8_default - return payload of u8 attribute or default + * @nla: u8 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline u8 nla_get_u8_default(const struct nlattr *nla, u8 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_u8(nla); +} + /** * nla_get_u64 - return payload of u64 attribute * @nla: u64 netlink attribute @@ -1762,6 +1866,20 @@ static inline u64 nla_get_u64(const struct nlattr *nla) return tmp; } +/** + * nla_get_u64_default - return payload of u64 attribute or default + * @nla: u64 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline u64 nla_get_u64_default(const struct nlattr *nla, u64 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_u64(nla); +} + /** * nla_get_uint - return payload of uint attribute * @nla: uint netlink attribute @@ -1773,6 +1891,20 @@ static inline u64 nla_get_uint(const struct nlattr *nla) return nla_get_u64(nla); } +/** + * nla_get_uint_default - return payload of uint attribute or default + * @nla: uint netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline u64 nla_get_uint_default(const struct nlattr *nla, u64 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_uint(nla); +} + /** * nla_get_be64 - return payload of __be64 attribute * @nla: __be64 netlink attribute @@ -1786,6 +1918,21 @@ static inline __be64 nla_get_be64(const struct nlattr *nla) return tmp; } +/** + * nla_get_be64_default - return payload of be64 attribute or default + * @nla: __be64 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline __be64 nla_get_be64_default(const struct nlattr *nla, + __be64 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_be64(nla); +} + /** * nla_get_le64 - return payload of __le64 attribute * @nla: __le64 netlink attribute @@ -1795,6 +1942,21 @@ static inline __le64 nla_get_le64(const struct nlattr *nla) return *(__le64 *) nla_data(nla); } +/** + * nla_get_le64_default - return payload of le64 attribute or default + * @nla: __le64 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline __le64 nla_get_le64_default(const struct nlattr *nla, + __le64 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_le64(nla); +} + /** * nla_get_s32 - return payload of s32 attribute * @nla: s32 netlink attribute @@ -1804,6 +1966,20 @@ static inline s32 nla_get_s32(const struct nlattr *nla) return *(s32 *) nla_data(nla); } +/** + * nla_get_s32_default - return payload of s32 attribute or default + * @nla: s32 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline s32 nla_get_s32_default(const struct nlattr *nla, s32 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_s32(nla); +} + /** * nla_get_s16 - return payload of s16 attribute * @nla: s16 netlink attribute @@ -1813,6 +1989,20 @@ static inline s16 nla_get_s16(const struct nlattr *nla) return *(s16 *) nla_data(nla); } +/** + * nla_get_s16_default - return payload of s16 attribute or default + * @nla: s16 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline s16 nla_get_s16_default(const struct nlattr *nla, s16 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_s16(nla); +} + /** * nla_get_s8 - return payload of s8 attribute * @nla: s8 netlink attribute @@ -1822,6 +2012,20 @@ static inline s8 nla_get_s8(const struct nlattr *nla) return *(s8 *) nla_data(nla); } +/** + * nla_get_s8_default - return payload of s8 attribute or default + * @nla: s8 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline s8 nla_get_s8_default(const struct nlattr *nla, s8 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_s8(nla); +} + /** * nla_get_s64 - return payload of s64 attribute * @nla: s64 netlink attribute @@ -1835,6 +2039,20 @@ static inline s64 nla_get_s64(const struct nlattr *nla) return tmp; } +/** + * nla_get_s64_default - return payload of s64 attribute or default + * @nla: s64 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline s64 nla_get_s64_default(const struct nlattr *nla, s64 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_s64(nla); +} + /** * nla_get_sint - return payload of uint attribute * @nla: uint netlink attribute @@ -1846,6 +2064,20 @@ static inline s64 nla_get_sint(const struct nlattr *nla) return nla_get_s64(nla); } +/** + * nla_get_sint_default - return payload of sint attribute or default + * @nla: sint netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline s64 nla_get_sint_default(const struct nlattr *nla, s64 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_sint(nla); +} + /** * nla_get_flag - return payload of flag attribute * @nla: flag netlink attribute @@ -1868,6 +2100,21 @@ static inline unsigned long nla_get_msecs(const struct nlattr *nla) return msecs_to_jiffies((unsigned long) msecs); } +/** + * nla_get_msecs_default - return payload of msecs attribute or default + * @nla: msecs netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline unsigned long nla_get_msecs_default(const struct nlattr *nla, + unsigned long defvalue) +{ + if (!nla) + return defvalue; + return nla_get_msecs(nla); +} + /** * nla_get_in_addr - return payload of IPv4 address attribute * @nla: IPv4 address netlink attribute @@ -1877,6 +2124,21 @@ static inline __be32 nla_get_in_addr(const struct nlattr *nla) return *(__be32 *) nla_data(nla); } +/** + * nla_get_in_addr_default - return payload of be32 attribute or default + * @nla: IPv4 address netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline __be32 nla_get_in_addr_default(const struct nlattr *nla, + __be32 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_in_addr(nla); +} + /** * nla_get_in6_addr - return payload of IPv6 address attribute * @nla: IPv6 address netlink attribute -- cgit From d5ec8d91f82ef78405b506737952dec8af95a95b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 7 Nov 2024 16:48:14 -0800 Subject: rtnetlink: Remove __rtnl_link_unregister(). rtnl_link_unregister() holds RTNL and calls __rtnl_link_unregister(), where we call synchronize_srcu() to wait inflight RTM_NEWLINK requests for per-netns RTNL. We put synchronize_srcu() in __rtnl_link_unregister() due to ifb.ko and dummy.ko. However, rtnl_newlink() will acquire SRCU before RTNL later in this series. Then, lockdep will detect the deadlock: rtnl_link_unregister() rtnl_newlink() ---- ---- lock(rtnl_mutex); lock(&ops->srcu); lock(rtnl_mutex); sync(&ops->srcu); To avoid the problem, we must call synchronize_srcu() before RTNL in rtnl_link_unregister(). As a preparation, let's remove __rtnl_link_unregister(). Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241108004823.29419-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/rtnetlink.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index b260c0cc9671..3ebfcc6e56fd 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -165,7 +165,6 @@ struct rtnl_link_ops { }; int __rtnl_link_register(struct rtnl_link_ops *ops); -void __rtnl_link_unregister(struct rtnl_link_ops *ops); int rtnl_link_register(struct rtnl_link_ops *ops); void rtnl_link_unregister(struct rtnl_link_ops *ops); -- cgit From 6b57ff21a3109b1dba2d286ff415463e6fb1fca3 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 7 Nov 2024 16:48:15 -0800 Subject: rtnetlink: Protect link_ops by mutex. rtnl_link_unregister() holds RTNL and calls synchronize_srcu(), but rtnl_newlink() will acquire SRCU frist and then RTNL. Then, we need to unlink ops and call synchronize_srcu() outside of RTNL to avoid the deadlock. rtnl_link_unregister() rtnl_newlink() ---- ---- lock(rtnl_mutex); lock(&ops->srcu); lock(rtnl_mutex); sync(&ops->srcu); Let's move as such and add a mutex to protect link_ops. Now, link_ops is protected by its dedicated mutex and rtnl_link_register() no longer needs to hold RTNL. While at it, we move the initialisation of ops->dellink and ops->srcu out of the mutex scope. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241108004823.29419-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/rtnetlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 3ebfcc6e56fd..7559020f760c 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -71,7 +71,7 @@ static inline int rtnl_msg_family(const struct nlmsghdr *nlh) /** * struct rtnl_link_ops - rtnetlink link operations * - * @list: Used internally, protected by RTNL and SRCU + * @list: Used internally, protected by link_ops_mutex and SRCU * @srcu: Used internally * @kind: Identifier * @netns_refund: Physical device, move to init_net on netns exit -- cgit From 68297dbb967f87c3c92af9d2f652270f57c547c7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 7 Nov 2024 16:48:16 -0800 Subject: rtnetlink: Remove __rtnl_link_register() link_ops is protected by link_ops_mutex and no longer needs RTNL, so we have no reason to have __rtnl_link_register() separately. Let's remove it and call rtnl_link_register() from ifb.ko and dummy.ko. Note that both modules' init() work on init_net only, so we need not export pernet_ops_rwsem and can use rtnl_net_lock() there. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241108004823.29419-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/rtnetlink.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 7559020f760c..ef7c11f0d74c 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -164,8 +164,6 @@ struct rtnl_link_ops { int *prividx, int attr); }; -int __rtnl_link_register(struct rtnl_link_ops *ops); - int rtnl_link_register(struct rtnl_link_ops *ops); void rtnl_link_unregister(struct rtnl_link_ops *ops); -- cgit From 28690e5361c05fd4ef0ca3a17d1c667cba790554 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 7 Nov 2024 16:48:18 -0800 Subject: rtnetlink: Add peer_type in struct rtnl_link_ops. In ops->newlink(), veth, vxcan, and netkit call rtnl_link_get_net() with a net pointer, which is the first argument of ->newlink(). rtnl_link_get_net() could return another netns based on IFLA_NET_NS_PID and IFLA_NET_NS_FD in the peer device's attributes. We want to get it and fill rtnl_nets->nets[] in advance in rtnl_newlink() for per-netns RTNL. All of the three get the peer netns in the same way: 1. Call rtnl_nla_parse_ifinfomsg() 2. Call ops->validate() (vxcan doesn't have) 3. Call rtnl_link_get_net_tb() Let's add a new field peer_type to struct rtnl_link_ops and prefetch netns in the peer ifla to add it to rtnl_nets in rtnl_newlink(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20241108004823.29419-6-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/rtnetlink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index ef7c11f0d74c..bef76abcff8d 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -75,6 +75,7 @@ static inline int rtnl_msg_family(const struct nlmsghdr *nlh) * @srcu: Used internally * @kind: Identifier * @netns_refund: Physical device, move to init_net on netns exit + * @peer_type: Peer device specific netlink attribute number (e.g. VETH_INFO_PEER) * @maxtype: Highest device specific netlink attribute number * @policy: Netlink policy for device specific attribute validation * @validate: Optional validation function for netlink/changelink parameters @@ -116,6 +117,7 @@ struct rtnl_link_ops { void (*setup)(struct net_device *dev); bool netns_refund; + const u16 peer_type; unsigned int maxtype; const struct nla_policy *policy; int (*validate)(struct nlattr *tb[], -- cgit From 636af13f213bf9b28a34254327934bc72a797754 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 7 Nov 2024 16:48:23 -0800 Subject: rtnetlink: Register rtnl_dellink() and rtnl_setlink() with RTNL_FLAG_DOIT_PERNET_WIP. Currently, rtnl_setlink() and rtnl_dellink() cannot be fully converted to per-netns RTNL due to a lack of handling peer/lower/upper devices in different netns. For example, when we change a device in rtnl_setlink() and need to propagate that to its upper devices, we want to avoid acquiring all netns locks, for which we do not know the upper limit. The same situation happens when we remove a device. rtnl_dellink() could be transformed to remove a single device in the requested netns and delegate other devices to per-netns work, and rtnl_setlink() might be ? Until we come up with a better idea, let's use a new flag RTNL_FLAG_DOIT_PERNET_WIP for rtnl_dellink() and rtnl_setlink(). This will unblock converting RTNL users where such devices are not related. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Nikolay Aleksandrov Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241108004823.29419-11-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/rtnetlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index bef76abcff8d..bc0069a8b6ea 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -13,6 +13,7 @@ typedef int (*rtnl_dumpit_func)(struct sk_buff *, struct netlink_callback *); enum rtnl_link_flags { RTNL_FLAG_DOIT_UNLOCKED = BIT(0), #define RTNL_FLAG_DOIT_PERNET RTNL_FLAG_DOIT_UNLOCKED +#define RTNL_FLAG_DOIT_PERNET_WIP RTNL_FLAG_DOIT_UNLOCKED RTNL_FLAG_BULK_DEL_SUPPORTED = BIT(1), RTNL_FLAG_DUMP_UNLOCKED = BIT(2), RTNL_FLAG_DUMP_SPLIT_NLM_DONE = BIT(3), /* legacy behavior */ -- cgit From 3fcbecbdeb048dfd1bea824f4276717fed02d10e Mon Sep 17 00:00:00 2001 From: Martin Karsten Date: Sat, 9 Nov 2024 05:02:32 +0000 Subject: net: Add control functions for irq suspension The napi_suspend_irqs routine bootstraps irq suspension by elongating the defer timeout to irq_suspend_timeout. The napi_resume_irqs routine effectively cancels irq suspension by forcing the napi to be scheduled immediately. Signed-off-by: Martin Karsten Co-developed-by: Joe Damato Signed-off-by: Joe Damato Tested-by: Joe Damato Tested-by: Martin Karsten Acked-by: Stanislav Fomichev Reviewed-by: Sridhar Samudrala Link: https://patch.msgid.link/20241109050245.191288-3-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- include/net/busy_poll.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index f03040baaefd..c858270141bc 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -52,6 +52,9 @@ void napi_busy_loop_rcu(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, bool prefer_busy_poll, u16 budget); +void napi_suspend_irqs(unsigned int napi_id); +void napi_resume_irqs(unsigned int napi_id); + #else /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long net_busy_loop_on(void) { -- cgit From 37653a0b8a6f5d6ab23daa8e585c5ed24a0fc500 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Nov 2024 20:55:53 +0800 Subject: net: ip: make fib_validate_source() support drop reasons In this commit, we make fib_validate_source() and __fib_validate_source() return -reason instead of errno on error. The return value of fib_validate_source can be -errno, 0, and 1. It's hard to make fib_validate_source() return drop reasons directly. The fib_validate_source() will return 1 if the scope of the source(revert) route is HOST. And the __mkroute_input() will mark the skb with IPSKB_DOREDIRECT in this case (combine with some other conditions). And then, a REDIRECT ICMP will be sent in ip_forward() if this flag exists. We can't pass this information to __mkroute_input if we make fib_validate_source() return drop reasons. Therefore, we introduce the wrapper fib_validate_source_reason() for fib_validate_source(), which will return the drop reasons on error. In the origin logic, LINUX_MIB_IPRPFILTER will be counted if fib_validate_source() return -EXDEV. And now, we need to adjust it by checking "reason == SKB_DROP_REASON_IP_RPFILTER". However, this will take effect only after the patch "net: ip: make ip_route_input_noref() return drop reasons", as we can't pass the drop reasons from fib_validate_source() to ip_rcv_finish_core() in this patch. Following new drop reasons are added in this patch: SKB_DROP_REASON_IP_LOCAL_SOURCE SKB_DROP_REASON_IP_INVALID_SOURCE Signed-off-by: Menglong Dong Signed-off-by: Paolo Abeni --- include/net/dropreason-core.h | 10 ++++++++++ include/net/ip_fib.h | 12 ++++++++++++ 2 files changed, 22 insertions(+) (limited to 'include/net') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index d59bb96c5a02..62a60be1db84 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -76,6 +76,8 @@ FN(INVALID_PROTO) \ FN(IP_INADDRERRORS) \ FN(IP_INNOROUTES) \ + FN(IP_LOCAL_SOURCE) \ + FN(IP_INVALID_SOURCE) \ FN(PKT_TOO_BIG) \ FN(DUP_FRAG) \ FN(FRAG_REASM_TIMEOUT) \ @@ -373,6 +375,14 @@ enum skb_drop_reason { * IPSTATS_MIB_INADDRERRORS */ SKB_DROP_REASON_IP_INNOROUTES, + /** @SKB_DROP_REASON_IP_LOCAL_SOURCE: the source ip is local */ + SKB_DROP_REASON_IP_LOCAL_SOURCE, + /** + * @SKB_DROP_REASON_IP_INVALID_SOURCE: the source ip is invalid: + * 1) source ip is multicast or limited broadcast + * 2) source ip is zero and not IGMP + */ + SKB_DROP_REASON_IP_INVALID_SOURCE, /** * @SKB_DROP_REASON_PKT_TOO_BIG: packet size is too big (maybe exceed the * MTU) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index b6e44f4eaa4c..a113c11ab56b 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -452,6 +452,18 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, dscp_t dscp, int oif, struct net_device *dev, struct in_device *idev, u32 *itag); +static inline enum skb_drop_reason +fib_validate_source_reason(struct sk_buff *skb, __be32 src, __be32 dst, + dscp_t dscp, int oif, struct net_device *dev, + struct in_device *idev, u32 *itag) +{ + int err = fib_validate_source(skb, src, dst, dscp, oif, dev, idev, + itag); + if (err < 0) + return -err; + return SKB_NOT_DROPPED_YET; +} + #ifdef CONFIG_IP_ROUTE_CLASSID static inline int fib_num_tclassid_users(struct net *net) { -- cgit From d46f827016d891dbc234cb05c406180f77fb3b2d Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Nov 2024 20:55:55 +0800 Subject: net: ip: make ip_mc_validate_source() return drop reason Make ip_mc_validate_source() return drop reason, and adjust the call of it in ip_route_input_mc(). Another caller of it is ip_rcv_finish_core->udp_v4_early_demux, and the errno is not checked in detail, so we don't do more adjustment for it. The drop reason "SKB_DROP_REASON_IP_LOCALNET" is added in this commit. Signed-off-by: Menglong Dong Signed-off-by: Paolo Abeni --- include/net/dropreason-core.h | 3 +++ include/net/route.h | 7 ++++--- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 62a60be1db84..a2a1fb90e0e5 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -78,6 +78,7 @@ FN(IP_INNOROUTES) \ FN(IP_LOCAL_SOURCE) \ FN(IP_INVALID_SOURCE) \ + FN(IP_LOCALNET) \ FN(PKT_TOO_BIG) \ FN(DUP_FRAG) \ FN(FRAG_REASM_TIMEOUT) \ @@ -383,6 +384,8 @@ enum skb_drop_reason { * 2) source ip is zero and not IGMP */ SKB_DROP_REASON_IP_INVALID_SOURCE, + /** @SKB_DROP_REASON_IP_LOCALNET: source or dest ip is local net */ + SKB_DROP_REASON_IP_LOCALNET, /** * @SKB_DROP_REASON_PKT_TOO_BIG: packet size is too big (maybe exceed the * MTU) diff --git a/include/net/route.h b/include/net/route.h index 0a690adfdff5..e2e1922c58fb 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -199,9 +199,10 @@ static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4 return ip_route_output_key(net, fl4); } -int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, - dscp_t dscp, struct net_device *dev, - struct in_device *in_dev, u32 *itag); +enum skb_drop_reason +ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev, + struct in_device *in_dev, u32 *itag); int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev); int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, -- cgit From 5b92112acd8e2ed84a4df653fc20575f4da6fa49 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Nov 2024 20:55:56 +0800 Subject: net: ip: make ip_route_input_slow() return drop reasons In this commit, we make ip_route_input_slow() return skb drop reasons, and following new skb drop reasons are added: SKB_DROP_REASON_IP_INVALID_DEST The only caller of ip_route_input_slow() is ip_route_input_rcu(), and we adjust it by making it return -EINVAL on error. Signed-off-by: Menglong Dong Signed-off-by: Paolo Abeni --- include/net/dropreason-core.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index a2a1fb90e0e5..74624d369d48 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -79,6 +79,7 @@ FN(IP_LOCAL_SOURCE) \ FN(IP_INVALID_SOURCE) \ FN(IP_LOCALNET) \ + FN(IP_INVALID_DEST) \ FN(PKT_TOO_BIG) \ FN(DUP_FRAG) \ FN(FRAG_REASM_TIMEOUT) \ @@ -386,6 +387,11 @@ enum skb_drop_reason { SKB_DROP_REASON_IP_INVALID_SOURCE, /** @SKB_DROP_REASON_IP_LOCALNET: source or dest ip is local net */ SKB_DROP_REASON_IP_LOCALNET, + /** + * @SKB_DROP_REASON_IP_INVALID_DEST: the dest ip is invalid: + * 1) dest ip is 0 + */ + SKB_DROP_REASON_IP_INVALID_DEST, /** * @SKB_DROP_REASON_PKT_TOO_BIG: packet size is too big (maybe exceed the * MTU) -- cgit From 82d9983ebeb871cb5abd27c12a950c14c68772e1 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Nov 2024 20:55:58 +0800 Subject: net: ip: make ip_route_input_noref() return drop reasons In this commit, we make ip_route_input_noref() return drop reasons, which come from ip_route_input_rcu(). We need adjust the callers of ip_route_input_noref() to make sure the return value of ip_route_input_noref() is used properly. The errno that ip_route_input_noref() returns comes from ip_route_input and bpf_lwt_input_reroute in the origin logic, and we make them return -EINVAL on error instead. In the following patch, we will make ip_route_input() returns drop reasons too. Signed-off-by: Menglong Dong Signed-off-by: Paolo Abeni --- include/net/route.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/route.h b/include/net/route.h index e2e1922c58fb..b85ffa3e042b 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -203,8 +203,9 @@ enum skb_drop_reason ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, struct in_device *in_dev, u32 *itag); -int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, - dscp_t dscp, struct net_device *dev); +enum skb_drop_reason +ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev); int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, const struct sk_buff *hint); @@ -212,18 +213,18 @@ int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, dscp_t dscp, struct net_device *devin) { - int err; + enum skb_drop_reason reason; rcu_read_lock(); - err = ip_route_input_noref(skb, dst, src, dscp, devin); - if (!err) { + reason = ip_route_input_noref(skb, dst, src, dscp, devin); + if (!reason) { skb_dst_force(skb); if (!skb_dst(skb)) - err = -EINVAL; + reason = SKB_DROP_REASON_NOT_SPECIFIED; } rcu_read_unlock(); - return err; + return reason ? -EINVAL : 0; } void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, int oif, -- cgit From 50038bf38e6577a15d52b890d82c197cf3b163a0 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Nov 2024 20:55:59 +0800 Subject: net: ip: make ip_route_input() return drop reasons In this commit, we make ip_route_input() return skb drop reasons that come from ip_route_input_noref(). Meanwhile, adjust all the call to it. Signed-off-by: Menglong Dong Signed-off-by: Paolo Abeni --- include/net/route.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/route.h b/include/net/route.h index b85ffa3e042b..fb3433dc9c72 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -210,8 +210,9 @@ int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, const struct sk_buff *hint); -static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, - dscp_t dscp, struct net_device *devin) +static inline enum skb_drop_reason +ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, dscp_t dscp, + struct net_device *devin) { enum skb_drop_reason reason; @@ -224,7 +225,7 @@ static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, } rcu_read_unlock(); - return reason ? -EINVAL : 0; + return reason; } void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, int oif, -- cgit From d9340d1e02779dbf83d53b0deb4068c7768b8261 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Nov 2024 20:56:00 +0800 Subject: net: ip: make ip_mkroute_input/__mkroute_input return drop reasons In this commit, we make ip_mkroute_input() and __mkroute_input() return drop reasons. The drop reason "SKB_DROP_REASON_ARP_PVLAN_DISABLE" is introduced for the case: the packet which is not IP is forwarded to the in_dev, and the proxy_arp_pvlan is not enabled. Signed-off-by: Menglong Dong Signed-off-by: Paolo Abeni --- include/net/dropreason-core.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/net') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 74624d369d48..6c5a1ea209a2 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -104,6 +104,7 @@ FN(IP_TUNNEL_ECN) \ FN(TUNNEL_TXINFO) \ FN(LOCAL_MAC) \ + FN(ARP_PVLAN_DISABLE) \ FNe(MAX) /** @@ -477,6 +478,12 @@ enum skb_drop_reason { * the MAC address of the local netdev. */ SKB_DROP_REASON_LOCAL_MAC, + /** + * @SKB_DROP_REASON_ARP_PVLAN_DISABLE: packet which is not IP is + * forwarded to the in_dev, and the proxy_arp_pvlan is not + * enabled. + */ + SKB_DROP_REASON_ARP_PVLAN_DISABLE, /** * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which * shouldn't be used as a real 'reason' - only for tracing code gen -- cgit From 479aed04e84a5d66caa3b25bfc651292c153ef70 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Nov 2024 20:56:01 +0800 Subject: net: ip: make ip_route_use_hint() return drop reasons In this commit, we make ip_route_use_hint() return drop reasons. The drop reasons that we return are similar to what we do in ip_route_input_slow(), and no drop reasons are added in this commit. Signed-off-by: Menglong Dong Signed-off-by: Paolo Abeni --- include/net/route.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/route.h b/include/net/route.h index fb3433dc9c72..84cb1e04f5cd 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -206,9 +206,10 @@ ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, enum skb_drop_reason ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev); -int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, - dscp_t dscp, struct net_device *dev, - const struct sk_buff *hint); +enum skb_drop_reason +ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev, + const struct sk_buff *hint); static inline enum skb_drop_reason ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, dscp_t dscp, -- cgit From 6b998404c71ecff9ebeed343c1ce21f9df3ef131 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Tue, 12 Nov 2024 21:36:29 +0100 Subject: net: simplify eeecfg_mac_can_tx_lpi Simplify the function. Signed-off-by: Heiner Kallweit Reviewed-by: Russell King (Oracle) Link: https://patch.msgid.link/f9a4623b-b94c-466c-8733-62057c6d9a17@gmail.com Signed-off-by: Jakub Kicinski --- include/net/eee.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/eee.h b/include/net/eee.h index 84837aba3cd9..cfab1b8bc46a 100644 --- a/include/net/eee.h +++ b/include/net/eee.h @@ -13,10 +13,7 @@ struct eee_config { static inline bool eeecfg_mac_can_tx_lpi(const struct eee_config *eeecfg) { /* eee_enabled is the master on/off */ - if (!eeecfg->eee_enabled || !eeecfg->tx_lpi_enabled) - return false; - - return true; + return eeecfg->eee_enabled && eeecfg->tx_lpi_enabled; } static inline void eeecfg_to_eee(struct ethtool_keee *eee, -- cgit From a8ee6b900c147d3bedced6c52ba6cb603226aaa3 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 13 Nov 2024 16:35:50 +0100 Subject: netfilter: nf_tables: prepare for multiple elements in nft_trans_elem structure Add helpers to release the individual elements contained in the trans_elem container structure. No functional change intended. Followup patch will add 'nelems' member and will turn 'priv' into a flexible array. These helpers can then loop over all elements. Care needs to be taken to handle a mix of new elements and existing elements that are being updated (e.g. timeout refresh). Before this patch, NEWSETELEM transaction with update is released early so nft_trans_set_elem_destroy() won't get called, so we need to skip elements marked as update. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index f24278767bfd..37af0b174c39 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1759,28 +1759,25 @@ enum nft_trans_elem_flags { NFT_TRANS_UPD_EXPIRATION = (1 << 1), }; -struct nft_trans_elem { - struct nft_trans nft_trans; - struct nft_set *set; - struct nft_elem_priv *elem_priv; +struct nft_trans_one_elem { + struct nft_elem_priv *priv; u64 timeout; u64 expiration; u8 update_flags; +}; + +struct nft_trans_elem { + struct nft_trans nft_trans; + struct nft_set *set; bool bound; + unsigned int nelems; + struct nft_trans_one_elem elems[] __counted_by(nelems); }; #define nft_trans_container_elem(t) \ container_of(t, struct nft_trans_elem, nft_trans) #define nft_trans_elem_set(trans) \ nft_trans_container_elem(trans)->set -#define nft_trans_elem_priv(trans) \ - nft_trans_container_elem(trans)->elem_priv -#define nft_trans_elem_update_flags(trans) \ - nft_trans_container_elem(trans)->update_flags -#define nft_trans_elem_timeout(trans) \ - nft_trans_container_elem(trans)->timeout -#define nft_trans_elem_expiration(trans) \ - nft_trans_container_elem(trans)->expiration #define nft_trans_elem_set_bound(trans) \ nft_trans_container_elem(trans)->bound -- cgit From 508180850b732c7a0e3a728460cf3f95f25e1fbd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 13 Nov 2024 16:35:53 +0100 Subject: netfilter: nf_tables: allocate element update information dynamically Move the timeout/expire/flag members from nft_trans_one_elem struct into a dybamically allocated structure, only needed when timeout update was requested. This halves size of nft_trans_one_elem struct and allows to compact up to 124 elements in one transaction container rather than 62. This halves memory requirements for a large flush or insert transaction, where ->update remains NULL. Care has to be taken to release the extra data in all spots, including abort path. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 37af0b174c39..80a537ac26cd 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1759,11 +1759,15 @@ enum nft_trans_elem_flags { NFT_TRANS_UPD_EXPIRATION = (1 << 1), }; -struct nft_trans_one_elem { - struct nft_elem_priv *priv; +struct nft_elem_update { u64 timeout; u64 expiration; - u8 update_flags; + u8 flags; +}; + +struct nft_trans_one_elem { + struct nft_elem_priv *priv; + struct nft_elem_update *update; }; struct nft_trans_elem { -- cgit From 94464a7b71634037b13d54021e0dfd0fb0d8c1f0 Mon Sep 17 00:00:00 2001 From: Danil Pylaev Date: Mon, 21 Oct 2024 12:22:44 +0000 Subject: Bluetooth: Add new quirks for ATS2851 This adds quirks for broken extended create connection, and write auth payload timeout. Signed-off-by: Danil Pylaev Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 14 ++++++++++++++ include/net/bluetooth/hci_core.h | 10 ++++++---- 2 files changed, 20 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index bab1e3d7452a..4f64066915be 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -300,6 +300,20 @@ enum { */ HCI_QUIRK_BROKEN_SET_RPA_TIMEOUT, + /* + * When this quirk is set, the HCI_OP_LE_EXT_CREATE_CONN command is + * disabled. This is required for the Actions Semiconductor ATS2851 + * based controllers, which erroneously claims to support it. + */ + HCI_QUIRK_BROKEN_EXT_CREATE_CONN, + + /* + * When this quirk is set, the command WRITE_AUTH_PAYLOAD_TIMEOUT is + * skipped. This is required for the Actions Semiconductor ATS2851 + * based controllers, due to a race condition in pairing process. + */ + HCI_QUIRK_BROKEN_WRITE_AUTH_PAYLOAD_TIMEOUT, + /* When this quirk is set, MSFT extension monitor tracking by * address filter is supported. Since tracking quantity of each * pattern is limited, this feature supports tracking multiple diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 88265d37aa72..94ddc8684973 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1871,8 +1871,8 @@ void hci_conn_del_sysfs(struct hci_conn *conn); !test_bit(HCI_QUIRK_BROKEN_EXT_SCAN, &(dev)->quirks)) /* Use ext create connection if command is supported */ -#define use_ext_conn(dev) ((dev)->commands[37] & 0x80) - +#define use_ext_conn(dev) (((dev)->commands[37] & 0x80) && \ + !test_bit(HCI_QUIRK_BROKEN_EXT_CREATE_CONN, &(dev)->quirks)) /* Extended advertising support */ #define ext_adv_capable(dev) (((dev)->le_features[1] & HCI_LE_EXT_ADV)) @@ -1885,8 +1885,10 @@ void hci_conn_del_sysfs(struct hci_conn *conn); * C24: Mandatory if the LE Controller supports Connection State and either * LE Feature (LL Privacy) or LE Feature (Extended Advertising) is supported */ -#define use_enhanced_conn_complete(dev) (ll_privacy_capable(dev) || \ - ext_adv_capable(dev)) +#define use_enhanced_conn_complete(dev) ((ll_privacy_capable(dev) || \ + ext_adv_capable(dev)) && \ + !test_bit(HCI_QUIRK_BROKEN_EXT_CREATE_CONN, \ + &(dev)->quirks)) /* Periodic advertising support */ #define per_adv_capable(dev) (((dev)->le_features[1] & HCI_LE_PERIODIC_ADV)) -- cgit From 4a5e0ba68676b3a77298cf646cd2b39c94fbd2f5 Mon Sep 17 00:00:00 2001 From: Iulia Tanasescu Date: Fri, 1 Nov 2024 10:23:36 +0200 Subject: Bluetooth: ISO: Do not emit LE PA Create Sync if previous is pending The Bluetooth Core spec does not allow a LE PA Create sync command to be sent to Controller if another one is pending (Vol 4, Part E, page 2493). In order to avoid this issue, the HCI_CONN_CREATE_PA_SYNC was added to mark that the LE PA Create Sync command has been sent for a hcon. Once the PA Sync Established event is received, the hcon flag is erased and the next pending hcon is handled. Signed-off-by: Iulia Tanasescu Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 3 ++- include/net/bluetooth/hci_core.h | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 4f64066915be..4becf201b063 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1,7 +1,7 @@ /* BlueZ - Bluetooth protocol stack for Linux Copyright (C) 2000-2001 Qualcomm Incorporated - Copyright 2023 NXP + Copyright 2023-2024 NXP Written 2000,2001 by Maxim Krasnyansky @@ -697,6 +697,7 @@ enum { #define HCI_RSSI_INVALID 127 #define HCI_SYNC_HANDLE_INVALID 0xffff +#define HCI_SID_INVALID 0xff #define HCI_ROLE_MASTER 0x00 #define HCI_ROLE_SLAVE 0x01 diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 94ddc8684973..43474b751a50 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -668,6 +668,7 @@ struct hci_conn { __u8 adv_instance; __u16 handle; __u16 sync_handle; + __u8 sid; __u16 state; __u16 mtu; __u8 mode; @@ -947,6 +948,7 @@ enum { HCI_CONN_CREATE_CIS, HCI_CONN_BIG_SYNC, HCI_CONN_BIG_SYNC_FAILED, + HCI_CONN_CREATE_PA_SYNC, HCI_CONN_PA_SYNC, HCI_CONN_PA_SYNC_FAILED, }; @@ -1099,6 +1101,30 @@ static inline struct hci_conn *hci_conn_hash_lookup_bis(struct hci_dev *hdev, return NULL; } +static inline struct hci_conn *hci_conn_hash_lookup_sid(struct hci_dev *hdev, + __u8 sid, + bdaddr_t *dst, + __u8 dst_type) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type != ISO_LINK || bacmp(&c->dst, dst) || + c->dst_type != dst_type || c->sid != sid) + continue; + + rcu_read_unlock(); + return c; + } + + rcu_read_unlock(); + + return NULL; +} + static inline struct hci_conn * hci_conn_hash_lookup_per_adv_bis(struct hci_dev *hdev, bdaddr_t *ba, @@ -1328,6 +1354,13 @@ hci_conn_hash_lookup_pa_sync_handle(struct hci_dev *hdev, __u16 sync_handle) if (c->type != ISO_LINK) continue; + /* Ignore the listen hcon, we are looking + * for the child hcon that was created as + * a result of the PA sync established event. + */ + if (c->state == BT_LISTEN) + continue; + if (c->sync_handle == sync_handle) { rcu_read_unlock(); return c; @@ -1445,6 +1478,7 @@ bool hci_setup_sync(struct hci_conn *conn, __u16 handle); void hci_sco_setup(struct hci_conn *conn, __u8 status); bool hci_iso_setup_path(struct hci_conn *conn); int hci_le_create_cis_pending(struct hci_dev *hdev); +int hci_pa_create_sync_pending(struct hci_dev *hdev); int hci_conn_check_create_cis(struct hci_conn *conn); struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, -- cgit From 42ecf1947135110ea08abeaca39741636f9a2285 Mon Sep 17 00:00:00 2001 From: Iulia Tanasescu Date: Fri, 1 Nov 2024 10:23:38 +0200 Subject: Bluetooth: ISO: Do not emit LE BIG Create Sync if previous is pending The Bluetooth Core spec does not allow a LE BIG Create sync command to be sent to Controller if another one is pending (Vol 4, Part E, page 2586). In order to avoid this issue, the HCI_CONN_CREATE_BIG_SYNC was added to mark that the LE BIG Create Sync command has been sent for a hcon. Once the BIG Sync Established event is received, the hcon flag is erased and the next pending hcon is handled. Signed-off-by: Iulia Tanasescu Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 1 + include/net/bluetooth/hci_core.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 4becf201b063..5bb4eaa52e14 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -29,6 +29,7 @@ #define HCI_MAX_ACL_SIZE 1024 #define HCI_MAX_SCO_SIZE 255 #define HCI_MAX_ISO_SIZE 251 +#define HCI_MAX_ISO_BIS 31 #define HCI_MAX_EVENT_SIZE 260 #define HCI_MAX_FRAME_SIZE (HCI_MAX_ACL_SIZE + 4) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 43474b751a50..c95f7e6ba255 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -711,6 +711,9 @@ struct hci_conn { __s8 tx_power; __s8 max_tx_power; struct bt_iso_qos iso_qos; + __u8 num_bis; + __u8 bis[HCI_MAX_ISO_BIS]; + unsigned long flags; enum conn_reasons conn_reason; @@ -946,6 +949,7 @@ enum { HCI_CONN_PER_ADV, HCI_CONN_BIG_CREATED, HCI_CONN_CREATE_CIS, + HCI_CONN_CREATE_BIG_SYNC, HCI_CONN_BIG_SYNC, HCI_CONN_BIG_SYNC_FAILED, HCI_CONN_CREATE_PA_SYNC, @@ -1295,6 +1299,30 @@ static inline struct hci_conn *hci_conn_hash_lookup_big(struct hci_dev *hdev, return NULL; } +static inline struct hci_conn * +hci_conn_hash_lookup_big_sync_pend(struct hci_dev *hdev, + __u8 handle, __u8 num_bis) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type != ISO_LINK) + continue; + + if (handle == c->iso_qos.bcast.big && num_bis == c->num_bis) { + rcu_read_unlock(); + return c; + } + } + + rcu_read_unlock(); + + return NULL; +} + static inline struct hci_conn * hci_conn_hash_lookup_big_state(struct hci_dev *hdev, __u8 handle, __u16 state) { @@ -1479,6 +1507,7 @@ void hci_sco_setup(struct hci_conn *conn, __u8 status); bool hci_iso_setup_path(struct hci_conn *conn); int hci_le_create_cis_pending(struct hci_dev *hdev); int hci_pa_create_sync_pending(struct hci_dev *hdev); +int hci_le_big_create_sync_pending(struct hci_dev *hdev); int hci_conn_check_create_cis(struct hci_conn *conn); struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, -- cgit From 83d328a72eff3268ea4c19deb0a6cf4c7da15746 Mon Sep 17 00:00:00 2001 From: Iulia Tanasescu Date: Fri, 1 Nov 2024 10:23:39 +0200 Subject: Bluetooth: ISO: Update hci_conn_hash_lookup_big for Broadcast slave Currently, hci_conn_hash_lookup_big only checks for BIS master connections, by filtering out connections with the destination address set. This commit updates this function to also consider BIS slave connections, since it is also used for a Broadcast Receiver to set an available BIG handle before issuing the LE BIG Create Sync command. Signed-off-by: Iulia Tanasescu Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index c95f7e6ba255..ea798f07c5a2 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1285,7 +1285,17 @@ static inline struct hci_conn *hci_conn_hash_lookup_big(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (bacmp(&c->dst, BDADDR_ANY) || c->type != ISO_LINK) + if (c->type != ISO_LINK) + continue; + + /* An ISO_LINK hcon with BDADDR_ANY as destination + * address is a Broadcast connection. A Broadcast + * slave connection is associated with a PA train, + * so the sync_handle can be used to differentiate + * from unicast. + */ + if (bacmp(&c->dst, BDADDR_ANY) && + c->sync_handle == HCI_SYNC_HANDLE_INVALID) continue; if (handle == c->iso_qos.bcast.big) { -- cgit From 96e7c4273560be4744ba8c444bc6969745315251 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 6 Nov 2024 09:53:45 -0500 Subject: Bluetooth: HCI: Add IPC(11) bus type Zephyr(1) has been using the same bus defines as Linux so tools likes of btmon, etc, are able to decode the bus used by the driver to transport HCI packets. Link: https://github.com/zephyrproject-rtos/zephyr/pull/80808 Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 5bb4eaa52e14..6203bd8663b7 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -68,6 +68,7 @@ #define HCI_I2C 8 #define HCI_SMD 9 #define HCI_VIRTIO 10 +#define HCI_IPC 11 /* HCI device quirks */ enum { -- cgit From 827af4787e74e8df9e8e0677a69fbb15e0856d2f Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 23 Oct 2024 16:55:57 -0400 Subject: Bluetooth: MGMT: Add initial implementation of MGMT_OP_HCI_CMD_SYNC This adds the initial implementation of MGMT_OP_HCI_CMD_SYNC as documented in mgmt-api (BlueZ tree): Send HCI command and wait for event Command =========================================== Command Code: 0x005B Controller Index: Command Parameters: Opcode (2 Octets) Event (1 Octet) Timeout (1 Octet) Parameter Length (2 Octets) Parameter (variable) Return Parameters: Response (1-variable Octets) This command may be used to send a HCI command and wait for an (optional) event. The HCI command is specified by the Opcode, any arbitrary is supported including vendor commands, but contrary to the like of Raw/User channel it is run as an HCI command send by the kernel since it uses its command synchronization thus it is possible to wait for a specific event as a response. Setting event to 0x00 will cause the command to wait for either HCI Command Status or HCI Command Complete. Timeout is specified in seconds, setting it to 0 will cause the default timeout to be used. Possible errors: Failed Invalid Parameters Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/mgmt.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index d382679efd2b..affac861efdc 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -878,6 +878,16 @@ struct mgmt_cp_mesh_send_cancel { } __packed; #define MGMT_MESH_SEND_CANCEL_SIZE 1 +#define MGMT_OP_HCI_CMD_SYNC 0x005B +struct mgmt_cp_hci_cmd_sync { + __le16 opcode; + __u8 event; + __u8 timeout; + __le16 params_len; + __u8 params[]; +} __packed; +#define MGMT_HCI_CMD_SYNC_SIZE 6 + #define MGMT_EV_CMD_COMPLETE 0x0001 struct mgmt_ev_cmd_complete { __le16 opcode; -- cgit From accdd51dc74ff65b7b7be1961b11723d228fbbbd Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Thu, 14 Nov 2024 18:52:04 +0800 Subject: net/udp: Add a new struct for hash2 slot Preparing for udp 4-tuple hash (uhash4 for short). To implement uhash4 without cache line missing when lookup, hslot2 is used to record the number of hashed sockets in hslot4. Thus adding a new struct udp_hslot_main with field hash4_cnt, which is used by hash2. The new struct is used to avoid doubling the size of udp_hslot. Before uhash4 lookup, firstly checking hash4_cnt to see if there are hashed sks in hslot4. Because hslot2 is always used in lookup, there is no cache line miss. Related helpers are updated, and use the helpers as possible. uhash4 is implemented in following patches. Signed-off-by: Philo Lu Acked-by: Willem de Bruijn Acked-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/udp.h | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/udp.h b/include/net/udp.h index 61222545ab1c..62a7207e65f2 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -50,7 +50,7 @@ struct udp_skb_cb { #define UDP_SKB_CB(__skb) ((struct udp_skb_cb *)((__skb)->cb)) /** - * struct udp_hslot - UDP hash slot + * struct udp_hslot - UDP hash slot used by udp_table.hash * * @head: head of list of sockets * @count: number of sockets in 'head' list @@ -60,7 +60,22 @@ struct udp_hslot { struct hlist_head head; int count; spinlock_t lock; -} __attribute__((aligned(2 * sizeof(long)))); +} __aligned(2 * sizeof(long)); + +/** + * struct udp_hslot_main - UDP hash slot used by udp_table.hash2 + * + * @hslot: basic hash slot + * @hash4_cnt: number of sockets in hslot4 of the same + * (local port, local address) + */ +struct udp_hslot_main { + struct udp_hslot hslot; /* must be the first member */ +#if !IS_ENABLED(CONFIG_BASE_SMALL) + u32 hash4_cnt; +#endif +} __aligned(2 * sizeof(long)); +#define UDP_HSLOT_MAIN(__hslot) ((struct udp_hslot_main *)(__hslot)) /** * struct udp_table - UDP table @@ -72,7 +87,7 @@ struct udp_hslot { */ struct udp_table { struct udp_hslot *hash; - struct udp_hslot *hash2; + struct udp_hslot_main *hash2; unsigned int mask; unsigned int log; }; @@ -84,6 +99,7 @@ static inline struct udp_hslot *udp_hashslot(struct udp_table *table, { return &table->hash[udp_hashfn(net, num, table->mask)]; } + /* * For secondary hash, net_hash_mix() is performed before calling * udp_hashslot2(), this explains difference with udp_hashslot() @@ -91,8 +107,22 @@ static inline struct udp_hslot *udp_hashslot(struct udp_table *table, static inline struct udp_hslot *udp_hashslot2(struct udp_table *table, unsigned int hash) { - return &table->hash2[hash & table->mask]; + return &table->hash2[hash & table->mask].hslot; +} + +#if IS_ENABLED(CONFIG_BASE_SMALL) +static inline void udp_table_hash4_init(struct udp_table *table) +{ +} +#else /* !CONFIG_BASE_SMALL */ + +/* Must be called with table->hash2 initialized */ +static inline void udp_table_hash4_init(struct udp_table *table) +{ + for (int i = 0; i <= table->mask; i++) + table->hash2[i].hash4_cnt = 0; } +#endif /* CONFIG_BASE_SMALL */ extern struct proto udp_prot; -- cgit From dab78a1745ab3c6001e1e4d50a9d09efef8e260d Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Thu, 14 Nov 2024 18:52:05 +0800 Subject: net/udp: Add 4-tuple hash list basis Add a new hash list, hash4, in udp table. It will be used to implement 4-tuple hash for connected udp sockets. This patch adds the hlist to table, and implements helpers and the initialization. 4-tuple hash is implemented in the following patch. hash4 uses hlist_nulls to avoid moving wrongly onto another hlist due to concurrent rehash, because rehash() can happen with lookup(). Co-developed-by: Cambda Zhu Signed-off-by: Cambda Zhu Co-developed-by: Fred Chen Signed-off-by: Fred Chen Co-developed-by: Yubing Qiu Signed-off-by: Yubing Qiu Signed-off-by: Philo Lu Acked-by: Willem de Bruijn Acked-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/udp.h | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 82 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/udp.h b/include/net/udp.h index 62a7207e65f2..edb669967130 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -50,14 +50,21 @@ struct udp_skb_cb { #define UDP_SKB_CB(__skb) ((struct udp_skb_cb *)((__skb)->cb)) /** - * struct udp_hslot - UDP hash slot used by udp_table.hash + * struct udp_hslot - UDP hash slot used by udp_table.hash/hash4 * * @head: head of list of sockets + * @nulls_head: head of list of sockets, only used by hash4 * @count: number of sockets in 'head' list * @lock: spinlock protecting changes to head/count */ struct udp_hslot { - struct hlist_head head; + union { + struct hlist_head head; + /* hash4 uses hlist_nulls to avoid moving wrongly onto another + * hlist, because rehash() can happen with lookup(). + */ + struct hlist_nulls_head nulls_head; + }; int count; spinlock_t lock; } __aligned(2 * sizeof(long)); @@ -82,12 +89,17 @@ struct udp_hslot_main { * * @hash: hash table, sockets are hashed on (local port) * @hash2: hash table, sockets are hashed on (local port, local address) + * @hash4: hash table, connected sockets are hashed on + * (local port, local address, remote port, remote address) * @mask: number of slots in hash tables, minus 1 * @log: log2(number of slots in hash table) */ struct udp_table { struct udp_hslot *hash; struct udp_hslot_main *hash2; +#if !IS_ENABLED(CONFIG_BASE_SMALL) + struct udp_hslot *hash4; +#endif unsigned int mask; unsigned int log; }; @@ -114,13 +126,80 @@ static inline struct udp_hslot *udp_hashslot2(struct udp_table *table, static inline void udp_table_hash4_init(struct udp_table *table) { } + +static inline struct udp_hslot *udp_hashslot4(struct udp_table *table, + unsigned int hash) +{ + BUILD_BUG(); + return NULL; +} + +static inline bool udp_hashed4(const struct sock *sk) +{ + return false; +} + +static inline unsigned int udp_hash4_slot_size(void) +{ + return 0; +} + +static inline bool udp_has_hash4(const struct udp_hslot *hslot2) +{ + return false; +} + +static inline void udp_hash4_inc(struct udp_hslot *hslot2) +{ +} + +static inline void udp_hash4_dec(struct udp_hslot *hslot2) +{ +} #else /* !CONFIG_BASE_SMALL */ /* Must be called with table->hash2 initialized */ static inline void udp_table_hash4_init(struct udp_table *table) { - for (int i = 0; i <= table->mask; i++) + table->hash4 = (void *)(table->hash2 + (table->mask + 1)); + for (int i = 0; i <= table->mask; i++) { table->hash2[i].hash4_cnt = 0; + + INIT_HLIST_NULLS_HEAD(&table->hash4[i].nulls_head, i); + table->hash4[i].count = 0; + spin_lock_init(&table->hash4[i].lock); + } +} + +static inline struct udp_hslot *udp_hashslot4(struct udp_table *table, + unsigned int hash) +{ + return &table->hash4[hash & table->mask]; +} + +static inline bool udp_hashed4(const struct sock *sk) +{ + return !hlist_nulls_unhashed(&udp_sk(sk)->udp_lrpa_node); +} + +static inline unsigned int udp_hash4_slot_size(void) +{ + return sizeof(struct udp_hslot); +} + +static inline bool udp_has_hash4(const struct udp_hslot *hslot2) +{ + return UDP_HSLOT_MAIN(hslot2)->hash4_cnt; +} + +static inline void udp_hash4_inc(struct udp_hslot *hslot2) +{ + UDP_HSLOT_MAIN(hslot2)->hash4_cnt++; +} + +static inline void udp_hash4_dec(struct udp_hslot *hslot2) +{ + UDP_HSLOT_MAIN(hslot2)->hash4_cnt--; } #endif /* CONFIG_BASE_SMALL */ -- cgit From 78c91ae2c6deb5d236a5a93ff2995cdd05514380 Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Thu, 14 Nov 2024 18:52:06 +0800 Subject: ipv4/udp: Add 4-tuple hash for connected socket Currently, the udp_table has two hash table, the port hash and portaddr hash. Usually for UDP servers, all sockets have the same local port and addr, so they are all on the same hash slot within a reuseport group. In some applications, UDP servers use connect() to manage clients. In particular, when firstly receiving from an unseen 4 tuple, a new socket is created and connect()ed to the remote addr:port, and then the fd is used exclusively by the client. Once there are connected sks in a reuseport group, udp has to score all sks in the same hash2 slot to find the best match. This could be inefficient with a large number of connections, resulting in high softirq overhead. To solve the problem, this patch implement 4-tuple hash for connected udp sockets. During connect(), hash4 slot is updated, as well as a corresponding counter, hash4_cnt, in hslot2. In __udp4_lib_lookup(), hslot4 will be searched firstly if the counter is non-zero. Otherwise, hslot2 is used like before. Note that only connected sockets enter this hash4 path, while un-connected ones are not affected. hlist_nulls is used for hash4, because we probably move to another hslot wrongly when lookup with concurrent rehash. Then we check nulls at the list end to see if we should restart lookup. Because udp does not use SLAB_TYPESAFE_BY_RCU, we don't need to touch sk_refcnt when lookup. Stress test results (with 1 cpu fully used) are shown below, in pps: (1) _un-connected_ socket as server [a] w/o hash4: 1,825176 [b] w/ hash4: 1,831750 (+0.36%) (2) 500 _connected_ sockets as server [c] w/o hash4: 290860 (only 16% of [a]) [d] w/ hash4: 1,889658 (+3.1% compared with [b]) With hash4, compute_score is skipped when lookup, so [d] is slightly better than [b]. Co-developed-by: Cambda Zhu Signed-off-by: Cambda Zhu Co-developed-by: Fred Chen Signed-off-by: Fred Chen Co-developed-by: Yubing Qiu Signed-off-by: Yubing Qiu Signed-off-by: Philo Lu Acked-by: Willem de Bruijn Acked-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/udp.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/udp.h b/include/net/udp.h index edb669967130..feb06c0e48fb 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -302,13 +302,27 @@ static inline int udp_lib_hash(struct sock *sk) } void udp_lib_unhash(struct sock *sk); -void udp_lib_rehash(struct sock *sk, u16 new_hash); +void udp_lib_rehash(struct sock *sk, u16 new_hash, u16 new_hash4); static inline void udp_lib_close(struct sock *sk, long timeout) { sk_common_release(sk); } +/* hash4 routines shared between UDPv4/6 */ +#if IS_ENABLED(CONFIG_BASE_SMALL) +static inline void udp_lib_hash4(struct sock *sk, u16 hash) +{ +} + +static inline void udp4_hash4(struct sock *sk) +{ +} +#else /* !CONFIG_BASE_SMALL */ +void udp_lib_hash4(struct sock *sk, u16 hash); +void udp4_hash4(struct sock *sk); +#endif /* CONFIG_BASE_SMALL */ + int udp_lib_get_port(struct sock *sk, unsigned short snum, unsigned int hash2_nulladdr); -- cgit From 1b29a730ef8b6fd3aa3e11c2f6d409cf201cd913 Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Thu, 14 Nov 2024 18:52:07 +0800 Subject: ipv6/udp: Add 4-tuple hash for connected socket Implement ipv6 udp hash4 like that in ipv4. The major difference is that the hash value should be calculated with udp6_ehashfn(). Besides, ipv4-mapped ipv6 address is handled before hash() and rehash(). Export udp_ehashfn because now we use it in udpv6 rehash. Core procedures of hash/unhash/rehash are same as ipv4, and udpv4 and udpv6 share the same udptable, so some functions in ipv4 hash4 can also be shared. Co-developed-by: Cambda Zhu Signed-off-by: Cambda Zhu Co-developed-by: Fred Chen Signed-off-by: Fred Chen Co-developed-by: Yubing Qiu Signed-off-by: Yubing Qiu Signed-off-by: Philo Lu Acked-by: Willem de Bruijn Acked-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/udp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/udp.h b/include/net/udp.h index feb06c0e48fb..6e89520e100d 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -303,6 +303,8 @@ static inline int udp_lib_hash(struct sock *sk) void udp_lib_unhash(struct sock *sk); void udp_lib_rehash(struct sock *sk, u16 new_hash, u16 new_hash4); +u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, + const __be32 faddr, const __be16 fport); static inline void udp_lib_close(struct sock *sk, long timeout) { -- cgit