diff options
Diffstat (limited to 'net')
217 files changed, 7940 insertions, 6572 deletions
diff --git a/net/Makefile b/net/Makefile index 6a62e5b27378..0914bea9c335 100644 --- a/net/Makefile +++ b/net/Makefile @@ -23,6 +23,7 @@ obj-$(CONFIG_BPFILTER) += bpfilter/ obj-$(CONFIG_PACKET) += packet/ obj-$(CONFIG_NET_KEY) += key/ obj-$(CONFIG_BRIDGE) += bridge/ +obj-$(CONFIG_NET_DEVLINK) += devlink/ obj-$(CONFIG_NET_DSA) += dsa/ obj-$(CONFIG_ATALK) += appletalk/ obj-$(CONFIG_X25) += x25/ diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 114ee5da261f..828fb393ee94 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -27,7 +27,6 @@ #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/pkt_sched.h> -#include <linux/prandom.h> #include <linux/printk.h> #include <linux/random.h> #include <linux/rculist.h> diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index f9a58fb5442e..acff565849ae 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -21,7 +21,6 @@ #include <linux/minmax.h> #include <linux/netdevice.h> #include <linux/nl80211.h> -#include <linux/prandom.h> #include <linux/random.h> #include <linux/rculist.h> #include <linux/rcupdate.h> diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index addfd8c4fe95..e710e9afe78f 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -21,7 +21,6 @@ #include <linux/minmax.h> #include <linux/mutex.h> #include <linux/netdevice.h> -#include <linux/prandom.h> #include <linux/random.h> #include <linux/rculist.h> #include <linux/rcupdate.h> @@ -800,8 +799,8 @@ batadv_v_ogm_process_per_outif(struct batadv_priv *bat_priv, /* only unknown & newer OGMs contain TVLVs we are interested in */ if (seqno_age > 0 && if_outgoing == BATADV_IF_DEFAULT) - batadv_tvlv_containers_process(bat_priv, true, orig_node, - NULL, NULL, + batadv_tvlv_containers_process(bat_priv, BATADV_OGM2, orig_node, + NULL, (unsigned char *)(ogm2 + 1), ntohs(ogm2->tvlv_len)); diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index fefb51a5f606..6968e55eb971 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -822,7 +822,7 @@ int batadv_dat_init(struct batadv_priv *bat_priv) batadv_dat_start_timer(bat_priv); batadv_tvlv_handler_register(bat_priv, batadv_dat_tvlv_ogm_handler_v1, - NULL, BATADV_TVLV_DAT, 1, + NULL, NULL, BATADV_TVLV_DAT, 1, BATADV_TVLV_HANDLER_OGM_CIFNOTFND); batadv_dat_tvlv_container_update(bat_priv); return 0; diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index 9349c76f30c5..6a964a773f57 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -259,7 +259,7 @@ void batadv_gw_init(struct batadv_priv *bat_priv) atomic_set(&bat_priv->gw.sel_class, 1); batadv_tvlv_handler_register(bat_priv, batadv_gw_tvlv_ogm_handler_v1, - NULL, BATADV_TVLV_GW, 1, + NULL, NULL, BATADV_TVLV_GW, 1, BATADV_TVLV_HANDLER_OGM_CIFNOTFND); } diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index c48803b32bb0..156ed39eded1 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2022.3" +#define BATADV_SOURCE_VERSION "2023.1" #endif /* B.A.T.M.A.N. parameters */ diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index b238455913df..315394f12c55 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -26,7 +26,6 @@ #include <linux/ipv6.h> #include <linux/jiffies.h> #include <linux/kernel.h> -#include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> @@ -1137,222 +1136,19 @@ static int batadv_mcast_forw_rtr_count(struct batadv_priv *bat_priv, } /** - * batadv_mcast_forw_tt_node_get() - get a multicast tt node - * @bat_priv: the bat priv with all the soft interface information - * @ethhdr: the ether header containing the multicast destination - * - * Return: an orig_node matching the multicast address provided by ethhdr - * via a translation table lookup. This increases the returned nodes refcount. - */ -static struct batadv_orig_node * -batadv_mcast_forw_tt_node_get(struct batadv_priv *bat_priv, - struct ethhdr *ethhdr) -{ - return batadv_transtable_search(bat_priv, NULL, ethhdr->h_dest, - BATADV_NO_FLAGS); -} - -/** - * batadv_mcast_forw_ipv4_node_get() - get a node with an ipv4 flag - * @bat_priv: the bat priv with all the soft interface information - * - * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 flag set and - * increases its refcount. - */ -static struct batadv_orig_node * -batadv_mcast_forw_ipv4_node_get(struct batadv_priv *bat_priv) -{ - struct batadv_orig_node *tmp_orig_node, *orig_node = NULL; - - rcu_read_lock(); - hlist_for_each_entry_rcu(tmp_orig_node, - &bat_priv->mcast.want_all_ipv4_list, - mcast_want_all_ipv4_node) { - if (!kref_get_unless_zero(&tmp_orig_node->refcount)) - continue; - - orig_node = tmp_orig_node; - break; - } - rcu_read_unlock(); - - return orig_node; -} - -/** - * batadv_mcast_forw_ipv6_node_get() - get a node with an ipv6 flag - * @bat_priv: the bat priv with all the soft interface information - * - * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV6 flag set - * and increases its refcount. - */ -static struct batadv_orig_node * -batadv_mcast_forw_ipv6_node_get(struct batadv_priv *bat_priv) -{ - struct batadv_orig_node *tmp_orig_node, *orig_node = NULL; - - rcu_read_lock(); - hlist_for_each_entry_rcu(tmp_orig_node, - &bat_priv->mcast.want_all_ipv6_list, - mcast_want_all_ipv6_node) { - if (!kref_get_unless_zero(&tmp_orig_node->refcount)) - continue; - - orig_node = tmp_orig_node; - break; - } - rcu_read_unlock(); - - return orig_node; -} - -/** - * batadv_mcast_forw_ip_node_get() - get a node with an ipv4/ipv6 flag - * @bat_priv: the bat priv with all the soft interface information - * @ethhdr: an ethernet header to determine the protocol family from - * - * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 or - * BATADV_MCAST_WANT_ALL_IPV6 flag, depending on the provided ethhdr, sets and - * increases its refcount. - */ -static struct batadv_orig_node * -batadv_mcast_forw_ip_node_get(struct batadv_priv *bat_priv, - struct ethhdr *ethhdr) -{ - switch (ntohs(ethhdr->h_proto)) { - case ETH_P_IP: - return batadv_mcast_forw_ipv4_node_get(bat_priv); - case ETH_P_IPV6: - return batadv_mcast_forw_ipv6_node_get(bat_priv); - default: - /* we shouldn't be here... */ - return NULL; - } -} - -/** - * batadv_mcast_forw_unsnoop_node_get() - get a node with an unsnoopable flag - * @bat_priv: the bat priv with all the soft interface information - * - * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag - * set and increases its refcount. - */ -static struct batadv_orig_node * -batadv_mcast_forw_unsnoop_node_get(struct batadv_priv *bat_priv) -{ - struct batadv_orig_node *tmp_orig_node, *orig_node = NULL; - - rcu_read_lock(); - hlist_for_each_entry_rcu(tmp_orig_node, - &bat_priv->mcast.want_all_unsnoopables_list, - mcast_want_all_unsnoopables_node) { - if (!kref_get_unless_zero(&tmp_orig_node->refcount)) - continue; - - orig_node = tmp_orig_node; - break; - } - rcu_read_unlock(); - - return orig_node; -} - -/** - * batadv_mcast_forw_rtr4_node_get() - get a node with an ipv4 mcast router flag - * @bat_priv: the bat priv with all the soft interface information - * - * Return: an orig_node which has the BATADV_MCAST_WANT_NO_RTR4 flag unset and - * increases its refcount. - */ -static struct batadv_orig_node * -batadv_mcast_forw_rtr4_node_get(struct batadv_priv *bat_priv) -{ - struct batadv_orig_node *tmp_orig_node, *orig_node = NULL; - - rcu_read_lock(); - hlist_for_each_entry_rcu(tmp_orig_node, - &bat_priv->mcast.want_all_rtr4_list, - mcast_want_all_rtr4_node) { - if (!kref_get_unless_zero(&tmp_orig_node->refcount)) - continue; - - orig_node = tmp_orig_node; - break; - } - rcu_read_unlock(); - - return orig_node; -} - -/** - * batadv_mcast_forw_rtr6_node_get() - get a node with an ipv6 mcast router flag - * @bat_priv: the bat priv with all the soft interface information - * - * Return: an orig_node which has the BATADV_MCAST_WANT_NO_RTR6 flag unset - * and increases its refcount. - */ -static struct batadv_orig_node * -batadv_mcast_forw_rtr6_node_get(struct batadv_priv *bat_priv) -{ - struct batadv_orig_node *tmp_orig_node, *orig_node = NULL; - - rcu_read_lock(); - hlist_for_each_entry_rcu(tmp_orig_node, - &bat_priv->mcast.want_all_rtr6_list, - mcast_want_all_rtr6_node) { - if (!kref_get_unless_zero(&tmp_orig_node->refcount)) - continue; - - orig_node = tmp_orig_node; - break; - } - rcu_read_unlock(); - - return orig_node; -} - -/** - * batadv_mcast_forw_rtr_node_get() - get a node with an ipv4/ipv6 router flag - * @bat_priv: the bat priv with all the soft interface information - * @ethhdr: an ethernet header to determine the protocol family from - * - * Return: an orig_node which has no BATADV_MCAST_WANT_NO_RTR4 or - * BATADV_MCAST_WANT_NO_RTR6 flag, depending on the provided ethhdr, set and - * increases its refcount. - */ -static struct batadv_orig_node * -batadv_mcast_forw_rtr_node_get(struct batadv_priv *bat_priv, - struct ethhdr *ethhdr) -{ - switch (ntohs(ethhdr->h_proto)) { - case ETH_P_IP: - return batadv_mcast_forw_rtr4_node_get(bat_priv); - case ETH_P_IPV6: - return batadv_mcast_forw_rtr6_node_get(bat_priv); - default: - /* we shouldn't be here... */ - return NULL; - } -} - -/** * batadv_mcast_forw_mode() - check on how to forward a multicast packet * @bat_priv: the bat priv with all the soft interface information - * @skb: The multicast packet to check - * @orig: an originator to be set to forward the skb to + * @skb: the multicast packet to check * @is_routable: stores whether the destination is routable * - * Return: the forwarding mode as enum batadv_forw_mode and in case of - * BATADV_FORW_SINGLE set the orig to the single originator the skb - * should be forwarded to. + * Return: The forwarding mode as enum batadv_forw_mode. */ enum batadv_forw_mode batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, - struct batadv_orig_node **orig, int *is_routable) + int *is_routable) { int ret, tt_count, ip_count, unsnoop_count, total_count; bool is_unsnoopable = false; - unsigned int mcast_fanout; struct ethhdr *ethhdr; int rtr_count = 0; @@ -1361,7 +1157,7 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, if (ret == -ENOMEM) return BATADV_FORW_NONE; else if (ret < 0) - return BATADV_FORW_ALL; + return BATADV_FORW_BCAST; ethhdr = eth_hdr(skb); @@ -1374,32 +1170,15 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, total_count = tt_count + ip_count + unsnoop_count + rtr_count; - switch (total_count) { - case 1: - if (tt_count) - *orig = batadv_mcast_forw_tt_node_get(bat_priv, ethhdr); - else if (ip_count) - *orig = batadv_mcast_forw_ip_node_get(bat_priv, ethhdr); - else if (unsnoop_count) - *orig = batadv_mcast_forw_unsnoop_node_get(bat_priv); - else if (rtr_count) - *orig = batadv_mcast_forw_rtr_node_get(bat_priv, - ethhdr); - - if (*orig) - return BATADV_FORW_SINGLE; - - fallthrough; - case 0: + if (!total_count) return BATADV_FORW_NONE; - default: - mcast_fanout = atomic_read(&bat_priv->multicast_fanout); + else if (unsnoop_count) + return BATADV_FORW_BCAST; - if (!unsnoop_count && total_count <= mcast_fanout) - return BATADV_FORW_SOME; - } + if (total_count <= atomic_read(&bat_priv->multicast_fanout)) + return BATADV_FORW_UCASTS; - return BATADV_FORW_ALL; + return BATADV_FORW_BCAST; } /** @@ -1411,10 +1190,10 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, * * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. */ -int batadv_mcast_forw_send_orig(struct batadv_priv *bat_priv, - struct sk_buff *skb, - unsigned short vid, - struct batadv_orig_node *orig_node) +static int batadv_mcast_forw_send_orig(struct batadv_priv *bat_priv, + struct sk_buff *skb, + unsigned short vid, + struct batadv_orig_node *orig_node) { /* Avoid sending multicast-in-unicast packets to other BLA * gateways - they already got the frame from the LAN side @@ -2039,7 +1818,7 @@ static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv, void batadv_mcast_init(struct batadv_priv *bat_priv) { batadv_tvlv_handler_register(bat_priv, batadv_mcast_tvlv_ogm_handler, - NULL, BATADV_TVLV_MCAST, 2, + NULL, NULL, BATADV_TVLV_MCAST, 2, BATADV_TVLV_HANDLER_OGM_CIFNOTFND); INIT_DELAYED_WORK(&bat_priv->mcast.work, batadv_mcast_mla_update); diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h index 8aec818d0bf6..a9770d8d6d36 100644 --- a/net/batman-adv/multicast.h +++ b/net/batman-adv/multicast.h @@ -17,23 +17,16 @@ */ enum batadv_forw_mode { /** - * @BATADV_FORW_ALL: forward the packet to all nodes (currently via - * classic flooding) + * @BATADV_FORW_BCAST: forward the packet to all nodes via a batman-adv + * broadcast packet */ - BATADV_FORW_ALL, + BATADV_FORW_BCAST, /** - * @BATADV_FORW_SOME: forward the packet to some nodes (currently via - * a multicast-to-unicast conversion and the BATMAN unicast routing - * protocol) + * @BATADV_FORW_UCASTS: forward the packet to some nodes via one + * or more batman-adv unicast packets */ - BATADV_FORW_SOME, - - /** - * @BATADV_FORW_SINGLE: forward the packet to a single node (currently - * via the BATMAN unicast routing protocol) - */ - BATADV_FORW_SINGLE, + BATADV_FORW_UCASTS, /** @BATADV_FORW_NONE: don't forward, drop it */ BATADV_FORW_NONE, @@ -43,14 +36,8 @@ enum batadv_forw_mode { enum batadv_forw_mode batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, - struct batadv_orig_node **mcast_single_orig, int *is_routable); -int batadv_mcast_forw_send_orig(struct batadv_priv *bat_priv, - struct sk_buff *skb, - unsigned short vid, - struct batadv_orig_node *orig_node); - int batadv_mcast_forw_send(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, int is_routable); @@ -69,20 +56,9 @@ void batadv_mcast_purge_orig(struct batadv_orig_node *orig_node); static inline enum batadv_forw_mode batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, - struct batadv_orig_node **mcast_single_orig, int *is_routable) { - return BATADV_FORW_ALL; -} - -static inline int -batadv_mcast_forw_send_orig(struct batadv_priv *bat_priv, - struct sk_buff *skb, - unsigned short vid, - struct batadv_orig_node *orig_node) -{ - kfree_skb(skb); - return NET_XMIT_DROP; + return BATADV_FORW_BCAST; } static inline int diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index bf29fba4dde5..71ebd0284f95 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -25,8 +25,8 @@ #include <linux/lockdep.h> #include <linux/net.h> #include <linux/netdevice.h> -#include <linux/prandom.h> #include <linux/printk.h> +#include <linux/random.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> @@ -160,7 +160,7 @@ int batadv_nc_mesh_init(struct batadv_priv *bat_priv) batadv_nc_start_timer(bat_priv); batadv_tvlv_handler_register(bat_priv, batadv_nc_tvlv_ogm_handler_v1, - NULL, BATADV_TVLV_NC, 1, + NULL, NULL, BATADV_TVLV_NC, 1, BATADV_TVLV_HANDLER_OGM_CIFNOTFND); batadv_nc_tvlv_container_update(bat_priv); return 0; diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 83f31494ea4d..163cd43c4821 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -1073,10 +1073,9 @@ int batadv_recv_unicast_tvlv(struct sk_buff *skb, if (tvlv_buff_len > skb->len - hdr_size) goto free_skb; - ret = batadv_tvlv_containers_process(bat_priv, false, NULL, - unicast_tvlv_packet->src, - unicast_tvlv_packet->dst, - tvlv_buff, tvlv_buff_len); + ret = batadv_tvlv_containers_process(bat_priv, BATADV_UNICAST_TVLV, + NULL, skb, tvlv_buff, + tvlv_buff_len); if (ret != NET_RX_SUCCESS) { ret = batadv_route_unicast_packet(skb, recv_if); diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 0f5c0679b55a..125f4628687c 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -48,7 +48,6 @@ #include "hard-interface.h" #include "multicast.h" #include "network-coding.h" -#include "originator.h" #include "send.h" #include "translation-table.h" @@ -196,8 +195,7 @@ static netdev_tx_t batadv_interface_tx(struct sk_buff *skb, unsigned short vid; u32 seqno; int gw_mode; - enum batadv_forw_mode forw_mode = BATADV_FORW_SINGLE; - struct batadv_orig_node *mcast_single_orig = NULL; + enum batadv_forw_mode forw_mode = BATADV_FORW_BCAST; int mcast_is_routable = 0; int network_offset = ETH_HLEN; __be16 proto; @@ -301,14 +299,18 @@ static netdev_tx_t batadv_interface_tx(struct sk_buff *skb, send: if (do_bcast && !is_broadcast_ether_addr(ethhdr->h_dest)) { forw_mode = batadv_mcast_forw_mode(bat_priv, skb, - &mcast_single_orig, &mcast_is_routable); - if (forw_mode == BATADV_FORW_NONE) - goto dropped; - - if (forw_mode == BATADV_FORW_SINGLE || - forw_mode == BATADV_FORW_SOME) + switch (forw_mode) { + case BATADV_FORW_BCAST: + break; + case BATADV_FORW_UCASTS: do_bcast = false; + break; + case BATADV_FORW_NONE: + fallthrough; + default: + goto dropped; + } } } @@ -357,10 +359,7 @@ send: if (ret) goto dropped; ret = batadv_send_skb_via_gw(bat_priv, skb, vid); - } else if (mcast_single_orig) { - ret = batadv_mcast_forw_send_orig(bat_priv, skb, vid, - mcast_single_orig); - } else if (forw_mode == BATADV_FORW_SOME) { + } else if (forw_mode == BATADV_FORW_UCASTS) { ret = batadv_mcast_forw_send(bat_priv, skb, vid, mcast_is_routable); } else { @@ -386,7 +385,6 @@ dropped: dropped_freed: batadv_inc_counter(bat_priv, BATADV_CNT_TX_DROPPED); end: - batadv_orig_node_put(mcast_single_orig); batadv_hardif_put(primary_if); return NETDEV_TX_OK; } diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 01d30c1e412c..36ca31252a73 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -4168,11 +4168,11 @@ int batadv_tt_init(struct batadv_priv *bat_priv) } batadv_tvlv_handler_register(bat_priv, batadv_tt_tvlv_ogm_handler_v1, - batadv_tt_tvlv_unicast_handler_v1, + batadv_tt_tvlv_unicast_handler_v1, NULL, BATADV_TVLV_TT, 1, BATADV_NO_FLAGS); batadv_tvlv_handler_register(bat_priv, NULL, - batadv_roam_tvlv_unicast_handler_v1, + batadv_roam_tvlv_unicast_handler_v1, NULL, BATADV_TVLV_ROAM, 1, BATADV_NO_FLAGS); INIT_DELAYED_WORK(&bat_priv->tt.work, batadv_tt_purge); diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index 7ec2e2343884..2a583215d439 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -352,10 +352,9 @@ end: * appropriate handlers * @bat_priv: the bat priv with all the soft interface information * @tvlv_handler: tvlv callback function handling the tvlv content - * @ogm_source: flag indicating whether the tvlv is an ogm or a unicast packet + * @packet_type: indicates for which packet type the TVLV handler is called * @orig_node: orig node emitting the ogm packet - * @src: source mac address of the unicast packet - * @dst: destination mac address of the unicast packet + * @skb: the skb the TVLV handler is called for * @tvlv_value: tvlv content * @tvlv_value_len: tvlv content length * @@ -364,15 +363,20 @@ end: */ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv, struct batadv_tvlv_handler *tvlv_handler, - bool ogm_source, + u8 packet_type, struct batadv_orig_node *orig_node, - u8 *src, u8 *dst, - void *tvlv_value, u16 tvlv_value_len) + struct sk_buff *skb, void *tvlv_value, + u16 tvlv_value_len) { + unsigned int tvlv_offset; + u8 *src, *dst; + if (!tvlv_handler) return NET_RX_SUCCESS; - if (ogm_source) { + switch (packet_type) { + case BATADV_IV_OGM: + case BATADV_OGM2: if (!tvlv_handler->ogm_handler) return NET_RX_SUCCESS; @@ -383,19 +387,32 @@ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv, BATADV_NO_FLAGS, tvlv_value, tvlv_value_len); tvlv_handler->flags |= BATADV_TVLV_HANDLER_OGM_CALLED; - } else { - if (!src) - return NET_RX_SUCCESS; - - if (!dst) + break; + case BATADV_UNICAST_TVLV: + if (!skb) return NET_RX_SUCCESS; if (!tvlv_handler->unicast_handler) return NET_RX_SUCCESS; + src = ((struct batadv_unicast_tvlv_packet *)skb->data)->src; + dst = ((struct batadv_unicast_tvlv_packet *)skb->data)->dst; + return tvlv_handler->unicast_handler(bat_priv, src, dst, tvlv_value, tvlv_value_len); + case BATADV_MCAST: + if (!skb) + return NET_RX_SUCCESS; + + if (!tvlv_handler->mcast_handler) + return NET_RX_SUCCESS; + + tvlv_offset = (unsigned char *)tvlv_value - skb->data; + skb_set_network_header(skb, tvlv_offset); + skb_set_transport_header(skb, tvlv_offset + tvlv_value_len); + + return tvlv_handler->mcast_handler(bat_priv, skb); } return NET_RX_SUCCESS; @@ -405,10 +422,9 @@ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv, * batadv_tvlv_containers_process() - parse the given tvlv buffer to call the * appropriate handlers * @bat_priv: the bat priv with all the soft interface information - * @ogm_source: flag indicating whether the tvlv is an ogm or a unicast packet + * @packet_type: indicates for which packet type the TVLV handler is called * @orig_node: orig node emitting the ogm packet - * @src: source mac address of the unicast packet - * @dst: destination mac address of the unicast packet + * @skb: the skb the TVLV handler is called for * @tvlv_value: tvlv content * @tvlv_value_len: tvlv content length * @@ -416,10 +432,10 @@ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv, * handler callbacks. */ int batadv_tvlv_containers_process(struct batadv_priv *bat_priv, - bool ogm_source, + u8 packet_type, struct batadv_orig_node *orig_node, - u8 *src, u8 *dst, - void *tvlv_value, u16 tvlv_value_len) + struct sk_buff *skb, void *tvlv_value, + u16 tvlv_value_len) { struct batadv_tvlv_handler *tvlv_handler; struct batadv_tvlv_hdr *tvlv_hdr; @@ -441,20 +457,24 @@ int batadv_tvlv_containers_process(struct batadv_priv *bat_priv, tvlv_hdr->version); ret |= batadv_tvlv_call_handler(bat_priv, tvlv_handler, - ogm_source, orig_node, - src, dst, tvlv_value, + packet_type, orig_node, skb, + tvlv_value, tvlv_value_cont_len); batadv_tvlv_handler_put(tvlv_handler); tvlv_value = (u8 *)tvlv_value + tvlv_value_cont_len; tvlv_value_len -= tvlv_value_cont_len; } - if (!ogm_source) + if (packet_type != BATADV_IV_OGM && + packet_type != BATADV_OGM2) return ret; rcu_read_lock(); hlist_for_each_entry_rcu(tvlv_handler, &bat_priv->tvlv.handler_list, list) { + if (!tvlv_handler->ogm_handler) + continue; + if ((tvlv_handler->flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND) && !(tvlv_handler->flags & BATADV_TVLV_HANDLER_OGM_CALLED)) tvlv_handler->ogm_handler(bat_priv, orig_node, @@ -490,7 +510,7 @@ void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv, tvlv_value = batadv_ogm_packet + 1; - batadv_tvlv_containers_process(bat_priv, true, orig_node, NULL, NULL, + batadv_tvlv_containers_process(bat_priv, BATADV_IV_OGM, orig_node, NULL, tvlv_value, tvlv_value_len); } @@ -504,6 +524,10 @@ void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv, * @uptr: unicast tvlv handler callback function. This function receives the * source & destination of the unicast packet as well as the tvlv content * to process. + * @mptr: multicast packet tvlv handler callback function. This function + * receives the full skb to process, with the skb network header pointing + * to the current tvlv and the skb transport header pointing to the first + * byte after the current tvlv. * @type: tvlv handler type to be registered * @version: tvlv handler version to be registered * @flags: flags to enable or disable TVLV API behavior @@ -518,6 +542,8 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, u8 *src, u8 *dst, void *tvlv_value, u16 tvlv_value_len), + int (*mptr)(struct batadv_priv *bat_priv, + struct sk_buff *skb), u8 type, u8 version, u8 flags) { struct batadv_tvlv_handler *tvlv_handler; @@ -539,6 +565,7 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, tvlv_handler->ogm_handler = optr; tvlv_handler->unicast_handler = uptr; + tvlv_handler->mcast_handler = mptr; tvlv_handler->type = type; tvlv_handler->version = version; tvlv_handler->flags = flags; diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h index 4cf8af00fc11..e5697230d991 100644 --- a/net/batman-adv/tvlv.h +++ b/net/batman-adv/tvlv.h @@ -9,6 +9,7 @@ #include "main.h" +#include <linux/skbuff.h> #include <linux/types.h> #include <uapi/linux/batadv_packet.h> @@ -34,14 +35,16 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, u8 *src, u8 *dst, void *tvlv_value, u16 tvlv_value_len), + int (*mptr)(struct batadv_priv *bat_priv, + struct sk_buff *skb), u8 type, u8 version, u8 flags); void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv, u8 type, u8 version); int batadv_tvlv_containers_process(struct batadv_priv *bat_priv, - bool ogm_source, + u8 packet_type, struct batadv_orig_node *orig_node, - u8 *src, u8 *dst, - void *tvlv_buff, u16 tvlv_buff_len); + struct sk_buff *skb, void *tvlv_buff, + u16 tvlv_buff_len); void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, const u8 *src, const u8 *dst, u8 type, u8 version, void *tvlv_value, u16 tvlv_value_len); diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 758cd797a063..ca9449ec9836 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -2335,6 +2335,12 @@ struct batadv_tvlv_handler { u8 *src, u8 *dst, void *tvlv_value, u16 tvlv_value_len); + /** + * @mcast_handler: handler callback which is given the tvlv payload to + * process on incoming mcast packet + */ + int (*mcast_handler)(struct batadv_priv *bat_priv, struct sk_buff *skb); + /** @type: tvlv type this handler feels responsible for */ u8 type; diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 8d6fce9005bd..053ef8f25fae 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -35,6 +35,8 @@ #include <net/bluetooth/l2cap.h> #include <net/bluetooth/rfcomm.h> +#include <trace/events/sock.h> + #define VERSION "1.11" static bool disable_cfc; @@ -186,6 +188,8 @@ static void rfcomm_l2state_change(struct sock *sk) static void rfcomm_l2data_ready(struct sock *sk) { + trace_sk_data_ready(sk); + BT_DBG("%p", sk); rfcomm_schedule(); } diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c index 1ac4467928a9..ff4f89a2b02a 100644 --- a/net/bpf/bpf_dummy_struct_ops.c +++ b/net/bpf/bpf_dummy_struct_ops.c @@ -154,6 +154,23 @@ static bool bpf_dummy_ops_is_valid_access(int off, int size, return bpf_tracing_btf_ctx_access(off, size, type, prog, info); } +static int bpf_dummy_ops_check_member(const struct btf_type *t, + const struct btf_member *member, + const struct bpf_prog *prog) +{ + u32 moff = __btf_member_bit_offset(t, member) / 8; + + switch (moff) { + case offsetof(struct bpf_dummy_ops, test_sleepable): + break; + default: + if (prog->aux->sleepable) + return -EINVAL; + } + + return 0; +} + static int bpf_dummy_ops_btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size, enum bpf_access_type atype, @@ -208,6 +225,7 @@ static void bpf_dummy_unreg(void *kdata) struct bpf_struct_ops bpf_bpf_dummy_ops = { .verifier_ops = &bpf_dummy_verifier_ops, .init = bpf_dummy_init, + .check_member = bpf_dummy_ops_check_member, .init_member = bpf_dummy_init_member, .reg = bpf_dummy_reg, .unreg = bpf_dummy_unreg, diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 2723623429ac..8da0d73b368e 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -1300,6 +1300,9 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, if (kattr->test.flags & ~BPF_F_TEST_XDP_LIVE_FRAMES) return -EINVAL; + if (bpf_prog_is_dev_bound(prog->aux)) + return -EINVAL; + if (do_live) { if (!batch_size) batch_size = NAPI_POLL_WEIGHT; diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 00e5743647b0..9f22ebfdc518 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -849,11 +849,10 @@ static int br_mdb_add_group_sg(const struct br_mdb_config *cfg, } p = br_multicast_new_port_group(cfg->p, &cfg->group, *pp, flags, NULL, - MCAST_INCLUDE, cfg->rt_protocol); - if (unlikely(!p)) { - NL_SET_ERR_MSG_MOD(extack, "Couldn't allocate new (S, G) port group"); + MCAST_INCLUDE, cfg->rt_protocol, extack); + if (unlikely(!p)) return -ENOMEM; - } + rcu_assign_pointer(*pp, p); if (!(flags & MDB_PG_FLAGS_PERMANENT) && !cfg->src_entry) mod_timer(&p->timer, @@ -1075,11 +1074,10 @@ static int br_mdb_add_group_star_g(const struct br_mdb_config *cfg, } p = br_multicast_new_port_group(cfg->p, &cfg->group, *pp, flags, NULL, - cfg->filter_mode, cfg->rt_protocol); - if (unlikely(!p)) { - NL_SET_ERR_MSG_MOD(extack, "Couldn't allocate new (*, G) port group"); + cfg->filter_mode, cfg->rt_protocol, + extack); + if (unlikely(!p)) return -ENOMEM; - } err = br_mdb_add_group_srcs(cfg, p, brmctx, extack); if (err) @@ -1101,8 +1099,7 @@ static int br_mdb_add_group_star_g(const struct br_mdb_config *cfg, return 0; err_del_port_group: - hlist_del_init(&p->mglist); - kfree(p); + br_multicast_del_port_group(p); return err; } diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index dea1ee1bd095..96d1fc78dd39 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -31,6 +31,7 @@ #include <net/ip6_checksum.h> #include <net/addrconf.h> #endif +#include <trace/events/bridge.h> #include "br_private.h" #include "br_private_mcast_eht.h" @@ -234,6 +235,29 @@ out: return pmctx; } +static struct net_bridge_mcast_port * +br_multicast_port_vid_to_port_ctx(struct net_bridge_port *port, u16 vid) +{ + struct net_bridge_mcast_port *pmctx = NULL; + struct net_bridge_vlan *vlan; + + lockdep_assert_held_once(&port->br->multicast_lock); + + if (!br_opt_get(port->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) + return NULL; + + /* Take RCU to access the vlan. */ + rcu_read_lock(); + + vlan = br_vlan_find(nbp_vlan_group_rcu(port), vid); + if (vlan && !br_multicast_port_ctx_vlan_disabled(&vlan->port_mcast_ctx)) + pmctx = &vlan->port_mcast_ctx; + + rcu_read_unlock(); + + return pmctx; +} + /* when snooping we need to check if the contexts should be used * in the following order: * - if pmctx is non-NULL (port), check if it should be used @@ -668,6 +692,101 @@ void br_multicast_del_group_src(struct net_bridge_group_src *src, __br_multicast_del_group_src(src); } +static int +br_multicast_port_ngroups_inc_one(struct net_bridge_mcast_port *pmctx, + struct netlink_ext_ack *extack, + const char *what) +{ + u32 max = READ_ONCE(pmctx->mdb_max_entries); + u32 n = READ_ONCE(pmctx->mdb_n_entries); + + if (max && n >= max) { + NL_SET_ERR_MSG_FMT_MOD(extack, "%s is already in %u groups, and mcast_max_groups=%u", + what, n, max); + return -E2BIG; + } + + WRITE_ONCE(pmctx->mdb_n_entries, n + 1); + return 0; +} + +static void br_multicast_port_ngroups_dec_one(struct net_bridge_mcast_port *pmctx) +{ + u32 n = READ_ONCE(pmctx->mdb_n_entries); + + WARN_ON_ONCE(n == 0); + WRITE_ONCE(pmctx->mdb_n_entries, n - 1); +} + +static int br_multicast_port_ngroups_inc(struct net_bridge_port *port, + const struct br_ip *group, + struct netlink_ext_ack *extack) +{ + struct net_bridge_mcast_port *pmctx; + int err; + + lockdep_assert_held_once(&port->br->multicast_lock); + + /* Always count on the port context. */ + err = br_multicast_port_ngroups_inc_one(&port->multicast_ctx, extack, + "Port"); + if (err) { + trace_br_mdb_full(port->dev, group); + return err; + } + + /* Only count on the VLAN context if VID is given, and if snooping on + * that VLAN is enabled. + */ + if (!group->vid) + return 0; + + pmctx = br_multicast_port_vid_to_port_ctx(port, group->vid); + if (!pmctx) + return 0; + + err = br_multicast_port_ngroups_inc_one(pmctx, extack, "Port-VLAN"); + if (err) { + trace_br_mdb_full(port->dev, group); + goto dec_one_out; + } + + return 0; + +dec_one_out: + br_multicast_port_ngroups_dec_one(&port->multicast_ctx); + return err; +} + +static void br_multicast_port_ngroups_dec(struct net_bridge_port *port, u16 vid) +{ + struct net_bridge_mcast_port *pmctx; + + lockdep_assert_held_once(&port->br->multicast_lock); + + if (vid) { + pmctx = br_multicast_port_vid_to_port_ctx(port, vid); + if (pmctx) + br_multicast_port_ngroups_dec_one(pmctx); + } + br_multicast_port_ngroups_dec_one(&port->multicast_ctx); +} + +u32 br_multicast_ngroups_get(const struct net_bridge_mcast_port *pmctx) +{ + return READ_ONCE(pmctx->mdb_n_entries); +} + +void br_multicast_ngroups_set_max(struct net_bridge_mcast_port *pmctx, u32 max) +{ + WRITE_ONCE(pmctx->mdb_max_entries, max); +} + +u32 br_multicast_ngroups_get_max(const struct net_bridge_mcast_port *pmctx) +{ + return READ_ONCE(pmctx->mdb_max_entries); +} + static void br_multicast_destroy_port_group(struct net_bridge_mcast_gc *gc) { struct net_bridge_port_group *pg; @@ -702,6 +821,7 @@ void br_multicast_del_pg(struct net_bridge_mdb_entry *mp, } else { br_multicast_star_g_handle_mode(pg, MCAST_INCLUDE); } + br_multicast_port_ngroups_dec(pg->key.port, pg->key.addr.vid); hlist_add_head(&pg->mcast_gc.gc_node, &br->mcast_gc_list); queue_work(system_long_wq, &br->mcast_gc_work); @@ -1165,6 +1285,7 @@ struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br, return mp; if (atomic_read(&br->mdb_hash_tbl.nelems) >= br->hash_max) { + trace_br_mdb_full(br->dev, group); br_mc_disabled_update(br->dev, false, NULL); br_opt_toggle(br, BROPT_MULTICAST_ENABLED, false); return ERR_PTR(-E2BIG); @@ -1284,14 +1405,22 @@ struct net_bridge_port_group *br_multicast_new_port_group( unsigned char flags, const unsigned char *src, u8 filter_mode, - u8 rt_protocol) + u8 rt_protocol, + struct netlink_ext_ack *extack) { struct net_bridge_port_group *p; + int err; - p = kzalloc(sizeof(*p), GFP_ATOMIC); - if (unlikely(!p)) + err = br_multicast_port_ngroups_inc(port, group, extack); + if (err) return NULL; + p = kzalloc(sizeof(*p), GFP_ATOMIC); + if (unlikely(!p)) { + NL_SET_ERR_MSG_MOD(extack, "Couldn't allocate new port group"); + goto dec_out; + } + p->key.addr = *group; p->key.port = port; p->flags = flags; @@ -1305,8 +1434,8 @@ struct net_bridge_port_group *br_multicast_new_port_group( if (!br_multicast_is_star_g(group) && rhashtable_lookup_insert_fast(&port->br->sg_port_tbl, &p->rhnode, br_sg_port_rht_params)) { - kfree(p); - return NULL; + NL_SET_ERR_MSG_MOD(extack, "Couldn't insert new port group"); + goto free_out; } rcu_assign_pointer(p->next, next); @@ -1320,6 +1449,25 @@ struct net_bridge_port_group *br_multicast_new_port_group( eth_broadcast_addr(p->eth_addr); return p; + +free_out: + kfree(p); +dec_out: + br_multicast_port_ngroups_dec(port, group->vid); + return NULL; +} + +void br_multicast_del_port_group(struct net_bridge_port_group *p) +{ + struct net_bridge_port *port = p->key.port; + __u16 vid = p->key.addr.vid; + + hlist_del_init(&p->mglist); + if (!br_multicast_is_star_g(&p->key.addr)) + rhashtable_remove_fast(&port->br->sg_port_tbl, &p->rhnode, + br_sg_port_rht_params); + kfree(p); + br_multicast_port_ngroups_dec(port, vid); } void br_multicast_host_join(const struct net_bridge_mcast *brmctx, @@ -1387,7 +1535,7 @@ __br_multicast_add_group(struct net_bridge_mcast *brmctx, } p = br_multicast_new_port_group(pmctx->port, group, *pp, 0, src, - filter_mode, RTPROT_KERNEL); + filter_mode, RTPROT_KERNEL, NULL); if (unlikely(!p)) { p = ERR_PTR(-ENOMEM); goto out; @@ -1933,6 +2081,25 @@ static void __br_multicast_enable_port_ctx(struct net_bridge_mcast_port *pmctx) br_ip4_multicast_add_router(brmctx, pmctx); br_ip6_multicast_add_router(brmctx, pmctx); } + + if (br_multicast_port_ctx_is_vlan(pmctx)) { + struct net_bridge_port_group *pg; + u32 n = 0; + + /* The mcast_n_groups counter might be wrong. First, + * BR_VLFLAG_MCAST_ENABLED is toggled before temporary entries + * are flushed, thus mcast_n_groups after the toggle does not + * reflect the true values. And second, permanent entries added + * while BR_VLFLAG_MCAST_ENABLED was disabled, are not reflected + * either. Thus we have to refresh the counter. + */ + + hlist_for_each_entry(pg, &pmctx->port->mglist, mglist) { + if (pg->key.addr.vid == pmctx->vlan->vid) + n++; + } + WRITE_ONCE(pmctx->mdb_n_entries, n); + } } void br_multicast_enable_port(struct net_bridge_port *port) diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 9554abcfd5b4..638a4d5359db 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -214,7 +214,7 @@ static int br_validate_ipv4(struct net *net, struct sk_buff *skb) if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto csum_error; - len = ntohs(iph->tot_len); + len = skb_ip_totlen(skb); if (skb->len < len) { __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 4316cc82ae17..9173e52b89e2 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -202,6 +202,8 @@ static inline size_t br_port_info_size(void) + nla_total_size_64bit(sizeof(u64)) /* IFLA_BRPORT_HOLD_TIMER */ #ifdef CONFIG_BRIDGE_IGMP_SNOOPING + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MULTICAST_ROUTER */ + + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_N_GROUPS */ + + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_MAX_GROUPS */ #endif + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_GROUP_FWD_MASK */ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_RING_OPEN */ @@ -298,7 +300,11 @@ static int br_port_fill_attrs(struct sk_buff *skb, nla_put_u32(skb, IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT, p->multicast_eht_hosts_limit) || nla_put_u32(skb, IFLA_BRPORT_MCAST_EHT_HOSTS_CNT, - p->multicast_eht_hosts_cnt)) + p->multicast_eht_hosts_cnt) || + nla_put_u32(skb, IFLA_BRPORT_MCAST_N_GROUPS, + br_multicast_ngroups_get(&p->multicast_ctx)) || + nla_put_u32(skb, IFLA_BRPORT_MCAST_MAX_GROUPS, + br_multicast_ngroups_get_max(&p->multicast_ctx))) return -EMSGSIZE; #endif @@ -858,6 +864,8 @@ static int br_afspec(struct net_bridge *br, } static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { + [IFLA_BRPORT_UNSPEC] = { .strict_start_type = + IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT + 1 }, [IFLA_BRPORT_STATE] = { .type = NLA_U8 }, [IFLA_BRPORT_COST] = { .type = NLA_U32 }, [IFLA_BRPORT_PRIORITY] = { .type = NLA_U16 }, @@ -881,6 +889,8 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_MAB] = { .type = NLA_U8 }, [IFLA_BRPORT_BACKUP_PORT] = { .type = NLA_U32 }, [IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT] = { .type = NLA_U32 }, + [IFLA_BRPORT_MCAST_N_GROUPS] = { .type = NLA_REJECT }, + [IFLA_BRPORT_MCAST_MAX_GROUPS] = { .type = NLA_U32 }, }; /* Change the state of the port and notify spanning tree */ @@ -1015,6 +1025,13 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[], if (err) return err; } + + if (tb[IFLA_BRPORT_MCAST_MAX_GROUPS]) { + u32 max_groups; + + max_groups = nla_get_u32(tb[IFLA_BRPORT_MCAST_MAX_GROUPS]); + br_multicast_ngroups_set_max(&p->multicast_ctx, max_groups); + } #endif if (tb[IFLA_BRPORT_GROUP_FWD_MASK]) { diff --git a/net/bridge/br_netlink_tunnel.c b/net/bridge/br_netlink_tunnel.c index 8914290c75d4..17abf092f7ca 100644 --- a/net/bridge/br_netlink_tunnel.c +++ b/net/bridge/br_netlink_tunnel.c @@ -188,6 +188,9 @@ initvars: } static const struct nla_policy vlan_tunnel_policy[IFLA_BRIDGE_VLAN_TUNNEL_MAX + 1] = { + [IFLA_BRIDGE_VLAN_TUNNEL_UNSPEC] = { + .strict_start_type = IFLA_BRIDGE_VLAN_TUNNEL_FLAGS + 1 + }, [IFLA_BRIDGE_VLAN_TUNNEL_ID] = { .type = NLA_U32 }, [IFLA_BRIDGE_VLAN_TUNNEL_VID] = { .type = NLA_U16 }, [IFLA_BRIDGE_VLAN_TUNNEL_FLAGS] = { .type = NLA_U16 }, diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 15ef7fd508ee..cef5f6ea850c 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -126,6 +126,8 @@ struct net_bridge_mcast_port { struct hlist_node ip6_rlist; #endif /* IS_ENABLED(CONFIG_IPV6) */ unsigned char multicast_router; + u32 mdb_n_entries; + u32 mdb_max_entries; #endif /* CONFIG_BRIDGE_IGMP_SNOOPING */ }; @@ -956,7 +958,9 @@ br_multicast_new_port_group(struct net_bridge_port *port, const struct br_ip *group, struct net_bridge_port_group __rcu *next, unsigned char flags, const unsigned char *src, - u8 filter_mode, u8 rt_protocol); + u8 filter_mode, u8 rt_protocol, + struct netlink_ext_ack *extack); +void br_multicast_del_port_group(struct net_bridge_port_group *p); int br_mdb_hash_init(struct net_bridge *br); void br_mdb_hash_fini(struct net_bridge *br); void br_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, @@ -974,6 +978,9 @@ void br_multicast_uninit_stats(struct net_bridge *br); void br_multicast_get_stats(const struct net_bridge *br, const struct net_bridge_port *p, struct br_mcast_stats *dest); +u32 br_multicast_ngroups_get(const struct net_bridge_mcast_port *pmctx); +void br_multicast_ngroups_set_max(struct net_bridge_mcast_port *pmctx, u32 max); +u32 br_multicast_ngroups_get_max(const struct net_bridge_mcast_port *pmctx); void br_mdb_init(void); void br_mdb_uninit(void); void br_multicast_host_join(const struct net_bridge_mcast *brmctx, @@ -1757,7 +1764,8 @@ static inline u16 br_vlan_flags(const struct net_bridge_vlan *v, u16 pvid) #ifdef CONFIG_BRIDGE_VLAN_FILTERING bool br_vlan_opts_eq_range(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *range_end); -bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v); +bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v, + const struct net_bridge_port *p); size_t br_vlan_opts_nl_size(void); int br_vlan_process_options(const struct net_bridge *br, const struct net_bridge_port *p, diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index 7eb6fd5bb917..de18e9c1d7a7 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -104,9 +104,8 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p, return 0; if (err) { - if (extack && !extack->_msg) - NL_SET_ERR_MSG_MOD(extack, - "bridge flag offload is not supported"); + NL_SET_ERR_MSG_WEAK_MOD(extack, + "bridge flag offload is not supported"); return -EOPNOTSUPP; } @@ -115,9 +114,8 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p, err = switchdev_port_attr_set(p->dev, &attr, extack); if (err) { - if (extack && !extack->_msg) - NL_SET_ERR_MSG_MOD(extack, - "error setting offload flag on port"); + NL_SET_ERR_MSG_WEAK_MOD(extack, + "error setting offload flag on port"); return err; } diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index bc75fa1e4666..8a3dbc09ba38 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -1816,6 +1816,7 @@ out_err: /* v_opts is used to dump the options which must be equal in the whole range */ static bool br_vlan_fill_vids(struct sk_buff *skb, u16 vid, u16 vid_range, const struct net_bridge_vlan *v_opts, + const struct net_bridge_port *p, u16 flags, bool dump_stats) { @@ -1842,7 +1843,7 @@ static bool br_vlan_fill_vids(struct sk_buff *skb, u16 vid, u16 vid_range, goto out_err; if (v_opts) { - if (!br_vlan_opts_fill(skb, v_opts)) + if (!br_vlan_opts_fill(skb, v_opts, p)) goto out_err; if (dump_stats && !br_vlan_stats_fill(skb, v_opts)) @@ -1925,7 +1926,7 @@ void br_vlan_notify(const struct net_bridge *br, goto out_kfree; } - if (!br_vlan_fill_vids(skb, vid, vid_range, v, flags, false)) + if (!br_vlan_fill_vids(skb, vid, vid_range, v, p, flags, false)) goto out_err; nlmsg_end(skb, nlh); @@ -2030,7 +2031,7 @@ static int br_vlan_dump_dev(const struct net_device *dev, if (!br_vlan_fill_vids(skb, range_start->vid, range_end->vid, range_start, - vlan_flags, dump_stats)) { + p, vlan_flags, dump_stats)) { err = -EMSGSIZE; break; } @@ -2056,7 +2057,7 @@ update_end: else if (!dump_global && !br_vlan_fill_vids(skb, range_start->vid, range_end->vid, range_start, - br_vlan_flags(range_start, pvid), + p, br_vlan_flags(range_start, pvid), dump_stats)) err = -EMSGSIZE; } @@ -2131,6 +2132,8 @@ static const struct nla_policy br_vlan_db_policy[BRIDGE_VLANDB_ENTRY_MAX + 1] = [BRIDGE_VLANDB_ENTRY_STATE] = { .type = NLA_U8 }, [BRIDGE_VLANDB_ENTRY_TUNNEL_INFO] = { .type = NLA_NESTED }, [BRIDGE_VLANDB_ENTRY_MCAST_ROUTER] = { .type = NLA_U8 }, + [BRIDGE_VLANDB_ENTRY_MCAST_N_GROUPS] = { .type = NLA_REJECT }, + [BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS] = { .type = NLA_U32 }, }; static int br_vlan_rtm_process_one(struct net_device *dev, diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c index a2724d03278c..e378c2f3a9e2 100644 --- a/net/bridge/br_vlan_options.c +++ b/net/bridge/br_vlan_options.c @@ -48,7 +48,8 @@ bool br_vlan_opts_eq_range(const struct net_bridge_vlan *v_curr, curr_mc_rtr == range_mc_rtr; } -bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v) +bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v, + const struct net_bridge_port *p) { if (nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_STATE, br_vlan_get_state(v)) || !__vlan_tun_put(skb, v)) @@ -58,6 +59,12 @@ bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v) if (nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_MCAST_ROUTER, br_vlan_multicast_router(v))) return false; + if (p && !br_multicast_port_ctx_vlan_disabled(&v->port_mcast_ctx) && + (nla_put_u32(skb, BRIDGE_VLANDB_ENTRY_MCAST_N_GROUPS, + br_multicast_ngroups_get(&v->port_mcast_ctx)) || + nla_put_u32(skb, BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS, + br_multicast_ngroups_get_max(&v->port_mcast_ctx)))) + return false; #endif return true; @@ -70,6 +77,8 @@ size_t br_vlan_opts_nl_size(void) + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_TINFO_ID */ #ifdef CONFIG_BRIDGE_IGMP_SNOOPING + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_ENTRY_MCAST_ROUTER */ + + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_ENTRY_MCAST_N_GROUPS */ + + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS */ #endif + 0; } @@ -212,6 +221,22 @@ static int br_vlan_process_one_opts(const struct net_bridge *br, return err; *changed = true; } + if (tb[BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS]) { + u32 val; + + if (!p) { + NL_SET_ERR_MSG_MOD(extack, "Can't set mcast_max_groups for non-port vlans"); + return -EINVAL; + } + if (br_multicast_port_ctx_vlan_disabled(&v->port_mcast_ctx)) { + NL_SET_ERR_MSG_MOD(extack, "Multicast snooping disabled on this VLAN"); + return -EINVAL; + } + + val = nla_get_u32(tb[BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS]); + br_multicast_ngroups_set_max(&v->port_mcast_ctx, val); + *changed = true; + } #endif return 0; diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index 5c5dd437f1c2..71056ee84773 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -212,7 +212,7 @@ static int nf_ct_br_ip_check(const struct sk_buff *skb) iph->version != 4) return -1; - len = ntohs(iph->tot_len); + len = skb_ip_totlen(skb); if (skb->len < nhoff + len || len < (iph->ihl * 4)) return -1; @@ -256,7 +256,7 @@ static unsigned int nf_ct_bridge_pre(void *priv, struct sk_buff *skb, if (!pskb_may_pull(skb, sizeof(struct iphdr))) return NF_ACCEPT; - len = ntohs(ip_hdr(skb)->tot_len); + len = skb_ip_totlen(skb); if (pskb_trim_rcsum(skb, len)) return NF_ACCEPT; diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 748be7253248..1f2c1d7b90e2 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -533,10 +533,6 @@ static int caif_seqpkt_sendmsg(struct socket *sock, struct msghdr *msg, if (msg->msg_namelen) goto err; - ret = -EINVAL; - if (unlikely(msg->msg_iter.nr_segs == 0) || - unlikely(msg->msg_iter.iov->iov_base == NULL)) - goto err; noblock = msg->msg_flags & MSG_DONTWAIT; timeo = sock_sndtimeo(sk, noblock); diff --git a/net/can/gw.c b/net/can/gw.c index 23a3d89cad81..37528826935e 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -1139,6 +1139,13 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh, if (gwj->dst.dev->type != ARPHRD_CAN) goto out; + /* is sending the skb back to the incoming interface intended? */ + if (gwj->src.dev == gwj->dst.dev && + !(gwj->flags & CGW_FLAGS_CAN_IIF_TX_OK)) { + err = -EINVAL; + goto out; + } + ASSERT_RTNL(); err = cgw_register_filter(net, gwj); diff --git a/net/can/isotp.c b/net/can/isotp.c index fc81d77724a1..9bc344851704 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -1220,6 +1220,9 @@ static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len) if (len < ISOTP_MIN_NAMELEN) return -EINVAL; + if (addr->can_family != AF_CAN) + return -EINVAL; + /* sanitize tx CAN identifier */ if (tx_id & CAN_EFF_FLAG) tx_id &= (CAN_EFF_FLAG | CAN_EFF_MASK); diff --git a/net/can/raw.c b/net/can/raw.c index ba86782ba8bb..f64469b98260 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -523,6 +523,7 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, struct can_filter sfilter; /* single filter */ struct net_device *dev = NULL; can_err_mask_t err_mask = 0; + int fd_frames; int count = 0; int err = 0; @@ -664,17 +665,17 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, break; case CAN_RAW_FD_FRAMES: - if (optlen != sizeof(ro->fd_frames)) + if (optlen != sizeof(fd_frames)) return -EINVAL; - if (copy_from_sockptr(&ro->fd_frames, optval, optlen)) + if (copy_from_sockptr(&fd_frames, optval, optlen)) return -EFAULT; /* Enabling CAN XL includes CAN FD */ - if (ro->xl_frames && !ro->fd_frames) { - ro->fd_frames = ro->xl_frames; + if (ro->xl_frames && !fd_frames) return -EINVAL; - } + + ro->fd_frames = fd_frames; break; case CAN_RAW_XL_FRAMES: diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 1d06e114ba3f..cd7b0bf5369e 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -17,6 +17,7 @@ #endif /* CONFIG_BLOCK */ #include <linux/dns_resolver.h> #include <net/tcp.h> +#include <trace/events/sock.h> #include <linux/ceph/ceph_features.h> #include <linux/ceph/libceph.h> @@ -344,6 +345,9 @@ static void con_sock_state_closed(struct ceph_connection *con) static void ceph_sock_data_ready(struct sock *sk) { struct ceph_connection *con = sk->sk_user_data; + + trace_sk_data_ready(sk); + if (atomic_read(&con->msgr->stopping)) { return; } diff --git a/net/core/Makefile b/net/core/Makefile index 5857cec87b83..10edd66a8a37 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -33,7 +33,6 @@ obj-$(CONFIG_LWTUNNEL) += lwtunnel.o obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o obj-$(CONFIG_DST_CACHE) += dst_cache.o obj-$(CONFIG_HWBM) += hwbm.o -obj-$(CONFIG_NET_DEVLINK) += devlink.o obj-$(CONFIG_GRO_CELLS) += gro_cells.o obj-$(CONFIG_FAILOVER) += failover.o obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o diff --git a/net/core/dev.c b/net/core/dev.c index b76fb37b381e..bb42150a38ec 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1840,7 +1840,7 @@ EXPORT_SYMBOL(register_netdevice_notifier_net); * @nb: notifier * * Unregister a notifier previously registered by - * register_netdevice_notifier(). The notifier is unlinked into the + * register_netdevice_notifier_net(). The notifier is unlinked from the * kernel structures and may then be reused. A negative errno code * is returned on a failure. * @@ -3001,6 +3001,8 @@ void netif_set_tso_max_size(struct net_device *dev, unsigned int size) dev->tso_max_size = min(GSO_MAX_SIZE, size); if (size < READ_ONCE(dev->gso_max_size)) netif_set_gso_max_size(dev, size); + if (size < READ_ONCE(dev->gso_ipv4_max_size)) + netif_set_gso_ipv4_max_size(dev, size); } EXPORT_SYMBOL(netif_set_tso_max_size); @@ -6616,17 +6618,16 @@ static int napi_threaded_poll(void *data) static void skb_defer_free_flush(struct softnet_data *sd) { struct sk_buff *skb, *next; - unsigned long flags; /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */ if (!READ_ONCE(sd->defer_list)) return; - spin_lock_irqsave(&sd->defer_lock, flags); + spin_lock_irq(&sd->defer_lock); skb = sd->defer_list; sd->defer_list = NULL; sd->defer_count = 0; - spin_unlock_irqrestore(&sd->defer_lock, flags); + spin_unlock_irq(&sd->defer_lock); while (skb != NULL) { next = skb->next; @@ -9224,8 +9225,12 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time"); return -EEXIST; } - if (!offload && bpf_prog_is_dev_bound(new_prog->aux)) { - NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported"); + if (!offload && bpf_prog_is_offloaded(new_prog->aux)) { + NL_SET_ERR_MSG(extack, "Using offloaded program without HW_MODE flag is not supported"); + return -EINVAL; + } + if (bpf_prog_is_dev_bound(new_prog->aux) && !bpf_offload_dev_match(new_prog, dev)) { + NL_SET_ERR_MSG(extack, "Program bound to different device"); return -EINVAL; } if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) { @@ -10611,6 +10616,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->gso_max_size = GSO_LEGACY_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; dev->gro_max_size = GRO_LEGACY_MAX_SIZE; + dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE; + dev->gro_ipv4_max_size = GRO_LEGACY_MAX_SIZE; dev->tso_max_size = TSO_LEGACY_MAX_SIZE; dev->tso_max_segs = TSO_MAX_SEGS; dev->upper_level = 1; @@ -10830,6 +10837,7 @@ void unregister_netdevice_many_notify(struct list_head *head, dev_shutdown(dev); dev_xdp_uninstall(dev); + bpf_dev_bound_netdev_unregister(dev); netdev_offload_xstats_disable_all(dev); diff --git a/net/core/dev.h b/net/core/dev.h index 814ed5b7b960..a065b7571441 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -100,6 +100,8 @@ static inline void netif_set_gso_max_size(struct net_device *dev, { /* dev->gso_max_size is read locklessly from sk_setup_caps() */ WRITE_ONCE(dev->gso_max_size, size); + if (size <= GSO_LEGACY_MAX_SIZE) + WRITE_ONCE(dev->gso_ipv4_max_size, size); } static inline void netif_set_gso_max_segs(struct net_device *dev, @@ -114,6 +116,22 @@ static inline void netif_set_gro_max_size(struct net_device *dev, { /* This pairs with the READ_ONCE() in skb_gro_receive() */ WRITE_ONCE(dev->gro_max_size, size); + if (size <= GRO_LEGACY_MAX_SIZE) + WRITE_ONCE(dev->gro_ipv4_max_size, size); +} + +static inline void netif_set_gso_ipv4_max_size(struct net_device *dev, + unsigned int size) +{ + /* dev->gso_ipv4_max_size is read locklessly from sk_setup_caps() */ + WRITE_ONCE(dev->gso_ipv4_max_size, size); +} + +static inline void netif_set_gro_ipv4_max_size(struct net_device *dev, + unsigned int size) +{ + /* This pairs with the READ_ONCE() in skb_gro_receive() */ + WRITE_ONCE(dev->gro_ipv4_max_size, size); } #endif diff --git a/net/core/dst.c b/net/core/dst.c index 6d2dd03dafa8..31c08a3386d3 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -82,12 +82,8 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, if (ops->gc && !(flags & DST_NOCOUNT) && - dst_entries_get_fast(ops) > ops->gc_thresh) { - if (ops->gc(ops)) { - pr_notice_ratelimited("Route cache is full: consider increasing sysctl net.ipv6.route.max_size.\n"); - return NULL; - } - } + dst_entries_get_fast(ops) > ops->gc_thresh) + ops->gc(ops); dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); if (!dst) diff --git a/net/core/filter.c b/net/core/filter.c index 43cc1fe58a2c..d8f9b53f3db6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3381,13 +3381,17 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) #define BPF_F_ADJ_ROOM_ENCAP_L3_MASK (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \ BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) +#define BPF_F_ADJ_ROOM_DECAP_L3_MASK (BPF_F_ADJ_ROOM_DECAP_L3_IPV4 | \ + BPF_F_ADJ_ROOM_DECAP_L3_IPV6) + #define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \ BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \ BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \ BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \ BPF_F_ADJ_ROOM_ENCAP_L2( \ - BPF_ADJ_ROOM_ENCAP_L2_MASK)) + BPF_ADJ_ROOM_ENCAP_L2_MASK) | \ + BPF_F_ADJ_ROOM_DECAP_L3_MASK) static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, u64 flags) @@ -3501,6 +3505,7 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, int ret; if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO | + BPF_F_ADJ_ROOM_DECAP_L3_MASK | BPF_F_ADJ_ROOM_NO_CSUM_RESET))) return -EINVAL; @@ -3519,6 +3524,14 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, if (unlikely(ret < 0)) return ret; + /* Match skb->protocol to new outer l3 protocol */ + if (skb->protocol == htons(ETH_P_IP) && + flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6) + skb->protocol = htons(ETH_P_IPV6); + else if (skb->protocol == htons(ETH_P_IPV6) && + flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4) + skb->protocol = htons(ETH_P_IP); + if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); @@ -3608,6 +3621,22 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, return -ENOTSUPP; } + if (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) { + if (!shrink) + return -EINVAL; + + switch (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) { + case BPF_F_ADJ_ROOM_DECAP_L3_IPV4: + len_min = sizeof(struct iphdr); + break; + case BPF_F_ADJ_ROOM_DECAP_L3_IPV6: + len_min = sizeof(struct ipv6hdr); + break; + default: + return -EINVAL; + } + } + len_cur = skb->len - skb_network_offset(skb); if ((shrink && (len_diff_abs >= len_cur || len_cur - len_diff_abs < len_min)) || @@ -4128,9 +4157,13 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { * bpf_redirect_info to actually enqueue the frame into a map type-specific * bulk queue structure. * - * 3. Before exiting its NAPI poll loop, the driver will call xdp_do_flush(), - * which will flush all the different bulk queues, thus completing the - * redirect. + * 3. Before exiting its NAPI poll loop, the driver will call + * xdp_do_flush(), which will flush all the different bulk queues, + * thus completing the redirect. Note that xdp_do_flush() must be + * called before napi_complete_done() in the driver, as the + * XDP_REDIRECT logic relies on being inside a single NAPI instance + * through to the xdp_do_flush() call for RCU protection of all + * in-kernel data structures. */ /* * Pointers to the map entries will be kept around for this whole sequence of @@ -4618,7 +4651,8 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, struct ip_tunnel_info *info; if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX | - BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER))) + BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER | + BPF_F_NO_TUNNEL_KEY))) return -EINVAL; if (unlikely(size != sizeof(struct bpf_tunnel_key))) { switch (size) { @@ -4656,6 +4690,8 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, info->key.tun_flags &= ~TUNNEL_CSUM; if (flags & BPF_F_SEQ_NUMBER) info->key.tun_flags |= TUNNEL_SEQ; + if (flags & BPF_F_NO_TUNNEL_KEY) + info->key.tun_flags &= ~TUNNEL_KEY; info->key.tun_id = cpu_to_be64(from->tunnel_id); info->key.tos = from->tunnel_tos; @@ -5172,7 +5208,7 @@ static int sol_tcp_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) { - if (sk->sk_prot->setsockopt != tcp_setsockopt) + if (sk->sk_protocol != IPPROTO_TCP) return -EINVAL; switch (optname) { @@ -6844,9 +6880,6 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, FIELD)); \ } while (0) - if (insn > insn_buf) - return insn - insn_buf; - switch (si->off) { case offsetof(struct bpf_tcp_sock, rtt_min): BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) != @@ -8731,7 +8764,7 @@ static bool xdp_is_valid_access(int off, int size, } if (type == BPF_WRITE) { - if (bpf_prog_is_dev_bound(prog->aux)) { + if (bpf_prog_is_offloaded(prog->aux)) { switch (off) { case offsetof(struct xdp_md, rx_queue_index): return __is_valid_xdp_access(off, size); @@ -10144,9 +10177,6 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ } while (0) - if (insn > insn_buf) - return insn - insn_buf; - switch (si->off) { case offsetof(struct bpf_sock_ops, op): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, diff --git a/net/core/gro.c b/net/core/gro.c index 4bac7ea6e025..a606705a0859 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -171,16 +171,18 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) if (p->pp_recycle != skb->pp_recycle) return -ETOOMANYREFS; - /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */ - gro_max_size = READ_ONCE(p->dev->gro_max_size); + /* pairs with WRITE_ONCE() in netif_set_gro(_ipv4)_max_size() */ + gro_max_size = p->protocol == htons(ETH_P_IPV6) ? + READ_ONCE(p->dev->gro_max_size) : + READ_ONCE(p->dev->gro_ipv4_max_size); if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush)) return -E2BIG; if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) { - if (p->protocol != htons(ETH_P_IPV6) || - skb_headroom(p) < sizeof(struct hop_jumbo_hdr) || - ipv6_hdr(p)->nexthdr != IPPROTO_TCP || + if (NAPI_GRO_CB(skb)->proto != IPPROTO_TCP || + (p->protocol == htons(ETH_P_IPV6) && + skb_headroom(p) < sizeof(struct hop_jumbo_hdr)) || p->encapsulation) return -E2BIG; } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 4edd2176e238..6798f6d2423b 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1674,11 +1674,21 @@ static void neigh_proxy_process(struct timer_list *t) spin_unlock(&tbl->proxy_queue.lock); } +static unsigned long neigh_proxy_delay(struct neigh_parms *p) +{ + /* If proxy_delay is zero, do not call get_random_u32_below() + * as it is undefined behavior. + */ + unsigned long proxy_delay = NEIGH_VAR(p, PROXY_DELAY); + + return proxy_delay ? + jiffies + get_random_u32_below(proxy_delay) : jiffies; +} + void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb) { - unsigned long sched_next = jiffies + - get_random_u32_below(NEIGH_VAR(p, PROXY_DELAY)); + unsigned long sched_next = neigh_proxy_delay(p); if (p->qlen > NEIGH_VAR(p, PROXY_QLEN)) { kfree_skb(skb); diff --git a/net/core/net-traces.c b/net/core/net-traces.c index c40cd8dd75c7..805b7385dd8d 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -41,6 +41,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_add); EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_external_learn_add); EXPORT_TRACEPOINT_SYMBOL_GPL(fdb_delete); EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_update); +EXPORT_TRACEPOINT_SYMBOL_GPL(br_mdb_full); #endif #if IS_ENABLED(CONFIG_PAGE_POOL) @@ -61,3 +62,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_send_reset); EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_bad_csum); + +EXPORT_TRACEPOINT_SYMBOL_GPL(sk_data_ready); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 9be762e1d042..a089b704b986 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -682,7 +682,7 @@ int netpoll_setup(struct netpoll *np) } if (!netif_running(ndev)) { - unsigned long atmost, atleast; + unsigned long atmost; np_info(np, "device %s not up yet, forcing it\n", np->dev_name); @@ -694,7 +694,6 @@ int netpoll_setup(struct netpoll *np) } rtnl_unlock(); - atleast = jiffies + HZ/10; atmost = jiffies + carrier_timeout * HZ; while (!netif_carrier_ok(ndev)) { if (time_after(jiffies, atmost)) { @@ -704,15 +703,6 @@ int netpoll_setup(struct netpoll *np) msleep(1); } - /* If carrier appears to come up instantly, we don't - * trust it and pause so that we don't pump all our - * queued console messages into the bitbucket. - */ - - if (time_before(jiffies, atleast)) { - np_notice(np, "carrier detect appears untrustworthy, waiting 4 seconds\n"); - msleep(4000); - } rtnl_lock(); } diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 9b203d8660e4..193c18799865 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -511,8 +511,8 @@ static void page_pool_return_page(struct page_pool *pool, struct page *page) static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page) { int ret; - /* BH protection not needed if current is serving softirq */ - if (in_serving_softirq()) + /* BH protection not needed if current is softirq */ + if (in_softirq()) ret = ptr_ring_produce(&pool->ring, page); else ret = ptr_ring_produce_bh(&pool->ring, page); @@ -570,7 +570,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, page_pool_dma_sync_for_device(pool, page, dma_sync_size); - if (allow_direct && in_serving_softirq() && + if (allow_direct && in_softirq() && page_pool_recycle_in_cache(page, pool)) return NULL; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 64289bc98887..5d8eb57867a9 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -58,7 +58,7 @@ #include "dev.h" #define RTNL_MAX_TYPE 50 -#define RTNL_SLAVE_MAX_TYPE 40 +#define RTNL_SLAVE_MAX_TYPE 42 struct rtnl_link { rtnl_doit_func doit; @@ -1074,6 +1074,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_GSO_MAX_SEGS */ + nla_total_size(4) /* IFLA_GSO_MAX_SIZE */ + nla_total_size(4) /* IFLA_GRO_MAX_SIZE */ + + nla_total_size(4) /* IFLA_GSO_IPV4_MAX_SIZE */ + + nla_total_size(4) /* IFLA_GRO_IPV4_MAX_SIZE */ + nla_total_size(4) /* IFLA_TSO_MAX_SIZE */ + nla_total_size(4) /* IFLA_TSO_MAX_SEGS */ + nla_total_size(1) /* IFLA_OPERSTATE */ @@ -1807,6 +1809,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) || nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) || nla_put_u32(skb, IFLA_GRO_MAX_SIZE, dev->gro_max_size) || + nla_put_u32(skb, IFLA_GSO_IPV4_MAX_SIZE, dev->gso_ipv4_max_size) || + nla_put_u32(skb, IFLA_GRO_IPV4_MAX_SIZE, dev->gro_ipv4_max_size) || nla_put_u32(skb, IFLA_TSO_MAX_SIZE, dev->tso_max_size) || nla_put_u32(skb, IFLA_TSO_MAX_SEGS, dev->tso_max_segs) || #ifdef CONFIG_RPS @@ -1968,6 +1972,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_TSO_MAX_SIZE] = { .type = NLA_REJECT }, [IFLA_TSO_MAX_SEGS] = { .type = NLA_REJECT }, [IFLA_ALLMULTI] = { .type = NLA_REJECT }, + [IFLA_GSO_IPV4_MAX_SIZE] = { .type = NLA_U32 }, + [IFLA_GRO_IPV4_MAX_SIZE] = { .type = NLA_U32 }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -2883,6 +2889,29 @@ static int do_setlink(const struct sk_buff *skb, } } + if (tb[IFLA_GSO_IPV4_MAX_SIZE]) { + u32 max_size = nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]); + + if (max_size > dev->tso_max_size) { + err = -EINVAL; + goto errout; + } + + if (dev->gso_ipv4_max_size ^ max_size) { + netif_set_gso_ipv4_max_size(dev, max_size); + status |= DO_SETLINK_MODIFIED; + } + } + + if (tb[IFLA_GRO_IPV4_MAX_SIZE]) { + u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]); + + if (dev->gro_ipv4_max_size ^ gro_max_size) { + netif_set_gro_ipv4_max_size(dev, gro_max_size); + status |= DO_SETLINK_MODIFIED; + } + } + if (tb[IFLA_OPERSTATE]) set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); @@ -3325,6 +3354,10 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, netif_set_gso_max_segs(dev, nla_get_u32(tb[IFLA_GSO_MAX_SEGS])); if (tb[IFLA_GRO_MAX_SIZE]) netif_set_gro_max_size(dev, nla_get_u32(tb[IFLA_GRO_MAX_SIZE])); + if (tb[IFLA_GSO_IPV4_MAX_SIZE]) + netif_set_gso_ipv4_max_size(dev, nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE])); + if (tb[IFLA_GRO_IPV4_MAX_SIZE]) + netif_set_gro_ipv4_max_size(dev, nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE])); return dev; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a31ff4d83ecc..70a6088e8326 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -79,6 +79,7 @@ #include <linux/capability.h> #include <linux/user_namespace.h> #include <linux/indirect_call_wrapper.h> +#include <linux/textsearch.h> #include "dev.h" #include "sock_destructor.h" @@ -88,6 +89,34 @@ static struct kmem_cache *skbuff_fclone_cache __ro_after_init; #ifdef CONFIG_SKB_EXTENSIONS static struct kmem_cache *skbuff_ext_cache __ro_after_init; #endif + +/* skb_small_head_cache and related code is only supported + * for CONFIG_SLAB and CONFIG_SLUB. + * As soon as SLOB is removed from the kernel, we can clean up this. + */ +#if !defined(CONFIG_SLOB) +# define HAVE_SKB_SMALL_HEAD_CACHE 1 +#endif + +#ifdef HAVE_SKB_SMALL_HEAD_CACHE +static struct kmem_cache *skb_small_head_cache __ro_after_init; + +#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER) + +/* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two. + * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique + * size, and we can differentiate heads from skb_small_head_cache + * vs system slabs by looking at their size (skb_end_offset()). + */ +#define SKB_SMALL_HEAD_CACHE_SIZE \ + (is_power_of_2(SKB_SMALL_HEAD_SIZE) ? \ + (SKB_SMALL_HEAD_SIZE + L1_CACHE_BYTES) : \ + SKB_SMALL_HEAD_SIZE) + +#define SKB_SMALL_HEAD_HEADROOM \ + SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) +#endif /* HAVE_SKB_SMALL_HEAD_CACHE */ + int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; EXPORT_SYMBOL(sysctl_max_skb_frags); @@ -386,8 +415,6 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size) /* build_skb() is wrapper over __build_skb(), that specifically * takes care of skb->head and skb->pfmemalloc - * This means that if @frag_size is not zero, then @data must be backed - * by a page fragment, not kmalloc() or vmalloc() */ struct sk_buff *build_skb(void *data, unsigned int frag_size) { @@ -406,7 +433,7 @@ EXPORT_SYMBOL(build_skb); * build_skb_around - build a network buffer around provided skb * @skb: sk_buff provide by caller, must be memset cleared * @data: data buffer provided by caller - * @frag_size: size of data, or 0 if head was kmalloced + * @frag_size: size of data */ struct sk_buff *build_skb_around(struct sk_buff *skb, void *data, unsigned int frag_size) @@ -428,7 +455,7 @@ EXPORT_SYMBOL(build_skb_around); /** * __napi_build_skb - build a network buffer * @data: data buffer provided by caller - * @frag_size: size of data, or 0 if head was kmalloced + * @frag_size: size of data * * Version of __build_skb() that uses NAPI percpu caches to obtain * skbuff_head instead of inplace allocation. @@ -452,7 +479,7 @@ static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size) /** * napi_build_skb - build a network buffer * @data: data buffer provided by caller - * @frag_size: size of data, or 0 if head was kmalloced + * @frag_size: size of data * * Version of __napi_build_skb() that takes care of skb->head_frag * and skb->pfmemalloc when the data is a page or page fragment. @@ -479,17 +506,37 @@ EXPORT_SYMBOL(napi_build_skb); * may be used. Otherwise, the packet data may be discarded until enough * memory is free */ -static void *kmalloc_reserve(size_t size, gfp_t flags, int node, +static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node, bool *pfmemalloc) { - void *obj; bool ret_pfmemalloc = false; + unsigned int obj_size; + void *obj; + + obj_size = SKB_HEAD_ALIGN(*size); +#ifdef HAVE_SKB_SMALL_HEAD_CACHE + if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE && + !(flags & KMALLOC_NOT_NORMAL_BITS)) { + /* skb_small_head_cache has non power of two size, + * likely forcing SLUB to use order-3 pages. + * We deliberately attempt a NOMEMALLOC allocation only. + */ + obj = kmem_cache_alloc_node(skb_small_head_cache, + flags | __GFP_NOMEMALLOC | __GFP_NOWARN, + node); + if (obj) { + *size = SKB_SMALL_HEAD_CACHE_SIZE; + goto out; + } + } +#endif + *size = obj_size = kmalloc_size_roundup(obj_size); /* * Try a regular allocation, when that fails and we're not entitled * to the reserves, fail. */ - obj = kmalloc_node_track_caller(size, + obj = kmalloc_node_track_caller(obj_size, flags | __GFP_NOMEMALLOC | __GFP_NOWARN, node); if (obj || !(gfp_pfmemalloc_allowed(flags))) @@ -497,7 +544,7 @@ static void *kmalloc_reserve(size_t size, gfp_t flags, int node, /* Try again but now we are using pfmemalloc reserves */ ret_pfmemalloc = true; - obj = kmalloc_node_track_caller(size, flags, node); + obj = kmalloc_node_track_caller(obj_size, flags, node); out: if (pfmemalloc) @@ -534,7 +581,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, { struct kmem_cache *cache; struct sk_buff *skb; - unsigned int osize; bool pfmemalloc; u8 *data; @@ -559,18 +605,14 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, * aligned memory blocks, unless SLUB/SLAB debug is enabled. * Both skb->head and skb_shared_info are cache line aligned. */ - size = SKB_DATA_ALIGN(size); - size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - osize = kmalloc_size_roundup(size); - data = kmalloc_reserve(osize, gfp_mask, node, &pfmemalloc); + data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc); if (unlikely(!data)) goto nodata; /* kmalloc_size_roundup() might give us more room than requested. * Put skb_shared_info exactly at the end of allocated zone, * to allow max possible filling before reallocation. */ - size = SKB_WITH_OVERHEAD(osize); - prefetchw(data + size); + prefetchw(data + SKB_WITH_OVERHEAD(size)); /* * Only clear those fields we need to clear, not those that we will @@ -578,7 +620,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, * the tail pointer in struct sk_buff! */ memset(skb, 0, offsetof(struct sk_buff, tail)); - __build_skb_around(skb, data, osize); + __build_skb_around(skb, data, size); skb->pfmemalloc = pfmemalloc; if (flags & SKB_ALLOC_FCLONE) { @@ -633,8 +675,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, goto skb_success; } - len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - len = SKB_DATA_ALIGN(len); + len = SKB_HEAD_ALIGN(len); if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; @@ -733,8 +774,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, data = page_frag_alloc_1k(&nc->page_small, gfp_mask); pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small); } else { - len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - len = SKB_DATA_ALIGN(len); + len = SKB_HEAD_ALIGN(len); data = page_frag_alloc(&nc->page, len, gfp_mask); pfmemalloc = nc->page.pfmemalloc; @@ -810,6 +850,16 @@ static bool skb_pp_recycle(struct sk_buff *skb, void *data) return page_pool_return_skb_page(virt_to_page(data)); } +static void skb_kfree_head(void *head, unsigned int end_offset) +{ +#ifdef HAVE_SKB_SMALL_HEAD_CACHE + if (end_offset == SKB_SMALL_HEAD_HEADROOM) + kmem_cache_free(skb_small_head_cache, head); + else +#endif + kfree(head); +} + static void skb_free_head(struct sk_buff *skb) { unsigned char *head = skb->head; @@ -819,7 +869,7 @@ static void skb_free_head(struct sk_buff *skb) return; skb_free_frag(head); } else { - kfree(head); + skb_kfree_head(head, skb_end_offset(skb)); } } @@ -932,6 +982,21 @@ void __kfree_skb(struct sk_buff *skb) } EXPORT_SYMBOL(__kfree_skb); +static __always_inline +bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) +{ + if (unlikely(!skb_unref(skb))) + return false; + + DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX); + + if (reason == SKB_CONSUMED) + trace_consume_skb(skb); + else + trace_kfree_skb(skb, __builtin_return_address(0), reason); + return true; +} + /** * kfree_skb_reason - free an sk_buff with special reason * @skb: buffer to free @@ -944,28 +1009,59 @@ EXPORT_SYMBOL(__kfree_skb); void __fix_address kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) { - if (unlikely(!skb_unref(skb))) + if (__kfree_skb_reason(skb, reason)) + __kfree_skb(skb); +} +EXPORT_SYMBOL(kfree_skb_reason); + +#define KFREE_SKB_BULK_SIZE 16 + +struct skb_free_array { + unsigned int skb_count; + void *skb_array[KFREE_SKB_BULK_SIZE]; +}; + +static void kfree_skb_add_bulk(struct sk_buff *skb, + struct skb_free_array *sa, + enum skb_drop_reason reason) +{ + /* if SKB is a clone, don't handle this case */ + if (unlikely(skb->fclone != SKB_FCLONE_UNAVAILABLE)) { + __kfree_skb(skb); return; + } - DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX); + skb_release_all(skb, reason); + sa->skb_array[sa->skb_count++] = skb; - if (reason == SKB_CONSUMED) - trace_consume_skb(skb); - else - trace_kfree_skb(skb, __builtin_return_address(0), reason); - __kfree_skb(skb); + if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) { + kmem_cache_free_bulk(skbuff_head_cache, KFREE_SKB_BULK_SIZE, + sa->skb_array); + sa->skb_count = 0; + } } -EXPORT_SYMBOL(kfree_skb_reason); -void kfree_skb_list_reason(struct sk_buff *segs, - enum skb_drop_reason reason) +void __fix_address +kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason) { + struct skb_free_array sa; + + sa.skb_count = 0; + while (segs) { struct sk_buff *next = segs->next; - kfree_skb_reason(segs, reason); + if (__kfree_skb_reason(segs, reason)) { + skb_poison_list(segs); + kfree_skb_add_bulk(segs, &sa, reason); + } + segs = next; } + + if (sa.skb_count) + kmem_cache_free_bulk(skbuff_head_cache, sa.skb_count, + sa.skb_array); } EXPORT_SYMBOL(kfree_skb_list_reason); @@ -1893,10 +1989,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; - size = SKB_DATA_ALIGN(size); - size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - size = kmalloc_size_roundup(size); - data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL); + data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL); if (!data) goto nodata; size = SKB_WITH_OVERHEAD(size); @@ -1959,7 +2052,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, return 0; nofrags: - kfree(data); + skb_kfree_head(data, size); nodata: return -ENOMEM; } @@ -4596,6 +4689,19 @@ void __init skb_init(void) 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); +#ifdef HAVE_SKB_SMALL_HEAD_CACHE + /* usercopy should only access first SKB_SMALL_HEAD_HEADROOM bytes. + * struct skb_shared_info is located at the end of skb->head, + * and should not be copied to/from user. + */ + skb_small_head_cache = kmem_cache_create_usercopy("skbuff_small_head", + SKB_SMALL_HEAD_CACHE_SIZE, + 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC, + 0, + SKB_SMALL_HEAD_HEADROOM, + NULL); +#endif skb_extensions_init(); } @@ -6244,10 +6350,7 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; - size = SKB_DATA_ALIGN(size); - size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - size = kmalloc_size_roundup(size); - data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL); + data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL); if (!data) return -ENOMEM; size = SKB_WITH_OVERHEAD(size); @@ -6263,7 +6366,7 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, if (skb_cloned(skb)) { /* drop the old head gracefully */ if (skb_orphan_frags(skb, gfp_mask)) { - kfree(data); + skb_kfree_head(data, size); return -ENOMEM; } for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) @@ -6363,10 +6466,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; - size = SKB_DATA_ALIGN(size); - size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - size = kmalloc_size_roundup(size); - data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL); + data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL); if (!data) return -ENOMEM; size = SKB_WITH_OVERHEAD(size); @@ -6374,7 +6474,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, memcpy((struct skb_shared_info *)(data + size), skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0])); if (skb_orphan_frags(skb, gfp_mask)) { - kfree(data); + skb_kfree_head(data, size); return -ENOMEM; } shinfo = (struct skb_shared_info *)(data + size); @@ -6410,7 +6510,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, /* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */ if (skb_has_frag_list(skb)) kfree_skb_list(skb_shinfo(skb)->frag_list); - kfree(data); + skb_kfree_head(data, size); return -ENOMEM; } skb_release_data(skb, SKB_CONSUMED); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 53d0251788aa..f81883759d38 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -8,6 +8,7 @@ #include <net/sock.h> #include <net/tcp.h> #include <net/tls.h> +#include <trace/events/sock.h> static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce) { @@ -1114,6 +1115,8 @@ static void sk_psock_strp_data_ready(struct sock *sk) { struct sk_psock *psock; + trace_sk_data_ready(sk); + rcu_read_lock(); psock = sk_psock(sk); if (likely(psock)) { @@ -1210,6 +1213,8 @@ static void sk_psock_verdict_data_ready(struct sock *sk) { struct socket *sock = sk->sk_socket; + trace_sk_data_ready(sk); + if (unlikely(!sock || !sock->ops || !sock->ops->read_skb)) return; sock->ops->read_skb(sk, sk_psock_verdict_recv); diff --git a/net/core/sock.c b/net/core/sock.c index 6f27c24016fe..afbb02984d5f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2375,17 +2375,22 @@ void sk_free_unlock_clone(struct sock *sk) } EXPORT_SYMBOL_GPL(sk_free_unlock_clone); -static void sk_trim_gso_size(struct sock *sk) +static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) { - if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE) - return; + bool is_ipv6 = false; + u32 max_size; + #if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == AF_INET6 && - sk_is_tcp(sk) && - !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) - return; + is_ipv6 = (sk->sk_family == AF_INET6 && + !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)); #endif - sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE; + /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ + max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) : + READ_ONCE(dst->dev->gso_ipv4_max_size); + if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk)) + max_size = GSO_LEGACY_MAX_SIZE; + + return max_size - (MAX_TCP_HEADER + 1); } void sk_setup_caps(struct sock *sk, struct dst_entry *dst) @@ -2405,10 +2410,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) sk->sk_route_caps &= ~NETIF_F_GSO_MASK; } else { sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; - /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */ - sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size); - sk_trim_gso_size(sk); - sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1); + sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst); /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1); } @@ -3293,6 +3295,8 @@ void sock_def_readable(struct sock *sk) { struct socket_wq *wq; + trace_sk_data_ready(sk); + rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) @@ -3381,7 +3385,7 @@ void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer) } EXPORT_SYMBOL(sk_stop_timer_sync); -void sock_init_data(struct socket *sock, struct sock *sk) +void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) { sk_init_common(sk); sk->sk_send_head = NULL; @@ -3401,11 +3405,10 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_type = sock->type; RCU_INIT_POINTER(sk->sk_wq, &sock->wq); sock->sk = sk; - sk->sk_uid = SOCK_INODE(sock)->i_uid; } else { RCU_INIT_POINTER(sk->sk_wq, NULL); - sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0); } + sk->sk_uid = uid; rwlock_init(&sk->sk_callback_lock); if (sk->sk_kern_sock) @@ -3463,6 +3466,16 @@ void sock_init_data(struct socket *sock, struct sock *sk) refcount_set(&sk->sk_refcnt, 1); atomic_set(&sk->sk_drops, 0); } +EXPORT_SYMBOL(sock_init_data_uid); + +void sock_init_data(struct socket *sock, struct sock *sk) +{ + kuid_t uid = sock ? + SOCK_INODE(sock)->i_uid : + make_kuid(sock_net(sk)->user_ns, 0); + + sock_init_data_uid(sock, sk, uid); +} EXPORT_SYMBOL(sock_init_data); void lock_sock_nested(struct sock *sk, int subclass) diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 5b1ce656baa1..e7b98162c632 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -643,11 +643,6 @@ static __net_init int sysctl_core_net_init(struct net *net) for (tmp = tbl; tmp->procname; tmp++) tmp->data += (char *)net - (char *)&init_net; - - /* Don't export any sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) { - tbl[0].procname = NULL; - } } net->core.sysctl_hdr = register_net_sysctl(net, "net/core", tbl); diff --git a/net/core/xdp.c b/net/core/xdp.c index 844c9d99dc0e..a5a7ecf6391c 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -4,6 +4,7 @@ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. */ #include <linux/bpf.h> +#include <linux/btf_ids.h> #include <linux/filter.h> #include <linux/types.h> #include <linux/mm.h> @@ -709,3 +710,66 @@ struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf) return nxdpf; } + +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "Global functions as their definitions will be in vmlinux BTF"); + +/** + * bpf_xdp_metadata_rx_timestamp - Read XDP frame RX timestamp. + * @ctx: XDP context pointer. + * @timestamp: Return value pointer. + * + * Returns 0 on success or ``-errno`` on error. + */ +int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp) +{ + return -EOPNOTSUPP; +} + +/** + * bpf_xdp_metadata_rx_hash - Read XDP frame RX hash. + * @ctx: XDP context pointer. + * @hash: Return value pointer. + * + * Returns 0 on success or ``-errno`` on error. + */ +int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash) +{ + return -EOPNOTSUPP; +} + +__diag_pop(); + +BTF_SET8_START(xdp_metadata_kfunc_ids) +#define XDP_METADATA_KFUNC(_, name) BTF_ID_FLAGS(func, name, 0) +XDP_METADATA_KFUNC_xxx +#undef XDP_METADATA_KFUNC +BTF_SET8_END(xdp_metadata_kfunc_ids) + +static const struct btf_kfunc_id_set xdp_metadata_kfunc_set = { + .owner = THIS_MODULE, + .set = &xdp_metadata_kfunc_ids, +}; + +BTF_ID_LIST(xdp_metadata_kfunc_ids_unsorted) +#define XDP_METADATA_KFUNC(name, str) BTF_ID(func, str) +XDP_METADATA_KFUNC_xxx +#undef XDP_METADATA_KFUNC + +u32 bpf_xdp_metadata_kfunc_id(int id) +{ + /* xdp_metadata_kfunc_ids is sorted and can't be used */ + return xdp_metadata_kfunc_ids_unsorted[id]; +} + +bool bpf_dev_bound_kfunc_id(u32 btf_id) +{ + return btf_id_set8_contains(&xdp_metadata_kfunc_ids, btf_id); +} + +static int __init xdp_metadata_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &xdp_metadata_kfunc_set); +} +late_initcall(xdp_metadata_init); diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index f9949e051f49..c0c438128575 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -178,6 +178,7 @@ static const struct nla_policy dcbnl_featcfg_nest[DCB_FEATCFG_ATTR_MAX + 1] = { }; static LIST_HEAD(dcb_app_list); +static LIST_HEAD(dcb_rewr_list); static DEFINE_SPINLOCK(dcb_lock); static enum ieee_attrs_app dcbnl_app_attr_type_get(u8 selector) @@ -1099,11 +1100,46 @@ out: return err; } +/* Set or delete APP table or rewrite table entries. The APP struct is validated + * and the appropriate callback function is called. + */ +static int dcbnl_app_table_setdel(struct nlattr *attr, + struct net_device *netdev, + int (*setdel)(struct net_device *dev, + struct dcb_app *app)) +{ + struct dcb_app *app_data; + enum ieee_attrs_app type; + struct nlattr *attr_itr; + int rem, err; + + nla_for_each_nested(attr_itr, attr, rem) { + type = nla_type(attr_itr); + + if (!dcbnl_app_attr_type_validate(type)) + continue; + + if (nla_len(attr_itr) < sizeof(struct dcb_app)) + return -ERANGE; + + app_data = nla_data(attr_itr); + + if (!dcbnl_app_selector_validate(type, app_data->selector)) + return -EINVAL; + + err = setdel(netdev, app_data); + if (err) + return err; + } + + return 0; +} + /* Handle IEEE 802.1Qaz/802.1Qau/802.1Qbb GET commands. */ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; - struct nlattr *ieee, *app; + struct nlattr *ieee, *app, *rewr; struct dcb_app_type *itr; int dcbx; int err; @@ -1206,6 +1242,27 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) spin_unlock_bh(&dcb_lock); nla_nest_end(skb, app); + rewr = nla_nest_start(skb, DCB_ATTR_DCB_REWR_TABLE); + if (!rewr) + return -EMSGSIZE; + + spin_lock_bh(&dcb_lock); + list_for_each_entry(itr, &dcb_rewr_list, list) { + if (itr->ifindex == netdev->ifindex) { + enum ieee_attrs_app type = + dcbnl_app_attr_type_get(itr->app.selector); + err = nla_put(skb, type, sizeof(itr->app), &itr->app); + if (err) { + spin_unlock_bh(&dcb_lock); + nla_nest_cancel(skb, rewr); + return -EMSGSIZE; + } + } + } + + spin_unlock_bh(&dcb_lock); + nla_nest_end(skb, rewr); + if (ops->dcbnl_getapptrust) { err = dcbnl_getapptrust(netdev, skb); if (err) @@ -1567,37 +1624,20 @@ static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh, goto err; } - if (ieee[DCB_ATTR_IEEE_APP_TABLE]) { - struct nlattr *attr; - int rem; - - nla_for_each_nested(attr, ieee[DCB_ATTR_IEEE_APP_TABLE], rem) { - enum ieee_attrs_app type = nla_type(attr); - struct dcb_app *app_data; - - if (!dcbnl_app_attr_type_validate(type)) - continue; - - if (nla_len(attr) < sizeof(struct dcb_app)) { - err = -ERANGE; - goto err; - } - - app_data = nla_data(attr); - - if (!dcbnl_app_selector_validate(type, - app_data->selector)) { - err = -EINVAL; - goto err; - } + if (ieee[DCB_ATTR_DCB_REWR_TABLE]) { + err = dcbnl_app_table_setdel(ieee[DCB_ATTR_DCB_REWR_TABLE], + netdev, + ops->dcbnl_setrewr ?: dcb_setrewr); + if (err) + goto err; + } - if (ops->ieee_setapp) - err = ops->ieee_setapp(netdev, app_data); - else - err = dcb_ieee_setapp(netdev, app_data); - if (err) - goto err; - } + if (ieee[DCB_ATTR_IEEE_APP_TABLE]) { + err = dcbnl_app_table_setdel(ieee[DCB_ATTR_IEEE_APP_TABLE], + netdev, ops->ieee_setapp ?: + dcb_ieee_setapp); + if (err) + goto err; } if (ieee[DCB_ATTR_DCB_APP_TRUST_TABLE]) { @@ -1684,31 +1724,19 @@ static int dcbnl_ieee_del(struct net_device *netdev, struct nlmsghdr *nlh, return err; if (ieee[DCB_ATTR_IEEE_APP_TABLE]) { - struct nlattr *attr; - int rem; - - nla_for_each_nested(attr, ieee[DCB_ATTR_IEEE_APP_TABLE], rem) { - enum ieee_attrs_app type = nla_type(attr); - struct dcb_app *app_data; - - if (!dcbnl_app_attr_type_validate(type)) - continue; - - app_data = nla_data(attr); - - if (!dcbnl_app_selector_validate(type, - app_data->selector)) { - err = -EINVAL; - goto err; - } + err = dcbnl_app_table_setdel(ieee[DCB_ATTR_IEEE_APP_TABLE], + netdev, ops->ieee_delapp ?: + dcb_ieee_delapp); + if (err) + goto err; + } - if (ops->ieee_delapp) - err = ops->ieee_delapp(netdev, app_data); - else - err = dcb_ieee_delapp(netdev, app_data); - if (err) - goto err; - } + if (ieee[DCB_ATTR_DCB_REWR_TABLE]) { + err = dcbnl_app_table_setdel(ieee[DCB_ATTR_DCB_REWR_TABLE], + netdev, + ops->dcbnl_delrewr ?: dcb_delrewr); + if (err) + goto err; } err: @@ -1939,6 +1967,22 @@ out: return ret; } +static struct dcb_app_type *dcb_rewr_lookup(const struct dcb_app *app, + int ifindex, int proto) +{ + struct dcb_app_type *itr; + + list_for_each_entry(itr, &dcb_rewr_list, list) { + if (itr->app.selector == app->selector && + itr->app.priority == app->priority && + itr->ifindex == ifindex && + ((proto == -1) || itr->app.protocol == proto)) + return itr; + } + + return NULL; +} + static struct dcb_app_type *dcb_app_lookup(const struct dcb_app *app, int ifindex, int prio) { @@ -1955,7 +1999,8 @@ static struct dcb_app_type *dcb_app_lookup(const struct dcb_app *app, return NULL; } -static int dcb_app_add(const struct dcb_app *app, int ifindex) +static int dcb_app_add(struct list_head *list, const struct dcb_app *app, + int ifindex) { struct dcb_app_type *entry; @@ -1965,7 +2010,7 @@ static int dcb_app_add(const struct dcb_app *app, int ifindex) memcpy(&entry->app, app, sizeof(*app)); entry->ifindex = ifindex; - list_add(&entry->list, &dcb_app_list); + list_add(&entry->list, list); return 0; } @@ -2028,7 +2073,7 @@ int dcb_setapp(struct net_device *dev, struct dcb_app *new) } /* App type does not exist add new application type */ if (new->priority) - err = dcb_app_add(new, dev->ifindex); + err = dcb_app_add(&dcb_app_list, new, dev->ifindex); out: spin_unlock_bh(&dcb_lock); if (!err) @@ -2061,6 +2106,63 @@ u8 dcb_ieee_getapp_mask(struct net_device *dev, struct dcb_app *app) } EXPORT_SYMBOL(dcb_ieee_getapp_mask); +/* Get protocol value from rewrite entry. */ +u16 dcb_getrewr(struct net_device *dev, struct dcb_app *app) +{ + struct dcb_app_type *itr; + u16 proto = 0; + + spin_lock_bh(&dcb_lock); + itr = dcb_rewr_lookup(app, dev->ifindex, -1); + if (itr) + proto = itr->app.protocol; + spin_unlock_bh(&dcb_lock); + + return proto; +} +EXPORT_SYMBOL(dcb_getrewr); + + /* Add rewrite entry to the rewrite list. */ +int dcb_setrewr(struct net_device *dev, struct dcb_app *new) +{ + int err; + + spin_lock_bh(&dcb_lock); + /* Search for existing match and abort if found. */ + if (dcb_rewr_lookup(new, dev->ifindex, new->protocol)) { + err = -EEXIST; + goto out; + } + + err = dcb_app_add(&dcb_rewr_list, new, dev->ifindex); +out: + spin_unlock_bh(&dcb_lock); + + return err; +} +EXPORT_SYMBOL(dcb_setrewr); + +/* Delete rewrite entry from the rewrite list. */ +int dcb_delrewr(struct net_device *dev, struct dcb_app *del) +{ + struct dcb_app_type *itr; + int err = -ENOENT; + + spin_lock_bh(&dcb_lock); + /* Search for existing match and remove it. */ + itr = dcb_rewr_lookup(del, dev->ifindex, del->protocol); + if (itr) { + list_del(&itr->list); + kfree(itr); + err = 0; + } + + spin_unlock_bh(&dcb_lock); + + return err; +} +EXPORT_SYMBOL(dcb_delrewr); + /** * dcb_ieee_setapp - add IEEE dcb application data to app list * @dev: network interface @@ -2088,7 +2190,7 @@ int dcb_ieee_setapp(struct net_device *dev, struct dcb_app *new) goto out; } - err = dcb_app_add(new, dev->ifindex); + err = dcb_app_add(&dcb_app_list, new, dev->ifindex); out: spin_unlock_bh(&dcb_lock); if (!err) @@ -2130,6 +2232,58 @@ int dcb_ieee_delapp(struct net_device *dev, struct dcb_app *del) } EXPORT_SYMBOL(dcb_ieee_delapp); +/* dcb_getrewr_prio_pcp_mask_map - For a given device, find mapping from + * priorities to the PCP and DEI values assigned to that priority. + */ +void dcb_getrewr_prio_pcp_mask_map(const struct net_device *dev, + struct dcb_rewr_prio_pcp_map *p_map) +{ + int ifindex = dev->ifindex; + struct dcb_app_type *itr; + u8 prio; + + memset(p_map->map, 0, sizeof(p_map->map)); + + spin_lock_bh(&dcb_lock); + list_for_each_entry(itr, &dcb_rewr_list, list) { + if (itr->ifindex == ifindex && + itr->app.selector == DCB_APP_SEL_PCP && + itr->app.protocol < 16 && + itr->app.priority < IEEE_8021QAZ_MAX_TCS) { + prio = itr->app.priority; + p_map->map[prio] |= 1 << itr->app.protocol; + } + } + spin_unlock_bh(&dcb_lock); +} +EXPORT_SYMBOL(dcb_getrewr_prio_pcp_mask_map); + +/* dcb_getrewr_prio_dscp_mask_map - For a given device, find mapping from + * priorities to the DSCP values assigned to that priority. + */ +void dcb_getrewr_prio_dscp_mask_map(const struct net_device *dev, + struct dcb_ieee_app_prio_map *p_map) +{ + int ifindex = dev->ifindex; + struct dcb_app_type *itr; + u8 prio; + + memset(p_map->map, 0, sizeof(p_map->map)); + + spin_lock_bh(&dcb_lock); + list_for_each_entry(itr, &dcb_rewr_list, list) { + if (itr->ifindex == ifindex && + itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP && + itr->app.protocol < 64 && + itr->app.priority < IEEE_8021QAZ_MAX_TCS) { + prio = itr->app.priority; + p_map->map[prio] |= 1ULL << itr->app.protocol; + } + } + spin_unlock_bh(&dcb_lock); +} +EXPORT_SYMBOL(dcb_getrewr_prio_dscp_mask_map); + /* * dcb_ieee_getapp_prio_dscp_mask_map - For a given device, find mapping from * priorities to the DSCP values assigned to that priority. Initialize p_map diff --git a/net/devlink/Makefile b/net/devlink/Makefile new file mode 100644 index 000000000000..daad4521c61e --- /dev/null +++ b/net/devlink/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-y := leftover.o core.o netlink.o dev.o diff --git a/net/devlink/core.c b/net/devlink/core.c new file mode 100644 index 000000000000..a4f47dafb864 --- /dev/null +++ b/net/devlink/core.c @@ -0,0 +1,320 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> + */ + +#include <net/genetlink.h> + +#include "devl_internal.h" + +DEFINE_XARRAY_FLAGS(devlinks, XA_FLAGS_ALLOC); + +void *devlink_priv(struct devlink *devlink) +{ + return &devlink->priv; +} +EXPORT_SYMBOL_GPL(devlink_priv); + +struct devlink *priv_to_devlink(void *priv) +{ + return container_of(priv, struct devlink, priv); +} +EXPORT_SYMBOL_GPL(priv_to_devlink); + +struct device *devlink_to_dev(const struct devlink *devlink) +{ + return devlink->dev; +} +EXPORT_SYMBOL_GPL(devlink_to_dev); + +struct net *devlink_net(const struct devlink *devlink) +{ + return read_pnet(&devlink->_net); +} +EXPORT_SYMBOL_GPL(devlink_net); + +void devl_assert_locked(struct devlink *devlink) +{ + lockdep_assert_held(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devl_assert_locked); + +#ifdef CONFIG_LOCKDEP +/* For use in conjunction with LOCKDEP only e.g. rcu_dereference_protected() */ +bool devl_lock_is_held(struct devlink *devlink) +{ + return lockdep_is_held(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devl_lock_is_held); +#endif + +void devl_lock(struct devlink *devlink) +{ + mutex_lock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devl_lock); + +int devl_trylock(struct devlink *devlink) +{ + return mutex_trylock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devl_trylock); + +void devl_unlock(struct devlink *devlink) +{ + mutex_unlock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devl_unlock); + +/** + * devlink_try_get() - try to obtain a reference on a devlink instance + * @devlink: instance to reference + * + * Obtain a reference on a devlink instance. A reference on a devlink instance + * only implies that it's safe to take the instance lock. It does not imply + * that the instance is registered, use devl_is_registered() after taking + * the instance lock to check registration status. + */ +struct devlink *__must_check devlink_try_get(struct devlink *devlink) +{ + if (refcount_inc_not_zero(&devlink->refcount)) + return devlink; + return NULL; +} + +static void devlink_release(struct work_struct *work) +{ + struct devlink *devlink; + + devlink = container_of(to_rcu_work(work), struct devlink, rwork); + + mutex_destroy(&devlink->lock); + lockdep_unregister_key(&devlink->lock_key); + kfree(devlink); +} + +void devlink_put(struct devlink *devlink) +{ + if (refcount_dec_and_test(&devlink->refcount)) + queue_rcu_work(system_wq, &devlink->rwork); +} + +struct devlink *devlinks_xa_find_get(struct net *net, unsigned long *indexp) +{ + struct devlink *devlink = NULL; + + rcu_read_lock(); +retry: + devlink = xa_find(&devlinks, indexp, ULONG_MAX, DEVLINK_REGISTERED); + if (!devlink) + goto unlock; + + if (!devlink_try_get(devlink)) + goto next; + if (!net_eq(devlink_net(devlink), net)) { + devlink_put(devlink); + goto next; + } +unlock: + rcu_read_unlock(); + return devlink; + +next: + (*indexp)++; + goto retry; +} + +/** + * devl_register - Register devlink instance + * @devlink: devlink + */ +int devl_register(struct devlink *devlink) +{ + ASSERT_DEVLINK_NOT_REGISTERED(devlink); + devl_assert_locked(devlink); + + xa_set_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); + devlink_notify_register(devlink); + + return 0; +} +EXPORT_SYMBOL_GPL(devl_register); + +void devlink_register(struct devlink *devlink) +{ + devl_lock(devlink); + devl_register(devlink); + devl_unlock(devlink); +} +EXPORT_SYMBOL_GPL(devlink_register); + +/** + * devl_unregister - Unregister devlink instance + * @devlink: devlink + */ +void devl_unregister(struct devlink *devlink) +{ + ASSERT_DEVLINK_REGISTERED(devlink); + devl_assert_locked(devlink); + + devlink_notify_unregister(devlink); + xa_clear_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); +} +EXPORT_SYMBOL_GPL(devl_unregister); + +void devlink_unregister(struct devlink *devlink) +{ + devl_lock(devlink); + devl_unregister(devlink); + devl_unlock(devlink); +} +EXPORT_SYMBOL_GPL(devlink_unregister); + +/** + * devlink_alloc_ns - Allocate new devlink instance resources + * in specific namespace + * + * @ops: ops + * @priv_size: size of user private data + * @net: net namespace + * @dev: parent device + * + * Allocate new devlink instance resources, including devlink index + * and name. + */ +struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, + size_t priv_size, struct net *net, + struct device *dev) +{ + struct devlink *devlink; + static u32 last_id; + int ret; + + WARN_ON(!ops || !dev); + if (!devlink_reload_actions_valid(ops)) + return NULL; + + devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL); + if (!devlink) + return NULL; + + ret = xa_alloc_cyclic(&devlinks, &devlink->index, devlink, xa_limit_31b, + &last_id, GFP_KERNEL); + if (ret < 0) + goto err_xa_alloc; + + devlink->netdevice_nb.notifier_call = devlink_port_netdevice_event; + ret = register_netdevice_notifier(&devlink->netdevice_nb); + if (ret) + goto err_register_netdevice_notifier; + + devlink->dev = dev; + devlink->ops = ops; + xa_init_flags(&devlink->ports, XA_FLAGS_ALLOC); + xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC); + write_pnet(&devlink->_net, net); + INIT_LIST_HEAD(&devlink->rate_list); + INIT_LIST_HEAD(&devlink->linecard_list); + INIT_LIST_HEAD(&devlink->sb_list); + INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); + INIT_LIST_HEAD(&devlink->resource_list); + INIT_LIST_HEAD(&devlink->param_list); + INIT_LIST_HEAD(&devlink->region_list); + INIT_LIST_HEAD(&devlink->reporter_list); + INIT_LIST_HEAD(&devlink->trap_list); + INIT_LIST_HEAD(&devlink->trap_group_list); + INIT_LIST_HEAD(&devlink->trap_policer_list); + INIT_RCU_WORK(&devlink->rwork, devlink_release); + lockdep_register_key(&devlink->lock_key); + mutex_init(&devlink->lock); + lockdep_set_class(&devlink->lock, &devlink->lock_key); + refcount_set(&devlink->refcount, 1); + + return devlink; + +err_register_netdevice_notifier: + xa_erase(&devlinks, devlink->index); +err_xa_alloc: + kfree(devlink); + return NULL; +} +EXPORT_SYMBOL_GPL(devlink_alloc_ns); + +/** + * devlink_free - Free devlink instance resources + * + * @devlink: devlink + */ +void devlink_free(struct devlink *devlink) +{ + ASSERT_DEVLINK_NOT_REGISTERED(devlink); + + WARN_ON(!list_empty(&devlink->trap_policer_list)); + WARN_ON(!list_empty(&devlink->trap_group_list)); + WARN_ON(!list_empty(&devlink->trap_list)); + WARN_ON(!list_empty(&devlink->reporter_list)); + WARN_ON(!list_empty(&devlink->region_list)); + WARN_ON(!list_empty(&devlink->param_list)); + WARN_ON(!list_empty(&devlink->resource_list)); + WARN_ON(!list_empty(&devlink->dpipe_table_list)); + WARN_ON(!list_empty(&devlink->sb_list)); + WARN_ON(!list_empty(&devlink->rate_list)); + WARN_ON(!list_empty(&devlink->linecard_list)); + WARN_ON(!xa_empty(&devlink->ports)); + + xa_destroy(&devlink->snapshot_ids); + xa_destroy(&devlink->ports); + + WARN_ON_ONCE(unregister_netdevice_notifier(&devlink->netdevice_nb)); + + xa_erase(&devlinks, devlink->index); + + devlink_put(devlink); +} +EXPORT_SYMBOL_GPL(devlink_free); + +static void __net_exit devlink_pernet_pre_exit(struct net *net) +{ + struct devlink *devlink; + u32 actions_performed; + unsigned long index; + int err; + + /* In case network namespace is getting destroyed, reload + * all devlink instances from this namespace into init_net. + */ + devlinks_xa_for_each_registered_get(net, index, devlink) { + devl_lock(devlink); + err = 0; + if (devl_is_registered(devlink)) + err = devlink_reload(devlink, &init_net, + DEVLINK_RELOAD_ACTION_DRIVER_REINIT, + DEVLINK_RELOAD_LIMIT_UNSPEC, + &actions_performed, NULL); + devl_unlock(devlink); + devlink_put(devlink); + if (err && err != -EOPNOTSUPP) + pr_warn("Failed to reload devlink instance into init_net\n"); + } +} + +static struct pernet_operations devlink_pernet_ops __net_initdata = { + .pre_exit = devlink_pernet_pre_exit, +}; + +static int __init devlink_init(void) +{ + int err; + + err = genl_register_family(&devlink_nl_family); + if (err) + goto out; + err = register_pernet_subsys(&devlink_pernet_ops); + +out: + WARN_ON(err); + return err; +} + +subsys_initcall(devlink_init); diff --git a/net/devlink/dev.c b/net/devlink/dev.c new file mode 100644 index 000000000000..78d824eda5ec --- /dev/null +++ b/net/devlink/dev.c @@ -0,0 +1,1343 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> + */ + +#include <net/genetlink.h> +#include <net/sock.h> +#include "devl_internal.h" + +struct devlink_info_req { + struct sk_buff *msg; + void (*version_cb)(const char *version_name, + enum devlink_info_version_type version_type, + void *version_cb_priv); + void *version_cb_priv; +}; + +struct devlink_reload_combination { + enum devlink_reload_action action; + enum devlink_reload_limit limit; +}; + +static const struct devlink_reload_combination devlink_reload_invalid_combinations[] = { + { + /* can't reinitialize driver with no down time */ + .action = DEVLINK_RELOAD_ACTION_DRIVER_REINIT, + .limit = DEVLINK_RELOAD_LIMIT_NO_RESET, + }, +}; + +static bool +devlink_reload_combination_is_invalid(enum devlink_reload_action action, + enum devlink_reload_limit limit) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(devlink_reload_invalid_combinations); i++) + if (devlink_reload_invalid_combinations[i].action == action && + devlink_reload_invalid_combinations[i].limit == limit) + return true; + return false; +} + +static bool +devlink_reload_action_is_supported(struct devlink *devlink, enum devlink_reload_action action) +{ + return test_bit(action, &devlink->ops->reload_actions); +} + +static bool +devlink_reload_limit_is_supported(struct devlink *devlink, enum devlink_reload_limit limit) +{ + return test_bit(limit, &devlink->ops->reload_limits); +} + +static int devlink_reload_stat_put(struct sk_buff *msg, + enum devlink_reload_limit limit, u32 value) +{ + struct nlattr *reload_stats_entry; + + reload_stats_entry = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_STATS_ENTRY); + if (!reload_stats_entry) + return -EMSGSIZE; + + if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_STATS_LIMIT, limit) || + nla_put_u32(msg, DEVLINK_ATTR_RELOAD_STATS_VALUE, value)) + goto nla_put_failure; + nla_nest_end(msg, reload_stats_entry); + return 0; + +nla_put_failure: + nla_nest_cancel(msg, reload_stats_entry); + return -EMSGSIZE; +} + +static int +devlink_reload_stats_put(struct sk_buff *msg, struct devlink *devlink, bool is_remote) +{ + struct nlattr *reload_stats_attr, *act_info, *act_stats; + int i, j, stat_idx; + u32 value; + + if (!is_remote) + reload_stats_attr = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_STATS); + else + reload_stats_attr = nla_nest_start(msg, DEVLINK_ATTR_REMOTE_RELOAD_STATS); + + if (!reload_stats_attr) + return -EMSGSIZE; + + for (i = 0; i <= DEVLINK_RELOAD_ACTION_MAX; i++) { + if ((!is_remote && + !devlink_reload_action_is_supported(devlink, i)) || + i == DEVLINK_RELOAD_ACTION_UNSPEC) + continue; + act_info = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_ACTION_INFO); + if (!act_info) + goto nla_put_failure; + + if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_ACTION, i)) + goto action_info_nest_cancel; + act_stats = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_ACTION_STATS); + if (!act_stats) + goto action_info_nest_cancel; + + for (j = 0; j <= DEVLINK_RELOAD_LIMIT_MAX; j++) { + /* Remote stats are shown even if not locally supported. + * Stats of actions with unspecified limit are shown + * though drivers don't need to register unspecified + * limit. + */ + if ((!is_remote && j != DEVLINK_RELOAD_LIMIT_UNSPEC && + !devlink_reload_limit_is_supported(devlink, j)) || + devlink_reload_combination_is_invalid(i, j)) + continue; + + stat_idx = j * __DEVLINK_RELOAD_ACTION_MAX + i; + if (!is_remote) + value = devlink->stats.reload_stats[stat_idx]; + else + value = devlink->stats.remote_reload_stats[stat_idx]; + if (devlink_reload_stat_put(msg, j, value)) + goto action_stats_nest_cancel; + } + nla_nest_end(msg, act_stats); + nla_nest_end(msg, act_info); + } + nla_nest_end(msg, reload_stats_attr); + return 0; + +action_stats_nest_cancel: + nla_nest_cancel(msg, act_stats); +action_info_nest_cancel: + nla_nest_cancel(msg, act_info); +nla_put_failure: + nla_nest_cancel(msg, reload_stats_attr); + return -EMSGSIZE; +} + +static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink, + enum devlink_command cmd, u32 portid, + u32 seq, int flags) +{ + struct nlattr *dev_stats; + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_FAILED, devlink->reload_failed)) + goto nla_put_failure; + + dev_stats = nla_nest_start(msg, DEVLINK_ATTR_DEV_STATS); + if (!dev_stats) + goto nla_put_failure; + + if (devlink_reload_stats_put(msg, devlink, false)) + goto dev_stats_nest_cancel; + if (devlink_reload_stats_put(msg, devlink, true)) + goto dev_stats_nest_cancel; + + nla_nest_end(msg, dev_stats); + genlmsg_end(msg, hdr); + return 0; + +dev_stats_nest_cancel: + nla_nest_cancel(msg, dev_stats); +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +void devlink_notify(struct devlink *devlink, enum devlink_command cmd) +{ + struct sk_buff *msg; + int err; + + WARN_ON(cmd != DEVLINK_CMD_NEW && cmd != DEVLINK_CMD_DEL); + WARN_ON(!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_fill(msg, devlink, cmd, 0, 0, 0); + if (err) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct sk_buff *msg; + int err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW, + info->snd_portid, info->snd_seq, 0); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int +devlink_nl_cmd_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb) +{ + return devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI); +} + +const struct devlink_cmd devl_cmd_get = { + .dump_one = devlink_nl_cmd_get_dump_one, +}; + +static void devlink_reload_failed_set(struct devlink *devlink, + bool reload_failed) +{ + if (devlink->reload_failed == reload_failed) + return; + devlink->reload_failed = reload_failed; + devlink_notify(devlink, DEVLINK_CMD_NEW); +} + +bool devlink_is_reload_failed(const struct devlink *devlink) +{ + return devlink->reload_failed; +} +EXPORT_SYMBOL_GPL(devlink_is_reload_failed); + +static void +__devlink_reload_stats_update(struct devlink *devlink, u32 *reload_stats, + enum devlink_reload_limit limit, u32 actions_performed) +{ + unsigned long actions = actions_performed; + int stat_idx; + int action; + + for_each_set_bit(action, &actions, __DEVLINK_RELOAD_ACTION_MAX) { + stat_idx = limit * __DEVLINK_RELOAD_ACTION_MAX + action; + reload_stats[stat_idx]++; + } + devlink_notify(devlink, DEVLINK_CMD_NEW); +} + +static void +devlink_reload_stats_update(struct devlink *devlink, enum devlink_reload_limit limit, + u32 actions_performed) +{ + __devlink_reload_stats_update(devlink, devlink->stats.reload_stats, limit, + actions_performed); +} + +/** + * devlink_remote_reload_actions_performed - Update devlink on reload actions + * performed which are not a direct result of devlink reload call. + * + * This should be called by a driver after performing reload actions in case it was not + * a result of devlink reload call. For example fw_activate was performed as a result + * of devlink reload triggered fw_activate on another host. + * The motivation for this function is to keep data on reload actions performed on this + * function whether it was done due to direct devlink reload call or not. + * + * @devlink: devlink + * @limit: reload limit + * @actions_performed: bitmask of actions performed + */ +void devlink_remote_reload_actions_performed(struct devlink *devlink, + enum devlink_reload_limit limit, + u32 actions_performed) +{ + if (WARN_ON(!actions_performed || + actions_performed & BIT(DEVLINK_RELOAD_ACTION_UNSPEC) || + actions_performed >= BIT(__DEVLINK_RELOAD_ACTION_MAX) || + limit > DEVLINK_RELOAD_LIMIT_MAX)) + return; + + __devlink_reload_stats_update(devlink, devlink->stats.remote_reload_stats, limit, + actions_performed); +} +EXPORT_SYMBOL_GPL(devlink_remote_reload_actions_performed); + +static struct net *devlink_netns_get(struct sk_buff *skb, + struct genl_info *info) +{ + struct nlattr *netns_pid_attr = info->attrs[DEVLINK_ATTR_NETNS_PID]; + struct nlattr *netns_fd_attr = info->attrs[DEVLINK_ATTR_NETNS_FD]; + struct nlattr *netns_id_attr = info->attrs[DEVLINK_ATTR_NETNS_ID]; + struct net *net; + + if (!!netns_pid_attr + !!netns_fd_attr + !!netns_id_attr > 1) { + NL_SET_ERR_MSG_MOD(info->extack, "multiple netns identifying attributes specified"); + return ERR_PTR(-EINVAL); + } + + if (netns_pid_attr) { + net = get_net_ns_by_pid(nla_get_u32(netns_pid_attr)); + } else if (netns_fd_attr) { + net = get_net_ns_by_fd(nla_get_u32(netns_fd_attr)); + } else if (netns_id_attr) { + net = get_net_ns_by_id(sock_net(skb->sk), + nla_get_u32(netns_id_attr)); + if (!net) + net = ERR_PTR(-EINVAL); + } else { + WARN_ON(1); + net = ERR_PTR(-EINVAL); + } + if (IS_ERR(net)) { + NL_SET_ERR_MSG_MOD(info->extack, "Unknown network namespace"); + return ERR_PTR(-EINVAL); + } + if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { + put_net(net); + return ERR_PTR(-EPERM); + } + return net; +} + +static void devlink_reload_netns_change(struct devlink *devlink, + struct net *curr_net, + struct net *dest_net) +{ + /* Userspace needs to be notified about devlink objects + * removed from original and entering new network namespace. + * The rest of the devlink objects are re-created during + * reload process so the notifications are generated separatelly. + */ + devlink_notify_unregister(devlink); + move_netdevice_notifier_net(curr_net, dest_net, + &devlink->netdevice_nb); + write_pnet(&devlink->_net, dest_net); + devlink_notify_register(devlink); +} + +int devlink_reload(struct devlink *devlink, struct net *dest_net, + enum devlink_reload_action action, + enum devlink_reload_limit limit, + u32 *actions_performed, struct netlink_ext_ack *extack) +{ + u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; + struct net *curr_net; + int err; + + memcpy(remote_reload_stats, devlink->stats.remote_reload_stats, + sizeof(remote_reload_stats)); + + err = devlink->ops->reload_down(devlink, !!dest_net, action, limit, extack); + if (err) + return err; + + curr_net = devlink_net(devlink); + if (dest_net && !net_eq(dest_net, curr_net)) + devlink_reload_netns_change(devlink, curr_net, dest_net); + + err = devlink->ops->reload_up(devlink, action, limit, actions_performed, extack); + devlink_reload_failed_set(devlink, !!err); + if (err) + return err; + + WARN_ON(!(*actions_performed & BIT(action))); + /* Catch driver on updating the remote action within devlink reload */ + WARN_ON(memcmp(remote_reload_stats, devlink->stats.remote_reload_stats, + sizeof(remote_reload_stats))); + devlink_reload_stats_update(devlink, limit, *actions_performed); + return 0; +} + +static int +devlink_nl_reload_actions_performed_snd(struct devlink *devlink, u32 actions_performed, + enum devlink_command cmd, struct genl_info *info) +{ + struct sk_buff *msg; + void *hdr; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, &devlink_nl_family, 0, cmd); + if (!hdr) + goto free_msg; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + + if (nla_put_bitfield32(msg, DEVLINK_ATTR_RELOAD_ACTIONS_PERFORMED, actions_performed, + actions_performed)) + goto nla_put_failure; + genlmsg_end(msg, hdr); + + return genlmsg_reply(msg, info); + +nla_put_failure: + genlmsg_cancel(msg, hdr); +free_msg: + nlmsg_free(msg); + return -EMSGSIZE; +} + +int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + enum devlink_reload_action action; + enum devlink_reload_limit limit; + struct net *dest_net = NULL; + u32 actions_performed; + int err; + + err = devlink_resources_validate(devlink, NULL, info); + if (err) { + NL_SET_ERR_MSG_MOD(info->extack, "resources size validation failed"); + return err; + } + + if (info->attrs[DEVLINK_ATTR_RELOAD_ACTION]) + action = nla_get_u8(info->attrs[DEVLINK_ATTR_RELOAD_ACTION]); + else + action = DEVLINK_RELOAD_ACTION_DRIVER_REINIT; + + if (!devlink_reload_action_is_supported(devlink, action)) { + NL_SET_ERR_MSG_MOD(info->extack, + "Requested reload action is not supported by the driver"); + return -EOPNOTSUPP; + } + + limit = DEVLINK_RELOAD_LIMIT_UNSPEC; + if (info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]) { + struct nla_bitfield32 limits; + u32 limits_selected; + + limits = nla_get_bitfield32(info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]); + limits_selected = limits.value & limits.selector; + if (!limits_selected) { + NL_SET_ERR_MSG_MOD(info->extack, "Invalid limit selected"); + return -EINVAL; + } + for (limit = 0 ; limit <= DEVLINK_RELOAD_LIMIT_MAX ; limit++) + if (limits_selected & BIT(limit)) + break; + /* UAPI enables multiselection, but currently it is not used */ + if (limits_selected != BIT(limit)) { + NL_SET_ERR_MSG_MOD(info->extack, + "Multiselection of limit is not supported"); + return -EOPNOTSUPP; + } + if (!devlink_reload_limit_is_supported(devlink, limit)) { + NL_SET_ERR_MSG_MOD(info->extack, + "Requested limit is not supported by the driver"); + return -EOPNOTSUPP; + } + if (devlink_reload_combination_is_invalid(action, limit)) { + NL_SET_ERR_MSG_MOD(info->extack, + "Requested limit is invalid for this action"); + return -EINVAL; + } + } + if (info->attrs[DEVLINK_ATTR_NETNS_PID] || + info->attrs[DEVLINK_ATTR_NETNS_FD] || + info->attrs[DEVLINK_ATTR_NETNS_ID]) { + dest_net = devlink_netns_get(skb, info); + if (IS_ERR(dest_net)) + return PTR_ERR(dest_net); + } + + err = devlink_reload(devlink, dest_net, action, limit, &actions_performed, info->extack); + + if (dest_net) + put_net(dest_net); + + if (err) + return err; + /* For backward compatibility generate reply only if attributes used by user */ + if (!info->attrs[DEVLINK_ATTR_RELOAD_ACTION] && !info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]) + return 0; + + return devlink_nl_reload_actions_performed_snd(devlink, actions_performed, + DEVLINK_CMD_RELOAD, info); +} + +bool devlink_reload_actions_valid(const struct devlink_ops *ops) +{ + const struct devlink_reload_combination *comb; + int i; + + if (!devlink_reload_supported(ops)) { + if (WARN_ON(ops->reload_actions)) + return false; + return true; + } + + if (WARN_ON(!ops->reload_actions || + ops->reload_actions & BIT(DEVLINK_RELOAD_ACTION_UNSPEC) || + ops->reload_actions >= BIT(__DEVLINK_RELOAD_ACTION_MAX))) + return false; + + if (WARN_ON(ops->reload_limits & BIT(DEVLINK_RELOAD_LIMIT_UNSPEC) || + ops->reload_limits >= BIT(__DEVLINK_RELOAD_LIMIT_MAX))) + return false; + + for (i = 0; i < ARRAY_SIZE(devlink_reload_invalid_combinations); i++) { + comb = &devlink_reload_invalid_combinations[i]; + if (ops->reload_actions == BIT(comb->action) && + ops->reload_limits == BIT(comb->limit)) + return false; + } + return true; +} + +static int devlink_nl_eswitch_fill(struct sk_buff *msg, struct devlink *devlink, + enum devlink_command cmd, u32 portid, + u32 seq, int flags) +{ + const struct devlink_ops *ops = devlink->ops; + enum devlink_eswitch_encap_mode encap_mode; + u8 inline_mode; + void *hdr; + int err = 0; + u16 mode; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + err = devlink_nl_put_handle(msg, devlink); + if (err) + goto nla_put_failure; + + if (ops->eswitch_mode_get) { + err = ops->eswitch_mode_get(devlink, &mode); + if (err) + goto nla_put_failure; + err = nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode); + if (err) + goto nla_put_failure; + } + + if (ops->eswitch_inline_mode_get) { + err = ops->eswitch_inline_mode_get(devlink, &inline_mode); + if (err) + goto nla_put_failure; + err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_INLINE_MODE, + inline_mode); + if (err) + goto nla_put_failure; + } + + if (ops->eswitch_encap_mode_get) { + err = ops->eswitch_encap_mode_get(devlink, &encap_mode); + if (err) + goto nla_put_failure; + err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_ENCAP_MODE, encap_mode); + if (err) + goto nla_put_failure; + } + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return err; +} + +int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct sk_buff *msg; + int err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_GET, + info->snd_portid, info->snd_seq, 0); + + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + const struct devlink_ops *ops = devlink->ops; + enum devlink_eswitch_encap_mode encap_mode; + u8 inline_mode; + int err = 0; + u16 mode; + + if (info->attrs[DEVLINK_ATTR_ESWITCH_MODE]) { + if (!ops->eswitch_mode_set) + return -EOPNOTSUPP; + mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]); + err = devlink_rate_nodes_check(devlink, mode, info->extack); + if (err) + return err; + err = ops->eswitch_mode_set(devlink, mode, info->extack); + if (err) + return err; + } + + if (info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]) { + if (!ops->eswitch_inline_mode_set) + return -EOPNOTSUPP; + inline_mode = nla_get_u8(info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]); + err = ops->eswitch_inline_mode_set(devlink, inline_mode, + info->extack); + if (err) + return err; + } + + if (info->attrs[DEVLINK_ATTR_ESWITCH_ENCAP_MODE]) { + if (!ops->eswitch_encap_mode_set) + return -EOPNOTSUPP; + encap_mode = nla_get_u8(info->attrs[DEVLINK_ATTR_ESWITCH_ENCAP_MODE]); + err = ops->eswitch_encap_mode_set(devlink, encap_mode, + info->extack); + if (err) + return err; + } + + return 0; +} + +int devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn) +{ + if (!req->msg) + return 0; + return nla_put_string(req->msg, DEVLINK_ATTR_INFO_SERIAL_NUMBER, sn); +} +EXPORT_SYMBOL_GPL(devlink_info_serial_number_put); + +int devlink_info_board_serial_number_put(struct devlink_info_req *req, + const char *bsn) +{ + if (!req->msg) + return 0; + return nla_put_string(req->msg, DEVLINK_ATTR_INFO_BOARD_SERIAL_NUMBER, + bsn); +} +EXPORT_SYMBOL_GPL(devlink_info_board_serial_number_put); + +static int devlink_info_version_put(struct devlink_info_req *req, int attr, + const char *version_name, + const char *version_value, + enum devlink_info_version_type version_type) +{ + struct nlattr *nest; + int err; + + if (req->version_cb) + req->version_cb(version_name, version_type, + req->version_cb_priv); + + if (!req->msg) + return 0; + + nest = nla_nest_start_noflag(req->msg, attr); + if (!nest) + return -EMSGSIZE; + + err = nla_put_string(req->msg, DEVLINK_ATTR_INFO_VERSION_NAME, + version_name); + if (err) + goto nla_put_failure; + + err = nla_put_string(req->msg, DEVLINK_ATTR_INFO_VERSION_VALUE, + version_value); + if (err) + goto nla_put_failure; + + nla_nest_end(req->msg, nest); + + return 0; + +nla_put_failure: + nla_nest_cancel(req->msg, nest); + return err; +} + +int devlink_info_version_fixed_put(struct devlink_info_req *req, + const char *version_name, + const char *version_value) +{ + return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_FIXED, + version_name, version_value, + DEVLINK_INFO_VERSION_TYPE_NONE); +} +EXPORT_SYMBOL_GPL(devlink_info_version_fixed_put); + +int devlink_info_version_stored_put(struct devlink_info_req *req, + const char *version_name, + const char *version_value) +{ + return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_STORED, + version_name, version_value, + DEVLINK_INFO_VERSION_TYPE_NONE); +} +EXPORT_SYMBOL_GPL(devlink_info_version_stored_put); + +int devlink_info_version_stored_put_ext(struct devlink_info_req *req, + const char *version_name, + const char *version_value, + enum devlink_info_version_type version_type) +{ + return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_STORED, + version_name, version_value, + version_type); +} +EXPORT_SYMBOL_GPL(devlink_info_version_stored_put_ext); + +int devlink_info_version_running_put(struct devlink_info_req *req, + const char *version_name, + const char *version_value) +{ + return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_RUNNING, + version_name, version_value, + DEVLINK_INFO_VERSION_TYPE_NONE); +} +EXPORT_SYMBOL_GPL(devlink_info_version_running_put); + +int devlink_info_version_running_put_ext(struct devlink_info_req *req, + const char *version_name, + const char *version_value, + enum devlink_info_version_type version_type) +{ + return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_RUNNING, + version_name, version_value, + version_type); +} +EXPORT_SYMBOL_GPL(devlink_info_version_running_put_ext); + +static int devlink_nl_driver_info_get(struct device_driver *drv, + struct devlink_info_req *req) +{ + if (!drv) + return 0; + + if (drv->name[0]) + return nla_put_string(req->msg, DEVLINK_ATTR_INFO_DRIVER_NAME, + drv->name); + + return 0; +} + +static int +devlink_nl_info_fill(struct sk_buff *msg, struct devlink *devlink, + enum devlink_command cmd, u32 portid, + u32 seq, int flags, struct netlink_ext_ack *extack) +{ + struct device *dev = devlink_to_dev(devlink); + struct devlink_info_req req = {}; + void *hdr; + int err; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + err = -EMSGSIZE; + if (devlink_nl_put_handle(msg, devlink)) + goto err_cancel_msg; + + req.msg = msg; + if (devlink->ops->info_get) { + err = devlink->ops->info_get(devlink, &req, extack); + if (err) + goto err_cancel_msg; + } + + err = devlink_nl_driver_info_get(dev->driver, &req); + if (err) + goto err_cancel_msg; + + genlmsg_end(msg, hdr); + return 0; + +err_cancel_msg: + genlmsg_cancel(msg, hdr); + return err; +} + +int devlink_nl_cmd_info_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct sk_buff *msg; + int err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET, + info->snd_portid, info->snd_seq, 0, + info->extack); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int +devlink_nl_cmd_info_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb) +{ + int err; + + err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + cb->extack); + if (err == -EOPNOTSUPP) + err = 0; + return err; +} + +const struct devlink_cmd devl_cmd_info_get = { + .dump_one = devlink_nl_cmd_info_get_dump_one, +}; + +static int devlink_nl_flash_update_fill(struct sk_buff *msg, + struct devlink *devlink, + enum devlink_command cmd, + struct devlink_flash_notify *params) +{ + void *hdr; + + hdr = genlmsg_put(msg, 0, 0, &devlink_nl_family, 0, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + + if (cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS) + goto out; + + if (params->status_msg && + nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG, + params->status_msg)) + goto nla_put_failure; + if (params->component && + nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_COMPONENT, + params->component)) + goto nla_put_failure; + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE, + params->done, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL, + params->total, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TIMEOUT, + params->timeout, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + +out: + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static void __devlink_flash_update_notify(struct devlink *devlink, + enum devlink_command cmd, + struct devlink_flash_notify *params) +{ + struct sk_buff *msg; + int err; + + WARN_ON(cmd != DEVLINK_CMD_FLASH_UPDATE && + cmd != DEVLINK_CMD_FLASH_UPDATE_END && + cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS); + + if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) + return; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_flash_update_fill(msg, devlink, cmd, params); + if (err) + goto out_free_msg; + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); + return; + +out_free_msg: + nlmsg_free(msg); +} + +static void devlink_flash_update_begin_notify(struct devlink *devlink) +{ + struct devlink_flash_notify params = {}; + + __devlink_flash_update_notify(devlink, + DEVLINK_CMD_FLASH_UPDATE, + ¶ms); +} + +static void devlink_flash_update_end_notify(struct devlink *devlink) +{ + struct devlink_flash_notify params = {}; + + __devlink_flash_update_notify(devlink, + DEVLINK_CMD_FLASH_UPDATE_END, + ¶ms); +} + +void devlink_flash_update_status_notify(struct devlink *devlink, + const char *status_msg, + const char *component, + unsigned long done, + unsigned long total) +{ + struct devlink_flash_notify params = { + .status_msg = status_msg, + .component = component, + .done = done, + .total = total, + }; + + __devlink_flash_update_notify(devlink, + DEVLINK_CMD_FLASH_UPDATE_STATUS, + ¶ms); +} +EXPORT_SYMBOL_GPL(devlink_flash_update_status_notify); + +void devlink_flash_update_timeout_notify(struct devlink *devlink, + const char *status_msg, + const char *component, + unsigned long timeout) +{ + struct devlink_flash_notify params = { + .status_msg = status_msg, + .component = component, + .timeout = timeout, + }; + + __devlink_flash_update_notify(devlink, + DEVLINK_CMD_FLASH_UPDATE_STATUS, + ¶ms); +} +EXPORT_SYMBOL_GPL(devlink_flash_update_timeout_notify); + +struct devlink_flash_component_lookup_ctx { + const char *lookup_name; + bool lookup_name_found; +}; + +static void +devlink_flash_component_lookup_cb(const char *version_name, + enum devlink_info_version_type version_type, + void *version_cb_priv) +{ + struct devlink_flash_component_lookup_ctx *lookup_ctx = version_cb_priv; + + if (version_type != DEVLINK_INFO_VERSION_TYPE_COMPONENT || + lookup_ctx->lookup_name_found) + return; + + lookup_ctx->lookup_name_found = + !strcmp(lookup_ctx->lookup_name, version_name); +} + +static int devlink_flash_component_get(struct devlink *devlink, + struct nlattr *nla_component, + const char **p_component, + struct netlink_ext_ack *extack) +{ + struct devlink_flash_component_lookup_ctx lookup_ctx = {}; + struct devlink_info_req req = {}; + const char *component; + int ret; + + if (!nla_component) + return 0; + + component = nla_data(nla_component); + + if (!devlink->ops->info_get) { + NL_SET_ERR_MSG_ATTR(extack, nla_component, + "component update is not supported by this device"); + return -EOPNOTSUPP; + } + + lookup_ctx.lookup_name = component; + req.version_cb = devlink_flash_component_lookup_cb; + req.version_cb_priv = &lookup_ctx; + + ret = devlink->ops->info_get(devlink, &req, NULL); + if (ret) + return ret; + + if (!lookup_ctx.lookup_name_found) { + NL_SET_ERR_MSG_ATTR(extack, nla_component, + "selected component is not supported by this device"); + return -EINVAL; + } + *p_component = component; + return 0; +} + +int devlink_nl_cmd_flash_update(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *nla_overwrite_mask, *nla_file_name; + struct devlink_flash_update_params params = {}; + struct devlink *devlink = info->user_ptr[0]; + const char *file_name; + u32 supported_params; + int ret; + + if (!devlink->ops->flash_update) + return -EOPNOTSUPP; + + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME)) + return -EINVAL; + + ret = devlink_flash_component_get(devlink, + info->attrs[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT], + ¶ms.component, info->extack); + if (ret) + return ret; + + supported_params = devlink->ops->supported_flash_update_params; + + nla_overwrite_mask = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK]; + if (nla_overwrite_mask) { + struct nla_bitfield32 sections; + + if (!(supported_params & DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK)) { + NL_SET_ERR_MSG_ATTR(info->extack, nla_overwrite_mask, + "overwrite settings are not supported by this device"); + return -EOPNOTSUPP; + } + sections = nla_get_bitfield32(nla_overwrite_mask); + params.overwrite_mask = sections.value & sections.selector; + } + + nla_file_name = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME]; + file_name = nla_data(nla_file_name); + ret = request_firmware(¶ms.fw, file_name, devlink->dev); + if (ret) { + NL_SET_ERR_MSG_ATTR(info->extack, nla_file_name, + "failed to locate the requested firmware file"); + return ret; + } + + devlink_flash_update_begin_notify(devlink); + ret = devlink->ops->flash_update(devlink, ¶ms, info->extack); + devlink_flash_update_end_notify(devlink); + + release_firmware(params.fw); + + return ret; +} + +static void __devlink_compat_running_version(struct devlink *devlink, + char *buf, size_t len) +{ + struct devlink_info_req req = {}; + const struct nlattr *nlattr; + struct sk_buff *msg; + int rem, err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + req.msg = msg; + err = devlink->ops->info_get(devlink, &req, NULL); + if (err) + goto free_msg; + + nla_for_each_attr(nlattr, (void *)msg->data, msg->len, rem) { + const struct nlattr *kv; + int rem_kv; + + if (nla_type(nlattr) != DEVLINK_ATTR_INFO_VERSION_RUNNING) + continue; + + nla_for_each_nested(kv, nlattr, rem_kv) { + if (nla_type(kv) != DEVLINK_ATTR_INFO_VERSION_VALUE) + continue; + + strlcat(buf, nla_data(kv), len); + strlcat(buf, " ", len); + } + } +free_msg: + nlmsg_free(msg); +} + +void devlink_compat_running_version(struct devlink *devlink, + char *buf, size_t len) +{ + if (!devlink->ops->info_get) + return; + + devl_lock(devlink); + if (devl_is_registered(devlink)) + __devlink_compat_running_version(devlink, buf, len); + devl_unlock(devlink); +} + +int devlink_compat_flash_update(struct devlink *devlink, const char *file_name) +{ + struct devlink_flash_update_params params = {}; + int ret; + + devl_lock(devlink); + if (!devl_is_registered(devlink)) { + ret = -ENODEV; + goto out_unlock; + } + + if (!devlink->ops->flash_update) { + ret = -EOPNOTSUPP; + goto out_unlock; + } + + ret = request_firmware(¶ms.fw, file_name, devlink->dev); + if (ret) + goto out_unlock; + + devlink_flash_update_begin_notify(devlink); + ret = devlink->ops->flash_update(devlink, ¶ms, NULL); + devlink_flash_update_end_notify(devlink); + + release_firmware(params.fw); +out_unlock: + devl_unlock(devlink); + + return ret; +} + +static int +devlink_nl_selftests_fill(struct sk_buff *msg, struct devlink *devlink, + u32 portid, u32 seq, int flags, + struct netlink_ext_ack *extack) +{ + struct nlattr *selftests; + void *hdr; + int err; + int i; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, + DEVLINK_CMD_SELFTESTS_GET); + if (!hdr) + return -EMSGSIZE; + + err = -EMSGSIZE; + if (devlink_nl_put_handle(msg, devlink)) + goto err_cancel_msg; + + selftests = nla_nest_start(msg, DEVLINK_ATTR_SELFTESTS); + if (!selftests) + goto err_cancel_msg; + + for (i = DEVLINK_ATTR_SELFTEST_ID_UNSPEC + 1; + i <= DEVLINK_ATTR_SELFTEST_ID_MAX; i++) { + if (devlink->ops->selftest_check(devlink, i, extack)) { + err = nla_put_flag(msg, i); + if (err) + goto err_cancel_msg; + } + } + + nla_nest_end(msg, selftests); + genlmsg_end(msg, hdr); + return 0; + +err_cancel_msg: + genlmsg_cancel(msg, hdr); + return err; +} + +int devlink_nl_cmd_selftests_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct sk_buff *msg; + int err; + + if (!devlink->ops->selftest_check) + return -EOPNOTSUPP; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_selftests_fill(msg, devlink, info->snd_portid, + info->snd_seq, 0, info->extack); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int +devlink_nl_cmd_selftests_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb) +{ + if (!devlink->ops->selftest_check) + return 0; + + return devlink_nl_selftests_fill(msg, devlink, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + cb->extack); +} + +const struct devlink_cmd devl_cmd_selftests_get = { + .dump_one = devlink_nl_cmd_selftests_get_dump_one, +}; + +static int devlink_selftest_result_put(struct sk_buff *skb, unsigned int id, + enum devlink_selftest_status test_status) +{ + struct nlattr *result_attr; + + result_attr = nla_nest_start(skb, DEVLINK_ATTR_SELFTEST_RESULT); + if (!result_attr) + return -EMSGSIZE; + + if (nla_put_u32(skb, DEVLINK_ATTR_SELFTEST_RESULT_ID, id) || + nla_put_u8(skb, DEVLINK_ATTR_SELFTEST_RESULT_STATUS, + test_status)) + goto nla_put_failure; + + nla_nest_end(skb, result_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, result_attr); + return -EMSGSIZE; +} + +static const struct nla_policy devlink_selftest_nl_policy[DEVLINK_ATTR_SELFTEST_ID_MAX + 1] = { + [DEVLINK_ATTR_SELFTEST_ID_FLASH] = { .type = NLA_FLAG }, +}; + +int devlink_nl_cmd_selftests_run(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *tb[DEVLINK_ATTR_SELFTEST_ID_MAX + 1]; + struct devlink *devlink = info->user_ptr[0]; + struct nlattr *attrs, *selftests; + struct sk_buff *msg; + void *hdr; + int err; + int i; + + if (!devlink->ops->selftest_run || !devlink->ops->selftest_check) + return -EOPNOTSUPP; + + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SELFTESTS)) + return -EINVAL; + + attrs = info->attrs[DEVLINK_ATTR_SELFTESTS]; + + err = nla_parse_nested(tb, DEVLINK_ATTR_SELFTEST_ID_MAX, attrs, + devlink_selftest_nl_policy, info->extack); + if (err < 0) + return err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = -EMSGSIZE; + hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, + &devlink_nl_family, 0, DEVLINK_CMD_SELFTESTS_RUN); + if (!hdr) + goto free_msg; + + if (devlink_nl_put_handle(msg, devlink)) + goto genlmsg_cancel; + + selftests = nla_nest_start(msg, DEVLINK_ATTR_SELFTESTS); + if (!selftests) + goto genlmsg_cancel; + + for (i = DEVLINK_ATTR_SELFTEST_ID_UNSPEC + 1; + i <= DEVLINK_ATTR_SELFTEST_ID_MAX; i++) { + enum devlink_selftest_status test_status; + + if (nla_get_flag(tb[i])) { + if (!devlink->ops->selftest_check(devlink, i, + info->extack)) { + if (devlink_selftest_result_put(msg, i, + DEVLINK_SELFTEST_STATUS_SKIP)) + goto selftests_nest_cancel; + continue; + } + + test_status = devlink->ops->selftest_run(devlink, i, + info->extack); + if (devlink_selftest_result_put(msg, i, test_status)) + goto selftests_nest_cancel; + } + } + + nla_nest_end(msg, selftests); + genlmsg_end(msg, hdr); + return genlmsg_reply(msg, info); + +selftests_nest_cancel: + nla_nest_cancel(msg, selftests); +genlmsg_cancel: + genlmsg_cancel(msg, hdr); +free_msg: + nlmsg_free(msg); + return err; +} diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h new file mode 100644 index 000000000000..941174e157d4 --- /dev/null +++ b/net/devlink/devl_internal.h @@ -0,0 +1,220 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> + */ + +#include <linux/mutex.h> +#include <linux/netdevice.h> +#include <linux/notifier.h> +#include <linux/types.h> +#include <linux/workqueue.h> +#include <linux/xarray.h> +#include <net/devlink.h> +#include <net/net_namespace.h> + +#define DEVLINK_REGISTERED XA_MARK_1 + +#define DEVLINK_RELOAD_STATS_ARRAY_SIZE \ + (__DEVLINK_RELOAD_LIMIT_MAX * __DEVLINK_RELOAD_ACTION_MAX) + +struct devlink_dev_stats { + u32 reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; + u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; +}; + +struct devlink { + u32 index; + struct xarray ports; + struct list_head rate_list; + struct list_head sb_list; + struct list_head dpipe_table_list; + struct list_head resource_list; + struct list_head param_list; + struct list_head region_list; + struct list_head reporter_list; + struct devlink_dpipe_headers *dpipe_headers; + struct list_head trap_list; + struct list_head trap_group_list; + struct list_head trap_policer_list; + struct list_head linecard_list; + const struct devlink_ops *ops; + struct xarray snapshot_ids; + struct devlink_dev_stats stats; + struct device *dev; + possible_net_t _net; + /* Serializes access to devlink instance specific objects such as + * port, sb, dpipe, resource, params, region, traps and more. + */ + struct mutex lock; + struct lock_class_key lock_key; + u8 reload_failed:1; + refcount_t refcount; + struct rcu_work rwork; + struct notifier_block netdevice_nb; + char priv[] __aligned(NETDEV_ALIGN); +}; + +extern struct xarray devlinks; +extern struct genl_family devlink_nl_family; + +/* devlink instances are open to the access from the user space after + * devlink_register() call. Such logical barrier allows us to have certain + * expectations related to locking. + * + * Before *_register() - we are in initialization stage and no parallel + * access possible to the devlink instance. All drivers perform that phase + * by implicitly holding device_lock. + * + * After *_register() - users and driver can access devlink instance at + * the same time. + */ +#define ASSERT_DEVLINK_REGISTERED(d) \ + WARN_ON_ONCE(!xa_get_mark(&devlinks, (d)->index, DEVLINK_REGISTERED)) +#define ASSERT_DEVLINK_NOT_REGISTERED(d) \ + WARN_ON_ONCE(xa_get_mark(&devlinks, (d)->index, DEVLINK_REGISTERED)) + +/* Iterate over devlink pointers which were possible to get reference to. + * devlink_put() needs to be called for each iterated devlink pointer + * in loop body in order to release the reference. + */ +#define devlinks_xa_for_each_registered_get(net, index, devlink) \ + for (index = 0; (devlink = devlinks_xa_find_get(net, &index)); index++) + +struct devlink *devlinks_xa_find_get(struct net *net, unsigned long *indexp); + +static inline bool devl_is_registered(struct devlink *devlink) +{ + devl_assert_locked(devlink); + return xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); +} + +/* Netlink */ +#define DEVLINK_NL_FLAG_NEED_PORT BIT(0) +#define DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT BIT(1) +#define DEVLINK_NL_FLAG_NEED_RATE BIT(2) +#define DEVLINK_NL_FLAG_NEED_RATE_NODE BIT(3) +#define DEVLINK_NL_FLAG_NEED_LINECARD BIT(4) + +enum devlink_multicast_groups { + DEVLINK_MCGRP_CONFIG, +}; + +/* state held across netlink dumps */ +struct devlink_nl_dump_state { + unsigned long instance; + int idx; + union { + /* DEVLINK_CMD_REGION_READ */ + struct { + u64 start_offset; + }; + /* DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET */ + struct { + u64 dump_ts; + }; + }; +}; + +struct devlink_cmd { + int (*dump_one)(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb); +}; + +extern const struct genl_small_ops devlink_nl_ops[56]; + +struct devlink * +devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs); + +void devlink_notify_unregister(struct devlink *devlink); +void devlink_notify_register(struct devlink *devlink); + +int devlink_nl_instance_iter_dumpit(struct sk_buff *msg, + struct netlink_callback *cb); + +static inline struct devlink_nl_dump_state * +devlink_dump_state(struct netlink_callback *cb) +{ + NL_ASSERT_DUMP_CTX_FITS(struct devlink_nl_dump_state); + + return (struct devlink_nl_dump_state *)cb->ctx; +} + +static inline int +devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink) +{ + if (nla_put_string(msg, DEVLINK_ATTR_BUS_NAME, devlink->dev->bus->name)) + return -EMSGSIZE; + if (nla_put_string(msg, DEVLINK_ATTR_DEV_NAME, dev_name(devlink->dev))) + return -EMSGSIZE; + return 0; +} + +/* Commands */ +extern const struct devlink_cmd devl_cmd_get; +extern const struct devlink_cmd devl_cmd_port_get; +extern const struct devlink_cmd devl_cmd_sb_get; +extern const struct devlink_cmd devl_cmd_sb_pool_get; +extern const struct devlink_cmd devl_cmd_sb_port_pool_get; +extern const struct devlink_cmd devl_cmd_sb_tc_pool_bind_get; +extern const struct devlink_cmd devl_cmd_param_get; +extern const struct devlink_cmd devl_cmd_region_get; +extern const struct devlink_cmd devl_cmd_info_get; +extern const struct devlink_cmd devl_cmd_health_reporter_get; +extern const struct devlink_cmd devl_cmd_trap_get; +extern const struct devlink_cmd devl_cmd_trap_group_get; +extern const struct devlink_cmd devl_cmd_trap_policer_get; +extern const struct devlink_cmd devl_cmd_rate_get; +extern const struct devlink_cmd devl_cmd_linecard_get; +extern const struct devlink_cmd devl_cmd_selftests_get; + +/* Notify */ +void devlink_notify(struct devlink *devlink, enum devlink_command cmd); + +/* Ports */ +int devlink_port_netdevice_event(struct notifier_block *nb, + unsigned long event, void *ptr); + +struct devlink_port * +devlink_port_get_from_info(struct devlink *devlink, struct genl_info *info); + +/* Reload */ +bool devlink_reload_actions_valid(const struct devlink_ops *ops); +int devlink_reload(struct devlink *devlink, struct net *dest_net, + enum devlink_reload_action action, + enum devlink_reload_limit limit, + u32 *actions_performed, struct netlink_ext_ack *extack); + +static inline bool devlink_reload_supported(const struct devlink_ops *ops) +{ + return ops->reload_down && ops->reload_up; +} + +/* Resources */ +struct devlink_resource; +int devlink_resources_validate(struct devlink *devlink, + struct devlink_resource *resource, + struct genl_info *info); + +/* Line cards */ +struct devlink_linecard; + +struct devlink_linecard * +devlink_linecard_get_from_info(struct devlink *devlink, struct genl_info *info); + +/* Rates */ +int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, + struct netlink_ext_ack *extack); +struct devlink_rate * +devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info); +struct devlink_rate * +devlink_rate_node_get_from_info(struct devlink *devlink, + struct genl_info *info); +/* Devlink nl cmds */ +int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_info_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_flash_update(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_selftests_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_selftests_run(struct sk_buff *skb, struct genl_info *info); diff --git a/net/core/devlink.c b/net/devlink/leftover.c index 909a10e4b0dd..f05ab093d231 100644 --- a/net/core/devlink.c +++ b/net/devlink/leftover.c @@ -31,58 +31,12 @@ #define CREATE_TRACE_POINTS #include <trace/events/devlink.h> -#define DEVLINK_RELOAD_STATS_ARRAY_SIZE \ - (__DEVLINK_RELOAD_LIMIT_MAX * __DEVLINK_RELOAD_ACTION_MAX) - -struct devlink_dev_stats { - u32 reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; - u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; -}; - -struct devlink { - u32 index; - struct xarray ports; - struct list_head rate_list; - struct list_head sb_list; - struct list_head dpipe_table_list; - struct list_head resource_list; - struct list_head param_list; - struct list_head region_list; - struct list_head reporter_list; - struct mutex reporters_lock; /* protects reporter_list */ - struct devlink_dpipe_headers *dpipe_headers; - struct list_head trap_list; - struct list_head trap_group_list; - struct list_head trap_policer_list; - struct list_head linecard_list; - struct mutex linecards_lock; /* protects linecard_list */ - const struct devlink_ops *ops; - u64 features; - struct xarray snapshot_ids; - struct devlink_dev_stats stats; - struct device *dev; - possible_net_t _net; - /* Serializes access to devlink instance specific objects such as - * port, sb, dpipe, resource, params, region, traps and more. - */ - struct mutex lock; - struct lock_class_key lock_key; - u8 reload_failed:1; - refcount_t refcount; - struct completion comp; - struct rcu_head rcu; - struct notifier_block netdevice_nb; - char priv[] __aligned(NETDEV_ALIGN); -}; - -struct devlink_linecard_ops; -struct devlink_linecard_type; +#include "devl_internal.h" struct devlink_linecard { struct list_head list; struct devlink *devlink; unsigned int index; - refcount_t refcount; const struct devlink_linecard_ops *ops; void *priv; enum devlink_linecard_state state; @@ -122,24 +76,6 @@ struct devlink_resource { void *occ_get_priv; }; -void *devlink_priv(struct devlink *devlink) -{ - return &devlink->priv; -} -EXPORT_SYMBOL_GPL(devlink_priv); - -struct devlink *priv_to_devlink(void *priv) -{ - return container_of(priv, struct devlink, priv); -} -EXPORT_SYMBOL_GPL(priv_to_devlink); - -struct device *devlink_to_dev(const struct devlink *devlink) -{ - return devlink->dev; -} -EXPORT_SYMBOL_GPL(devlink_to_dev); - static struct devlink_dpipe_field devlink_dpipe_fields_ethernet[] = { { .name = "destination mac", @@ -207,176 +143,6 @@ static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ NLA_POLICY_BITFIELD32(DEVLINK_PORT_FN_CAPS_VALID_MASK), }; -static const struct nla_policy devlink_selftest_nl_policy[DEVLINK_ATTR_SELFTEST_ID_MAX + 1] = { - [DEVLINK_ATTR_SELFTEST_ID_FLASH] = { .type = NLA_FLAG }, -}; - -static DEFINE_XARRAY_FLAGS(devlinks, XA_FLAGS_ALLOC); -#define DEVLINK_REGISTERED XA_MARK_1 -#define DEVLINK_UNREGISTERING XA_MARK_2 - -/* devlink instances are open to the access from the user space after - * devlink_register() call. Such logical barrier allows us to have certain - * expectations related to locking. - * - * Before *_register() - we are in initialization stage and no parallel - * access possible to the devlink instance. All drivers perform that phase - * by implicitly holding device_lock. - * - * After *_register() - users and driver can access devlink instance at - * the same time. - */ -#define ASSERT_DEVLINK_REGISTERED(d) \ - WARN_ON_ONCE(!xa_get_mark(&devlinks, (d)->index, DEVLINK_REGISTERED)) -#define ASSERT_DEVLINK_NOT_REGISTERED(d) \ - WARN_ON_ONCE(xa_get_mark(&devlinks, (d)->index, DEVLINK_REGISTERED)) - -struct net *devlink_net(const struct devlink *devlink) -{ - return read_pnet(&devlink->_net); -} -EXPORT_SYMBOL_GPL(devlink_net); - -static void __devlink_put_rcu(struct rcu_head *head) -{ - struct devlink *devlink = container_of(head, struct devlink, rcu); - - complete(&devlink->comp); -} - -void devlink_put(struct devlink *devlink) -{ - if (refcount_dec_and_test(&devlink->refcount)) - /* Make sure unregister operation that may await the completion - * is unblocked only after all users are after the end of - * RCU grace period. - */ - call_rcu(&devlink->rcu, __devlink_put_rcu); -} - -struct devlink *__must_check devlink_try_get(struct devlink *devlink) -{ - if (refcount_inc_not_zero(&devlink->refcount)) - return devlink; - return NULL; -} - -void devl_assert_locked(struct devlink *devlink) -{ - lockdep_assert_held(&devlink->lock); -} -EXPORT_SYMBOL_GPL(devl_assert_locked); - -#ifdef CONFIG_LOCKDEP -/* For use in conjunction with LOCKDEP only e.g. rcu_dereference_protected() */ -bool devl_lock_is_held(struct devlink *devlink) -{ - return lockdep_is_held(&devlink->lock); -} -EXPORT_SYMBOL_GPL(devl_lock_is_held); -#endif - -void devl_lock(struct devlink *devlink) -{ - mutex_lock(&devlink->lock); -} -EXPORT_SYMBOL_GPL(devl_lock); - -int devl_trylock(struct devlink *devlink) -{ - return mutex_trylock(&devlink->lock); -} -EXPORT_SYMBOL_GPL(devl_trylock); - -void devl_unlock(struct devlink *devlink) -{ - mutex_unlock(&devlink->lock); -} -EXPORT_SYMBOL_GPL(devl_unlock); - -static struct devlink * -devlinks_xa_find_get(struct net *net, unsigned long *indexp, xa_mark_t filter, - void * (*xa_find_fn)(struct xarray *, unsigned long *, - unsigned long, xa_mark_t)) -{ - struct devlink *devlink; - - rcu_read_lock(); -retry: - devlink = xa_find_fn(&devlinks, indexp, ULONG_MAX, DEVLINK_REGISTERED); - if (!devlink) - goto unlock; - - /* In case devlink_unregister() was already called and "unregistering" - * mark was set, do not allow to get a devlink reference here. - * This prevents live-lock of devlink_unregister() wait for completion. - */ - if (xa_get_mark(&devlinks, *indexp, DEVLINK_UNREGISTERING)) - goto retry; - - /* For a possible retry, the xa_find_after() should be always used */ - xa_find_fn = xa_find_after; - if (!devlink_try_get(devlink)) - goto retry; - if (!net_eq(devlink_net(devlink), net)) { - devlink_put(devlink); - goto retry; - } -unlock: - rcu_read_unlock(); - return devlink; -} - -static struct devlink *devlinks_xa_find_get_first(struct net *net, - unsigned long *indexp, - xa_mark_t filter) -{ - return devlinks_xa_find_get(net, indexp, filter, xa_find); -} - -static struct devlink *devlinks_xa_find_get_next(struct net *net, - unsigned long *indexp, - xa_mark_t filter) -{ - return devlinks_xa_find_get(net, indexp, filter, xa_find_after); -} - -/* Iterate over devlink pointers which were possible to get reference to. - * devlink_put() needs to be called for each iterated devlink pointer - * in loop body in order to release the reference. - */ -#define devlinks_xa_for_each_get(net, index, devlink, filter) \ - for (index = 0, \ - devlink = devlinks_xa_find_get_first(net, &index, filter); \ - devlink; devlink = devlinks_xa_find_get_next(net, &index, filter)) - -#define devlinks_xa_for_each_registered_get(net, index, devlink) \ - devlinks_xa_for_each_get(net, index, devlink, DEVLINK_REGISTERED) - -static struct devlink *devlink_get_from_attrs(struct net *net, - struct nlattr **attrs) -{ - struct devlink *devlink; - unsigned long index; - char *busname; - char *devname; - - if (!attrs[DEVLINK_ATTR_BUS_NAME] || !attrs[DEVLINK_ATTR_DEV_NAME]) - return ERR_PTR(-EINVAL); - - busname = nla_data(attrs[DEVLINK_ATTR_BUS_NAME]); - devname = nla_data(attrs[DEVLINK_ATTR_DEV_NAME]); - - devlinks_xa_for_each_registered_get(net, index, devlink) { - if (strcmp(devlink->dev->bus->name, busname) == 0 && - strcmp(dev_name(devlink->dev), devname) == 0) - return devlink; - devlink_put(devlink); - } - - return ERR_PTR(-ENODEV); -} - #define ASSERT_DEVLINK_PORT_REGISTERED(devlink_port) \ WARN_ON_ONCE(!(devlink_port)->registered) #define ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port) \ @@ -405,8 +171,8 @@ static struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink, return ERR_PTR(-EINVAL); } -static struct devlink_port *devlink_port_get_from_info(struct devlink *devlink, - struct genl_info *info) +struct devlink_port *devlink_port_get_from_info(struct devlink *devlink, + struct genl_info *info) { return devlink_port_get_from_attrs(devlink, info->attrs); } @@ -466,13 +232,13 @@ devlink_rate_node_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) return devlink_rate_node_get_by_name(devlink, rate_node_name); } -static struct devlink_rate * +struct devlink_rate * devlink_rate_node_get_from_info(struct devlink *devlink, struct genl_info *info) { return devlink_rate_node_get_from_attrs(devlink, info->attrs); } -static struct devlink_rate * +struct devlink_rate * devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info) { struct nlattr **attrs = info->attrs; @@ -511,11 +277,7 @@ devlink_linecard_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) u32 linecard_index = nla_get_u32(attrs[DEVLINK_ATTR_LINECARD_INDEX]); struct devlink_linecard *linecard; - mutex_lock(&devlink->linecards_lock); linecard = devlink_linecard_get_by_index(devlink, linecard_index); - if (linecard) - refcount_inc(&linecard->refcount); - mutex_unlock(&devlink->linecards_lock); if (!linecard) return ERR_PTR(-ENODEV); return linecard; @@ -523,20 +285,12 @@ devlink_linecard_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) return ERR_PTR(-EINVAL); } -static struct devlink_linecard * +struct devlink_linecard * devlink_linecard_get_from_info(struct devlink *devlink, struct genl_info *info) { return devlink_linecard_get_from_attrs(devlink, info->attrs); } -static void devlink_linecard_put(struct devlink_linecard *linecard) -{ - if (refcount_dec_and_test(&linecard->refcount)) { - mutex_destroy(&linecard->state_lock); - kfree(linecard); - } -} - struct devlink_sb { struct list_head list; unsigned int index; @@ -838,104 +592,6 @@ devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id) return NULL; } -#define DEVLINK_NL_FLAG_NEED_PORT BIT(0) -#define DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT BIT(1) -#define DEVLINK_NL_FLAG_NEED_RATE BIT(2) -#define DEVLINK_NL_FLAG_NEED_RATE_NODE BIT(3) -#define DEVLINK_NL_FLAG_NEED_LINECARD BIT(4) - -static int devlink_nl_pre_doit(const struct genl_split_ops *ops, - struct sk_buff *skb, struct genl_info *info) -{ - struct devlink_linecard *linecard; - struct devlink_port *devlink_port; - struct devlink *devlink; - int err; - - devlink = devlink_get_from_attrs(genl_info_net(info), info->attrs); - if (IS_ERR(devlink)) - return PTR_ERR(devlink); - devl_lock(devlink); - info->user_ptr[0] = devlink; - if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) { - devlink_port = devlink_port_get_from_info(devlink, info); - if (IS_ERR(devlink_port)) { - err = PTR_ERR(devlink_port); - goto unlock; - } - info->user_ptr[1] = devlink_port; - } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT) { - devlink_port = devlink_port_get_from_info(devlink, info); - if (!IS_ERR(devlink_port)) - info->user_ptr[1] = devlink_port; - } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_RATE) { - struct devlink_rate *devlink_rate; - - devlink_rate = devlink_rate_get_from_info(devlink, info); - if (IS_ERR(devlink_rate)) { - err = PTR_ERR(devlink_rate); - goto unlock; - } - info->user_ptr[1] = devlink_rate; - } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_RATE_NODE) { - struct devlink_rate *rate_node; - - rate_node = devlink_rate_node_get_from_info(devlink, info); - if (IS_ERR(rate_node)) { - err = PTR_ERR(rate_node); - goto unlock; - } - info->user_ptr[1] = rate_node; - } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_LINECARD) { - linecard = devlink_linecard_get_from_info(devlink, info); - if (IS_ERR(linecard)) { - err = PTR_ERR(linecard); - goto unlock; - } - info->user_ptr[1] = linecard; - } - return 0; - -unlock: - devl_unlock(devlink); - devlink_put(devlink); - return err; -} - -static void devlink_nl_post_doit(const struct genl_split_ops *ops, - struct sk_buff *skb, struct genl_info *info) -{ - struct devlink_linecard *linecard; - struct devlink *devlink; - - devlink = info->user_ptr[0]; - if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_LINECARD) { - linecard = info->user_ptr[1]; - devlink_linecard_put(linecard); - } - devl_unlock(devlink); - devlink_put(devlink); -} - -static struct genl_family devlink_nl_family; - -enum devlink_multicast_groups { - DEVLINK_MCGRP_CONFIG, -}; - -static const struct genl_multicast_group devlink_nl_mcgrps[] = { - [DEVLINK_MCGRP_CONFIG] = { .name = DEVLINK_GENL_MCGRP_CONFIG_NAME }, -}; - -static int devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink) -{ - if (nla_put_string(msg, DEVLINK_ATTR_BUS_NAME, devlink->dev->bus->name)) - return -EMSGSIZE; - if (nla_put_string(msg, DEVLINK_ATTR_DEV_NAME, dev_name(devlink->dev))) - return -EMSGSIZE; - return 0; -} - static int devlink_nl_put_nested_handle(struct sk_buff *msg, struct devlink *devlink) { struct nlattr *nested_attr; @@ -972,185 +628,6 @@ size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port) + nla_total_size(4); /* DEVLINK_ATTR_PORT_INDEX */ } -struct devlink_reload_combination { - enum devlink_reload_action action; - enum devlink_reload_limit limit; -}; - -static const struct devlink_reload_combination devlink_reload_invalid_combinations[] = { - { - /* can't reinitialize driver with no down time */ - .action = DEVLINK_RELOAD_ACTION_DRIVER_REINIT, - .limit = DEVLINK_RELOAD_LIMIT_NO_RESET, - }, -}; - -static bool -devlink_reload_combination_is_invalid(enum devlink_reload_action action, - enum devlink_reload_limit limit) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(devlink_reload_invalid_combinations); i++) - if (devlink_reload_invalid_combinations[i].action == action && - devlink_reload_invalid_combinations[i].limit == limit) - return true; - return false; -} - -static bool -devlink_reload_action_is_supported(struct devlink *devlink, enum devlink_reload_action action) -{ - return test_bit(action, &devlink->ops->reload_actions); -} - -static bool -devlink_reload_limit_is_supported(struct devlink *devlink, enum devlink_reload_limit limit) -{ - return test_bit(limit, &devlink->ops->reload_limits); -} - -static int devlink_reload_stat_put(struct sk_buff *msg, - enum devlink_reload_limit limit, u32 value) -{ - struct nlattr *reload_stats_entry; - - reload_stats_entry = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_STATS_ENTRY); - if (!reload_stats_entry) - return -EMSGSIZE; - - if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_STATS_LIMIT, limit) || - nla_put_u32(msg, DEVLINK_ATTR_RELOAD_STATS_VALUE, value)) - goto nla_put_failure; - nla_nest_end(msg, reload_stats_entry); - return 0; - -nla_put_failure: - nla_nest_cancel(msg, reload_stats_entry); - return -EMSGSIZE; -} - -static int devlink_reload_stats_put(struct sk_buff *msg, struct devlink *devlink, bool is_remote) -{ - struct nlattr *reload_stats_attr, *act_info, *act_stats; - int i, j, stat_idx; - u32 value; - - if (!is_remote) - reload_stats_attr = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_STATS); - else - reload_stats_attr = nla_nest_start(msg, DEVLINK_ATTR_REMOTE_RELOAD_STATS); - - if (!reload_stats_attr) - return -EMSGSIZE; - - for (i = 0; i <= DEVLINK_RELOAD_ACTION_MAX; i++) { - if ((!is_remote && - !devlink_reload_action_is_supported(devlink, i)) || - i == DEVLINK_RELOAD_ACTION_UNSPEC) - continue; - act_info = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_ACTION_INFO); - if (!act_info) - goto nla_put_failure; - - if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_ACTION, i)) - goto action_info_nest_cancel; - act_stats = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_ACTION_STATS); - if (!act_stats) - goto action_info_nest_cancel; - - for (j = 0; j <= DEVLINK_RELOAD_LIMIT_MAX; j++) { - /* Remote stats are shown even if not locally supported. - * Stats of actions with unspecified limit are shown - * though drivers don't need to register unspecified - * limit. - */ - if ((!is_remote && j != DEVLINK_RELOAD_LIMIT_UNSPEC && - !devlink_reload_limit_is_supported(devlink, j)) || - devlink_reload_combination_is_invalid(i, j)) - continue; - - stat_idx = j * __DEVLINK_RELOAD_ACTION_MAX + i; - if (!is_remote) - value = devlink->stats.reload_stats[stat_idx]; - else - value = devlink->stats.remote_reload_stats[stat_idx]; - if (devlink_reload_stat_put(msg, j, value)) - goto action_stats_nest_cancel; - } - nla_nest_end(msg, act_stats); - nla_nest_end(msg, act_info); - } - nla_nest_end(msg, reload_stats_attr); - return 0; - -action_stats_nest_cancel: - nla_nest_cancel(msg, act_stats); -action_info_nest_cancel: - nla_nest_cancel(msg, act_info); -nla_put_failure: - nla_nest_cancel(msg, reload_stats_attr); - return -EMSGSIZE; -} - -static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink, - enum devlink_command cmd, u32 portid, - u32 seq, int flags) -{ - struct nlattr *dev_stats; - void *hdr; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; - if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_FAILED, devlink->reload_failed)) - goto nla_put_failure; - - dev_stats = nla_nest_start(msg, DEVLINK_ATTR_DEV_STATS); - if (!dev_stats) - goto nla_put_failure; - - if (devlink_reload_stats_put(msg, devlink, false)) - goto dev_stats_nest_cancel; - if (devlink_reload_stats_put(msg, devlink, true)) - goto dev_stats_nest_cancel; - - nla_nest_end(msg, dev_stats); - genlmsg_end(msg, hdr); - return 0; - -dev_stats_nest_cancel: - nla_nest_cancel(msg, dev_stats); -nla_put_failure: - genlmsg_cancel(msg, hdr); - return -EMSGSIZE; -} - -static void devlink_notify(struct devlink *devlink, enum devlink_command cmd) -{ - struct sk_buff *msg; - int err; - - WARN_ON(cmd != DEVLINK_CMD_NEW && cmd != DEVLINK_CMD_DEL); - WARN_ON(!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)); - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return; - - err = devlink_nl_fill(msg, devlink, cmd, 0, 0, 0); - if (err) { - nlmsg_free(msg); - return; - } - - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); -} - static int devlink_nl_port_attrs_put(struct sk_buff *msg, struct devlink_port *devlink_port) { @@ -1537,47 +1014,40 @@ static void devlink_rate_notify(struct devlink_rate *devlink_rate, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } -static int devlink_nl_cmd_rate_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +static int +devlink_nl_cmd_rate_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb) { + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_rate *devlink_rate; - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; int idx = 0; int err = 0; - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - devl_lock(devlink); - list_for_each_entry(devlink_rate, &devlink->rate_list, list) { - enum devlink_command cmd = DEVLINK_CMD_RATE_NEW; - u32 id = NETLINK_CB(cb->skb).portid; + list_for_each_entry(devlink_rate, &devlink->rate_list, list) { + enum devlink_command cmd = DEVLINK_CMD_RATE_NEW; + u32 id = NETLINK_CB(cb->skb).portid; - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_rate_fill(msg, devlink_rate, cmd, id, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, NULL); - if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } + if (idx < state->idx) { idx++; + continue; } - devl_unlock(devlink); - devlink_put(devlink); + err = devlink_nl_rate_fill(msg, devlink_rate, cmd, id, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, NULL); + if (err) { + state->idx = idx; + break; + } + idx++; } -out: - if (err != -EMSGSIZE) - return err; - cb->args[0] = idx; - return msg->len; + return err; } +const struct devlink_cmd devl_cmd_rate_get = { + .dump_one = devlink_nl_cmd_rate_get_dump_one, +}; + static int devlink_nl_cmd_rate_get_doit(struct sk_buff *skb, struct genl_info *info) { @@ -1612,58 +1082,6 @@ devlink_rate_is_parent_node(struct devlink_rate *devlink_rate, return false; } -static int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct sk_buff *msg; - int err; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW, - info->snd_portid, info->snd_seq, 0); - if (err) { - nlmsg_free(msg); - return err; - } - - return genlmsg_reply(msg, info); -} - -static int devlink_nl_cmd_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; - int idx = 0; - int err; - - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - if (idx < start) { - idx++; - devlink_put(devlink); - continue; - } - - devl_lock(devlink); - err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI); - devl_unlock(devlink); - devlink_put(devlink); - - if (err) - goto out; - idx++; - } -out: - cb->args[0] = idx; - return msg->len; -} - static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb, struct genl_info *info) { @@ -1686,43 +1104,40 @@ static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb, return genlmsg_reply(msg, info); } -static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +static int +devlink_nl_cmd_port_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb) { - struct devlink *devlink; + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_port *devlink_port; - unsigned long index, port_index; - int start = cb->args[0]; + unsigned long port_index; int idx = 0; - int err; + int err = 0; - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - devl_lock(devlink); - xa_for_each(&devlink->ports, port_index, devlink_port) { - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_port_fill(msg, devlink_port, - DEVLINK_CMD_NEW, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, cb->extack); - if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } + xa_for_each(&devlink->ports, port_index, devlink_port) { + if (idx < state->idx) { idx++; + continue; + } + err = devlink_nl_port_fill(msg, devlink_port, + DEVLINK_CMD_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, cb->extack); + if (err) { + state->idx = idx; + break; } - devl_unlock(devlink); - devlink_put(devlink); + idx++; } -out: - cb->args[0] = idx; - return msg->len; + + return err; } +const struct devlink_cmd devl_cmd_port_get = { + .dump_one = devlink_nl_cmd_port_get_dump_one, +}; + static int devlink_port_type_set(struct devlink_port *devlink_port, enum devlink_port_type port_type) @@ -2465,46 +1880,42 @@ static int devlink_nl_cmd_linecard_get_doit(struct sk_buff *skb, return genlmsg_reply(msg, info); } -static int devlink_nl_cmd_linecard_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +static int devlink_nl_cmd_linecard_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb) { + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_linecard *linecard; - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; int idx = 0; - int err; + int err = 0; - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - mutex_lock(&devlink->linecards_lock); - list_for_each_entry(linecard, &devlink->linecard_list, list) { - if (idx < start) { - idx++; - continue; - } - mutex_lock(&linecard->state_lock); - err = devlink_nl_linecard_fill(msg, devlink, linecard, - DEVLINK_CMD_LINECARD_NEW, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, - cb->extack); - mutex_unlock(&linecard->state_lock); - if (err) { - mutex_unlock(&devlink->linecards_lock); - devlink_put(devlink); - goto out; - } + list_for_each_entry(linecard, &devlink->linecard_list, list) { + if (idx < state->idx) { idx++; + continue; + } + mutex_lock(&linecard->state_lock); + err = devlink_nl_linecard_fill(msg, devlink, linecard, + DEVLINK_CMD_LINECARD_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, + cb->extack); + mutex_unlock(&linecard->state_lock); + if (err) { + state->idx = idx; + break; } - mutex_unlock(&devlink->linecards_lock); - devlink_put(devlink); + idx++; } -out: - cb->args[0] = idx; - return msg->len; + + return err; } +const struct devlink_cmd devl_cmd_linecard_get = { + .dump_one = devlink_nl_cmd_linecard_get_dump_one, +}; + static struct devlink_linecard_type * devlink_linecard_type_lookup(struct devlink_linecard *linecard, const char *type) @@ -2727,43 +2138,39 @@ static int devlink_nl_cmd_sb_get_doit(struct sk_buff *skb, return genlmsg_reply(msg, info); } -static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +static int +devlink_nl_cmd_sb_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb) { - struct devlink *devlink; + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_sb *devlink_sb; - int start = cb->args[0]; - unsigned long index; int idx = 0; - int err; + int err = 0; - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - devl_lock(devlink); - list_for_each_entry(devlink_sb, &devlink->sb_list, list) { - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_sb_fill(msg, devlink, devlink_sb, - DEVLINK_CMD_SB_NEW, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); - if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } + list_for_each_entry(devlink_sb, &devlink->sb_list, list) { + if (idx < state->idx) { idx++; + continue; + } + err = devlink_nl_sb_fill(msg, devlink, devlink_sb, + DEVLINK_CMD_SB_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) { + state->idx = idx; + break; } - devl_unlock(devlink); - devlink_put(devlink); + idx++; } -out: - cb->args[0] = idx; - return msg->len; + + return err; } +const struct devlink_cmd devl_cmd_sb_get = { + .dump_one = devlink_nl_cmd_sb_get_dump_one, +}; + static int devlink_nl_sb_pool_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_sb *devlink_sb, u16 pool_index, enum devlink_command cmd, @@ -2869,46 +2276,39 @@ static int __sb_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx, return 0; } -static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +static int +devlink_nl_cmd_sb_pool_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb) { - struct devlink *devlink; + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_sb *devlink_sb; - int start = cb->args[0]; - unsigned long index; - int idx = 0; int err = 0; + int idx = 0; - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - if (!devlink->ops->sb_pool_get) - goto retry; + if (!devlink->ops->sb_pool_get) + return 0; - devl_lock(devlink); - list_for_each_entry(devlink_sb, &devlink->sb_list, list) { - err = __sb_pool_get_dumpit(msg, start, &idx, devlink, - devlink_sb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq); - if (err == -EOPNOTSUPP) { - err = 0; - } else if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } + list_for_each_entry(devlink_sb, &devlink->sb_list, list) { + err = __sb_pool_get_dumpit(msg, state->idx, &idx, + devlink, devlink_sb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq); + if (err == -EOPNOTSUPP) { + err = 0; + } else if (err) { + state->idx = idx; + break; } - devl_unlock(devlink); -retry: - devlink_put(devlink); } -out: - if (err != -EMSGSIZE) - return err; - cb->args[0] = idx; - return msg->len; + return err; } +const struct devlink_cmd devl_cmd_sb_pool_get = { + .dump_one = devlink_nl_cmd_sb_pool_get_dump_one, +}; + static int devlink_sb_pool_set(struct devlink *devlink, unsigned int sb_index, u16 pool_index, u32 size, enum devlink_sb_threshold_type threshold_type, @@ -3084,46 +2484,39 @@ static int __sb_port_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx, return 0; } -static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +static int +devlink_nl_cmd_sb_port_pool_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb) { - struct devlink *devlink; + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_sb *devlink_sb; - int start = cb->args[0]; - unsigned long index; int idx = 0; int err = 0; - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - if (!devlink->ops->sb_port_pool_get) - goto retry; - - devl_lock(devlink); - list_for_each_entry(devlink_sb, &devlink->sb_list, list) { - err = __sb_port_pool_get_dumpit(msg, start, &idx, - devlink, devlink_sb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq); - if (err == -EOPNOTSUPP) { - err = 0; - } else if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } + if (!devlink->ops->sb_port_pool_get) + return 0; + + list_for_each_entry(devlink_sb, &devlink->sb_list, list) { + err = __sb_port_pool_get_dumpit(msg, state->idx, &idx, + devlink, devlink_sb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq); + if (err == -EOPNOTSUPP) { + err = 0; + } else if (err) { + state->idx = idx; + break; } - devl_unlock(devlink); -retry: - devlink_put(devlink); } -out: - if (err != -EMSGSIZE) - return err; - cb->args[0] = idx; - return msg->len; + return err; } +const struct devlink_cmd devl_cmd_sb_port_pool_get = { + .dump_one = devlink_nl_cmd_sb_port_pool_get_dump_one, +}; + static int devlink_sb_port_pool_set(struct devlink_port *devlink_port, unsigned int sb_index, u16 pool_index, u32 threshold, @@ -3327,47 +2720,38 @@ static int __sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, } static int -devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +devlink_nl_cmd_sb_tc_pool_bind_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb) { - struct devlink *devlink; + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_sb *devlink_sb; - int start = cb->args[0]; - unsigned long index; int idx = 0; int err = 0; - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - if (!devlink->ops->sb_tc_pool_bind_get) - goto retry; + if (!devlink->ops->sb_tc_pool_bind_get) + return 0; - devl_lock(devlink); - list_for_each_entry(devlink_sb, &devlink->sb_list, list) { - err = __sb_tc_pool_bind_get_dumpit(msg, start, &idx, - devlink, - devlink_sb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq); - if (err == -EOPNOTSUPP) { - err = 0; - } else if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } + list_for_each_entry(devlink_sb, &devlink->sb_list, list) { + err = __sb_tc_pool_bind_get_dumpit(msg, state->idx, &idx, + devlink, devlink_sb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq); + if (err == -EOPNOTSUPP) { + err = 0; + } else if (err) { + state->idx = idx; + break; } - devl_unlock(devlink); -retry: - devlink_put(devlink); } -out: - if (err != -EMSGSIZE) - return err; - cb->args[0] = idx; - return msg->len; + return err; } +const struct devlink_cmd devl_cmd_sb_tc_pool_bind_get = { + .dump_one = devlink_nl_cmd_sb_tc_pool_bind_get_dump_one, +}; + static int devlink_sb_tc_pool_bind_set(struct devlink_port *devlink_port, unsigned int sb_index, u16 tc_index, enum devlink_sb_pool_type pool_type, @@ -3455,85 +2839,8 @@ static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb, return -EOPNOTSUPP; } -static int devlink_nl_eswitch_fill(struct sk_buff *msg, struct devlink *devlink, - enum devlink_command cmd, u32 portid, - u32 seq, int flags) -{ - const struct devlink_ops *ops = devlink->ops; - enum devlink_eswitch_encap_mode encap_mode; - u8 inline_mode; - void *hdr; - int err = 0; - u16 mode; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - err = devlink_nl_put_handle(msg, devlink); - if (err) - goto nla_put_failure; - - if (ops->eswitch_mode_get) { - err = ops->eswitch_mode_get(devlink, &mode); - if (err) - goto nla_put_failure; - err = nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode); - if (err) - goto nla_put_failure; - } - - if (ops->eswitch_inline_mode_get) { - err = ops->eswitch_inline_mode_get(devlink, &inline_mode); - if (err) - goto nla_put_failure; - err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_INLINE_MODE, - inline_mode); - if (err) - goto nla_put_failure; - } - - if (ops->eswitch_encap_mode_get) { - err = ops->eswitch_encap_mode_get(devlink, &encap_mode); - if (err) - goto nla_put_failure; - err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_ENCAP_MODE, encap_mode); - if (err) - goto nla_put_failure; - } - - genlmsg_end(msg, hdr); - return 0; - -nla_put_failure: - genlmsg_cancel(msg, hdr); - return err; -} - -static int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct sk_buff *msg; - int err; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = devlink_nl_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_GET, - info->snd_portid, info->snd_seq, 0); - - if (err) { - nlmsg_free(msg); - return err; - } - - return genlmsg_reply(msg, info); -} - -static int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, - struct netlink_ext_ack *extack) +int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, + struct netlink_ext_ack *extack) { struct devlink_rate *devlink_rate; @@ -3545,52 +2852,6 @@ static int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, return 0; } -static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - const struct devlink_ops *ops = devlink->ops; - enum devlink_eswitch_encap_mode encap_mode; - u8 inline_mode; - int err = 0; - u16 mode; - - if (info->attrs[DEVLINK_ATTR_ESWITCH_MODE]) { - if (!ops->eswitch_mode_set) - return -EOPNOTSUPP; - mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]); - err = devlink_rate_nodes_check(devlink, mode, info->extack); - if (err) - return err; - err = ops->eswitch_mode_set(devlink, mode, info->extack); - if (err) - return err; - } - - if (info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]) { - if (!ops->eswitch_inline_mode_set) - return -EOPNOTSUPP; - inline_mode = nla_get_u8( - info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]); - err = ops->eswitch_inline_mode_set(devlink, inline_mode, - info->extack); - if (err) - return err; - } - - if (info->attrs[DEVLINK_ATTR_ESWITCH_ENCAP_MODE]) { - if (!ops->eswitch_encap_mode_set) - return -EOPNOTSUPP; - encap_mode = nla_get_u8(info->attrs[DEVLINK_ATTR_ESWITCH_ENCAP_MODE]); - err = ops->eswitch_encap_mode_set(devlink, encap_mode, - info->extack); - if (err) - return err; - } - - return 0; -} - int devlink_dpipe_match_put(struct sk_buff *skb, struct devlink_dpipe_match *match) { @@ -4561,10 +3822,9 @@ static int devlink_nl_cmd_resource_dump(struct sk_buff *skb, return devlink_resource_fill(info, DEVLINK_CMD_RESOURCE_DUMP, 0); } -static int -devlink_resources_validate(struct devlink *devlink, - struct devlink_resource *resource, - struct genl_info *info) +int devlink_resources_validate(struct devlink *devlink, + struct devlink_resource *resource, + struct genl_info *info) { struct list_head *resource_list; int err = 0; @@ -4584,743 +3844,6 @@ devlink_resources_validate(struct devlink *devlink, return err; } -static struct net *devlink_netns_get(struct sk_buff *skb, - struct genl_info *info) -{ - struct nlattr *netns_pid_attr = info->attrs[DEVLINK_ATTR_NETNS_PID]; - struct nlattr *netns_fd_attr = info->attrs[DEVLINK_ATTR_NETNS_FD]; - struct nlattr *netns_id_attr = info->attrs[DEVLINK_ATTR_NETNS_ID]; - struct net *net; - - if (!!netns_pid_attr + !!netns_fd_attr + !!netns_id_attr > 1) { - NL_SET_ERR_MSG_MOD(info->extack, "multiple netns identifying attributes specified"); - return ERR_PTR(-EINVAL); - } - - if (netns_pid_attr) { - net = get_net_ns_by_pid(nla_get_u32(netns_pid_attr)); - } else if (netns_fd_attr) { - net = get_net_ns_by_fd(nla_get_u32(netns_fd_attr)); - } else if (netns_id_attr) { - net = get_net_ns_by_id(sock_net(skb->sk), - nla_get_u32(netns_id_attr)); - if (!net) - net = ERR_PTR(-EINVAL); - } else { - WARN_ON(1); - net = ERR_PTR(-EINVAL); - } - if (IS_ERR(net)) { - NL_SET_ERR_MSG_MOD(info->extack, "Unknown network namespace"); - return ERR_PTR(-EINVAL); - } - if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { - put_net(net); - return ERR_PTR(-EPERM); - } - return net; -} - -static void devlink_param_notify(struct devlink *devlink, - unsigned int port_index, - struct devlink_param_item *param_item, - enum devlink_command cmd); - -static void devlink_ns_change_notify(struct devlink *devlink, - struct net *dest_net, struct net *curr_net, - bool new) -{ - struct devlink_param_item *param_item; - enum devlink_command cmd; - - /* Userspace needs to be notified about devlink objects - * removed from original and entering new network namespace. - * The rest of the devlink objects are re-created during - * reload process so the notifications are generated separatelly. - */ - - if (!dest_net || net_eq(dest_net, curr_net)) - return; - - if (new) - devlink_notify(devlink, DEVLINK_CMD_NEW); - - cmd = new ? DEVLINK_CMD_PARAM_NEW : DEVLINK_CMD_PARAM_DEL; - list_for_each_entry(param_item, &devlink->param_list, list) - devlink_param_notify(devlink, 0, param_item, cmd); - - if (!new) - devlink_notify(devlink, DEVLINK_CMD_DEL); -} - -static bool devlink_reload_supported(const struct devlink_ops *ops) -{ - return ops->reload_down && ops->reload_up; -} - -static void devlink_reload_failed_set(struct devlink *devlink, - bool reload_failed) -{ - if (devlink->reload_failed == reload_failed) - return; - devlink->reload_failed = reload_failed; - devlink_notify(devlink, DEVLINK_CMD_NEW); -} - -bool devlink_is_reload_failed(const struct devlink *devlink) -{ - return devlink->reload_failed; -} -EXPORT_SYMBOL_GPL(devlink_is_reload_failed); - -static void -__devlink_reload_stats_update(struct devlink *devlink, u32 *reload_stats, - enum devlink_reload_limit limit, u32 actions_performed) -{ - unsigned long actions = actions_performed; - int stat_idx; - int action; - - for_each_set_bit(action, &actions, __DEVLINK_RELOAD_ACTION_MAX) { - stat_idx = limit * __DEVLINK_RELOAD_ACTION_MAX + action; - reload_stats[stat_idx]++; - } - devlink_notify(devlink, DEVLINK_CMD_NEW); -} - -static void -devlink_reload_stats_update(struct devlink *devlink, enum devlink_reload_limit limit, - u32 actions_performed) -{ - __devlink_reload_stats_update(devlink, devlink->stats.reload_stats, limit, - actions_performed); -} - -/** - * devlink_remote_reload_actions_performed - Update devlink on reload actions - * performed which are not a direct result of devlink reload call. - * - * This should be called by a driver after performing reload actions in case it was not - * a result of devlink reload call. For example fw_activate was performed as a result - * of devlink reload triggered fw_activate on another host. - * The motivation for this function is to keep data on reload actions performed on this - * function whether it was done due to direct devlink reload call or not. - * - * @devlink: devlink - * @limit: reload limit - * @actions_performed: bitmask of actions performed - */ -void devlink_remote_reload_actions_performed(struct devlink *devlink, - enum devlink_reload_limit limit, - u32 actions_performed) -{ - if (WARN_ON(!actions_performed || - actions_performed & BIT(DEVLINK_RELOAD_ACTION_UNSPEC) || - actions_performed >= BIT(__DEVLINK_RELOAD_ACTION_MAX) || - limit > DEVLINK_RELOAD_LIMIT_MAX)) - return; - - __devlink_reload_stats_update(devlink, devlink->stats.remote_reload_stats, limit, - actions_performed); -} -EXPORT_SYMBOL_GPL(devlink_remote_reload_actions_performed); - -static int devlink_reload(struct devlink *devlink, struct net *dest_net, - enum devlink_reload_action action, enum devlink_reload_limit limit, - u32 *actions_performed, struct netlink_ext_ack *extack) -{ - u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; - struct net *curr_net; - int err; - - memcpy(remote_reload_stats, devlink->stats.remote_reload_stats, - sizeof(remote_reload_stats)); - - curr_net = devlink_net(devlink); - devlink_ns_change_notify(devlink, dest_net, curr_net, false); - err = devlink->ops->reload_down(devlink, !!dest_net, action, limit, extack); - if (err) - return err; - - if (dest_net && !net_eq(dest_net, curr_net)) { - move_netdevice_notifier_net(curr_net, dest_net, - &devlink->netdevice_nb); - write_pnet(&devlink->_net, dest_net); - } - - err = devlink->ops->reload_up(devlink, action, limit, actions_performed, extack); - devlink_reload_failed_set(devlink, !!err); - if (err) - return err; - - devlink_ns_change_notify(devlink, dest_net, curr_net, true); - WARN_ON(!(*actions_performed & BIT(action))); - /* Catch driver on updating the remote action within devlink reload */ - WARN_ON(memcmp(remote_reload_stats, devlink->stats.remote_reload_stats, - sizeof(remote_reload_stats))); - devlink_reload_stats_update(devlink, limit, *actions_performed); - return 0; -} - -static int -devlink_nl_reload_actions_performed_snd(struct devlink *devlink, u32 actions_performed, - enum devlink_command cmd, struct genl_info *info) -{ - struct sk_buff *msg; - void *hdr; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, &devlink_nl_family, 0, cmd); - if (!hdr) - goto free_msg; - - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; - - if (nla_put_bitfield32(msg, DEVLINK_ATTR_RELOAD_ACTIONS_PERFORMED, actions_performed, - actions_performed)) - goto nla_put_failure; - genlmsg_end(msg, hdr); - - return genlmsg_reply(msg, info); - -nla_put_failure: - genlmsg_cancel(msg, hdr); -free_msg: - nlmsg_free(msg); - return -EMSGSIZE; -} - -static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - enum devlink_reload_action action; - enum devlink_reload_limit limit; - struct net *dest_net = NULL; - u32 actions_performed; - int err; - - if (!(devlink->features & DEVLINK_F_RELOAD)) - return -EOPNOTSUPP; - - err = devlink_resources_validate(devlink, NULL, info); - if (err) { - NL_SET_ERR_MSG_MOD(info->extack, "resources size validation failed"); - return err; - } - - if (info->attrs[DEVLINK_ATTR_RELOAD_ACTION]) - action = nla_get_u8(info->attrs[DEVLINK_ATTR_RELOAD_ACTION]); - else - action = DEVLINK_RELOAD_ACTION_DRIVER_REINIT; - - if (!devlink_reload_action_is_supported(devlink, action)) { - NL_SET_ERR_MSG_MOD(info->extack, - "Requested reload action is not supported by the driver"); - return -EOPNOTSUPP; - } - - limit = DEVLINK_RELOAD_LIMIT_UNSPEC; - if (info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]) { - struct nla_bitfield32 limits; - u32 limits_selected; - - limits = nla_get_bitfield32(info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]); - limits_selected = limits.value & limits.selector; - if (!limits_selected) { - NL_SET_ERR_MSG_MOD(info->extack, "Invalid limit selected"); - return -EINVAL; - } - for (limit = 0 ; limit <= DEVLINK_RELOAD_LIMIT_MAX ; limit++) - if (limits_selected & BIT(limit)) - break; - /* UAPI enables multiselection, but currently it is not used */ - if (limits_selected != BIT(limit)) { - NL_SET_ERR_MSG_MOD(info->extack, - "Multiselection of limit is not supported"); - return -EOPNOTSUPP; - } - if (!devlink_reload_limit_is_supported(devlink, limit)) { - NL_SET_ERR_MSG_MOD(info->extack, - "Requested limit is not supported by the driver"); - return -EOPNOTSUPP; - } - if (devlink_reload_combination_is_invalid(action, limit)) { - NL_SET_ERR_MSG_MOD(info->extack, - "Requested limit is invalid for this action"); - return -EINVAL; - } - } - if (info->attrs[DEVLINK_ATTR_NETNS_PID] || - info->attrs[DEVLINK_ATTR_NETNS_FD] || - info->attrs[DEVLINK_ATTR_NETNS_ID]) { - dest_net = devlink_netns_get(skb, info); - if (IS_ERR(dest_net)) - return PTR_ERR(dest_net); - } - - err = devlink_reload(devlink, dest_net, action, limit, &actions_performed, info->extack); - - if (dest_net) - put_net(dest_net); - - if (err) - return err; - /* For backward compatibility generate reply only if attributes used by user */ - if (!info->attrs[DEVLINK_ATTR_RELOAD_ACTION] && !info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]) - return 0; - - return devlink_nl_reload_actions_performed_snd(devlink, actions_performed, - DEVLINK_CMD_RELOAD, info); -} - -static int devlink_nl_flash_update_fill(struct sk_buff *msg, - struct devlink *devlink, - enum devlink_command cmd, - struct devlink_flash_notify *params) -{ - void *hdr; - - hdr = genlmsg_put(msg, 0, 0, &devlink_nl_family, 0, cmd); - if (!hdr) - return -EMSGSIZE; - - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; - - if (cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS) - goto out; - - if (params->status_msg && - nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG, - params->status_msg)) - goto nla_put_failure; - if (params->component && - nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_COMPONENT, - params->component)) - goto nla_put_failure; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE, - params->done, DEVLINK_ATTR_PAD)) - goto nla_put_failure; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL, - params->total, DEVLINK_ATTR_PAD)) - goto nla_put_failure; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TIMEOUT, - params->timeout, DEVLINK_ATTR_PAD)) - goto nla_put_failure; - -out: - genlmsg_end(msg, hdr); - return 0; - -nla_put_failure: - genlmsg_cancel(msg, hdr); - return -EMSGSIZE; -} - -static void __devlink_flash_update_notify(struct devlink *devlink, - enum devlink_command cmd, - struct devlink_flash_notify *params) -{ - struct sk_buff *msg; - int err; - - WARN_ON(cmd != DEVLINK_CMD_FLASH_UPDATE && - cmd != DEVLINK_CMD_FLASH_UPDATE_END && - cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS); - - if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) - return; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return; - - err = devlink_nl_flash_update_fill(msg, devlink, cmd, params); - if (err) - goto out_free_msg; - - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); - return; - -out_free_msg: - nlmsg_free(msg); -} - -static void devlink_flash_update_begin_notify(struct devlink *devlink) -{ - struct devlink_flash_notify params = {}; - - __devlink_flash_update_notify(devlink, - DEVLINK_CMD_FLASH_UPDATE, - ¶ms); -} - -static void devlink_flash_update_end_notify(struct devlink *devlink) -{ - struct devlink_flash_notify params = {}; - - __devlink_flash_update_notify(devlink, - DEVLINK_CMD_FLASH_UPDATE_END, - ¶ms); -} - -void devlink_flash_update_status_notify(struct devlink *devlink, - const char *status_msg, - const char *component, - unsigned long done, - unsigned long total) -{ - struct devlink_flash_notify params = { - .status_msg = status_msg, - .component = component, - .done = done, - .total = total, - }; - - __devlink_flash_update_notify(devlink, - DEVLINK_CMD_FLASH_UPDATE_STATUS, - ¶ms); -} -EXPORT_SYMBOL_GPL(devlink_flash_update_status_notify); - -void devlink_flash_update_timeout_notify(struct devlink *devlink, - const char *status_msg, - const char *component, - unsigned long timeout) -{ - struct devlink_flash_notify params = { - .status_msg = status_msg, - .component = component, - .timeout = timeout, - }; - - __devlink_flash_update_notify(devlink, - DEVLINK_CMD_FLASH_UPDATE_STATUS, - ¶ms); -} -EXPORT_SYMBOL_GPL(devlink_flash_update_timeout_notify); - -struct devlink_info_req { - struct sk_buff *msg; - void (*version_cb)(const char *version_name, - enum devlink_info_version_type version_type, - void *version_cb_priv); - void *version_cb_priv; -}; - -struct devlink_flash_component_lookup_ctx { - const char *lookup_name; - bool lookup_name_found; -}; - -static void -devlink_flash_component_lookup_cb(const char *version_name, - enum devlink_info_version_type version_type, - void *version_cb_priv) -{ - struct devlink_flash_component_lookup_ctx *lookup_ctx = version_cb_priv; - - if (version_type != DEVLINK_INFO_VERSION_TYPE_COMPONENT || - lookup_ctx->lookup_name_found) - return; - - lookup_ctx->lookup_name_found = - !strcmp(lookup_ctx->lookup_name, version_name); -} - -static int devlink_flash_component_get(struct devlink *devlink, - struct nlattr *nla_component, - const char **p_component, - struct netlink_ext_ack *extack) -{ - struct devlink_flash_component_lookup_ctx lookup_ctx = {}; - struct devlink_info_req req = {}; - const char *component; - int ret; - - if (!nla_component) - return 0; - - component = nla_data(nla_component); - - if (!devlink->ops->info_get) { - NL_SET_ERR_MSG_ATTR(extack, nla_component, - "component update is not supported by this device"); - return -EOPNOTSUPP; - } - - lookup_ctx.lookup_name = component; - req.version_cb = devlink_flash_component_lookup_cb; - req.version_cb_priv = &lookup_ctx; - - ret = devlink->ops->info_get(devlink, &req, NULL); - if (ret) - return ret; - - if (!lookup_ctx.lookup_name_found) { - NL_SET_ERR_MSG_ATTR(extack, nla_component, - "selected component is not supported by this device"); - return -EINVAL; - } - *p_component = component; - return 0; -} - -static int devlink_nl_cmd_flash_update(struct sk_buff *skb, - struct genl_info *info) -{ - struct nlattr *nla_overwrite_mask, *nla_file_name; - struct devlink_flash_update_params params = {}; - struct devlink *devlink = info->user_ptr[0]; - const char *file_name; - u32 supported_params; - int ret; - - if (!devlink->ops->flash_update) - return -EOPNOTSUPP; - - if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME)) - return -EINVAL; - - ret = devlink_flash_component_get(devlink, - info->attrs[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT], - ¶ms.component, info->extack); - if (ret) - return ret; - - supported_params = devlink->ops->supported_flash_update_params; - - nla_overwrite_mask = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK]; - if (nla_overwrite_mask) { - struct nla_bitfield32 sections; - - if (!(supported_params & DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK)) { - NL_SET_ERR_MSG_ATTR(info->extack, nla_overwrite_mask, - "overwrite settings are not supported by this device"); - return -EOPNOTSUPP; - } - sections = nla_get_bitfield32(nla_overwrite_mask); - params.overwrite_mask = sections.value & sections.selector; - } - - nla_file_name = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME]; - file_name = nla_data(nla_file_name); - ret = request_firmware(¶ms.fw, file_name, devlink->dev); - if (ret) { - NL_SET_ERR_MSG_ATTR(info->extack, nla_file_name, "failed to locate the requested firmware file"); - return ret; - } - - devlink_flash_update_begin_notify(devlink); - ret = devlink->ops->flash_update(devlink, ¶ms, info->extack); - devlink_flash_update_end_notify(devlink); - - release_firmware(params.fw); - - return ret; -} - -static int -devlink_nl_selftests_fill(struct sk_buff *msg, struct devlink *devlink, - u32 portid, u32 seq, int flags, - struct netlink_ext_ack *extack) -{ - struct nlattr *selftests; - void *hdr; - int err; - int i; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, - DEVLINK_CMD_SELFTESTS_GET); - if (!hdr) - return -EMSGSIZE; - - err = -EMSGSIZE; - if (devlink_nl_put_handle(msg, devlink)) - goto err_cancel_msg; - - selftests = nla_nest_start(msg, DEVLINK_ATTR_SELFTESTS); - if (!selftests) - goto err_cancel_msg; - - for (i = DEVLINK_ATTR_SELFTEST_ID_UNSPEC + 1; - i <= DEVLINK_ATTR_SELFTEST_ID_MAX; i++) { - if (devlink->ops->selftest_check(devlink, i, extack)) { - err = nla_put_flag(msg, i); - if (err) - goto err_cancel_msg; - } - } - - nla_nest_end(msg, selftests); - genlmsg_end(msg, hdr); - return 0; - -err_cancel_msg: - genlmsg_cancel(msg, hdr); - return err; -} - -static int devlink_nl_cmd_selftests_get_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct sk_buff *msg; - int err; - - if (!devlink->ops->selftest_check) - return -EOPNOTSUPP; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = devlink_nl_selftests_fill(msg, devlink, info->snd_portid, - info->snd_seq, 0, info->extack); - if (err) { - nlmsg_free(msg); - return err; - } - - return genlmsg_reply(msg, info); -} - -static int devlink_nl_cmd_selftests_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; - int idx = 0; - int err = 0; - - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - if (idx < start || !devlink->ops->selftest_check) - goto inc; - - devl_lock(devlink); - err = devlink_nl_selftests_fill(msg, devlink, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, - cb->extack); - devl_unlock(devlink); - if (err) { - devlink_put(devlink); - break; - } -inc: - idx++; - devlink_put(devlink); - } - - if (err != -EMSGSIZE) - return err; - - cb->args[0] = idx; - return msg->len; -} - -static int devlink_selftest_result_put(struct sk_buff *skb, unsigned int id, - enum devlink_selftest_status test_status) -{ - struct nlattr *result_attr; - - result_attr = nla_nest_start(skb, DEVLINK_ATTR_SELFTEST_RESULT); - if (!result_attr) - return -EMSGSIZE; - - if (nla_put_u32(skb, DEVLINK_ATTR_SELFTEST_RESULT_ID, id) || - nla_put_u8(skb, DEVLINK_ATTR_SELFTEST_RESULT_STATUS, - test_status)) - goto nla_put_failure; - - nla_nest_end(skb, result_attr); - return 0; - -nla_put_failure: - nla_nest_cancel(skb, result_attr); - return -EMSGSIZE; -} - -static int devlink_nl_cmd_selftests_run(struct sk_buff *skb, - struct genl_info *info) -{ - struct nlattr *tb[DEVLINK_ATTR_SELFTEST_ID_MAX + 1]; - struct devlink *devlink = info->user_ptr[0]; - struct nlattr *attrs, *selftests; - struct sk_buff *msg; - void *hdr; - int err; - int i; - - if (!devlink->ops->selftest_run || !devlink->ops->selftest_check) - return -EOPNOTSUPP; - - if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SELFTESTS)) - return -EINVAL; - - attrs = info->attrs[DEVLINK_ATTR_SELFTESTS]; - - err = nla_parse_nested(tb, DEVLINK_ATTR_SELFTEST_ID_MAX, attrs, - devlink_selftest_nl_policy, info->extack); - if (err < 0) - return err; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = -EMSGSIZE; - hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, - &devlink_nl_family, 0, DEVLINK_CMD_SELFTESTS_RUN); - if (!hdr) - goto free_msg; - - if (devlink_nl_put_handle(msg, devlink)) - goto genlmsg_cancel; - - selftests = nla_nest_start(msg, DEVLINK_ATTR_SELFTESTS); - if (!selftests) - goto genlmsg_cancel; - - for (i = DEVLINK_ATTR_SELFTEST_ID_UNSPEC + 1; - i <= DEVLINK_ATTR_SELFTEST_ID_MAX; i++) { - enum devlink_selftest_status test_status; - - if (nla_get_flag(tb[i])) { - if (!devlink->ops->selftest_check(devlink, i, - info->extack)) { - if (devlink_selftest_result_put(msg, i, - DEVLINK_SELFTEST_STATUS_SKIP)) - goto selftests_nest_cancel; - continue; - } - - test_status = devlink->ops->selftest_run(devlink, i, - info->extack); - if (devlink_selftest_result_put(msg, i, test_status)) - goto selftests_nest_cancel; - } - } - - nla_nest_end(msg, selftests); - genlmsg_end(msg, hdr); - return genlmsg_reply(msg, info); - -selftests_nest_cancel: - nla_nest_cancel(msg, selftests); -genlmsg_cancel: - genlmsg_cancel(msg, hdr); -free_msg: - nlmsg_free(msg); - return err; -} - static const struct devlink_param devlink_param_generic[] = { { .id = DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET, @@ -5654,7 +4177,13 @@ static void devlink_param_notify(struct devlink *devlink, WARN_ON(cmd != DEVLINK_CMD_PARAM_NEW && cmd != DEVLINK_CMD_PARAM_DEL && cmd != DEVLINK_CMD_PORT_PARAM_NEW && cmd != DEVLINK_CMD_PORT_PARAM_DEL); - ASSERT_DEVLINK_REGISTERED(devlink); + + /* devlink_notify_register() / devlink_notify_unregister() + * will replay the notifications if the params are added/removed + * outside of the lifetime of the instance. + */ + if (!devl_is_registered(devlink)) + return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) @@ -5670,48 +4199,41 @@ static void devlink_param_notify(struct devlink *devlink, msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } -static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +static int +devlink_nl_cmd_param_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb) { + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_param_item *param_item; - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; int idx = 0; int err = 0; - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - devl_lock(devlink); - list_for_each_entry(param_item, &devlink->param_list, list) { - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_param_fill(msg, devlink, 0, param_item, - DEVLINK_CMD_PARAM_GET, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); - if (err == -EOPNOTSUPP) { - err = 0; - } else if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } + list_for_each_entry(param_item, &devlink->param_list, list) { + if (idx < state->idx) { idx++; + continue; + } + err = devlink_nl_param_fill(msg, devlink, 0, param_item, + DEVLINK_CMD_PARAM_GET, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err == -EOPNOTSUPP) { + err = 0; + } else if (err) { + state->idx = idx; + break; } - devl_unlock(devlink); - devlink_put(devlink); + idx++; } -out: - if (err != -EMSGSIZE) - return err; - cb->args[0] = idx; - return msg->len; + return err; } +const struct devlink_cmd devl_cmd_param_get = { + .dump_one = devlink_nl_cmd_param_get_dump_one, +}; + static int devlink_param_type_get_from_info(struct genl_info *info, enum devlink_param_type *param_type) @@ -6375,21 +4897,20 @@ out: return err; } -static int devlink_nl_cmd_region_get_devlink_dumpit(struct sk_buff *msg, - struct netlink_callback *cb, - struct devlink *devlink, - int *idx, - int start) +static int +devlink_nl_cmd_region_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb) { + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_region *region; struct devlink_port *port; unsigned long port_index; - int err = 0; + int idx = 0; + int err; - devl_lock(devlink); list_for_each_entry(region, &devlink->region_list, list) { - if (*idx < start) { - (*idx)++; + if (idx < state->idx) { + idx++; continue; } err = devlink_nl_region_fill(msg, devlink, @@ -6397,43 +4918,28 @@ static int devlink_nl_cmd_region_get_devlink_dumpit(struct sk_buff *msg, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, region); - if (err) - goto out; - (*idx)++; + if (err) { + state->idx = idx; + return err; + } + idx++; } xa_for_each(&devlink->ports, port_index, port) { - err = devlink_nl_cmd_region_get_port_dumpit(msg, cb, port, idx, - start); - if (err) - goto out; + err = devlink_nl_cmd_region_get_port_dumpit(msg, cb, port, &idx, + state->idx); + if (err) { + state->idx = idx; + return err; + } } -out: - devl_unlock(devlink); - return err; + return 0; } -static int devlink_nl_cmd_region_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; - int idx = 0; - int err = 0; - - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - err = devlink_nl_cmd_region_get_devlink_dumpit(msg, cb, devlink, - &idx, start); - devlink_put(devlink); - if (err) - goto out; - } -out: - cb->args[0] = idx; - return msg->len; -} +const struct devlink_cmd devl_cmd_region_get = { + .dump_one = devlink_nl_cmd_region_get_dump_one, +}; static int devlink_nl_cmd_region_del(struct sk_buff *skb, struct genl_info *info) @@ -6716,6 +5222,7 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct nlattr *chunks_attr, *region_attr, *snapshot_attr; u64 ret_offset, start_offset, end_offset = U64_MAX; struct nlattr **attrs = info->attrs; @@ -6729,14 +5236,12 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, void *hdr; int err; - start_offset = *((u64 *)&cb->args[0]); + start_offset = state->start_offset; - devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs); + devlink = devlink_get_from_attrs_lock(sock_net(cb->skb->sk), attrs); if (IS_ERR(devlink)) return PTR_ERR(devlink); - devl_lock(devlink); - if (!attrs[DEVLINK_ATTR_REGION_NAME]) { NL_SET_ERR_MSG(cb->extack, "No region name provided"); err = -EINVAL; @@ -6868,7 +5373,7 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, goto nla_put_failure; } - *((u64 *)&cb->args[0]) = ret_offset; + state->start_offset = ret_offset; nla_nest_end(skb, chunks_attr); genlmsg_end(skb, hdr); @@ -6884,223 +5389,6 @@ out_unlock: return err; } -int devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn) -{ - if (!req->msg) - return 0; - return nla_put_string(req->msg, DEVLINK_ATTR_INFO_SERIAL_NUMBER, sn); -} -EXPORT_SYMBOL_GPL(devlink_info_serial_number_put); - -int devlink_info_board_serial_number_put(struct devlink_info_req *req, - const char *bsn) -{ - if (!req->msg) - return 0; - return nla_put_string(req->msg, DEVLINK_ATTR_INFO_BOARD_SERIAL_NUMBER, - bsn); -} -EXPORT_SYMBOL_GPL(devlink_info_board_serial_number_put); - -static int devlink_info_version_put(struct devlink_info_req *req, int attr, - const char *version_name, - const char *version_value, - enum devlink_info_version_type version_type) -{ - struct nlattr *nest; - int err; - - if (req->version_cb) - req->version_cb(version_name, version_type, - req->version_cb_priv); - - if (!req->msg) - return 0; - - nest = nla_nest_start_noflag(req->msg, attr); - if (!nest) - return -EMSGSIZE; - - err = nla_put_string(req->msg, DEVLINK_ATTR_INFO_VERSION_NAME, - version_name); - if (err) - goto nla_put_failure; - - err = nla_put_string(req->msg, DEVLINK_ATTR_INFO_VERSION_VALUE, - version_value); - if (err) - goto nla_put_failure; - - nla_nest_end(req->msg, nest); - - return 0; - -nla_put_failure: - nla_nest_cancel(req->msg, nest); - return err; -} - -int devlink_info_version_fixed_put(struct devlink_info_req *req, - const char *version_name, - const char *version_value) -{ - return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_FIXED, - version_name, version_value, - DEVLINK_INFO_VERSION_TYPE_NONE); -} -EXPORT_SYMBOL_GPL(devlink_info_version_fixed_put); - -int devlink_info_version_stored_put(struct devlink_info_req *req, - const char *version_name, - const char *version_value) -{ - return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_STORED, - version_name, version_value, - DEVLINK_INFO_VERSION_TYPE_NONE); -} -EXPORT_SYMBOL_GPL(devlink_info_version_stored_put); - -int devlink_info_version_stored_put_ext(struct devlink_info_req *req, - const char *version_name, - const char *version_value, - enum devlink_info_version_type version_type) -{ - return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_STORED, - version_name, version_value, - version_type); -} -EXPORT_SYMBOL_GPL(devlink_info_version_stored_put_ext); - -int devlink_info_version_running_put(struct devlink_info_req *req, - const char *version_name, - const char *version_value) -{ - return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_RUNNING, - version_name, version_value, - DEVLINK_INFO_VERSION_TYPE_NONE); -} -EXPORT_SYMBOL_GPL(devlink_info_version_running_put); - -int devlink_info_version_running_put_ext(struct devlink_info_req *req, - const char *version_name, - const char *version_value, - enum devlink_info_version_type version_type) -{ - return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_RUNNING, - version_name, version_value, - version_type); -} -EXPORT_SYMBOL_GPL(devlink_info_version_running_put_ext); - -static int devlink_nl_driver_info_get(struct device_driver *drv, - struct devlink_info_req *req) -{ - if (!drv) - return 0; - - if (drv->name[0]) - return nla_put_string(req->msg, DEVLINK_ATTR_INFO_DRIVER_NAME, - drv->name); - - return 0; -} - -static int -devlink_nl_info_fill(struct sk_buff *msg, struct devlink *devlink, - enum devlink_command cmd, u32 portid, - u32 seq, int flags, struct netlink_ext_ack *extack) -{ - struct device *dev = devlink_to_dev(devlink); - struct devlink_info_req req = {}; - void *hdr; - int err; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - err = -EMSGSIZE; - if (devlink_nl_put_handle(msg, devlink)) - goto err_cancel_msg; - - req.msg = msg; - if (devlink->ops->info_get) { - err = devlink->ops->info_get(devlink, &req, extack); - if (err) - goto err_cancel_msg; - } - - err = devlink_nl_driver_info_get(dev->driver, &req); - if (err) - goto err_cancel_msg; - - genlmsg_end(msg, hdr); - return 0; - -err_cancel_msg: - genlmsg_cancel(msg, hdr); - return err; -} - -static int devlink_nl_cmd_info_get_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct sk_buff *msg; - int err; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET, - info->snd_portid, info->snd_seq, 0, - info->extack); - if (err) { - nlmsg_free(msg); - return err; - } - - return genlmsg_reply(msg, info); -} - -static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; - int idx = 0; - int err = 0; - - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - if (idx < start) - goto inc; - - devl_lock(devlink); - err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, - cb->extack); - devl_unlock(devlink); - if (err == -EOPNOTSUPP) - err = 0; - else if (err) { - devlink_put(devlink); - break; - } -inc: - idx++; - devlink_put(devlink); - } - - if (err != -EMSGSIZE) - return err; - - cb->args[0] = idx; - return msg->len; -} - struct devlink_fmsg_item { struct list_head list; int attrtype; @@ -7564,8 +5852,8 @@ devlink_fmsg_prepare_skb(struct devlink_fmsg *fmsg, struct sk_buff *skb, { struct devlink_fmsg_item *item; struct nlattr *fmsg_nlattr; + int err = 0; int i = 0; - int err; fmsg_nlattr = nla_nest_start_noflag(skb, DEVLINK_ATTR_FMSG); if (!fmsg_nlattr) @@ -7666,7 +5954,8 @@ static int devlink_fmsg_dumpit(struct devlink_fmsg *fmsg, struct sk_buff *skb, struct netlink_callback *cb, enum devlink_command cmd) { - int index = cb->args[0]; + struct devlink_nl_dump_state *state = devlink_dump_state(cb); + int index = state->idx; int tmp_index = index; void *hdr; int err; @@ -7682,7 +5971,7 @@ static int devlink_fmsg_dumpit(struct devlink_fmsg *fmsg, struct sk_buff *skb, if ((err && err != -EMSGSIZE) || tmp_index == index) goto nla_put_failure; - cb->args[0] = index; + state->idx = index; genlmsg_end(skb, hdr); return skb->len; @@ -7708,7 +5997,6 @@ struct devlink_health_reporter { u64 error_count; u64 recovery_count; u64 last_recovery_ts; - refcount_t refcount; }; void * @@ -7720,12 +6008,10 @@ EXPORT_SYMBOL_GPL(devlink_health_reporter_priv); static struct devlink_health_reporter * __devlink_health_reporter_find_by_name(struct list_head *reporter_list, - struct mutex *list_lock, const char *reporter_name) { struct devlink_health_reporter *reporter; - lockdep_assert_held(list_lock); list_for_each_entry(reporter, reporter_list, list) if (!strcmp(reporter->ops->name, reporter_name)) return reporter; @@ -7737,7 +6023,6 @@ devlink_health_reporter_find_by_name(struct devlink *devlink, const char *reporter_name) { return __devlink_health_reporter_find_by_name(&devlink->reporter_list, - &devlink->reporters_lock, reporter_name); } @@ -7746,7 +6031,6 @@ devlink_port_health_reporter_find_by_name(struct devlink_port *devlink_port, const char *reporter_name) { return __devlink_health_reporter_find_by_name(&devlink_port->reporter_list, - &devlink_port->reporters_lock, reporter_name); } @@ -7771,13 +6055,12 @@ __devlink_health_reporter_create(struct devlink *devlink, reporter->auto_recover = !!ops->recover; reporter->auto_dump = !!ops->dump; mutex_init(&reporter->dump_lock); - refcount_set(&reporter->refcount, 1); return reporter; } /** - * devlink_port_health_reporter_create - create devlink health reporter for - * specified port instance + * devl_port_health_reporter_create - create devlink health reporter for + * specified port instance * * @port: devlink_port which should contain the new reporter * @ops: ops @@ -7785,34 +6068,47 @@ __devlink_health_reporter_create(struct devlink *devlink, * @priv: priv */ struct devlink_health_reporter * -devlink_port_health_reporter_create(struct devlink_port *port, - const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv) +devl_port_health_reporter_create(struct devlink_port *port, + const struct devlink_health_reporter_ops *ops, + u64 graceful_period, void *priv) { struct devlink_health_reporter *reporter; - mutex_lock(&port->reporters_lock); + devl_assert_locked(port->devlink); + if (__devlink_health_reporter_find_by_name(&port->reporter_list, - &port->reporters_lock, ops->name)) { - reporter = ERR_PTR(-EEXIST); - goto unlock; - } + ops->name)) + return ERR_PTR(-EEXIST); reporter = __devlink_health_reporter_create(port->devlink, ops, graceful_period, priv); if (IS_ERR(reporter)) - goto unlock; + return reporter; reporter->devlink_port = port; list_add_tail(&reporter->list, &port->reporter_list); -unlock: - mutex_unlock(&port->reporters_lock); + return reporter; +} +EXPORT_SYMBOL_GPL(devl_port_health_reporter_create); + +struct devlink_health_reporter * +devlink_port_health_reporter_create(struct devlink_port *port, + const struct devlink_health_reporter_ops *ops, + u64 graceful_period, void *priv) +{ + struct devlink_health_reporter *reporter; + struct devlink *devlink = port->devlink; + + devl_lock(devlink); + reporter = devl_port_health_reporter_create(port, ops, + graceful_period, priv); + devl_unlock(devlink); return reporter; } EXPORT_SYMBOL_GPL(devlink_port_health_reporter_create); /** - * devlink_health_reporter_create - create devlink health reporter + * devl_health_reporter_create - create devlink health reporter * * @devlink: devlink * @ops: ops @@ -7820,26 +6116,38 @@ EXPORT_SYMBOL_GPL(devlink_port_health_reporter_create); * @priv: priv */ struct devlink_health_reporter * -devlink_health_reporter_create(struct devlink *devlink, - const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv) +devl_health_reporter_create(struct devlink *devlink, + const struct devlink_health_reporter_ops *ops, + u64 graceful_period, void *priv) { struct devlink_health_reporter *reporter; - mutex_lock(&devlink->reporters_lock); - if (devlink_health_reporter_find_by_name(devlink, ops->name)) { - reporter = ERR_PTR(-EEXIST); - goto unlock; - } + devl_assert_locked(devlink); + + if (devlink_health_reporter_find_by_name(devlink, ops->name)) + return ERR_PTR(-EEXIST); reporter = __devlink_health_reporter_create(devlink, ops, graceful_period, priv); if (IS_ERR(reporter)) - goto unlock; + return reporter; list_add_tail(&reporter->list, &devlink->reporter_list); -unlock: - mutex_unlock(&devlink->reporters_lock); + return reporter; +} +EXPORT_SYMBOL_GPL(devl_health_reporter_create); + +struct devlink_health_reporter * +devlink_health_reporter_create(struct devlink *devlink, + const struct devlink_health_reporter_ops *ops, + u64 graceful_period, void *priv) +{ + struct devlink_health_reporter *reporter; + + devl_lock(devlink); + reporter = devl_health_reporter_create(devlink, ops, + graceful_period, priv); + devl_unlock(devlink); return reporter; } EXPORT_SYMBOL_GPL(devlink_health_reporter_create); @@ -7853,51 +6161,31 @@ devlink_health_reporter_free(struct devlink_health_reporter *reporter) kfree(reporter); } -static void -devlink_health_reporter_put(struct devlink_health_reporter *reporter) -{ - if (refcount_dec_and_test(&reporter->refcount)) - devlink_health_reporter_free(reporter); -} - -static void -__devlink_health_reporter_destroy(struct devlink_health_reporter *reporter) -{ - list_del(&reporter->list); - devlink_health_reporter_put(reporter); -} - /** - * devlink_health_reporter_destroy - destroy devlink health reporter + * devl_health_reporter_destroy - destroy devlink health reporter * * @reporter: devlink health reporter to destroy */ void -devlink_health_reporter_destroy(struct devlink_health_reporter *reporter) +devl_health_reporter_destroy(struct devlink_health_reporter *reporter) { - struct mutex *lock = &reporter->devlink->reporters_lock; + devl_assert_locked(reporter->devlink); - mutex_lock(lock); - __devlink_health_reporter_destroy(reporter); - mutex_unlock(lock); + list_del(&reporter->list); + devlink_health_reporter_free(reporter); } -EXPORT_SYMBOL_GPL(devlink_health_reporter_destroy); +EXPORT_SYMBOL_GPL(devl_health_reporter_destroy); -/** - * devlink_port_health_reporter_destroy - destroy devlink port health reporter - * - * @reporter: devlink health reporter to destroy - */ void -devlink_port_health_reporter_destroy(struct devlink_health_reporter *reporter) +devlink_health_reporter_destroy(struct devlink_health_reporter *reporter) { - struct mutex *lock = &reporter->devlink_port->reporters_lock; + struct devlink *devlink = reporter->devlink; - mutex_lock(lock); - __devlink_health_reporter_destroy(reporter); - mutex_unlock(lock); + devl_lock(devlink); + devl_health_reporter_destroy(reporter); + devl_unlock(devlink); } -EXPORT_SYMBOL_GPL(devlink_port_health_reporter_destroy); +EXPORT_SYMBOL_GPL(devlink_health_reporter_destroy); static int devlink_nl_health_reporter_fill(struct sk_buff *msg, @@ -8128,7 +6416,6 @@ static struct devlink_health_reporter * devlink_health_reporter_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) { - struct devlink_health_reporter *reporter; struct devlink_port *devlink_port; char *reporter_name; @@ -8137,21 +6424,12 @@ devlink_health_reporter_get_from_attrs(struct devlink *devlink, reporter_name = nla_data(attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]); devlink_port = devlink_port_get_from_attrs(devlink, attrs); - if (IS_ERR(devlink_port)) { - mutex_lock(&devlink->reporters_lock); - reporter = devlink_health_reporter_find_by_name(devlink, reporter_name); - if (reporter) - refcount_inc(&reporter->refcount); - mutex_unlock(&devlink->reporters_lock); - } else { - mutex_lock(&devlink_port->reporters_lock); - reporter = devlink_port_health_reporter_find_by_name(devlink_port, reporter_name); - if (reporter) - refcount_inc(&reporter->refcount); - mutex_unlock(&devlink_port->reporters_lock); - } - - return reporter; + if (IS_ERR(devlink_port)) + return devlink_health_reporter_find_by_name(devlink, + reporter_name); + else + return devlink_port_health_reporter_find_by_name(devlink_port, + reporter_name); } static struct devlink_health_reporter * @@ -8169,9 +6447,10 @@ devlink_health_reporter_get_from_cb(struct netlink_callback *cb) struct nlattr **attrs = info->attrs; struct devlink *devlink; - devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs); + devlink = devlink_get_from_attrs_lock(sock_net(cb->skb->sk), attrs); if (IS_ERR(devlink)) return NULL; + devl_unlock(devlink); reporter = devlink_health_reporter_get_from_attrs(devlink, attrs); devlink_put(devlink); @@ -8209,10 +6488,8 @@ static int devlink_nl_cmd_health_reporter_get_doit(struct sk_buff *skb, return -EINVAL; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) { - err = -ENOMEM; - goto out; - } + if (!msg) + return -ENOMEM; err = devlink_nl_health_reporter_fill(msg, reporter, DEVLINK_CMD_HEALTH_REPORTER_GET, @@ -8220,89 +6497,72 @@ static int devlink_nl_cmd_health_reporter_get_doit(struct sk_buff *skb, 0); if (err) { nlmsg_free(msg); - goto out; + return err; } - err = genlmsg_reply(msg, info); -out: - devlink_health_reporter_put(reporter); - return err; + return genlmsg_reply(msg, info); } static int -devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +devlink_nl_cmd_health_reporter_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb) { + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_health_reporter *reporter; - unsigned long index, port_index; struct devlink_port *port; - struct devlink *devlink; - int start = cb->args[0]; + unsigned long port_index; int idx = 0; int err; - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - mutex_lock(&devlink->reporters_lock); - list_for_each_entry(reporter, &devlink->reporter_list, - list) { - if (idx < start) { + list_for_each_entry(reporter, &devlink->reporter_list, list) { + if (idx < state->idx) { + idx++; + continue; + } + err = devlink_nl_health_reporter_fill(msg, reporter, + DEVLINK_CMD_HEALTH_REPORTER_GET, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) { + state->idx = idx; + return err; + } + idx++; + } + xa_for_each(&devlink->ports, port_index, port) { + list_for_each_entry(reporter, &port->reporter_list, list) { + if (idx < state->idx) { idx++; continue; } - err = devlink_nl_health_reporter_fill( - msg, reporter, DEVLINK_CMD_HEALTH_REPORTER_GET, - NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - NLM_F_MULTI); + err = devlink_nl_health_reporter_fill(msg, reporter, + DEVLINK_CMD_HEALTH_REPORTER_GET, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); if (err) { - mutex_unlock(&devlink->reporters_lock); - devlink_put(devlink); - goto out; + state->idx = idx; + return err; } idx++; } - mutex_unlock(&devlink->reporters_lock); - devlink_put(devlink); - } - - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - devl_lock(devlink); - xa_for_each(&devlink->ports, port_index, port) { - mutex_lock(&port->reporters_lock); - list_for_each_entry(reporter, &port->reporter_list, list) { - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_health_reporter_fill( - msg, reporter, - DEVLINK_CMD_HEALTH_REPORTER_GET, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI); - if (err) { - mutex_unlock(&port->reporters_lock); - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } - idx++; - } - mutex_unlock(&port->reporters_lock); - } - devl_unlock(devlink); - devlink_put(devlink); } -out: - cb->args[0] = idx; - return msg->len; + + return 0; } +const struct devlink_cmd devl_cmd_health_reporter_get = { + .dump_one = devlink_nl_cmd_health_reporter_get_dump_one, +}; + static int devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_health_reporter *reporter; - int err; reporter = devlink_health_reporter_get_from_info(devlink, info); if (!reporter) @@ -8310,15 +6570,12 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, if (!reporter->ops->recover && (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] || - info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])) { - err = -EOPNOTSUPP; - goto out; - } + info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])) + return -EOPNOTSUPP; + if (!reporter->ops->dump && - info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) { - err = -EOPNOTSUPP; - goto out; - } + info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) + return -EOPNOTSUPP; if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) reporter->graceful_period = @@ -8332,11 +6589,7 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, reporter->auto_dump = nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]); - devlink_health_reporter_put(reporter); return 0; -out: - devlink_health_reporter_put(reporter); - return err; } static int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb, @@ -8344,16 +6597,12 @@ static int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb, { struct devlink *devlink = info->user_ptr[0]; struct devlink_health_reporter *reporter; - int err; reporter = devlink_health_reporter_get_from_info(devlink, info); if (!reporter) return -EINVAL; - err = devlink_health_reporter_recover(reporter, NULL, info->extack); - - devlink_health_reporter_put(reporter); - return err; + return devlink_health_reporter_recover(reporter, NULL, info->extack); } static int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb, @@ -8368,16 +6617,12 @@ static int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb, if (!reporter) return -EINVAL; - if (!reporter->ops->diagnose) { - devlink_health_reporter_put(reporter); + if (!reporter->ops->diagnose) return -EOPNOTSUPP; - } fmsg = devlink_fmsg_alloc(); - if (!fmsg) { - devlink_health_reporter_put(reporter); + if (!fmsg) return -ENOMEM; - } err = devlink_fmsg_obj_nest_start(fmsg); if (err) @@ -8396,7 +6641,6 @@ static int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb, out: devlink_fmsg_free(fmsg); - devlink_health_reporter_put(reporter); return err; } @@ -8404,26 +6648,25 @@ static int devlink_nl_cmd_health_reporter_dump_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_health_reporter *reporter; - u64 start = cb->args[0]; int err; reporter = devlink_health_reporter_get_from_cb(cb); if (!reporter) return -EINVAL; - if (!reporter->ops->dump) { - err = -EOPNOTSUPP; - goto out; - } + if (!reporter->ops->dump) + return -EOPNOTSUPP; + mutex_lock(&reporter->dump_lock); - if (!start) { + if (!state->idx) { err = devlink_health_do_dump(reporter, NULL, cb->extack); if (err) goto unlock; - cb->args[1] = reporter->dump_ts; + state->dump_ts = reporter->dump_ts; } - if (!reporter->dump_fmsg || cb->args[1] != reporter->dump_ts) { + if (!reporter->dump_fmsg || state->dump_ts != reporter->dump_ts) { NL_SET_ERR_MSG_MOD(cb->extack, "Dump trampled, please retry"); err = -EAGAIN; goto unlock; @@ -8433,8 +6676,6 @@ devlink_nl_cmd_health_reporter_dump_get_dumpit(struct sk_buff *skb, DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET); unlock: mutex_unlock(&reporter->dump_lock); -out: - devlink_health_reporter_put(reporter); return err; } @@ -8449,15 +6690,12 @@ devlink_nl_cmd_health_reporter_dump_clear_doit(struct sk_buff *skb, if (!reporter) return -EINVAL; - if (!reporter->ops->dump) { - devlink_health_reporter_put(reporter); + if (!reporter->ops->dump) return -EOPNOTSUPP; - } mutex_lock(&reporter->dump_lock); devlink_health_dump_clear(reporter); mutex_unlock(&reporter->dump_lock); - devlink_health_reporter_put(reporter); return 0; } @@ -8466,21 +6704,15 @@ static int devlink_nl_cmd_health_reporter_test_doit(struct sk_buff *skb, { struct devlink *devlink = info->user_ptr[0]; struct devlink_health_reporter *reporter; - int err; reporter = devlink_health_reporter_get_from_info(devlink, info); if (!reporter) return -EINVAL; - if (!reporter->ops->test) { - devlink_health_reporter_put(reporter); + if (!reporter->ops->test) return -EOPNOTSUPP; - } - - err = reporter->ops->test(reporter, info->extack); - devlink_health_reporter_put(reporter); - return err; + return reporter->ops->test(reporter, info->extack); } struct devlink_stats { @@ -8814,43 +7046,39 @@ err_trap_fill: return err; } -static int devlink_nl_cmd_trap_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +static int +devlink_nl_cmd_trap_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb) { + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_trap_item *trap_item; - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; int idx = 0; - int err; + int err = 0; - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - devl_lock(devlink); - list_for_each_entry(trap_item, &devlink->trap_list, list) { - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_trap_fill(msg, devlink, trap_item, - DEVLINK_CMD_TRAP_NEW, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); - if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } + list_for_each_entry(trap_item, &devlink->trap_list, list) { + if (idx < state->idx) { idx++; + continue; } - devl_unlock(devlink); - devlink_put(devlink); + err = devlink_nl_trap_fill(msg, devlink, trap_item, + DEVLINK_CMD_TRAP_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) { + state->idx = idx; + break; + } + idx++; } -out: - cb->args[0] = idx; - return msg->len; + + return err; } +const struct devlink_cmd devl_cmd_trap_get = { + .dump_one = devlink_nl_cmd_trap_get_dump_one, +}; + static int __devlink_trap_action_set(struct devlink *devlink, struct devlink_trap_item *trap_item, enum devlink_trap_action trap_action, @@ -9029,46 +7257,41 @@ err_trap_group_fill: return err; } -static int devlink_nl_cmd_trap_group_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +static int +devlink_nl_cmd_trap_group_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb) { - enum devlink_command cmd = DEVLINK_CMD_TRAP_GROUP_NEW; + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_trap_group_item *group_item; - u32 portid = NETLINK_CB(cb->skb).portid; - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; int idx = 0; - int err; + int err = 0; - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - devl_lock(devlink); - list_for_each_entry(group_item, &devlink->trap_group_list, - list) { - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_trap_group_fill(msg, devlink, - group_item, cmd, - portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); - if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } + + list_for_each_entry(group_item, &devlink->trap_group_list, list) { + if (idx < state->idx) { idx++; + continue; } - devl_unlock(devlink); - devlink_put(devlink); + err = devlink_nl_trap_group_fill(msg, devlink, group_item, + DEVLINK_CMD_TRAP_GROUP_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) { + state->idx = idx; + break; + } + idx++; } -out: - cb->args[0] = idx; - return msg->len; + + return err; } +const struct devlink_cmd devl_cmd_trap_group_get = { + .dump_one = devlink_nl_cmd_trap_group_get_dump_one, +}; + static int __devlink_trap_group_action_set(struct devlink *devlink, struct devlink_trap_group_item *group_item, @@ -9144,6 +7367,7 @@ static int devlink_trap_group_set(struct devlink *devlink, struct netlink_ext_ack *extack = info->extack; const struct devlink_trap_policer *policer; struct nlattr **attrs = info->attrs; + u32 policer_id; int err; if (!attrs[DEVLINK_ATTR_TRAP_POLICER_ID]) @@ -9152,17 +7376,11 @@ static int devlink_trap_group_set(struct devlink *devlink, if (!devlink->ops->trap_group_set) return -EOPNOTSUPP; - policer_item = group_item->policer_item; - if (attrs[DEVLINK_ATTR_TRAP_POLICER_ID]) { - u32 policer_id; - - policer_id = nla_get_u32(attrs[DEVLINK_ATTR_TRAP_POLICER_ID]); - policer_item = devlink_trap_policer_item_lookup(devlink, - policer_id); - if (policer_id && !policer_item) { - NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap policer"); - return -ENOENT; - } + policer_id = nla_get_u32(attrs[DEVLINK_ATTR_TRAP_POLICER_ID]); + policer_item = devlink_trap_policer_item_lookup(devlink, policer_id); + if (policer_id && !policer_item) { + NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap policer"); + return -ENOENT; } policer = policer_item ? policer_item->policer : NULL; @@ -9333,46 +7551,40 @@ err_trap_policer_fill: return err; } -static int devlink_nl_cmd_trap_policer_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +static int +devlink_nl_cmd_trap_policer_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb) { - enum devlink_command cmd = DEVLINK_CMD_TRAP_POLICER_NEW; + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_trap_policer_item *policer_item; - u32 portid = NETLINK_CB(cb->skb).portid; - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; int idx = 0; - int err; + int err = 0; - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - devl_lock(devlink); - list_for_each_entry(policer_item, &devlink->trap_policer_list, - list) { - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_trap_policer_fill(msg, devlink, - policer_item, cmd, - portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); - if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } + list_for_each_entry(policer_item, &devlink->trap_policer_list, list) { + if (idx < state->idx) { idx++; + continue; + } + err = devlink_nl_trap_policer_fill(msg, devlink, policer_item, + DEVLINK_CMD_TRAP_POLICER_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) { + state->idx = idx; + break; } - devl_unlock(devlink); - devlink_put(devlink); + idx++; } -out: - cb->args[0] = idx; - return msg->len; + + return err; } +const struct devlink_cmd devl_cmd_trap_policer_get = { + .dump_one = devlink_nl_cmd_trap_policer_get_dump_one, +}; + static int devlink_trap_policer_set(struct devlink *devlink, struct devlink_trap_policer_item *policer_item, @@ -9445,88 +7657,19 @@ static int devlink_nl_cmd_trap_policer_set_doit(struct sk_buff *skb, return devlink_trap_policer_set(devlink, policer_item, info); } -static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { - [DEVLINK_ATTR_UNSPEC] = { .strict_start_type = - DEVLINK_ATTR_TRAP_POLICER_ID }, - [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32 }, - [DEVLINK_ATTR_PORT_TYPE] = NLA_POLICY_RANGE(NLA_U16, DEVLINK_PORT_TYPE_AUTO, - DEVLINK_PORT_TYPE_IB), - [DEVLINK_ATTR_PORT_SPLIT_COUNT] = { .type = NLA_U32 }, - [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32 }, - [DEVLINK_ATTR_SB_POOL_INDEX] = { .type = NLA_U16 }, - [DEVLINK_ATTR_SB_POOL_TYPE] = { .type = NLA_U8 }, - [DEVLINK_ATTR_SB_POOL_SIZE] = { .type = NLA_U32 }, - [DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE] = { .type = NLA_U8 }, - [DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32 }, - [DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 }, - [DEVLINK_ATTR_ESWITCH_MODE] = NLA_POLICY_RANGE(NLA_U16, DEVLINK_ESWITCH_MODE_LEGACY, - DEVLINK_ESWITCH_MODE_SWITCHDEV), - [DEVLINK_ATTR_ESWITCH_INLINE_MODE] = { .type = NLA_U8 }, - [DEVLINK_ATTR_ESWITCH_ENCAP_MODE] = { .type = NLA_U8 }, - [DEVLINK_ATTR_DPIPE_TABLE_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED] = { .type = NLA_U8 }, - [DEVLINK_ATTR_RESOURCE_ID] = { .type = NLA_U64}, - [DEVLINK_ATTR_RESOURCE_SIZE] = { .type = NLA_U64}, - [DEVLINK_ATTR_PARAM_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_PARAM_TYPE] = { .type = NLA_U8 }, - [DEVLINK_ATTR_PARAM_VALUE_CMODE] = { .type = NLA_U8 }, - [DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32 }, - [DEVLINK_ATTR_REGION_CHUNK_ADDR] = { .type = NLA_U64 }, - [DEVLINK_ATTR_REGION_CHUNK_LEN] = { .type = NLA_U64 }, - [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64 }, - [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 }, - [DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_FLASH_UPDATE_COMPONENT] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK] = - NLA_POLICY_BITFIELD32(DEVLINK_SUPPORTED_FLASH_OVERWRITE_SECTIONS), - [DEVLINK_ATTR_TRAP_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_TRAP_ACTION] = { .type = NLA_U8 }, - [DEVLINK_ATTR_TRAP_GROUP_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_NETNS_PID] = { .type = NLA_U32 }, - [DEVLINK_ATTR_NETNS_FD] = { .type = NLA_U32 }, - [DEVLINK_ATTR_NETNS_ID] = { .type = NLA_U32 }, - [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP] = { .type = NLA_U8 }, - [DEVLINK_ATTR_TRAP_POLICER_ID] = { .type = NLA_U32 }, - [DEVLINK_ATTR_TRAP_POLICER_RATE] = { .type = NLA_U64 }, - [DEVLINK_ATTR_TRAP_POLICER_BURST] = { .type = NLA_U64 }, - [DEVLINK_ATTR_PORT_FUNCTION] = { .type = NLA_NESTED }, - [DEVLINK_ATTR_RELOAD_ACTION] = NLA_POLICY_RANGE(NLA_U8, DEVLINK_RELOAD_ACTION_DRIVER_REINIT, - DEVLINK_RELOAD_ACTION_MAX), - [DEVLINK_ATTR_RELOAD_LIMITS] = NLA_POLICY_BITFIELD32(DEVLINK_RELOAD_LIMITS_VALID_MASK), - [DEVLINK_ATTR_PORT_FLAVOUR] = { .type = NLA_U16 }, - [DEVLINK_ATTR_PORT_PCI_PF_NUMBER] = { .type = NLA_U16 }, - [DEVLINK_ATTR_PORT_PCI_SF_NUMBER] = { .type = NLA_U32 }, - [DEVLINK_ATTR_PORT_CONTROLLER_NUMBER] = { .type = NLA_U32 }, - [DEVLINK_ATTR_RATE_TYPE] = { .type = NLA_U16 }, - [DEVLINK_ATTR_RATE_TX_SHARE] = { .type = NLA_U64 }, - [DEVLINK_ATTR_RATE_TX_MAX] = { .type = NLA_U64 }, - [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_LINECARD_INDEX] = { .type = NLA_U32 }, - [DEVLINK_ATTR_LINECARD_TYPE] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_SELFTESTS] = { .type = NLA_NESTED }, - [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32 }, - [DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32 }, - [DEVLINK_ATTR_REGION_DIRECT] = { .type = NLA_FLAG }, -}; - -static const struct genl_small_ops devlink_nl_ops[] = { +const struct genl_small_ops devlink_nl_ops[56] = { { .cmd = DEVLINK_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_get_doit, - .dumpit = devlink_nl_cmd_get_dumpit, + .dumpit = devlink_nl_instance_iter_dumpit, /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_PORT_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_port_get_doit, - .dumpit = devlink_nl_cmd_port_get_dumpit, + .dumpit = devlink_nl_instance_iter_dumpit, .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, /* can be retrieved by unprivileged users */ }, @@ -9540,7 +7683,7 @@ static const struct genl_small_ops devlink_nl_ops[] = { { .cmd = DEVLINK_CMD_RATE_GET, .doit = devlink_nl_cmd_rate_get_doit, - .dumpit = devlink_nl_cmd_rate_get_dumpit, + .dumpit = devlink_nl_instance_iter_dumpit, .internal_flags = DEVLINK_NL_FLAG_NEED_RATE, /* can be retrieved by unprivileged users */ }, @@ -9588,7 +7731,7 @@ static const struct genl_small_ops devlink_nl_ops[] = { { .cmd = DEVLINK_CMD_LINECARD_GET, .doit = devlink_nl_cmd_linecard_get_doit, - .dumpit = devlink_nl_cmd_linecard_get_dumpit, + .dumpit = devlink_nl_instance_iter_dumpit, .internal_flags = DEVLINK_NL_FLAG_NEED_LINECARD, /* can be retrieved by unprivileged users */ }, @@ -9602,14 +7745,14 @@ static const struct genl_small_ops devlink_nl_ops[] = { .cmd = DEVLINK_CMD_SB_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_get_doit, - .dumpit = devlink_nl_cmd_sb_get_dumpit, + .dumpit = devlink_nl_instance_iter_dumpit, /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_SB_POOL_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_pool_get_doit, - .dumpit = devlink_nl_cmd_sb_pool_get_dumpit, + .dumpit = devlink_nl_instance_iter_dumpit, /* can be retrieved by unprivileged users */ }, { @@ -9622,7 +7765,7 @@ static const struct genl_small_ops devlink_nl_ops[] = { .cmd = DEVLINK_CMD_SB_PORT_POOL_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_port_pool_get_doit, - .dumpit = devlink_nl_cmd_sb_port_pool_get_dumpit, + .dumpit = devlink_nl_instance_iter_dumpit, .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, /* can be retrieved by unprivileged users */ }, @@ -9637,7 +7780,7 @@ static const struct genl_small_ops devlink_nl_ops[] = { .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_tc_pool_bind_get_doit, - .dumpit = devlink_nl_cmd_sb_tc_pool_bind_get_dumpit, + .dumpit = devlink_nl_instance_iter_dumpit, .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, /* can be retrieved by unprivileged users */ }, @@ -9718,7 +7861,7 @@ static const struct genl_small_ops devlink_nl_ops[] = { .cmd = DEVLINK_CMD_PARAM_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_param_get_doit, - .dumpit = devlink_nl_cmd_param_get_dumpit, + .dumpit = devlink_nl_instance_iter_dumpit, /* can be retrieved by unprivileged users */ }, { @@ -9746,7 +7889,7 @@ static const struct genl_small_ops devlink_nl_ops[] = { .cmd = DEVLINK_CMD_REGION_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_region_get_doit, - .dumpit = devlink_nl_cmd_region_get_dumpit, + .dumpit = devlink_nl_instance_iter_dumpit, .flags = GENL_ADMIN_PERM, }, { @@ -9772,14 +7915,14 @@ static const struct genl_small_ops devlink_nl_ops[] = { .cmd = DEVLINK_CMD_INFO_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_info_get_doit, - .dumpit = devlink_nl_cmd_info_get_dumpit, + .dumpit = devlink_nl_instance_iter_dumpit, /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_HEALTH_REPORTER_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_health_reporter_get_doit, - .dumpit = devlink_nl_cmd_health_reporter_get_dumpit, + .dumpit = devlink_nl_instance_iter_dumpit, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT, /* can be retrieved by unprivileged users */ }, @@ -9834,7 +7977,7 @@ static const struct genl_small_ops devlink_nl_ops[] = { { .cmd = DEVLINK_CMD_TRAP_GET, .doit = devlink_nl_cmd_trap_get_doit, - .dumpit = devlink_nl_cmd_trap_get_dumpit, + .dumpit = devlink_nl_instance_iter_dumpit, /* can be retrieved by unprivileged users */ }, { @@ -9845,7 +7988,7 @@ static const struct genl_small_ops devlink_nl_ops[] = { { .cmd = DEVLINK_CMD_TRAP_GROUP_GET, .doit = devlink_nl_cmd_trap_group_get_doit, - .dumpit = devlink_nl_cmd_trap_group_get_dumpit, + .dumpit = devlink_nl_instance_iter_dumpit, /* can be retrieved by unprivileged users */ }, { @@ -9856,7 +7999,7 @@ static const struct genl_small_ops devlink_nl_ops[] = { { .cmd = DEVLINK_CMD_TRAP_POLICER_GET, .doit = devlink_nl_cmd_trap_policer_get_doit, - .dumpit = devlink_nl_cmd_trap_policer_get_dumpit, + .dumpit = devlink_nl_instance_iter_dumpit, /* can be retrieved by unprivileged users */ }, { @@ -9867,7 +8010,7 @@ static const struct genl_small_ops devlink_nl_ops[] = { { .cmd = DEVLINK_CMD_SELFTESTS_GET, .doit = devlink_nl_cmd_selftests_get_doit, - .dumpit = devlink_nl_cmd_selftests_get_dumpit + .dumpit = devlink_nl_instance_iter_dumpit, /* can be retrieved by unprivileged users */ }, { @@ -9875,148 +8018,9 @@ static const struct genl_small_ops devlink_nl_ops[] = { .doit = devlink_nl_cmd_selftests_run, .flags = GENL_ADMIN_PERM, }, + /* -- No new ops here! Use split ops going forward! -- */ }; -static struct genl_family devlink_nl_family __ro_after_init = { - .name = DEVLINK_GENL_NAME, - .version = DEVLINK_GENL_VERSION, - .maxattr = DEVLINK_ATTR_MAX, - .policy = devlink_nl_policy, - .netnsok = true, - .parallel_ops = true, - .pre_doit = devlink_nl_pre_doit, - .post_doit = devlink_nl_post_doit, - .module = THIS_MODULE, - .small_ops = devlink_nl_ops, - .n_small_ops = ARRAY_SIZE(devlink_nl_ops), - .resv_start_op = DEVLINK_CMD_SELFTESTS_RUN + 1, - .mcgrps = devlink_nl_mcgrps, - .n_mcgrps = ARRAY_SIZE(devlink_nl_mcgrps), -}; - -static bool devlink_reload_actions_valid(const struct devlink_ops *ops) -{ - const struct devlink_reload_combination *comb; - int i; - - if (!devlink_reload_supported(ops)) { - if (WARN_ON(ops->reload_actions)) - return false; - return true; - } - - if (WARN_ON(!ops->reload_actions || - ops->reload_actions & BIT(DEVLINK_RELOAD_ACTION_UNSPEC) || - ops->reload_actions >= BIT(__DEVLINK_RELOAD_ACTION_MAX))) - return false; - - if (WARN_ON(ops->reload_limits & BIT(DEVLINK_RELOAD_LIMIT_UNSPEC) || - ops->reload_limits >= BIT(__DEVLINK_RELOAD_LIMIT_MAX))) - return false; - - for (i = 0; i < ARRAY_SIZE(devlink_reload_invalid_combinations); i++) { - comb = &devlink_reload_invalid_combinations[i]; - if (ops->reload_actions == BIT(comb->action) && - ops->reload_limits == BIT(comb->limit)) - return false; - } - return true; -} - -/** - * devlink_set_features - Set devlink supported features - * - * @devlink: devlink - * @features: devlink support features - * - * This interface allows us to set reload ops separatelly from - * the devlink_alloc. - */ -void devlink_set_features(struct devlink *devlink, u64 features) -{ - ASSERT_DEVLINK_NOT_REGISTERED(devlink); - - WARN_ON(features & DEVLINK_F_RELOAD && - !devlink_reload_supported(devlink->ops)); - devlink->features = features; -} -EXPORT_SYMBOL_GPL(devlink_set_features); - -static int devlink_netdevice_event(struct notifier_block *nb, - unsigned long event, void *ptr); - -/** - * devlink_alloc_ns - Allocate new devlink instance resources - * in specific namespace - * - * @ops: ops - * @priv_size: size of user private data - * @net: net namespace - * @dev: parent device - * - * Allocate new devlink instance resources, including devlink index - * and name. - */ -struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, - size_t priv_size, struct net *net, - struct device *dev) -{ - struct devlink *devlink; - static u32 last_id; - int ret; - - WARN_ON(!ops || !dev); - if (!devlink_reload_actions_valid(ops)) - return NULL; - - devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL); - if (!devlink) - return NULL; - - ret = xa_alloc_cyclic(&devlinks, &devlink->index, devlink, xa_limit_31b, - &last_id, GFP_KERNEL); - if (ret < 0) - goto err_xa_alloc; - - devlink->netdevice_nb.notifier_call = devlink_netdevice_event; - ret = register_netdevice_notifier(&devlink->netdevice_nb); - if (ret) - goto err_register_netdevice_notifier; - - devlink->dev = dev; - devlink->ops = ops; - xa_init_flags(&devlink->ports, XA_FLAGS_ALLOC); - xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC); - write_pnet(&devlink->_net, net); - INIT_LIST_HEAD(&devlink->rate_list); - INIT_LIST_HEAD(&devlink->linecard_list); - INIT_LIST_HEAD(&devlink->sb_list); - INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); - INIT_LIST_HEAD(&devlink->resource_list); - INIT_LIST_HEAD(&devlink->param_list); - INIT_LIST_HEAD(&devlink->region_list); - INIT_LIST_HEAD(&devlink->reporter_list); - INIT_LIST_HEAD(&devlink->trap_list); - INIT_LIST_HEAD(&devlink->trap_group_list); - INIT_LIST_HEAD(&devlink->trap_policer_list); - lockdep_register_key(&devlink->lock_key); - mutex_init(&devlink->lock); - lockdep_set_class(&devlink->lock, &devlink->lock_key); - mutex_init(&devlink->reporters_lock); - mutex_init(&devlink->linecards_lock); - refcount_set(&devlink->refcount, 1); - init_completion(&devlink->comp); - - return devlink; - -err_register_netdevice_notifier: - xa_erase(&devlinks, devlink->index); -err_xa_alloc: - kfree(devlink); - return NULL; -} -EXPORT_SYMBOL_GPL(devlink_alloc_ns); - static void devlink_trap_policer_notify(struct devlink *devlink, const struct devlink_trap_policer_item *policer_item, @@ -10029,7 +8033,7 @@ static void devlink_trap_notify(struct devlink *devlink, const struct devlink_trap_item *trap_item, enum devlink_command cmd); -static void devlink_notify_register(struct devlink *devlink) +void devlink_notify_register(struct devlink *devlink) { struct devlink_trap_policer_item *policer_item; struct devlink_trap_group_item *group_item; @@ -10070,7 +8074,7 @@ static void devlink_notify_register(struct devlink *devlink) DEVLINK_CMD_PARAM_NEW); } -static void devlink_notify_unregister(struct devlink *devlink) +void devlink_notify_unregister(struct devlink *devlink) { struct devlink_trap_policer_item *policer_item; struct devlink_trap_group_item *group_item; @@ -10107,78 +8111,6 @@ static void devlink_notify_unregister(struct devlink *devlink) devlink_notify(devlink, DEVLINK_CMD_DEL); } -/** - * devlink_register - Register devlink instance - * - * @devlink: devlink - */ -void devlink_register(struct devlink *devlink) -{ - ASSERT_DEVLINK_NOT_REGISTERED(devlink); - /* Make sure that we are in .probe() routine */ - - xa_set_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); - devlink_notify_register(devlink); -} -EXPORT_SYMBOL_GPL(devlink_register); - -/** - * devlink_unregister - Unregister devlink instance - * - * @devlink: devlink - */ -void devlink_unregister(struct devlink *devlink) -{ - ASSERT_DEVLINK_REGISTERED(devlink); - /* Make sure that we are in .remove() routine */ - - xa_set_mark(&devlinks, devlink->index, DEVLINK_UNREGISTERING); - devlink_put(devlink); - wait_for_completion(&devlink->comp); - - devlink_notify_unregister(devlink); - xa_clear_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); - xa_clear_mark(&devlinks, devlink->index, DEVLINK_UNREGISTERING); -} -EXPORT_SYMBOL_GPL(devlink_unregister); - -/** - * devlink_free - Free devlink instance resources - * - * @devlink: devlink - */ -void devlink_free(struct devlink *devlink) -{ - ASSERT_DEVLINK_NOT_REGISTERED(devlink); - - mutex_destroy(&devlink->linecards_lock); - mutex_destroy(&devlink->reporters_lock); - mutex_destroy(&devlink->lock); - lockdep_unregister_key(&devlink->lock_key); - WARN_ON(!list_empty(&devlink->trap_policer_list)); - WARN_ON(!list_empty(&devlink->trap_group_list)); - WARN_ON(!list_empty(&devlink->trap_list)); - WARN_ON(!list_empty(&devlink->reporter_list)); - WARN_ON(!list_empty(&devlink->region_list)); - WARN_ON(!list_empty(&devlink->param_list)); - WARN_ON(!list_empty(&devlink->resource_list)); - WARN_ON(!list_empty(&devlink->dpipe_table_list)); - WARN_ON(!list_empty(&devlink->sb_list)); - WARN_ON(!list_empty(&devlink->rate_list)); - WARN_ON(!list_empty(&devlink->linecard_list)); - WARN_ON(!xa_empty(&devlink->ports)); - - xa_destroy(&devlink->snapshot_ids); - xa_destroy(&devlink->ports); - - WARN_ON_ONCE(unregister_netdevice_notifier(&devlink->netdevice_nb)); - - xa_erase(&devlinks, devlink->index); - - kfree(devlink); -} -EXPORT_SYMBOL_GPL(devlink_free); - static void devlink_port_type_warn(struct work_struct *work) { WARN(true, "Type was not set for devlink port."); @@ -10278,12 +8210,9 @@ int devl_port_register(struct devlink *devlink, devlink_port->index = port_index; spin_lock_init(&devlink_port->type_lock); INIT_LIST_HEAD(&devlink_port->reporter_list); - mutex_init(&devlink_port->reporters_lock); err = xa_insert(&devlink->ports, port_index, devlink_port, GFP_KERNEL); - if (err) { - mutex_destroy(&devlink_port->reporters_lock); + if (err) return err; - } INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn); devlink_port_type_warn_schedule(devlink_port); @@ -10334,7 +8263,6 @@ void devl_port_unregister(struct devlink_port *devlink_port) devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); xa_erase(&devlink_port->devlink->ports, devlink_port->index); WARN_ON(!list_empty(&devlink_port->reporter_list)); - mutex_destroy(&devlink_port->reporters_lock); devlink_port->registered = false; } EXPORT_SYMBOL_GPL(devl_port_unregister); @@ -10479,8 +8407,8 @@ void devlink_port_type_clear(struct devlink_port *devlink_port) } EXPORT_SYMBOL_GPL(devlink_port_type_clear); -static int devlink_netdevice_event(struct notifier_block *nb, - unsigned long event, void *ptr) +int devlink_port_netdevice_event(struct notifier_block *nb, + unsigned long event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); struct devlink_port *devlink_port = netdev->devlink_port; @@ -10914,7 +8842,7 @@ static void devlink_linecard_types_fini(struct devlink_linecard *linecard) } /** - * devlink_linecard_create - Create devlink linecard + * devl_linecard_create - Create devlink linecard * * @devlink: devlink * @linecard_index: driver-specific numerical identifier of the linecard @@ -10927,8 +8855,8 @@ static void devlink_linecard_types_fini(struct devlink_linecard *linecard) * Return: Line card structure or an ERR_PTR() encoded error code. */ struct devlink_linecard * -devlink_linecard_create(struct devlink *devlink, unsigned int linecard_index, - const struct devlink_linecard_ops *ops, void *priv) +devl_linecard_create(struct devlink *devlink, unsigned int linecard_index, + const struct devlink_linecard_ops *ops, void *priv) { struct devlink_linecard *linecard; int err; @@ -10937,17 +8865,12 @@ devlink_linecard_create(struct devlink *devlink, unsigned int linecard_index, !ops->types_count || !ops->types_get)) return ERR_PTR(-EINVAL); - mutex_lock(&devlink->linecards_lock); - if (devlink_linecard_index_exists(devlink, linecard_index)) { - mutex_unlock(&devlink->linecards_lock); + if (devlink_linecard_index_exists(devlink, linecard_index)) return ERR_PTR(-EEXIST); - } linecard = kzalloc(sizeof(*linecard), GFP_KERNEL); - if (!linecard) { - mutex_unlock(&devlink->linecards_lock); + if (!linecard) return ERR_PTR(-ENOMEM); - } linecard->devlink = devlink; linecard->index = linecard_index; @@ -10960,35 +8883,29 @@ devlink_linecard_create(struct devlink *devlink, unsigned int linecard_index, if (err) { mutex_destroy(&linecard->state_lock); kfree(linecard); - mutex_unlock(&devlink->linecards_lock); return ERR_PTR(err); } list_add_tail(&linecard->list, &devlink->linecard_list); - refcount_set(&linecard->refcount, 1); - mutex_unlock(&devlink->linecards_lock); devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); return linecard; } -EXPORT_SYMBOL_GPL(devlink_linecard_create); +EXPORT_SYMBOL_GPL(devl_linecard_create); /** - * devlink_linecard_destroy - Destroy devlink linecard + * devl_linecard_destroy - Destroy devlink linecard * * @linecard: devlink linecard */ -void devlink_linecard_destroy(struct devlink_linecard *linecard) +void devl_linecard_destroy(struct devlink_linecard *linecard) { - struct devlink *devlink = linecard->devlink; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL); - mutex_lock(&devlink->linecards_lock); list_del(&linecard->list); devlink_linecard_types_fini(linecard); - mutex_unlock(&devlink->linecards_lock); - devlink_linecard_put(linecard); + mutex_destroy(&linecard->state_lock); + kfree(linecard); } -EXPORT_SYMBOL_GPL(devlink_linecard_destroy); +EXPORT_SYMBOL_GPL(devl_linecard_destroy); /** * devlink_linecard_provision_set - Set provisioning on linecard @@ -11591,8 +9508,46 @@ static int devlink_param_verify(const struct devlink_param *param) return devlink_param_driver_verify(param); } +static int devlink_param_register(struct devlink *devlink, + const struct devlink_param *param) +{ + struct devlink_param_item *param_item; + + WARN_ON(devlink_param_verify(param)); + WARN_ON(devlink_param_find_by_name(&devlink->param_list, param->name)); + + if (param->supported_cmodes == BIT(DEVLINK_PARAM_CMODE_DRIVERINIT)) + WARN_ON(param->get || param->set); + else + WARN_ON(!param->get || !param->set); + + param_item = kzalloc(sizeof(*param_item), GFP_KERNEL); + if (!param_item) + return -ENOMEM; + + param_item->param = param; + + list_add_tail(¶m_item->list, &devlink->param_list); + devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW); + return 0; +} + +static void devlink_param_unregister(struct devlink *devlink, + const struct devlink_param *param) +{ + struct devlink_param_item *param_item; + + param_item = + devlink_param_find_by_name(&devlink->param_list, param->name); + if (WARN_ON(!param_item)) + return; + devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_DEL); + list_del(¶m_item->list); + kfree(param_item); +} + /** - * devlink_params_register - register configuration parameters + * devl_params_register - register configuration parameters * * @devlink: devlink * @params: configuration parameters array @@ -11600,14 +9555,14 @@ static int devlink_param_verify(const struct devlink_param *param) * * Register the configuration parameters supported by the driver. */ -int devlink_params_register(struct devlink *devlink, - const struct devlink_param *params, - size_t params_count) +int devl_params_register(struct devlink *devlink, + const struct devlink_param *params, + size_t params_count) { const struct devlink_param *param = params; int i, err; - ASSERT_DEVLINK_NOT_REGISTERED(devlink); + lockdep_assert_held(&devlink->lock); for (i = 0; i < params_count; i++, param++) { err = devlink_param_register(devlink, param); @@ -11624,86 +9579,54 @@ rollback: devlink_param_unregister(devlink, param); return err; } +EXPORT_SYMBOL_GPL(devl_params_register); + +int devlink_params_register(struct devlink *devlink, + const struct devlink_param *params, + size_t params_count) +{ + int err; + + devl_lock(devlink); + err = devl_params_register(devlink, params, params_count); + devl_unlock(devlink); + return err; +} EXPORT_SYMBOL_GPL(devlink_params_register); /** - * devlink_params_unregister - unregister configuration parameters + * devl_params_unregister - unregister configuration parameters * @devlink: devlink * @params: configuration parameters to unregister * @params_count: number of parameters provided */ -void devlink_params_unregister(struct devlink *devlink, - const struct devlink_param *params, - size_t params_count) +void devl_params_unregister(struct devlink *devlink, + const struct devlink_param *params, + size_t params_count) { const struct devlink_param *param = params; int i; - ASSERT_DEVLINK_NOT_REGISTERED(devlink); + lockdep_assert_held(&devlink->lock); for (i = 0; i < params_count; i++, param++) devlink_param_unregister(devlink, param); } -EXPORT_SYMBOL_GPL(devlink_params_unregister); - -/** - * devlink_param_register - register one configuration parameter - * - * @devlink: devlink - * @param: one configuration parameter - * - * Register the configuration parameter supported by the driver. - * Return: returns 0 on successful registration or error code otherwise. - */ -int devlink_param_register(struct devlink *devlink, - const struct devlink_param *param) -{ - struct devlink_param_item *param_item; - - ASSERT_DEVLINK_NOT_REGISTERED(devlink); - - WARN_ON(devlink_param_verify(param)); - WARN_ON(devlink_param_find_by_name(&devlink->param_list, param->name)); - - if (param->supported_cmodes == BIT(DEVLINK_PARAM_CMODE_DRIVERINIT)) - WARN_ON(param->get || param->set); - else - WARN_ON(!param->get || !param->set); - - param_item = kzalloc(sizeof(*param_item), GFP_KERNEL); - if (!param_item) - return -ENOMEM; - - param_item->param = param; - - list_add_tail(¶m_item->list, &devlink->param_list); - return 0; -} -EXPORT_SYMBOL_GPL(devlink_param_register); +EXPORT_SYMBOL_GPL(devl_params_unregister); -/** - * devlink_param_unregister - unregister one configuration parameter - * @devlink: devlink - * @param: configuration parameter to unregister - */ -void devlink_param_unregister(struct devlink *devlink, - const struct devlink_param *param) +void devlink_params_unregister(struct devlink *devlink, + const struct devlink_param *params, + size_t params_count) { - struct devlink_param_item *param_item; - - ASSERT_DEVLINK_NOT_REGISTERED(devlink); - - param_item = - devlink_param_find_by_name(&devlink->param_list, param->name); - WARN_ON(!param_item); - list_del(¶m_item->list); - kfree(param_item); + devl_lock(devlink); + devl_params_unregister(devlink, params, params_count); + devl_unlock(devlink); } -EXPORT_SYMBOL_GPL(devlink_param_unregister); +EXPORT_SYMBOL_GPL(devlink_params_unregister); /** - * devlink_param_driverinit_value_get - get configuration parameter - * value for driver initializing + * devl_param_driverinit_value_get - get configuration parameter + * value for driver initializing * * @devlink: devlink * @param_id: parameter ID @@ -11712,21 +9635,25 @@ EXPORT_SYMBOL_GPL(devlink_param_unregister); * This function should be used by the driver to get driverinit * configuration for initialization after reload command. */ -int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id, - union devlink_param_value *init_val) +int devl_param_driverinit_value_get(struct devlink *devlink, u32 param_id, + union devlink_param_value *init_val) { struct devlink_param_item *param_item; - if (!devlink_reload_supported(devlink->ops)) + lockdep_assert_held(&devlink->lock); + + if (WARN_ON(!devlink_reload_supported(devlink->ops))) return -EOPNOTSUPP; param_item = devlink_param_find_by_id(&devlink->param_list, param_id); if (!param_item) return -EINVAL; - if (!param_item->driverinit_value_valid || - !devlink_param_cmode_is_supported(param_item->param, - DEVLINK_PARAM_CMODE_DRIVERINIT)) + if (!param_item->driverinit_value_valid) + return -EOPNOTSUPP; + + if (WARN_ON(!devlink_param_cmode_is_supported(param_item->param, + DEVLINK_PARAM_CMODE_DRIVERINIT))) return -EOPNOTSUPP; if (param_item->param->type == DEVLINK_PARAM_TYPE_STRING) @@ -11736,12 +9663,12 @@ int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id, return 0; } -EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_get); +EXPORT_SYMBOL_GPL(devl_param_driverinit_value_get); /** - * devlink_param_driverinit_value_set - set value of configuration - * parameter for driverinit - * configuration mode + * devl_param_driverinit_value_set - set value of configuration + * parameter for driverinit + * configuration mode * * @devlink: devlink * @param_id: parameter ID @@ -11750,34 +9677,33 @@ EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_get); * This function should be used by the driver to set driverinit * configuration mode default value. */ -int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id, - union devlink_param_value init_val) +void devl_param_driverinit_value_set(struct devlink *devlink, u32 param_id, + union devlink_param_value init_val) { struct devlink_param_item *param_item; - ASSERT_DEVLINK_NOT_REGISTERED(devlink); - param_item = devlink_param_find_by_id(&devlink->param_list, param_id); - if (!param_item) - return -EINVAL; + if (WARN_ON(!param_item)) + return; - if (!devlink_param_cmode_is_supported(param_item->param, - DEVLINK_PARAM_CMODE_DRIVERINIT)) - return -EOPNOTSUPP; + if (WARN_ON(!devlink_param_cmode_is_supported(param_item->param, + DEVLINK_PARAM_CMODE_DRIVERINIT))) + return; if (param_item->param->type == DEVLINK_PARAM_TYPE_STRING) strcpy(param_item->driverinit_value.vstr, init_val.vstr); else param_item->driverinit_value = init_val; param_item->driverinit_value_valid = true; - return 0; + + devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW); } -EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_set); +EXPORT_SYMBOL_GPL(devl_param_driverinit_value_set); /** - * devlink_param_value_changed - notify devlink on a parameter's value - * change. Should be called by the driver - * right after the change. + * devl_param_value_changed - notify devlink on a parameter's value + * change. Should be called by the driver + * right after the change. * * @devlink: devlink * @param_id: parameter ID @@ -11786,7 +9712,7 @@ EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_set); * change, excluding driverinit configuration mode. * For driverinit configuration mode driver should use the function */ -void devlink_param_value_changed(struct devlink *devlink, u32 param_id) +void devl_param_value_changed(struct devlink *devlink, u32 param_id) { struct devlink_param_item *param_item; @@ -11795,7 +9721,7 @@ void devlink_param_value_changed(struct devlink *devlink, u32 param_id) devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW); } -EXPORT_SYMBOL_GPL(devlink_param_value_changed); +EXPORT_SYMBOL_GPL(devl_param_value_changed); /** * devl_region_create - create a new address region @@ -12881,76 +10807,6 @@ devl_trap_policers_unregister(struct devlink *devlink, } EXPORT_SYMBOL_GPL(devl_trap_policers_unregister); -static void __devlink_compat_running_version(struct devlink *devlink, - char *buf, size_t len) -{ - struct devlink_info_req req = {}; - const struct nlattr *nlattr; - struct sk_buff *msg; - int rem, err; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return; - - req.msg = msg; - err = devlink->ops->info_get(devlink, &req, NULL); - if (err) - goto free_msg; - - nla_for_each_attr(nlattr, (void *)msg->data, msg->len, rem) { - const struct nlattr *kv; - int rem_kv; - - if (nla_type(nlattr) != DEVLINK_ATTR_INFO_VERSION_RUNNING) - continue; - - nla_for_each_nested(kv, nlattr, rem_kv) { - if (nla_type(kv) != DEVLINK_ATTR_INFO_VERSION_VALUE) - continue; - - strlcat(buf, nla_data(kv), len); - strlcat(buf, " ", len); - } - } -free_msg: - nlmsg_free(msg); -} - -void devlink_compat_running_version(struct devlink *devlink, - char *buf, size_t len) -{ - if (!devlink->ops->info_get) - return; - - devl_lock(devlink); - __devlink_compat_running_version(devlink, buf, len); - devl_unlock(devlink); -} - -int devlink_compat_flash_update(struct devlink *devlink, const char *file_name) -{ - struct devlink_flash_update_params params = {}; - int ret; - - if (!devlink->ops->flash_update) - return -EOPNOTSUPP; - - ret = request_firmware(¶ms.fw, file_name, devlink->dev); - if (ret) - return ret; - - devl_lock(devlink); - devlink_flash_update_begin_notify(devlink); - ret = devlink->ops->flash_update(devlink, ¶ms, NULL); - devlink_flash_update_end_notify(devlink); - devl_unlock(devlink); - - release_firmware(params.fw); - - return ret; -} - int devlink_compat_phys_port_name_get(struct net_device *dev, char *name, size_t len) { @@ -12986,47 +10842,3 @@ int devlink_compat_switch_id_get(struct net_device *dev, return 0; } - -static void __net_exit devlink_pernet_pre_exit(struct net *net) -{ - struct devlink *devlink; - u32 actions_performed; - unsigned long index; - int err; - - /* In case network namespace is getting destroyed, reload - * all devlink instances from this namespace into init_net. - */ - devlinks_xa_for_each_registered_get(net, index, devlink) { - WARN_ON(!(devlink->features & DEVLINK_F_RELOAD)); - mutex_lock(&devlink->lock); - err = devlink_reload(devlink, &init_net, - DEVLINK_RELOAD_ACTION_DRIVER_REINIT, - DEVLINK_RELOAD_LIMIT_UNSPEC, - &actions_performed, NULL); - mutex_unlock(&devlink->lock); - if (err && err != -EOPNOTSUPP) - pr_warn("Failed to reload devlink instance into init_net\n"); - devlink_put(devlink); - } -} - -static struct pernet_operations devlink_pernet_ops __net_initdata = { - .pre_exit = devlink_pernet_pre_exit, -}; - -static int __init devlink_init(void) -{ - int err; - - err = genl_register_family(&devlink_nl_family); - if (err) - goto out; - err = register_pernet_subsys(&devlink_pernet_ops); - -out: - WARN_ON(err); - return err; -} - -subsys_initcall(devlink_init); diff --git a/net/devlink/netlink.c b/net/devlink/netlink.c new file mode 100644 index 000000000000..7a332eb70f70 --- /dev/null +++ b/net/devlink/netlink.c @@ -0,0 +1,251 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> + */ + +#include <net/genetlink.h> +#include <net/sock.h> + +#include "devl_internal.h" + +static const struct genl_multicast_group devlink_nl_mcgrps[] = { + [DEVLINK_MCGRP_CONFIG] = { .name = DEVLINK_GENL_MCGRP_CONFIG_NAME }, +}; + +static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { + [DEVLINK_ATTR_UNSPEC] = { .strict_start_type = + DEVLINK_ATTR_TRAP_POLICER_ID }, + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32 }, + [DEVLINK_ATTR_PORT_TYPE] = NLA_POLICY_RANGE(NLA_U16, DEVLINK_PORT_TYPE_AUTO, + DEVLINK_PORT_TYPE_IB), + [DEVLINK_ATTR_PORT_SPLIT_COUNT] = { .type = NLA_U32 }, + [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32 }, + [DEVLINK_ATTR_SB_POOL_INDEX] = { .type = NLA_U16 }, + [DEVLINK_ATTR_SB_POOL_TYPE] = { .type = NLA_U8 }, + [DEVLINK_ATTR_SB_POOL_SIZE] = { .type = NLA_U32 }, + [DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE] = { .type = NLA_U8 }, + [DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32 }, + [DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 }, + [DEVLINK_ATTR_ESWITCH_MODE] = NLA_POLICY_RANGE(NLA_U16, DEVLINK_ESWITCH_MODE_LEGACY, + DEVLINK_ESWITCH_MODE_SWITCHDEV), + [DEVLINK_ATTR_ESWITCH_INLINE_MODE] = { .type = NLA_U8 }, + [DEVLINK_ATTR_ESWITCH_ENCAP_MODE] = { .type = NLA_U8 }, + [DEVLINK_ATTR_DPIPE_TABLE_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED] = { .type = NLA_U8 }, + [DEVLINK_ATTR_RESOURCE_ID] = { .type = NLA_U64}, + [DEVLINK_ATTR_RESOURCE_SIZE] = { .type = NLA_U64}, + [DEVLINK_ATTR_PARAM_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_PARAM_TYPE] = { .type = NLA_U8 }, + [DEVLINK_ATTR_PARAM_VALUE_CMODE] = { .type = NLA_U8 }, + [DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32 }, + [DEVLINK_ATTR_REGION_CHUNK_ADDR] = { .type = NLA_U64 }, + [DEVLINK_ATTR_REGION_CHUNK_LEN] = { .type = NLA_U64 }, + [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64 }, + [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 }, + [DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_FLASH_UPDATE_COMPONENT] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK] = + NLA_POLICY_BITFIELD32(DEVLINK_SUPPORTED_FLASH_OVERWRITE_SECTIONS), + [DEVLINK_ATTR_TRAP_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_TRAP_ACTION] = { .type = NLA_U8 }, + [DEVLINK_ATTR_TRAP_GROUP_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_NETNS_PID] = { .type = NLA_U32 }, + [DEVLINK_ATTR_NETNS_FD] = { .type = NLA_U32 }, + [DEVLINK_ATTR_NETNS_ID] = { .type = NLA_U32 }, + [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP] = { .type = NLA_U8 }, + [DEVLINK_ATTR_TRAP_POLICER_ID] = { .type = NLA_U32 }, + [DEVLINK_ATTR_TRAP_POLICER_RATE] = { .type = NLA_U64 }, + [DEVLINK_ATTR_TRAP_POLICER_BURST] = { .type = NLA_U64 }, + [DEVLINK_ATTR_PORT_FUNCTION] = { .type = NLA_NESTED }, + [DEVLINK_ATTR_RELOAD_ACTION] = NLA_POLICY_RANGE(NLA_U8, DEVLINK_RELOAD_ACTION_DRIVER_REINIT, + DEVLINK_RELOAD_ACTION_MAX), + [DEVLINK_ATTR_RELOAD_LIMITS] = NLA_POLICY_BITFIELD32(DEVLINK_RELOAD_LIMITS_VALID_MASK), + [DEVLINK_ATTR_PORT_FLAVOUR] = { .type = NLA_U16 }, + [DEVLINK_ATTR_PORT_PCI_PF_NUMBER] = { .type = NLA_U16 }, + [DEVLINK_ATTR_PORT_PCI_SF_NUMBER] = { .type = NLA_U32 }, + [DEVLINK_ATTR_PORT_CONTROLLER_NUMBER] = { .type = NLA_U32 }, + [DEVLINK_ATTR_RATE_TYPE] = { .type = NLA_U16 }, + [DEVLINK_ATTR_RATE_TX_SHARE] = { .type = NLA_U64 }, + [DEVLINK_ATTR_RATE_TX_MAX] = { .type = NLA_U64 }, + [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_LINECARD_INDEX] = { .type = NLA_U32 }, + [DEVLINK_ATTR_LINECARD_TYPE] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_SELFTESTS] = { .type = NLA_NESTED }, + [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32 }, + [DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32 }, + [DEVLINK_ATTR_REGION_DIRECT] = { .type = NLA_FLAG }, +}; + +struct devlink * +devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs) +{ + struct devlink *devlink; + unsigned long index; + char *busname; + char *devname; + + if (!attrs[DEVLINK_ATTR_BUS_NAME] || !attrs[DEVLINK_ATTR_DEV_NAME]) + return ERR_PTR(-EINVAL); + + busname = nla_data(attrs[DEVLINK_ATTR_BUS_NAME]); + devname = nla_data(attrs[DEVLINK_ATTR_DEV_NAME]); + + devlinks_xa_for_each_registered_get(net, index, devlink) { + devl_lock(devlink); + if (devl_is_registered(devlink) && + strcmp(devlink->dev->bus->name, busname) == 0 && + strcmp(dev_name(devlink->dev), devname) == 0) + return devlink; + devl_unlock(devlink); + devlink_put(devlink); + } + + return ERR_PTR(-ENODEV); +} + +static int devlink_nl_pre_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + struct devlink_linecard *linecard; + struct devlink_port *devlink_port; + struct devlink *devlink; + int err; + + devlink = devlink_get_from_attrs_lock(genl_info_net(info), info->attrs); + if (IS_ERR(devlink)) + return PTR_ERR(devlink); + + info->user_ptr[0] = devlink; + if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) { + devlink_port = devlink_port_get_from_info(devlink, info); + if (IS_ERR(devlink_port)) { + err = PTR_ERR(devlink_port); + goto unlock; + } + info->user_ptr[1] = devlink_port; + } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT) { + devlink_port = devlink_port_get_from_info(devlink, info); + if (!IS_ERR(devlink_port)) + info->user_ptr[1] = devlink_port; + } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_RATE) { + struct devlink_rate *devlink_rate; + + devlink_rate = devlink_rate_get_from_info(devlink, info); + if (IS_ERR(devlink_rate)) { + err = PTR_ERR(devlink_rate); + goto unlock; + } + info->user_ptr[1] = devlink_rate; + } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_RATE_NODE) { + struct devlink_rate *rate_node; + + rate_node = devlink_rate_node_get_from_info(devlink, info); + if (IS_ERR(rate_node)) { + err = PTR_ERR(rate_node); + goto unlock; + } + info->user_ptr[1] = rate_node; + } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_LINECARD) { + linecard = devlink_linecard_get_from_info(devlink, info); + if (IS_ERR(linecard)) { + err = PTR_ERR(linecard); + goto unlock; + } + info->user_ptr[1] = linecard; + } + return 0; + +unlock: + devl_unlock(devlink); + devlink_put(devlink); + return err; +} + +static void devlink_nl_post_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink; + + devlink = info->user_ptr[0]; + devl_unlock(devlink); + devlink_put(devlink); +} + +static const struct devlink_cmd *devl_cmds[] = { + [DEVLINK_CMD_GET] = &devl_cmd_get, + [DEVLINK_CMD_PORT_GET] = &devl_cmd_port_get, + [DEVLINK_CMD_SB_GET] = &devl_cmd_sb_get, + [DEVLINK_CMD_SB_POOL_GET] = &devl_cmd_sb_pool_get, + [DEVLINK_CMD_SB_PORT_POOL_GET] = &devl_cmd_sb_port_pool_get, + [DEVLINK_CMD_SB_TC_POOL_BIND_GET] = &devl_cmd_sb_tc_pool_bind_get, + [DEVLINK_CMD_PARAM_GET] = &devl_cmd_param_get, + [DEVLINK_CMD_REGION_GET] = &devl_cmd_region_get, + [DEVLINK_CMD_INFO_GET] = &devl_cmd_info_get, + [DEVLINK_CMD_HEALTH_REPORTER_GET] = &devl_cmd_health_reporter_get, + [DEVLINK_CMD_TRAP_GET] = &devl_cmd_trap_get, + [DEVLINK_CMD_TRAP_GROUP_GET] = &devl_cmd_trap_group_get, + [DEVLINK_CMD_TRAP_POLICER_GET] = &devl_cmd_trap_policer_get, + [DEVLINK_CMD_RATE_GET] = &devl_cmd_rate_get, + [DEVLINK_CMD_LINECARD_GET] = &devl_cmd_linecard_get, + [DEVLINK_CMD_SELFTESTS_GET] = &devl_cmd_selftests_get, +}; + +int devlink_nl_instance_iter_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + const struct genl_dumpit_info *info = genl_dumpit_info(cb); + struct devlink_nl_dump_state *state = devlink_dump_state(cb); + const struct devlink_cmd *cmd; + struct devlink *devlink; + int err = 0; + + cmd = devl_cmds[info->op.cmd]; + + while ((devlink = devlinks_xa_find_get(sock_net(msg->sk), + &state->instance))) { + devl_lock(devlink); + + if (devl_is_registered(devlink)) + err = cmd->dump_one(msg, devlink, cb); + else + err = 0; + + devl_unlock(devlink); + devlink_put(devlink); + + if (err) + break; + + state->instance++; + + /* restart sub-object walk for the next instance */ + state->idx = 0; + } + + if (err != -EMSGSIZE) + return err; + return msg->len; +} + +struct genl_family devlink_nl_family __ro_after_init = { + .name = DEVLINK_GENL_NAME, + .version = DEVLINK_GENL_VERSION, + .maxattr = DEVLINK_ATTR_MAX, + .policy = devlink_nl_policy, + .netnsok = true, + .parallel_ops = true, + .pre_doit = devlink_nl_pre_doit, + .post_doit = devlink_nl_post_doit, + .module = THIS_MODULE, + .small_ops = devlink_nl_ops, + .n_small_ops = ARRAY_SIZE(devlink_nl_ops), + .resv_start_op = DEVLINK_CMD_SELFTESTS_RUN + 1, + .mcgrps = devlink_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(devlink_nl_mcgrps), +}; diff --git a/net/dsa/master.c b/net/dsa/master.c index 26d90140d271..22d3f16b0e6d 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -299,7 +299,7 @@ static ssize_t tagging_show(struct device *d, struct device_attribute *attr, struct net_device *dev = to_net_dev(d); struct dsa_port *cpu_dp = dev->dsa_ptr; - return sprintf(buf, "%s\n", + return sysfs_emit(buf, "%s\n", dsa_tag_protocol_to_str(cpu_dp->tag_ops)); } @@ -464,9 +464,7 @@ int dsa_master_lag_setup(struct net_device *lag_dev, struct dsa_port *cpu_dp, err = dsa_port_lag_join(cpu_dp, lag_dev, uinfo, extack); if (err) { - if (extack && !extack->_msg) - NL_SET_ERR_MSG_MOD(extack, - "CPU port failed to join LAG"); + NL_SET_ERR_MSG_WEAK_MOD(extack, "CPU port failed to join LAG"); goto out_master_teardown; } diff --git a/net/dsa/slave.c b/net/dsa/slave.c index aab79c355224..6957971c2db2 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1117,6 +1117,40 @@ static void dsa_slave_net_selftest(struct net_device *ndev, net_selftest(ndev, etest, buf); } +static int dsa_slave_get_mm(struct net_device *dev, + struct ethtool_mm_state *state) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; + + if (!ds->ops->get_mm) + return -EOPNOTSUPP; + + return ds->ops->get_mm(ds, dp->index, state); +} + +static int dsa_slave_set_mm(struct net_device *dev, struct ethtool_mm_cfg *cfg, + struct netlink_ext_ack *extack) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; + + if (!ds->ops->set_mm) + return -EOPNOTSUPP; + + return ds->ops->set_mm(ds, dp->index, cfg, extack); +} + +static void dsa_slave_get_mm_stats(struct net_device *dev, + struct ethtool_mm_stats *stats) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; + + if (ds->ops->get_mm_stats) + ds->ops->get_mm_stats(ds, dp->index, stats); +} + static void dsa_slave_get_wol(struct net_device *dev, struct ethtool_wolinfo *w) { struct dsa_port *dp = dsa_slave_to_port(dev); @@ -2205,6 +2239,9 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .set_rxnfc = dsa_slave_set_rxnfc, .get_ts_info = dsa_slave_get_ts_info, .self_test = dsa_slave_net_selftest, + .get_mm = dsa_slave_get_mm, + .set_mm = dsa_slave_set_mm, + .get_mm_stats = dsa_slave_get_mm_stats, }; static const struct dcbnl_rtnl_ops __maybe_unused dsa_slave_dcbnl_ops = { @@ -2655,9 +2692,8 @@ static int dsa_slave_changeupper(struct net_device *dev, if (!err) dsa_bridge_mtu_normalization(dp); if (err == -EOPNOTSUPP) { - if (extack && !extack->_msg) - NL_SET_ERR_MSG_MOD(extack, - "Offloading not supported"); + NL_SET_ERR_MSG_WEAK_MOD(extack, + "Offloading not supported"); err = 0; } err = notifier_from_errno(err); @@ -2670,8 +2706,8 @@ static int dsa_slave_changeupper(struct net_device *dev, err = dsa_port_lag_join(dp, info->upper_dev, info->upper_info, extack); if (err == -EOPNOTSUPP) { - NL_SET_ERR_MSG_MOD(info->info.extack, - "Offloading not supported"); + NL_SET_ERR_MSG_WEAK_MOD(extack, + "Offloading not supported"); err = 0; } err = notifier_from_errno(err); @@ -2683,8 +2719,8 @@ static int dsa_slave_changeupper(struct net_device *dev, if (info->linking) { err = dsa_port_hsr_join(dp, info->upper_dev); if (err == -EOPNOTSUPP) { - NL_SET_ERR_MSG_MOD(info->info.extack, - "Offloading not supported"); + NL_SET_ERR_MSG_WEAK_MOD(extack, + "Offloading not supported"); err = 0; } err = notifier_from_errno(err); diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index 080e5c369f5b..0eb1c7784c3d 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -4,8 +4,10 @@ * Copyright (c) 2017 Microchip Technology */ +#include <linux/dsa/ksz_common.h> #include <linux/etherdevice.h> #include <linux/list.h> +#include <linux/ptp_classify.h> #include <net/dsa.h> #include "tag.h" @@ -16,9 +18,71 @@ #define LAN937X_NAME "lan937x" /* Typically only one byte is used for tail tag. */ +#define KSZ_PTP_TAG_LEN 4 #define KSZ_EGRESS_TAG_LEN 1 #define KSZ_INGRESS_TAG_LEN 1 +#define KSZ_HWTS_EN 0 + +struct ksz_tagger_private { + struct ksz_tagger_data data; /* Must be first */ + unsigned long state; + struct kthread_worker *xmit_worker; +}; + +static struct ksz_tagger_private * +ksz_tagger_private(struct dsa_switch *ds) +{ + return ds->tagger_data; +} + +static void ksz_hwtstamp_set_state(struct dsa_switch *ds, bool on) +{ + struct ksz_tagger_private *priv = ksz_tagger_private(ds); + + if (on) + set_bit(KSZ_HWTS_EN, &priv->state); + else + clear_bit(KSZ_HWTS_EN, &priv->state); +} + +static void ksz_disconnect(struct dsa_switch *ds) +{ + struct ksz_tagger_private *priv = ds->tagger_data; + + kthread_destroy_worker(priv->xmit_worker); + kfree(priv); + ds->tagger_data = NULL; +} + +static int ksz_connect(struct dsa_switch *ds) +{ + struct ksz_tagger_data *tagger_data; + struct kthread_worker *xmit_worker; + struct ksz_tagger_private *priv; + int ret; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + xmit_worker = kthread_create_worker(0, "dsa%d:%d_xmit", + ds->dst->index, ds->index); + if (IS_ERR(xmit_worker)) { + ret = PTR_ERR(xmit_worker); + kfree(priv); + return ret; + } + + priv->xmit_worker = xmit_worker; + /* Export functions for switch driver use */ + tagger_data = &priv->data; + tagger_data->hwtstamp_set_state = ksz_hwtstamp_set_state; + ds->tagger_data = priv; + + return 0; +} + static struct sk_buff *ksz_common_rcv(struct sk_buff *skb, struct net_device *dev, unsigned int port, unsigned int len) @@ -92,17 +156,20 @@ DSA_TAG_DRIVER(ksz8795_netdev_ops); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ8795, KSZ8795_NAME); /* - * For Ingress (Host -> KSZ9477), 2 bytes are added before FCS. + * For Ingress (Host -> KSZ9477), 2/6 bytes are added before FCS. * --------------------------------------------------------------------------- - * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|tag1(1byte)|FCS(4bytes) + * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|ts(4bytes)|tag0(1byte)|tag1(1byte)| + * FCS(4bytes) * --------------------------------------------------------------------------- + * ts : time stamp (Present only if PTP is enabled in the Hardware) * tag0 : Prioritization (not used now) * tag1 : each bit represents port (eg, 0x01=port1, 0x02=port2, 0x10=port5) * - * For Egress (KSZ9477 -> Host), 1 byte is added before FCS. + * For Egress (KSZ9477 -> Host), 1/5 bytes is added before FCS. * --------------------------------------------------------------------------- - * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|FCS(4bytes) + * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|ts(4bytes)|tag0(1byte)|FCS(4bytes) * --------------------------------------------------------------------------- + * ts : time stamp (Present only if bit 7 of tag0 is set) * tag0 : zero-based value represents port * (eg, 0x00=port1, 0x02=port3, 0x06=port7) */ @@ -111,12 +178,100 @@ MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ8795, KSZ8795_NAME); #define KSZ9477_PTP_TAG_LEN 4 #define KSZ9477_PTP_TAG_INDICATION 0x80 +#define KSZ9477_TAIL_TAG_PRIO GENMASK(8, 7) #define KSZ9477_TAIL_TAG_OVERRIDE BIT(9) #define KSZ9477_TAIL_TAG_LOOKUP BIT(10) +static void ksz_rcv_timestamp(struct sk_buff *skb, u8 *tag) +{ + u8 *tstamp_raw = tag - KSZ_PTP_TAG_LEN; + ktime_t tstamp; + + tstamp = ksz_decode_tstamp(get_unaligned_be32(tstamp_raw)); + KSZ_SKB_CB(skb)->tstamp = tstamp; +} + +/* Time stamp tag *needs* to be inserted if PTP is enabled in hardware. + * Regardless of Whether it is a PTP frame or not. + */ +static void ksz_xmit_timestamp(struct dsa_port *dp, struct sk_buff *skb) +{ + struct ksz_tagger_private *priv; + struct ptp_header *ptp_hdr; + unsigned int ptp_type; + u32 tstamp_raw = 0; + s64 correction; + + priv = ksz_tagger_private(dp->ds); + + if (!test_bit(KSZ_HWTS_EN, &priv->state)) + return; + + if (!KSZ_SKB_CB(skb)->update_correction) + goto output_tag; + + ptp_type = KSZ_SKB_CB(skb)->ptp_type; + + ptp_hdr = ptp_parse_header(skb, ptp_type); + if (!ptp_hdr) + goto output_tag; + + correction = (s64)get_unaligned_be64(&ptp_hdr->correction); + + if (correction < 0) { + struct timespec64 ts; + + ts = ns_to_timespec64(-correction >> 16); + tstamp_raw = ((ts.tv_sec & 3) << 30) | ts.tv_nsec; + + /* Set correction field to 0 and update UDP checksum */ + ptp_header_update_correction(skb, ptp_type, ptp_hdr, 0); + } + +output_tag: + put_unaligned_be32(tstamp_raw, skb_put(skb, KSZ_PTP_TAG_LEN)); +} + +/* Defer transmit if waiting for egress time stamp is required. */ +static struct sk_buff *ksz_defer_xmit(struct dsa_port *dp, struct sk_buff *skb) +{ + struct ksz_tagger_data *tagger_data = ksz_tagger_data(dp->ds); + struct ksz_tagger_private *priv = ksz_tagger_private(dp->ds); + void (*xmit_work_fn)(struct kthread_work *work); + struct sk_buff *clone = KSZ_SKB_CB(skb)->clone; + struct ksz_deferred_xmit_work *xmit_work; + struct kthread_worker *xmit_worker; + + if (!clone) + return skb; /* no deferred xmit for this packet */ + + xmit_work_fn = tagger_data->xmit_work_fn; + xmit_worker = priv->xmit_worker; + + if (!xmit_work_fn || !xmit_worker) + return NULL; + + xmit_work = kzalloc(sizeof(*xmit_work), GFP_ATOMIC); + if (!xmit_work) + return NULL; + + kthread_init_work(&xmit_work->work, xmit_work_fn); + /* Increase refcount so the kfree_skb in dsa_slave_xmit + * won't really free the packet. + */ + xmit_work->dp = dp; + xmit_work->skb = skb_get(skb); + + kthread_queue_work(xmit_worker, &xmit_work->work); + + return NULL; +} + static struct sk_buff *ksz9477_xmit(struct sk_buff *skb, struct net_device *dev) { + u16 queue_mapping = skb_get_queue_mapping(skb); + u8 prio = netdev_txq_to_tc(dev, queue_mapping); struct dsa_port *dp = dsa_slave_to_port(dev); __be16 *tag; u8 *addr; @@ -126,17 +281,21 @@ static struct sk_buff *ksz9477_xmit(struct sk_buff *skb, return NULL; /* Tag encoding */ + ksz_xmit_timestamp(dp, skb); + tag = skb_put(skb, KSZ9477_INGRESS_TAG_LEN); addr = skb_mac_header(skb); val = BIT(dp->index); + val |= FIELD_PREP(KSZ9477_TAIL_TAG_PRIO, prio); + if (is_link_local_ether_addr(addr)) val |= KSZ9477_TAIL_TAG_OVERRIDE; *tag = cpu_to_be16(val); - return skb; + return ksz_defer_xmit(dp, skb); } static struct sk_buff *ksz9477_rcv(struct sk_buff *skb, struct net_device *dev) @@ -147,8 +306,10 @@ static struct sk_buff *ksz9477_rcv(struct sk_buff *skb, struct net_device *dev) unsigned int len = KSZ_EGRESS_TAG_LEN; /* Extra 4-bytes PTP timestamp */ - if (tag[0] & KSZ9477_PTP_TAG_INDICATION) - len += KSZ9477_PTP_TAG_LEN; + if (tag[0] & KSZ9477_PTP_TAG_INDICATION) { + ksz_rcv_timestamp(skb, tag); + len += KSZ_PTP_TAG_LEN; + } return ksz_common_rcv(skb, dev, port, len); } @@ -158,18 +319,23 @@ static const struct dsa_device_ops ksz9477_netdev_ops = { .proto = DSA_TAG_PROTO_KSZ9477, .xmit = ksz9477_xmit, .rcv = ksz9477_rcv, - .needed_tailroom = KSZ9477_INGRESS_TAG_LEN, + .connect = ksz_connect, + .disconnect = ksz_disconnect, + .needed_tailroom = KSZ9477_INGRESS_TAG_LEN + KSZ_PTP_TAG_LEN, }; DSA_TAG_DRIVER(ksz9477_netdev_ops); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ9477, KSZ9477_NAME); +#define KSZ9893_TAIL_TAG_PRIO GENMASK(4, 3) #define KSZ9893_TAIL_TAG_OVERRIDE BIT(5) #define KSZ9893_TAIL_TAG_LOOKUP BIT(6) static struct sk_buff *ksz9893_xmit(struct sk_buff *skb, struct net_device *dev) { + u16 queue_mapping = skb_get_queue_mapping(skb); + u8 prio = netdev_txq_to_tc(dev, queue_mapping); struct dsa_port *dp = dsa_slave_to_port(dev); u8 *addr; u8 *tag; @@ -178,15 +344,19 @@ static struct sk_buff *ksz9893_xmit(struct sk_buff *skb, return NULL; /* Tag encoding */ + ksz_xmit_timestamp(dp, skb); + tag = skb_put(skb, KSZ_INGRESS_TAG_LEN); addr = skb_mac_header(skb); *tag = BIT(dp->index); + *tag |= FIELD_PREP(KSZ9893_TAIL_TAG_PRIO, prio); + if (is_link_local_ether_addr(addr)) *tag |= KSZ9893_TAIL_TAG_OVERRIDE; - return skb; + return ksz_defer_xmit(dp, skb); } static const struct dsa_device_ops ksz9893_netdev_ops = { @@ -194,23 +364,28 @@ static const struct dsa_device_ops ksz9893_netdev_ops = { .proto = DSA_TAG_PROTO_KSZ9893, .xmit = ksz9893_xmit, .rcv = ksz9477_rcv, - .needed_tailroom = KSZ_INGRESS_TAG_LEN, + .connect = ksz_connect, + .disconnect = ksz_disconnect, + .needed_tailroom = KSZ_INGRESS_TAG_LEN + KSZ_PTP_TAG_LEN, }; DSA_TAG_DRIVER(ksz9893_netdev_ops); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ9893, KSZ9893_NAME); -/* For xmit, 2 bytes are added before FCS. +/* For xmit, 2/6 bytes are added before FCS. * --------------------------------------------------------------------------- - * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|tag1(1byte)|FCS(4bytes) + * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|ts(4bytes)|tag0(1byte)|tag1(1byte)| + * FCS(4bytes) * --------------------------------------------------------------------------- + * ts : time stamp (Present only if PTP is enabled in the Hardware) * tag0 : represents tag override, lookup and valid * tag1 : each bit represents port (eg, 0x01=port1, 0x02=port2, 0x80=port8) * - * For rcv, 1 byte is added before FCS. + * For rcv, 1/5 bytes is added before FCS. * --------------------------------------------------------------------------- - * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|FCS(4bytes) + * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|ts(4bytes)|tag0(1byte)|FCS(4bytes) * --------------------------------------------------------------------------- + * ts : time stamp (Present only if bit 7 of tag0 is set) * tag0 : zero-based value represents port * (eg, 0x00=port1, 0x02=port3, 0x07=port8) */ @@ -219,11 +394,14 @@ MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ9893, KSZ9893_NAME); #define LAN937X_TAIL_TAG_BLOCKING_OVERRIDE BIT(11) #define LAN937X_TAIL_TAG_LOOKUP BIT(12) #define LAN937X_TAIL_TAG_VALID BIT(13) +#define LAN937X_TAIL_TAG_PRIO GENMASK(10, 8) #define LAN937X_TAIL_TAG_PORT_MASK 7 static struct sk_buff *lan937x_xmit(struct sk_buff *skb, struct net_device *dev) { + u16 queue_mapping = skb_get_queue_mapping(skb); + u8 prio = netdev_txq_to_tc(dev, queue_mapping); struct dsa_port *dp = dsa_slave_to_port(dev); const struct ethhdr *hdr = eth_hdr(skb); __be16 *tag; @@ -232,10 +410,14 @@ static struct sk_buff *lan937x_xmit(struct sk_buff *skb, if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb)) return NULL; + ksz_xmit_timestamp(dp, skb); + tag = skb_put(skb, LAN937X_EGRESS_TAG_LEN); val = BIT(dp->index); + val |= FIELD_PREP(LAN937X_TAIL_TAG_PRIO, prio); + if (is_link_local_ether_addr(hdr->h_dest)) val |= LAN937X_TAIL_TAG_BLOCKING_OVERRIDE; @@ -244,7 +426,7 @@ static struct sk_buff *lan937x_xmit(struct sk_buff *skb, put_unaligned_be16(val, tag); - return skb; + return ksz_defer_xmit(dp, skb); } static const struct dsa_device_ops lan937x_netdev_ops = { @@ -252,7 +434,9 @@ static const struct dsa_device_ops lan937x_netdev_ops = { .proto = DSA_TAG_PROTO_LAN937X, .xmit = lan937x_xmit, .rcv = ksz9477_rcv, - .needed_tailroom = LAN937X_EGRESS_TAG_LEN, + .connect = ksz_connect, + .disconnect = ksz_disconnect, + .needed_tailroom = LAN937X_EGRESS_TAG_LEN + KSZ_PTP_TAG_LEN, }; DSA_TAG_DRIVER(lan937x_netdev_ops); diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 228f13df2e18..504f954a1b28 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -7,5 +7,5 @@ obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o rss.o \ linkstate.o debug.o wol.o features.o privflags.o rings.o \ channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \ - tunnels.o fec.o eeprom.o stats.o phc_vclocks.o module.o \ - pse-pd.o + tunnels.o fec.o eeprom.o stats.o phc_vclocks.o mm.o \ + module.o pse-pd.o plca.o mm.o diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c index c7e37130647e..61c40e889a4d 100644 --- a/net/ethtool/channels.c +++ b/net/ethtool/channels.c @@ -86,18 +86,6 @@ static int channels_fill_reply(struct sk_buff *skb, return 0; } -const struct ethnl_request_ops ethnl_channels_request_ops = { - .request_cmd = ETHTOOL_MSG_CHANNELS_GET, - .reply_cmd = ETHTOOL_MSG_CHANNELS_GET_REPLY, - .hdr_attr = ETHTOOL_A_CHANNELS_HEADER, - .req_info_size = sizeof(struct channels_req_info), - .reply_data_size = sizeof(struct channels_reply_data), - - .prepare_data = channels_prepare_data, - .reply_size = channels_reply_size, - .fill_reply = channels_fill_reply, -}; - /* CHANNELS_SET */ const struct nla_policy ethnl_channels_set_policy[] = { @@ -109,36 +97,28 @@ const struct nla_policy ethnl_channels_set_policy[] = { [ETHTOOL_A_CHANNELS_COMBINED_COUNT] = { .type = NLA_U32 }, }; -int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info) +static int +ethnl_set_channels_validate(struct ethnl_req_info *req_info, + struct genl_info *info) +{ + const struct ethtool_ops *ops = req_info->dev->ethtool_ops; + + return ops->get_channels && ops->set_channels ? 1 : -EOPNOTSUPP; +} + +static int +ethnl_set_channels(struct ethnl_req_info *req_info, struct genl_info *info) { unsigned int from_channel, old_total, i; bool mod = false, mod_combined = false; + struct net_device *dev = req_info->dev; struct ethtool_channels channels = {}; - struct ethnl_req_info req_info = {}; struct nlattr **tb = info->attrs; u32 err_attr, max_rxfh_in_use; - const struct ethtool_ops *ops; - struct net_device *dev; u64 max_rxnfc_in_use; int ret; - ret = ethnl_parse_header_dev_get(&req_info, - tb[ETHTOOL_A_CHANNELS_HEADER], - genl_info_net(info), info->extack, - true); - if (ret < 0) - return ret; - dev = req_info.dev; - ops = dev->ethtool_ops; - ret = -EOPNOTSUPP; - if (!ops->get_channels || !ops->set_channels) - goto out_dev; - - rtnl_lock(); - ret = ethnl_ops_begin(dev); - if (ret < 0) - goto out_rtnl; - ops->get_channels(dev, &channels); + dev->ethtool_ops->get_channels(dev, &channels); old_total = channels.combined_count + max(channels.rx_count, channels.tx_count); @@ -151,9 +131,8 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info) ethnl_update_u32(&channels.combined_count, tb[ETHTOOL_A_CHANNELS_COMBINED_COUNT], &mod_combined); mod |= mod_combined; - ret = 0; if (!mod) - goto out_ops; + return 0; /* ensure new channel counts are within limits */ if (channels.rx_count > channels.max_rx) @@ -167,10 +146,9 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info) else err_attr = 0; if (err_attr) { - ret = -EINVAL; NL_SET_ERR_MSG_ATTR(info->extack, tb[err_attr], "requested channel count exceeds maximum"); - goto out_ops; + return -EINVAL; } /* ensure there is at least one RX and one TX channel */ @@ -183,10 +161,9 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info) if (err_attr) { if (mod_combined) err_attr = ETHTOOL_A_CHANNELS_COMBINED_COUNT; - ret = -EINVAL; NL_SET_ERR_MSG_ATTR(info->extack, tb[err_attr], "requested channel counts would result in no RX or TX channel being configured"); - goto out_ops; + return -EINVAL; } /* ensure the new Rx count fits within the configured Rx flow @@ -198,14 +175,12 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info) ethtool_get_max_rxfh_channel(dev, &max_rxfh_in_use)) max_rxfh_in_use = 0; if (channels.combined_count + channels.rx_count <= max_rxfh_in_use) { - ret = -EINVAL; GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing indirection table settings"); - goto out_ops; + return -EINVAL; } if (channels.combined_count + channels.rx_count <= max_rxnfc_in_use) { - ret = -EINVAL; GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing ntuple filter settings"); - goto out_ops; + return -EINVAL; } /* Disabling channels, query zero-copy AF_XDP sockets */ @@ -213,21 +188,26 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info) min(channels.rx_count, channels.tx_count); for (i = from_channel; i < old_total; i++) if (xsk_get_pool_from_qid(dev, i)) { - ret = -EINVAL; GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing zerocopy AF_XDP sockets"); - goto out_ops; + return -EINVAL; } ret = dev->ethtool_ops->set_channels(dev, &channels); - if (ret < 0) - goto out_ops; - ethtool_notify(dev, ETHTOOL_MSG_CHANNELS_NTF, NULL); - -out_ops: - ethnl_ops_complete(dev); -out_rtnl: - rtnl_unlock(); -out_dev: - ethnl_parse_header_dev_put(&req_info); - return ret; + return ret < 0 ? ret : 1; } + +const struct ethnl_request_ops ethnl_channels_request_ops = { + .request_cmd = ETHTOOL_MSG_CHANNELS_GET, + .reply_cmd = ETHTOOL_MSG_CHANNELS_GET_REPLY, + .hdr_attr = ETHTOOL_A_CHANNELS_HEADER, + .req_info_size = sizeof(struct channels_req_info), + .reply_data_size = sizeof(struct channels_reply_data), + + .prepare_data = channels_prepare_data, + .reply_size = channels_reply_size, + .fill_reply = channels_fill_reply, + + .set_validate = ethnl_set_channels_validate, + .set = ethnl_set_channels, + .set_ntf_cmd = ETHTOOL_MSG_CHANNELS_NTF, +}; diff --git a/net/ethtool/coalesce.c b/net/ethtool/coalesce.c index 487bdf345541..443e7e642c96 100644 --- a/net/ethtool/coalesce.c +++ b/net/ethtool/coalesce.c @@ -105,7 +105,10 @@ static int coalesce_reply_size(const struct ethnl_req_info *req_base, nla_total_size(sizeof(u32)) + /* _TX_MAX_FRAMES_HIGH */ nla_total_size(sizeof(u32)) + /* _RATE_SAMPLE_INTERVAL */ nla_total_size(sizeof(u8)) + /* _USE_CQE_MODE_TX */ - nla_total_size(sizeof(u8)); /* _USE_CQE_MODE_RX */ + nla_total_size(sizeof(u8)) + /* _USE_CQE_MODE_RX */ + nla_total_size(sizeof(u32)) + /* _TX_AGGR_MAX_BYTES */ + nla_total_size(sizeof(u32)) + /* _TX_AGGR_MAX_FRAMES */ + nla_total_size(sizeof(u32)); /* _TX_AGGR_TIME_USECS */ } static bool coalesce_put_u32(struct sk_buff *skb, u16 attr_type, u32 val, @@ -180,24 +183,18 @@ static int coalesce_fill_reply(struct sk_buff *skb, coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_CQE_MODE_TX, kcoal->use_cqe_mode_tx, supported) || coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_CQE_MODE_RX, - kcoal->use_cqe_mode_rx, supported)) + kcoal->use_cqe_mode_rx, supported) || + coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES, + kcoal->tx_aggr_max_bytes, supported) || + coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES, + kcoal->tx_aggr_max_frames, supported) || + coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS, + kcoal->tx_aggr_time_usecs, supported)) return -EMSGSIZE; return 0; } -const struct ethnl_request_ops ethnl_coalesce_request_ops = { - .request_cmd = ETHTOOL_MSG_COALESCE_GET, - .reply_cmd = ETHTOOL_MSG_COALESCE_GET_REPLY, - .hdr_attr = ETHTOOL_A_COALESCE_HEADER, - .req_info_size = sizeof(struct coalesce_req_info), - .reply_data_size = sizeof(struct coalesce_reply_data), - - .prepare_data = coalesce_prepare_data, - .reply_size = coalesce_reply_size, - .fill_reply = coalesce_fill_reply, -}; - /* COALESCE_SET */ const struct nla_policy ethnl_coalesce_set_policy[] = { @@ -227,51 +224,49 @@ const struct nla_policy ethnl_coalesce_set_policy[] = { [ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_USE_CQE_MODE_TX] = NLA_POLICY_MAX(NLA_U8, 1), [ETHTOOL_A_COALESCE_USE_CQE_MODE_RX] = NLA_POLICY_MAX(NLA_U8, 1), + [ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES] = { .type = NLA_U32 }, + [ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES] = { .type = NLA_U32 }, + [ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS] = { .type = NLA_U32 }, }; -int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info) +static int +ethnl_set_coalesce_validate(struct ethnl_req_info *req_info, + struct genl_info *info) { - struct kernel_ethtool_coalesce kernel_coalesce = {}; - struct ethtool_coalesce coalesce = {}; - struct ethnl_req_info req_info = {}; + const struct ethtool_ops *ops = req_info->dev->ethtool_ops; struct nlattr **tb = info->attrs; - const struct ethtool_ops *ops; - struct net_device *dev; u32 supported_params; - bool mod = false; - int ret; u16 a; - ret = ethnl_parse_header_dev_get(&req_info, - tb[ETHTOOL_A_COALESCE_HEADER], - genl_info_net(info), info->extack, - true); - if (ret < 0) - return ret; - dev = req_info.dev; - ops = dev->ethtool_ops; - ret = -EOPNOTSUPP; if (!ops->get_coalesce || !ops->set_coalesce) - goto out_dev; + return -EOPNOTSUPP; /* make sure that only supported parameters are present */ supported_params = ops->supported_coalesce_params; for (a = ETHTOOL_A_COALESCE_RX_USECS; a < __ETHTOOL_A_COALESCE_CNT; a++) if (tb[a] && !(supported_params & attr_to_mask(a))) { - ret = -EINVAL; NL_SET_ERR_MSG_ATTR(info->extack, tb[a], "cannot modify an unsupported parameter"); - goto out_dev; + return -EINVAL; } - rtnl_lock(); - ret = ethnl_ops_begin(dev); - if (ret < 0) - goto out_rtnl; - ret = ops->get_coalesce(dev, &coalesce, &kernel_coalesce, - info->extack); + return 1; +} + +static int +ethnl_set_coalesce(struct ethnl_req_info *req_info, struct genl_info *info) +{ + struct kernel_ethtool_coalesce kernel_coalesce = {}; + struct net_device *dev = req_info->dev; + struct ethtool_coalesce coalesce = {}; + struct nlattr **tb = info->attrs; + bool mod = false; + int ret; + + ret = dev->ethtool_ops->get_coalesce(dev, &coalesce, &kernel_coalesce, + info->extack); if (ret < 0) - goto out_ops; + return ret; ethnl_update_u32(&coalesce.rx_coalesce_usecs, tb[ETHTOOL_A_COALESCE_RX_USECS], &mod); @@ -321,21 +316,32 @@ int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info) tb[ETHTOOL_A_COALESCE_USE_CQE_MODE_TX], &mod); ethnl_update_u8(&kernel_coalesce.use_cqe_mode_rx, tb[ETHTOOL_A_COALESCE_USE_CQE_MODE_RX], &mod); - ret = 0; + ethnl_update_u32(&kernel_coalesce.tx_aggr_max_bytes, + tb[ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES], &mod); + ethnl_update_u32(&kernel_coalesce.tx_aggr_max_frames, + tb[ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES], &mod); + ethnl_update_u32(&kernel_coalesce.tx_aggr_time_usecs, + tb[ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS], &mod); if (!mod) - goto out_ops; + return 0; ret = dev->ethtool_ops->set_coalesce(dev, &coalesce, &kernel_coalesce, info->extack); - if (ret < 0) - goto out_ops; - ethtool_notify(dev, ETHTOOL_MSG_COALESCE_NTF, NULL); - -out_ops: - ethnl_ops_complete(dev); -out_rtnl: - rtnl_unlock(); -out_dev: - ethnl_parse_header_dev_put(&req_info); - return ret; + return ret < 0 ? ret : 1; } + +const struct ethnl_request_ops ethnl_coalesce_request_ops = { + .request_cmd = ETHTOOL_MSG_COALESCE_GET, + .reply_cmd = ETHTOOL_MSG_COALESCE_GET_REPLY, + .hdr_attr = ETHTOOL_A_COALESCE_HEADER, + .req_info_size = sizeof(struct coalesce_req_info), + .reply_data_size = sizeof(struct coalesce_reply_data), + + .prepare_data = coalesce_prepare_data, + .reply_size = coalesce_reply_size, + .fill_reply = coalesce_fill_reply, + + .set_validate = ethnl_set_coalesce_validate, + .set = ethnl_set_coalesce, + .set_ntf_cmd = ETHTOOL_MSG_COALESCE_NTF, +}; diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 6f399afc2ff2..5fb19050991e 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -208,6 +208,9 @@ const char link_mode_names[][ETH_GSTRING_LEN] = { __DEFINE_LINK_MODE_NAME(800000, DR8_2, Full), __DEFINE_LINK_MODE_NAME(800000, SR8, Full), __DEFINE_LINK_MODE_NAME(800000, VR8, Full), + __DEFINE_LINK_MODE_NAME(10, T1S, Full), + __DEFINE_LINK_MODE_NAME(10, T1S, Half), + __DEFINE_LINK_MODE_NAME(10, T1S_P2MP, Half), }; static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); @@ -244,6 +247,8 @@ static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); #define __LINK_MODE_LANES_X 1 #define __LINK_MODE_LANES_FX 1 #define __LINK_MODE_LANES_T1L 1 +#define __LINK_MODE_LANES_T1S 1 +#define __LINK_MODE_LANES_T1S_P2MP 1 #define __LINK_MODE_LANES_VR8 8 #define __LINK_MODE_LANES_DR8_2 8 @@ -366,6 +371,9 @@ const struct link_mode_info link_mode_params[] = { __DEFINE_LINK_MODE_PARAMS(800000, DR8_2, Full), __DEFINE_LINK_MODE_PARAMS(800000, SR8, Full), __DEFINE_LINK_MODE_PARAMS(800000, VR8, Full), + __DEFINE_LINK_MODE_PARAMS(10, T1S, Full), + __DEFINE_LINK_MODE_PARAMS(10, T1S, Half), + __DEFINE_LINK_MODE_PARAMS(10, T1S_P2MP, Half), }; static_assert(ARRAY_SIZE(link_mode_params) == __ETHTOOL_LINK_MODE_MASK_NBITS); diff --git a/net/ethtool/common.h b/net/ethtool/common.h index b1b9db810eca..28b8aaaf9bcb 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -54,4 +54,6 @@ int ethtool_get_module_info_call(struct net_device *dev, int ethtool_get_module_eeprom_call(struct net_device *dev, struct ethtool_eeprom *ee, u8 *data); +bool __ethtool_dev_mm_supported(struct net_device *dev); + #endif /* _ETHTOOL_COMMON_H */ diff --git a/net/ethtool/debug.c b/net/ethtool/debug.c index d73888c7d19c..e4369769817e 100644 --- a/net/ethtool/debug.c +++ b/net/ethtool/debug.c @@ -63,18 +63,6 @@ static int debug_fill_reply(struct sk_buff *skb, netif_msg_class_names, compact); } -const struct ethnl_request_ops ethnl_debug_request_ops = { - .request_cmd = ETHTOOL_MSG_DEBUG_GET, - .reply_cmd = ETHTOOL_MSG_DEBUG_GET_REPLY, - .hdr_attr = ETHTOOL_A_DEBUG_HEADER, - .req_info_size = sizeof(struct debug_req_info), - .reply_data_size = sizeof(struct debug_reply_data), - - .prepare_data = debug_prepare_data, - .reply_size = debug_reply_size, - .fill_reply = debug_fill_reply, -}; - /* DEBUG_SET */ const struct nla_policy ethnl_debug_set_policy[] = { @@ -83,46 +71,47 @@ const struct nla_policy ethnl_debug_set_policy[] = { [ETHTOOL_A_DEBUG_MSGMASK] = { .type = NLA_NESTED }, }; -int ethnl_set_debug(struct sk_buff *skb, struct genl_info *info) +static int +ethnl_set_debug_validate(struct ethnl_req_info *req_info, + struct genl_info *info) { - struct ethnl_req_info req_info = {}; + const struct ethtool_ops *ops = req_info->dev->ethtool_ops; + + return ops->get_msglevel && ops->set_msglevel ? 1 : -EOPNOTSUPP; +} + +static int +ethnl_set_debug(struct ethnl_req_info *req_info, struct genl_info *info) +{ + struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; - struct net_device *dev; bool mod = false; u32 msg_mask; int ret; - ret = ethnl_parse_header_dev_get(&req_info, - tb[ETHTOOL_A_DEBUG_HEADER], - genl_info_net(info), info->extack, - true); - if (ret < 0) - return ret; - dev = req_info.dev; - ret = -EOPNOTSUPP; - if (!dev->ethtool_ops->get_msglevel || !dev->ethtool_ops->set_msglevel) - goto out_dev; - - rtnl_lock(); - ret = ethnl_ops_begin(dev); - if (ret < 0) - goto out_rtnl; - msg_mask = dev->ethtool_ops->get_msglevel(dev); ret = ethnl_update_bitset32(&msg_mask, NETIF_MSG_CLASS_COUNT, tb[ETHTOOL_A_DEBUG_MSGMASK], netif_msg_class_names, info->extack, &mod); if (ret < 0 || !mod) - goto out_ops; + return ret; dev->ethtool_ops->set_msglevel(dev, msg_mask); - ethtool_notify(dev, ETHTOOL_MSG_DEBUG_NTF, NULL); - -out_ops: - ethnl_ops_complete(dev); -out_rtnl: - rtnl_unlock(); -out_dev: - ethnl_parse_header_dev_put(&req_info); - return ret; + return 1; } + +const struct ethnl_request_ops ethnl_debug_request_ops = { + .request_cmd = ETHTOOL_MSG_DEBUG_GET, + .reply_cmd = ETHTOOL_MSG_DEBUG_GET_REPLY, + .hdr_attr = ETHTOOL_A_DEBUG_HEADER, + .req_info_size = sizeof(struct debug_req_info), + .reply_data_size = sizeof(struct debug_reply_data), + + .prepare_data = debug_prepare_data, + .reply_size = debug_reply_size, + .fill_reply = debug_fill_reply, + + .set_validate = ethnl_set_debug_validate, + .set = ethnl_set_debug, + .set_ntf_cmd = ETHTOOL_MSG_DEBUG_NTF, +}; diff --git a/net/ethtool/eee.c b/net/ethtool/eee.c index 45c42b2d5f17..42104bcb0e47 100644 --- a/net/ethtool/eee.c +++ b/net/ethtool/eee.c @@ -108,18 +108,6 @@ static int eee_fill_reply(struct sk_buff *skb, return 0; } -const struct ethnl_request_ops ethnl_eee_request_ops = { - .request_cmd = ETHTOOL_MSG_EEE_GET, - .reply_cmd = ETHTOOL_MSG_EEE_GET_REPLY, - .hdr_attr = ETHTOOL_A_EEE_HEADER, - .req_info_size = sizeof(struct eee_req_info), - .reply_data_size = sizeof(struct eee_reply_data), - - .prepare_data = eee_prepare_data, - .reply_size = eee_reply_size, - .fill_reply = eee_fill_reply, -}; - /* EEE_SET */ const struct nla_policy ethnl_eee_set_policy[] = { @@ -131,60 +119,56 @@ const struct nla_policy ethnl_eee_set_policy[] = { [ETHTOOL_A_EEE_TX_LPI_TIMER] = { .type = NLA_U32 }, }; -int ethnl_set_eee(struct sk_buff *skb, struct genl_info *info) +static int +ethnl_set_eee_validate(struct ethnl_req_info *req_info, struct genl_info *info) { - struct ethnl_req_info req_info = {}; + const struct ethtool_ops *ops = req_info->dev->ethtool_ops; + + return ops->get_eee && ops->set_eee ? 1 : -EOPNOTSUPP; +} + +static int +ethnl_set_eee(struct ethnl_req_info *req_info, struct genl_info *info) +{ + struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; - const struct ethtool_ops *ops; struct ethtool_eee eee = {}; - struct net_device *dev; bool mod = false; int ret; - ret = ethnl_parse_header_dev_get(&req_info, - tb[ETHTOOL_A_EEE_HEADER], - genl_info_net(info), info->extack, - true); + ret = dev->ethtool_ops->get_eee(dev, &eee); if (ret < 0) return ret; - dev = req_info.dev; - ops = dev->ethtool_ops; - ret = -EOPNOTSUPP; - if (!ops->get_eee || !ops->set_eee) - goto out_dev; - - rtnl_lock(); - ret = ethnl_ops_begin(dev); - if (ret < 0) - goto out_rtnl; - ret = ops->get_eee(dev, &eee); - if (ret < 0) - goto out_ops; ret = ethnl_update_bitset32(&eee.advertised, EEE_MODES_COUNT, tb[ETHTOOL_A_EEE_MODES_OURS], link_mode_names, info->extack, &mod); if (ret < 0) - goto out_ops; + return ret; ethnl_update_bool32(&eee.eee_enabled, tb[ETHTOOL_A_EEE_ENABLED], &mod); ethnl_update_bool32(&eee.tx_lpi_enabled, tb[ETHTOOL_A_EEE_TX_LPI_ENABLED], &mod); ethnl_update_u32(&eee.tx_lpi_timer, tb[ETHTOOL_A_EEE_TX_LPI_TIMER], &mod); - ret = 0; if (!mod) - goto out_ops; + return 0; ret = dev->ethtool_ops->set_eee(dev, &eee); - if (ret < 0) - goto out_ops; - ethtool_notify(dev, ETHTOOL_MSG_EEE_NTF, NULL); - -out_ops: - ethnl_ops_complete(dev); -out_rtnl: - rtnl_unlock(); -out_dev: - ethnl_parse_header_dev_put(&req_info); - return ret; + return ret < 0 ? ret : 1; } + +const struct ethnl_request_ops ethnl_eee_request_ops = { + .request_cmd = ETHTOOL_MSG_EEE_GET, + .reply_cmd = ETHTOOL_MSG_EEE_GET_REPLY, + .hdr_attr = ETHTOOL_A_EEE_HEADER, + .req_info_size = sizeof(struct eee_req_info), + .reply_data_size = sizeof(struct eee_reply_data), + + .prepare_data = eee_prepare_data, + .reply_size = eee_reply_size, + .fill_reply = eee_fill_reply, + + .set_validate = ethnl_set_eee_validate, + .set = ethnl_set_eee, + .set_ntf_cmd = ETHTOOL_MSG_EEE_NTF, +}; diff --git a/net/ethtool/fec.c b/net/ethtool/fec.c index 9f5a134e2e01..0d9a3d153170 100644 --- a/net/ethtool/fec.c +++ b/net/ethtool/fec.c @@ -217,18 +217,6 @@ static int fec_fill_reply(struct sk_buff *skb, return 0; } -const struct ethnl_request_ops ethnl_fec_request_ops = { - .request_cmd = ETHTOOL_MSG_FEC_GET, - .reply_cmd = ETHTOOL_MSG_FEC_GET_REPLY, - .hdr_attr = ETHTOOL_A_FEC_HEADER, - .req_info_size = sizeof(struct fec_req_info), - .reply_data_size = sizeof(struct fec_reply_data), - - .prepare_data = fec_prepare_data, - .reply_size = fec_reply_size, - .fill_reply = fec_fill_reply, -}; - /* FEC_SET */ const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1] = { @@ -237,36 +225,28 @@ const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1] = { [ETHTOOL_A_FEC_AUTO] = NLA_POLICY_MAX(NLA_U8, 1), }; -int ethnl_set_fec(struct sk_buff *skb, struct genl_info *info) +static int +ethnl_set_fec_validate(struct ethnl_req_info *req_info, struct genl_info *info) +{ + const struct ethtool_ops *ops = req_info->dev->ethtool_ops; + + return ops->get_fecparam && ops->set_fecparam ? 1 : -EOPNOTSUPP; +} + +static int +ethnl_set_fec(struct ethnl_req_info *req_info, struct genl_info *info) { __ETHTOOL_DECLARE_LINK_MODE_MASK(fec_link_modes) = {}; - struct ethnl_req_info req_info = {}; + struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; struct ethtool_fecparam fec = {}; - const struct ethtool_ops *ops; - struct net_device *dev; bool mod = false; u8 fec_auto; int ret; - ret = ethnl_parse_header_dev_get(&req_info, tb[ETHTOOL_A_FEC_HEADER], - genl_info_net(info), info->extack, - true); + ret = dev->ethtool_ops->get_fecparam(dev, &fec); if (ret < 0) return ret; - dev = req_info.dev; - ops = dev->ethtool_ops; - ret = -EOPNOTSUPP; - if (!ops->get_fecparam || !ops->set_fecparam) - goto out_dev; - - rtnl_lock(); - ret = ethnl_ops_begin(dev); - if (ret < 0) - goto out_rtnl; - ret = ops->get_fecparam(dev, &fec); - if (ret < 0) - goto out_ops; ethtool_fec_to_link_modes(fec.fec, fec_link_modes, &fec_auto); @@ -275,36 +255,39 @@ int ethnl_set_fec(struct sk_buff *skb, struct genl_info *info) tb[ETHTOOL_A_FEC_MODES], link_mode_names, info->extack, &mod); if (ret < 0) - goto out_ops; + return ret; ethnl_update_u8(&fec_auto, tb[ETHTOOL_A_FEC_AUTO], &mod); - - ret = 0; if (!mod) - goto out_ops; + return 0; ret = ethtool_link_modes_to_fecparam(&fec, fec_link_modes, fec_auto); if (ret) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_FEC_MODES], "invalid FEC modes requested"); - goto out_ops; + return ret; } if (!fec.fec) { - ret = -EINVAL; NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_FEC_MODES], "no FEC modes set"); - goto out_ops; + return -EINVAL; } ret = dev->ethtool_ops->set_fecparam(dev, &fec); - if (ret < 0) - goto out_ops; - ethtool_notify(dev, ETHTOOL_MSG_FEC_NTF, NULL); - -out_ops: - ethnl_ops_complete(dev); -out_rtnl: - rtnl_unlock(); -out_dev: - ethnl_parse_header_dev_put(&req_info); - return ret; + return ret < 0 ? ret : 1; } + +const struct ethnl_request_ops ethnl_fec_request_ops = { + .request_cmd = ETHTOOL_MSG_FEC_GET, + .reply_cmd = ETHTOOL_MSG_FEC_GET_REPLY, + .hdr_attr = ETHTOOL_A_FEC_HEADER, + .req_info_size = sizeof(struct fec_req_info), + .reply_data_size = sizeof(struct fec_reply_data), + + .prepare_data = fec_prepare_data, + .reply_size = fec_reply_size, + .fill_reply = fec_fill_reply, + + .set_validate = ethnl_set_fec_validate, + .set = ethnl_set_fec, + .set_ntf_cmd = ETHTOOL_MSG_FEC_NTF, +}; diff --git a/net/ethtool/linkinfo.c b/net/ethtool/linkinfo.c index efa0f7f48836..310dfe63292a 100644 --- a/net/ethtool/linkinfo.c +++ b/net/ethtool/linkinfo.c @@ -73,18 +73,6 @@ static int linkinfo_fill_reply(struct sk_buff *skb, return 0; } -const struct ethnl_request_ops ethnl_linkinfo_request_ops = { - .request_cmd = ETHTOOL_MSG_LINKINFO_GET, - .reply_cmd = ETHTOOL_MSG_LINKINFO_GET_REPLY, - .hdr_attr = ETHTOOL_A_LINKINFO_HEADER, - .req_info_size = sizeof(struct linkinfo_req_info), - .reply_data_size = sizeof(struct linkinfo_reply_data), - - .prepare_data = linkinfo_prepare_data, - .reply_size = linkinfo_reply_size, - .fill_reply = linkinfo_fill_reply, -}; - /* LINKINFO_SET */ const struct nla_policy ethnl_linkinfo_set_policy[] = { @@ -95,37 +83,31 @@ const struct nla_policy ethnl_linkinfo_set_policy[] = { [ETHTOOL_A_LINKINFO_TP_MDIX_CTRL] = { .type = NLA_U8 }, }; -int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info) +static int +ethnl_set_linkinfo_validate(struct ethnl_req_info *req_info, + struct genl_info *info) +{ + const struct ethtool_ops *ops = req_info->dev->ethtool_ops; + + if (!ops->get_link_ksettings || !ops->set_link_ksettings) + return -EOPNOTSUPP; + return 1; +} + +static int +ethnl_set_linkinfo(struct ethnl_req_info *req_info, struct genl_info *info) { struct ethtool_link_ksettings ksettings = {}; struct ethtool_link_settings *lsettings; - struct ethnl_req_info req_info = {}; + struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; - struct net_device *dev; bool mod = false; int ret; - ret = ethnl_parse_header_dev_get(&req_info, - tb[ETHTOOL_A_LINKINFO_HEADER], - genl_info_net(info), info->extack, - true); - if (ret < 0) - return ret; - dev = req_info.dev; - ret = -EOPNOTSUPP; - if (!dev->ethtool_ops->get_link_ksettings || - !dev->ethtool_ops->set_link_ksettings) - goto out_dev; - - rtnl_lock(); - ret = ethnl_ops_begin(dev); - if (ret < 0) - goto out_rtnl; - ret = __ethtool_get_link_ksettings(dev, &ksettings); if (ret < 0) { GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); - goto out_ops; + return ret; } lsettings = &ksettings.base; @@ -134,21 +116,30 @@ int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info) &mod); ethnl_update_u8(&lsettings->eth_tp_mdix_ctrl, tb[ETHTOOL_A_LINKINFO_TP_MDIX_CTRL], &mod); - ret = 0; if (!mod) - goto out_ops; + return 0; ret = dev->ethtool_ops->set_link_ksettings(dev, &ksettings); - if (ret < 0) + if (ret < 0) { GENL_SET_ERR_MSG(info, "link settings update failed"); - else - ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF, NULL); + return ret; + } -out_ops: - ethnl_ops_complete(dev); -out_rtnl: - rtnl_unlock(); -out_dev: - ethnl_parse_header_dev_put(&req_info); - return ret; + return 1; } + +const struct ethnl_request_ops ethnl_linkinfo_request_ops = { + .request_cmd = ETHTOOL_MSG_LINKINFO_GET, + .reply_cmd = ETHTOOL_MSG_LINKINFO_GET_REPLY, + .hdr_attr = ETHTOOL_A_LINKINFO_HEADER, + .req_info_size = sizeof(struct linkinfo_req_info), + .reply_data_size = sizeof(struct linkinfo_reply_data), + + .prepare_data = linkinfo_prepare_data, + .reply_size = linkinfo_reply_size, + .fill_reply = linkinfo_fill_reply, + + .set_validate = ethnl_set_linkinfo_validate, + .set = ethnl_set_linkinfo, + .set_ntf_cmd = ETHTOOL_MSG_LINKINFO_NTF, +}; diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c index 126e06c713a3..fab66c169b9f 100644 --- a/net/ethtool/linkmodes.c +++ b/net/ethtool/linkmodes.c @@ -151,18 +151,6 @@ static int linkmodes_fill_reply(struct sk_buff *skb, return 0; } -const struct ethnl_request_ops ethnl_linkmodes_request_ops = { - .request_cmd = ETHTOOL_MSG_LINKMODES_GET, - .reply_cmd = ETHTOOL_MSG_LINKMODES_GET_REPLY, - .hdr_attr = ETHTOOL_A_LINKMODES_HEADER, - .req_info_size = sizeof(struct linkmodes_req_info), - .reply_data_size = sizeof(struct linkmodes_reply_data), - - .prepare_data = linkmodes_prepare_data, - .reply_size = linkmodes_reply_size, - .fill_reply = linkmodes_fill_reply, -}; - /* LINKMODES_SET */ const struct nla_policy ethnl_linkmodes_set_policy[] = { @@ -310,59 +298,64 @@ static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb, return 0; } -int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info) +static int +ethnl_set_linkmodes_validate(struct ethnl_req_info *req_info, + struct genl_info *info) { - struct ethtool_link_ksettings ksettings = {}; - struct ethnl_req_info req_info = {}; - struct nlattr **tb = info->attrs; - struct net_device *dev; - bool mod = false; + const struct ethtool_ops *ops = req_info->dev->ethtool_ops; int ret; - ret = ethnl_check_linkmodes(info, tb); + ret = ethnl_check_linkmodes(info, info->attrs); if (ret < 0) return ret; - ret = ethnl_parse_header_dev_get(&req_info, - tb[ETHTOOL_A_LINKMODES_HEADER], - genl_info_net(info), info->extack, - true); - if (ret < 0) - return ret; - dev = req_info.dev; - ret = -EOPNOTSUPP; - if (!dev->ethtool_ops->get_link_ksettings || - !dev->ethtool_ops->set_link_ksettings) - goto out_dev; + if (!ops->get_link_ksettings || !ops->set_link_ksettings) + return -EOPNOTSUPP; + return 1; +} - rtnl_lock(); - ret = ethnl_ops_begin(dev); - if (ret < 0) - goto out_rtnl; +static int +ethnl_set_linkmodes(struct ethnl_req_info *req_info, struct genl_info *info) +{ + struct ethtool_link_ksettings ksettings = {}; + struct net_device *dev = req_info->dev; + struct nlattr **tb = info->attrs; + bool mod = false; + int ret; ret = __ethtool_get_link_ksettings(dev, &ksettings); if (ret < 0) { GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); - goto out_ops; + return ret; } ret = ethnl_update_linkmodes(info, tb, &ksettings, &mod, dev); if (ret < 0) - goto out_ops; + return ret; + if (!mod) + return 0; - if (mod) { - ret = dev->ethtool_ops->set_link_ksettings(dev, &ksettings); - if (ret < 0) - GENL_SET_ERR_MSG(info, "link settings update failed"); - else - ethtool_notify(dev, ETHTOOL_MSG_LINKMODES_NTF, NULL); + ret = dev->ethtool_ops->set_link_ksettings(dev, &ksettings); + if (ret < 0) { + GENL_SET_ERR_MSG(info, "link settings update failed"); + return ret; } -out_ops: - ethnl_ops_complete(dev); -out_rtnl: - rtnl_unlock(); -out_dev: - ethnl_parse_header_dev_put(&req_info); - return ret; + return 1; } + +const struct ethnl_request_ops ethnl_linkmodes_request_ops = { + .request_cmd = ETHTOOL_MSG_LINKMODES_GET, + .reply_cmd = ETHTOOL_MSG_LINKMODES_GET_REPLY, + .hdr_attr = ETHTOOL_A_LINKMODES_HEADER, + .req_info_size = sizeof(struct linkmodes_req_info), + .reply_data_size = sizeof(struct linkmodes_reply_data), + + .prepare_data = linkmodes_prepare_data, + .reply_size = linkmodes_reply_size, + .fill_reply = linkmodes_fill_reply, + + .set_validate = ethnl_set_linkmodes_validate, + .set = ethnl_set_linkmodes, + .set_ntf_cmd = ETHTOOL_MSG_LINKMODES_NTF, +}; diff --git a/net/ethtool/mm.c b/net/ethtool/mm.c new file mode 100644 index 000000000000..e612856eed8c --- /dev/null +++ b/net/ethtool/mm.c @@ -0,0 +1,251 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2022-2023 NXP + */ +#include "common.h" +#include "netlink.h" + +struct mm_req_info { + struct ethnl_req_info base; +}; + +struct mm_reply_data { + struct ethnl_reply_data base; + struct ethtool_mm_state state; + struct ethtool_mm_stats stats; +}; + +#define MM_REPDATA(__reply_base) \ + container_of(__reply_base, struct mm_reply_data, base) + +#define ETHTOOL_MM_STAT_CNT \ + (__ETHTOOL_A_MM_STAT_CNT - (ETHTOOL_A_MM_STAT_PAD + 1)) + +const struct nla_policy ethnl_mm_get_policy[ETHTOOL_A_MM_HEADER + 1] = { + [ETHTOOL_A_MM_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_stats), +}; + +static int mm_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct mm_reply_data *data = MM_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + const struct ethtool_ops *ops; + int ret; + + ops = dev->ethtool_ops; + + if (!ops->get_mm) + return -EOPNOTSUPP; + + ethtool_stats_init((u64 *)&data->stats, + sizeof(data->stats) / sizeof(u64)); + + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + + ret = ops->get_mm(dev, &data->state); + if (ret) + goto out_complete; + + if (ops->get_mm_stats && (req_base->flags & ETHTOOL_FLAG_STATS)) + ops->get_mm_stats(dev, &data->stats); + +out_complete: + ethnl_ops_complete(dev); + + return ret; +} + +static int mm_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + int len = 0; + + len += nla_total_size(sizeof(u8)); /* _MM_PMAC_ENABLED */ + len += nla_total_size(sizeof(u8)); /* _MM_TX_ENABLED */ + len += nla_total_size(sizeof(u8)); /* _MM_TX_ACTIVE */ + len += nla_total_size(sizeof(u8)); /* _MM_VERIFY_ENABLED */ + len += nla_total_size(sizeof(u8)); /* _MM_VERIFY_STATUS */ + len += nla_total_size(sizeof(u32)); /* _MM_VERIFY_TIME */ + len += nla_total_size(sizeof(u32)); /* _MM_MAX_VERIFY_TIME */ + len += nla_total_size(sizeof(u32)); /* _MM_TX_MIN_FRAG_SIZE */ + len += nla_total_size(sizeof(u32)); /* _MM_RX_MIN_FRAG_SIZE */ + + if (req_base->flags & ETHTOOL_FLAG_STATS) + len += nla_total_size(0) + /* _MM_STATS */ + nla_total_size_64bit(sizeof(u64)) * ETHTOOL_MM_STAT_CNT; + + return len; +} + +static int mm_put_stat(struct sk_buff *skb, u64 val, u16 attrtype) +{ + if (val == ETHTOOL_STAT_NOT_SET) + return 0; + if (nla_put_u64_64bit(skb, attrtype, val, ETHTOOL_A_MM_STAT_PAD)) + return -EMSGSIZE; + return 0; +} + +static int mm_put_stats(struct sk_buff *skb, + const struct ethtool_mm_stats *stats) +{ + struct nlattr *nest; + + nest = nla_nest_start(skb, ETHTOOL_A_MM_STATS); + if (!nest) + return -EMSGSIZE; + + if (mm_put_stat(skb, stats->MACMergeFrameAssErrorCount, + ETHTOOL_A_MM_STAT_REASSEMBLY_ERRORS) || + mm_put_stat(skb, stats->MACMergeFrameSmdErrorCount, + ETHTOOL_A_MM_STAT_SMD_ERRORS) || + mm_put_stat(skb, stats->MACMergeFrameAssOkCount, + ETHTOOL_A_MM_STAT_REASSEMBLY_OK) || + mm_put_stat(skb, stats->MACMergeFragCountRx, + ETHTOOL_A_MM_STAT_RX_FRAG_COUNT) || + mm_put_stat(skb, stats->MACMergeFragCountTx, + ETHTOOL_A_MM_STAT_TX_FRAG_COUNT) || + mm_put_stat(skb, stats->MACMergeHoldCount, + ETHTOOL_A_MM_STAT_HOLD_COUNT)) + goto err_cancel; + + nla_nest_end(skb, nest); + return 0; + +err_cancel: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int mm_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct mm_reply_data *data = MM_REPDATA(reply_base); + const struct ethtool_mm_state *state = &data->state; + + if (nla_put_u8(skb, ETHTOOL_A_MM_TX_ENABLED, state->tx_enabled) || + nla_put_u8(skb, ETHTOOL_A_MM_TX_ACTIVE, state->tx_active) || + nla_put_u8(skb, ETHTOOL_A_MM_PMAC_ENABLED, state->pmac_enabled) || + nla_put_u8(skb, ETHTOOL_A_MM_VERIFY_ENABLED, state->verify_enabled) || + nla_put_u8(skb, ETHTOOL_A_MM_VERIFY_STATUS, state->verify_status) || + nla_put_u32(skb, ETHTOOL_A_MM_VERIFY_TIME, state->verify_time) || + nla_put_u32(skb, ETHTOOL_A_MM_MAX_VERIFY_TIME, state->max_verify_time) || + nla_put_u32(skb, ETHTOOL_A_MM_TX_MIN_FRAG_SIZE, state->tx_min_frag_size) || + nla_put_u32(skb, ETHTOOL_A_MM_RX_MIN_FRAG_SIZE, state->rx_min_frag_size)) + return -EMSGSIZE; + + if (req_base->flags & ETHTOOL_FLAG_STATS && + mm_put_stats(skb, &data->stats)) + return -EMSGSIZE; + + return 0; +} + +const struct nla_policy ethnl_mm_set_policy[ETHTOOL_A_MM_MAX + 1] = { + [ETHTOOL_A_MM_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), + [ETHTOOL_A_MM_VERIFY_ENABLED] = NLA_POLICY_MAX(NLA_U8, 1), + [ETHTOOL_A_MM_VERIFY_TIME] = NLA_POLICY_RANGE(NLA_U32, 1, 128), + [ETHTOOL_A_MM_TX_ENABLED] = NLA_POLICY_MAX(NLA_U8, 1), + [ETHTOOL_A_MM_PMAC_ENABLED] = NLA_POLICY_MAX(NLA_U8, 1), + [ETHTOOL_A_MM_TX_MIN_FRAG_SIZE] = NLA_POLICY_RANGE(NLA_U32, 60, 252), +}; + +static void mm_state_to_cfg(const struct ethtool_mm_state *state, + struct ethtool_mm_cfg *cfg) +{ + /* We could also compare state->verify_status against + * ETHTOOL_MM_VERIFY_STATUS_DISABLED, but state->verify_enabled + * is more like an administrative state which should be seen in + * ETHTOOL_MSG_MM_GET replies. For example, a port with verification + * disabled might be in the ETHTOOL_MM_VERIFY_STATUS_INITIAL + * if it's down. + */ + cfg->verify_enabled = state->verify_enabled; + cfg->verify_time = state->verify_time; + cfg->tx_enabled = state->tx_enabled; + cfg->pmac_enabled = state->pmac_enabled; + cfg->tx_min_frag_size = state->tx_min_frag_size; +} + +static int +ethnl_set_mm_validate(struct ethnl_req_info *req_info, struct genl_info *info) +{ + const struct ethtool_ops *ops = req_info->dev->ethtool_ops; + + return ops->get_mm && ops->set_mm ? 1 : -EOPNOTSUPP; +} + +static int ethnl_set_mm(struct ethnl_req_info *req_info, struct genl_info *info) +{ + struct netlink_ext_ack *extack = info->extack; + struct net_device *dev = req_info->dev; + struct ethtool_mm_state state = {}; + struct nlattr **tb = info->attrs; + struct ethtool_mm_cfg cfg = {}; + bool mod = false; + int ret; + + ret = dev->ethtool_ops->get_mm(dev, &state); + if (ret) + return ret; + + mm_state_to_cfg(&state, &cfg); + + ethnl_update_bool(&cfg.verify_enabled, tb[ETHTOOL_A_MM_VERIFY_ENABLED], + &mod); + ethnl_update_u32(&cfg.verify_time, tb[ETHTOOL_A_MM_VERIFY_TIME], &mod); + ethnl_update_bool(&cfg.tx_enabled, tb[ETHTOOL_A_MM_TX_ENABLED], &mod); + ethnl_update_bool(&cfg.pmac_enabled, tb[ETHTOOL_A_MM_PMAC_ENABLED], + &mod); + ethnl_update_u32(&cfg.tx_min_frag_size, + tb[ETHTOOL_A_MM_TX_MIN_FRAG_SIZE], &mod); + + if (!mod) + return 0; + + if (cfg.verify_time > state.max_verify_time) { + NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MM_VERIFY_TIME], + "verifyTime exceeds device maximum"); + return -ERANGE; + } + + ret = dev->ethtool_ops->set_mm(dev, &cfg, extack); + return ret < 0 ? ret : 1; +} + +const struct ethnl_request_ops ethnl_mm_request_ops = { + .request_cmd = ETHTOOL_MSG_MM_GET, + .reply_cmd = ETHTOOL_MSG_MM_GET_REPLY, + .hdr_attr = ETHTOOL_A_MM_HEADER, + .req_info_size = sizeof(struct mm_req_info), + .reply_data_size = sizeof(struct mm_reply_data), + + .prepare_data = mm_prepare_data, + .reply_size = mm_reply_size, + .fill_reply = mm_fill_reply, + + .set_validate = ethnl_set_mm_validate, + .set = ethnl_set_mm, + .set_ntf_cmd = ETHTOOL_MSG_MM_NTF, +}; + +/* Returns whether a given device supports the MAC merge layer + * (has an eMAC and a pMAC). Must be called under rtnl_lock() and + * ethnl_ops_begin(). + */ +bool __ethtool_dev_mm_supported(struct net_device *dev) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_mm_state state = {}; + int ret = -EOPNOTSUPP; + + if (ops && ops->get_mm) + ret = ops->get_mm(dev, &state); + + return !!ret; +} diff --git a/net/ethtool/module.c b/net/ethtool/module.c index 898ed436b9e4..e0d539b21423 100644 --- a/net/ethtool/module.c +++ b/net/ethtool/module.c @@ -91,18 +91,6 @@ static int module_fill_reply(struct sk_buff *skb, return 0; } -const struct ethnl_request_ops ethnl_module_request_ops = { - .request_cmd = ETHTOOL_MSG_MODULE_GET, - .reply_cmd = ETHTOOL_MSG_MODULE_GET_REPLY, - .hdr_attr = ETHTOOL_A_MODULE_HEADER, - .req_info_size = sizeof(struct module_req_info), - .reply_data_size = sizeof(struct module_reply_data), - - .prepare_data = module_prepare_data, - .reply_size = module_reply_size, - .fill_reply = module_fill_reply, -}; - /* MODULE_SET */ const struct nla_policy ethnl_module_set_policy[ETHTOOL_A_MODULE_POWER_MODE_POLICY + 1] = { @@ -112,69 +100,62 @@ const struct nla_policy ethnl_module_set_policy[ETHTOOL_A_MODULE_POWER_MODE_POLI ETHTOOL_MODULE_POWER_MODE_POLICY_AUTO), }; -static int module_set_power_mode(struct net_device *dev, struct nlattr **tb, - bool *p_mod, struct netlink_ext_ack *extack) +static int +ethnl_set_module_validate(struct ethnl_req_info *req_info, + struct genl_info *info) { - struct ethtool_module_power_mode_params power = {}; - struct ethtool_module_power_mode_params power_new; - const struct ethtool_ops *ops = dev->ethtool_ops; - int ret; + const struct ethtool_ops *ops = req_info->dev->ethtool_ops; + struct nlattr **tb = info->attrs; if (!tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY]) return 0; if (!ops->get_module_power_mode || !ops->set_module_power_mode) { - NL_SET_ERR_MSG_ATTR(extack, + NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY], "Setting power mode policy is not supported by this device"); return -EOPNOTSUPP; } - power_new.policy = nla_get_u8(tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY]); - ret = ops->get_module_power_mode(dev, &power, extack); - if (ret < 0) - return ret; - - if (power_new.policy == power.policy) - return 0; - *p_mod = true; - - return ops->set_module_power_mode(dev, &power_new, extack); + return 1; } -int ethnl_set_module(struct sk_buff *skb, struct genl_info *info) +static int +ethnl_set_module(struct ethnl_req_info *req_info, struct genl_info *info) { - struct ethnl_req_info req_info = {}; + struct ethtool_module_power_mode_params power = {}; + struct ethtool_module_power_mode_params power_new; + const struct ethtool_ops *ops; + struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; - struct net_device *dev; - bool mod = false; int ret; - ret = ethnl_parse_header_dev_get(&req_info, tb[ETHTOOL_A_MODULE_HEADER], - genl_info_net(info), info->extack, - true); + ops = dev->ethtool_ops; + + power_new.policy = nla_get_u8(tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY]); + ret = ops->get_module_power_mode(dev, &power, info->extack); if (ret < 0) return ret; - dev = req_info.dev; - rtnl_lock(); - ret = ethnl_ops_begin(dev); - if (ret < 0) - goto out_rtnl; + if (power_new.policy == power.policy) + return 0; - ret = module_set_power_mode(dev, tb, &mod, info->extack); - if (ret < 0) - goto out_ops; + ret = ops->set_module_power_mode(dev, &power_new, info->extack); + return ret < 0 ? ret : 1; +} - if (!mod) - goto out_ops; +const struct ethnl_request_ops ethnl_module_request_ops = { + .request_cmd = ETHTOOL_MSG_MODULE_GET, + .reply_cmd = ETHTOOL_MSG_MODULE_GET_REPLY, + .hdr_attr = ETHTOOL_A_MODULE_HEADER, + .req_info_size = sizeof(struct module_req_info), + .reply_data_size = sizeof(struct module_reply_data), - ethtool_notify(dev, ETHTOOL_MSG_MODULE_NTF, NULL); + .prepare_data = module_prepare_data, + .reply_size = module_reply_size, + .fill_reply = module_fill_reply, -out_ops: - ethnl_ops_complete(dev); -out_rtnl: - rtnl_unlock(); - ethnl_parse_header_dev_put(&req_info); - return ret; -} + .set_validate = ethnl_set_module_validate, + .set = ethnl_set_module, + .set_ntf_cmd = ETHTOOL_MSG_MODULE_NTF, +}; diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index aee98be6237f..08120095cc68 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -269,25 +269,43 @@ static const struct ethnl_request_ops * ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_STRSET_GET] = ðnl_strset_request_ops, [ETHTOOL_MSG_LINKINFO_GET] = ðnl_linkinfo_request_ops, + [ETHTOOL_MSG_LINKINFO_SET] = ðnl_linkinfo_request_ops, [ETHTOOL_MSG_LINKMODES_GET] = ðnl_linkmodes_request_ops, + [ETHTOOL_MSG_LINKMODES_SET] = ðnl_linkmodes_request_ops, [ETHTOOL_MSG_LINKSTATE_GET] = ðnl_linkstate_request_ops, [ETHTOOL_MSG_DEBUG_GET] = ðnl_debug_request_ops, + [ETHTOOL_MSG_DEBUG_SET] = ðnl_debug_request_ops, [ETHTOOL_MSG_WOL_GET] = ðnl_wol_request_ops, + [ETHTOOL_MSG_WOL_SET] = ðnl_wol_request_ops, [ETHTOOL_MSG_FEATURES_GET] = ðnl_features_request_ops, [ETHTOOL_MSG_PRIVFLAGS_GET] = ðnl_privflags_request_ops, + [ETHTOOL_MSG_PRIVFLAGS_SET] = ðnl_privflags_request_ops, [ETHTOOL_MSG_RINGS_GET] = ðnl_rings_request_ops, + [ETHTOOL_MSG_RINGS_SET] = ðnl_rings_request_ops, [ETHTOOL_MSG_CHANNELS_GET] = ðnl_channels_request_ops, + [ETHTOOL_MSG_CHANNELS_SET] = ðnl_channels_request_ops, [ETHTOOL_MSG_COALESCE_GET] = ðnl_coalesce_request_ops, + [ETHTOOL_MSG_COALESCE_SET] = ðnl_coalesce_request_ops, [ETHTOOL_MSG_PAUSE_GET] = ðnl_pause_request_ops, + [ETHTOOL_MSG_PAUSE_SET] = ðnl_pause_request_ops, [ETHTOOL_MSG_EEE_GET] = ðnl_eee_request_ops, + [ETHTOOL_MSG_EEE_SET] = ðnl_eee_request_ops, [ETHTOOL_MSG_FEC_GET] = ðnl_fec_request_ops, + [ETHTOOL_MSG_FEC_SET] = ðnl_fec_request_ops, [ETHTOOL_MSG_TSINFO_GET] = ðnl_tsinfo_request_ops, [ETHTOOL_MSG_MODULE_EEPROM_GET] = ðnl_module_eeprom_request_ops, [ETHTOOL_MSG_STATS_GET] = ðnl_stats_request_ops, [ETHTOOL_MSG_PHC_VCLOCKS_GET] = ðnl_phc_vclocks_request_ops, [ETHTOOL_MSG_MODULE_GET] = ðnl_module_request_ops, + [ETHTOOL_MSG_MODULE_SET] = ðnl_module_request_ops, [ETHTOOL_MSG_PSE_GET] = ðnl_pse_request_ops, + [ETHTOOL_MSG_PSE_SET] = ðnl_pse_request_ops, [ETHTOOL_MSG_RSS_GET] = ðnl_rss_request_ops, + [ETHTOOL_MSG_PLCA_GET_CFG] = ðnl_plca_cfg_request_ops, + [ETHTOOL_MSG_PLCA_SET_CFG] = ðnl_plca_cfg_request_ops, + [ETHTOOL_MSG_PLCA_GET_STATUS] = ðnl_plca_status_request_ops, + [ETHTOOL_MSG_MM_GET] = ðnl_mm_request_ops, + [ETHTOOL_MSG_MM_SET] = ðnl_mm_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) @@ -588,6 +606,52 @@ static int ethnl_default_done(struct netlink_callback *cb) return 0; } +static int ethnl_default_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + const struct ethnl_request_ops *ops; + struct ethnl_req_info req_info = {}; + const u8 cmd = info->genlhdr->cmd; + int ret; + + ops = ethnl_default_requests[cmd]; + if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", cmd)) + return -EOPNOTSUPP; + if (GENL_REQ_ATTR_CHECK(info, ops->hdr_attr)) + return -EINVAL; + + ret = ethnl_parse_header_dev_get(&req_info, info->attrs[ops->hdr_attr], + genl_info_net(info), info->extack, + true); + if (ret < 0) + return ret; + + if (ops->set_validate) { + ret = ops->set_validate(&req_info, info); + /* 0 means nothing to do */ + if (ret <= 0) + goto out_dev; + } + + rtnl_lock(); + ret = ethnl_ops_begin(req_info.dev); + if (ret < 0) + goto out_rtnl; + + ret = ops->set(&req_info, info); + if (ret <= 0) + goto out_ops; + ethtool_notify(req_info.dev, ops->set_ntf_cmd, NULL); + + ret = 0; +out_ops: + ethnl_ops_complete(req_info.dev); +out_rtnl: + rtnl_unlock(); +out_dev: + ethnl_parse_header_dev_put(&req_info); + return ret; +} + static const struct ethnl_request_ops * ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = { [ETHTOOL_MSG_LINKINFO_NTF] = ðnl_linkinfo_request_ops, @@ -603,6 +667,8 @@ ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = { [ETHTOOL_MSG_EEE_NTF] = ðnl_eee_request_ops, [ETHTOOL_MSG_FEC_NTF] = ðnl_fec_request_ops, [ETHTOOL_MSG_MODULE_NTF] = ðnl_module_request_ops, + [ETHTOOL_MSG_PLCA_NTF] = ðnl_plca_cfg_request_ops, + [ETHTOOL_MSG_MM_NTF] = ðnl_mm_request_ops, }; /* default notification handler */ @@ -696,6 +762,8 @@ static const ethnl_notify_handler_t ethnl_notify_handlers[] = { [ETHTOOL_MSG_EEE_NTF] = ethnl_default_notify, [ETHTOOL_MSG_FEC_NTF] = ethnl_default_notify, [ETHTOOL_MSG_MODULE_NTF] = ethnl_default_notify, + [ETHTOOL_MSG_PLCA_NTF] = ethnl_default_notify, + [ETHTOOL_MSG_MM_NTF] = ethnl_default_notify, }; void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data) @@ -760,7 +828,7 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_LINKINFO_SET, .flags = GENL_UNS_ADMIN_PERM, - .doit = ethnl_set_linkinfo, + .doit = ethnl_default_set_doit, .policy = ethnl_linkinfo_set_policy, .maxattr = ARRAY_SIZE(ethnl_linkinfo_set_policy) - 1, }, @@ -776,7 +844,7 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_LINKMODES_SET, .flags = GENL_UNS_ADMIN_PERM, - .doit = ethnl_set_linkmodes, + .doit = ethnl_default_set_doit, .policy = ethnl_linkmodes_set_policy, .maxattr = ARRAY_SIZE(ethnl_linkmodes_set_policy) - 1, }, @@ -801,7 +869,7 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_DEBUG_SET, .flags = GENL_UNS_ADMIN_PERM, - .doit = ethnl_set_debug, + .doit = ethnl_default_set_doit, .policy = ethnl_debug_set_policy, .maxattr = ARRAY_SIZE(ethnl_debug_set_policy) - 1, }, @@ -818,7 +886,7 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_WOL_SET, .flags = GENL_UNS_ADMIN_PERM, - .doit = ethnl_set_wol, + .doit = ethnl_default_set_doit, .policy = ethnl_wol_set_policy, .maxattr = ARRAY_SIZE(ethnl_wol_set_policy) - 1, }, @@ -850,7 +918,7 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_PRIVFLAGS_SET, .flags = GENL_UNS_ADMIN_PERM, - .doit = ethnl_set_privflags, + .doit = ethnl_default_set_doit, .policy = ethnl_privflags_set_policy, .maxattr = ARRAY_SIZE(ethnl_privflags_set_policy) - 1, }, @@ -866,7 +934,7 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_RINGS_SET, .flags = GENL_UNS_ADMIN_PERM, - .doit = ethnl_set_rings, + .doit = ethnl_default_set_doit, .policy = ethnl_rings_set_policy, .maxattr = ARRAY_SIZE(ethnl_rings_set_policy) - 1, }, @@ -882,7 +950,7 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_CHANNELS_SET, .flags = GENL_UNS_ADMIN_PERM, - .doit = ethnl_set_channels, + .doit = ethnl_default_set_doit, .policy = ethnl_channels_set_policy, .maxattr = ARRAY_SIZE(ethnl_channels_set_policy) - 1, }, @@ -898,7 +966,7 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_COALESCE_SET, .flags = GENL_UNS_ADMIN_PERM, - .doit = ethnl_set_coalesce, + .doit = ethnl_default_set_doit, .policy = ethnl_coalesce_set_policy, .maxattr = ARRAY_SIZE(ethnl_coalesce_set_policy) - 1, }, @@ -914,7 +982,7 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_PAUSE_SET, .flags = GENL_UNS_ADMIN_PERM, - .doit = ethnl_set_pause, + .doit = ethnl_default_set_doit, .policy = ethnl_pause_set_policy, .maxattr = ARRAY_SIZE(ethnl_pause_set_policy) - 1, }, @@ -930,7 +998,7 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_EEE_SET, .flags = GENL_UNS_ADMIN_PERM, - .doit = ethnl_set_eee, + .doit = ethnl_default_set_doit, .policy = ethnl_eee_set_policy, .maxattr = ARRAY_SIZE(ethnl_eee_set_policy) - 1, }, @@ -977,7 +1045,7 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_FEC_SET, .flags = GENL_UNS_ADMIN_PERM, - .doit = ethnl_set_fec, + .doit = ethnl_default_set_doit, .policy = ethnl_fec_set_policy, .maxattr = ARRAY_SIZE(ethnl_fec_set_policy) - 1, }, @@ -1021,7 +1089,7 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_MODULE_SET, .flags = GENL_UNS_ADMIN_PERM, - .doit = ethnl_set_module, + .doit = ethnl_default_set_doit, .policy = ethnl_module_set_policy, .maxattr = ARRAY_SIZE(ethnl_module_set_policy) - 1, }, @@ -1037,7 +1105,7 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_PSE_SET, .flags = GENL_UNS_ADMIN_PERM, - .doit = ethnl_set_pse, + .doit = ethnl_default_set_doit, .policy = ethnl_pse_set_policy, .maxattr = ARRAY_SIZE(ethnl_pse_set_policy) - 1, }, @@ -1047,6 +1115,47 @@ static const struct genl_ops ethtool_genl_ops[] = { .policy = ethnl_rss_get_policy, .maxattr = ARRAY_SIZE(ethnl_rss_get_policy) - 1, }, + { + .cmd = ETHTOOL_MSG_PLCA_GET_CFG, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + .policy = ethnl_plca_get_cfg_policy, + .maxattr = ARRAY_SIZE(ethnl_plca_get_cfg_policy) - 1, + }, + { + .cmd = ETHTOOL_MSG_PLCA_SET_CFG, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_default_set_doit, + .policy = ethnl_plca_set_cfg_policy, + .maxattr = ARRAY_SIZE(ethnl_plca_set_cfg_policy) - 1, + }, + { + .cmd = ETHTOOL_MSG_PLCA_GET_STATUS, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + .policy = ethnl_plca_get_status_policy, + .maxattr = ARRAY_SIZE(ethnl_plca_get_status_policy) - 1, + }, + { + .cmd = ETHTOOL_MSG_MM_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + .policy = ethnl_mm_get_policy, + .maxattr = ARRAY_SIZE(ethnl_mm_get_policy) - 1, + }, + { + .cmd = ETHTOOL_MSG_MM_SET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_default_set_doit, + .policy = ethnl_mm_set_policy, + .maxattr = ARRAY_SIZE(ethnl_mm_set_policy) - 1, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 3753787ba233..ae0732460e88 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -138,6 +138,32 @@ static inline void ethnl_update_bool32(u32 *dst, const struct nlattr *attr, } /** + * ethnl_update_bool() - updateb bool used as bool from NLA_U8 attribute + * @dst: value to update + * @attr: netlink attribute with new value or null + * @mod: pointer to bool for modification tracking + * + * Use the bool value from NLA_U8 netlink attribute @attr to set bool variable + * pointed to by @dst to 0 (if zero) or 1 (if not); do nothing if @attr is + * null. Bool pointed to by @mod is set to true if this function changed the + * logical value of *dst, otherwise it is left as is. + */ +static inline void ethnl_update_bool(bool *dst, const struct nlattr *attr, + bool *mod) +{ + u8 val; + + if (!attr) + return; + val = !!nla_get_u8(attr); + if (!!*dst == val) + return; + + *dst = val; + *mod = true; +} + +/** * ethnl_update_binary() - update binary data from NLA_BINARY attribute * @dst: value to update * @len: destination buffer length @@ -258,13 +284,14 @@ int ethnl_ops_begin(struct net_device *dev); void ethnl_ops_complete(struct net_device *dev); /** - * struct ethnl_request_ops - unified handling of GET requests + * struct ethnl_request_ops - unified handling of GET and SET requests * @request_cmd: command id for request (GET) * @reply_cmd: command id for reply (GET_REPLY) * @hdr_attr: attribute type for request header * @req_info_size: size of request info * @reply_data_size: size of reply data * @allow_nodev_do: allow non-dump request with no device identification + * @set_ntf_cmd: notification to generate on changes (SET) * @parse_request: * Parse request except common header (struct ethnl_req_info). Common * header is already filled on entry, the rest up to @repdata_offset @@ -293,6 +320,18 @@ void ethnl_ops_complete(struct net_device *dev); * used e.g. to free any additional data structures outside the main * structure which were allocated by ->prepare_data(). When processing * dump requests, ->cleanup() is called for each message. + * @set_validate: + * Check if set operation is supported for a given device, and perform + * extra input checks. Expected return values: + * - 0 if the operation is a noop for the device (rare) + * - 1 if operation should proceed to calling @set + * - negative errno on errors + * Called without any locks, just a reference on the netdev. + * @set: + * Execute the set operation. The implementation should return + * - 0 if no configuration has changed + * - 1 if configuration changed and notification should be generated + * - negative errno on errors * * Description of variable parts of GET request handling when using the * unified infrastructure. When used, a pointer to an instance of this @@ -309,6 +348,7 @@ struct ethnl_request_ops { unsigned int req_info_size; unsigned int reply_data_size; bool allow_nodev_do; + u8 set_ntf_cmd; int (*parse_request)(struct ethnl_req_info *req_info, struct nlattr **tb, @@ -322,6 +362,11 @@ struct ethnl_request_ops { const struct ethnl_req_info *req_info, const struct ethnl_reply_data *reply_data); void (*cleanup_data)(struct ethnl_reply_data *reply_data); + + int (*set_validate)(struct ethnl_req_info *req_info, + struct genl_info *info); + int (*set)(struct ethnl_req_info *req_info, + struct genl_info *info); }; /* request handlers */ @@ -347,6 +392,9 @@ extern const struct ethnl_request_ops ethnl_phc_vclocks_request_ops; extern const struct ethnl_request_ops ethnl_module_request_ops; extern const struct ethnl_request_ops ethnl_pse_request_ops; extern const struct ethnl_request_ops ethnl_rss_request_ops; +extern const struct ethnl_request_ops ethnl_plca_cfg_request_ops; +extern const struct ethnl_request_ops ethnl_plca_status_request_ops; +extern const struct ethnl_request_ops ethnl_mm_request_ops; extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1]; @@ -370,7 +418,7 @@ extern const struct nla_policy ethnl_channels_get_policy[ETHTOOL_A_CHANNELS_HEAD extern const struct nla_policy ethnl_channels_set_policy[ETHTOOL_A_CHANNELS_COMBINED_COUNT + 1]; extern const struct nla_policy ethnl_coalesce_get_policy[ETHTOOL_A_COALESCE_HEADER + 1]; extern const struct nla_policy ethnl_coalesce_set_policy[ETHTOOL_A_COALESCE_MAX + 1]; -extern const struct nla_policy ethnl_pause_get_policy[ETHTOOL_A_PAUSE_HEADER + 1]; +extern const struct nla_policy ethnl_pause_get_policy[ETHTOOL_A_PAUSE_STATS_SRC + 1]; extern const struct nla_policy ethnl_pause_set_policy[ETHTOOL_A_PAUSE_TX + 1]; extern const struct nla_policy ethnl_eee_get_policy[ETHTOOL_A_EEE_HEADER + 1]; extern const struct nla_policy ethnl_eee_set_policy[ETHTOOL_A_EEE_TX_LPI_TIMER + 1]; @@ -381,33 +429,25 @@ extern const struct nla_policy ethnl_tunnel_info_get_policy[ETHTOOL_A_TUNNEL_INF extern const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1]; extern const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1]; extern const struct nla_policy ethnl_module_eeprom_get_policy[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS + 1]; -extern const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1]; +extern const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_SRC + 1]; extern const struct nla_policy ethnl_phc_vclocks_get_policy[ETHTOOL_A_PHC_VCLOCKS_HEADER + 1]; extern const struct nla_policy ethnl_module_get_policy[ETHTOOL_A_MODULE_HEADER + 1]; extern const struct nla_policy ethnl_module_set_policy[ETHTOOL_A_MODULE_POWER_MODE_POLICY + 1]; extern const struct nla_policy ethnl_pse_get_policy[ETHTOOL_A_PSE_HEADER + 1]; extern const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1]; extern const struct nla_policy ethnl_rss_get_policy[ETHTOOL_A_RSS_CONTEXT + 1]; +extern const struct nla_policy ethnl_plca_get_cfg_policy[ETHTOOL_A_PLCA_HEADER + 1]; +extern const struct nla_policy ethnl_plca_set_cfg_policy[ETHTOOL_A_PLCA_MAX + 1]; +extern const struct nla_policy ethnl_plca_get_status_policy[ETHTOOL_A_PLCA_HEADER + 1]; +extern const struct nla_policy ethnl_mm_get_policy[ETHTOOL_A_MM_HEADER + 1]; +extern const struct nla_policy ethnl_mm_set_policy[ETHTOOL_A_MM_MAX + 1]; -int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); -int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); -int ethnl_set_debug(struct sk_buff *skb, struct genl_info *info); -int ethnl_set_wol(struct sk_buff *skb, struct genl_info *info); int ethnl_set_features(struct sk_buff *skb, struct genl_info *info); -int ethnl_set_privflags(struct sk_buff *skb, struct genl_info *info); -int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info); -int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info); -int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info); -int ethnl_set_pause(struct sk_buff *skb, struct genl_info *info); -int ethnl_set_eee(struct sk_buff *skb, struct genl_info *info); int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info); int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info); int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info); int ethnl_tunnel_info_start(struct netlink_callback *cb); int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb); -int ethnl_set_fec(struct sk_buff *skb, struct genl_info *info); -int ethnl_set_module(struct sk_buff *skb, struct genl_info *info); -int ethnl_set_pse(struct sk_buff *skb, struct genl_info *info); extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN]; diff --git a/net/ethtool/pause.c b/net/ethtool/pause.c index a8c113d244db..6657d0b888d8 100644 --- a/net/ethtool/pause.c +++ b/net/ethtool/pause.c @@ -5,8 +5,12 @@ struct pause_req_info { struct ethnl_req_info base; + enum ethtool_mac_stats_src src; }; +#define PAUSE_REQINFO(__req_base) \ + container_of(__req_base, struct pause_req_info, base) + struct pause_reply_data { struct ethnl_reply_data base; struct ethtool_pauseparam pauseparam; @@ -19,13 +23,40 @@ struct pause_reply_data { const struct nla_policy ethnl_pause_get_policy[] = { [ETHTOOL_A_PAUSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_stats), + [ETHTOOL_A_PAUSE_STATS_SRC] = + NLA_POLICY_MAX(NLA_U32, ETHTOOL_MAC_STATS_SRC_PMAC), }; +static int pause_parse_request(struct ethnl_req_info *req_base, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + enum ethtool_mac_stats_src src = ETHTOOL_MAC_STATS_SRC_AGGREGATE; + struct pause_req_info *req_info = PAUSE_REQINFO(req_base); + + if (tb[ETHTOOL_A_PAUSE_STATS_SRC]) { + if (!(req_base->flags & ETHTOOL_FLAG_STATS)) { + NL_SET_ERR_MSG_MOD(extack, + "ETHTOOL_FLAG_STATS must be set when requesting a source of stats"); + return -EINVAL; + } + + src = nla_get_u32(tb[ETHTOOL_A_PAUSE_STATS_SRC]); + } + + req_info->src = src; + + return 0; +} + static int pause_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, struct genl_info *info) { + const struct pause_req_info *req_info = PAUSE_REQINFO(req_base); + struct netlink_ext_ack *extack = info ? info->extack : NULL; struct pause_reply_data *data = PAUSE_REPDATA(reply_base); + enum ethtool_mac_stats_src src = req_info->src; struct net_device *dev = reply_base->dev; int ret; @@ -34,14 +65,26 @@ static int pause_prepare_data(const struct ethnl_req_info *req_base, ethtool_stats_init((u64 *)&data->pausestat, sizeof(data->pausestat) / 8); + data->pausestat.src = src; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; + + if ((src == ETHTOOL_MAC_STATS_SRC_EMAC || + src == ETHTOOL_MAC_STATS_SRC_PMAC) && + !__ethtool_dev_mm_supported(dev)) { + NL_SET_ERR_MSG_MOD(extack, + "Device does not support MAC merge layer"); + ethnl_ops_complete(dev); + return -EOPNOTSUPP; + } + dev->ethtool_ops->get_pauseparam(dev, &data->pauseparam); if (req_base->flags & ETHTOOL_FLAG_STATS && dev->ethtool_ops->get_pause_stats) dev->ethtool_ops->get_pause_stats(dev, &data->pausestat); + ethnl_ops_complete(dev); return 0; @@ -56,6 +99,7 @@ static int pause_reply_size(const struct ethnl_req_info *req_base, if (req_base->flags & ETHTOOL_FLAG_STATS) n += nla_total_size(0) + /* _PAUSE_STATS */ + nla_total_size(sizeof(u32)) + /* _PAUSE_STATS_SRC */ nla_total_size_64bit(sizeof(u64)) * ETHTOOL_PAUSE_STAT_CNT; return n; } @@ -77,6 +121,9 @@ static int pause_put_stats(struct sk_buff *skb, const u16 pad = ETHTOOL_A_PAUSE_STAT_PAD; struct nlattr *nest; + if (nla_put_u32(skb, ETHTOOL_A_PAUSE_STATS_SRC, pause_stats->src)) + return -EMSGSIZE; + nest = nla_nest_start(skb, ETHTOOL_A_PAUSE_STATS); if (!nest) return -EMSGSIZE; @@ -114,18 +161,6 @@ static int pause_fill_reply(struct sk_buff *skb, return 0; } -const struct ethnl_request_ops ethnl_pause_request_ops = { - .request_cmd = ETHTOOL_MSG_PAUSE_GET, - .reply_cmd = ETHTOOL_MSG_PAUSE_GET_REPLY, - .hdr_attr = ETHTOOL_A_PAUSE_HEADER, - .req_info_size = sizeof(struct pause_req_info), - .reply_data_size = sizeof(struct pause_reply_data), - - .prepare_data = pause_prepare_data, - .reply_size = pause_reply_size, - .fill_reply = pause_fill_reply, -}; - /* PAUSE_SET */ const struct nla_policy ethnl_pause_set_policy[] = { @@ -136,51 +171,49 @@ const struct nla_policy ethnl_pause_set_policy[] = { [ETHTOOL_A_PAUSE_TX] = { .type = NLA_U8 }, }; -int ethnl_set_pause(struct sk_buff *skb, struct genl_info *info) +static int +ethnl_set_pause_validate(struct ethnl_req_info *req_info, + struct genl_info *info) { + const struct ethtool_ops *ops = req_info->dev->ethtool_ops; + + return ops->get_pauseparam && ops->set_pauseparam ? 1 : -EOPNOTSUPP; +} + +static int +ethnl_set_pause(struct ethnl_req_info *req_info, struct genl_info *info) +{ + struct net_device *dev = req_info->dev; struct ethtool_pauseparam params = {}; - struct ethnl_req_info req_info = {}; struct nlattr **tb = info->attrs; - const struct ethtool_ops *ops; - struct net_device *dev; bool mod = false; int ret; - ret = ethnl_parse_header_dev_get(&req_info, - tb[ETHTOOL_A_PAUSE_HEADER], - genl_info_net(info), info->extack, - true); - if (ret < 0) - return ret; - dev = req_info.dev; - ops = dev->ethtool_ops; - ret = -EOPNOTSUPP; - if (!ops->get_pauseparam || !ops->set_pauseparam) - goto out_dev; - - rtnl_lock(); - ret = ethnl_ops_begin(dev); - if (ret < 0) - goto out_rtnl; - ops->get_pauseparam(dev, ¶ms); + dev->ethtool_ops->get_pauseparam(dev, ¶ms); ethnl_update_bool32(¶ms.autoneg, tb[ETHTOOL_A_PAUSE_AUTONEG], &mod); ethnl_update_bool32(¶ms.rx_pause, tb[ETHTOOL_A_PAUSE_RX], &mod); ethnl_update_bool32(¶ms.tx_pause, tb[ETHTOOL_A_PAUSE_TX], &mod); - ret = 0; if (!mod) - goto out_ops; + return 0; ret = dev->ethtool_ops->set_pauseparam(dev, ¶ms); - if (ret < 0) - goto out_ops; - ethtool_notify(dev, ETHTOOL_MSG_PAUSE_NTF, NULL); - -out_ops: - ethnl_ops_complete(dev); -out_rtnl: - rtnl_unlock(); -out_dev: - ethnl_parse_header_dev_put(&req_info); - return ret; + return ret < 0 ? ret : 1; } + +const struct ethnl_request_ops ethnl_pause_request_ops = { + .request_cmd = ETHTOOL_MSG_PAUSE_GET, + .reply_cmd = ETHTOOL_MSG_PAUSE_GET_REPLY, + .hdr_attr = ETHTOOL_A_PAUSE_HEADER, + .req_info_size = sizeof(struct pause_req_info), + .reply_data_size = sizeof(struct pause_reply_data), + + .parse_request = pause_parse_request, + .prepare_data = pause_prepare_data, + .reply_size = pause_reply_size, + .fill_reply = pause_fill_reply, + + .set_validate = ethnl_set_pause_validate, + .set = ethnl_set_pause, + .set_ntf_cmd = ETHTOOL_MSG_PAUSE_NTF, +}; diff --git a/net/ethtool/plca.c b/net/ethtool/plca.c new file mode 100644 index 000000000000..5a8cab4df0c9 --- /dev/null +++ b/net/ethtool/plca.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/phy.h> +#include <linux/ethtool_netlink.h> + +#include "netlink.h" +#include "common.h" + +struct plca_req_info { + struct ethnl_req_info base; +}; + +struct plca_reply_data { + struct ethnl_reply_data base; + struct phy_plca_cfg plca_cfg; + struct phy_plca_status plca_st; +}; + +// Helpers ------------------------------------------------------------------ // + +#define PLCA_REPDATA(__reply_base) \ + container_of(__reply_base, struct plca_reply_data, base) + +static void plca_update_sint(int *dst, const struct nlattr *attr, + bool *mod) +{ + if (!attr) + return; + + *dst = nla_get_u32(attr); + *mod = true; +} + +// PLCA get configuration message ------------------------------------------- // + +const struct nla_policy ethnl_plca_get_cfg_policy[] = { + [ETHTOOL_A_PLCA_HEADER] = + NLA_POLICY_NESTED(ethnl_header_policy), +}; + +static int plca_get_cfg_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct plca_reply_data *data = PLCA_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + const struct ethtool_phy_ops *ops; + int ret; + + // check that the PHY device is available and connected + if (!dev->phydev) { + ret = -EOPNOTSUPP; + goto out; + } + + // note: rtnl_lock is held already by ethnl_default_doit + ops = ethtool_phy_ops; + if (!ops || !ops->get_plca_cfg) { + ret = -EOPNOTSUPP; + goto out; + } + + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out; + + memset(&data->plca_cfg, 0xff, + sizeof_field(struct plca_reply_data, plca_cfg)); + + ret = ops->get_plca_cfg(dev->phydev, &data->plca_cfg); + ethnl_ops_complete(dev); + +out: + return ret; +} + +static int plca_get_cfg_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + return nla_total_size(sizeof(u16)) + /* _VERSION */ + nla_total_size(sizeof(u8)) + /* _ENABLED */ + nla_total_size(sizeof(u32)) + /* _NODE_CNT */ + nla_total_size(sizeof(u32)) + /* _NODE_ID */ + nla_total_size(sizeof(u32)) + /* _TO_TIMER */ + nla_total_size(sizeof(u32)) + /* _BURST_COUNT */ + nla_total_size(sizeof(u32)); /* _BURST_TIMER */ +} + +static int plca_get_cfg_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct plca_reply_data *data = PLCA_REPDATA(reply_base); + const struct phy_plca_cfg *plca = &data->plca_cfg; + + if ((plca->version >= 0 && + nla_put_u16(skb, ETHTOOL_A_PLCA_VERSION, plca->version)) || + (plca->enabled >= 0 && + nla_put_u8(skb, ETHTOOL_A_PLCA_ENABLED, !!plca->enabled)) || + (plca->node_id >= 0 && + nla_put_u32(skb, ETHTOOL_A_PLCA_NODE_ID, plca->node_id)) || + (plca->node_cnt >= 0 && + nla_put_u32(skb, ETHTOOL_A_PLCA_NODE_CNT, plca->node_cnt)) || + (plca->to_tmr >= 0 && + nla_put_u32(skb, ETHTOOL_A_PLCA_TO_TMR, plca->to_tmr)) || + (plca->burst_cnt >= 0 && + nla_put_u32(skb, ETHTOOL_A_PLCA_BURST_CNT, plca->burst_cnt)) || + (plca->burst_tmr >= 0 && + nla_put_u32(skb, ETHTOOL_A_PLCA_BURST_TMR, plca->burst_tmr))) + return -EMSGSIZE; + + return 0; +}; + +// PLCA set configuration message ------------------------------------------- // + +const struct nla_policy ethnl_plca_set_cfg_policy[] = { + [ETHTOOL_A_PLCA_HEADER] = + NLA_POLICY_NESTED(ethnl_header_policy), + [ETHTOOL_A_PLCA_ENABLED] = NLA_POLICY_MAX(NLA_U8, 1), + [ETHTOOL_A_PLCA_NODE_ID] = NLA_POLICY_MAX(NLA_U32, 255), + [ETHTOOL_A_PLCA_NODE_CNT] = NLA_POLICY_RANGE(NLA_U32, 1, 255), + [ETHTOOL_A_PLCA_TO_TMR] = NLA_POLICY_MAX(NLA_U32, 255), + [ETHTOOL_A_PLCA_BURST_CNT] = NLA_POLICY_MAX(NLA_U32, 255), + [ETHTOOL_A_PLCA_BURST_TMR] = NLA_POLICY_MAX(NLA_U32, 255), +}; + +static int +ethnl_set_plca(struct ethnl_req_info *req_info, struct genl_info *info) +{ + struct net_device *dev = req_info->dev; + const struct ethtool_phy_ops *ops; + struct nlattr **tb = info->attrs; + struct phy_plca_cfg plca_cfg; + bool mod = false; + int ret; + + // check that the PHY device is available and connected + if (!dev->phydev) + return -EOPNOTSUPP; + + ops = ethtool_phy_ops; + if (!ops || !ops->set_plca_cfg) + return -EOPNOTSUPP; + + memset(&plca_cfg, 0xff, sizeof(plca_cfg)); + plca_update_sint(&plca_cfg.enabled, tb[ETHTOOL_A_PLCA_ENABLED], &mod); + plca_update_sint(&plca_cfg.node_id, tb[ETHTOOL_A_PLCA_NODE_ID], &mod); + plca_update_sint(&plca_cfg.node_cnt, tb[ETHTOOL_A_PLCA_NODE_CNT], &mod); + plca_update_sint(&plca_cfg.to_tmr, tb[ETHTOOL_A_PLCA_TO_TMR], &mod); + plca_update_sint(&plca_cfg.burst_cnt, tb[ETHTOOL_A_PLCA_BURST_CNT], + &mod); + plca_update_sint(&plca_cfg.burst_tmr, tb[ETHTOOL_A_PLCA_BURST_TMR], + &mod); + if (!mod) + return 0; + + ret = ops->set_plca_cfg(dev->phydev, &plca_cfg, info->extack); + return ret < 0 ? ret : 1; +} + +const struct ethnl_request_ops ethnl_plca_cfg_request_ops = { + .request_cmd = ETHTOOL_MSG_PLCA_GET_CFG, + .reply_cmd = ETHTOOL_MSG_PLCA_GET_CFG_REPLY, + .hdr_attr = ETHTOOL_A_PLCA_HEADER, + .req_info_size = sizeof(struct plca_req_info), + .reply_data_size = sizeof(struct plca_reply_data), + + .prepare_data = plca_get_cfg_prepare_data, + .reply_size = plca_get_cfg_reply_size, + .fill_reply = plca_get_cfg_fill_reply, + + .set = ethnl_set_plca, + .set_ntf_cmd = ETHTOOL_MSG_PLCA_NTF, +}; + +// PLCA get status message -------------------------------------------------- // + +const struct nla_policy ethnl_plca_get_status_policy[] = { + [ETHTOOL_A_PLCA_HEADER] = + NLA_POLICY_NESTED(ethnl_header_policy), +}; + +static int plca_get_status_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct plca_reply_data *data = PLCA_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + const struct ethtool_phy_ops *ops; + int ret; + + // check that the PHY device is available and connected + if (!dev->phydev) { + ret = -EOPNOTSUPP; + goto out; + } + + // note: rtnl_lock is held already by ethnl_default_doit + ops = ethtool_phy_ops; + if (!ops || !ops->get_plca_status) { + ret = -EOPNOTSUPP; + goto out; + } + + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out; + + memset(&data->plca_st, 0xff, + sizeof_field(struct plca_reply_data, plca_st)); + + ret = ops->get_plca_status(dev->phydev, &data->plca_st); + ethnl_ops_complete(dev); +out: + return ret; +} + +static int plca_get_status_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + return nla_total_size(sizeof(u8)); /* _STATUS */ +} + +static int plca_get_status_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct plca_reply_data *data = PLCA_REPDATA(reply_base); + const u8 status = data->plca_st.pst; + + if (nla_put_u8(skb, ETHTOOL_A_PLCA_STATUS, !!status)) + return -EMSGSIZE; + + return 0; +}; + +const struct ethnl_request_ops ethnl_plca_status_request_ops = { + .request_cmd = ETHTOOL_MSG_PLCA_GET_STATUS, + .reply_cmd = ETHTOOL_MSG_PLCA_GET_STATUS_REPLY, + .hdr_attr = ETHTOOL_A_PLCA_HEADER, + .req_info_size = sizeof(struct plca_req_info), + .reply_data_size = sizeof(struct plca_reply_data), + + .prepare_data = plca_get_status_prepare_data, + .reply_size = plca_get_status_reply_size, + .fill_reply = plca_get_status_fill_reply, +}; diff --git a/net/ethtool/privflags.c b/net/ethtool/privflags.c index 4c7bfa81e4ab..23264a1ebf12 100644 --- a/net/ethtool/privflags.c +++ b/net/ethtool/privflags.c @@ -118,19 +118,6 @@ static void privflags_cleanup_data(struct ethnl_reply_data *reply_data) kfree(data->priv_flag_names); } -const struct ethnl_request_ops ethnl_privflags_request_ops = { - .request_cmd = ETHTOOL_MSG_PRIVFLAGS_GET, - .reply_cmd = ETHTOOL_MSG_PRIVFLAGS_GET_REPLY, - .hdr_attr = ETHTOOL_A_PRIVFLAGS_HEADER, - .req_info_size = sizeof(struct privflags_req_info), - .reply_data_size = sizeof(struct privflags_reply_data), - - .prepare_data = privflags_prepare_data, - .reply_size = privflags_reply_size, - .fill_reply = privflags_fill_reply, - .cleanup_data = privflags_cleanup_data, -}; - /* PRIVFLAGS_SET */ const struct nla_policy ethnl_privflags_set_policy[] = { @@ -139,63 +126,70 @@ const struct nla_policy ethnl_privflags_set_policy[] = { [ETHTOOL_A_PRIVFLAGS_FLAGS] = { .type = NLA_NESTED }, }; -int ethnl_set_privflags(struct sk_buff *skb, struct genl_info *info) +static int +ethnl_set_privflags_validate(struct ethnl_req_info *req_info, + struct genl_info *info) +{ + const struct ethtool_ops *ops = req_info->dev->ethtool_ops; + + if (!info->attrs[ETHTOOL_A_PRIVFLAGS_FLAGS]) + return -EINVAL; + + if (!ops->get_priv_flags || !ops->set_priv_flags || + !ops->get_sset_count || !ops->get_strings) + return -EOPNOTSUPP; + return 1; +} + +static int +ethnl_set_privflags(struct ethnl_req_info *req_info, struct genl_info *info) { const char (*names)[ETH_GSTRING_LEN] = NULL; - struct ethnl_req_info req_info = {}; + struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; - const struct ethtool_ops *ops; - struct net_device *dev; unsigned int nflags; bool mod = false; bool compact; u32 flags; int ret; - if (!tb[ETHTOOL_A_PRIVFLAGS_FLAGS]) - return -EINVAL; ret = ethnl_bitset_is_compact(tb[ETHTOOL_A_PRIVFLAGS_FLAGS], &compact); if (ret < 0) return ret; - ret = ethnl_parse_header_dev_get(&req_info, - tb[ETHTOOL_A_PRIVFLAGS_HEADER], - genl_info_net(info), info->extack, - true); - if (ret < 0) - return ret; - dev = req_info.dev; - ops = dev->ethtool_ops; - ret = -EOPNOTSUPP; - if (!ops->get_priv_flags || !ops->set_priv_flags || - !ops->get_sset_count || !ops->get_strings) - goto out_dev; - rtnl_lock(); - ret = ethnl_ops_begin(dev); - if (ret < 0) - goto out_rtnl; ret = ethnl_get_priv_flags_info(dev, &nflags, compact ? NULL : &names); if (ret < 0) - goto out_ops; - flags = ops->get_priv_flags(dev); + return ret; + flags = dev->ethtool_ops->get_priv_flags(dev); ret = ethnl_update_bitset32(&flags, nflags, tb[ETHTOOL_A_PRIVFLAGS_FLAGS], names, info->extack, &mod); if (ret < 0 || !mod) goto out_free; - ret = ops->set_priv_flags(dev, flags); + ret = dev->ethtool_ops->set_priv_flags(dev, flags); if (ret < 0) goto out_free; - ethtool_notify(dev, ETHTOOL_MSG_PRIVFLAGS_NTF, NULL); + ret = 1; out_free: kfree(names); -out_ops: - ethnl_ops_complete(dev); -out_rtnl: - rtnl_unlock(); -out_dev: - ethnl_parse_header_dev_put(&req_info); return ret; } + +const struct ethnl_request_ops ethnl_privflags_request_ops = { + .request_cmd = ETHTOOL_MSG_PRIVFLAGS_GET, + .reply_cmd = ETHTOOL_MSG_PRIVFLAGS_GET_REPLY, + .hdr_attr = ETHTOOL_A_PRIVFLAGS_HEADER, + .req_info_size = sizeof(struct privflags_req_info), + .reply_data_size = sizeof(struct privflags_reply_data), + + .prepare_data = privflags_prepare_data, + .reply_size = privflags_reply_size, + .fill_reply = privflags_fill_reply, + .cleanup_data = privflags_cleanup_data, + + .set_validate = ethnl_set_privflags_validate, + .set = ethnl_set_privflags, + .set_ntf_cmd = ETHTOOL_MSG_PRIVFLAGS_NTF, +}; diff --git a/net/ethtool/pse-pd.c b/net/ethtool/pse-pd.c index e8683e485dc9..a5b607b0a652 100644 --- a/net/ethtool/pse-pd.c +++ b/net/ethtool/pse-pd.c @@ -106,18 +106,6 @@ static int pse_fill_reply(struct sk_buff *skb, return 0; } -const struct ethnl_request_ops ethnl_pse_request_ops = { - .request_cmd = ETHTOOL_MSG_PSE_GET, - .reply_cmd = ETHTOOL_MSG_PSE_GET_REPLY, - .hdr_attr = ETHTOOL_A_PSE_HEADER, - .req_info_size = sizeof(struct pse_req_info), - .reply_data_size = sizeof(struct pse_reply_data), - - .prepare_data = pse_prepare_data, - .reply_size = pse_reply_size, - .fill_reply = pse_fill_reply, -}; - /* PSE_SET */ const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1] = { @@ -127,59 +115,50 @@ const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1] = { ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED), }; -static int pse_set_pse_config(struct net_device *dev, - struct netlink_ext_ack *extack, - struct nlattr **tb) +static int +ethnl_set_pse_validate(struct ethnl_req_info *req_info, struct genl_info *info) { - struct phy_device *phydev = dev->phydev; - struct pse_control_config config = {}; + return !!info->attrs[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL]; +} - /* Optional attribute. Do not return error if not set. */ - if (!tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL]) - return 0; +static int +ethnl_set_pse(struct ethnl_req_info *req_info, struct genl_info *info) +{ + struct net_device *dev = req_info->dev; + struct pse_control_config config = {}; + struct nlattr **tb = info->attrs; + struct phy_device *phydev; /* this values are already validated by the ethnl_pse_set_policy */ config.admin_cotrol = nla_get_u32(tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL]); + phydev = dev->phydev; if (!phydev) { - NL_SET_ERR_MSG(extack, "No PHY is attached"); + NL_SET_ERR_MSG(info->extack, "No PHY is attached"); return -EOPNOTSUPP; } if (!phydev->psec) { - NL_SET_ERR_MSG(extack, "No PSE is attached"); + NL_SET_ERR_MSG(info->extack, "No PSE is attached"); return -EOPNOTSUPP; } - return pse_ethtool_set_config(phydev->psec, extack, &config); + /* Return errno directly - PSE has no notification */ + return pse_ethtool_set_config(phydev->psec, info->extack, &config); } -int ethnl_set_pse(struct sk_buff *skb, struct genl_info *info) -{ - struct ethnl_req_info req_info = {}; - struct nlattr **tb = info->attrs; - struct net_device *dev; - int ret; - - ret = ethnl_parse_header_dev_get(&req_info, tb[ETHTOOL_A_PSE_HEADER], - genl_info_net(info), info->extack, - true); - if (ret < 0) - return ret; - - dev = req_info.dev; - - rtnl_lock(); - ret = ethnl_ops_begin(dev); - if (ret < 0) - goto out_rtnl; - - ret = pse_set_pse_config(dev, info->extack, tb); - ethnl_ops_complete(dev); -out_rtnl: - rtnl_unlock(); +const struct ethnl_request_ops ethnl_pse_request_ops = { + .request_cmd = ETHTOOL_MSG_PSE_GET, + .reply_cmd = ETHTOOL_MSG_PSE_GET_REPLY, + .hdr_attr = ETHTOOL_A_PSE_HEADER, + .req_info_size = sizeof(struct pse_req_info), + .reply_data_size = sizeof(struct pse_reply_data), - ethnl_parse_header_dev_put(&req_info); + .prepare_data = pse_prepare_data, + .reply_size = pse_reply_size, + .fill_reply = pse_fill_reply, - return ret; -} + .set_validate = ethnl_set_pse_validate, + .set = ethnl_set_pse, + /* PSE has no notification */ +}; diff --git a/net/ethtool/rings.c b/net/ethtool/rings.c index fa3ec8d438f7..2a2d3539630c 100644 --- a/net/ethtool/rings.c +++ b/net/ethtool/rings.c @@ -102,18 +102,6 @@ static int rings_fill_reply(struct sk_buff *skb, return 0; } -const struct ethnl_request_ops ethnl_rings_request_ops = { - .request_cmd = ETHTOOL_MSG_RINGS_GET, - .reply_cmd = ETHTOOL_MSG_RINGS_GET_REPLY, - .hdr_attr = ETHTOOL_A_RINGS_HEADER, - .req_info_size = sizeof(struct rings_req_info), - .reply_data_size = sizeof(struct rings_reply_data), - - .prepare_data = rings_prepare_data, - .reply_size = rings_reply_size, - .fill_reply = rings_fill_reply, -}; - /* RINGS_SET */ const struct nla_policy ethnl_rings_set_policy[] = { @@ -128,62 +116,53 @@ const struct nla_policy ethnl_rings_set_policy[] = { [ETHTOOL_A_RINGS_TX_PUSH] = NLA_POLICY_MAX(NLA_U8, 1), }; -int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info) +static int +ethnl_set_rings_validate(struct ethnl_req_info *req_info, + struct genl_info *info) { - struct kernel_ethtool_ringparam kernel_ringparam = {}; - struct ethtool_ringparam ringparam = {}; - struct ethnl_req_info req_info = {}; + const struct ethtool_ops *ops = req_info->dev->ethtool_ops; struct nlattr **tb = info->attrs; - const struct nlattr *err_attr; - const struct ethtool_ops *ops; - struct net_device *dev; - bool mod = false; - int ret; - - ret = ethnl_parse_header_dev_get(&req_info, - tb[ETHTOOL_A_RINGS_HEADER], - genl_info_net(info), info->extack, - true); - if (ret < 0) - return ret; - dev = req_info.dev; - ops = dev->ethtool_ops; - ret = -EOPNOTSUPP; - if (!ops->get_ringparam || !ops->set_ringparam) - goto out_dev; if (tb[ETHTOOL_A_RINGS_RX_BUF_LEN] && !(ops->supported_ring_params & ETHTOOL_RING_USE_RX_BUF_LEN)) { - ret = -EOPNOTSUPP; NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_RX_BUF_LEN], "setting rx buf len not supported"); - goto out_dev; + return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_CQE_SIZE] && !(ops->supported_ring_params & ETHTOOL_RING_USE_CQE_SIZE)) { - ret = -EOPNOTSUPP; NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_CQE_SIZE], "setting cqe size not supported"); - goto out_dev; + return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_TX_PUSH] && !(ops->supported_ring_params & ETHTOOL_RING_USE_TX_PUSH)) { - ret = -EOPNOTSUPP; NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_TX_PUSH], "setting tx push not supported"); - goto out_dev; + return -EOPNOTSUPP; } - rtnl_lock(); - ret = ethnl_ops_begin(dev); - if (ret < 0) - goto out_rtnl; - ops->get_ringparam(dev, &ringparam, &kernel_ringparam, info->extack); + return ops->get_ringparam && ops->set_ringparam ? 1 : -EOPNOTSUPP; +} + +static int +ethnl_set_rings(struct ethnl_req_info *req_info, struct genl_info *info) +{ + struct kernel_ethtool_ringparam kernel_ringparam = {}; + struct ethtool_ringparam ringparam = {}; + struct net_device *dev = req_info->dev; + struct nlattr **tb = info->attrs; + const struct nlattr *err_attr; + bool mod = false; + int ret; + + dev->ethtool_ops->get_ringparam(dev, &ringparam, + &kernel_ringparam, info->extack); ethnl_update_u32(&ringparam.rx_pending, tb[ETHTOOL_A_RINGS_RX], &mod); ethnl_update_u32(&ringparam.rx_mini_pending, @@ -197,9 +176,8 @@ int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info) tb[ETHTOOL_A_RINGS_CQE_SIZE], &mod); ethnl_update_u8(&kernel_ringparam.tx_push, tb[ETHTOOL_A_RINGS_TX_PUSH], &mod); - ret = 0; if (!mod) - goto out_ops; + return 0; /* ensure new ring parameters are within limits */ if (ringparam.rx_pending > ringparam.rx_max_pending) @@ -213,23 +191,28 @@ int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info) else err_attr = NULL; if (err_attr) { - ret = -EINVAL; NL_SET_ERR_MSG_ATTR(info->extack, err_attr, "requested ring size exceeds maximum"); - goto out_ops; + return -EINVAL; } ret = dev->ethtool_ops->set_ringparam(dev, &ringparam, &kernel_ringparam, info->extack); - if (ret < 0) - goto out_ops; - ethtool_notify(dev, ETHTOOL_MSG_RINGS_NTF, NULL); - -out_ops: - ethnl_ops_complete(dev); -out_rtnl: - rtnl_unlock(); -out_dev: - ethnl_parse_header_dev_put(&req_info); - return ret; + return ret < 0 ? ret : 1; } + +const struct ethnl_request_ops ethnl_rings_request_ops = { + .request_cmd = ETHTOOL_MSG_RINGS_GET, + .reply_cmd = ETHTOOL_MSG_RINGS_GET_REPLY, + .hdr_attr = ETHTOOL_A_RINGS_HEADER, + .req_info_size = sizeof(struct rings_req_info), + .reply_data_size = sizeof(struct rings_reply_data), + + .prepare_data = rings_prepare_data, + .reply_size = rings_reply_size, + .fill_reply = rings_fill_reply, + + .set_validate = ethnl_set_rings_validate, + .set = ethnl_set_rings, + .set_ntf_cmd = ETHTOOL_MSG_RINGS_NTF, +}; diff --git a/net/ethtool/stats.c b/net/ethtool/stats.c index a20e0a24ff61..010ed19ccc99 100644 --- a/net/ethtool/stats.c +++ b/net/ethtool/stats.c @@ -7,6 +7,7 @@ struct stats_req_info { struct ethnl_req_info base; DECLARE_BITMAP(stat_mask, __ETHTOOL_STATS_CNT); + enum ethtool_mac_stats_src src; }; #define STATS_REQINFO(__req_base) \ @@ -75,16 +76,19 @@ const char stats_rmon_names[__ETHTOOL_A_STATS_RMON_CNT][ETH_GSTRING_LEN] = { [ETHTOOL_A_STATS_RMON_JABBER] = "etherStatsJabbers", }; -const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1] = { +const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_SRC + 1] = { [ETHTOOL_A_STATS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_STATS_GROUPS] = { .type = NLA_NESTED }, + [ETHTOOL_A_STATS_SRC] = + NLA_POLICY_MAX(NLA_U32, ETHTOOL_MAC_STATS_SRC_PMAC), }; static int stats_parse_request(struct ethnl_req_info *req_base, struct nlattr **tb, struct netlink_ext_ack *extack) { + enum ethtool_mac_stats_src src = ETHTOOL_MAC_STATS_SRC_AGGREGATE; struct stats_req_info *req_info = STATS_REQINFO(req_base); bool mod = false; int err; @@ -100,6 +104,11 @@ static int stats_parse_request(struct ethnl_req_info *req_base, return -EINVAL; } + if (tb[ETHTOOL_A_STATS_SRC]) + src = nla_get_u32(tb[ETHTOOL_A_STATS_SRC]); + + req_info->src = src; + return 0; } @@ -108,7 +117,9 @@ static int stats_prepare_data(const struct ethnl_req_info *req_base, struct genl_info *info) { const struct stats_req_info *req_info = STATS_REQINFO(req_base); + struct netlink_ext_ack *extack = info ? info->extack : NULL; struct stats_reply_data *data = STATS_REPDATA(reply_base); + enum ethtool_mac_stats_src src = req_info->src; struct net_device *dev = reply_base->dev; int ret; @@ -116,11 +127,25 @@ static int stats_prepare_data(const struct ethnl_req_info *req_base, if (ret < 0) return ret; + if ((src == ETHTOOL_MAC_STATS_SRC_EMAC || + src == ETHTOOL_MAC_STATS_SRC_PMAC) && + !__ethtool_dev_mm_supported(dev)) { + NL_SET_ERR_MSG_MOD(extack, + "Device does not support MAC merge layer"); + ethnl_ops_complete(dev); + return -EOPNOTSUPP; + } + /* Mark all stats as unset (see ETHTOOL_STAT_NOT_SET) to prevent them * from being reported to user space in case driver did not set them. */ memset(&data->stats, 0xff, sizeof(data->stats)); + data->phy_stats.src = src; + data->mac_stats.src = src; + data->ctrl_stats.src = src; + data->rmon_stats.src = src; + if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask) && dev->ethtool_ops->get_eth_phy_stats) dev->ethtool_ops->get_eth_phy_stats(dev, &data->phy_stats); @@ -146,6 +171,8 @@ static int stats_reply_size(const struct ethnl_req_info *req_base, unsigned int n_grps = 0, n_stats = 0; int len = 0; + len += nla_total_size(sizeof(u32)); /* _STATS_SRC */ + if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask)) { n_stats += sizeof(struct ethtool_eth_phy_stats) / sizeof(u64); n_grps++; @@ -379,6 +406,9 @@ static int stats_fill_reply(struct sk_buff *skb, const struct stats_reply_data *data = STATS_REPDATA(reply_base); int ret = 0; + if (nla_put_u32(skb, ETHTOOL_A_STATS_SRC, req_info->src)) + return -EMSGSIZE; + if (!ret && test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask)) ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_PHY, ETH_SS_STATS_ETH_PHY, @@ -410,3 +440,130 @@ const struct ethnl_request_ops ethnl_stats_request_ops = { .reply_size = stats_reply_size, .fill_reply = stats_fill_reply, }; + +static u64 ethtool_stats_sum(u64 a, u64 b) +{ + if (a == ETHTOOL_STAT_NOT_SET) + return b; + if (b == ETHTOOL_STAT_NOT_SET) + return a; + return a + b; +} + +/* Avoid modifying the aggregation procedure every time a new counter is added + * by treating the structures as an array of u64 statistics. + */ +static void ethtool_aggregate_stats(void *aggr_stats, const void *emac_stats, + const void *pmac_stats, size_t stats_size, + size_t stats_offset) +{ + size_t num_stats = stats_size / sizeof(u64); + const u64 *s1 = emac_stats + stats_offset; + const u64 *s2 = pmac_stats + stats_offset; + u64 *s = aggr_stats + stats_offset; + int i; + + for (i = 0; i < num_stats; i++) + s[i] = ethtool_stats_sum(s1[i], s2[i]); +} + +void ethtool_aggregate_mac_stats(struct net_device *dev, + struct ethtool_eth_mac_stats *mac_stats) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_eth_mac_stats pmac, emac; + + memset(&emac, 0xff, sizeof(emac)); + memset(&pmac, 0xff, sizeof(pmac)); + emac.src = ETHTOOL_MAC_STATS_SRC_EMAC; + pmac.src = ETHTOOL_MAC_STATS_SRC_PMAC; + + ops->get_eth_mac_stats(dev, &emac); + ops->get_eth_mac_stats(dev, &pmac); + + ethtool_aggregate_stats(mac_stats, &emac, &pmac, + sizeof(mac_stats->stats), + offsetof(struct ethtool_eth_mac_stats, stats)); +} +EXPORT_SYMBOL(ethtool_aggregate_mac_stats); + +void ethtool_aggregate_phy_stats(struct net_device *dev, + struct ethtool_eth_phy_stats *phy_stats) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_eth_phy_stats pmac, emac; + + memset(&emac, 0xff, sizeof(emac)); + memset(&pmac, 0xff, sizeof(pmac)); + emac.src = ETHTOOL_MAC_STATS_SRC_EMAC; + pmac.src = ETHTOOL_MAC_STATS_SRC_PMAC; + + ops->get_eth_phy_stats(dev, &emac); + ops->get_eth_phy_stats(dev, &pmac); + + ethtool_aggregate_stats(phy_stats, &emac, &pmac, + sizeof(phy_stats->stats), + offsetof(struct ethtool_eth_phy_stats, stats)); +} +EXPORT_SYMBOL(ethtool_aggregate_phy_stats); + +void ethtool_aggregate_ctrl_stats(struct net_device *dev, + struct ethtool_eth_ctrl_stats *ctrl_stats) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_eth_ctrl_stats pmac, emac; + + memset(&emac, 0xff, sizeof(emac)); + memset(&pmac, 0xff, sizeof(pmac)); + emac.src = ETHTOOL_MAC_STATS_SRC_EMAC; + pmac.src = ETHTOOL_MAC_STATS_SRC_PMAC; + + ops->get_eth_ctrl_stats(dev, &emac); + ops->get_eth_ctrl_stats(dev, &pmac); + + ethtool_aggregate_stats(ctrl_stats, &emac, &pmac, + sizeof(ctrl_stats->stats), + offsetof(struct ethtool_eth_ctrl_stats, stats)); +} +EXPORT_SYMBOL(ethtool_aggregate_ctrl_stats); + +void ethtool_aggregate_pause_stats(struct net_device *dev, + struct ethtool_pause_stats *pause_stats) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_pause_stats pmac, emac; + + memset(&emac, 0xff, sizeof(emac)); + memset(&pmac, 0xff, sizeof(pmac)); + emac.src = ETHTOOL_MAC_STATS_SRC_EMAC; + pmac.src = ETHTOOL_MAC_STATS_SRC_PMAC; + + ops->get_pause_stats(dev, &emac); + ops->get_pause_stats(dev, &pmac); + + ethtool_aggregate_stats(pause_stats, &emac, &pmac, + sizeof(pause_stats->stats), + offsetof(struct ethtool_pause_stats, stats)); +} +EXPORT_SYMBOL(ethtool_aggregate_pause_stats); + +void ethtool_aggregate_rmon_stats(struct net_device *dev, + struct ethtool_rmon_stats *rmon_stats) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + const struct ethtool_rmon_hist_range *dummy; + struct ethtool_rmon_stats pmac, emac; + + memset(&emac, 0xff, sizeof(emac)); + memset(&pmac, 0xff, sizeof(pmac)); + emac.src = ETHTOOL_MAC_STATS_SRC_EMAC; + pmac.src = ETHTOOL_MAC_STATS_SRC_PMAC; + + ops->get_rmon_stats(dev, &emac, &dummy); + ops->get_rmon_stats(dev, &pmac, &dummy); + + ethtool_aggregate_stats(rmon_stats, &emac, &pmac, + sizeof(rmon_stats->stats), + offsetof(struct ethtool_rmon_stats, stats)); +} +EXPORT_SYMBOL(ethtool_aggregate_rmon_stats); diff --git a/net/ethtool/wol.c b/net/ethtool/wol.c index 88f435e76481..a4a43d9e6e9d 100644 --- a/net/ethtool/wol.c +++ b/net/ethtool/wol.c @@ -82,18 +82,6 @@ static int wol_fill_reply(struct sk_buff *skb, return 0; } -const struct ethnl_request_ops ethnl_wol_request_ops = { - .request_cmd = ETHTOOL_MSG_WOL_GET, - .reply_cmd = ETHTOOL_MSG_WOL_GET_REPLY, - .hdr_attr = ETHTOOL_A_WOL_HEADER, - .req_info_size = sizeof(struct wol_req_info), - .reply_data_size = sizeof(struct wol_reply_data), - - .prepare_data = wol_prepare_data, - .reply_size = wol_reply_size, - .fill_reply = wol_fill_reply, -}; - /* WOL_SET */ const struct nla_policy ethnl_wol_set_policy[] = { @@ -104,67 +92,66 @@ const struct nla_policy ethnl_wol_set_policy[] = { .len = SOPASS_MAX }, }; -int ethnl_set_wol(struct sk_buff *skb, struct genl_info *info) +static int +ethnl_set_wol_validate(struct ethnl_req_info *req_info, struct genl_info *info) +{ + const struct ethtool_ops *ops = req_info->dev->ethtool_ops; + + return ops->get_wol && ops->set_wol ? 1 : -EOPNOTSUPP; +} + +static int +ethnl_set_wol(struct ethnl_req_info *req_info, struct genl_info *info) { struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL }; - struct ethnl_req_info req_info = {}; + struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; - struct net_device *dev; bool mod = false; int ret; - ret = ethnl_parse_header_dev_get(&req_info, tb[ETHTOOL_A_WOL_HEADER], - genl_info_net(info), info->extack, - true); - if (ret < 0) - return ret; - dev = req_info.dev; - ret = -EOPNOTSUPP; - if (!dev->ethtool_ops->get_wol || !dev->ethtool_ops->set_wol) - goto out_dev; - - rtnl_lock(); - ret = ethnl_ops_begin(dev); - if (ret < 0) - goto out_rtnl; - dev->ethtool_ops->get_wol(dev, &wol); ret = ethnl_update_bitset32(&wol.wolopts, WOL_MODE_COUNT, tb[ETHTOOL_A_WOL_MODES], wol_mode_names, info->extack, &mod); if (ret < 0) - goto out_ops; + return ret; if (wol.wolopts & ~wol.supported) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_WOL_MODES], "cannot enable unsupported WoL mode"); - ret = -EINVAL; - goto out_ops; + return -EINVAL; } if (tb[ETHTOOL_A_WOL_SOPASS]) { if (!(wol.supported & WAKE_MAGICSECURE)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_WOL_SOPASS], "magicsecure not supported, cannot set password"); - ret = -EINVAL; - goto out_ops; + return -EINVAL; } ethnl_update_binary(wol.sopass, sizeof(wol.sopass), tb[ETHTOOL_A_WOL_SOPASS], &mod); } if (!mod) - goto out_ops; + return 0; ret = dev->ethtool_ops->set_wol(dev, &wol); if (ret) - goto out_ops; + return ret; dev->wol_enabled = !!wol.wolopts; - ethtool_notify(dev, ETHTOOL_MSG_WOL_NTF, NULL); - -out_ops: - ethnl_ops_complete(dev); -out_rtnl: - rtnl_unlock(); -out_dev: - ethnl_parse_header_dev_put(&req_info); - return ret; + return 1; } + +const struct ethnl_request_ops ethnl_wol_request_ops = { + .request_cmd = ETHTOOL_MSG_WOL_GET, + .reply_cmd = ETHTOOL_MSG_WOL_GET_REPLY, + .hdr_attr = ETHTOOL_A_WOL_HEADER, + .req_info_size = sizeof(struct wol_req_info), + .reply_data_size = sizeof(struct wol_reply_data), + + .prepare_data = wol_prepare_data, + .reply_size = wol_reply_size, + .fill_reply = wol_fill_reply, + + .set_validate = ethnl_set_wol_validate, + .set = ethnl_set_wol, + .set_ntf_cmd = ETHTOOL_MSG_WOL_NTF, +}; diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index af7d2cf490fb..880277c9fd07 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -26,6 +26,7 @@ obj-$(CONFIG_IP_MROUTE) += ipmr.o obj-$(CONFIG_IP_MROUTE_COMMON) += ipmr_base.o obj-$(CONFIG_NET_IPIP) += ipip.o gre-y := gre_demux.o +fou-y := fou_core.o fou_nl.o obj-$(CONFIG_NET_FOU) += fou.o obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o obj-$(CONFIG_NET_IPGRE) += ip_gre.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index cf11f10927e1..2c778b013cb0 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1486,6 +1486,7 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) if (unlikely(ip_fast_csum((u8 *)iph, 5))) goto out; + NAPI_GRO_CB(skb)->proto = proto; id = ntohl(*(__be32 *)&iph->id); flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF)); id >>= 16; @@ -1619,9 +1620,9 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) int inet_gro_complete(struct sk_buff *skb, int nhoff) { - __be16 newlen = htons(skb->len - nhoff); struct iphdr *iph = (struct iphdr *)(skb->data + nhoff); const struct net_offload *ops; + __be16 totlen = iph->tot_len; int proto = iph->protocol; int err = -ENOSYS; @@ -1630,8 +1631,8 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff) skb_set_inner_network_header(skb, nhoff); } - csum_replace2(&iph->check, iph->tot_len, newlen); - iph->tot_len = newlen; + iph_set_totlen(iph, skb->len - nhoff); + csum_replace2(&iph->check, totlen, iph->tot_len); ops = rcu_dereference(inet_offloads[proto]); if (WARN_ON(!ops || !ops->callbacks.gro_complete)) diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 4517d2bd186a..13fc0c185cd9 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -248,7 +248,8 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t, } static int bpf_tcp_ca_check_member(const struct btf_type *t, - const struct btf_member *member) + const struct btf_member *member, + const struct bpf_prog *prog) { if (is_unsupported(__btf_member_bit_offset(t, member) / 8)) return -ENOTSUPP; diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 6cd3b6c559f0..79ae7204e8ed 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -2222,7 +2222,7 @@ int cipso_v4_skbuff_setattr(struct sk_buff *skb, memset((char *)(iph + 1) + buf_len, 0, opt_len - buf_len); if (len_delta != 0) { iph->ihl = 5 + (opt_len >> 2); - iph->tot_len = htons(skb->len); + iph_set_totlen(iph, skb->len); } ip_send_check(iph); diff --git a/net/ipv4/fou.c b/net/ipv4/fou_core.c index 0c3c6d0cee29..cafec9b4eee0 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou_core.c @@ -19,6 +19,8 @@ #include <uapi/linux/fou.h> #include <uapi/linux/genetlink.h> +#include "fou_nl.h" + struct fou { struct socket *sock; u8 protocol; @@ -640,20 +642,6 @@ static int fou_destroy(struct net *net, struct fou_cfg *cfg) static struct genl_family fou_nl_family; -static const struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = { - [FOU_ATTR_PORT] = { .type = NLA_U16, }, - [FOU_ATTR_AF] = { .type = NLA_U8, }, - [FOU_ATTR_IPPROTO] = { .type = NLA_U8, }, - [FOU_ATTR_TYPE] = { .type = NLA_U8, }, - [FOU_ATTR_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG, }, - [FOU_ATTR_LOCAL_V4] = { .type = NLA_U32, }, - [FOU_ATTR_PEER_V4] = { .type = NLA_U32, }, - [FOU_ATTR_LOCAL_V6] = { .len = sizeof(struct in6_addr), }, - [FOU_ATTR_PEER_V6] = { .len = sizeof(struct in6_addr), }, - [FOU_ATTR_PEER_PORT] = { .type = NLA_U16, }, - [FOU_ATTR_IFINDEX] = { .type = NLA_S32, }, -}; - static int parse_nl_config(struct genl_info *info, struct fou_cfg *cfg) { @@ -745,7 +733,7 @@ static int parse_nl_config(struct genl_info *info, return 0; } -static int fou_nl_cmd_add_port(struct sk_buff *skb, struct genl_info *info) +int fou_nl_add_doit(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct fou_cfg cfg; @@ -758,7 +746,7 @@ static int fou_nl_cmd_add_port(struct sk_buff *skb, struct genl_info *info) return fou_create(net, &cfg, NULL); } -static int fou_nl_cmd_rm_port(struct sk_buff *skb, struct genl_info *info) +int fou_nl_del_doit(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct fou_cfg cfg; @@ -827,7 +815,7 @@ nla_put_failure: return -EMSGSIZE; } -static int fou_nl_cmd_get_port(struct sk_buff *skb, struct genl_info *info) +int fou_nl_get_doit(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct fou_net *fn = net_generic(net, fou_net_id); @@ -874,7 +862,7 @@ out_free: return ret; } -static int fou_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) +int fou_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct fou_net *fn = net_generic(net, fou_net_id); @@ -897,33 +885,12 @@ static int fou_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -static const struct genl_small_ops fou_nl_ops[] = { - { - .cmd = FOU_CMD_ADD, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = fou_nl_cmd_add_port, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = FOU_CMD_DEL, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = fou_nl_cmd_rm_port, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = FOU_CMD_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = fou_nl_cmd_get_port, - .dumpit = fou_nl_dump, - }, -}; - static struct genl_family fou_nl_family __ro_after_init = { .hdrsize = 0, .name = FOU_GENL_NAME, .version = FOU_GENL_VERSION, .maxattr = FOU_ATTR_MAX, - .policy = fou_nl_policy, + .policy = fou_nl_policy, .netnsok = true, .module = THIS_MODULE, .small_ops = fou_nl_ops, diff --git a/net/ipv4/fou_nl.c b/net/ipv4/fou_nl.c new file mode 100644 index 000000000000..6c3820f41dd5 --- /dev/null +++ b/net/ipv4/fou_nl.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: BSD-3-Clause +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/fou.yaml */ +/* YNL-GEN kernel source */ + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include "fou_nl.h" + +#include <linux/fou.h> + +/* Global operation policy for fou */ +const struct nla_policy fou_nl_policy[FOU_ATTR_IFINDEX + 1] = { + [FOU_ATTR_PORT] = { .type = NLA_U16, }, + [FOU_ATTR_AF] = { .type = NLA_U8, }, + [FOU_ATTR_IPPROTO] = { .type = NLA_U8, }, + [FOU_ATTR_TYPE] = { .type = NLA_U8, }, + [FOU_ATTR_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG, }, + [FOU_ATTR_LOCAL_V4] = { .type = NLA_U32, }, + [FOU_ATTR_LOCAL_V6] = { .len = 16, }, + [FOU_ATTR_PEER_V4] = { .type = NLA_U32, }, + [FOU_ATTR_PEER_V6] = { .len = 16, }, + [FOU_ATTR_PEER_PORT] = { .type = NLA_U16, }, + [FOU_ATTR_IFINDEX] = { .type = NLA_S32, }, +}; + +/* Ops table for fou */ +const struct genl_small_ops fou_nl_ops[3] = { + { + .cmd = FOU_CMD_ADD, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = fou_nl_add_doit, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = FOU_CMD_DEL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = fou_nl_del_doit, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = FOU_CMD_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = fou_nl_get_doit, + .dumpit = fou_nl_get_dumpit, + }, +}; diff --git a/net/ipv4/fou_nl.h b/net/ipv4/fou_nl.h new file mode 100644 index 000000000000..b7a68121ce6f --- /dev/null +++ b/net/ipv4/fou_nl.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/fou.yaml */ +/* YNL-GEN kernel header */ + +#ifndef _LINUX_FOU_GEN_H +#define _LINUX_FOU_GEN_H + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include <linux/fou.h> + +/* Global operation policy for fou */ +extern const struct nla_policy fou_nl_policy[FOU_ATTR_IFINDEX + 1]; + +/* Ops table for fou */ +extern const struct genl_small_ops fou_nl_ops[3]; + +int fou_nl_add_doit(struct sk_buff *skb, struct genl_info *info); +int fou_nl_del_doit(struct sk_buff *skb, struct genl_info *info); +int fou_nl_get_doit(struct sk_buff *skb, struct genl_info *info); +int fou_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); + +#endif /* _LINUX_FOU_GEN_H */ diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 46aa2d65e40a..8cebb476b3ab 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -296,6 +296,7 @@ static bool icmpv4_global_allow(struct net *net, int type, int code) if (icmp_global_allow()) return true; + __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL); return false; } @@ -325,6 +326,8 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, if (peer) inet_putpeer(peer); out: + if (!rc) + __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST); return rc; } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index f2c43f67187d..7d206a10ad14 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -117,7 +117,7 @@ bool inet_rcv_saddr_any(const struct sock *sk) return !sk->sk_rcv_saddr; } -void inet_get_local_port_range(struct net *net, int *low, int *high) +void inet_get_local_port_range(const struct net *net, int *low, int *high) { unsigned int seq; @@ -130,6 +130,27 @@ void inet_get_local_port_range(struct net *net, int *low, int *high) } EXPORT_SYMBOL(inet_get_local_port_range); +void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct net *net = sock_net(sk); + int lo, hi, sk_lo, sk_hi; + + inet_get_local_port_range(net, &lo, &hi); + + sk_lo = inet->local_port_range.lo; + sk_hi = inet->local_port_range.hi; + + if (unlikely(lo <= sk_lo && sk_lo <= hi)) + lo = sk_lo; + if (unlikely(lo <= sk_hi && sk_hi <= hi)) + hi = sk_hi; + + *low = lo; + *high = hi; +} +EXPORT_SYMBOL(inet_sk_get_local_port_range); + static bool inet_use_bhash2_on_bind(const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) @@ -316,7 +337,7 @@ inet_csk_find_open_port(const struct sock *sk, struct inet_bind_bucket **tb_ret, ports_exhausted: attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0; other_half_scan: - inet_get_local_port_range(net, &low, &high); + inet_sk_get_local_port_range(sk, &low, &high); high++; /* [32768, 60999] -> [32768, 61000[ */ if (high - low < 4) attempt_half = 0; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index f58d73888638..e41fdc38ce19 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -1008,17 +1008,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, u32 index; if (port) { - head = &hinfo->bhash[inet_bhashfn(net, port, - hinfo->bhash_size)]; - tb = inet_csk(sk)->icsk_bind_hash; - spin_lock_bh(&head->lock); - if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - inet_ehash_nolisten(sk, NULL, NULL); - spin_unlock_bh(&head->lock); - return 0; - } - spin_unlock(&head->lock); - /* No definite answer... Walk to established hash table */ + local_bh_disable(); ret = check_established(death_row, sk, port, NULL); local_bh_enable(); return ret; @@ -1026,7 +1016,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, l3mdev = inet_sk_bound_l3mdev(sk); - inet_get_local_port_range(net, &low, &high); + inet_sk_get_local_port_range(sk, &low, &high); high++; /* [32768, 60999] -> [32768, 61000[ */ remaining = high - low; if (likely(remaining > 1)) diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index e880ce77322a..fe9ead9ee863 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -511,7 +511,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto csum_error; - len = ntohs(iph->tot_len); + len = iph_totlen(skb, iph); if (skb->len < len) { drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 922c87ef1ab5..4e4e308c3230 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -100,7 +100,7 @@ int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { struct iphdr *iph = ip_hdr(skb); - iph->tot_len = htons(skb->len); + iph_set_totlen(iph, skb->len); ip_send_check(iph); /* if egress device is enslaved to an L3 master device pass the diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 9f92ae35bb01..b511ff0adc0a 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -923,6 +923,7 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, case IP_CHECKSUM: case IP_RECVFRAGSIZE: case IP_RECVERR_RFC4884: + case IP_LOCAL_PORT_RANGE: if (optlen >= sizeof(int)) { if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; @@ -1365,6 +1366,20 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, WRITE_ONCE(inet->min_ttl, val); break; + case IP_LOCAL_PORT_RANGE: + { + const __u16 lo = val; + const __u16 hi = val >> 16; + + if (optlen != sizeof(__u32)) + goto e_inval; + if (lo != 0 && hi != 0 && lo > hi) + goto e_inval; + + inet->local_port_range.lo = lo; + inet->local_port_range.hi = hi; + break; + } default: err = -ENOPROTOOPT; break; @@ -1743,6 +1758,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_MINTTL: val = inet->min_ttl; break; + case IP_LOCAL_PORT_RANGE: + val = inet->local_port_range.hi << 16 | inet->local_port_range.lo; + break; default: sockopt_release_sock(sk); return -ENOPROTOOPT; diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index aab384126f61..f71a7e9a7de6 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -259,20 +259,6 @@ config IP_NF_MANGLE To compile it as a module, choose M here. If unsure, say N. -config IP_NF_TARGET_CLUSTERIP - tristate "CLUSTERIP target support" - depends on IP_NF_MANGLE - depends on NF_CONNTRACK - depends on NETFILTER_ADVANCED - select NF_CONNTRACK_MARK - select NETFILTER_FAMILY_ARP - help - The CLUSTERIP target allows you to build load-balancing clusters of - network servers without having a dedicated load-balancing - router/server/switch. - - To compile it as a module, choose M here. If unsure, say N. - config IP_NF_TARGET_ECN tristate "ECN target support" depends on IP_NF_MANGLE diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 93bad1184251..5a26f9de1ab9 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -39,7 +39,6 @@ obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o obj-$(CONFIG_IP_NF_MATCH_RPFILTER) += ipt_rpfilter.o # targets -obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o obj-$(CONFIG_IP_NF_TARGET_SYNPROXY) += ipt_SYNPROXY.o diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c deleted file mode 100644 index b3cc416ed292..000000000000 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ /dev/null @@ -1,929 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* Cluster IP hashmark target - * (C) 2003-2004 by Harald Welte <laforge@netfilter.org> - * based on ideas of Fabio Olive Leite <olive@unixforge.org> - * - * Development of this code funded by SuSE Linux AG, https://www.suse.com/ - */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/module.h> -#include <linux/proc_fs.h> -#include <linux/jhash.h> -#include <linux/bitops.h> -#include <linux/skbuff.h> -#include <linux/slab.h> -#include <linux/ip.h> -#include <linux/tcp.h> -#include <linux/udp.h> -#include <linux/icmp.h> -#include <linux/if_arp.h> -#include <linux/seq_file.h> -#include <linux/refcount.h> -#include <linux/netfilter_arp.h> -#include <linux/netfilter/x_tables.h> -#include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h> -#include <net/netfilter/nf_conntrack.h> -#include <net/net_namespace.h> -#include <net/netns/generic.h> -#include <net/checksum.h> -#include <net/ip.h> - -#define CLUSTERIP_VERSION "0.8" - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_DESCRIPTION("Xtables: CLUSTERIP target"); - -struct clusterip_config { - struct list_head list; /* list of all configs */ - refcount_t refcount; /* reference count */ - refcount_t entries; /* number of entries/rules - * referencing us */ - - __be32 clusterip; /* the IP address */ - u_int8_t clustermac[ETH_ALEN]; /* the MAC address */ - int ifindex; /* device ifindex */ - u_int16_t num_total_nodes; /* total number of nodes */ - unsigned long local_nodes; /* node number array */ - -#ifdef CONFIG_PROC_FS - struct proc_dir_entry *pde; /* proc dir entry */ -#endif - enum clusterip_hashmode hash_mode; /* which hashing mode */ - u_int32_t hash_initval; /* hash initialization */ - struct rcu_head rcu; /* for call_rcu */ - struct net *net; /* netns for pernet list */ - char ifname[IFNAMSIZ]; /* device ifname */ -}; - -#ifdef CONFIG_PROC_FS -static const struct proc_ops clusterip_proc_ops; -#endif - -struct clusterip_net { - struct list_head configs; - /* lock protects the configs list */ - spinlock_t lock; - - bool clusterip_deprecated_warning; -#ifdef CONFIG_PROC_FS - struct proc_dir_entry *procdir; - /* mutex protects the config->pde*/ - struct mutex mutex; -#endif - unsigned int hook_users; -}; - -static unsigned int clusterip_arp_mangle(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); - -static const struct nf_hook_ops cip_arp_ops = { - .hook = clusterip_arp_mangle, - .pf = NFPROTO_ARP, - .hooknum = NF_ARP_OUT, - .priority = -1 -}; - -static unsigned int clusterip_net_id __read_mostly; -static inline struct clusterip_net *clusterip_pernet(struct net *net) -{ - return net_generic(net, clusterip_net_id); -} - -static inline void -clusterip_config_get(struct clusterip_config *c) -{ - refcount_inc(&c->refcount); -} - -static void clusterip_config_rcu_free(struct rcu_head *head) -{ - struct clusterip_config *config; - struct net_device *dev; - - config = container_of(head, struct clusterip_config, rcu); - dev = dev_get_by_name(config->net, config->ifname); - if (dev) { - dev_mc_del(dev, config->clustermac); - dev_put(dev); - } - kfree(config); -} - -static inline void -clusterip_config_put(struct clusterip_config *c) -{ - if (refcount_dec_and_test(&c->refcount)) - call_rcu(&c->rcu, clusterip_config_rcu_free); -} - -/* decrease the count of entries using/referencing this config. If last - * entry(rule) is removed, remove the config from lists, but don't free it - * yet, since proc-files could still be holding references */ -static inline void -clusterip_config_entry_put(struct clusterip_config *c) -{ - struct clusterip_net *cn = clusterip_pernet(c->net); - - local_bh_disable(); - if (refcount_dec_and_lock(&c->entries, &cn->lock)) { - list_del_rcu(&c->list); - spin_unlock(&cn->lock); - local_bh_enable(); - /* In case anyone still accesses the file, the open/close - * functions are also incrementing the refcount on their own, - * so it's safe to remove the entry even if it's in use. */ -#ifdef CONFIG_PROC_FS - mutex_lock(&cn->mutex); - if (cn->procdir) - proc_remove(c->pde); - mutex_unlock(&cn->mutex); -#endif - return; - } - local_bh_enable(); -} - -static struct clusterip_config * -__clusterip_config_find(struct net *net, __be32 clusterip) -{ - struct clusterip_config *c; - struct clusterip_net *cn = clusterip_pernet(net); - - list_for_each_entry_rcu(c, &cn->configs, list) { - if (c->clusterip == clusterip) - return c; - } - - return NULL; -} - -static inline struct clusterip_config * -clusterip_config_find_get(struct net *net, __be32 clusterip, int entry) -{ - struct clusterip_config *c; - - rcu_read_lock_bh(); - c = __clusterip_config_find(net, clusterip); - if (c) { -#ifdef CONFIG_PROC_FS - if (!c->pde) - c = NULL; - else -#endif - if (unlikely(!refcount_inc_not_zero(&c->refcount))) - c = NULL; - else if (entry) { - if (unlikely(!refcount_inc_not_zero(&c->entries))) { - clusterip_config_put(c); - c = NULL; - } - } - } - rcu_read_unlock_bh(); - - return c; -} - -static void -clusterip_config_init_nodelist(struct clusterip_config *c, - const struct ipt_clusterip_tgt_info *i) -{ - int n; - - for (n = 0; n < i->num_local_nodes; n++) - set_bit(i->local_nodes[n] - 1, &c->local_nodes); -} - -static int -clusterip_netdev_event(struct notifier_block *this, unsigned long event, - void *ptr) -{ - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct net *net = dev_net(dev); - struct clusterip_net *cn = clusterip_pernet(net); - struct clusterip_config *c; - - spin_lock_bh(&cn->lock); - list_for_each_entry_rcu(c, &cn->configs, list) { - switch (event) { - case NETDEV_REGISTER: - if (!strcmp(dev->name, c->ifname)) { - c->ifindex = dev->ifindex; - dev_mc_add(dev, c->clustermac); - } - break; - case NETDEV_UNREGISTER: - if (dev->ifindex == c->ifindex) { - dev_mc_del(dev, c->clustermac); - c->ifindex = -1; - } - break; - case NETDEV_CHANGENAME: - if (!strcmp(dev->name, c->ifname)) { - c->ifindex = dev->ifindex; - dev_mc_add(dev, c->clustermac); - } else if (dev->ifindex == c->ifindex) { - dev_mc_del(dev, c->clustermac); - c->ifindex = -1; - } - break; - } - } - spin_unlock_bh(&cn->lock); - - return NOTIFY_DONE; -} - -static struct clusterip_config * -clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i, - __be32 ip, const char *iniface) -{ - struct clusterip_net *cn = clusterip_pernet(net); - struct clusterip_config *c; - struct net_device *dev; - int err; - - if (iniface[0] == '\0') { - pr_info("Please specify an interface name\n"); - return ERR_PTR(-EINVAL); - } - - c = kzalloc(sizeof(*c), GFP_ATOMIC); - if (!c) - return ERR_PTR(-ENOMEM); - - dev = dev_get_by_name(net, iniface); - if (!dev) { - pr_info("no such interface %s\n", iniface); - kfree(c); - return ERR_PTR(-ENOENT); - } - c->ifindex = dev->ifindex; - strcpy(c->ifname, dev->name); - memcpy(&c->clustermac, &i->clustermac, ETH_ALEN); - dev_mc_add(dev, c->clustermac); - dev_put(dev); - - c->clusterip = ip; - c->num_total_nodes = i->num_total_nodes; - clusterip_config_init_nodelist(c, i); - c->hash_mode = i->hash_mode; - c->hash_initval = i->hash_initval; - c->net = net; - refcount_set(&c->refcount, 1); - - spin_lock_bh(&cn->lock); - if (__clusterip_config_find(net, ip)) { - err = -EBUSY; - goto out_config_put; - } - - list_add_rcu(&c->list, &cn->configs); - spin_unlock_bh(&cn->lock); - -#ifdef CONFIG_PROC_FS - { - char buffer[16]; - - /* create proc dir entry */ - sprintf(buffer, "%pI4", &ip); - mutex_lock(&cn->mutex); - c->pde = proc_create_data(buffer, 0600, - cn->procdir, - &clusterip_proc_ops, c); - mutex_unlock(&cn->mutex); - if (!c->pde) { - err = -ENOMEM; - goto err; - } - } -#endif - - refcount_set(&c->entries, 1); - return c; - -#ifdef CONFIG_PROC_FS -err: -#endif - spin_lock_bh(&cn->lock); - list_del_rcu(&c->list); -out_config_put: - spin_unlock_bh(&cn->lock); - clusterip_config_put(c); - return ERR_PTR(err); -} - -#ifdef CONFIG_PROC_FS -static int -clusterip_add_node(struct clusterip_config *c, u_int16_t nodenum) -{ - - if (nodenum == 0 || - nodenum > c->num_total_nodes) - return 1; - - /* check if we already have this number in our bitfield */ - if (test_and_set_bit(nodenum - 1, &c->local_nodes)) - return 1; - - return 0; -} - -static bool -clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum) -{ - if (nodenum == 0 || - nodenum > c->num_total_nodes) - return true; - - if (test_and_clear_bit(nodenum - 1, &c->local_nodes)) - return false; - - return true; -} -#endif - -static inline u_int32_t -clusterip_hashfn(const struct sk_buff *skb, - const struct clusterip_config *config) -{ - const struct iphdr *iph = ip_hdr(skb); - unsigned long hashval; - u_int16_t sport = 0, dport = 0; - int poff; - - poff = proto_ports_offset(iph->protocol); - if (poff >= 0) { - const u_int16_t *ports; - u16 _ports[2]; - - ports = skb_header_pointer(skb, iph->ihl * 4 + poff, 4, _ports); - if (ports) { - sport = ports[0]; - dport = ports[1]; - } - } else { - net_info_ratelimited("unknown protocol %u\n", iph->protocol); - } - - switch (config->hash_mode) { - case CLUSTERIP_HASHMODE_SIP: - hashval = jhash_1word(ntohl(iph->saddr), - config->hash_initval); - break; - case CLUSTERIP_HASHMODE_SIP_SPT: - hashval = jhash_2words(ntohl(iph->saddr), sport, - config->hash_initval); - break; - case CLUSTERIP_HASHMODE_SIP_SPT_DPT: - hashval = jhash_3words(ntohl(iph->saddr), sport, dport, - config->hash_initval); - break; - default: - /* to make gcc happy */ - hashval = 0; - /* This cannot happen, unless the check function wasn't called - * at rule load time */ - pr_info("unknown mode %u\n", config->hash_mode); - BUG(); - break; - } - - /* node numbers are 1..n, not 0..n */ - return reciprocal_scale(hashval, config->num_total_nodes) + 1; -} - -static inline int -clusterip_responsible(const struct clusterip_config *config, u_int32_t hash) -{ - return test_bit(hash - 1, &config->local_nodes); -} - -/*********************************************************************** - * IPTABLES TARGET - ***********************************************************************/ - -static unsigned int -clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par) -{ - const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - u_int32_t hash; - - /* don't need to clusterip_config_get() here, since refcount - * is only decremented by destroy() - and ip_tables guarantees - * that the ->target() function isn't called after ->destroy() */ - - ct = nf_ct_get(skb, &ctinfo); - if (ct == NULL) - return NF_DROP; - - /* special case: ICMP error handling. conntrack distinguishes between - * error messages (RELATED) and information requests (see below) */ - if (ip_hdr(skb)->protocol == IPPROTO_ICMP && - (ctinfo == IP_CT_RELATED || - ctinfo == IP_CT_RELATED_REPLY)) - return XT_CONTINUE; - - /* nf_conntrack_proto_icmp guarantees us that we only have ICMP_ECHO, - * TIMESTAMP, INFO_REQUEST or ICMP_ADDRESS type icmp packets from here - * on, which all have an ID field [relevant for hashing]. */ - - hash = clusterip_hashfn(skb, cipinfo->config); - - switch (ctinfo) { - case IP_CT_NEW: - WRITE_ONCE(ct->mark, hash); - break; - case IP_CT_RELATED: - case IP_CT_RELATED_REPLY: - /* FIXME: we don't handle expectations at the moment. - * They can arrive on a different node than - * the master connection (e.g. FTP passive mode) */ - case IP_CT_ESTABLISHED: - case IP_CT_ESTABLISHED_REPLY: - break; - default: /* Prevent gcc warnings */ - break; - } - -#ifdef DEBUG - nf_ct_dump_tuple_ip(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); -#endif - pr_debug("hash=%u ct_hash=%u ", hash, READ_ONCE(ct->mark)); - if (!clusterip_responsible(cipinfo->config, hash)) { - pr_debug("not responsible\n"); - return NF_DROP; - } - pr_debug("responsible\n"); - - /* despite being received via linklayer multicast, this is - * actually a unicast IP packet. TCP doesn't like PACKET_MULTICAST */ - skb->pkt_type = PACKET_HOST; - - return XT_CONTINUE; -} - -static int clusterip_tg_check(const struct xt_tgchk_param *par) -{ - struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; - struct clusterip_net *cn = clusterip_pernet(par->net); - const struct ipt_entry *e = par->entryinfo; - struct clusterip_config *config; - int ret, i; - - if (par->nft_compat) { - pr_err("cannot use CLUSTERIP target from nftables compat\n"); - return -EOPNOTSUPP; - } - - if (cn->hook_users == UINT_MAX) - return -EOVERFLOW; - - if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP && - cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT && - cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) { - pr_info("unknown mode %u\n", cipinfo->hash_mode); - return -EINVAL; - - } - if (e->ip.dmsk.s_addr != htonl(0xffffffff) || - e->ip.dst.s_addr == 0) { - pr_info("Please specify destination IP\n"); - return -EINVAL; - } - if (cipinfo->num_local_nodes > ARRAY_SIZE(cipinfo->local_nodes)) { - pr_info("bad num_local_nodes %u\n", cipinfo->num_local_nodes); - return -EINVAL; - } - for (i = 0; i < cipinfo->num_local_nodes; i++) { - if (cipinfo->local_nodes[i] - 1 >= - sizeof(config->local_nodes) * 8) { - pr_info("bad local_nodes[%d] %u\n", - i, cipinfo->local_nodes[i]); - return -EINVAL; - } - } - - config = clusterip_config_find_get(par->net, e->ip.dst.s_addr, 1); - if (!config) { - if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) { - pr_info("no config found for %pI4, need 'new'\n", - &e->ip.dst.s_addr); - return -EINVAL; - } else { - config = clusterip_config_init(par->net, cipinfo, - e->ip.dst.s_addr, - e->ip.iniface); - if (IS_ERR(config)) - return PTR_ERR(config); - } - } else if (memcmp(&config->clustermac, &cipinfo->clustermac, ETH_ALEN)) { - clusterip_config_entry_put(config); - clusterip_config_put(config); - return -EINVAL; - } - - ret = nf_ct_netns_get(par->net, par->family); - if (ret < 0) { - pr_info("cannot load conntrack support for proto=%u\n", - par->family); - clusterip_config_entry_put(config); - clusterip_config_put(config); - return ret; - } - - if (cn->hook_users == 0) { - ret = nf_register_net_hook(par->net, &cip_arp_ops); - - if (ret < 0) { - clusterip_config_entry_put(config); - clusterip_config_put(config); - nf_ct_netns_put(par->net, par->family); - return ret; - } - } - - cn->hook_users++; - - if (!cn->clusterip_deprecated_warning) { - pr_info("ipt_CLUSTERIP is deprecated and it will removed soon, " - "use xt_cluster instead\n"); - cn->clusterip_deprecated_warning = true; - } - - cipinfo->config = config; - return ret; -} - -/* drop reference count of cluster config when rule is deleted */ -static void clusterip_tg_destroy(const struct xt_tgdtor_param *par) -{ - const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; - struct clusterip_net *cn = clusterip_pernet(par->net); - - /* if no more entries are referencing the config, remove it - * from the list and destroy the proc entry */ - clusterip_config_entry_put(cipinfo->config); - - clusterip_config_put(cipinfo->config); - - nf_ct_netns_put(par->net, par->family); - cn->hook_users--; - - if (cn->hook_users == 0) - nf_unregister_net_hook(par->net, &cip_arp_ops); -} - -#ifdef CONFIG_NETFILTER_XTABLES_COMPAT -struct compat_ipt_clusterip_tgt_info -{ - u_int32_t flags; - u_int8_t clustermac[6]; - u_int16_t num_total_nodes; - u_int16_t num_local_nodes; - u_int16_t local_nodes[CLUSTERIP_MAX_NODES]; - u_int32_t hash_mode; - u_int32_t hash_initval; - compat_uptr_t config; -}; -#endif /* CONFIG_NETFILTER_XTABLES_COMPAT */ - -static struct xt_target clusterip_tg_reg __read_mostly = { - .name = "CLUSTERIP", - .family = NFPROTO_IPV4, - .target = clusterip_tg, - .checkentry = clusterip_tg_check, - .destroy = clusterip_tg_destroy, - .targetsize = sizeof(struct ipt_clusterip_tgt_info), - .usersize = offsetof(struct ipt_clusterip_tgt_info, config), -#ifdef CONFIG_NETFILTER_XTABLES_COMPAT - .compatsize = sizeof(struct compat_ipt_clusterip_tgt_info), -#endif /* CONFIG_NETFILTER_XTABLES_COMPAT */ - .me = THIS_MODULE -}; - - -/*********************************************************************** - * ARP MANGLING CODE - ***********************************************************************/ - -/* hardcoded for 48bit ethernet and 32bit ipv4 addresses */ -struct arp_payload { - u_int8_t src_hw[ETH_ALEN]; - __be32 src_ip; - u_int8_t dst_hw[ETH_ALEN]; - __be32 dst_ip; -} __packed; - -#ifdef DEBUG -static void arp_print(struct arp_payload *payload) -{ -#define HBUFFERLEN 30 - char hbuffer[HBUFFERLEN]; - int j, k; - - for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < ETH_ALEN; j++) { - hbuffer[k++] = hex_asc_hi(payload->src_hw[j]); - hbuffer[k++] = hex_asc_lo(payload->src_hw[j]); - hbuffer[k++] = ':'; - } - hbuffer[--k] = '\0'; - - pr_debug("src %pI4@%s, dst %pI4\n", - &payload->src_ip, hbuffer, &payload->dst_ip); -} -#endif - -static unsigned int -clusterip_arp_mangle(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct arphdr *arp = arp_hdr(skb); - struct arp_payload *payload; - struct clusterip_config *c; - struct net *net = state->net; - - /* we don't care about non-ethernet and non-ipv4 ARP */ - if (arp->ar_hrd != htons(ARPHRD_ETHER) || - arp->ar_pro != htons(ETH_P_IP) || - arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN) - return NF_ACCEPT; - - /* we only want to mangle arp requests and replies */ - if (arp->ar_op != htons(ARPOP_REPLY) && - arp->ar_op != htons(ARPOP_REQUEST)) - return NF_ACCEPT; - - payload = (void *)(arp+1); - - /* if there is no clusterip configuration for the arp reply's - * source ip, we don't want to mangle it */ - c = clusterip_config_find_get(net, payload->src_ip, 0); - if (!c) - return NF_ACCEPT; - - /* normally the linux kernel always replies to arp queries of - * addresses on different interfacs. However, in the CLUSTERIP case - * this wouldn't work, since we didn't subscribe the mcast group on - * other interfaces */ - if (c->ifindex != state->out->ifindex) { - pr_debug("not mangling arp reply on different interface: cip'%d'-skb'%d'\n", - c->ifindex, state->out->ifindex); - clusterip_config_put(c); - return NF_ACCEPT; - } - - /* mangle reply hardware address */ - memcpy(payload->src_hw, c->clustermac, arp->ar_hln); - -#ifdef DEBUG - pr_debug("mangled arp reply: "); - arp_print(payload); -#endif - - clusterip_config_put(c); - - return NF_ACCEPT; -} - -/*********************************************************************** - * PROC DIR HANDLING - ***********************************************************************/ - -#ifdef CONFIG_PROC_FS - -struct clusterip_seq_position { - unsigned int pos; /* position */ - unsigned int weight; /* number of bits set == size */ - unsigned int bit; /* current bit */ - unsigned long val; /* current value */ -}; - -static void *clusterip_seq_start(struct seq_file *s, loff_t *pos) -{ - struct clusterip_config *c = s->private; - unsigned int weight; - u_int32_t local_nodes; - struct clusterip_seq_position *idx; - - /* FIXME: possible race */ - local_nodes = c->local_nodes; - weight = hweight32(local_nodes); - if (*pos >= weight) - return NULL; - - idx = kmalloc(sizeof(struct clusterip_seq_position), GFP_KERNEL); - if (!idx) - return ERR_PTR(-ENOMEM); - - idx->pos = *pos; - idx->weight = weight; - idx->bit = ffs(local_nodes); - idx->val = local_nodes; - clear_bit(idx->bit - 1, &idx->val); - - return idx; -} - -static void *clusterip_seq_next(struct seq_file *s, void *v, loff_t *pos) -{ - struct clusterip_seq_position *idx = v; - - *pos = ++idx->pos; - if (*pos >= idx->weight) { - kfree(v); - return NULL; - } - idx->bit = ffs(idx->val); - clear_bit(idx->bit - 1, &idx->val); - return idx; -} - -static void clusterip_seq_stop(struct seq_file *s, void *v) -{ - if (!IS_ERR(v)) - kfree(v); -} - -static int clusterip_seq_show(struct seq_file *s, void *v) -{ - struct clusterip_seq_position *idx = v; - - if (idx->pos != 0) - seq_putc(s, ','); - - seq_printf(s, "%u", idx->bit); - - if (idx->pos == idx->weight - 1) - seq_putc(s, '\n'); - - return 0; -} - -static const struct seq_operations clusterip_seq_ops = { - .start = clusterip_seq_start, - .next = clusterip_seq_next, - .stop = clusterip_seq_stop, - .show = clusterip_seq_show, -}; - -static int clusterip_proc_open(struct inode *inode, struct file *file) -{ - int ret = seq_open(file, &clusterip_seq_ops); - - if (!ret) { - struct seq_file *sf = file->private_data; - struct clusterip_config *c = pde_data(inode); - - sf->private = c; - - clusterip_config_get(c); - } - - return ret; -} - -static int clusterip_proc_release(struct inode *inode, struct file *file) -{ - struct clusterip_config *c = pde_data(inode); - int ret; - - ret = seq_release(inode, file); - - if (!ret) - clusterip_config_put(c); - - return ret; -} - -static ssize_t clusterip_proc_write(struct file *file, const char __user *input, - size_t size, loff_t *ofs) -{ - struct clusterip_config *c = pde_data(file_inode(file)); -#define PROC_WRITELEN 10 - char buffer[PROC_WRITELEN+1]; - unsigned long nodenum; - int rc; - - if (size > PROC_WRITELEN) - return -EIO; - if (copy_from_user(buffer, input, size)) - return -EFAULT; - buffer[size] = 0; - - if (*buffer == '+') { - rc = kstrtoul(buffer+1, 10, &nodenum); - if (rc) - return rc; - if (clusterip_add_node(c, nodenum)) - return -ENOMEM; - } else if (*buffer == '-') { - rc = kstrtoul(buffer+1, 10, &nodenum); - if (rc) - return rc; - if (clusterip_del_node(c, nodenum)) - return -ENOENT; - } else - return -EIO; - - return size; -} - -static const struct proc_ops clusterip_proc_ops = { - .proc_open = clusterip_proc_open, - .proc_read = seq_read, - .proc_write = clusterip_proc_write, - .proc_lseek = seq_lseek, - .proc_release = clusterip_proc_release, -}; - -#endif /* CONFIG_PROC_FS */ - -static int clusterip_net_init(struct net *net) -{ - struct clusterip_net *cn = clusterip_pernet(net); - - INIT_LIST_HEAD(&cn->configs); - - spin_lock_init(&cn->lock); - -#ifdef CONFIG_PROC_FS - cn->procdir = proc_mkdir("ipt_CLUSTERIP", net->proc_net); - if (!cn->procdir) { - pr_err("Unable to proc dir entry\n"); - return -ENOMEM; - } - mutex_init(&cn->mutex); -#endif /* CONFIG_PROC_FS */ - - return 0; -} - -static void clusterip_net_exit(struct net *net) -{ -#ifdef CONFIG_PROC_FS - struct clusterip_net *cn = clusterip_pernet(net); - - mutex_lock(&cn->mutex); - proc_remove(cn->procdir); - cn->procdir = NULL; - mutex_unlock(&cn->mutex); -#endif -} - -static struct pernet_operations clusterip_net_ops = { - .init = clusterip_net_init, - .exit = clusterip_net_exit, - .id = &clusterip_net_id, - .size = sizeof(struct clusterip_net), -}; - -static struct notifier_block cip_netdev_notifier = { - .notifier_call = clusterip_netdev_event -}; - -static int __init clusterip_tg_init(void) -{ - int ret; - - ret = register_pernet_subsys(&clusterip_net_ops); - if (ret < 0) - return ret; - - ret = xt_register_target(&clusterip_tg_reg); - if (ret < 0) - goto cleanup_subsys; - - ret = register_netdevice_notifier(&cip_netdev_notifier); - if (ret < 0) - goto unregister_target; - - pr_info("ClusterIP Version %s loaded successfully\n", - CLUSTERIP_VERSION); - - return 0; - -unregister_target: - xt_unregister_target(&clusterip_tg_reg); -cleanup_subsys: - unregister_pernet_subsys(&clusterip_net_ops); - return ret; -} - -static void __exit clusterip_tg_exit(void) -{ - pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION); - - unregister_netdevice_notifier(&cip_netdev_notifier); - xt_unregister_target(&clusterip_tg_reg); - unregister_pernet_subsys(&clusterip_net_ops); - - /* Wait for completion of call_rcu()'s (clusterip_config_rcu_free) */ - rcu_barrier(); -} - -module_init(clusterip_tg_init); -module_exit(clusterip_tg_exit); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index f88daace9de3..eaf1d3113b62 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -353,7 +353,7 @@ static void icmp_put(struct seq_file *seq) seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors"); for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " In%s", icmpmibmap[i].name); - seq_puts(seq, " OutMsgs OutErrors"); + seq_puts(seq, " OutMsgs OutErrors OutRateLimitGlobal OutRateLimitHost"); for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " Out%s", icmpmibmap[i].name); seq_printf(seq, "\nIcmp: %lu %lu %lu", @@ -363,9 +363,11 @@ static void icmp_put(struct seq_file *seq) for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " %lu", atomic_long_read(ptr + icmpmibmap[i].index)); - seq_printf(seq, " %lu %lu", + seq_printf(seq, " %lu %lu %lu %lu", snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), - snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); + snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS), + snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_RATELIMITGLOBAL), + snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_RATELIMITHOST)); for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " %lu", atomic_long_read(ptr + (icmpmibmap[i].index | 0x100))); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 006c1f0ed8b4..94df935ee0c5 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -93,7 +93,7 @@ int raw_hash_sk(struct sock *sk) struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; struct hlist_nulls_head *hlist; - hlist = &h->ht[inet_sk(sk)->inet_num & (RAW_HTABLE_SIZE - 1)]; + hlist = &h->ht[raw_hashfunc(sock_net(sk), inet_sk(sk)->inet_num)]; spin_lock(&h->lock); __sk_nulls_add_node_rcu(sk, hlist); @@ -160,9 +160,9 @@ static int icmp_filter(const struct sock *sk, const struct sk_buff *skb) * RFC 1122: SHOULD pass TOS value up to the transport layer. * -> It does. And not only TOS, but all IP header. */ -static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) +static int raw_v4_input(struct net *net, struct sk_buff *skb, + const struct iphdr *iph, int hash) { - struct net *net = dev_net(skb->dev); struct hlist_nulls_head *hlist; struct hlist_nulls_node *hnode; int sdif = inet_sdif(skb); @@ -193,9 +193,10 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) int raw_local_deliver(struct sk_buff *skb, int protocol) { - int hash = protocol & (RAW_HTABLE_SIZE - 1); + struct net *net = dev_net(skb->dev); - return raw_v4_input(skb, ip_hdr(skb), hash); + return raw_v4_input(net, skb, ip_hdr(skb), + raw_hashfunc(net, protocol)); } static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) @@ -271,7 +272,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) struct sock *sk; int hash; - hash = protocol & (RAW_HTABLE_SIZE - 1); + hash = raw_hashfunc(net, protocol); hlist = &raw_v4_hashinfo.ht[hash]; rcu_read_lock(); @@ -287,11 +288,13 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb) { + enum skb_drop_reason reason; + /* Charge it to the socket. */ ipv4_pktinfo_prepare(sk, skb); - if (sock_queue_rcv_skb(sk, skb) < 0) { - kfree_skb(skb); + if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) { + kfree_skb_reason(skb, reason); return NET_RX_DROP; } @@ -302,7 +305,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb) { if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { atomic_inc(&sk->sk_drops); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_XFRM_POLICY); return NET_RX_DROP; } nf_reset_ct(skb); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 8320d0ecb13a..ea370afa70ed 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2102,6 +2102,7 @@ process: /* min_ttl can be changed concurrently from do_ip_setsockopt() */ if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); + drop_reason = SKB_DROP_REASON_TCP_MINTTL; goto discard_and_relse; } } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 9592fe3e444a..c605d171eb2d 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -248,7 +248,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, int low, high, remaining; unsigned int rand; - inet_get_local_port_range(net, &low, &high); + inet_sk_get_local_port_range(sk, &low, &high); remaining = (high - low) + 1; rand = get_random_u32(); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 9d92d51c4757..c9346515e24d 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -183,6 +183,7 @@ static bool icmpv6_global_allow(struct net *net, int type) if (icmp_global_allow()) return true; + __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL); return false; } @@ -224,6 +225,9 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, if (peer) inet_putpeer(peer); } + if (!res) + __ICMP6_INC_STATS(net, ip6_dst_idev(dst), + ICMP6_MIB_RATELIMITHOST); dst_release(dst); return res; } @@ -328,7 +332,6 @@ static void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt { struct ipv6hdr *iph = ipv6_hdr(skb); struct ipv6_destopt_hao *hao; - struct in6_addr tmp; int off; if (opt->dsthao) { @@ -336,9 +339,7 @@ static void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt if (likely(off >= 0)) { hao = (struct ipv6_destopt_hao *) (skb_network_header(skb) + off); - tmp = iph->saddr; - iph->saddr = hao->addr; - hao->addr = tmp; + swap(iph->saddr, hao->addr); } } } diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index d6306aa46bb1..e20b3705c2d2 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -94,6 +94,7 @@ static const struct snmp_mib snmp6_icmp6_list[] = { SNMP_MIB_ITEM("Icmp6OutMsgs", ICMP6_MIB_OUTMSGS), SNMP_MIB_ITEM("Icmp6OutErrors", ICMP6_MIB_OUTERRORS), SNMP_MIB_ITEM("Icmp6InCsumErrors", ICMP6_MIB_CSUMERRORS), + SNMP_MIB_ITEM("Icmp6OutRateLimitHost", ICMP6_MIB_RATELIMITHOST), SNMP_MIB_SENTINEL }; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index ada087b50541..bac9ba747bde 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -152,7 +152,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) saddr = &ipv6_hdr(skb)->saddr; daddr = saddr + 1; - hash = nexthdr & (RAW_HTABLE_SIZE - 1); + hash = raw_hashfunc(net, nexthdr); hlist = &raw_v6_hashinfo.ht[hash]; rcu_read_lock(); sk_nulls_for_each(sk, hnode, hlist) { @@ -338,7 +338,7 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr, struct sock *sk; int hash; - hash = nexthdr & (RAW_HTABLE_SIZE - 1); + hash = raw_hashfunc(net, nexthdr); hlist = &raw_v6_hashinfo.ht[hash]; rcu_read_lock(); sk_nulls_for_each(sk, hnode, hlist) { @@ -355,17 +355,19 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr, static inline int rawv6_rcv_skb(struct sock *sk, struct sk_buff *skb) { + enum skb_drop_reason reason; + if ((raw6_sk(sk)->checksum || rcu_access_pointer(sk->sk_filter)) && skb_checksum_complete(skb)) { atomic_inc(&sk->sk_drops); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_SKB_CSUM); return NET_RX_DROP; } /* Charge it to the socket. */ skb_dst_drop(skb); - if (sock_queue_rcv_skb(sk, skb) < 0) { - kfree_skb(skb); + if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) { + kfree_skb_reason(skb, reason); return NET_RX_DROP; } @@ -386,7 +388,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) { atomic_inc(&sk->sk_drops); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_XFRM_POLICY); return NET_RX_DROP; } @@ -410,7 +412,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) if (inet->hdrincl) { if (skb_checksum_complete(skb)) { atomic_inc(&sk->sk_drops); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_SKB_CSUM); return NET_RX_DROP; } } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e74e0361fd92..c180c2ef17c5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -91,7 +91,7 @@ static struct dst_entry *ip6_negative_advice(struct dst_entry *); static void ip6_dst_destroy(struct dst_entry *); static void ip6_dst_ifdown(struct dst_entry *, struct net_device *dev, int how); -static int ip6_dst_gc(struct dst_ops *ops); +static void ip6_dst_gc(struct dst_ops *ops); static int ip6_pkt_discard(struct sk_buff *skb); static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); @@ -2593,9 +2593,10 @@ INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_output(struct net *net, return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags); } -struct dst_entry *ip6_route_output_flags_noref(struct net *net, - const struct sock *sk, - struct flowi6 *fl6, int flags) +static struct dst_entry *ip6_route_output_flags_noref(struct net *net, + const struct sock *sk, + struct flowi6 *fl6, + int flags) { bool any_src; @@ -2624,7 +2625,6 @@ struct dst_entry *ip6_route_output_flags_noref(struct net *net, return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); } -EXPORT_SYMBOL_GPL(ip6_route_output_flags_noref); struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, @@ -3284,23 +3284,17 @@ out: return dst; } -static int ip6_dst_gc(struct dst_ops *ops) +static void ip6_dst_gc(struct dst_ops *ops) { struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; - int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; unsigned int val; int entries; - entries = dst_entries_get_fast(ops); - if (entries > rt_max_size) - entries = dst_entries_get_slow(ops); - - if (time_after(rt_last_gc + rt_min_interval, jiffies) && - entries <= rt_max_size) + if (time_after(rt_last_gc + rt_min_interval, jiffies)) goto out; fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true); @@ -3310,7 +3304,6 @@ static int ip6_dst_gc(struct dst_ops *ops) out: val = atomic_read(&net->ipv6.ip6_rt_gc_expire); atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity)); - return entries > rt_max_size; } static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg, @@ -6512,7 +6505,7 @@ static int __net_init ip6_route_net_init(struct net *net) #endif net->ipv6.sysctl.flush_delay = 0; - net->ipv6.sysctl.ip6_rt_max_size = 4096; + net->ipv6.sysctl.ip6_rt_max_size = INT_MAX; net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c index ff691d9f4a04..b1c028df686e 100644 --- a/net/ipv6/rpl_iptunnel.c +++ b/net/ipv6/rpl_iptunnel.c @@ -13,7 +13,7 @@ #include <net/rpl.h> struct rpl_iptunnel_encap { - struct ipv6_rpl_sr_hdr srh[0]; + DECLARE_FLEX_ARRAY(struct ipv6_rpl_sr_hdr, srh); }; struct rpl_lwt { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 11b736a76bd7..543ee2167720 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1708,8 +1708,9 @@ process: if (static_branch_unlikely(&ip6_min_hopcount)) { /* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */ - if (hdr->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount)) { + if (unlikely(hdr->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount))) { __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); + drop_reason = SKB_DROP_REASON_TCP_MINTTL; goto discard_and_relse; } } diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 890a2423f559..cfe828bd7fc6 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -28,6 +28,7 @@ #include <net/netns/generic.h> #include <net/sock.h> #include <uapi/linux/kcm.h> +#include <trace/events/sock.h> unsigned int kcm_net_id; @@ -349,6 +350,8 @@ static void psock_data_ready(struct sock *sk) { struct kcm_psock *psock; + trace_sk_data_ready(sk); + read_lock_bh(&sk->sk_callback_lock); psock = (struct kcm_psock *)sk->sk_user_data; diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 672eff6f5d32..f5d43f42f6d8 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1252,6 +1252,21 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, prev_beacon_int = link_conf->beacon_int; link_conf->beacon_int = params->beacon_interval; + if (params->vht_cap) { + link_conf->vht_su_beamformer = + params->vht_cap->vht_cap_info & + cpu_to_le32(IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE); + link_conf->vht_su_beamformee = + params->vht_cap->vht_cap_info & + cpu_to_le32(IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE); + link_conf->vht_mu_beamformer = + params->vht_cap->vht_cap_info & + cpu_to_le32(IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE); + link_conf->vht_mu_beamformee = + params->vht_cap->vht_cap_info & + cpu_to_le32(IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE); + } + if (params->he_cap && params->he_oper) { link_conf->he_support = true; link_conf->htc_trig_based_pkt_ext = @@ -1266,6 +1281,21 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, changed |= BSS_CHANGED_HE_BSS_COLOR; } + if (params->he_cap) { + link_conf->he_su_beamformer = + params->he_cap->phy_cap_info[3] & + IEEE80211_HE_PHY_CAP3_SU_BEAMFORMER; + link_conf->he_su_beamformee = + params->he_cap->phy_cap_info[4] & + IEEE80211_HE_PHY_CAP4_SU_BEAMFORMEE; + link_conf->he_mu_beamformer = + params->he_cap->phy_cap_info[4] & + IEEE80211_HE_PHY_CAP4_MU_BEAMFORMER; + link_conf->he_full_ul_mumimo = + params->he_cap->phy_cap_info[2] & + IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO; + } + if (sdata->vif.type == NL80211_IFTYPE_AP && params->mbssid_config.tx_wdev) { err = ieee80211_set_ap_mbssid_options(sdata, @@ -2734,7 +2764,7 @@ static int ieee80211_scan(struct wiphy *wiphy, * If the scan has been forced (and the driver supports * forcing), don't care about being beaconing already. * This will create problems to the attached stations (e.g. all - * the frames sent while scanning on other channel will be + * the frames sent while scanning on other channel will be * lost) */ if (sdata->deflink.u.ap.beacon && @@ -4632,7 +4662,7 @@ void ieee80211_color_change_finish(struct ieee80211_vif *vif) EXPORT_SYMBOL_GPL(ieee80211_color_change_finish); void -ieeee80211_obss_color_collision_notify(struct ieee80211_vif *vif, +ieee80211_obss_color_collision_notify(struct ieee80211_vif *vif, u64 color_bitmap, gfp_t gfp) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); @@ -4642,7 +4672,7 @@ ieeee80211_obss_color_collision_notify(struct ieee80211_vif *vif, cfg80211_obss_color_collision_notify(sdata->dev, color_bitmap, gfp); } -EXPORT_SYMBOL_GPL(ieeee80211_obss_color_collision_notify); +EXPORT_SYMBOL_GPL(ieee80211_obss_color_collision_notify); static int ieee80211_color_change(struct wiphy *wiphy, struct net_device *dev, diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index c6562a6d2503..e284897ba5e9 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3219,9 +3219,9 @@ ieee80211_rx_check_bss_color_collision(struct ieee80211_rx_data *rx) color = le32_get_bits(he_oper->he_oper_params, IEEE80211_HE_OPERATION_BSS_COLOR_MASK); if (color == bss_conf->he_bss_color.color) - ieeee80211_obss_color_collision_notify(&rx->sdata->vif, - BIT_ULL(color), - GFP_ATOMIC); + ieee80211_obss_color_collision_notify(&rx->sdata->vif, + BIT_ULL(color), + GFP_ATOMIC); } } @@ -5194,6 +5194,15 @@ void ieee80211_rx_list(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, status->rate_idx, status->nss)) goto drop; break; + case RX_ENC_EHT: + if (WARN_ONCE(status->rate_idx > 15 || + !status->nss || + status->nss > 8 || + status->eht.gi > NL80211_RATE_INFO_EHT_GI_3_2, + "Rate marked as an EHT rate but data is invalid: MCS:%d, NSS:%d, GI:%d\n", + status->rate_idx, status->nss, status->eht.gi)) + goto drop; + break; default: WARN_ON_ONCE(1); fallthrough; diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 04e0f132b1d9..27c737fe7fb8 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -4,7 +4,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018-2021 Intel Corporation + * Copyright (C) 2018-2022 Intel Corporation */ #include <linux/module.h> @@ -2406,6 +2406,13 @@ static void sta_stats_decode_rate(struct ieee80211_local *local, u32 rate, rinfo->he_ru_alloc = STA_STATS_GET(HE_RU, rate); rinfo->he_dcm = STA_STATS_GET(HE_DCM, rate); break; + case STA_STATS_RATE_TYPE_EHT: + rinfo->flags = RATE_INFO_FLAGS_EHT_MCS; + rinfo->mcs = STA_STATS_GET(EHT_MCS, rate); + rinfo->nss = STA_STATS_GET(EHT_NSS, rate); + rinfo->eht_gi = STA_STATS_GET(EHT_GI, rate); + rinfo->eht_ru_alloc = STA_STATS_GET(EHT_RU, rate); + break; } } diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 69820b551668..c30f02874fb1 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -936,6 +936,7 @@ enum sta_stats_type { STA_STATS_RATE_TYPE_VHT, STA_STATS_RATE_TYPE_HE, STA_STATS_RATE_TYPE_S1G, + STA_STATS_RATE_TYPE_EHT, }; #define STA_STATS_FIELD_HT_MCS GENMASK( 7, 0) @@ -945,12 +946,16 @@ enum sta_stats_type { #define STA_STATS_FIELD_VHT_NSS GENMASK( 7, 4) #define STA_STATS_FIELD_HE_MCS GENMASK( 3, 0) #define STA_STATS_FIELD_HE_NSS GENMASK( 7, 4) -#define STA_STATS_FIELD_BW GENMASK(11, 8) -#define STA_STATS_FIELD_SGI GENMASK(12, 12) -#define STA_STATS_FIELD_TYPE GENMASK(15, 13) -#define STA_STATS_FIELD_HE_RU GENMASK(18, 16) -#define STA_STATS_FIELD_HE_GI GENMASK(20, 19) -#define STA_STATS_FIELD_HE_DCM GENMASK(21, 21) +#define STA_STATS_FIELD_EHT_MCS GENMASK( 3, 0) +#define STA_STATS_FIELD_EHT_NSS GENMASK( 7, 4) +#define STA_STATS_FIELD_BW GENMASK(12, 8) +#define STA_STATS_FIELD_SGI GENMASK(13, 13) +#define STA_STATS_FIELD_TYPE GENMASK(16, 14) +#define STA_STATS_FIELD_HE_RU GENMASK(19, 17) +#define STA_STATS_FIELD_HE_GI GENMASK(21, 20) +#define STA_STATS_FIELD_HE_DCM GENMASK(22, 22) +#define STA_STATS_FIELD_EHT_RU GENMASK(20, 17) +#define STA_STATS_FIELD_EHT_GI GENMASK(22, 21) #define STA_STATS_FIELD(_n, _v) FIELD_PREP(STA_STATS_FIELD_ ## _n, _v) #define STA_STATS_GET(_n, _v) FIELD_GET(STA_STATS_FIELD_ ## _n, _v) @@ -989,6 +994,13 @@ static inline u32 sta_stats_encode_rate(struct ieee80211_rx_status *s) r |= STA_STATS_FIELD(HE_RU, s->he_ru); r |= STA_STATS_FIELD(HE_DCM, s->he_dcm); break; + case RX_ENC_EHT: + r |= STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_EHT); + r |= STA_STATS_FIELD(EHT_NSS, s->nss); + r |= STA_STATS_FIELD(EHT_MCS, s->rate_idx); + r |= STA_STATS_FIELD(EHT_GI, s->eht.gi); + r |= STA_STATS_FIELD(EHT_RU, s->eht.ru); + break; default: WARN_ON(1); return STA_STATS_RATE_INVALID; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 261ac667887f..1a28fe5cb614 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -832,19 +832,6 @@ static void __iterate_stations(struct ieee80211_local *local, } } -void ieee80211_iterate_stations(struct ieee80211_hw *hw, - void (*iterator)(void *data, - struct ieee80211_sta *sta), - void *data) -{ - struct ieee80211_local *local = hw_to_local(hw); - - mutex_lock(&local->sta_mtx); - __iterate_stations(local, iterator, data); - mutex_unlock(&local->sta_mtx); -} -EXPORT_SYMBOL_GPL(ieee80211_iterate_stations); - void ieee80211_iterate_stations_atomic(struct ieee80211_hw *hw, void (*iterator)(void *data, struct ieee80211_sta *sta), @@ -4033,6 +4020,19 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, /* Fill cfg80211 rate info */ switch (status->encoding) { + case RX_ENC_EHT: + ri.flags |= RATE_INFO_FLAGS_EHT_MCS; + ri.mcs = status->rate_idx; + ri.nss = status->nss; + ri.eht_ru_alloc = status->eht.ru; + if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) + ri.flags |= RATE_INFO_FLAGS_SHORT_GI; + /* TODO/FIXME: is this right? handle other PPDUs */ + if (status->flag & RX_FLAG_MACTIME_PLCP_START) { + mpdu_offset += 2; + ts += 36; + } + break; case RX_ENC_HE: ri.flags |= RATE_INFO_FLAGS_HE_MCS; ri.mcs = status->rate_idx; diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 5ded85e2c374..b30cea2fbf3f 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -1594,8 +1594,7 @@ mp_rst: TCPOLEN_MPTCP_PRIO, opts->backup, TCPOPT_NOP); - MPTCP_INC_STATS(sock_net((const struct sock *)tp), - MPTCP_MIB_MPPRIOTX); + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPPRIOTX); } mp_capable_done: diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 10fe9771a852..56628b52d100 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -152,7 +152,6 @@ static struct mptcp_pm_addr_entry * select_local_address(const struct pm_nl_pernet *pernet, const struct mptcp_sock *msk) { - const struct sock *sk = (const struct sock *)msk; struct mptcp_pm_addr_entry *entry, *ret = NULL; msk_owned_by_me(msk); @@ -165,16 +164,6 @@ select_local_address(const struct pm_nl_pernet *pernet, if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap)) continue; - if (entry->addr.family != sk->sk_family) { -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - if ((entry->addr.family == AF_INET && - !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) || - (sk->sk_family == AF_INET && - !ipv6_addr_v4mapped(&entry->addr.addr6))) -#endif - continue; - } - ret = entry; break; } @@ -423,7 +412,9 @@ static bool lookup_address_in_vec(const struct mptcp_addr_info *addrs, unsigned /* Fill all the remote addresses into the array addrs[], * and return the array size. */ -static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullmesh, +static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, + struct mptcp_addr_info *local, + bool fullmesh, struct mptcp_addr_info *addrs) { bool deny_id0 = READ_ONCE(msk->pm.remote_deny_join_id0); @@ -443,6 +434,9 @@ static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullm if (deny_id0) return 0; + if (!mptcp_pm_addr_families_match(sk, local, &remote)) + return 0; + msk->pm.subflows++; addrs[i++] = remote; } else { @@ -453,6 +447,9 @@ static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullm if (deny_id0 && !addrs[i].id) continue; + if (!mptcp_pm_addr_families_match(sk, local, &addrs[i])) + continue; + if (!lookup_address_in_vec(addrs, i, &addrs[i]) && msk->pm.subflows < subflows_max) { msk->pm.subflows++; @@ -603,9 +600,11 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) fullmesh = !!(local->flags & MPTCP_PM_ADDR_FLAG_FULLMESH); msk->pm.local_addr_used++; - nr = fill_remote_addresses_vec(msk, fullmesh, addrs); - if (nr) - __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); + __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); + nr = fill_remote_addresses_vec(msk, &local->addr, fullmesh, addrs); + if (nr == 0) + continue; + spin_unlock_bh(&msk->pm.lock); for (i = 0; i < nr; i++) __mptcp_subflow_connect(sk, &local->addr, &addrs[i]); @@ -628,11 +627,11 @@ static void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk) * and return the array size. */ static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk, + struct mptcp_addr_info *remote, struct mptcp_addr_info *addrs) { struct sock *sk = (struct sock *)msk; struct mptcp_pm_addr_entry *entry; - struct mptcp_addr_info local; struct pm_nl_pernet *pernet; unsigned int subflows_max; int i = 0; @@ -645,15 +644,8 @@ static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk, if (!(entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH)) continue; - if (entry->addr.family != sk->sk_family) { -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - if ((entry->addr.family == AF_INET && - !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) || - (sk->sk_family == AF_INET && - !ipv6_addr_v4mapped(&entry->addr.addr6))) -#endif - continue; - } + if (!mptcp_pm_addr_families_match(sk, &entry->addr, remote)) + continue; if (msk->pm.subflows < subflows_max) { msk->pm.subflows++; @@ -666,8 +658,18 @@ static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk, * 'IPADDRANY' local address */ if (!i) { + struct mptcp_addr_info local; + memset(&local, 0, sizeof(local)); - local.family = msk->pm.remote.family; + local.family = +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + remote->family == AF_INET6 && + ipv6_addr_v4mapped(&remote->addr6) ? AF_INET : +#endif + remote->family; + + if (!mptcp_pm_addr_families_match(sk, &local, remote)) + return 0; msk->pm.subflows++; addrs[i++] = local; @@ -706,7 +708,9 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) /* connect to the specified remote address, using whatever * local address the routing configuration will pick. */ - nr = fill_local_addresses_vec(msk, addrs); + nr = fill_local_addresses_vec(msk, &remote, addrs); + if (nr == 0) + return; msk->pm.add_addr_accepted++; if (msk->pm.add_addr_accepted >= add_addr_accept_max || @@ -1145,7 +1149,7 @@ void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ss if (!tcp_rtx_and_write_queues_empty(ssk)) { subflow->stale = 1; __mptcp_retransmit_pending_data(sk); - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_SUBFLOWSTALE); + MPTCP_INC_STATS(net, MPTCP_MIB_SUBFLOWSTALE); } unlock_sock_fast(ssk, slow); @@ -1905,8 +1909,7 @@ static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info) } if (token) - return mptcp_userspace_pm_set_flags(sock_net(skb->sk), - token, &addr, &remote, bkup); + return mptcp_userspace_pm_set_flags(net, token, &addr, &remote, bkup); spin_lock_bh(&pernet->lock); entry = __lookup_addr(pernet, &addr.addr, lookup_by_id); diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index ea6ad9da7493..a02d3cbf2a1b 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -59,8 +59,8 @@ int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, */ e = sock_kmalloc(sk, sizeof(*e), GFP_ATOMIC); if (!e) { - spin_unlock_bh(&msk->pm.lock); - return -ENOMEM; + ret = -ENOMEM; + goto append_err; } *e = *entry; @@ -74,6 +74,7 @@ int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, ret = entry->addr.id; } +append_err: spin_unlock_bh(&msk->pm.lock); return ret; } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index bc6c1f62a690..c9817aa0f413 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -923,9 +923,8 @@ static void mptcp_check_for_eof(struct mptcp_sock *msk) static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; - struct sock *sk = (struct sock *)msk; - sock_owned_by_me(sk); + msk_owned_by_me(msk); mptcp_for_each_subflow(msk, subflow) { if (READ_ONCE(subflow->data_avail)) @@ -1408,7 +1407,7 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) u64 linger_time; long tout = 0; - sock_owned_by_me(sk); + msk_owned_by_me(msk); if (__mptcp_check_fallback(msk)) { if (!msk->first) @@ -1890,7 +1889,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) u32 time, advmss = 1; u64 rtt_us, mstamp; - sock_owned_by_me(sk); + msk_owned_by_me(msk); if (copied <= 0) return; @@ -2217,7 +2216,7 @@ static struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk) struct mptcp_subflow_context *subflow; int min_stale_count = INT_MAX; - sock_owned_by_me((const struct sock *)msk); + msk_owned_by_me(msk); if (__mptcp_check_fallback(msk)) return NULL; @@ -2724,8 +2723,8 @@ static int mptcp_init_sock(struct sock *sk) mptcp_ca_reset(sk); sk_sockets_allocated_inc(sk); - sk->sk_rcvbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]); - sk->sk_sndbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]); + sk->sk_rcvbuf = READ_ONCE(net->ipv4.sysctl_tcp_rmem[1]); + sk->sk_sndbuf = READ_ONCE(net->ipv4.sysctl_tcp_wmem[1]); return 0; } @@ -2892,6 +2891,12 @@ static __poll_t mptcp_check_readable(struct mptcp_sock *msk) return EPOLLIN | EPOLLRDNORM; } +static void mptcp_listen_inuse_dec(struct sock *sk) +{ + if (inet_sk_state_load(sk) == TCP_LISTEN) + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); +} + bool __mptcp_close(struct sock *sk, long timeout) { struct mptcp_subflow_context *subflow; @@ -2902,6 +2907,7 @@ bool __mptcp_close(struct sock *sk, long timeout) sk->sk_shutdown = SHUTDOWN_MASK; if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) { + mptcp_listen_inuse_dec(sk); inet_sk_state_store(sk, TCP_CLOSE); goto cleanup; } @@ -3010,6 +3016,7 @@ static int mptcp_disconnect(struct sock *sk, int flags) if (msk->fastopening) return 0; + mptcp_listen_inuse_dec(sk); inet_sk_state_store(sk, TCP_CLOSE); mptcp_stop_timer(sk); @@ -3648,12 +3655,13 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, static int mptcp_listen(struct socket *sock, int backlog) { struct mptcp_sock *msk = mptcp_sk(sock->sk); + struct sock *sk = sock->sk; struct socket *ssock; int err; pr_debug("msk=%p", msk); - lock_sock(sock->sk); + lock_sock(sk); ssock = __mptcp_nmpc_socket(msk); if (!ssock) { err = -EINVAL; @@ -3661,18 +3669,20 @@ static int mptcp_listen(struct socket *sock, int backlog) } mptcp_token_destroy(msk); - inet_sk_state_store(sock->sk, TCP_LISTEN); - sock_set_flag(sock->sk, SOCK_RCU_FREE); + inet_sk_state_store(sk, TCP_LISTEN); + sock_set_flag(sk, SOCK_RCU_FREE); err = ssock->ops->listen(ssock, backlog); - inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk)); - if (!err) - mptcp_copy_inaddrs(sock->sk, ssock->sk); + inet_sk_state_store(sk, inet_sk_state_load(ssock->sk)); + if (!err) { + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + mptcp_copy_inaddrs(sk, ssock->sk); + } mptcp_event_pm_listener(ssock->sk, MPTCP_EVENT_LISTENER_CREATED); unlock: - release_sock(sock->sk); + release_sock(sk); return err; } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 601469249da8..61fd8eabfca2 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -755,7 +755,7 @@ static inline void mptcp_token_init_request(struct request_sock *req) int mptcp_token_new_request(struct request_sock *req); void mptcp_token_destroy_request(struct request_sock *req); -int mptcp_token_new_connect(struct sock *sk); +int mptcp_token_new_connect(struct sock *ssk); void mptcp_token_accept(struct mptcp_subflow_request_sock *r, struct mptcp_sock *msk); bool mptcp_token_exists(u32 token); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 7f2c3727ab23..8a9656248b0f 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -18,7 +18,7 @@ static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk) { - sock_owned_by_me((const struct sock *)msk); + msk_owned_by_me(msk); if (likely(!__mptcp_check_fallback(msk))) return NULL; @@ -1262,6 +1262,7 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk) ssk->sk_priority = sk->sk_priority; ssk->sk_bound_dev_if = sk->sk_bound_dev_if; ssk->sk_incoming_cpu = sk->sk_incoming_cpu; + ssk->sk_ipv6only = sk->sk_ipv6only; __ip_sock_set_tos(ssk, inet_sk(sk)->tos); if (sk->sk_userlocks & tx_rx_locks) { diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 32904c76c6a1..4ae1a7304cf0 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -26,6 +26,7 @@ #include "mib.h" #include <trace/events/mptcp.h> +#include <trace/events/sock.h> static void mptcp_subflow_ops_undo_override(struct sock *ssk); @@ -1446,6 +1447,8 @@ static void subflow_data_ready(struct sock *sk) struct sock *parent = subflow->conn; struct mptcp_sock *msk; + trace_sk_data_ready(sk); + msk = mptcp_sk(parent); if (state & TCPF_LISTEN) { /* MPJ subflow are removed from accept queue before reaching here, diff --git a/net/mptcp/token.c b/net/mptcp/token.c index 65430f314a68..5bb924534387 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -134,7 +134,7 @@ int mptcp_token_new_request(struct request_sock *req) /** * mptcp_token_new_connect - create new key/idsn/token for subflow - * @sk: the socket that will initiate a connection + * @ssk: the socket that will initiate a connection * * This function is called when a new outgoing mptcp connection is * initiated. @@ -148,11 +148,12 @@ int mptcp_token_new_request(struct request_sock *req) * * returns 0 on success. */ -int mptcp_token_new_connect(struct sock *sk) +int mptcp_token_new_connect(struct sock *ssk) { - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); int retries = MPTCP_TOKEN_MAX_RETRIES; + struct sock *sk = subflow->conn; struct token_bucket *bucket; again: @@ -169,12 +170,13 @@ again: } pr_debug("ssk=%p, local_key=%llu, token=%u, idsn=%llu\n", - sk, subflow->local_key, subflow->token, subflow->idsn); + ssk, subflow->local_key, subflow->token, subflow->idsn); WRITE_ONCE(msk->token, subflow->token); __sk_nulls_add_node_rcu((struct sock *)msk, &bucket->msk_chain); bucket->chain_len++; spin_unlock_bh(&bucket->lock); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); return 0; } @@ -190,8 +192,10 @@ void mptcp_token_accept(struct mptcp_subflow_request_sock *req, struct mptcp_sock *msk) { struct mptcp_subflow_request_sock *pos; + struct sock *sk = (struct sock *)msk; struct token_bucket *bucket; + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); bucket = token_bucket(req->token); spin_lock_bh(&bucket->lock); @@ -370,12 +374,14 @@ void mptcp_token_destroy_request(struct request_sock *req) */ void mptcp_token_destroy(struct mptcp_sock *msk) { + struct sock *sk = (struct sock *)msk; struct token_bucket *bucket; struct mptcp_sock *pos; if (sk_unhashed((struct sock *)msk)) return; + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); bucket = token_bucket(msk->token); spin_lock_bh(&bucket->lock); pos = __token_lookup_msk(bucket, msk->token); diff --git a/net/mptcp/token_test.c b/net/mptcp/token_test.c index 5d984bec1cd8..0758865ab658 100644 --- a/net/mptcp/token_test.c +++ b/net/mptcp/token_test.c @@ -57,6 +57,9 @@ static struct mptcp_sock *build_msk(struct kunit *test) KUNIT_EXPECT_NOT_ERR_OR_NULL(test, msk); refcount_set(&((struct sock *)msk)->sk_refcnt, 1); sock_net_set((struct sock *)msk, &init_net); + + /* be sure the token helpers can dereference sk->sk_prot */ + ((struct sock *)msk)->sk_prot = &tcp_prot; return msk; } diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 3754eb06fb41..ba2a6b5e93d9 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -98,6 +98,12 @@ nf_tables-objs += nft_set_pipapo_avx2.o endif endif +ifdef CONFIG_NFT_CT +ifdef CONFIG_RETPOLINE +nf_tables-objs += nft_ct_fast.o +endif +endif + obj-$(CONFIG_NF_TABLES) += nf_tables.o obj-$(CONFIG_NFT_COMPAT) += nft_compat.o obj-$(CONFIG_NFT_CONNLIMIT) += nft_connlimit.o diff --git a/net/netfilter/ipset/Kconfig b/net/netfilter/ipset/Kconfig index 3c273483df23..b1ea054bb82c 100644 --- a/net/netfilter/ipset/Kconfig +++ b/net/netfilter/ipset/Kconfig @@ -30,7 +30,7 @@ config IP_SET_BITMAP_IP depends on IP_SET help This option adds the bitmap:ip set type support, by which one - can store IPv4 addresses (or network addresse) from a range. + can store IPv4 addresses (or network addresses) from a range. To compile it as a module, choose M here. If unsure, say N. diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 029171379884..80448885c3d7 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -994,7 +994,7 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, old_dsfield = ipv4_get_dsfield(old_iph); *ttl = old_iph->ttl; if (payload_len) - *payload_len = ntohs(old_iph->tot_len); + *payload_len = skb_ip_totlen(skb); } /* Implement full-functionality option for ECN encapsulation */ diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 496c4920505b..9a830573480e 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -514,7 +514,6 @@ EXPORT_SYMBOL_GPL(nf_ct_get_id); static void clean_from_lists(struct nf_conn *ct) { - pr_debug("clean_from_lists(%p)\n", ct); hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode); @@ -582,7 +581,6 @@ void nf_ct_destroy(struct nf_conntrack *nfct) { struct nf_conn *ct = (struct nf_conn *)nfct; - pr_debug("%s(%p)\n", __func__, ct); WARN_ON(refcount_read(&nfct->use) != 0); if (unlikely(nf_ct_is_template(ct))) { @@ -603,7 +601,6 @@ void nf_ct_destroy(struct nf_conntrack *nfct) if (ct->master) nf_ct_put(ct->master); - pr_debug("%s: returning ct=%p to slab\n", __func__, ct); nf_conntrack_free(ct); } EXPORT_SYMBOL(nf_ct_destroy); @@ -786,8 +783,6 @@ __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; - rcu_read_lock(); - h = ____nf_conntrack_find(net, zone, tuple, hash); if (h) { /* We have a candidate that matches the tuple we're interested @@ -799,7 +794,7 @@ __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, smp_acquire__after_ctrl_dep(); if (likely(nf_ct_key_equal(h, tuple, zone, net))) - goto found; + return h; /* TYPESAFE_BY_RCU recycled the candidate */ nf_ct_put(ct); @@ -807,8 +802,6 @@ __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, h = NULL; } -found: - rcu_read_unlock(); return h; } @@ -820,16 +813,21 @@ nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, unsigned int rid, zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL); struct nf_conntrack_tuple_hash *thash; + rcu_read_lock(); + thash = __nf_conntrack_find_get(net, zone, tuple, hash_conntrack_raw(tuple, zone_id, net)); if (thash) - return thash; + goto out_unlock; rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY); if (rid != zone_id) - return __nf_conntrack_find_get(net, zone, tuple, - hash_conntrack_raw(tuple, rid, net)); + thash = __nf_conntrack_find_get(net, zone, tuple, + hash_conntrack_raw(tuple, rid, net)); + +out_unlock: + rcu_read_unlock(); return thash; } EXPORT_SYMBOL_GPL(nf_conntrack_find_get); @@ -1210,7 +1208,6 @@ __nf_conntrack_confirm(struct sk_buff *skb) goto dying; } - pr_debug("Confirming conntrack %p\n", ct); /* We have to check the DYING flag after unlink to prevent * a race against nf_ct_get_next_corpse() possibly called from * user context, else we insert an already 'dead' hash, blocking @@ -1374,9 +1371,6 @@ static unsigned int early_drop_list(struct net *net, hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) { tmp = nf_ct_tuplehash_to_ctrack(h); - if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) - continue; - if (nf_ct_is_expired(tmp)) { nf_ct_gc_expired(tmp); continue; @@ -1446,11 +1440,14 @@ static bool gc_worker_skip_ct(const struct nf_conn *ct) static bool gc_worker_can_early_drop(const struct nf_conn *ct) { const struct nf_conntrack_l4proto *l4proto; + u8 protonum = nf_ct_protonum(ct); + if (test_bit(IPS_OFFLOAD_BIT, &ct->status) && protonum != IPPROTO_UDP) + return false; if (!test_bit(IPS_ASSURED_BIT, &ct->status)) return true; - l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); + l4proto = nf_ct_l4proto_find(protonum); if (l4proto->can_early_drop && l4proto->can_early_drop(ct)) return true; @@ -1507,7 +1504,8 @@ static void gc_worker(struct work_struct *work) if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) { nf_ct_offload_timeout(tmp); - continue; + if (!nf_conntrack_max95) + continue; } if (expired_count > GC_SCAN_EXPIRED_MAX) { @@ -1721,10 +1719,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, struct nf_conntrack_zone tmp; struct nf_conntrack_net *cnet; - if (!nf_ct_invert_tuple(&repl_tuple, tuple)) { - pr_debug("Can't invert tuple.\n"); + if (!nf_ct_invert_tuple(&repl_tuple, tuple)) return NULL; - } zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC, @@ -1764,8 +1760,6 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, spin_lock_bh(&nf_conntrack_expect_lock); exp = nf_ct_find_expectation(net, zone, tuple); if (exp) { - pr_debug("expectation arrives ct=%p exp=%p\n", - ct, exp); /* Welcome, Mr. Bond. We've been expecting you... */ __set_bit(IPS_EXPECTED_BIT, &ct->status); /* exp->master safe, refcnt bumped in nf_ct_find_expectation */ @@ -1829,10 +1823,8 @@ resolve_normal_ct(struct nf_conn *tmpl, if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, state->pf, protonum, state->net, - &tuple)) { - pr_debug("Can't get tuple\n"); + &tuple)) return 0; - } /* look for tuple match */ zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); @@ -1864,17 +1856,15 @@ resolve_normal_ct(struct nf_conn *tmpl, if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) { ctinfo = IP_CT_ESTABLISHED_REPLY; } else { + unsigned long status = READ_ONCE(ct->status); + /* Once we've had two way comms, always ESTABLISHED. */ - if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { - pr_debug("normal packet for %p\n", ct); + if (likely(status & IPS_SEEN_REPLY)) ctinfo = IP_CT_ESTABLISHED; - } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) { - pr_debug("related packet for %p\n", ct); + else if (status & IPS_EXPECTED) ctinfo = IP_CT_RELATED; - } else { - pr_debug("new packet for %p\n", ct); + else ctinfo = IP_CT_NEW; - } } nf_ct_set(skb, ct, ctinfo); return 0; @@ -1988,7 +1978,6 @@ nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state) /* rcu_read_lock()ed by nf_hook_thresh */ dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum); if (dataoff <= 0) { - pr_debug("not prepared to track yet or error occurred\n"); NF_CT_STAT_INC_ATOMIC(state->net, invalid); ret = NF_ACCEPT; goto out; @@ -2027,7 +2016,6 @@ repeat: if (ret <= 0) { /* Invalid: inverse of the return code tells * the netfilter core what to do */ - pr_debug("nf_conntrack_in: Can't track with proto module\n"); nf_ct_put(ct); skb->_nfct = 0; /* Special case: TCP tracker reports an attempt to reopen a @@ -2066,7 +2054,6 @@ void nf_conntrack_alter_reply(struct nf_conn *ct, /* Should be unconfirmed, so not in hash table yet */ WARN_ON(nf_ct_is_confirmed(ct)); - pr_debug("Altering reply tuple of %p to ", ct); nf_ct_dump_tuple(newreply); ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 1286ae7d4609..308fc0023c7e 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -3866,7 +3866,7 @@ static int __init ctnetlink_init(void) { int ret; - BUILD_BUG_ON(sizeof(struct ctnetlink_list_dump_ctx) > sizeof_field(struct netlink_callback, ctx)); + NL_ASSERT_DUMP_CTX_FITS(struct ctnetlink_list_dump_ctx); ret = nfnetlink_subsys_register(&ctnl_subsys); if (ret < 0) { diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index ccef340be575..c928ff63b10e 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -284,16 +284,11 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) /* We only do TCP and SCTP at the moment: is there a better way? */ if (tuple.dst.protonum != IPPROTO_TCP && - tuple.dst.protonum != IPPROTO_SCTP) { - pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n"); + tuple.dst.protonum != IPPROTO_SCTP) return -ENOPROTOOPT; - } - if ((unsigned int)*len < sizeof(struct sockaddr_in)) { - pr_debug("SO_ORIGINAL_DST: len %d not %zu\n", - *len, sizeof(struct sockaddr_in)); + if ((unsigned int)*len < sizeof(struct sockaddr_in)) return -EINVAL; - } h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple); if (h) { @@ -307,17 +302,12 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) .tuple.dst.u3.ip; memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); - pr_debug("SO_ORIGINAL_DST: %pI4 %u\n", - &sin.sin_addr.s_addr, ntohs(sin.sin_port)); nf_ct_put(ct); if (copy_to_user(user, &sin, sizeof(sin)) != 0) return -EFAULT; else return 0; } - pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n", - &tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port), - &tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port)); return -ENOENT; } @@ -360,12 +350,8 @@ ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len) return -EINVAL; h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple); - if (!h) { - pr_debug("IP6T_SO_ORIGINAL_DST: Can't find %pI6c/%u-%pI6c/%u.\n", - &tuple.src.u3.ip6, ntohs(tuple.src.u.tcp.port), - &tuple.dst.u3.ip6, ntohs(tuple.dst.u.tcp.port)); + if (!h) return -ENOENT; - } ct = nf_ct_tuplehash_to_ctrack(h); diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 011d414038ea..91eacc9b0b98 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -153,7 +153,8 @@ for ((offset) = (dataoff) + sizeof(struct sctphdr), (count) = 0; \ static int do_basic_checks(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, - unsigned long *map) + unsigned long *map, + const struct nf_hook_state *state) { u_int32_t offset, count; struct sctp_chunkhdr _sch, *sch; @@ -162,8 +163,6 @@ static int do_basic_checks(struct nf_conn *ct, flag = 0; for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { - pr_debug("Chunk Num: %d Type: %d\n", count, sch->type); - if (sch->type == SCTP_CID_INIT || sch->type == SCTP_CID_INIT_ACK || sch->type == SCTP_CID_SHUTDOWN_COMPLETE) @@ -178,7 +177,9 @@ static int do_basic_checks(struct nf_conn *ct, sch->type == SCTP_CID_COOKIE_ECHO || flag) && count != 0) || !sch->length) { - pr_debug("Basic checks failed\n"); + nf_ct_l4proto_log_invalid(skb, ct, state, + "%s failed. chunk num %d, type %d, len %d flag %d\n", + __func__, count, sch->type, sch->length, flag); return 1; } @@ -186,7 +187,6 @@ static int do_basic_checks(struct nf_conn *ct, set_bit(sch->type, map); } - pr_debug("Basic checks passed\n"); return count == 0; } @@ -196,64 +196,47 @@ static int sctp_new_state(enum ip_conntrack_dir dir, { int i; - pr_debug("Chunk type: %d\n", chunk_type); - switch (chunk_type) { case SCTP_CID_INIT: - pr_debug("SCTP_CID_INIT\n"); i = 0; break; case SCTP_CID_INIT_ACK: - pr_debug("SCTP_CID_INIT_ACK\n"); i = 1; break; case SCTP_CID_ABORT: - pr_debug("SCTP_CID_ABORT\n"); i = 2; break; case SCTP_CID_SHUTDOWN: - pr_debug("SCTP_CID_SHUTDOWN\n"); i = 3; break; case SCTP_CID_SHUTDOWN_ACK: - pr_debug("SCTP_CID_SHUTDOWN_ACK\n"); i = 4; break; case SCTP_CID_ERROR: - pr_debug("SCTP_CID_ERROR\n"); i = 5; break; case SCTP_CID_COOKIE_ECHO: - pr_debug("SCTP_CID_COOKIE_ECHO\n"); i = 6; break; case SCTP_CID_COOKIE_ACK: - pr_debug("SCTP_CID_COOKIE_ACK\n"); i = 7; break; case SCTP_CID_SHUTDOWN_COMPLETE: - pr_debug("SCTP_CID_SHUTDOWN_COMPLETE\n"); i = 8; break; case SCTP_CID_HEARTBEAT: - pr_debug("SCTP_CID_HEARTBEAT"); i = 9; break; case SCTP_CID_HEARTBEAT_ACK: - pr_debug("SCTP_CID_HEARTBEAT_ACK"); i = 10; break; default: /* Other chunks like DATA or SACK do not change the state */ - pr_debug("Unknown chunk type, Will stay in %s\n", - sctp_conntrack_names[cur_state]); + pr_debug("Unknown chunk type %d, Will stay in %s\n", + chunk_type, sctp_conntrack_names[cur_state]); return cur_state; } - pr_debug("dir: %d cur_state: %s chunk_type: %d new_state: %s\n", - dir, sctp_conntrack_names[cur_state], chunk_type, - sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]); - return sctp_conntracks[dir][i][cur_state]; } @@ -370,7 +353,7 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, if (sh == NULL) goto out; - if (do_basic_checks(ct, skb, dataoff, map) != 0) + if (do_basic_checks(ct, skb, dataoff, map, state) != 0) goto out; if (!nf_ct_is_confirmed(ct)) { @@ -393,7 +376,9 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, !test_bit(SCTP_CID_HEARTBEAT, map) && !test_bit(SCTP_CID_HEARTBEAT_ACK, map) && sh->vtag != ct->proto.sctp.vtag[dir]) { - pr_debug("Verification tag check failed\n"); + nf_ct_l4proto_log_invalid(skb, ct, state, + "verification tag check failed %x vs %x for dir %d", + sh->vtag, ct->proto.sctp.vtag[dir], dir); goto out; } @@ -468,9 +453,10 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, /* Invalid */ if (new_state == SCTP_CONNTRACK_MAX) { - pr_debug("nf_conntrack_sctp: Invalid dir=%i ctype=%u " - "conntrack=%u\n", - dir, sch->type, old_state); + nf_ct_l4proto_log_invalid(skb, ct, state, + "Invalid, old_state %d, dir %d, type %d", + old_state, dir, sch->type); + goto out_unlock; } diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 3ac1af6f59fc..16ee5ebe1ce1 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -930,7 +930,6 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, { struct net *net = nf_ct_net(ct); struct nf_tcp_net *tn = nf_tcp_pernet(net); - struct nf_conntrack_tuple *tuple; enum tcp_conntrack new_state, old_state; unsigned int index, *timeouts; enum nf_ct_tcp_action res; @@ -954,7 +953,6 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, dir = CTINFO2DIR(ctinfo); index = get_conntrack_index(th); new_state = tcp_conntracks[dir][index][old_state]; - tuple = &ct->tuplehash[dir].tuple; switch (new_state) { case TCP_CONNTRACK_SYN_SENT: @@ -1232,13 +1230,6 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, ct->proto.tcp.last_index = index; ct->proto.tcp.last_dir = dir; - pr_debug("tcp_conntracks: "); - nf_ct_dump_tuple(tuple); - pr_debug("syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n", - (th->syn ? 1 : 0), (th->ack ? 1 : 0), - (th->fin ? 1 : 0), (th->rst ? 1 : 0), - old_state, new_state); - ct->proto.tcp.state = new_state; if (old_state != new_state && new_state == TCP_CONNTRACK_FIN_WAIT) diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 3b516cffc779..6b9206635b24 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -88,6 +88,7 @@ int nf_conntrack_udp_packet(struct nf_conn *ct, const struct nf_hook_state *state) { unsigned int *timeouts; + unsigned long status; if (udp_error(skb, dataoff, state)) return -NF_ACCEPT; @@ -96,26 +97,27 @@ int nf_conntrack_udp_packet(struct nf_conn *ct, if (!timeouts) timeouts = udp_get_timeouts(nf_ct_net(ct)); - if (!nf_ct_is_confirmed(ct)) + status = READ_ONCE(ct->status); + if ((status & IPS_CONFIRMED) == 0) ct->proto.udp.stream_ts = 2 * HZ + jiffies; /* If we've seen traffic both ways, this is some kind of UDP * stream. Set Assured. */ - if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + if (status & IPS_SEEN_REPLY_BIT) { unsigned long extra = timeouts[UDP_CT_UNREPLIED]; bool stream = false; /* Still active after two seconds? Extend timeout. */ if (time_after(jiffies, ct->proto.udp.stream_ts)) { extra = timeouts[UDP_CT_REPLIED]; - stream = true; + stream = (status & IPS_ASSURED) == 0; } nf_ct_refresh_acct(ct, ctinfo, skb, extra); /* never set ASSURED for IPS_NAT_CLASH, they time out soon */ - if (unlikely((ct->status & IPS_NAT_CLASH))) + if (unlikely((status & IPS_NAT_CLASH))) return NF_ACCEPT; /* Also, more likely to be important, and not a probe */ diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 81c26a96c30b..04bd0ed4d2ae 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -193,8 +193,11 @@ static void flow_offload_fixup_ct(struct nf_conn *ct) timeout -= tn->offload_timeout; } else if (l4num == IPPROTO_UDP) { struct nf_udp_net *tn = nf_udp_pernet(net); + enum udp_conntrack state = + test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ? + UDP_CT_REPLIED : UDP_CT_UNREPLIED; - timeout = tn->timeouts[UDP_CT_REPLIED]; + timeout = tn->timeouts[state]; timeout -= tn->offload_timeout; } else { return; diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c index 0ccabf3fa6aa..9505f9d188ff 100644 --- a/net/netfilter/nf_flow_table_inet.c +++ b/net/netfilter/nf_flow_table_inet.c @@ -39,7 +39,7 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb, } static int nf_flow_rule_route_inet(struct net *net, - const struct flow_offload *flow, + struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 4d9b99abe37d..1c26f03fc661 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -679,7 +679,7 @@ nf_flow_rule_route_common(struct net *net, const struct flow_offload *flow, return 0; } -int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow, +int nf_flow_rule_route_ipv4(struct net *net, struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { @@ -704,7 +704,7 @@ int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow, } EXPORT_SYMBOL_GPL(nf_flow_rule_route_ipv4); -int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow, +int nf_flow_rule_route_ipv6(struct net *net, struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { @@ -735,7 +735,7 @@ nf_flow_offload_rule_alloc(struct net *net, { const struct nf_flowtable *flowtable = offload->flowtable; const struct flow_offload_tuple *tuple, *other_tuple; - const struct flow_offload *flow = offload->flow; + struct flow_offload *flow = offload->flow; struct dst_entry *other_dst = NULL; struct nf_flow_rule *flow_rule; int err = -ENOMEM; @@ -895,8 +895,9 @@ static int flow_offload_rule_add(struct flow_offload_work *offload, ok_count += flow_offload_tuple_add(offload, flow_rule[0], FLOW_OFFLOAD_DIR_ORIGINAL); - ok_count += flow_offload_tuple_add(offload, flow_rule[1], - FLOW_OFFLOAD_DIR_REPLY); + if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags)) + ok_count += flow_offload_tuple_add(offload, flow_rule[1], + FLOW_OFFLOAD_DIR_REPLY); if (ok_count == 0) return -ENOENT; @@ -926,7 +927,8 @@ static void flow_offload_work_del(struct flow_offload_work *offload) { clear_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status); flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_ORIGINAL); - flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY); + if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags)) + flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY); set_bit(NF_FLOW_HW_DEAD, &offload->flow->flags); } @@ -946,7 +948,9 @@ static void flow_offload_work_stats(struct flow_offload_work *offload) u64 lastused; flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_ORIGINAL, &stats[0]); - flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_REPLY, &stats[1]); + if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags)) + flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_REPLY, + &stats[1]); lastused = max_t(u64, stats[0].lastused, stats[1].lastused); offload->flow->timeout = max_t(u64, offload->flow->timeout, diff --git a/net/netfilter/nf_log_syslog.c b/net/netfilter/nf_log_syslog.c index cb894f0d63e9..c66689ad2b49 100644 --- a/net/netfilter/nf_log_syslog.c +++ b/net/netfilter/nf_log_syslog.c @@ -322,7 +322,7 @@ dump_ipv4_packet(struct net *net, struct nf_log_buf *m, /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ nf_log_buf_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", - ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, + iph_totlen(skb, ih), ih->tos & IPTOS_TOS_MASK, ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); /* Max length: 6 "CE DF MF " */ diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 8c09e4d12ac1..974b95dece1d 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1401,6 +1401,10 @@ static int nf_tables_deltable(struct sk_buff *skb, const struct nfnl_info *info, } if (IS_ERR(table)) { + if (PTR_ERR(table) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYTABLE) + return 0; + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(table); } @@ -2639,6 +2643,10 @@ static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info, chain = nft_chain_lookup(net, table, attr, genmask); } if (IS_ERR(chain)) { + if (PTR_ERR(chain) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYCHAIN) + return 0; + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(chain); } @@ -3716,6 +3724,10 @@ static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info, chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask); if (IS_ERR(chain)) { + if (PTR_ERR(rule) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYRULE) + return 0; + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); } @@ -3729,6 +3741,10 @@ static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info, if (nla[NFTA_RULE_HANDLE]) { rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); if (IS_ERR(rule)) { + if (PTR_ERR(rule) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYRULE) + return 0; + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); return PTR_ERR(rule); } @@ -4808,6 +4824,10 @@ static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info, } if (IS_ERR(set)) { + if (PTR_ERR(set) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYSET) + return 0; + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(set); } @@ -6690,6 +6710,10 @@ static int nf_tables_delsetelem(struct sk_buff *skb, nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { err = nft_del_setelem(&ctx, set, attr); + if (err == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYSETELEM) + continue; + if (err < 0) { NL_SET_BAD_ATTR(extack, attr); break; @@ -7334,6 +7358,10 @@ static int nf_tables_delobj(struct sk_buff *skb, const struct nfnl_info *info, } if (IS_ERR(obj)) { + if (PTR_ERR(obj) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYOBJ) + return 0; + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(obj); } @@ -7964,6 +7992,10 @@ static int nf_tables_delflowtable(struct sk_buff *skb, } if (IS_ERR(flowtable)) { + if (PTR_ERR(flowtable) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYFLOWTABLE) + return 0; + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(flowtable); } @@ -8373,6 +8405,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_TABLE_MAX, .policy = nft_table_policy, }, + [NFT_MSG_DESTROYTABLE] = { + .call = nf_tables_deltable, + .type = NFNL_CB_BATCH, + .attr_count = NFTA_TABLE_MAX, + .policy = nft_table_policy, + }, [NFT_MSG_NEWCHAIN] = { .call = nf_tables_newchain, .type = NFNL_CB_BATCH, @@ -8391,6 +8429,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_CHAIN_MAX, .policy = nft_chain_policy, }, + [NFT_MSG_DESTROYCHAIN] = { + .call = nf_tables_delchain, + .type = NFNL_CB_BATCH, + .attr_count = NFTA_CHAIN_MAX, + .policy = nft_chain_policy, + }, [NFT_MSG_NEWRULE] = { .call = nf_tables_newrule, .type = NFNL_CB_BATCH, @@ -8415,6 +8459,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_RULE_MAX, .policy = nft_rule_policy, }, + [NFT_MSG_DESTROYRULE] = { + .call = nf_tables_delrule, + .type = NFNL_CB_BATCH, + .attr_count = NFTA_RULE_MAX, + .policy = nft_rule_policy, + }, [NFT_MSG_NEWSET] = { .call = nf_tables_newset, .type = NFNL_CB_BATCH, @@ -8433,6 +8483,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_SET_MAX, .policy = nft_set_policy, }, + [NFT_MSG_DESTROYSET] = { + .call = nf_tables_delset, + .type = NFNL_CB_BATCH, + .attr_count = NFTA_SET_MAX, + .policy = nft_set_policy, + }, [NFT_MSG_NEWSETELEM] = { .call = nf_tables_newsetelem, .type = NFNL_CB_BATCH, @@ -8451,6 +8507,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, + [NFT_MSG_DESTROYSETELEM] = { + .call = nf_tables_delsetelem, + .type = NFNL_CB_BATCH, + .attr_count = NFTA_SET_ELEM_LIST_MAX, + .policy = nft_set_elem_list_policy, + }, [NFT_MSG_GETGEN] = { .call = nf_tables_getgen, .type = NFNL_CB_RCU, @@ -8473,6 +8535,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, }, + [NFT_MSG_DESTROYOBJ] = { + .call = nf_tables_delobj, + .type = NFNL_CB_BATCH, + .attr_count = NFTA_OBJ_MAX, + .policy = nft_obj_policy, + }, [NFT_MSG_GETOBJ_RESET] = { .call = nf_tables_getobj, .type = NFNL_CB_RCU, @@ -8497,6 +8565,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_FLOWTABLE_MAX, .policy = nft_flowtable_policy, }, + [NFT_MSG_DESTROYFLOWTABLE] = { + .call = nf_tables_delflowtable, + .type = NFNL_CB_BATCH, + .attr_count = NFTA_FLOWTABLE_MAX, + .policy = nft_flowtable_policy, + }, }; static int nf_tables_validate(struct net *net) @@ -8590,6 +8664,7 @@ static void nft_commit_release(struct nft_trans *trans) { switch (trans->msg_type) { case NFT_MSG_DELTABLE: + case NFT_MSG_DESTROYTABLE: nf_tables_table_destroy(&trans->ctx); break; case NFT_MSG_NEWCHAIN: @@ -8597,23 +8672,29 @@ static void nft_commit_release(struct nft_trans *trans) kfree(nft_trans_chain_name(trans)); break; case NFT_MSG_DELCHAIN: + case NFT_MSG_DESTROYCHAIN: nf_tables_chain_destroy(&trans->ctx); break; case NFT_MSG_DELRULE: + case NFT_MSG_DESTROYRULE: nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans)); break; case NFT_MSG_DELSET: + case NFT_MSG_DESTROYSET: nft_set_destroy(&trans->ctx, nft_trans_set(trans)); break; case NFT_MSG_DELSETELEM: + case NFT_MSG_DESTROYSETELEM: nf_tables_set_elem_destroy(&trans->ctx, nft_trans_elem_set(trans), nft_trans_elem(trans).priv); break; case NFT_MSG_DELOBJ: + case NFT_MSG_DESTROYOBJ: nft_obj_destroy(&trans->ctx, nft_trans_obj(trans)); break; case NFT_MSG_DELFLOWTABLE: + case NFT_MSG_DESTROYFLOWTABLE: if (nft_trans_flowtable_update(trans)) nft_flowtable_hooks_destroy(&nft_trans_flowtable_hooks(trans)); else @@ -9065,8 +9146,9 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_trans_destroy(trans); break; case NFT_MSG_DELTABLE: + case NFT_MSG_DESTROYTABLE: list_del_rcu(&trans->ctx.table->list); - nf_tables_table_notify(&trans->ctx, NFT_MSG_DELTABLE); + nf_tables_table_notify(&trans->ctx, trans->msg_type); break; case NFT_MSG_NEWCHAIN: if (nft_trans_chain_update(trans)) { @@ -9081,8 +9163,9 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) } break; case NFT_MSG_DELCHAIN: + case NFT_MSG_DESTROYCHAIN: nft_chain_del(trans->ctx.chain); - nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN); + nf_tables_chain_notify(&trans->ctx, trans->msg_type); nf_tables_unregister_hook(trans->ctx.net, trans->ctx.table, trans->ctx.chain); @@ -9098,10 +9181,11 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_trans_destroy(trans); break; case NFT_MSG_DELRULE: + case NFT_MSG_DESTROYRULE: list_del_rcu(&nft_trans_rule(trans)->list); nf_tables_rule_notify(&trans->ctx, nft_trans_rule(trans), - NFT_MSG_DELRULE); + trans->msg_type); nft_rule_expr_deactivate(&trans->ctx, nft_trans_rule(trans), NFT_TRANS_COMMIT); @@ -9129,9 +9213,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_trans_destroy(trans); break; case NFT_MSG_DELSET: + case NFT_MSG_DESTROYSET: list_del_rcu(&nft_trans_set(trans)->list); nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), - NFT_MSG_DELSET, GFP_KERNEL); + trans->msg_type, GFP_KERNEL); break; case NFT_MSG_NEWSETELEM: te = (struct nft_trans_elem *)trans->data; @@ -9143,11 +9228,12 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_trans_destroy(trans); break; case NFT_MSG_DELSETELEM: + case NFT_MSG_DESTROYSETELEM: te = (struct nft_trans_elem *)trans->data; nf_tables_setelem_notify(&trans->ctx, te->set, &te->elem, - NFT_MSG_DELSETELEM); + trans->msg_type); nft_setelem_remove(net, te->set, &te->elem); if (!nft_setelem_is_catchall(te->set, &te->elem)) { atomic_dec(&te->set->nelems); @@ -9169,9 +9255,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) } break; case NFT_MSG_DELOBJ: + case NFT_MSG_DESTROYOBJ: nft_obj_del(nft_trans_obj(trans)); nf_tables_obj_notify(&trans->ctx, nft_trans_obj(trans), - NFT_MSG_DELOBJ); + trans->msg_type); break; case NFT_MSG_NEWFLOWTABLE: if (nft_trans_flowtable_update(trans)) { @@ -9193,11 +9280,12 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_trans_destroy(trans); break; case NFT_MSG_DELFLOWTABLE: + case NFT_MSG_DESTROYFLOWTABLE: if (nft_trans_flowtable_update(trans)) { nf_tables_flowtable_notify(&trans->ctx, nft_trans_flowtable(trans), &nft_trans_flowtable_hooks(trans), - NFT_MSG_DELFLOWTABLE); + trans->msg_type); nft_unregister_flowtable_net_hooks(net, &nft_trans_flowtable_hooks(trans)); } else { @@ -9205,7 +9293,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nf_tables_flowtable_notify(&trans->ctx, nft_trans_flowtable(trans), &nft_trans_flowtable(trans)->hook_list, - NFT_MSG_DELFLOWTABLE); + trans->msg_type); nft_unregister_flowtable_net_hooks(net, &nft_trans_flowtable(trans)->hook_list); } @@ -9301,6 +9389,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) } break; case NFT_MSG_DELTABLE: + case NFT_MSG_DESTROYTABLE: nft_clear(trans->ctx.net, trans->ctx.table); nft_trans_destroy(trans); break; @@ -9322,6 +9411,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) } break; case NFT_MSG_DELCHAIN: + case NFT_MSG_DESTROYCHAIN: trans->ctx.table->use++; nft_clear(trans->ctx.net, trans->ctx.chain); nft_trans_destroy(trans); @@ -9336,6 +9426,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nft_flow_rule_destroy(nft_trans_flow_rule(trans)); break; case NFT_MSG_DELRULE: + case NFT_MSG_DESTROYRULE: trans->ctx.chain->use++; nft_clear(trans->ctx.net, nft_trans_rule(trans)); nft_rule_expr_activate(&trans->ctx, nft_trans_rule(trans)); @@ -9357,6 +9448,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) list_del_rcu(&nft_trans_set(trans)->list); break; case NFT_MSG_DELSET: + case NFT_MSG_DESTROYSET: trans->ctx.table->use++; nft_clear(trans->ctx.net, nft_trans_set(trans)); nft_trans_destroy(trans); @@ -9372,6 +9464,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) atomic_dec(&te->set->nelems); break; case NFT_MSG_DELSETELEM: + case NFT_MSG_DESTROYSETELEM: te = (struct nft_trans_elem *)trans->data; nft_setelem_data_activate(net, te->set, &te->elem); @@ -9391,6 +9484,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) } break; case NFT_MSG_DELOBJ: + case NFT_MSG_DESTROYOBJ: trans->ctx.table->use++; nft_clear(trans->ctx.net, nft_trans_obj(trans)); nft_trans_destroy(trans); @@ -9407,6 +9501,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) } break; case NFT_MSG_DELFLOWTABLE: + case NFT_MSG_DESTROYFLOWTABLE: if (nft_trans_flowtable_update(trans)) { list_splice(&nft_trans_flowtable_hooks(trans), &nft_trans_flowtable(trans)->hook_list); diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 709a736c301c..6ecd0ba2e546 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -21,6 +21,26 @@ #include <net/netfilter/nf_log.h> #include <net/netfilter/nft_meta.h> +#if defined(CONFIG_RETPOLINE) && defined(CONFIG_X86) + +static struct static_key_false nf_tables_skip_direct_calls; + +static bool nf_skip_indirect_calls(void) +{ + return static_branch_likely(&nf_tables_skip_direct_calls); +} + +static void __init nf_skip_indirect_calls_enable(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_RETPOLINE)) + static_branch_enable(&nf_tables_skip_direct_calls); +} +#else +static inline bool nf_skip_indirect_calls(void) { return false; } + +static inline void nf_skip_indirect_calls_enable(void) { } +#endif + static noinline void __nft_trace_packet(struct nft_traceinfo *info, const struct nft_chain *chain, enum nft_trace_types type) @@ -193,7 +213,12 @@ static void expr_call_ops_eval(const struct nft_expr *expr, struct nft_pktinfo *pkt) { #ifdef CONFIG_RETPOLINE - unsigned long e = (unsigned long)expr->ops->eval; + unsigned long e; + + if (nf_skip_indirect_calls()) + goto indirect_call; + + e = (unsigned long)expr->ops->eval; #define X(e, fun) \ do { if ((e) == (unsigned long)(fun)) \ return fun(expr, regs, pkt); } while (0) @@ -203,13 +228,19 @@ static void expr_call_ops_eval(const struct nft_expr *expr, X(e, nft_counter_eval); X(e, nft_meta_get_eval); X(e, nft_lookup_eval); +#if IS_ENABLED(CONFIG_NFT_CT) + X(e, nft_ct_get_fast_eval); +#endif X(e, nft_range_eval); X(e, nft_immediate_eval); X(e, nft_byteorder_eval); X(e, nft_dynset_eval); X(e, nft_rt_get_eval); X(e, nft_bitwise_eval); + X(e, nft_objref_eval); + X(e, nft_objref_map_eval); #undef X +indirect_call: #endif /* CONFIG_RETPOLINE */ expr->ops->eval(expr, regs, pkt); } @@ -369,6 +400,8 @@ int __init nf_tables_core_module_init(void) goto err; } + nf_skip_indirect_calls_enable(); + return 0; err: diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index c68e2151defe..b9c84499438b 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -12,7 +12,7 @@ #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_acct.h> #include <net/netfilter/nf_conntrack_tuple.h> @@ -23,16 +23,6 @@ #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_expect.h> -struct nft_ct { - enum nft_ct_keys key:8; - enum ip_conntrack_dir dir:8; - u8 len; - union { - u8 dreg; - u8 sreg; - }; -}; - struct nft_ct_helper_obj { struct nf_conntrack_helper *helper4; struct nf_conntrack_helper *helper6; @@ -759,6 +749,18 @@ static bool nft_ct_set_reduce(struct nft_regs_track *track, return false; } +#ifdef CONFIG_RETPOLINE +static const struct nft_expr_ops nft_ct_get_fast_ops = { + .type = &nft_ct_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), + .eval = nft_ct_get_fast_eval, + .init = nft_ct_get_init, + .destroy = nft_ct_get_destroy, + .dump = nft_ct_get_dump, + .reduce = nft_ct_set_reduce, +}; +#endif + static const struct nft_expr_ops nft_ct_set_ops = { .type = &nft_ct_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), @@ -791,8 +793,21 @@ nft_ct_select_ops(const struct nft_ctx *ctx, if (tb[NFTA_CT_DREG] && tb[NFTA_CT_SREG]) return ERR_PTR(-EINVAL); - if (tb[NFTA_CT_DREG]) + if (tb[NFTA_CT_DREG]) { +#ifdef CONFIG_RETPOLINE + u32 k = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); + + switch (k) { + case NFT_CT_STATE: + case NFT_CT_DIRECTION: + case NFT_CT_STATUS: + case NFT_CT_MARK: + case NFT_CT_SECMARK: + return &nft_ct_get_fast_ops; + } +#endif return &nft_ct_get_ops; + } if (tb[NFTA_CT_SREG]) { #ifdef CONFIG_NF_CONNTRACK_ZONES diff --git a/net/netfilter/nft_ct_fast.c b/net/netfilter/nft_ct_fast.c new file mode 100644 index 000000000000..89983b0613fa --- /dev/null +++ b/net/netfilter/nft_ct_fast.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0-only +#if IS_ENABLED(CONFIG_NFT_CT) +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_conntrack.h> + +void nft_ct_get_fast_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_ct *priv = nft_expr_priv(expr); + u32 *dest = ®s->data[priv->dreg]; + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct; + unsigned int state; + + ct = nf_ct_get(pkt->skb, &ctinfo); + if (!ct) { + regs->verdict.code = NFT_BREAK; + return; + } + + switch (priv->key) { + case NFT_CT_STATE: + if (ct) + state = NF_CT_STATE_BIT(ctinfo); + else if (ctinfo == IP_CT_UNTRACKED) + state = NF_CT_STATE_UNTRACKED_BIT; + else + state = NF_CT_STATE_INVALID_BIT; + *dest = state; + return; + case NFT_CT_DIRECTION: + nft_reg_store8(dest, CTINFO2DIR(ctinfo)); + return; + case NFT_CT_STATUS: + *dest = ct->status; + return; +#ifdef CONFIG_NF_CONNTRACK_MARK + case NFT_CT_MARK: + *dest = ct->mark; + return; +#endif +#ifdef CONFIG_NF_CONNTRACK_SECMARK + case NFT_CT_SECMARK: + *dest = ct->secmark; + return; +#endif + default: + WARN_ON_ONCE(1); + regs->verdict.code = NFT_BREAK; + break; + } +} +EXPORT_SYMBOL_GPL(nft_ct_get_fast_eval); +#endif diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index 7b01aa2ef653..cb37169608ba 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -13,9 +13,9 @@ #define nft_objref_priv(expr) *((struct nft_object **)nft_expr_priv(expr)) -static void nft_objref_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +void nft_objref_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { struct nft_object *obj = nft_objref_priv(expr); @@ -100,9 +100,9 @@ struct nft_objref_map { struct nft_set_binding binding; }; -static void nft_objref_map_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +void nft_objref_map_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { struct nft_objref_map *priv = nft_expr_priv(expr); const struct nft_set *set = priv->set; diff --git a/net/netfilter/xt_length.c b/net/netfilter/xt_length.c index 1873da3a945a..b3d623a52885 100644 --- a/net/netfilter/xt_length.c +++ b/net/netfilter/xt_length.c @@ -21,7 +21,7 @@ static bool length_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_length_info *info = par->matchinfo; - u_int16_t pktlen = ntohs(ip_hdr(skb)->tot_len); + u32 pktlen = skb_ip_totlen(skb); return (pktlen >= info->min && pktlen <= info->max) ^ info->invert; } diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index c8b137649ca4..2172930b1f17 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1103,7 +1103,7 @@ static int ovs_skb_network_trim(struct sk_buff *skb) switch (skb->protocol) { case htons(ETH_P_IP): - len = ntohs(ip_hdr(skb)->tot_len); + len = skb_ip_totlen(skb); break; case htons(ETH_P_IPV6): len = sizeof(struct ipv6hdr) diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index e20d1a973417..416976f70322 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -107,7 +107,8 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags, rcu_assign_pointer(flow->stats[cpu], new_stats); - cpumask_set_cpu(cpu, &flow->cpu_used_mask); + cpumask_set_cpu(cpu, + flow->cpu_used_mask); goto unlock; } } @@ -135,7 +136,8 @@ void ovs_flow_stats_get(const struct sw_flow *flow, memset(ovs_stats, 0, sizeof(*ovs_stats)); /* We open code this to make sure cpu 0 is always considered */ - for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, &flow->cpu_used_mask)) { + for (cpu = 0; cpu < nr_cpu_ids; + cpu = cpumask_next(cpu, flow->cpu_used_mask)) { struct sw_flow_stats *stats = rcu_dereference_ovsl(flow->stats[cpu]); if (stats) { @@ -159,7 +161,8 @@ void ovs_flow_stats_clear(struct sw_flow *flow) int cpu; /* We open code this to make sure cpu 0 is always considered */ - for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, &flow->cpu_used_mask)) { + for (cpu = 0; cpu < nr_cpu_ids; + cpu = cpumask_next(cpu, flow->cpu_used_mask)) { struct sw_flow_stats *stats = ovsl_dereference(flow->stats[cpu]); if (stats) { diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 073ab73ffeaa..b5711aff6e76 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -229,7 +229,7 @@ struct sw_flow { */ struct sw_flow_key key; struct sw_flow_id id; - struct cpumask cpu_used_mask; + struct cpumask *cpu_used_mask; struct sw_flow_mask *mask; struct sw_flow_actions __rcu *sf_acts; struct sw_flow_stats __rcu *stats[]; /* One for each CPU. First one diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index 0a0e4c283f02..791504b7f42b 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -79,6 +79,7 @@ struct sw_flow *ovs_flow_alloc(void) return ERR_PTR(-ENOMEM); flow->stats_last_writer = -1; + flow->cpu_used_mask = (struct cpumask *)&flow->stats[nr_cpu_ids]; /* Initialize the default stat node. */ stats = kmem_cache_alloc_node(flow_stats_cache, @@ -91,7 +92,7 @@ struct sw_flow *ovs_flow_alloc(void) RCU_INIT_POINTER(flow->stats[0], stats); - cpumask_set_cpu(0, &flow->cpu_used_mask); + cpumask_set_cpu(0, flow->cpu_used_mask); return flow; err: @@ -115,7 +116,7 @@ static void flow_free(struct sw_flow *flow) flow->sf_acts); /* We open code this to make sure cpu 0 is always considered */ for (cpu = 0; cpu < nr_cpu_ids; - cpu = cpumask_next(cpu, &flow->cpu_used_mask)) { + cpu = cpumask_next(cpu, flow->cpu_used_mask)) { if (flow->stats[cpu]) kmem_cache_free(flow_stats_cache, (struct sw_flow_stats __force *)flow->stats[cpu]); @@ -1196,7 +1197,8 @@ int ovs_flow_init(void) flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow) + (nr_cpu_ids - * sizeof(struct sw_flow_stats *)), + * sizeof(struct sw_flow_stats *)) + + cpumask_size(), 0, 0, NULL); if (flow_cache == NULL) return -ENOMEM; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index b5ab98ca2511..8ffb19c643ab 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2296,6 +2296,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, else if (skb->pkt_type != PACKET_OUTGOING && skb_csum_unnecessary(skb)) status |= TP_STATUS_CSUM_VALID; + if (skb_is_gso(skb) && skb_is_gso_tcp(skb)) + status |= TP_STATUS_GSO_TCP; if (snaplen > res) snaplen = res; @@ -3522,6 +3524,8 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, else if (skb->pkt_type != PACKET_OUTGOING && skb_csum_unnecessary(skb)) aux.tp_status |= TP_STATUS_CSUM_VALID; + if (skb_is_gso(skb) && skb_is_gso_tcp(skb)) + aux.tp_status |= TP_STATUS_GSO_TCP; aux.tp_len = origlen; aux.tp_snaplen = skb->len; diff --git a/net/phonet/pep-gprs.c b/net/phonet/pep-gprs.c index 1f5df0432d37..7f68d8662cfb 100644 --- a/net/phonet/pep-gprs.c +++ b/net/phonet/pep-gprs.c @@ -19,6 +19,8 @@ #include <net/tcp_states.h> #include <net/phonet/gprs.h> +#include <trace/events/sock.h> + #define GPRS_DEFAULT_MTU 1400 struct gprs_dev { @@ -138,6 +140,8 @@ static void gprs_data_ready(struct sock *sk) struct gprs_dev *gp = sk->sk_user_data; struct sk_buff *skb; + trace_sk_data_ready(sk); + while ((skb = pep_read(sk)) != NULL) { skb_orphan(skb); gprs_recv(gp, skb); diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c index e595079c2caf..722936f7dd98 100644 --- a/net/qrtr/ns.c +++ b/net/qrtr/ns.c @@ -12,6 +12,7 @@ #include "qrtr.h" +#include <trace/events/sock.h> #define CREATE_TRACE_POINTS #include <trace/events/qrtr.h> @@ -755,6 +756,8 @@ static void qrtr_ns_worker(struct work_struct *work) static void qrtr_ns_data_ready(struct sock *sk) { + trace_sk_data_ready(sk); + queue_work(qrtr_ns.workqueue, &qrtr_ns.work); } diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index cfbf0e129cba..e53b7f266bd7 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -31,6 +31,7 @@ * */ #include <linux/kernel.h> +#include <linux/sched/clock.h> #include <linux/slab.h> #include <linux/pci.h> #include <linux/dma-mapping.h> diff --git a/net/rds/recv.c b/net/rds/recv.c index 5b426dc3634d..c71b923764fd 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -35,6 +35,7 @@ #include <net/sock.h> #include <linux/in.h> #include <linux/export.h> +#include <linux/sched/clock.h> #include <linux/time.h> #include <linux/rds.h> diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 7edf2e69d3fe..014fa24418c1 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -34,6 +34,7 @@ #include <linux/gfp.h> #include <linux/in.h> #include <net/tcp.h> +#include <trace/events/sock.h> #include "rds.h" #include "tcp.h" @@ -234,6 +235,7 @@ void rds_tcp_listen_data_ready(struct sock *sk) { void (*ready)(struct sock *sk); + trace_sk_data_ready(sk); rdsdebug("listen data ready sk %p\n", sk); read_lock_bh(&sk->sk_callback_lock); diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index f4ee13da90c7..c00f04a1a534 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -33,6 +33,7 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <net/tcp.h> +#include <trace/events/sock.h> #include "rds.h" #include "tcp.h" @@ -309,6 +310,7 @@ void rds_tcp_data_ready(struct sock *sk) struct rds_conn_path *cp; struct rds_tcp_connection *tc; + trace_sk_data_ready(sk); rdsdebug("data ready sk %p\n", sk); read_lock_bh(&sk->sk_callback_lock); diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c index f5afc9bcdee6..786dbfdad772 100644 --- a/net/rfkill/rfkill-gpio.c +++ b/net/rfkill/rfkill-gpio.c @@ -75,6 +75,8 @@ static int rfkill_gpio_probe(struct platform_device *pdev) { struct rfkill_gpio_data *rfkill; struct gpio_desc *gpio; + const char *name_property; + const char *type_property; const char *type_name; int ret; @@ -82,8 +84,15 @@ static int rfkill_gpio_probe(struct platform_device *pdev) if (!rfkill) return -ENOMEM; - device_property_read_string(&pdev->dev, "name", &rfkill->name); - device_property_read_string(&pdev->dev, "type", &type_name); + if (dev_of_node(&pdev->dev)) { + name_property = "label"; + type_property = "radio-type"; + } else { + name_property = "name"; + type_property = "type"; + } + device_property_read_string(&pdev->dev, name_property, &rfkill->name); + device_property_read_string(&pdev->dev, type_property, &type_name); if (!rfkill->name) rfkill->name = dev_name(&pdev->dev); @@ -157,12 +166,19 @@ static const struct acpi_device_id rfkill_acpi_match[] = { MODULE_DEVICE_TABLE(acpi, rfkill_acpi_match); #endif +static const struct of_device_id rfkill_of_match[] __maybe_unused = { + { .compatible = "rfkill-gpio", }, + { }, +}; +MODULE_DEVICE_TABLE(of, rfkill_of_match); + static struct platform_driver rfkill_gpio_driver = { .probe = rfkill_gpio_probe, .remove = rfkill_gpio_remove, .driver = { .name = "rfkill_gpio", .acpi_match_table = ACPI_PTR(rfkill_acpi_match), + .of_match_table = of_match_ptr(rfkill_of_match), }, }; diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig index 7ae023b37a83..a20986806fea 100644 --- a/net/rxrpc/Kconfig +++ b/net/rxrpc/Kconfig @@ -36,6 +36,15 @@ config AF_RXRPC_INJECT_LOSS Say Y here to inject packet loss by discarding some received and some transmitted packets. +config AF_RXRPC_INJECT_RX_DELAY + bool "Inject delay into packet reception" + depends on SYSCTL + help + Say Y here to inject a delay into packet reception, allowing an + extended RTT time to be modelled. The delay can be configured using + /proc/sys/net/rxrpc/rxrpc_inject_rx_delay, setting a number of + milliseconds up to 0.5s (note that the granularity is actually in + jiffies). config AF_RXRPC_DEBUG bool "RxRPC dynamic debugging" diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index ebbd4a1c3f86..102f5cbff91a 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -786,7 +786,7 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol, INIT_LIST_HEAD(&rx->sock_calls); INIT_LIST_HEAD(&rx->to_be_accepted); INIT_LIST_HEAD(&rx->recvmsg_q); - rwlock_init(&rx->recvmsg_lock); + spin_lock_init(&rx->recvmsg_lock); rwlock_init(&rx->call_lock); memset(&rx->srx, 0, sizeof(rx->srx)); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 433060cade03..9e19688b0e06 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -149,7 +149,7 @@ struct rxrpc_sock { struct list_head sock_calls; /* List of calls owned by this socket */ struct list_head to_be_accepted; /* calls awaiting acceptance */ struct list_head recvmsg_q; /* Calls awaiting recvmsg's attention */ - rwlock_t recvmsg_lock; /* Lock for recvmsg_q */ + spinlock_t recvmsg_lock; /* Lock for recvmsg_q */ struct key *key; /* security for this socket */ struct key *securities; /* list of server security descriptors */ struct rb_root calls; /* User ID -> call mapping */ @@ -284,7 +284,9 @@ struct rxrpc_local { struct task_struct *io_thread; struct completion io_thread_ready; /* Indication that the I/O thread started */ struct rxrpc_sock *service; /* Service(s) listening on this endpoint */ - struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */ +#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY + struct sk_buff_head rx_delay_queue; /* Delay injection queue */ +#endif struct sk_buff_head rx_queue; /* Received packets */ struct list_head conn_attend_q; /* Conns requiring immediate attention */ struct list_head call_attend_q; /* Calls requiring immediate attention */ @@ -688,9 +690,11 @@ struct rxrpc_call { /* Receive-phase ACK management (ACKs we send). */ u8 ackr_reason; /* reason to ACK */ + u16 ackr_sack_base; /* Starting slot in SACK table ring */ rxrpc_serial_t ackr_serial; /* serial of packet being ACK'd */ - atomic64_t ackr_window; /* Base (in LSW) and top (in MSW) of SACK window */ - atomic_t ackr_nr_unacked; /* Number of unacked packets */ + rxrpc_seq_t ackr_window; /* Base of SACK window */ + rxrpc_seq_t ackr_wtop; /* Base of SACK window */ + unsigned int ackr_nr_unacked; /* Number of unacked packets */ atomic_t ackr_nr_consumed; /* Number of packets needing hard ACK */ struct { #define RXRPC_SACK_SIZE 256 @@ -1109,6 +1113,9 @@ extern unsigned long rxrpc_idle_ack_delay; extern unsigned int rxrpc_rx_window_size; extern unsigned int rxrpc_rx_mtu; extern unsigned int rxrpc_rx_jumbo_max; +#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY +extern unsigned long rxrpc_inject_rx_delay; +#endif /* * net_ns.c diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 3e8689fdc437..0f5a1d77b890 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -195,7 +195,7 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) tail = b->peer_backlog_tail; while (CIRC_CNT(head, tail, size) > 0) { struct rxrpc_peer *peer = b->peer_backlog[tail]; - rxrpc_put_local(peer->local, rxrpc_local_put_prealloc_conn); + rxrpc_put_local(peer->local, rxrpc_local_put_prealloc_peer); kfree(peer); tail = (tail + 1) & (size - 1); } diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 1abdef15debc..e363f21a2014 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -498,9 +498,18 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_send_ACK(call, RXRPC_ACK_IDLE, 0, rxrpc_propose_ack_rx_idle); - if (atomic_read(&call->ackr_nr_unacked) > 2) - rxrpc_send_ACK(call, RXRPC_ACK_IDLE, 0, - rxrpc_propose_ack_input_data); + if (call->ackr_nr_unacked > 2) { + if (call->peer->rtt_count < 3) + rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, + rxrpc_propose_ack_ping_for_rtt); + else if (ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), + ktime_get_real())) + rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, + rxrpc_propose_ack_ping_for_old_rtt); + else + rxrpc_send_ACK(call, RXRPC_ACK_IDLE, 0, + rxrpc_propose_ack_input_data); + } /* Make sure the timer is restarted */ if (!__rxrpc_call_is_complete(call)) { diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index f3c9f0201c15..6eaffb0d8fdc 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -167,7 +167,8 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, call->tx_total_len = -1; call->next_rx_timo = 20 * HZ; call->next_req_timo = 1 * HZ; - atomic64_set(&call->ackr_window, 0x100000001ULL); + call->ackr_window = 1; + call->ackr_wtop = 1; memset(&call->sock_node, 0xed, sizeof(call->sock_node)); @@ -560,7 +561,7 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) rxrpc_put_call_slot(call); /* Make sure we don't get any more notifications */ - write_lock(&rx->recvmsg_lock); + spin_lock(&rx->recvmsg_lock); if (!list_empty(&call->recvmsg_link)) { _debug("unlinking once-pending call %p { e=%lx f=%lx }", @@ -573,7 +574,7 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) call->recvmsg_link.next = NULL; call->recvmsg_link.prev = NULL; - write_unlock(&rx->recvmsg_lock); + spin_unlock(&rx->recvmsg_lock); if (put) rxrpc_put_call(call, rxrpc_call_put_unnotify); diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index f30323de82bd..89ac05a711a4 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -8,11 +8,6 @@ #include <linux/slab.h> #include "ar-internal.h" -static struct rxrpc_bundle rxrpc_service_dummy_bundle = { - .ref = REFCOUNT_INIT(1), - .debug_id = UINT_MAX, -}; - /* * Find a service connection under RCU conditions. * @@ -132,8 +127,6 @@ struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *rxn */ conn->state = RXRPC_CONN_SERVICE_PREALLOC; refcount_set(&conn->ref, 2); - conn->bundle = rxrpc_get_bundle(&rxrpc_service_dummy_bundle, - rxrpc_bundle_get_service_conn); atomic_inc(&rxnet->nr_conns); write_lock(&rxnet->conn_lock); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 367927a99881..d68848fce51f 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -338,7 +338,8 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial) static void rxrpc_input_update_ack_window(struct rxrpc_call *call, rxrpc_seq_t window, rxrpc_seq_t wtop) { - atomic64_set_release(&call->ackr_window, ((u64)wtop) << 32 | window); + call->ackr_window = window; + call->ackr_wtop = wtop; } /* @@ -367,9 +368,9 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb, struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct sk_buff *oos; rxrpc_serial_t serial = sp->hdr.serial; - u64 win = atomic64_read(&call->ackr_window); - rxrpc_seq_t window = lower_32_bits(win); - rxrpc_seq_t wtop = upper_32_bits(win); + unsigned int sack = call->ackr_sack_base; + rxrpc_seq_t window = call->ackr_window; + rxrpc_seq_t wtop = call->ackr_wtop; rxrpc_seq_t wlimit = window + call->rx_winsize - 1; rxrpc_seq_t seq = sp->hdr.seq; bool last = sp->hdr.flags & RXRPC_LAST_PACKET; @@ -410,20 +411,23 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb, /* Queue the packet. */ if (seq == window) { - rxrpc_seq_t reset_from; - bool reset_sack = false; - if (sp->hdr.flags & RXRPC_REQUEST_ACK) ack_reason = RXRPC_ACK_REQUESTED; /* Send an immediate ACK if we fill in a hole */ else if (!skb_queue_empty(&call->rx_oos_queue)) ack_reason = RXRPC_ACK_DELAY; else - atomic_inc_return(&call->ackr_nr_unacked); + call->ackr_nr_unacked++; window++; - if (after(window, wtop)) + if (after(window, wtop)) { + trace_rxrpc_sack(call, seq, sack, rxrpc_sack_none); wtop = window; + } else { + trace_rxrpc_sack(call, seq, sack, rxrpc_sack_advance); + sack = (sack + 1) % RXRPC_SACK_SIZE; + } + rxrpc_get_skb(skb, rxrpc_skb_get_to_recvmsg); @@ -440,43 +444,39 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb, __skb_unlink(oos, &call->rx_oos_queue); last = osp->hdr.flags & RXRPC_LAST_PACKET; seq = osp->hdr.seq; - if (!reset_sack) { - reset_from = seq; - reset_sack = true; - } + call->ackr_sack_table[sack] = 0; + trace_rxrpc_sack(call, seq, sack, rxrpc_sack_fill); + sack = (sack + 1) % RXRPC_SACK_SIZE; window++; rxrpc_input_queue_data(call, oos, window, wtop, - rxrpc_receive_queue_oos); + rxrpc_receive_queue_oos); } spin_unlock(&call->recvmsg_queue.lock); - if (reset_sack) { - do { - call->ackr_sack_table[reset_from % RXRPC_SACK_SIZE] = 0; - } while (reset_from++, before(reset_from, window)); - } + call->ackr_sack_base = sack; } else { - bool keep = false; + unsigned int slot; ack_reason = RXRPC_ACK_OUT_OF_SEQUENCE; - if (!call->ackr_sack_table[seq % RXRPC_SACK_SIZE]) { - call->ackr_sack_table[seq % RXRPC_SACK_SIZE] = 1; - keep = 1; + slot = seq - window; + sack = (sack + slot) % RXRPC_SACK_SIZE; + + if (call->ackr_sack_table[sack % RXRPC_SACK_SIZE]) { + ack_reason = RXRPC_ACK_DUPLICATE; + goto send_ack; } + call->ackr_sack_table[sack % RXRPC_SACK_SIZE] |= 1; + trace_rxrpc_sack(call, seq, sack, rxrpc_sack_oos); + if (after(seq + 1, wtop)) { wtop = seq + 1; rxrpc_input_update_ack_window(call, window, wtop); } - if (!keep) { - ack_reason = RXRPC_ACK_DUPLICATE; - goto send_ack; - } - skb_queue_walk(&call->rx_oos_queue, oos) { struct rxrpc_skb_priv *osp = rxrpc_skb(oos); @@ -567,8 +567,8 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_serial_t serial = sp->hdr.serial; rxrpc_seq_t seq0 = sp->hdr.seq; - _enter("{%llx,%x},{%u,%x}", - atomic64_read(&call->ackr_window), call->rx_highest_seq, + _enter("{%x,%x,%x},{%u,%x}", + call->ackr_window, call->ackr_wtop, call->rx_highest_seq, skb->len, seq0); if (__rxrpc_call_is_complete(call)) diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index 9e9dfb2fc559..4a3a08a0e2cd 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -25,6 +25,7 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, */ int rxrpc_encap_rcv(struct sock *udp_sk, struct sk_buff *skb) { + struct sk_buff_head *rx_queue; struct rxrpc_local *local = rcu_dereference_sk_user_data(udp_sk); if (unlikely(!local)) { @@ -36,7 +37,16 @@ int rxrpc_encap_rcv(struct sock *udp_sk, struct sk_buff *skb) skb->mark = RXRPC_SKB_MARK_PACKET; rxrpc_new_skb(skb, rxrpc_skb_new_encap_rcv); - skb_queue_tail(&local->rx_queue, skb); + rx_queue = &local->rx_queue; +#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY + if (rxrpc_inject_rx_delay || + !skb_queue_empty(&local->rx_delay_queue)) { + skb->tstamp = ktime_add_ms(skb->tstamp, rxrpc_inject_rx_delay); + rx_queue = &local->rx_delay_queue; + } +#endif + + skb_queue_tail(rx_queue, skb); rxrpc_wake_up_io_thread(local); return 0; } @@ -407,6 +417,9 @@ int rxrpc_io_thread(void *data) struct rxrpc_local *local = data; struct rxrpc_call *call; struct sk_buff *skb; +#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY + ktime_t now; +#endif bool should_stop; complete(&local->io_thread_ready); @@ -481,6 +494,17 @@ int rxrpc_io_thread(void *data) continue; } + /* Inject a delay into packets if requested. */ +#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY + now = ktime_get_real(); + while ((skb = skb_peek(&local->rx_delay_queue))) { + if (ktime_before(now, skb->tstamp)) + break; + skb = skb_dequeue(&local->rx_delay_queue); + skb_queue_tail(&local->rx_queue, skb); + } +#endif + if (!skb_queue_empty(&local->rx_queue)) { spin_lock_irq(&local->rx_queue.lock); skb_queue_splice_tail_init(&local->rx_queue, &rx_queue); @@ -502,6 +526,28 @@ int rxrpc_io_thread(void *data) if (should_stop) break; + +#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY + skb = skb_peek(&local->rx_delay_queue); + if (skb) { + unsigned long timeout; + ktime_t tstamp = skb->tstamp; + ktime_t now = ktime_get_real(); + s64 delay_ns = ktime_to_ns(ktime_sub(tstamp, now)); + + if (delay_ns <= 0) { + __set_current_state(TASK_RUNNING); + continue; + } + + timeout = nsecs_to_jiffies(delay_ns); + timeout = max(timeout, 1UL); + schedule_timeout(timeout); + __set_current_state(TASK_RUNNING); + continue; + } +#endif + schedule(); } diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index b8eaca5d9f22..7d910aee4f8c 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -108,8 +108,10 @@ static struct rxrpc_local *rxrpc_alloc_local(struct net *net, local->net = net; local->rxnet = rxrpc_net(net); INIT_HLIST_NODE(&local->link); - init_rwsem(&local->defrag_sem); init_completion(&local->io_thread_ready); +#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY + skb_queue_head_init(&local->rx_delay_queue); +#endif skb_queue_head_init(&local->rx_queue); INIT_LIST_HEAD(&local->conn_attend_q); INIT_LIST_HEAD(&local->call_attend_q); @@ -434,6 +436,9 @@ void rxrpc_destroy_local(struct rxrpc_local *local) /* At this point, there should be no more packets coming in to the * local endpoint. */ +#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY + rxrpc_purge_queue(&local->rx_delay_queue); +#endif rxrpc_purge_queue(&local->rx_queue); rxrpc_purge_client_connections(local); } diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 056c428d8bf3..825b81183046 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -53,3 +53,10 @@ unsigned int rxrpc_rx_mtu = 5692; * sender that we're willing to handle. */ unsigned int rxrpc_rx_jumbo_max = 4; + +#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY +/* + * The delay to inject into packet reception. + */ +unsigned long rxrpc_inject_rx_delay; +#endif diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index a9746be29634..6b2022240076 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -83,59 +83,36 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_connection *conn, struct rxrpc_txbuf *txb) { struct rxrpc_ackinfo ackinfo; - unsigned int qsize; - rxrpc_seq_t window, wtop, wrap_point, ix, first; + unsigned int qsize, sack, wrap, to; + rxrpc_seq_t window, wtop; int rsize; - u64 wtmp; u32 mtu, jmax; u8 *ackp = txb->acks; - u8 sack_buffer[sizeof(call->ackr_sack_table)] __aligned(8); - atomic_set(&call->ackr_nr_unacked, 0); + call->ackr_nr_unacked = 0; atomic_set(&call->ackr_nr_consumed, 0); rxrpc_inc_stat(call->rxnet, stat_tx_ack_fill); + clear_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags); - /* Barrier against rxrpc_input_data(). */ -retry: - wtmp = atomic64_read_acquire(&call->ackr_window); - window = lower_32_bits(wtmp); - wtop = upper_32_bits(wtmp); + window = call->ackr_window; + wtop = call->ackr_wtop; + sack = call->ackr_sack_base % RXRPC_SACK_SIZE; txb->ack.firstPacket = htonl(window); - txb->ack.nAcks = 0; + txb->ack.nAcks = wtop - window; if (after(wtop, window)) { - /* Try to copy the SACK ring locklessly. We can use the copy, - * only if the now-current top of the window didn't go past the - * previously read base - otherwise we can't know whether we - * have old data or new data. - */ - memcpy(sack_buffer, call->ackr_sack_table, sizeof(sack_buffer)); - wrap_point = window + RXRPC_SACK_SIZE - 1; - wtmp = atomic64_read_acquire(&call->ackr_window); - window = lower_32_bits(wtmp); - wtop = upper_32_bits(wtmp); - if (after(wtop, wrap_point)) { - cond_resched(); - goto retry; - } - - /* The buffer is maintained as a ring with an invariant mapping - * between bit position and sequence number, so we'll probably - * need to rotate it. - */ - txb->ack.nAcks = wtop - window; - ix = window % RXRPC_SACK_SIZE; - first = sizeof(sack_buffer) - ix; + wrap = RXRPC_SACK_SIZE - sack; + to = min_t(unsigned int, txb->ack.nAcks, RXRPC_SACK_SIZE); - if (ix + txb->ack.nAcks <= RXRPC_SACK_SIZE) { - memcpy(txb->acks, sack_buffer + ix, txb->ack.nAcks); + if (sack + txb->ack.nAcks <= RXRPC_SACK_SIZE) { + memcpy(txb->acks, call->ackr_sack_table + sack, txb->ack.nAcks); } else { - memcpy(txb->acks, sack_buffer + ix, first); - memcpy(txb->acks + first, sack_buffer, - txb->ack.nAcks - first); + memcpy(txb->acks, call->ackr_sack_table + sack, wrap); + memcpy(txb->acks + wrap, call->ackr_sack_table, + to - wrap); } - ackp += txb->ack.nAcks; + ackp += to; } else if (before(wtop, window)) { pr_warn("ack window backward %x %x", window, wtop); } else if (txb->ack.reason == RXRPC_ACK_DELAY) { @@ -253,12 +230,15 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, len); ret = do_udp_sendmsg(conn->local->socket, &msg, len); call->peer->last_tx_at = ktime_get_seconds(); - if (ret < 0) + if (ret < 0) { trace_rxrpc_tx_fail(call->debug_id, serial, ret, rxrpc_tx_point_call_ack); - else + } else { trace_rxrpc_tx_packet(call->debug_id, &txb->wire, rxrpc_tx_point_call_ack); + if (txb->wire.flags & RXRPC_REQUEST_ACK) + call->peer->rtt_last_req = ktime_get_real(); + } rxrpc_tx_backoff(call, ret); if (!__rxrpc_call_is_complete(call)) { @@ -429,8 +409,6 @@ dont_set_request_ack: if (txb->len >= call->peer->maxdata) goto send_fragmentable; - down_read(&conn->local->defrag_sem); - txb->last_sent = ktime_get_real(); if (txb->wire.flags & RXRPC_REQUEST_ACK) rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_data); @@ -445,7 +423,6 @@ dont_set_request_ack: ret = do_udp_sendmsg(conn->local->socket, &msg, len); conn->peer->last_tx_at = ktime_get_seconds(); - up_read(&conn->local->defrag_sem); if (ret < 0) { rxrpc_inc_stat(call->rxnet, stat_tx_data_send_fail); rxrpc_cancel_rtt_probe(call, serial, rtt_slot); @@ -506,8 +483,6 @@ send_fragmentable: /* attempt to send this message with fragmentation enabled */ _debug("send fragment"); - down_write(&conn->local->defrag_sem); - txb->last_sent = ktime_get_real(); if (txb->wire.flags & RXRPC_REQUEST_ACK) rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_data); @@ -539,8 +514,6 @@ send_fragmentable: rxrpc_tx_point_call_data_frag); } rxrpc_tx_backoff(call, ret); - - up_write(&conn->local->defrag_sem); goto done; } diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 750158a085cd..682636d3b060 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -55,7 +55,6 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) unsigned long timeout = 0; rxrpc_seq_t acks_hard_ack; char lbuff[50], rbuff[50]; - u64 wtmp; if (v == &rxnet->calls) { seq_puts(seq, @@ -83,7 +82,6 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) } acks_hard_ack = READ_ONCE(call->acks_hard_ack); - wtmp = atomic64_read_acquire(&call->ackr_window); seq_printf(seq, "UDP %-47.47s %-47.47s %4x %08x %08x %s %3u" " %-8.8s %08x %08x %08x %02x %08x %02x %08x %02x %06lx\n", @@ -98,7 +96,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) call->abort_code, call->debug_id, acks_hard_ack, READ_ONCE(call->tx_top) - acks_hard_ack, - lower_32_bits(wtmp), upper_32_bits(wtmp) - lower_32_bits(wtmp), + call->ackr_window, call->ackr_wtop - call->ackr_window, call->rx_serial, call->cong_cwnd, timeout); diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index dd54ceee7bcc..50d263a6359d 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -40,12 +40,12 @@ void rxrpc_notify_socket(struct rxrpc_call *call) call->notify_rx(sk, call, call->user_call_ID); spin_unlock(&call->notify_lock); } else { - write_lock(&rx->recvmsg_lock); + spin_lock(&rx->recvmsg_lock); if (list_empty(&call->recvmsg_link)) { rxrpc_get_call(call, rxrpc_call_get_notify_socket); list_add_tail(&call->recvmsg_link, &rx->recvmsg_q); } - write_unlock(&rx->recvmsg_lock); + spin_unlock(&rx->recvmsg_lock); if (!sock_flag(sk, SOCK_DEAD)) { _debug("call %ps", sk->sk_data_ready); @@ -95,7 +95,7 @@ static int rxrpc_recvmsg_term(struct rxrpc_call *call, struct msghdr *msg) } trace_rxrpc_recvdata(call, rxrpc_recvmsg_terminal, - lower_32_bits(atomic64_read(&call->ackr_window)) - 1, + call->ackr_window - 1, call->rx_pkt_offset, call->rx_pkt_len, ret); return ret; } @@ -175,13 +175,13 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, rx_pkt_len = call->rx_pkt_len; if (rxrpc_call_has_failed(call)) { - seq = lower_32_bits(atomic64_read(&call->ackr_window)) - 1; + seq = call->ackr_window - 1; ret = -EIO; goto done; } if (test_bit(RXRPC_CALL_RECVMSG_READ_ALL, &call->flags)) { - seq = lower_32_bits(atomic64_read(&call->ackr_window)) - 1; + seq = call->ackr_window - 1; ret = 1; goto done; } @@ -335,14 +335,14 @@ try_again: /* Find the next call and dequeue it if we're not just peeking. If we * do dequeue it, that comes with a ref that we will need to release. */ - write_lock(&rx->recvmsg_lock); + spin_lock(&rx->recvmsg_lock); l = rx->recvmsg_q.next; call = list_entry(l, struct rxrpc_call, recvmsg_link); if (!(flags & MSG_PEEK)) list_del_init(&call->recvmsg_link); else rxrpc_get_call(call, rxrpc_call_get_recvmsg); - write_unlock(&rx->recvmsg_lock); + spin_unlock(&rx->recvmsg_lock); call_debug_id = call->debug_id; trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_dequeue, 0); @@ -431,9 +431,9 @@ error_unlock_call: error_requeue_call: if (!(flags & MSG_PEEK)) { - write_lock(&rx->recvmsg_lock); + spin_lock(&rx->recvmsg_lock); list_add(&call->recvmsg_link, &rx->recvmsg_q); - write_unlock(&rx->recvmsg_lock); + spin_unlock(&rx->recvmsg_lock); trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_requeue, 0); } else { rxrpc_put_call(call, rxrpc_call_put_recvmsg); diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c index ebe0c75e7b07..944320e65ea8 100644 --- a/net/rxrpc/skbuff.c +++ b/net/rxrpc/skbuff.c @@ -63,7 +63,7 @@ void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace why) if (skb) { int n = atomic_dec_return(select_skb_count(skb)); trace_rxrpc_skb(skb, refcount_read(&skb->users), n, why); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_CONSUMED); } } @@ -78,6 +78,6 @@ void rxrpc_purge_queue(struct sk_buff_head *list) int n = atomic_dec_return(select_skb_count(skb)); trace_rxrpc_skb(skb, refcount_read(&skb->users), n, rxrpc_skb_put_purge); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_CONSUMED); } } diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index cde3224a5cd2..ecaeb4ecfb58 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -17,6 +17,9 @@ static const unsigned int n_65535 = 65535; static const unsigned int n_max_acks = 255; static const unsigned long one_jiffy = 1; static const unsigned long max_jiffies = MAX_JIFFY_OFFSET; +#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY +static const unsigned long max_500 = 500; +#endif /* * RxRPC operating parameters. @@ -63,6 +66,19 @@ static struct ctl_table rxrpc_sysctl_table[] = { .extra2 = (void *)&max_jiffies, }, + /* Values used in milliseconds */ +#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY + { + .procname = "inject_rx_delay", + .data = &rxrpc_inject_rx_delay, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = (void *)SYSCTL_LONG_ZERO, + .extra2 = (void *)&max_500, + }, +#endif + /* Non-time values */ { .procname = "reap_client_conns", @@ -109,7 +125,6 @@ static struct ctl_table rxrpc_sysctl_table[] = { .extra1 = (void *)SYSCTL_ONE, .extra2 = (void *)&four, }, - { } }; diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c index d2cf2aac3adb..d43be8512386 100644 --- a/net/rxrpc/txbuf.c +++ b/net/rxrpc/txbuf.c @@ -110,12 +110,8 @@ void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *call) _enter("%x/%x/%x", call->tx_bottom, call->acks_hard_ack, call->tx_top); - for (;;) { - spin_lock(&call->tx_lock); - txb = list_first_entry_or_null(&call->tx_buffer, - struct rxrpc_txbuf, call_link); - if (!txb) - break; + while ((txb = list_first_entry_or_null(&call->tx_buffer, + struct rxrpc_txbuf, call_link))) { hard_ack = smp_load_acquire(&call->acks_hard_ack); if (before(hard_ack, txb->seq)) break; @@ -128,15 +124,11 @@ void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *call) trace_rxrpc_txqueue(call, rxrpc_txqueue_dequeue); - spin_unlock(&call->tx_lock); - rxrpc_put_txbuf(txb, rxrpc_txbuf_put_rotated); if (after(call->acks_hard_ack, call->tx_bottom + 128)) wake = true; } - spin_unlock(&call->tx_lock); - if (wake) wake_up(&call->waitq); } diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 777d6b50505c..f5acb535413d 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -195,8 +195,14 @@ config NET_SCH_ETF To compile this code as a module, choose M here: the module will be called sch_etf. +config NET_SCH_MQPRIO_LIB + tristate + help + Common library for manipulating mqprio queue configurations. + config NET_SCH_TAPRIO tristate "Time Aware Priority (taprio) Scheduler" + select NET_SCH_MQPRIO_LIB help Say Y here if you want to use the Time Aware Priority (taprio) packet scheduling algorithm. @@ -253,6 +259,7 @@ config NET_SCH_DRR config NET_SCH_MQPRIO tristate "Multi-queue priority scheduler (MQPRIO)" + select NET_SCH_MQPRIO_LIB help Say Y here if you want to use the Multi-queue Priority scheduler. This scheduler allows QOS to be offloaded on NICs that have support @@ -337,7 +344,7 @@ config NET_SCH_FQ Say Y here if you want to use the FQ packet scheduling algorithm. FQ does flow separation, and is able to respect pacing requirements - set by TCP stack into sk->sk_pacing_rate (for localy generated + set by TCP stack into sk->sk_pacing_rate (for locally generated traffic) To compile this driver as a module, choose M here: the module diff --git a/net/sched/Makefile b/net/sched/Makefile index dd14ef413fda..7911eec09837 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -52,6 +52,7 @@ obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o obj-$(CONFIG_NET_SCH_ETS) += sch_ets.o obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o +obj-$(CONFIG_NET_SCH_MQPRIO_LIB) += sch_mqprio_lib.o obj-$(CONFIG_NET_SCH_SKBPRIO) += sch_skbprio.o obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 5b3c0ac495be..cd09ef49df22 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -1582,7 +1582,7 @@ errout: static int tca_get_fill(struct sk_buff *skb, struct tc_action *actions[], u32 portid, u32 seq, u16 flags, int event, int bind, - int ref) + int ref, struct netlink_ext_ack *extack) { struct tcamsg *t; struct nlmsghdr *nlh; @@ -1606,7 +1606,12 @@ static int tca_get_fill(struct sk_buff *skb, struct tc_action *actions[], nla_nest_end(skb, nest); + if (extack && extack->_msg && + nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) + goto out_nlmsg_trim; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + return skb->len; out_nlmsg_trim: @@ -1625,7 +1630,7 @@ tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, if (!skb) return -ENOBUFS; if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event, - 0, 1) <= 0) { + 0, 1, NULL) <= 0) { NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action"); kfree_skb(skb); return -EINVAL; @@ -1799,7 +1804,7 @@ tcf_reoffload_del_notify(struct net *net, struct tc_action *action) if (!skb) return -ENOBUFS; - if (tca_get_fill(skb, actions, 0, 0, 0, RTM_DELACTION, 0, 1) <= 0) { + if (tca_get_fill(skb, actions, 0, 0, 0, RTM_DELACTION, 0, 1, NULL) <= 0) { kfree_skb(skb); return -EINVAL; } @@ -1886,7 +1891,7 @@ tcf_del_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], return -ENOBUFS; if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION, - 0, 2) <= 0) { + 0, 2, extack) <= 0) { NL_SET_ERR_MSG(extack, "Failed to fill netlink TC action attributes"); kfree_skb(skb); return -EINVAL; @@ -1965,7 +1970,7 @@ tcf_add_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], return -ENOBUFS; if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, n->nlmsg_flags, - RTM_NEWACTION, 0, 0) <= 0) { + RTM_NEWACTION, 0, 0, extack) <= 0) { NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action"); kfree_skb(skb); return -EINVAL; diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 0ca2bb8ed026..b126f03c1bb6 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -170,11 +170,11 @@ tcf_ct_flow_table_add_action_nat_udp(const struct nf_conntrack_tuple *tuple, static void tcf_ct_flow_table_add_action_meta(struct nf_conn *ct, enum ip_conntrack_dir dir, + enum ip_conntrack_info ctinfo, struct flow_action *action) { struct nf_conn_labels *ct_labels; struct flow_action_entry *entry; - enum ip_conntrack_info ctinfo; u32 *act_ct_labels; entry = tcf_ct_flow_table_flow_action_get_next(action); @@ -182,8 +182,6 @@ static void tcf_ct_flow_table_add_action_meta(struct nf_conn *ct, #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) entry->ct_metadata.mark = READ_ONCE(ct->mark); #endif - ctinfo = dir == IP_CT_DIR_ORIGINAL ? IP_CT_ESTABLISHED : - IP_CT_ESTABLISHED_REPLY; /* aligns with the CT reference on the SKB nf_ct_set */ entry->ct_metadata.cookie = (unsigned long)ct | ctinfo; entry->ct_metadata.orig_dir = dir == IP_CT_DIR_ORIGINAL; @@ -237,22 +235,28 @@ static int tcf_ct_flow_table_add_action_nat(struct net *net, } static int tcf_ct_flow_table_fill_actions(struct net *net, - const struct flow_offload *flow, + struct flow_offload *flow, enum flow_offload_tuple_dir tdir, struct nf_flow_rule *flow_rule) { struct flow_action *action = &flow_rule->rule->action; int num_entries = action->num_entries; struct nf_conn *ct = flow->ct; + enum ip_conntrack_info ctinfo; enum ip_conntrack_dir dir; int i, err; switch (tdir) { case FLOW_OFFLOAD_DIR_ORIGINAL: dir = IP_CT_DIR_ORIGINAL; + ctinfo = test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ? + IP_CT_ESTABLISHED : IP_CT_NEW; + if (ctinfo == IP_CT_ESTABLISHED) + set_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags); break; case FLOW_OFFLOAD_DIR_REPLY: dir = IP_CT_DIR_REPLY; + ctinfo = IP_CT_ESTABLISHED_REPLY; break; default: return -EOPNOTSUPP; @@ -262,7 +266,7 @@ static int tcf_ct_flow_table_fill_actions(struct net *net, if (err) goto err_nat; - tcf_ct_flow_table_add_action_meta(ct, dir, action); + tcf_ct_flow_table_add_action_meta(ct, dir, ctinfo, action); return 0; err_nat: @@ -365,7 +369,7 @@ static void tcf_ct_flow_tc_ifidx(struct flow_offload *entry, static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft, struct nf_conn *ct, - bool tcp) + bool tcp, bool bidirectional) { struct nf_conn_act_ct_ext *act_ct_ext; struct flow_offload *entry; @@ -384,6 +388,8 @@ static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft, ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; } + if (bidirectional) + __set_bit(NF_FLOW_HW_BIDIRECTIONAL, &entry->flags); act_ct_ext = nf_conn_act_ct_ext_find(ct); if (act_ct_ext) { @@ -407,26 +413,34 @@ static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { - bool tcp = false; - - if ((ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY) || - !test_bit(IPS_ASSURED_BIT, &ct->status)) - return; + bool tcp = false, bidirectional = true; switch (nf_ct_protonum(ct)) { case IPPROTO_TCP: - tcp = true; - if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED) + if ((ctinfo != IP_CT_ESTABLISHED && + ctinfo != IP_CT_ESTABLISHED_REPLY) || + !test_bit(IPS_ASSURED_BIT, &ct->status) || + ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED) return; + + tcp = true; break; case IPPROTO_UDP: + if (!nf_ct_is_confirmed(ct)) + return; + if (!test_bit(IPS_ASSURED_BIT, &ct->status)) + bidirectional = false; break; #ifdef CONFIG_NF_CT_PROTO_GRE case IPPROTO_GRE: { struct nf_conntrack_tuple *tuple; - if (ct->status & IPS_NAT_MASK) + if ((ctinfo != IP_CT_ESTABLISHED && + ctinfo != IP_CT_ESTABLISHED_REPLY) || + !test_bit(IPS_ASSURED_BIT, &ct->status) || + ct->status & IPS_NAT_MASK) return; + tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; /* No support for GRE v1 */ if (tuple->src.u.gre.key || tuple->dst.u.gre.key) @@ -442,7 +456,7 @@ static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft, ct->status & IPS_SEQ_ADJUST) return; - tcf_ct_flow_table_add(ct_ft, ct, tcp); + tcf_ct_flow_table_add(ct_ft, ct, tcp, bidirectional); } static bool @@ -621,13 +635,30 @@ static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p, flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); ct = flow->ct; + if (dir == FLOW_OFFLOAD_DIR_REPLY && + !test_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags)) { + /* Only offload reply direction after connection became + * assured. + */ + if (test_bit(IPS_ASSURED_BIT, &ct->status)) + set_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags); + else if (test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags)) + /* If flow_table flow has already been updated to the + * established state, then don't refresh. + */ + return false; + } + if (tcph && (unlikely(tcph->fin || tcph->rst))) { flow_offload_teardown(flow); return false; } - ctinfo = dir == FLOW_OFFLOAD_DIR_ORIGINAL ? IP_CT_ESTABLISHED : - IP_CT_ESTABLISHED_REPLY; + if (dir == FLOW_OFFLOAD_DIR_ORIGINAL) + ctinfo = test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ? + IP_CT_ESTABLISHED : IP_CT_NEW; + else + ctinfo = IP_CT_ESTABLISHED_REPLY; flow_offload_refresh(nf_ft, flow); nf_conntrack_get(&ct->ct_general); @@ -707,7 +738,7 @@ static int tcf_ct_skb_network_trim(struct sk_buff *skb, int family) switch (family) { case NFPROTO_IPV4: - len = ntohs(ip_hdr(skb)->tot_len); + len = skb_ip_totlen(skb); break; case NFPROTO_IPV6: len = sizeof(struct ipv6hdr) diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 7284bcea7b0b..8037ec9b1d31 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -29,8 +29,8 @@ static LIST_HEAD(mirred_list); static DEFINE_SPINLOCK(mirred_list_lock); -#define MIRRED_RECURSION_LIMIT 4 -static DEFINE_PER_CPU(unsigned int, mirred_rec_level); +#define MIRRED_NEST_LIMIT 4 +static DEFINE_PER_CPU(unsigned int, mirred_nest_level); static bool tcf_mirred_is_act_redirect(int action) { @@ -206,12 +206,19 @@ release_idr: return err; } +static bool is_mirred_nested(void) +{ + return unlikely(__this_cpu_read(mirred_nest_level) > 1); +} + static int tcf_mirred_forward(bool want_ingress, struct sk_buff *skb) { int err; if (!want_ingress) err = tcf_dev_queue_xmit(skb, dev_queue_xmit); + else if (is_mirred_nested()) + err = netif_rx(skb); else err = netif_receive_skb(skb); @@ -226,7 +233,7 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, struct sk_buff *skb2 = skb; bool m_mac_header_xmit; struct net_device *dev; - unsigned int rec_level; + unsigned int nest_level; int retval, err = 0; bool use_reinsert; bool want_ingress; @@ -237,11 +244,11 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, int mac_len; bool at_nh; - rec_level = __this_cpu_inc_return(mirred_rec_level); - if (unlikely(rec_level > MIRRED_RECURSION_LIMIT)) { + nest_level = __this_cpu_inc_return(mirred_nest_level); + if (unlikely(nest_level > MIRRED_NEST_LIMIT)) { net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n", netdev_name(skb->dev)); - __this_cpu_dec(mirred_rec_level); + __this_cpu_dec(mirred_nest_level); return TC_ACT_SHOT; } @@ -310,7 +317,7 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, err = tcf_mirred_forward(want_ingress, skb); if (err) tcf_action_inc_overlimit_qstats(&m->common); - __this_cpu_dec(mirred_rec_level); + __this_cpu_dec(mirred_nest_level); return TC_ACT_CONSUMED; } } @@ -322,7 +329,7 @@ out: if (tcf_mirred_is_act_redirect(m_eaction)) retval = TC_ACT_SHOT; } - __this_cpu_dec(mirred_rec_level); + __this_cpu_dec(mirred_nest_level); return retval; } diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index a0378e9f0121..c42fcc47dd6d 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -134,6 +134,17 @@ nla_failure: return -EINVAL; } +static void tcf_pedit_cleanup_rcu(struct rcu_head *head) +{ + struct tcf_pedit_parms *parms = + container_of(head, struct tcf_pedit_parms, rcu); + + kfree(parms->tcfp_keys_ex); + kfree(parms->tcfp_keys); + + kfree(parms); +} + static int tcf_pedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, @@ -141,10 +152,9 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, { struct tc_action_net *tn = net_generic(net, act_pedit_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; - struct nlattr *tb[TCA_PEDIT_MAX + 1]; struct tcf_chain *goto_ch = NULL; - struct tc_pedit_key *keys = NULL; - struct tcf_pedit_key_ex *keys_ex; + struct tcf_pedit_parms *oparms, *nparms; + struct nlattr *tb[TCA_PEDIT_MAX + 1]; struct tc_pedit *parm; struct nlattr *pattr; struct tcf_pedit *p; @@ -181,18 +191,25 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, return -EINVAL; } - keys_ex = tcf_pedit_keys_ex_parse(tb[TCA_PEDIT_KEYS_EX], parm->nkeys); - if (IS_ERR(keys_ex)) - return PTR_ERR(keys_ex); + nparms = kzalloc(sizeof(*nparms), GFP_KERNEL); + if (!nparms) + return -ENOMEM; + + nparms->tcfp_keys_ex = + tcf_pedit_keys_ex_parse(tb[TCA_PEDIT_KEYS_EX], parm->nkeys); + if (IS_ERR(nparms->tcfp_keys_ex)) { + ret = PTR_ERR(nparms->tcfp_keys_ex); + goto out_free; + } index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { - ret = tcf_idr_create(tn, index, est, a, - &act_pedit_ops, bind, false, flags); + ret = tcf_idr_create_from_flags(tn, index, est, a, + &act_pedit_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); - goto out_free; + goto out_free_ex; } ret = ACT_P_CREATED; } else if (err > 0) { @@ -204,7 +221,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, } } else { ret = err; - goto out_free; + goto out_free_ex; } err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); @@ -212,48 +229,50 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, ret = err; goto out_release; } - p = to_pedit(*a); - spin_lock_bh(&p->tcf_lock); - if (ret == ACT_P_CREATED || - (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys)) { - keys = kmalloc(ksize, GFP_ATOMIC); - if (!keys) { - spin_unlock_bh(&p->tcf_lock); - ret = -ENOMEM; - goto put_chain; - } - kfree(p->tcfp_keys); - p->tcfp_keys = keys; - p->tcfp_nkeys = parm->nkeys; + nparms->tcfp_off_max_hint = 0; + nparms->tcfp_flags = parm->flags; + nparms->tcfp_nkeys = parm->nkeys; + + nparms->tcfp_keys = kmalloc(ksize, GFP_KERNEL); + if (!nparms->tcfp_keys) { + ret = -ENOMEM; + goto put_chain; } - memcpy(p->tcfp_keys, parm->keys, ksize); - p->tcfp_off_max_hint = 0; - for (i = 0; i < p->tcfp_nkeys; ++i) { - u32 cur = p->tcfp_keys[i].off; + + memcpy(nparms->tcfp_keys, parm->keys, ksize); + + for (i = 0; i < nparms->tcfp_nkeys; ++i) { + u32 cur = nparms->tcfp_keys[i].off; /* sanitize the shift value for any later use */ - p->tcfp_keys[i].shift = min_t(size_t, BITS_PER_TYPE(int) - 1, - p->tcfp_keys[i].shift); + nparms->tcfp_keys[i].shift = min_t(size_t, + BITS_PER_TYPE(int) - 1, + nparms->tcfp_keys[i].shift); /* The AT option can read a single byte, we can bound the actual * value with uchar max. */ - cur += (0xff & p->tcfp_keys[i].offmask) >> p->tcfp_keys[i].shift; + cur += (0xff & nparms->tcfp_keys[i].offmask) >> nparms->tcfp_keys[i].shift; /* Each key touches 4 bytes starting from the computed offset */ - p->tcfp_off_max_hint = max(p->tcfp_off_max_hint, cur + 4); + nparms->tcfp_off_max_hint = + max(nparms->tcfp_off_max_hint, cur + 4); } - p->tcfp_flags = parm->flags; + p = to_pedit(*a); + + spin_lock_bh(&p->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); + oparms = rcu_replace_pointer(p->parms, nparms, 1); + spin_unlock_bh(&p->tcf_lock); - kfree(p->tcfp_keys_ex); - p->tcfp_keys_ex = keys_ex; + if (oparms) + call_rcu(&oparms->rcu, tcf_pedit_cleanup_rcu); - spin_unlock_bh(&p->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); + return ret; put_chain: @@ -261,19 +280,22 @@ put_chain: tcf_chain_put_by_act(goto_ch); out_release: tcf_idr_release(*a, bind); +out_free_ex: + kfree(nparms->tcfp_keys_ex); out_free: - kfree(keys_ex); + kfree(nparms); return ret; - } static void tcf_pedit_cleanup(struct tc_action *a) { struct tcf_pedit *p = to_pedit(a); - struct tc_pedit_key *keys = p->tcfp_keys; + struct tcf_pedit_parms *parms; + + parms = rcu_dereference_protected(p->parms, 1); - kfree(keys); - kfree(p->tcfp_keys_ex); + if (parms) + call_rcu(&parms->rcu, tcf_pedit_cleanup_rcu); } static bool offset_valid(struct sk_buff *skb, int offset) @@ -324,109 +346,107 @@ TC_INDIRECT_SCOPE int tcf_pedit_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { + enum pedit_header_type htype = TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK; + enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET; struct tcf_pedit *p = to_pedit(a); + struct tcf_pedit_key_ex *tkey_ex; + struct tcf_pedit_parms *parms; + struct tc_pedit_key *tkey; u32 max_offset; int i; - spin_lock(&p->tcf_lock); + parms = rcu_dereference_bh(p->parms); max_offset = (skb_transport_header_was_set(skb) ? skb_transport_offset(skb) : skb_network_offset(skb)) + - p->tcfp_off_max_hint; + parms->tcfp_off_max_hint; if (skb_ensure_writable(skb, min(skb->len, max_offset))) - goto unlock; + goto done; tcf_lastuse_update(&p->tcf_tm); + tcf_action_update_bstats(&p->common, skb); - if (p->tcfp_nkeys > 0) { - struct tc_pedit_key *tkey = p->tcfp_keys; - struct tcf_pedit_key_ex *tkey_ex = p->tcfp_keys_ex; - enum pedit_header_type htype = - TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK; - enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET; - - for (i = p->tcfp_nkeys; i > 0; i--, tkey++) { - u32 *ptr, hdata; - int offset = tkey->off; - int hoffset; - u32 val; - int rc; - - if (tkey_ex) { - htype = tkey_ex->htype; - cmd = tkey_ex->cmd; - - tkey_ex++; - } + tkey = parms->tcfp_keys; + tkey_ex = parms->tcfp_keys_ex; - rc = pedit_skb_hdr_offset(skb, htype, &hoffset); - if (rc) { - pr_info("tc action pedit bad header type specified (0x%x)\n", - htype); - goto bad; - } + for (i = parms->tcfp_nkeys; i > 0; i--, tkey++) { + int offset = tkey->off; + u32 *ptr, hdata; + int hoffset; + u32 val; + int rc; - if (tkey->offmask) { - u8 *d, _d; - - if (!offset_valid(skb, hoffset + tkey->at)) { - pr_info("tc action pedit 'at' offset %d out of bounds\n", - hoffset + tkey->at); - goto bad; - } - d = skb_header_pointer(skb, hoffset + tkey->at, - sizeof(_d), &_d); - if (!d) - goto bad; - offset += (*d & tkey->offmask) >> tkey->shift; - } + if (tkey_ex) { + htype = tkey_ex->htype; + cmd = tkey_ex->cmd; - if (offset % 4) { - pr_info("tc action pedit offset must be on 32 bit boundaries\n"); - goto bad; - } + tkey_ex++; + } - if (!offset_valid(skb, hoffset + offset)) { - pr_info("tc action pedit offset %d out of bounds\n", - hoffset + offset); - goto bad; - } + rc = pedit_skb_hdr_offset(skb, htype, &hoffset); + if (rc) { + pr_info("tc action pedit bad header type specified (0x%x)\n", + htype); + goto bad; + } - ptr = skb_header_pointer(skb, hoffset + offset, - sizeof(hdata), &hdata); - if (!ptr) - goto bad; - /* just do it, baby */ - switch (cmd) { - case TCA_PEDIT_KEY_EX_CMD_SET: - val = tkey->val; - break; - case TCA_PEDIT_KEY_EX_CMD_ADD: - val = (*ptr + tkey->val) & ~tkey->mask; - break; - default: - pr_info("tc action pedit bad command (%d)\n", - cmd); + if (tkey->offmask) { + u8 *d, _d; + + if (!offset_valid(skb, hoffset + tkey->at)) { + pr_info("tc action pedit 'at' offset %d out of bounds\n", + hoffset + tkey->at); goto bad; } + d = skb_header_pointer(skb, hoffset + tkey->at, + sizeof(_d), &_d); + if (!d) + goto bad; + offset += (*d & tkey->offmask) >> tkey->shift; + } - *ptr = ((*ptr & tkey->mask) ^ val); - if (ptr == &hdata) - skb_store_bits(skb, hoffset + offset, ptr, 4); + if (offset % 4) { + pr_info("tc action pedit offset must be on 32 bit boundaries\n"); + goto bad; } - goto done; - } else { - WARN(1, "pedit BUG: index %d\n", p->tcf_index); + if (!offset_valid(skb, hoffset + offset)) { + pr_info("tc action pedit offset %d out of bounds\n", + hoffset + offset); + goto bad; + } + + ptr = skb_header_pointer(skb, hoffset + offset, + sizeof(hdata), &hdata); + if (!ptr) + goto bad; + /* just do it, baby */ + switch (cmd) { + case TCA_PEDIT_KEY_EX_CMD_SET: + val = tkey->val; + break; + case TCA_PEDIT_KEY_EX_CMD_ADD: + val = (*ptr + tkey->val) & ~tkey->mask; + break; + default: + pr_info("tc action pedit bad command (%d)\n", + cmd); + goto bad; + } + + *ptr = ((*ptr & tkey->mask) ^ val); + if (ptr == &hdata) + skb_store_bits(skb, hoffset + offset, ptr, 4); } + goto done; + bad: + spin_lock(&p->tcf_lock); p->tcf_qstats.overlimits++; -done: - bstats_update(&p->tcf_bstats, skb); -unlock: spin_unlock(&p->tcf_lock); +done: return p->tcf_action; } @@ -445,30 +465,33 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, { unsigned char *b = skb_tail_pointer(skb); struct tcf_pedit *p = to_pedit(a); + struct tcf_pedit_parms *parms; struct tc_pedit *opt; struct tcf_t t; int s; - s = struct_size(opt, keys, p->tcfp_nkeys); + spin_lock_bh(&p->tcf_lock); + parms = rcu_dereference_protected(p->parms, 1); + s = struct_size(opt, keys, parms->tcfp_nkeys); - /* netlink spinlocks held above us - must use ATOMIC */ opt = kzalloc(s, GFP_ATOMIC); - if (unlikely(!opt)) + if (unlikely(!opt)) { + spin_unlock_bh(&p->tcf_lock); return -ENOBUFS; + } - spin_lock_bh(&p->tcf_lock); - memcpy(opt->keys, p->tcfp_keys, flex_array_size(opt, keys, p->tcfp_nkeys)); + memcpy(opt->keys, parms->tcfp_keys, + flex_array_size(opt, keys, parms->tcfp_nkeys)); opt->index = p->tcf_index; - opt->nkeys = p->tcfp_nkeys; - opt->flags = p->tcfp_flags; + opt->nkeys = parms->tcfp_nkeys; + opt->flags = parms->tcfp_flags; opt->action = p->tcf_action; opt->refcnt = refcount_read(&p->tcf_refcnt) - ref; opt->bindcnt = atomic_read(&p->tcf_bindcnt) - bind; - if (p->tcfp_keys_ex) { - if (tcf_pedit_key_ex_dump(skb, - p->tcfp_keys_ex, - p->tcfp_nkeys)) + if (parms->tcfp_keys_ex) { + if (tcf_pedit_key_ex_dump(skb, parms->tcfp_keys_ex, + parms->tcfp_nkeys)) goto nla_put_failure; if (nla_put(skb, TCA_PEDIT_PARMS_EX, s, opt)) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 668130f08903..5b4a95e8a1ee 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -488,7 +488,8 @@ static struct tcf_chain *tcf_chain_lookup_rcu(const struct tcf_block *block, #endif static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, - u32 seq, u16 flags, int event, bool unicast); + u32 seq, u16 flags, int event, bool unicast, + struct netlink_ext_ack *extack); static struct tcf_chain *__tcf_chain_get(struct tcf_block *block, u32 chain_index, bool create, @@ -521,7 +522,7 @@ static struct tcf_chain *__tcf_chain_get(struct tcf_block *block, */ if (is_first_reference && !by_act) tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL, - RTM_NEWCHAIN, false); + RTM_NEWCHAIN, false, NULL); return chain; @@ -1817,7 +1818,8 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, struct tcf_proto *tp, struct tcf_block *block, struct Qdisc *q, u32 parent, void *fh, u32 portid, u32 seq, u16 flags, int event, - bool terse_dump, bool rtnl_held) + bool terse_dump, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tcmsg *tcm; struct nlmsghdr *nlh; @@ -1857,7 +1859,13 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, tp->ops->dump(net, tp, fh, skb, tcm, rtnl_held) < 0) goto nla_put_failure; } + + if (extack && extack->_msg && + nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) + goto nla_put_failure; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + return skb->len; out_nlmsg_trim: @@ -1871,7 +1879,7 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct tcf_proto *tp, struct tcf_block *block, struct Qdisc *q, u32 parent, void *fh, int event, bool unicast, - bool rtnl_held) + bool rtnl_held, struct netlink_ext_ack *extack) { struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; @@ -1883,7 +1891,7 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, n->nlmsg_seq, n->nlmsg_flags, event, - false, rtnl_held) <= 0) { + false, rtnl_held, extack) <= 0) { kfree_skb(skb); return -EINVAL; } @@ -1912,7 +1920,7 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER, - false, rtnl_held) <= 0) { + false, rtnl_held, extack) <= 0) { NL_SET_ERR_MSG(extack, "Failed to build del event notification"); kfree_skb(skb); return -EINVAL; @@ -1938,14 +1946,15 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb, struct tcf_block *block, struct Qdisc *q, u32 parent, struct nlmsghdr *n, - struct tcf_chain *chain, int event) + struct tcf_chain *chain, int event, + struct netlink_ext_ack *extack) { struct tcf_proto *tp; for (tp = tcf_get_next_proto(chain, NULL); tp; tp = tcf_get_next_proto(chain, tp)) - tfilter_notify(net, oskb, n, tp, block, - q, parent, NULL, event, false, true); + tfilter_notify(net, oskb, n, tp, block, q, parent, NULL, + event, false, true, extack); } static void tfilter_put(struct tcf_proto *tp, void *fh) @@ -2156,7 +2165,7 @@ replay: flags, extack); if (err == 0) { tfilter_notify(net, skb, n, tp, block, q, parent, fh, - RTM_NEWTFILTER, false, rtnl_held); + RTM_NEWTFILTER, false, rtnl_held, extack); tfilter_put(tp, fh); /* q pointer is NULL for shared blocks */ if (q) @@ -2284,7 +2293,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, if (prio == 0) { tfilter_notify_chain(net, skb, block, q, parent, n, - chain, RTM_DELTFILTER); + chain, RTM_DELTFILTER, extack); tcf_chain_flush(chain, rtnl_held); err = 0; goto errout; @@ -2308,7 +2317,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, tcf_proto_put(tp, rtnl_held, NULL); tfilter_notify(net, skb, n, tp, block, q, parent, fh, - RTM_DELTFILTER, false, rtnl_held); + RTM_DELTFILTER, false, rtnl_held, extack); err = 0; goto errout; } @@ -2452,7 +2461,7 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n, err = -ENOENT; } else { err = tfilter_notify(net, skb, n, tp, block, q, parent, - fh, RTM_NEWTFILTER, true, rtnl_held); + fh, RTM_NEWTFILTER, true, rtnl_held, NULL); if (err < 0) NL_SET_ERR_MSG(extack, "Failed to send filter notify message"); } @@ -2490,7 +2499,7 @@ static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg) return tcf_fill_node(net, a->skb, tp, a->block, a->q, a->parent, n, NETLINK_CB(a->cb->skb).portid, a->cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWTFILTER, a->terse_dump, true); + RTM_NEWTFILTER, a->terse_dump, true, NULL); } static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, @@ -2524,7 +2533,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, if (tcf_fill_node(net, skb, tp, block, q, parent, NULL, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWTFILTER, false, true) <= 0) + RTM_NEWTFILTER, false, true, NULL) <= 0) goto errout; cb->args[1] = 1; } @@ -2667,7 +2676,8 @@ static int tc_chain_fill_node(const struct tcf_proto_ops *tmplt_ops, void *tmplt_priv, u32 chain_index, struct net *net, struct sk_buff *skb, struct tcf_block *block, - u32 portid, u32 seq, u16 flags, int event) + u32 portid, u32 seq, u16 flags, int event, + struct netlink_ext_ack *extack) { unsigned char *b = skb_tail_pointer(skb); const struct tcf_proto_ops *ops; @@ -2704,7 +2714,12 @@ static int tc_chain_fill_node(const struct tcf_proto_ops *tmplt_ops, goto nla_put_failure; } + if (extack && extack->_msg && + nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) + goto out_nlmsg_trim; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + return skb->len; out_nlmsg_trim: @@ -2714,7 +2729,8 @@ nla_put_failure: } static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, - u32 seq, u16 flags, int event, bool unicast) + u32 seq, u16 flags, int event, bool unicast, + struct netlink_ext_ack *extack) { u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; struct tcf_block *block = chain->block; @@ -2728,7 +2744,7 @@ static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, if (tc_chain_fill_node(chain->tmplt_ops, chain->tmplt_priv, chain->index, net, skb, block, portid, - seq, flags, event) <= 0) { + seq, flags, event, extack) <= 0) { kfree_skb(skb); return -EINVAL; } @@ -2756,7 +2772,7 @@ static int tc_chain_notify_delete(const struct tcf_proto_ops *tmplt_ops, return -ENOBUFS; if (tc_chain_fill_node(tmplt_ops, tmplt_priv, chain_index, net, skb, - block, portid, seq, flags, RTM_DELCHAIN) <= 0) { + block, portid, seq, flags, RTM_DELCHAIN, NULL) <= 0) { kfree_skb(skb); return -EINVAL; } @@ -2908,11 +2924,11 @@ replay: } tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL, - RTM_NEWCHAIN, false); + RTM_NEWCHAIN, false, extack); break; case RTM_DELCHAIN: tfilter_notify_chain(net, skb, block, q, parent, n, - chain, RTM_DELTFILTER); + chain, RTM_DELTFILTER, extack); /* Flush the chain first as the user requested chain removal. */ tcf_chain_flush(chain, true); /* In case the chain was successfully deleted, put a reference @@ -2922,7 +2938,7 @@ replay: break; case RTM_GETCHAIN: err = tc_chain_notify(chain, skb, n->nlmsg_seq, - n->nlmsg_flags, n->nlmsg_type, true); + n->nlmsg_flags, n->nlmsg_type, true, extack); if (err < 0) NL_SET_ERR_MSG(extack, "Failed to send chain notify message"); break; @@ -3022,7 +3038,7 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) chain->index, net, skb, block, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWCHAIN); + RTM_NEWCHAIN, NULL); if (err <= 0) break; index++; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 72d2c204d5f3..e9780631b5b5 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -902,7 +902,8 @@ static void qdisc_offload_graft_root(struct net_device *dev, } static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, - u32 portid, u32 seq, u16 flags, int event) + u32 portid, u32 seq, u16 flags, int event, + struct netlink_ext_ack *extack) { struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL; struct gnet_stats_queue __percpu *cpu_qstats = NULL; @@ -970,7 +971,12 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, if (gnet_stats_finish_copy(&d) < 0) goto nla_put_failure; + if (extack && extack->_msg && + nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) + goto out_nlmsg_trim; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + return skb->len; out_nlmsg_trim: @@ -991,7 +997,8 @@ static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible) static int qdisc_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, u32 clid, - struct Qdisc *old, struct Qdisc *new) + struct Qdisc *old, struct Qdisc *new, + struct netlink_ext_ack *extack) { struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; @@ -1002,12 +1009,12 @@ static int qdisc_notify(struct net *net, struct sk_buff *oskb, if (old && !tc_qdisc_dump_ignore(old, false)) { if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq, - 0, RTM_DELQDISC) < 0) + 0, RTM_DELQDISC, extack) < 0) goto err_out; } if (new && !tc_qdisc_dump_ignore(new, false)) { if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq, - old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0) + old ? NLM_F_REPLACE : 0, RTM_NEWQDISC, extack) < 0) goto err_out; } @@ -1022,10 +1029,11 @@ err_out: static void notify_and_destroy(struct net *net, struct sk_buff *skb, struct nlmsghdr *n, u32 clid, - struct Qdisc *old, struct Qdisc *new) + struct Qdisc *old, struct Qdisc *new, + struct netlink_ext_ack *extack) { if (new || old) - qdisc_notify(net, skb, n, clid, old, new); + qdisc_notify(net, skb, n, clid, old, new, extack); if (old) qdisc_put(old); @@ -1105,12 +1113,12 @@ skip: qdisc_refcount_inc(new); rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc); - notify_and_destroy(net, skb, n, classid, old, new); + notify_and_destroy(net, skb, n, classid, old, new, extack); if (new && new->ops->attach) new->ops->attach(new); } else { - notify_and_destroy(net, skb, n, classid, old, new); + notify_and_destroy(net, skb, n, classid, old, new, extack); } if (dev->flags & IFF_UP) @@ -1141,7 +1149,7 @@ skip: err = cops->graft(parent, cl, new, &old, extack); if (err) return err; - notify_and_destroy(net, skb, n, classid, old, new); + notify_and_destroy(net, skb, n, classid, old, new, extack); } return 0; } @@ -1274,12 +1282,6 @@ static struct Qdisc *qdisc_create(struct net_device *dev, if (err) goto err_out3; - if (ops->init) { - err = ops->init(sch, tca[TCA_OPTIONS], extack); - if (err != 0) - goto err_out5; - } - if (tca[TCA_STAB]) { stab = qdisc_get_stab(tca[TCA_STAB], extack); if (IS_ERR(stab)) { @@ -1288,11 +1290,18 @@ static struct Qdisc *qdisc_create(struct net_device *dev, } rcu_assign_pointer(sch->stab, stab); } + + if (ops->init) { + err = ops->init(sch, tca[TCA_OPTIONS], extack); + if (err != 0) + goto err_out5; + } + if (tca[TCA_RATE]) { err = -EOPNOTSUPP; if (sch->flags & TCQ_F_MQROOT) { NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc"); - goto err_out4; + goto err_out5; } err = gen_new_estimator(&sch->bstats, @@ -1303,7 +1312,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev, tca[TCA_RATE]); if (err) { NL_SET_ERR_MSG(extack, "Failed to generate new estimator"); - goto err_out4; + goto err_out5; } } @@ -1313,6 +1322,8 @@ static struct Qdisc *qdisc_create(struct net_device *dev, return sch; err_out5: + qdisc_put_stab(rtnl_dereference(sch->stab)); +err_out4: /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */ if (ops->destroy) ops->destroy(sch); @@ -1324,16 +1335,6 @@ err_out2: err_out: *errp = err; return NULL; - -err_out4: - /* - * Any broken qdiscs that would require a ops->reset() here? - * The qdisc was never in action so it shouldn't be necessary. - */ - qdisc_put_stab(rtnl_dereference(sch->stab)); - if (ops->destroy) - ops->destroy(sch); - goto err_out3; } static int qdisc_change(struct Qdisc *sch, struct nlattr **tca, @@ -1509,7 +1510,7 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, if (err != 0) return err; } else { - qdisc_notify(net, skb, n, clid, NULL, q); + qdisc_notify(net, skb, n, clid, NULL, q, NULL); } return 0; } @@ -1648,7 +1649,7 @@ replay: } err = qdisc_change(q, tca, extack); if (err == 0) - qdisc_notify(net, skb, n, clid, NULL, q); + qdisc_notify(net, skb, n, clid, NULL, q, extack); return err; create_n_graft: @@ -1715,7 +1716,7 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, if (!tc_qdisc_dump_ignore(q, dump_invisible) && tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWQDISC) <= 0) + RTM_NEWQDISC, NULL) <= 0) goto done; q_idx++; } @@ -1737,7 +1738,7 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, if (!tc_qdisc_dump_ignore(q, dump_invisible) && tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWQDISC) <= 0) + RTM_NEWQDISC, NULL) <= 0) goto done; q_idx++; } @@ -1810,8 +1811,8 @@ done: ************************************************/ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, - unsigned long cl, - u32 portid, u32 seq, u16 flags, int event) + unsigned long cl, u32 portid, u32 seq, u16 flags, + int event, struct netlink_ext_ack *extack) { struct tcmsg *tcm; struct nlmsghdr *nlh; @@ -1846,7 +1847,12 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, if (gnet_stats_finish_copy(&d) < 0) goto nla_put_failure; + if (extack && extack->_msg && + nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) + goto out_nlmsg_trim; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + return skb->len; out_nlmsg_trim: @@ -1857,7 +1863,7 @@ nla_put_failure: static int tclass_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct Qdisc *q, - unsigned long cl, int event) + unsigned long cl, int event, struct netlink_ext_ack *extack) { struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; @@ -1866,7 +1872,7 @@ static int tclass_notify(struct net *net, struct sk_buff *oskb, if (!skb) return -ENOBUFS; - if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) { + if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event, extack) < 0) { kfree_skb(skb); return -EINVAL; } @@ -1893,7 +1899,7 @@ static int tclass_del_notify(struct net *net, return -ENOBUFS; if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, - RTM_DELTCLASS) < 0) { + RTM_DELTCLASS, extack) < 0) { kfree_skb(skb); return -EINVAL; } @@ -2100,7 +2106,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, tc_bind_tclass(q, portid, clid, 0); goto out; case RTM_GETTCLASS: - err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS); + err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS, extack); goto out; default: err = -EINVAL; @@ -2118,7 +2124,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, if (cops->change) err = cops->change(q, clid, portid, tca, &new_cl, extack); if (err == 0) { - tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS); + tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS, extack); /* We just create a new class, need to do reverse binding. */ if (cl != new_cl) tc_bind_tclass(q, portid, clid, new_cl); @@ -2140,7 +2146,7 @@ static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid, a->cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWTCLASS); + RTM_NEWTCLASS, NULL); } static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb, diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 3ed0c3342189..7970217b565a 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1209,7 +1209,7 @@ static struct sk_buff *cake_ack_filter(struct cake_sched_data *q, iph_check->daddr != iph->daddr) continue; - seglen = ntohs(iph_check->tot_len) - + seglen = iph_totlen(skb, iph_check) - (4 * iph_check->ihl); } else if (iph_check->version == 6) { ipv6h = (struct ipv6hdr *)iph; diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 4c68abaa289b..48ed87b91086 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -17,6 +17,8 @@ #include <net/sch_generic.h> #include <net/pkt_cls.h> +#include "sch_mqprio_lib.h" + struct mqprio_sched { struct Qdisc **qdiscs; u16 mode; @@ -27,6 +29,62 @@ struct mqprio_sched { u64 max_rate[TC_QOPT_MAX_QUEUE]; }; +static int mqprio_enable_offload(struct Qdisc *sch, + const struct tc_mqprio_qopt *qopt, + struct netlink_ext_ack *extack) +{ + struct tc_mqprio_qopt_offload mqprio = {.qopt = *qopt}; + struct mqprio_sched *priv = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + int err, i; + + switch (priv->mode) { + case TC_MQPRIO_MODE_DCB: + if (priv->shaper != TC_MQPRIO_SHAPER_DCB) + return -EINVAL; + break; + case TC_MQPRIO_MODE_CHANNEL: + mqprio.flags = priv->flags; + if (priv->flags & TC_MQPRIO_F_MODE) + mqprio.mode = priv->mode; + if (priv->flags & TC_MQPRIO_F_SHAPER) + mqprio.shaper = priv->shaper; + if (priv->flags & TC_MQPRIO_F_MIN_RATE) + for (i = 0; i < mqprio.qopt.num_tc; i++) + mqprio.min_rate[i] = priv->min_rate[i]; + if (priv->flags & TC_MQPRIO_F_MAX_RATE) + for (i = 0; i < mqprio.qopt.num_tc; i++) + mqprio.max_rate[i] = priv->max_rate[i]; + break; + default: + return -EINVAL; + } + + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQPRIO, + &mqprio); + if (err) + return err; + + priv->hw_offload = mqprio.qopt.hw; + + return 0; +} + +static void mqprio_disable_offload(struct Qdisc *sch) +{ + struct tc_mqprio_qopt_offload mqprio = { { 0 } }; + struct mqprio_sched *priv = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + + switch (priv->mode) { + case TC_MQPRIO_MODE_DCB: + case TC_MQPRIO_MODE_CHANNEL: + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQPRIO, + &mqprio); + break; + } +} + static void mqprio_destroy(struct Qdisc *sch) { struct net_device *dev = qdisc_dev(sch); @@ -41,37 +99,17 @@ static void mqprio_destroy(struct Qdisc *sch) kfree(priv->qdiscs); } - if (priv->hw_offload && dev->netdev_ops->ndo_setup_tc) { - struct tc_mqprio_qopt_offload mqprio = { { 0 } }; - - switch (priv->mode) { - case TC_MQPRIO_MODE_DCB: - case TC_MQPRIO_MODE_CHANNEL: - dev->netdev_ops->ndo_setup_tc(dev, - TC_SETUP_QDISC_MQPRIO, - &mqprio); - break; - default: - return; - } - } else { + if (priv->hw_offload && dev->netdev_ops->ndo_setup_tc) + mqprio_disable_offload(sch); + else netdev_set_num_tc(dev, 0); - } } -static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt) +static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt, + const struct tc_mqprio_caps *caps, + struct netlink_ext_ack *extack) { - int i, j; - - /* Verify num_tc is not out of max range */ - if (qopt->num_tc > TC_MAX_QUEUE) - return -EINVAL; - - /* Verify priority mapping uses valid tcs */ - for (i = 0; i < TC_BITMASK + 1; i++) { - if (qopt->prio_tc_map[i] >= qopt->num_tc) - return -EINVAL; - } + int err; /* Limit qopt->hw to maximum supported offload value. Drivers have * the option of overriding this later if they don't support the a @@ -80,31 +118,23 @@ static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt) if (qopt->hw > TC_MQPRIO_HW_OFFLOAD_MAX) qopt->hw = TC_MQPRIO_HW_OFFLOAD_MAX; - /* If hardware offload is requested we will leave it to the device - * to either populate the queue counts itself or to validate the - * provided queue counts. If ndo_setup_tc is not present then - * hardware doesn't support offload and we should return an error. + /* If hardware offload is requested, we will leave 3 options to the + * device driver: + * - populate the queue counts itself (and ignore what was requested) + * - validate the provided queue counts by itself (and apply them) + * - request queue count validation here (and apply them) */ - if (qopt->hw) - return dev->netdev_ops->ndo_setup_tc ? 0 : -EINVAL; - - for (i = 0; i < qopt->num_tc; i++) { - unsigned int last = qopt->offset[i] + qopt->count[i]; - - /* Verify the queue count is in tx range being equal to the - * real_num_tx_queues indicates the last queue is in use. - */ - if (qopt->offset[i] >= dev->real_num_tx_queues || - !qopt->count[i] || - last > dev->real_num_tx_queues) - return -EINVAL; - - /* Verify that the offset and counts do not overlap */ - for (j = i + 1; j < qopt->num_tc; j++) { - if (last > qopt->offset[j]) - return -EINVAL; - } - } + err = mqprio_validate_qopt(dev, qopt, + !qopt->hw || caps->validate_queue_counts, + false, extack); + if (err) + return err; + + /* If ndo_setup_tc is not present then hardware doesn't support offload + * and we should return an error. + */ + if (qopt->hw && !dev->netdev_ops->ndo_setup_tc) + return -EINVAL; return 0; } @@ -130,6 +160,67 @@ static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, return 0; } +static int mqprio_parse_nlattr(struct Qdisc *sch, struct tc_mqprio_qopt *qopt, + struct nlattr *opt) +{ + struct mqprio_sched *priv = qdisc_priv(sch); + struct nlattr *tb[TCA_MQPRIO_MAX + 1]; + struct nlattr *attr; + int i, rem, err; + + err = parse_attr(tb, TCA_MQPRIO_MAX, opt, mqprio_policy, + sizeof(*qopt)); + if (err < 0) + return err; + + if (!qopt->hw) + return -EINVAL; + + if (tb[TCA_MQPRIO_MODE]) { + priv->flags |= TC_MQPRIO_F_MODE; + priv->mode = *(u16 *)nla_data(tb[TCA_MQPRIO_MODE]); + } + + if (tb[TCA_MQPRIO_SHAPER]) { + priv->flags |= TC_MQPRIO_F_SHAPER; + priv->shaper = *(u16 *)nla_data(tb[TCA_MQPRIO_SHAPER]); + } + + if (tb[TCA_MQPRIO_MIN_RATE64]) { + if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE) + return -EINVAL; + i = 0; + nla_for_each_nested(attr, tb[TCA_MQPRIO_MIN_RATE64], + rem) { + if (nla_type(attr) != TCA_MQPRIO_MIN_RATE64) + return -EINVAL; + if (i >= qopt->num_tc) + break; + priv->min_rate[i] = *(u64 *)nla_data(attr); + i++; + } + priv->flags |= TC_MQPRIO_F_MIN_RATE; + } + + if (tb[TCA_MQPRIO_MAX_RATE64]) { + if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE) + return -EINVAL; + i = 0; + nla_for_each_nested(attr, tb[TCA_MQPRIO_MAX_RATE64], + rem) { + if (nla_type(attr) != TCA_MQPRIO_MAX_RATE64) + return -EINVAL; + if (i >= qopt->num_tc) + break; + priv->max_rate[i] = *(u64 *)nla_data(attr); + i++; + } + priv->flags |= TC_MQPRIO_F_MAX_RATE; + } + + return 0; +} + static int mqprio_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { @@ -139,9 +230,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt, struct Qdisc *qdisc; int i, err = -EOPNOTSUPP; struct tc_mqprio_qopt *qopt = NULL; - struct nlattr *tb[TCA_MQPRIO_MAX + 1]; - struct nlattr *attr; - int rem; + struct tc_mqprio_caps caps; int len; BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE); @@ -160,61 +249,18 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt, if (!opt || nla_len(opt) < sizeof(*qopt)) return -EINVAL; + qdisc_offload_query_caps(dev, TC_SETUP_QDISC_MQPRIO, + &caps, sizeof(caps)); + qopt = nla_data(opt); - if (mqprio_parse_opt(dev, qopt)) + if (mqprio_parse_opt(dev, qopt, &caps, extack)) return -EINVAL; len = nla_len(opt) - NLA_ALIGN(sizeof(*qopt)); if (len > 0) { - err = parse_attr(tb, TCA_MQPRIO_MAX, opt, mqprio_policy, - sizeof(*qopt)); - if (err < 0) + err = mqprio_parse_nlattr(sch, qopt, opt); + if (err) return err; - - if (!qopt->hw) - return -EINVAL; - - if (tb[TCA_MQPRIO_MODE]) { - priv->flags |= TC_MQPRIO_F_MODE; - priv->mode = *(u16 *)nla_data(tb[TCA_MQPRIO_MODE]); - } - - if (tb[TCA_MQPRIO_SHAPER]) { - priv->flags |= TC_MQPRIO_F_SHAPER; - priv->shaper = *(u16 *)nla_data(tb[TCA_MQPRIO_SHAPER]); - } - - if (tb[TCA_MQPRIO_MIN_RATE64]) { - if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE) - return -EINVAL; - i = 0; - nla_for_each_nested(attr, tb[TCA_MQPRIO_MIN_RATE64], - rem) { - if (nla_type(attr) != TCA_MQPRIO_MIN_RATE64) - return -EINVAL; - if (i >= qopt->num_tc) - break; - priv->min_rate[i] = *(u64 *)nla_data(attr); - i++; - } - priv->flags |= TC_MQPRIO_F_MIN_RATE; - } - - if (tb[TCA_MQPRIO_MAX_RATE64]) { - if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE) - return -EINVAL; - i = 0; - nla_for_each_nested(attr, tb[TCA_MQPRIO_MAX_RATE64], - rem) { - if (nla_type(attr) != TCA_MQPRIO_MAX_RATE64) - return -EINVAL; - if (i >= qopt->num_tc) - break; - priv->max_rate[i] = *(u64 *)nla_data(attr); - i++; - } - priv->flags |= TC_MQPRIO_F_MAX_RATE; - } } /* pre-allocate qdisc, attachment can't fail */ @@ -241,36 +287,9 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt, * supplied and verified mapping */ if (qopt->hw) { - struct tc_mqprio_qopt_offload mqprio = {.qopt = *qopt}; - - switch (priv->mode) { - case TC_MQPRIO_MODE_DCB: - if (priv->shaper != TC_MQPRIO_SHAPER_DCB) - return -EINVAL; - break; - case TC_MQPRIO_MODE_CHANNEL: - mqprio.flags = priv->flags; - if (priv->flags & TC_MQPRIO_F_MODE) - mqprio.mode = priv->mode; - if (priv->flags & TC_MQPRIO_F_SHAPER) - mqprio.shaper = priv->shaper; - if (priv->flags & TC_MQPRIO_F_MIN_RATE) - for (i = 0; i < mqprio.qopt.num_tc; i++) - mqprio.min_rate[i] = priv->min_rate[i]; - if (priv->flags & TC_MQPRIO_F_MAX_RATE) - for (i = 0; i < mqprio.qopt.num_tc; i++) - mqprio.max_rate[i] = priv->max_rate[i]; - break; - default: - return -EINVAL; - } - err = dev->netdev_ops->ndo_setup_tc(dev, - TC_SETUP_QDISC_MQPRIO, - &mqprio); + err = mqprio_enable_offload(sch, qopt, extack); if (err) return err; - - priv->hw_offload = mqprio.qopt.hw; } else { netdev_set_num_tc(dev, qopt->num_tc); for (i = 0; i < qopt->num_tc; i++) @@ -387,7 +406,7 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) struct nlattr *nla = (struct nlattr *)skb_tail_pointer(skb); struct tc_mqprio_qopt opt = { 0 }; struct Qdisc *qdisc; - unsigned int ntx, tc; + unsigned int ntx; sch->q.qlen = 0; gnet_stats_basic_sync_init(&sch->bstats); @@ -411,15 +430,9 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) spin_unlock_bh(qdisc_lock(qdisc)); } - opt.num_tc = netdev_get_num_tc(dev); - memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map)); + mqprio_qopt_reconstruct(dev, &opt); opt.hw = priv->hw_offload; - for (tc = 0; tc < netdev_get_num_tc(dev); tc++) { - opt.count[tc] = dev->tc_to_txq[tc].count; - opt.offset[tc] = dev->tc_to_txq[tc].offset; - } - if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) goto nla_put_failure; diff --git a/net/sched/sch_mqprio_lib.c b/net/sched/sch_mqprio_lib.c new file mode 100644 index 000000000000..c58a533b8ec5 --- /dev/null +++ b/net/sched/sch_mqprio_lib.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/net.h> +#include <linux/netdevice.h> +#include <linux/netlink.h> +#include <linux/types.h> +#include <net/pkt_sched.h> + +#include "sch_mqprio_lib.h" + +/* Returns true if the intervals [a, b) and [c, d) overlap. */ +static bool intervals_overlap(int a, int b, int c, int d) +{ + int left = max(a, c), right = min(b, d); + + return left < right; +} + +static int mqprio_validate_queue_counts(struct net_device *dev, + const struct tc_mqprio_qopt *qopt, + bool allow_overlapping_txqs, + struct netlink_ext_ack *extack) +{ + int i, j; + + for (i = 0; i < qopt->num_tc; i++) { + unsigned int last = qopt->offset[i] + qopt->count[i]; + + if (!qopt->count[i]) { + NL_SET_ERR_MSG_FMT_MOD(extack, "No queues for TC %d", + i); + return -EINVAL; + } + + /* Verify the queue count is in tx range being equal to the + * real_num_tx_queues indicates the last queue is in use. + */ + if (qopt->offset[i] >= dev->real_num_tx_queues || + last > dev->real_num_tx_queues) { + NL_SET_ERR_MSG_FMT_MOD(extack, + "Queues %d:%d for TC %d exceed the %d TX queues available", + qopt->count[i], qopt->offset[i], + i, dev->real_num_tx_queues); + return -EINVAL; + } + + if (allow_overlapping_txqs) + continue; + + /* Verify that the offset and counts do not overlap */ + for (j = i + 1; j < qopt->num_tc; j++) { + if (intervals_overlap(qopt->offset[i], last, + qopt->offset[j], + qopt->offset[j] + + qopt->count[j])) { + NL_SET_ERR_MSG_FMT_MOD(extack, + "TC %d queues %d@%d overlap with TC %d queues %d@%d", + i, qopt->count[i], qopt->offset[i], + j, qopt->count[j], qopt->offset[j]); + return -EINVAL; + } + } + } + + return 0; +} + +int mqprio_validate_qopt(struct net_device *dev, struct tc_mqprio_qopt *qopt, + bool validate_queue_counts, + bool allow_overlapping_txqs, + struct netlink_ext_ack *extack) +{ + int i, err; + + /* Verify num_tc is not out of max range */ + if (qopt->num_tc > TC_MAX_QUEUE) { + NL_SET_ERR_MSG(extack, + "Number of traffic classes is outside valid range"); + return -EINVAL; + } + + /* Verify priority mapping uses valid tcs */ + for (i = 0; i <= TC_BITMASK; i++) { + if (qopt->prio_tc_map[i] >= qopt->num_tc) { + NL_SET_ERR_MSG(extack, + "Invalid traffic class in priority to traffic class mapping"); + return -EINVAL; + } + } + + if (validate_queue_counts) { + err = mqprio_validate_queue_counts(dev, qopt, + allow_overlapping_txqs, + extack); + if (err) + return err; + } + + return 0; +} +EXPORT_SYMBOL_GPL(mqprio_validate_qopt); + +void mqprio_qopt_reconstruct(struct net_device *dev, struct tc_mqprio_qopt *qopt) +{ + int tc, num_tc = netdev_get_num_tc(dev); + + qopt->num_tc = num_tc; + memcpy(qopt->prio_tc_map, dev->prio_tc_map, sizeof(qopt->prio_tc_map)); + + for (tc = 0; tc < num_tc; tc++) { + qopt->count[tc] = dev->tc_to_txq[tc].count; + qopt->offset[tc] = dev->tc_to_txq[tc].offset; + } +} +EXPORT_SYMBOL_GPL(mqprio_qopt_reconstruct); + +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_mqprio_lib.h b/net/sched/sch_mqprio_lib.h new file mode 100644 index 000000000000..63f725ab8761 --- /dev/null +++ b/net/sched/sch_mqprio_lib.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __SCH_MQPRIO_LIB_H +#define __SCH_MQPRIO_LIB_H + +#include <linux/types.h> + +struct net_device; +struct netlink_ext_ack; +struct tc_mqprio_qopt; + +int mqprio_validate_qopt(struct net_device *dev, struct tc_mqprio_qopt *qopt, + bool validate_queue_counts, + bool allow_overlapping_txqs, + struct netlink_ext_ack *extack); +void mqprio_qopt_reconstruct(struct net_device *dev, + struct tc_mqprio_qopt *qopt); + +#endif diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index c322a61eaeea..9781b47962bb 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -26,7 +26,11 @@ #include <net/sock.h> #include <net/tcp.h> +#include "sch_mqprio_lib.h" + static LIST_HEAD(taprio_list); +static struct static_key_false taprio_have_broken_mqprio; +static struct static_key_false taprio_have_working_mqprio; #define TAPRIO_ALL_GATES_OPEN -1 @@ -35,15 +39,19 @@ static LIST_HEAD(taprio_list); #define TAPRIO_FLAGS_INVALID U32_MAX struct sched_entry { - struct list_head list; - - /* The instant that this entry "closes" and the next one - * should open, the qdisc will make some effort so that no - * packet leaves after this time. + /* Durations between this GCL entry and the GCL entry where the + * respective traffic class gate closes */ - ktime_t close_time; + u64 gate_duration[TC_MAX_QUEUE]; + atomic_t budget[TC_MAX_QUEUE]; + /* The qdisc makes some effort so that no packet leaves + * after this time + */ + ktime_t gate_close_time[TC_MAX_QUEUE]; + struct list_head list; + /* Used to calculate when to advance the schedule */ + ktime_t end_time; ktime_t next_txtime; - atomic_t budget; int index; u32 gate_mask; u32 interval; @@ -51,10 +59,16 @@ struct sched_entry { }; struct sched_gate_list { + /* Longest non-zero contiguous gate durations per traffic class, + * or 0 if a traffic class gate never opens during the schedule. + */ + u64 max_open_gate_duration[TC_MAX_QUEUE]; + u32 max_frm_len[TC_MAX_QUEUE]; /* for the fast path */ + u32 max_sdu[TC_MAX_QUEUE]; /* for dump */ struct rcu_head rcu; struct list_head entries; size_t num_entries; - ktime_t cycle_close_time; + ktime_t cycle_end_time; s64 cycle_time; s64 cycle_time_extension; s64 base_time; @@ -67,6 +81,8 @@ struct taprio_sched { enum tk_offsets tk_offset; int clockid; bool offloaded; + bool detected_mqprio; + bool broken_mqprio; atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+ * speeds it's sub-nanoseconds per byte */ @@ -78,8 +94,8 @@ struct taprio_sched { struct sched_gate_list __rcu *admin_sched; struct hrtimer advance_timer; struct list_head taprio_list; - u32 max_frm_len[TC_MAX_QUEUE]; /* for the fast path */ - u32 max_sdu[TC_MAX_QUEUE]; /* for dump and offloading */ + int cur_txq[TC_MAX_QUEUE]; + u32 max_sdu[TC_MAX_QUEUE]; /* save info from the user */ u32 txtime_delay; }; @@ -88,6 +104,57 @@ struct __tc_taprio_qopt_offload { struct tc_taprio_qopt_offload offload; }; +static void taprio_calculate_gate_durations(struct taprio_sched *q, + struct sched_gate_list *sched) +{ + struct net_device *dev = qdisc_dev(q->root); + int num_tc = netdev_get_num_tc(dev); + struct sched_entry *entry, *cur; + int tc; + + list_for_each_entry(entry, &sched->entries, list) { + u32 gates_still_open = entry->gate_mask; + + /* For each traffic class, calculate each open gate duration, + * starting at this schedule entry and ending at the schedule + * entry containing a gate close event for that TC. + */ + cur = entry; + + do { + if (!gates_still_open) + break; + + for (tc = 0; tc < num_tc; tc++) { + if (!(gates_still_open & BIT(tc))) + continue; + + if (cur->gate_mask & BIT(tc)) + entry->gate_duration[tc] += cur->interval; + else + gates_still_open &= ~BIT(tc); + } + + cur = list_next_entry_circular(cur, &sched->entries, list); + } while (cur != entry); + + /* Keep track of the maximum gate duration for each traffic + * class, taking care to not confuse a traffic class which is + * temporarily closed with one that is always closed. + */ + for (tc = 0; tc < num_tc; tc++) + if (entry->gate_duration[tc] && + sched->max_open_gate_duration[tc] < entry->gate_duration[tc]) + sched->max_open_gate_duration[tc] = entry->gate_duration[tc]; + } +} + +static bool taprio_entry_allows_tx(ktime_t skb_end_time, + struct sched_entry *entry, int tc) +{ + return ktime_before(skb_end_time, entry->gate_close_time[tc]); +} + static ktime_t sched_base_time(const struct sched_gate_list *sched) { if (!sched) @@ -180,6 +247,55 @@ static int length_to_duration(struct taprio_sched *q, int len) return div_u64(len * atomic64_read(&q->picos_per_byte), PSEC_PER_NSEC); } +static int duration_to_length(struct taprio_sched *q, u64 duration) +{ + return div_u64(duration * PSEC_PER_NSEC, atomic64_read(&q->picos_per_byte)); +} + +/* Sets sched->max_sdu[] and sched->max_frm_len[] to the minimum between the + * q->max_sdu[] requested by the user and the max_sdu dynamically determined by + * the maximum open gate durations at the given link speed. + */ +static void taprio_update_queue_max_sdu(struct taprio_sched *q, + struct sched_gate_list *sched, + struct qdisc_size_table *stab) +{ + struct net_device *dev = qdisc_dev(q->root); + int num_tc = netdev_get_num_tc(dev); + u32 max_sdu_from_user; + u32 max_sdu_dynamic; + u32 max_sdu; + int tc; + + for (tc = 0; tc < num_tc; tc++) { + max_sdu_from_user = q->max_sdu[tc] ?: U32_MAX; + + /* TC gate never closes => keep the queueMaxSDU + * selected by the user + */ + if (sched->max_open_gate_duration[tc] == sched->cycle_time) { + max_sdu_dynamic = U32_MAX; + } else { + u32 max_frm_len; + + max_frm_len = duration_to_length(q, sched->max_open_gate_duration[tc]); + if (stab) + max_frm_len -= stab->szopts.overhead; + max_sdu_dynamic = max_frm_len - dev->hard_header_len; + } + + max_sdu = min(max_sdu_dynamic, max_sdu_from_user); + + if (max_sdu != U32_MAX) { + sched->max_frm_len[tc] = max_sdu + dev->hard_header_len; + sched->max_sdu[tc] = max_sdu; + } else { + sched->max_frm_len[tc] = U32_MAX; /* never oversized */ + sched->max_sdu[tc] = 0; + } + } +} + /* Returns the entry corresponding to next available interval. If * validate_interval is set, it only validates whether the timestamp occurs * when the gate corresponding to the skb's traffic class is open. @@ -413,14 +529,33 @@ done: return txtime; } -static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch, - struct Qdisc *child, struct sk_buff **to_free) +/* Devices with full offload are expected to honor this in hardware */ +static bool taprio_skb_exceeds_queue_max_sdu(struct Qdisc *sch, + struct sk_buff *skb) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); + struct sched_gate_list *sched; int prio = skb->priority; + bool exceeds = false; u8 tc; + tc = netdev_get_prio_tc_map(dev, prio); + + rcu_read_lock(); + sched = rcu_dereference(q->oper_sched); + if (sched && skb->len > sched->max_frm_len[tc]) + exceeds = true; + rcu_read_unlock(); + + return exceeds; +} + +static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch, + struct Qdisc *child, struct sk_buff **to_free) +{ + struct taprio_sched *q = qdisc_priv(sch); + /* sk_flags are only safe to use on full sockets. */ if (skb->sk && sk_fullsock(skb->sk) && sock_flag(skb->sk, SOCK_TXTIME)) { if (!is_valid_interval(skb, sch)) @@ -431,17 +566,53 @@ static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch, return qdisc_drop(skb, sch, to_free); } - /* Devices with full offload are expected to honor this in hardware */ - tc = netdev_get_prio_tc_map(dev, prio); - if (skb->len > q->max_frm_len[tc]) - return qdisc_drop(skb, sch, to_free); - qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; return qdisc_enqueue(skb, child, to_free); } +static int taprio_enqueue_segmented(struct sk_buff *skb, struct Qdisc *sch, + struct Qdisc *child, + struct sk_buff **to_free) +{ + unsigned int slen = 0, numsegs = 0, len = qdisc_pkt_len(skb); + netdev_features_t features = netif_skb_features(skb); + struct sk_buff *segs, *nskb; + int ret; + + segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + if (IS_ERR_OR_NULL(segs)) + return qdisc_drop(skb, sch, to_free); + + skb_list_walk_safe(segs, segs, nskb) { + skb_mark_not_on_list(segs); + qdisc_skb_cb(segs)->pkt_len = segs->len; + slen += segs->len; + + /* FIXME: we should be segmenting to a smaller size + * rather than dropping these + */ + if (taprio_skb_exceeds_queue_max_sdu(sch, segs)) + ret = qdisc_drop(segs, sch, to_free); + else + ret = taprio_enqueue_one(segs, sch, child, to_free); + + if (ret != NET_XMIT_SUCCESS) { + if (net_xmit_drop_count(ret)) + qdisc_qstats_drop(sch); + } else { + numsegs++; + } + } + + if (numsegs > 1) + qdisc_tree_reduce_backlog(sch, 1 - numsegs, len - slen); + consume_skb(skb); + + return numsegs > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP; +} + /* Will not be called in the full offload case, since the TX queues are * attached to the Qdisc created using qdisc_create_dflt() */ @@ -458,97 +629,190 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (unlikely(!child)) return qdisc_drop(skb, sch, to_free); - /* Large packets might not be transmitted when the transmission duration - * exceeds any configured interval. Therefore, segment the skb into - * smaller chunks. Drivers with full offload are expected to handle - * this in hardware. - */ - if (skb_is_gso(skb)) { - unsigned int slen = 0, numsegs = 0, len = qdisc_pkt_len(skb); - netdev_features_t features = netif_skb_features(skb); - struct sk_buff *segs, *nskb; - int ret; - - segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); - if (IS_ERR_OR_NULL(segs)) - return qdisc_drop(skb, sch, to_free); + if (taprio_skb_exceeds_queue_max_sdu(sch, skb)) { + /* Large packets might not be transmitted when the transmission + * duration exceeds any configured interval. Therefore, segment + * the skb into smaller chunks. Drivers with full offload are + * expected to handle this in hardware. + */ + if (skb_is_gso(skb)) + return taprio_enqueue_segmented(skb, sch, child, + to_free); - skb_list_walk_safe(segs, segs, nskb) { - skb_mark_not_on_list(segs); - qdisc_skb_cb(segs)->pkt_len = segs->len; - slen += segs->len; + return qdisc_drop(skb, sch, to_free); + } - ret = taprio_enqueue_one(segs, sch, child, to_free); - if (ret != NET_XMIT_SUCCESS) { - if (net_xmit_drop_count(ret)) - qdisc_qstats_drop(sch); - } else { - numsegs++; - } - } + return taprio_enqueue_one(skb, sch, child, to_free); +} + +static struct sk_buff *taprio_peek(struct Qdisc *sch) +{ + WARN_ONCE(1, "taprio only supports operating as root qdisc, peek() not implemented"); + return NULL; +} + +static void taprio_set_budgets(struct taprio_sched *q, + struct sched_gate_list *sched, + struct sched_entry *entry) +{ + struct net_device *dev = qdisc_dev(q->root); + int num_tc = netdev_get_num_tc(dev); + int tc, budget; - if (numsegs > 1) - qdisc_tree_reduce_backlog(sch, 1 - numsegs, len - slen); - consume_skb(skb); + for (tc = 0; tc < num_tc; tc++) { + /* Traffic classes which never close have infinite budget */ + if (entry->gate_duration[tc] == sched->cycle_time) + budget = INT_MAX; + else + budget = div64_u64((u64)entry->gate_duration[tc] * PSEC_PER_NSEC, + atomic64_read(&q->picos_per_byte)); - return numsegs > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP; + atomic_set(&entry->budget[tc], budget); } +} - return taprio_enqueue_one(skb, sch, child, to_free); +/* When an skb is sent, it consumes from the budget of all traffic classes */ +static int taprio_update_budgets(struct sched_entry *entry, size_t len, + int tc_consumed, int num_tc) +{ + int tc, budget, new_budget = 0; + + for (tc = 0; tc < num_tc; tc++) { + budget = atomic_read(&entry->budget[tc]); + /* Don't consume from infinite budget */ + if (budget == INT_MAX) { + if (tc == tc_consumed) + new_budget = budget; + continue; + } + + if (tc == tc_consumed) + new_budget = atomic_sub_return(len, &entry->budget[tc]); + else + atomic_sub(len, &entry->budget[tc]); + } + + return new_budget; } -/* Will not be called in the full offload case, since the TX queues are - * attached to the Qdisc created using qdisc_create_dflt() - */ -static struct sk_buff *taprio_peek(struct Qdisc *sch) +static struct sk_buff *taprio_dequeue_from_txq(struct Qdisc *sch, int txq, + struct sched_entry *entry, + u32 gate_mask) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); - struct sched_entry *entry; + struct Qdisc *child = q->qdiscs[txq]; + int num_tc = netdev_get_num_tc(dev); struct sk_buff *skb; - u32 gate_mask; - int i; + ktime_t guard; + int prio; + int len; + u8 tc; - rcu_read_lock(); - entry = rcu_dereference(q->current_entry); - gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN; - rcu_read_unlock(); + if (unlikely(!child)) + return NULL; - if (!gate_mask) + if (TXTIME_ASSIST_IS_ENABLED(q->flags)) + goto skip_peek_checks; + + skb = child->ops->peek(child); + if (!skb) return NULL; - for (i = 0; i < dev->num_tx_queues; i++) { - struct Qdisc *child = q->qdiscs[i]; - int prio; - u8 tc; + prio = skb->priority; + tc = netdev_get_prio_tc_map(dev, prio); - if (unlikely(!child)) - continue; + if (!(gate_mask & BIT(tc))) + return NULL; - skb = child->ops->peek(child); - if (!skb) - continue; + len = qdisc_pkt_len(skb); + guard = ktime_add_ns(taprio_get_time(q), length_to_duration(q, len)); - if (TXTIME_ASSIST_IS_ENABLED(q->flags)) - return skb; + /* In the case that there's no gate entry, there's no + * guard band ... + */ + if (gate_mask != TAPRIO_ALL_GATES_OPEN && + !taprio_entry_allows_tx(guard, entry, tc)) + return NULL; + + /* ... and no budget. */ + if (gate_mask != TAPRIO_ALL_GATES_OPEN && + taprio_update_budgets(entry, len, tc, num_tc) < 0) + return NULL; + +skip_peek_checks: + skb = child->ops->dequeue(child); + if (unlikely(!skb)) + return NULL; + + qdisc_bstats_update(sch, skb); + qdisc_qstats_backlog_dec(sch, skb); + sch->q.qlen--; + + return skb; +} + +static void taprio_next_tc_txq(struct net_device *dev, int tc, int *txq) +{ + int offset = dev->tc_to_txq[tc].offset; + int count = dev->tc_to_txq[tc].count; + + (*txq)++; + if (*txq == offset + count) + *txq = offset; +} - prio = skb->priority; - tc = netdev_get_prio_tc_map(dev, prio); +/* Prioritize higher traffic classes, and select among TXQs belonging to the + * same TC using round robin + */ +static struct sk_buff *taprio_dequeue_tc_priority(struct Qdisc *sch, + struct sched_entry *entry, + u32 gate_mask) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + int num_tc = netdev_get_num_tc(dev); + struct sk_buff *skb; + int tc; + + for (tc = num_tc - 1; tc >= 0; tc--) { + int first_txq = q->cur_txq[tc]; if (!(gate_mask & BIT(tc))) continue; - return skb; + do { + skb = taprio_dequeue_from_txq(sch, q->cur_txq[tc], + entry, gate_mask); + + taprio_next_tc_txq(dev, tc, &q->cur_txq[tc]); + + if (skb) + return skb; + } while (q->cur_txq[tc] != first_txq); } return NULL; } -static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry) +/* Broken way of prioritizing smaller TXQ indices and ignoring the traffic + * class other than to determine whether the gate is open or not + */ +static struct sk_buff *taprio_dequeue_txq_priority(struct Qdisc *sch, + struct sched_entry *entry, + u32 gate_mask) { - atomic_set(&entry->budget, - div64_u64((u64)entry->interval * PSEC_PER_NSEC, - atomic64_read(&q->picos_per_byte))); + struct net_device *dev = qdisc_dev(sch); + struct sk_buff *skb; + int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + skb = taprio_dequeue_from_txq(sch, i, entry, gate_mask); + if (skb) + return skb; + } + + return NULL; } /* Will not be called in the full offload case, since the TX queues are @@ -557,11 +821,9 @@ static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry) static struct sk_buff *taprio_dequeue(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); - struct net_device *dev = qdisc_dev(sch); struct sk_buff *skb = NULL; struct sched_entry *entry; u32 gate_mask; - int i; rcu_read_lock(); entry = rcu_dereference(q->current_entry); @@ -571,69 +833,23 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch) * "AdminGateStates" */ gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN; - if (!gate_mask) goto done; - for (i = 0; i < dev->num_tx_queues; i++) { - struct Qdisc *child = q->qdiscs[i]; - ktime_t guard; - int prio; - int len; - u8 tc; - - if (unlikely(!child)) - continue; - - if (TXTIME_ASSIST_IS_ENABLED(q->flags)) { - skb = child->ops->dequeue(child); - if (!skb) - continue; - goto skb_found; - } - - skb = child->ops->peek(child); - if (!skb) - continue; - - prio = skb->priority; - tc = netdev_get_prio_tc_map(dev, prio); - - if (!(gate_mask & BIT(tc))) { - skb = NULL; - continue; - } - - len = qdisc_pkt_len(skb); - guard = ktime_add_ns(taprio_get_time(q), - length_to_duration(q, len)); - - /* In the case that there's no gate entry, there's no - * guard band ... - */ - if (gate_mask != TAPRIO_ALL_GATES_OPEN && - ktime_after(guard, entry->close_time)) { - skb = NULL; - continue; - } - - /* ... and no budget. */ - if (gate_mask != TAPRIO_ALL_GATES_OPEN && - atomic_sub_return(len, &entry->budget) < 0) { - skb = NULL; - continue; - } - - skb = child->ops->dequeue(child); - if (unlikely(!skb)) - goto done; - -skb_found: - qdisc_bstats_update(sch, skb); - qdisc_qstats_backlog_dec(sch, skb); - sch->q.qlen--; - - goto done; + if (static_branch_unlikely(&taprio_have_broken_mqprio) && + !static_branch_likely(&taprio_have_working_mqprio)) { + /* Single NIC kind which is broken */ + skb = taprio_dequeue_txq_priority(sch, entry, gate_mask); + } else if (static_branch_likely(&taprio_have_working_mqprio) && + !static_branch_unlikely(&taprio_have_broken_mqprio)) { + /* Single NIC kind which prioritizes properly */ + skb = taprio_dequeue_tc_priority(sch, entry, gate_mask); + } else { + /* Mixed NIC kinds present in system, need dynamic testing */ + if (q->broken_mqprio) + skb = taprio_dequeue_txq_priority(sch, entry, gate_mask); + else + skb = taprio_dequeue_tc_priority(sch, entry, gate_mask); } done: @@ -648,7 +864,7 @@ static bool should_restart_cycle(const struct sched_gate_list *oper, if (list_is_last(&entry->list, &oper->entries)) return true; - if (ktime_compare(entry->close_time, oper->cycle_close_time) == 0) + if (ktime_compare(entry->end_time, oper->cycle_end_time) == 0) return true; return false; @@ -656,7 +872,7 @@ static bool should_restart_cycle(const struct sched_gate_list *oper, static bool should_change_schedules(const struct sched_gate_list *admin, const struct sched_gate_list *oper, - ktime_t close_time) + ktime_t end_time) { ktime_t next_base_time, extension_time; @@ -665,18 +881,18 @@ static bool should_change_schedules(const struct sched_gate_list *admin, next_base_time = sched_base_time(admin); - /* This is the simple case, the close_time would fall after + /* This is the simple case, the end_time would fall after * the next schedule base_time. */ - if (ktime_compare(next_base_time, close_time) <= 0) + if (ktime_compare(next_base_time, end_time) <= 0) return true; - /* This is the cycle_time_extension case, if the close_time + /* This is the cycle_time_extension case, if the end_time * plus the amount that can be extended would fall after the * next schedule base_time, we can extend the current schedule * for that amount. */ - extension_time = ktime_add_ns(close_time, oper->cycle_time_extension); + extension_time = ktime_add_ns(end_time, oper->cycle_time_extension); /* FIXME: the IEEE 802.1Q-2018 Specification isn't clear about * how precisely the extension should be made. So after @@ -692,10 +908,13 @@ static enum hrtimer_restart advance_sched(struct hrtimer *timer) { struct taprio_sched *q = container_of(timer, struct taprio_sched, advance_timer); + struct net_device *dev = qdisc_dev(q->root); struct sched_gate_list *oper, *admin; + int num_tc = netdev_get_num_tc(dev); struct sched_entry *entry, *next; struct Qdisc *sch = q->root; - ktime_t close_time; + ktime_t end_time; + int tc; spin_lock(&q->current_entry_lock); entry = rcu_dereference_protected(q->current_entry, @@ -714,41 +933,49 @@ static enum hrtimer_restart advance_sched(struct hrtimer *timer) * entry of all schedules are pre-calculated during the * schedule initialization. */ - if (unlikely(!entry || entry->close_time == oper->base_time)) { + if (unlikely(!entry || entry->end_time == oper->base_time)) { next = list_first_entry(&oper->entries, struct sched_entry, list); - close_time = next->close_time; + end_time = next->end_time; goto first_run; } if (should_restart_cycle(oper, entry)) { next = list_first_entry(&oper->entries, struct sched_entry, list); - oper->cycle_close_time = ktime_add_ns(oper->cycle_close_time, - oper->cycle_time); + oper->cycle_end_time = ktime_add_ns(oper->cycle_end_time, + oper->cycle_time); } else { next = list_next_entry(entry, list); } - close_time = ktime_add_ns(entry->close_time, next->interval); - close_time = min_t(ktime_t, close_time, oper->cycle_close_time); + end_time = ktime_add_ns(entry->end_time, next->interval); + end_time = min_t(ktime_t, end_time, oper->cycle_end_time); + + for (tc = 0; tc < num_tc; tc++) { + if (next->gate_duration[tc] == oper->cycle_time) + next->gate_close_time[tc] = KTIME_MAX; + else + next->gate_close_time[tc] = ktime_add_ns(entry->end_time, + next->gate_duration[tc]); + } - if (should_change_schedules(admin, oper, close_time)) { + if (should_change_schedules(admin, oper, end_time)) { /* Set things so the next time this runs, the new * schedule runs. */ - close_time = sched_base_time(admin); + end_time = sched_base_time(admin); switch_schedules(q, &admin, &oper); } - next->close_time = close_time; - taprio_set_budget(q, next); + next->end_time = end_time; + taprio_set_budgets(q, oper, next); first_run: rcu_assign_pointer(q->current_entry, next); spin_unlock(&q->current_entry_lock); - hrtimer_set_expires(&q->advance_timer, close_time); + hrtimer_set_expires(&q->advance_timer, end_time); rcu_read_lock(); __netif_schedule(sch); @@ -916,6 +1143,8 @@ static int parse_taprio_schedule(struct taprio_sched *q, struct nlattr **tb, new->cycle_time = cycle; } + taprio_calculate_gate_durations(q, new); + return 0; } @@ -924,7 +1153,7 @@ static int taprio_parse_mqprio_opt(struct net_device *dev, struct netlink_ext_ack *extack, u32 taprio_flags) { - int i, j; + bool allow_overlapping_txqs = TXTIME_ASSIST_IS_ENABLED(taprio_flags); if (!qopt && !dev->num_tc) { NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary"); @@ -937,52 +1166,17 @@ static int taprio_parse_mqprio_opt(struct net_device *dev, if (dev->num_tc) return 0; - /* Verify num_tc is not out of max range */ - if (qopt->num_tc > TC_MAX_QUEUE) { - NL_SET_ERR_MSG(extack, "Number of traffic classes is outside valid range"); - return -EINVAL; - } - /* taprio imposes that traffic classes map 1:n to tx queues */ if (qopt->num_tc > dev->num_tx_queues) { NL_SET_ERR_MSG(extack, "Number of traffic classes is greater than number of HW queues"); return -EINVAL; } - /* Verify priority mapping uses valid tcs */ - for (i = 0; i <= TC_BITMASK; i++) { - if (qopt->prio_tc_map[i] >= qopt->num_tc) { - NL_SET_ERR_MSG(extack, "Invalid traffic class in priority to traffic class mapping"); - return -EINVAL; - } - } - - for (i = 0; i < qopt->num_tc; i++) { - unsigned int last = qopt->offset[i] + qopt->count[i]; - - /* Verify the queue count is in tx range being equal to the - * real_num_tx_queues indicates the last queue is in use. - */ - if (qopt->offset[i] >= dev->num_tx_queues || - !qopt->count[i] || - last > dev->real_num_tx_queues) { - NL_SET_ERR_MSG(extack, "Invalid queue in traffic class to queue mapping"); - return -EINVAL; - } - - if (TXTIME_ASSIST_IS_ENABLED(taprio_flags)) - continue; - - /* Verify that the offset and counts do not overlap */ - for (j = i + 1; j < qopt->num_tc; j++) { - if (last > qopt->offset[j]) { - NL_SET_ERR_MSG(extack, "Detected overlap in the traffic class to queue mapping"); - return -EINVAL; - } - } - } - - return 0; + /* For some reason, in txtime-assist mode, we allow TXQ ranges for + * different TCs to overlap, and just validate the TXQ ranges. + */ + return mqprio_validate_qopt(dev, qopt, true, allow_overlapping_txqs, + extack); } static int taprio_get_start_time(struct Qdisc *sch, @@ -1019,11 +1213,14 @@ static int taprio_get_start_time(struct Qdisc *sch, return 0; } -static void setup_first_close_time(struct taprio_sched *q, - struct sched_gate_list *sched, ktime_t base) +static void setup_first_end_time(struct taprio_sched *q, + struct sched_gate_list *sched, ktime_t base) { + struct net_device *dev = qdisc_dev(q->root); + int num_tc = netdev_get_num_tc(dev); struct sched_entry *first; ktime_t cycle; + int tc; first = list_first_entry(&sched->entries, struct sched_entry, list); @@ -1031,10 +1228,18 @@ static void setup_first_close_time(struct taprio_sched *q, cycle = sched->cycle_time; /* FIXME: find a better place to do this */ - sched->cycle_close_time = ktime_add_ns(base, cycle); + sched->cycle_end_time = ktime_add_ns(base, cycle); + + first->end_time = ktime_add_ns(base, first->interval); + taprio_set_budgets(q, sched, first); + + for (tc = 0; tc < num_tc; tc++) { + if (first->gate_duration[tc] == sched->cycle_time) + first->gate_close_time[tc] = KTIME_MAX; + else + first->gate_close_time[tc] = ktime_add_ns(base, first->gate_duration[tc]); + } - first->close_time = ktime_add_ns(base, first->interval); - taprio_set_budget(q, first); rcu_assign_pointer(q->current_entry, NULL); } @@ -1088,6 +1293,8 @@ static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct sched_gate_list *oper, *admin; + struct qdisc_size_table *stab; struct taprio_sched *q; ASSERT_RTNL(); @@ -1100,6 +1307,17 @@ static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event, continue; taprio_set_picos_per_byte(dev, q); + + stab = rtnl_dereference(q->root->stab); + + oper = rtnl_dereference(q->oper_sched); + if (oper) + taprio_update_queue_max_sdu(q, oper, stab); + + admin = rtnl_dereference(q->admin_sched); + if (admin) + taprio_update_queue_max_sdu(q, admin, stab); + break; } @@ -1203,7 +1421,8 @@ static u32 tc_map_to_queue_mask(struct net_device *dev, u32 tc_mask) static void taprio_sched_to_offload(struct net_device *dev, struct sched_gate_list *sched, - struct tc_taprio_qopt_offload *offload) + struct tc_taprio_qopt_offload *offload, + const struct tc_taprio_caps *caps) { struct sched_entry *entry; int i = 0; @@ -1217,7 +1436,11 @@ static void taprio_sched_to_offload(struct net_device *dev, e->command = entry->command; e->interval = entry->interval; - e->gate_mask = tc_map_to_queue_mask(dev, entry->gate_mask); + if (caps->gate_mask_per_txq) + e->gate_mask = tc_map_to_queue_mask(dev, + entry->gate_mask); + else + e->gate_mask = entry->gate_mask; i++; } @@ -1225,6 +1448,34 @@ static void taprio_sched_to_offload(struct net_device *dev, offload->num_entries = i; } +static void taprio_detect_broken_mqprio(struct taprio_sched *q) +{ + struct net_device *dev = qdisc_dev(q->root); + struct tc_taprio_caps caps; + + qdisc_offload_query_caps(dev, TC_SETUP_QDISC_TAPRIO, + &caps, sizeof(caps)); + + q->broken_mqprio = caps.broken_mqprio; + if (q->broken_mqprio) + static_branch_inc(&taprio_have_broken_mqprio); + else + static_branch_inc(&taprio_have_working_mqprio); + + q->detected_mqprio = true; +} + +static void taprio_cleanup_broken_mqprio(struct taprio_sched *q) +{ + if (!q->detected_mqprio) + return; + + if (q->broken_mqprio) + static_branch_dec(&taprio_have_broken_mqprio); + else + static_branch_dec(&taprio_have_working_mqprio); +} + static int taprio_enable_offload(struct net_device *dev, struct taprio_sched *q, struct sched_gate_list *sched, @@ -1261,7 +1512,8 @@ static int taprio_enable_offload(struct net_device *dev, return -ENOMEM; } offload->enable = 1; - taprio_sched_to_offload(dev, sched, offload); + mqprio_qopt_reconstruct(dev, &offload->mqprio.qopt); + taprio_sched_to_offload(dev, sched, offload, &caps); for (tc = 0; tc < TC_MAX_QUEUE; tc++) offload->max_sdu[tc] = q->max_sdu[tc]; @@ -1452,7 +1704,6 @@ static int taprio_parse_tc_entries(struct Qdisc *sch, struct netlink_ext_ack *extack) { struct taprio_sched *q = qdisc_priv(sch); - struct net_device *dev = qdisc_dev(sch); u32 max_sdu[TC_QOPT_MAX_QUEUE]; unsigned long seen_tcs = 0; struct nlattr *n; @@ -1466,18 +1717,14 @@ static int taprio_parse_tc_entries(struct Qdisc *sch, if (nla_type(n) != TCA_TAPRIO_ATTR_TC_ENTRY) continue; - err = taprio_parse_tc_entry(sch, n, max_sdu, &seen_tcs, extack); + err = taprio_parse_tc_entry(sch, n, max_sdu, &seen_tcs, + extack); if (err) goto out; } - for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) { + for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) q->max_sdu[tc] = max_sdu[tc]; - if (max_sdu[tc]) - q->max_frm_len[tc] = max_sdu[tc] + dev->hard_header_len; - else - q->max_frm_len[tc] = U32_MAX; /* never oversized */ - } out: return err; @@ -1533,6 +1780,7 @@ static int taprio_new_flags(const struct nlattr *attr, u32 old, static int taprio_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + struct qdisc_size_table *stab = rtnl_dereference(sch->stab); struct nlattr *tb[TCA_TAPRIO_ATTR_MAX + 1] = { }; struct sched_gate_list *oper, *admin, *new_admin; struct taprio_sched *q = qdisc_priv(sch); @@ -1600,15 +1848,18 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, goto free_sched; taprio_set_picos_per_byte(dev, q); + taprio_update_queue_max_sdu(q, new_admin, stab); if (mqprio) { err = netdev_set_num_tc(dev, mqprio->num_tc); if (err) goto free_sched; - for (i = 0; i < mqprio->num_tc; i++) + for (i = 0; i < mqprio->num_tc; i++) { netdev_set_tc_queue(dev, i, mqprio->count[i], mqprio->offset[i]); + q->cur_txq[i] = mqprio->offset[i]; + } /* Always use supplied priority mappings */ for (i = 0; i <= TC_BITMASK; i++) @@ -1663,7 +1914,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, if (admin) call_rcu(&admin->rcu, taprio_free_sched_cb); } else { - setup_first_close_time(q, new_admin, start); + setup_first_end_time(q, new_admin, start); /* Protects against advance_sched() */ spin_lock_irqsave(&q->current_entry_lock, flags); @@ -1683,6 +1934,10 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, new_admin = NULL; err = 0; + if (!stab) + NL_SET_ERR_MSG_MOD(extack, + "Size table not specified, frame length estimations may be inaccurate"); + unlock: spin_unlock_bh(qdisc_lock(sch)); @@ -1743,6 +1998,8 @@ static void taprio_destroy(struct Qdisc *sch) if (admin) call_rcu(&admin->rcu, taprio_free_sched_cb); + + taprio_cleanup_broken_mqprio(q); } static int taprio_init(struct Qdisc *sch, struct nlattr *opt, @@ -1807,6 +2064,8 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt, q->qdiscs[i] = qdisc; } + taprio_detect_broken_mqprio(q); + return taprio_change(sch, opt, extack); } @@ -1947,7 +2206,8 @@ error_nest: return -1; } -static int taprio_dump_tc_entries(struct taprio_sched *q, struct sk_buff *skb) +static int taprio_dump_tc_entries(struct sk_buff *skb, + struct sched_gate_list *sched) { struct nlattr *n; int tc; @@ -1961,7 +2221,7 @@ static int taprio_dump_tc_entries(struct taprio_sched *q, struct sk_buff *skb) goto nla_put_failure; if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_MAX_SDU, - q->max_sdu[tc])) + sched->max_sdu[tc])) goto nla_put_failure; nla_nest_end(skb, n); @@ -1981,18 +2241,11 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) struct sched_gate_list *oper, *admin; struct tc_mqprio_qopt opt = { 0 }; struct nlattr *nest, *sched_nest; - unsigned int i; oper = rtnl_dereference(q->oper_sched); admin = rtnl_dereference(q->admin_sched); - opt.num_tc = netdev_get_num_tc(dev); - memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map)); - - for (i = 0; i < netdev_get_num_tc(dev); i++) { - opt.count[i] = dev->tc_to_txq[i].count; - opt.offset[i] = dev->tc_to_txq[i].offset; - } + mqprio_qopt_reconstruct(dev, &opt); nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (!nest) @@ -2012,7 +2265,7 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_u32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay)) goto options_error; - if (taprio_dump_tc_entries(q, skb)) + if (oper && taprio_dump_tc_entries(skb, oper)) goto options_error; if (oper && dump_schedule(skb, oper)) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 84021a6c4f9d..b91616f819de 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -59,6 +59,7 @@ #include <net/ipv6.h> #include <net/inet_common.h> #include <net/busy_poll.h> +#include <trace/events/sock.h> #include <linux/socket.h> /* for sa_family_t */ #include <linux/export.h> @@ -8321,7 +8322,7 @@ static int sctp_get_port_local(struct sock *sk, union sctp_addr *addr) int low, high, remaining, index; unsigned int rover; - inet_get_local_port_range(net, &low, &high); + inet_sk_get_local_port_range(sk, &low, &high); remaining = (high - low) + 1; rover = get_random_u32_below(remaining) + low; @@ -9244,6 +9245,8 @@ void sctp_data_ready(struct sock *sk) { struct socket_wq *wq; + trace_sk_data_ready(sk); + rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index e12d4fa5aece..b163266e581a 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -27,6 +27,7 @@ #include <linux/if_vlan.h> #include <linux/rcupdate_wait.h> #include <linux/ctype.h> +#include <linux/splice.h> #include <net/sock.h> #include <net/tcp.h> @@ -501,7 +502,7 @@ static int smcr_lgr_reg_sndbufs(struct smc_link *link, return -EINVAL; /* protect against parallel smcr_link_reg_buf() */ - mutex_lock(&lgr->llc_conf_mutex); + down_write(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_active(&lgr->lnk[i])) continue; @@ -509,7 +510,7 @@ static int smcr_lgr_reg_sndbufs(struct smc_link *link, if (rc) break; } - mutex_unlock(&lgr->llc_conf_mutex); + up_write(&lgr->llc_conf_mutex); return rc; } @@ -518,15 +519,30 @@ static int smcr_lgr_reg_rmbs(struct smc_link *link, struct smc_buf_desc *rmb_desc) { struct smc_link_group *lgr = link->lgr; + bool do_slow = false; int i, rc = 0; rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); if (rc) return rc; + + down_read(&lgr->llc_conf_mutex); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_active(&lgr->lnk[i])) + continue; + if (!rmb_desc->is_reg_mr[link->link_idx]) { + up_read(&lgr->llc_conf_mutex); + goto slow_path; + } + } + /* mr register already */ + goto fast_path; +slow_path: + do_slow = true; /* protect against parallel smc_llc_cli_rkey_exchange() and * parallel smcr_link_reg_buf() */ - mutex_lock(&lgr->llc_conf_mutex); + down_write(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_active(&lgr->lnk[i])) continue; @@ -534,7 +550,7 @@ static int smcr_lgr_reg_rmbs(struct smc_link *link, if (rc) goto out; } - +fast_path: /* exchange confirm_rkey msg with peer */ rc = smc_llc_do_confirm_rkey(link, rmb_desc); if (rc) { @@ -543,7 +559,7 @@ static int smcr_lgr_reg_rmbs(struct smc_link *link, } rmb_desc->is_conf_rkey = true; out: - mutex_unlock(&lgr->llc_conf_mutex); + do_slow ? up_write(&lgr->llc_conf_mutex) : up_read(&lgr->llc_conf_mutex); smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); return rc; } @@ -3382,12 +3398,14 @@ static int __init smc_init(void) if (rc) goto out_pernet_subsys; - smc_ism_init(); + rc = smc_ism_init(); + if (rc) + goto out_pernet_subsys_stat; smc_clc_init(); rc = smc_nl_init(); if (rc) - goto out_pernet_subsys_stat; + goto out_ism; rc = smc_pnet_init(); if (rc) @@ -3480,6 +3498,8 @@ out_pnet: smc_pnet_exit(); out_nl: smc_nl_exit(); +out_ism: + smc_ism_exit(); out_pernet_subsys_stat: unregister_pernet_subsys(&smc_net_stat_ops); out_pernet_subsys: @@ -3495,6 +3515,7 @@ static void __exit smc_exit(void) sock_unregister(PF_SMC); smc_core_exit(); smc_ib_unregister_client(); + smc_ism_exit(); destroy_workqueue(smc_close_wq); destroy_workqueue(smc_tcp_ls_wq); destroy_workqueue(smc_hs_wq); diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index dfb9797f7bc6..b9b8b07aa702 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -813,6 +813,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) struct smc_clc_v2_extension *v2_ext; struct smc_clc_msg_smcd *pclc_smcd; struct smc_clc_msg_trail *trl; + struct smcd_dev *smcd; int len, i, plen, rc; int reason_code = 0; struct kvec vec[8]; @@ -868,7 +869,9 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) if (smcd_indicated(ini->smc_type_v1)) { /* add SMC-D specifics */ if (ini->ism_dev[0]) { - pclc_smcd->ism.gid = htonll(ini->ism_dev[0]->local_gid); + smcd = ini->ism_dev[0]; + pclc_smcd->ism.gid = + htonll(smcd->ops->get_local_gid(smcd)); pclc_smcd->ism.chid = htons(smc_ism_get_chid(ini->ism_dev[0])); } @@ -914,8 +917,9 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) plen += sizeof(*smcd_v2_ext); if (ini->ism_offered_cnt) { for (i = 1; i <= ini->ism_offered_cnt; i++) { + smcd = ini->ism_dev[i]; gidchids[i - 1].gid = - htonll(ini->ism_dev[i]->local_gid); + htonll(smcd->ops->get_local_gid(smcd)); gidchids[i - 1].chid = htons(smc_ism_get_chid(ini->ism_dev[i])); } @@ -1000,7 +1004,8 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, memcpy(clc->hdr.eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER)); clc->hdr.typev1 = SMC_TYPE_D; - clc->d0.gid = conn->lgr->smcd->local_gid; + clc->d0.gid = + conn->lgr->smcd->ops->get_local_gid(conn->lgr->smcd); clc->d0.token = conn->rmb_desc->token; clc->d0.dmbe_size = conn->rmbe_size_short; clc->d0.dmbe_idx = 0; diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index c305d8dd23f8..b330a1fa453e 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -500,6 +500,7 @@ static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr, struct netlink_callback *cb) { char smc_pnet[SMC_MAX_PNETID_LEN + 1]; + struct smcd_dev *smcd = lgr->smcd; struct nlattr *attrs; void *nlh; @@ -515,8 +516,9 @@ static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr, if (nla_put_u32(skb, SMC_NLA_LGR_D_ID, *((u32 *)&lgr->id))) goto errattr; - if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_GID, lgr->smcd->local_gid, - SMC_NLA_LGR_D_PAD)) + if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_GID, + smcd->ops->get_local_gid(smcd), + SMC_NLA_LGR_D_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_PEER_GID, lgr->peer_gid, SMC_NLA_LGR_D_PAD)) @@ -820,6 +822,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) { struct smc_link_group *lgr; struct list_head *lgr_list; + struct smcd_dev *smcd; struct smc_link *lnk; spinlock_t *lgr_lock; u8 link_idx; @@ -851,8 +854,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->freeing = 0; lgr->vlan_id = ini->vlan_id; refcount_set(&lgr->refcnt, 1); /* set lgr refcnt to 1 */ - mutex_init(&lgr->sndbufs_lock); - mutex_init(&lgr->rmbs_lock); + init_rwsem(&lgr->sndbufs_lock); + init_rwsem(&lgr->rmbs_lock); rwlock_init(&lgr->conns_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { INIT_LIST_HEAD(&lgr->sndbufs[i]); @@ -866,7 +869,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->conns_all = RB_ROOT; if (ini->is_smcd) { /* SMC-D specific settings */ - get_device(&ini->ism_dev[ini->ism_selected]->dev); + smcd = ini->ism_dev[ini->ism_selected]; + get_device(smcd->ops->get_dev(smcd)); lgr->peer_gid = ini->ism_peer_gid[ini->ism_selected]; lgr->smcd = ini->ism_dev[ini->ism_selected]; lgr_list = &ini->ism_dev[ini->ism_selected]->lgr_list; @@ -1094,7 +1098,7 @@ err_out: static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb, struct smc_link_group *lgr) { - struct mutex *lock; /* lock buffer list */ + struct rw_semaphore *lock; /* lock buffer list */ int rc; if (is_rmb && buf_desc->is_conf_rkey && !list_empty(&lgr->list)) { @@ -1102,10 +1106,10 @@ static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb, rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); if (!rc) { /* protect against smc_llc_cli_rkey_exchange() */ - mutex_lock(&lgr->llc_conf_mutex); + down_read(&lgr->llc_conf_mutex); smc_llc_do_delete_rkey(lgr, buf_desc); buf_desc->is_conf_rkey = false; - mutex_unlock(&lgr->llc_conf_mutex); + up_read(&lgr->llc_conf_mutex); smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); } } @@ -1114,9 +1118,9 @@ static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb, /* buf registration failed, reuse not possible */ lock = is_rmb ? &lgr->rmbs_lock : &lgr->sndbufs_lock; - mutex_lock(lock); + down_write(lock); list_del(&buf_desc->list); - mutex_unlock(lock); + up_write(lock); smc_buf_free(lgr, is_rmb, buf_desc); } else { @@ -1220,15 +1224,16 @@ static void smcr_buf_unmap_lgr(struct smc_link *lnk) int i; for (i = 0; i < SMC_RMBE_SIZES; i++) { - mutex_lock(&lgr->rmbs_lock); + down_write(&lgr->rmbs_lock); list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list) smcr_buf_unmap_link(buf_desc, true, lnk); - mutex_unlock(&lgr->rmbs_lock); - mutex_lock(&lgr->sndbufs_lock); + up_write(&lgr->rmbs_lock); + + down_write(&lgr->sndbufs_lock); list_for_each_entry_safe(buf_desc, bf, &lgr->sndbufs[i], list) smcr_buf_unmap_link(buf_desc, false, lnk); - mutex_unlock(&lgr->sndbufs_lock); + up_write(&lgr->sndbufs_lock); } } @@ -1373,19 +1378,19 @@ static void smc_lgr_free(struct smc_link_group *lgr) int i; if (!lgr->is_smcd) { - mutex_lock(&lgr->llc_conf_mutex); + down_write(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (lgr->lnk[i].state != SMC_LNK_UNUSED) smcr_link_clear(&lgr->lnk[i], false); } - mutex_unlock(&lgr->llc_conf_mutex); + up_write(&lgr->llc_conf_mutex); smc_llc_lgr_clear(lgr); } destroy_workqueue(lgr->tx_wq); if (lgr->is_smcd) { smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); - put_device(&lgr->smcd->dev); + put_device(lgr->smcd->ops->get_dev(lgr->smcd)); } smc_lgr_put(lgr); /* theoretically last lgr_put */ } @@ -1692,12 +1697,12 @@ static void smcr_link_down(struct smc_link *lnk) } else { if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) { /* another llc task is ongoing */ - mutex_unlock(&lgr->llc_conf_mutex); + up_write(&lgr->llc_conf_mutex); wait_event_timeout(lgr->llc_flow_waiter, (list_empty(&lgr->list) || lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE), SMC_LLC_WAIT_TIME); - mutex_lock(&lgr->llc_conf_mutex); + down_write(&lgr->llc_conf_mutex); } if (!list_empty(&lgr->list)) { smc_llc_send_delete_link(to_lnk, del_link_id, @@ -1757,9 +1762,9 @@ static void smc_link_down_work(struct work_struct *work) if (list_empty(&lgr->list)) return; wake_up_all(&lgr->llc_msg_waiter); - mutex_lock(&lgr->llc_conf_mutex); + down_write(&lgr->llc_conf_mutex); smcr_link_down(link); - mutex_unlock(&lgr->llc_conf_mutex); + up_write(&lgr->llc_conf_mutex); } static int smc_vlan_by_tcpsk_walk(struct net_device *lower_dev, @@ -1986,19 +1991,19 @@ int smc_uncompress_bufsize(u8 compressed) * buffer size; if not available, return NULL */ static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize, - struct mutex *lock, + struct rw_semaphore *lock, struct list_head *buf_list) { struct smc_buf_desc *buf_slot; - mutex_lock(lock); + down_read(lock); list_for_each_entry(buf_slot, buf_list, list) { if (cmpxchg(&buf_slot->used, 0, 1) == 0) { - mutex_unlock(lock); + up_read(lock); return buf_slot; } } - mutex_unlock(lock); + up_read(lock); return NULL; } @@ -2107,13 +2112,13 @@ int smcr_link_reg_buf(struct smc_link *link, struct smc_buf_desc *buf_desc) return 0; } -static int _smcr_buf_map_lgr(struct smc_link *lnk, struct mutex *lock, +static int _smcr_buf_map_lgr(struct smc_link *lnk, struct rw_semaphore *lock, struct list_head *lst, bool is_rmb) { struct smc_buf_desc *buf_desc, *bf; int rc = 0; - mutex_lock(lock); + down_write(lock); list_for_each_entry_safe(buf_desc, bf, lst, list) { if (!buf_desc->used) continue; @@ -2122,7 +2127,7 @@ static int _smcr_buf_map_lgr(struct smc_link *lnk, struct mutex *lock, goto out; } out: - mutex_unlock(lock); + up_write(lock); return rc; } @@ -2155,37 +2160,37 @@ int smcr_buf_reg_lgr(struct smc_link *lnk) int i, rc = 0; /* reg all RMBs for a new link */ - mutex_lock(&lgr->rmbs_lock); + down_write(&lgr->rmbs_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list) { if (!buf_desc->used) continue; rc = smcr_link_reg_buf(lnk, buf_desc); if (rc) { - mutex_unlock(&lgr->rmbs_lock); + up_write(&lgr->rmbs_lock); return rc; } } } - mutex_unlock(&lgr->rmbs_lock); + up_write(&lgr->rmbs_lock); if (lgr->buf_type == SMCR_PHYS_CONT_BUFS) return rc; /* reg all vzalloced sndbufs for a new link */ - mutex_lock(&lgr->sndbufs_lock); + down_write(&lgr->sndbufs_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { list_for_each_entry_safe(buf_desc, bf, &lgr->sndbufs[i], list) { if (!buf_desc->used || !buf_desc->is_vm) continue; rc = smcr_link_reg_buf(lnk, buf_desc); if (rc) { - mutex_unlock(&lgr->sndbufs_lock); + up_write(&lgr->sndbufs_lock); return rc; } } } - mutex_unlock(&lgr->sndbufs_lock); + up_write(&lgr->sndbufs_lock); return rc; } @@ -2243,7 +2248,7 @@ static int smcr_buf_map_usable_links(struct smc_link_group *lgr, int i, rc = 0, cnt = 0; /* protect against parallel link reconfiguration */ - mutex_lock(&lgr->llc_conf_mutex); + down_read(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &lgr->lnk[i]; @@ -2256,7 +2261,7 @@ static int smcr_buf_map_usable_links(struct smc_link_group *lgr, cnt++; } out: - mutex_unlock(&lgr->llc_conf_mutex); + up_read(&lgr->llc_conf_mutex); if (!rc && !cnt) rc = -EINVAL; return rc; @@ -2305,8 +2310,8 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) struct smc_link_group *lgr = conn->lgr; struct list_head *buf_list; int bufsize, bufsize_short; + struct rw_semaphore *lock; /* lock buffer list */ bool is_dgraded = false; - struct mutex *lock; /* lock buffer list */ int sk_buf_size; if (is_rmb) @@ -2354,9 +2359,9 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) SMC_STAT_RMB_ALLOC(smc, is_smcd, is_rmb); SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize); buf_desc->used = 1; - mutex_lock(lock); + down_write(lock); list_add(&buf_desc->list, buf_list); - mutex_unlock(lock); + up_write(lock); break; /* found */ } @@ -2430,9 +2435,9 @@ int smc_buf_create(struct smc_sock *smc, bool is_smcd) /* create rmb */ rc = __smc_buf_create(smc, is_smcd, true); if (rc) { - mutex_lock(&smc->conn.lgr->sndbufs_lock); + down_write(&smc->conn.lgr->sndbufs_lock); list_del(&smc->conn.sndbuf_desc->list); - mutex_unlock(&smc->conn.lgr->sndbufs_lock); + up_write(&smc->conn.lgr->sndbufs_lock); smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc); smc->conn.sndbuf_desc = NULL; } @@ -2595,6 +2600,7 @@ static int smc_core_reboot_event(struct notifier_block *this, { smc_lgrs_shutdown(); smc_ib_unregister_client(); + smc_ism_exit(); return 0; } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 285f9bd8e232..08b457c2d294 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -252,9 +252,9 @@ struct smc_link_group { unsigned short vlan_id; /* vlan id of link group */ struct list_head sndbufs[SMC_RMBE_SIZES];/* tx buffers */ - struct mutex sndbufs_lock; /* protects tx buffers */ + struct rw_semaphore sndbufs_lock; /* protects tx buffers */ struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */ - struct mutex rmbs_lock; /* protects rx buffers */ + struct rw_semaphore rmbs_lock; /* protects rx buffers */ u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */ struct delayed_work free_work; /* delayed freeing of an lgr */ @@ -298,7 +298,7 @@ struct smc_link_group { /* queue for llc events */ spinlock_t llc_event_q_lock; /* protects llc_event_q */ - struct mutex llc_conf_mutex; + struct rw_semaphore llc_conf_mutex; /* protects lgr reconfig. */ struct work_struct llc_add_link_work; struct work_struct llc_del_link_work; diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 80ea7d954ece..7ff2152971a5 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -167,12 +167,13 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, !list_empty(&smc->conn.lgr->list)) { struct smc_connection *conn = &smc->conn; struct smcd_diag_dmbinfo dinfo; + struct smcd_dev *smcd = conn->lgr->smcd; memset(&dinfo, 0, sizeof(dinfo)); dinfo.linkid = *((u32 *)conn->lgr->id); dinfo.peer_gid = conn->lgr->peer_gid; - dinfo.my_gid = conn->lgr->smcd->local_gid; + dinfo.my_gid = smcd->ops->get_local_gid(smcd); dinfo.token = conn->rmb_desc->token; dinfo.peer_token = conn->peer_token; diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 911fe08bc54b..3b0b7710c6b0 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -17,6 +17,7 @@ #include "smc_ism.h" #include "smc_pnet.h" #include "smc_netlink.h" +#include "linux/ism.h" struct smcd_dev_list smcd_dev_list = { .list = LIST_HEAD_INIT(smcd_dev_list.list), @@ -26,6 +27,22 @@ struct smcd_dev_list smcd_dev_list = { static bool smc_ism_v2_capable; static u8 smc_ism_v2_system_eid[SMC_MAX_EID_LEN]; +#if IS_ENABLED(CONFIG_ISM) +static void smcd_register_dev(struct ism_dev *ism); +static void smcd_unregister_dev(struct ism_dev *ism); +static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event); +static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, + u16 dmbemask); + +static struct ism_client smc_ism_client = { + .name = "SMC-D", + .add = smcd_register_dev, + .remove = smcd_unregister_dev, + .handle_event = smcd_handle_event, + .handle_irq = smcd_handle_irq, +}; +#endif + /* Test if an ISM communication is possible - same CPC */ int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *smcd) { @@ -183,6 +200,7 @@ int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, struct smc_buf_desc *dmb_desc) { +#if IS_ENABLED(CONFIG_ISM) struct smcd_dmb dmb; int rc; @@ -191,7 +209,7 @@ int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, dmb.sba_idx = dmb_desc->sba_idx; dmb.vlan_id = lgr->vlan_id; dmb.rgid = lgr->peer_gid; - rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb); + rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb, &smc_ism_client); if (!rc) { dmb_desc->sba_idx = dmb.sba_idx; dmb_desc->token = dmb.dmb_tok; @@ -200,6 +218,9 @@ int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, dmb_desc->len = dmb.dmb_len; } return rc; +#else + return 0; +#endif } static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, @@ -210,9 +231,11 @@ static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, struct smc_pci_dev smc_pci_dev; struct nlattr *port_attrs; struct nlattr *attrs; + struct ism_dev *ism; int use_cnt = 0; void *nlh; + ism = smcd->priv; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_GET_DEV_SMCD); @@ -227,7 +250,7 @@ static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, use_cnt > 0)) goto errattr; memset(&smc_pci_dev, 0, sizeof(smc_pci_dev)); - smc_set_pci_values(to_pci_dev(smcd->dev.parent), &smc_pci_dev); + smc_set_pci_values(to_pci_dev(ism->dev.parent), &smc_pci_dev); if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev.pci_fid)) goto errattr; if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev.pci_pchid)) @@ -293,10 +316,11 @@ int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } +#if IS_ENABLED(CONFIG_ISM) struct smc_ism_event_work { struct work_struct work; struct smcd_dev *smcd; - struct smcd_event event; + struct ism_event event; }; #define ISM_EVENT_REQUEST 0x0001 @@ -336,24 +360,6 @@ static void smcd_handle_sw_event(struct smc_ism_event_work *wrk) } } -int smc_ism_signal_shutdown(struct smc_link_group *lgr) -{ - int rc; - union smcd_sw_event_info ev_info; - - if (lgr->peer_shutdown) - return 0; - - memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE); - ev_info.vlan_id = lgr->vlan_id; - ev_info.code = ISM_EVENT_REQUEST; - rc = lgr->smcd->ops->signal_event(lgr->smcd, lgr->peer_gid, - ISM_EVENT_REQUEST_IR, - ISM_EVENT_CODE_SHUTDOWN, - ev_info.info); - return rc; -} - /* worker for SMC-D events */ static void smc_ism_event_work(struct work_struct *work) { @@ -373,44 +379,25 @@ static void smc_ism_event_work(struct work_struct *work) kfree(wrk); } -static void smcd_release(struct device *dev) -{ - struct smcd_dev *smcd = container_of(dev, struct smcd_dev, dev); - - kfree(smcd->conn); - kfree(smcd); -} - -struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, - const struct smcd_ops *ops, int max_dmbs) +static struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, + const struct smcd_ops *ops, int max_dmbs) { struct smcd_dev *smcd; - smcd = kzalloc(sizeof(*smcd), GFP_KERNEL); + smcd = devm_kzalloc(parent, sizeof(*smcd), GFP_KERNEL); if (!smcd) return NULL; - smcd->conn = kcalloc(max_dmbs, sizeof(struct smc_connection *), - GFP_KERNEL); - if (!smcd->conn) { - kfree(smcd); + smcd->conn = devm_kcalloc(parent, max_dmbs, + sizeof(struct smc_connection *), GFP_KERNEL); + if (!smcd->conn) return NULL; - } smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)", WQ_MEM_RECLAIM, name); - if (!smcd->event_wq) { - kfree(smcd->conn); - kfree(smcd); + if (!smcd->event_wq) return NULL; - } - smcd->dev.parent = parent; - smcd->dev.release = smcd_release; - device_initialize(&smcd->dev); - dev_set_name(&smcd->dev, name); smcd->ops = ops; - if (smc_pnetid_by_dev_port(parent, 0, smcd->pnetid)) - smc_pnetid_by_table_smcd(smcd); spin_lock_init(&smcd->lock); spin_lock_init(&smcd->lgr_lock); @@ -419,11 +406,23 @@ struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, init_waitqueue_head(&smcd->lgrs_deleted); return smcd; } -EXPORT_SYMBOL_GPL(smcd_alloc_dev); -int smcd_register_dev(struct smcd_dev *smcd) +static void smcd_register_dev(struct ism_dev *ism) { - int rc; + const struct smcd_ops *ops = ism_get_smcd_ops(); + struct smcd_dev *smcd; + + if (!ops) + return; + + smcd = smcd_alloc_dev(&ism->pdev->dev, dev_name(&ism->pdev->dev), ops, + ISM_NR_DMBS); + if (!smcd) + return; + smcd->priv = ism; + ism_set_priv(ism, &smc_ism_client, smcd); + if (smc_pnetid_by_dev_port(&ism->pdev->dev, 0, smcd->pnetid)) + smc_pnetid_by_table_smcd(smcd); mutex_lock(&smcd_dev_list.mutex); if (list_empty(&smcd_dev_list.list)) { @@ -444,43 +443,28 @@ int smcd_register_dev(struct smcd_dev *smcd) mutex_unlock(&smcd_dev_list.mutex); pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", - dev_name(&smcd->dev), smcd->pnetid, + dev_name(&ism->dev), smcd->pnetid, smcd->pnetid_by_user ? " (user defined)" : ""); - rc = device_add(&smcd->dev); - if (rc) { - mutex_lock(&smcd_dev_list.mutex); - list_del(&smcd->list); - mutex_unlock(&smcd_dev_list.mutex); - } - - return rc; + return; } -EXPORT_SYMBOL_GPL(smcd_register_dev); -void smcd_unregister_dev(struct smcd_dev *smcd) +static void smcd_unregister_dev(struct ism_dev *ism) { + struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); + pr_warn_ratelimited("smc: removing smcd device %s\n", - dev_name(&smcd->dev)); + dev_name(&ism->dev)); + smcd->going_away = 1; + smc_smcd_terminate_all(smcd); mutex_lock(&smcd_dev_list.mutex); list_del_init(&smcd->list); mutex_unlock(&smcd_dev_list.mutex); - smcd->going_away = 1; - smc_smcd_terminate_all(smcd); destroy_workqueue(smcd->event_wq); - - device_del(&smcd->dev); -} -EXPORT_SYMBOL_GPL(smcd_unregister_dev); - -void smcd_free_dev(struct smcd_dev *smcd) -{ - put_device(&smcd->dev); } -EXPORT_SYMBOL_GPL(smcd_free_dev); /* SMCD Device event handler. Called from ISM device interrupt handler. - * Parameters are smcd device pointer, + * Parameters are ism device pointer, * - event->type (0 --> DMB, 1 --> GID), * - event->code (event code), * - event->tok (either DMB token when event type 0, or GID when event type 1) @@ -490,8 +474,9 @@ EXPORT_SYMBOL_GPL(smcd_free_dev); * Context: * - Function called in IRQ context from ISM device driver event handler. */ -void smcd_handle_event(struct smcd_dev *smcd, struct smcd_event *event) +static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event) { + struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); struct smc_ism_event_work *wrk; if (smcd->going_away) @@ -505,17 +490,18 @@ void smcd_handle_event(struct smcd_dev *smcd, struct smcd_event *event) wrk->event = *event; queue_work(smcd->event_wq, &wrk->work); } -EXPORT_SYMBOL_GPL(smcd_handle_event); /* SMCD Device interrupt handler. Called from ISM device interrupt handler. - * Parameters are smcd device pointer, DMB number, and the DMBE bitmask. + * Parameters are the ism device pointer, DMB number, and the DMBE bitmask. * Find the connection and schedule the tasklet for this connection. * * Context: * - Function called in IRQ context from ISM device driver IRQ handler. */ -void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno, u16 dmbemask) +static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, + u16 dmbemask) { + struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); struct smc_connection *conn = NULL; unsigned long flags; @@ -525,10 +511,44 @@ void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno, u16 dmbemask) tasklet_schedule(&conn->rx_tsklet); spin_unlock_irqrestore(&smcd->lock, flags); } -EXPORT_SYMBOL_GPL(smcd_handle_irq); +#endif + +int smc_ism_signal_shutdown(struct smc_link_group *lgr) +{ + int rc = 0; +#if IS_ENABLED(CONFIG_ISM) + union smcd_sw_event_info ev_info; + + if (lgr->peer_shutdown) + return 0; + + memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE); + ev_info.vlan_id = lgr->vlan_id; + ev_info.code = ISM_EVENT_REQUEST; + rc = lgr->smcd->ops->signal_event(lgr->smcd, lgr->peer_gid, + ISM_EVENT_REQUEST_IR, + ISM_EVENT_CODE_SHUTDOWN, + ev_info.info); +#endif + return rc; +} -void __init smc_ism_init(void) +int smc_ism_init(void) { + int rc = 0; + +#if IS_ENABLED(CONFIG_ISM) smc_ism_v2_capable = false; memset(smc_ism_v2_system_eid, 0, SMC_MAX_EID_LEN); + + rc = ism_register_client(&smc_ism_client); +#endif + return rc; +} + +void smc_ism_exit(void) +{ +#if IS_ENABLED(CONFIG_ISM) + ism_unregister_client(&smc_ism_client); +#endif } diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h index d6b2db604fe8..832b2f42d79f 100644 --- a/net/smc/smc_ism.h +++ b/net/smc/smc_ism.h @@ -42,7 +42,8 @@ int smc_ism_signal_shutdown(struct smc_link_group *lgr); void smc_ism_get_system_eid(u8 **eid); u16 smc_ism_get_chid(struct smcd_dev *dev); bool smc_ism_is_v2_capable(void); -void smc_ism_init(void); +int smc_ism_init(void); +void smc_ism_exit(void); int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb); static inline int smc_ism_write(struct smcd_dev *smcd, u64 dmb_tok, diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 524649d0ab65..a0840b8c935b 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -608,7 +608,7 @@ static int smc_llc_fill_ext_v2(struct smc_llc_msg_add_link_v2_ext *ext, prim_lnk_idx = link->link_idx; lnk_idx = link_new->link_idx; - mutex_lock(&lgr->rmbs_lock); + down_write(&lgr->rmbs_lock); ext->num_rkeys = lgr->conns_num; if (!ext->num_rkeys) goto out; @@ -628,7 +628,7 @@ static int smc_llc_fill_ext_v2(struct smc_llc_msg_add_link_v2_ext *ext, } len += i * sizeof(ext->rt[0]); out: - mutex_unlock(&lgr->rmbs_lock); + up_write(&lgr->rmbs_lock); return len; } @@ -889,7 +889,7 @@ static int smc_llc_cli_rkey_exchange(struct smc_link *link, int rc = 0; int i; - mutex_lock(&lgr->rmbs_lock); + down_write(&lgr->rmbs_lock); num_rkeys_send = lgr->conns_num; buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst); do { @@ -916,7 +916,7 @@ static int smc_llc_cli_rkey_exchange(struct smc_link *link, break; } while (num_rkeys_send || num_rkeys_recv); - mutex_unlock(&lgr->rmbs_lock); + up_write(&lgr->rmbs_lock); return rc; } @@ -999,14 +999,14 @@ static void smc_llc_save_add_link_rkeys(struct smc_link *link, ext = (struct smc_llc_msg_add_link_v2_ext *)((u8 *)lgr->wr_rx_buf_v2 + SMC_WR_TX_SIZE); max = min_t(u8, ext->num_rkeys, SMC_LLC_RKEYS_PER_MSG_V2); - mutex_lock(&lgr->rmbs_lock); + down_write(&lgr->rmbs_lock); for (i = 0; i < max; i++) { smc_rtoken_set(lgr, link->link_idx, link_new->link_idx, ext->rt[i].rmb_key, ext->rt[i].rmb_vaddr_new, ext->rt[i].rmb_key_new); } - mutex_unlock(&lgr->rmbs_lock); + up_write(&lgr->rmbs_lock); } static void smc_llc_save_add_link_info(struct smc_link *link, @@ -1202,12 +1202,12 @@ static void smc_llc_process_cli_add_link(struct smc_link_group *lgr) qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl); - mutex_lock(&lgr->llc_conf_mutex); + down_write(&lgr->llc_conf_mutex); if (smc_llc_is_local_add_link(&qentry->msg)) smc_llc_cli_add_link_invite(qentry->link, qentry); else smc_llc_cli_add_link(qentry->link, qentry); - mutex_unlock(&lgr->llc_conf_mutex); + up_write(&lgr->llc_conf_mutex); } static int smc_llc_active_link_count(struct smc_link_group *lgr) @@ -1313,7 +1313,7 @@ static int smc_llc_srv_rkey_exchange(struct smc_link *link, int rc = 0; int i; - mutex_lock(&lgr->rmbs_lock); + down_write(&lgr->rmbs_lock); num_rkeys_send = lgr->conns_num; buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst); do { @@ -1338,7 +1338,7 @@ static int smc_llc_srv_rkey_exchange(struct smc_link *link, smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); } while (num_rkeys_send || num_rkeys_recv); out: - mutex_unlock(&lgr->rmbs_lock); + up_write(&lgr->rmbs_lock); return rc; } @@ -1509,13 +1509,13 @@ static void smc_llc_process_srv_add_link(struct smc_link_group *lgr) qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl); - mutex_lock(&lgr->llc_conf_mutex); + down_write(&lgr->llc_conf_mutex); rc = smc_llc_srv_add_link(link, qentry); if (!rc && lgr->type == SMC_LGR_SYMMETRIC) { /* delete any asymmetric link */ smc_llc_delete_asym_link(lgr); } - mutex_unlock(&lgr->llc_conf_mutex); + up_write(&lgr->llc_conf_mutex); kfree(qentry); } @@ -1582,7 +1582,7 @@ static void smc_llc_process_cli_delete_link(struct smc_link_group *lgr) smc_lgr_terminate_sched(lgr); goto out; } - mutex_lock(&lgr->llc_conf_mutex); + down_write(&lgr->llc_conf_mutex); /* delete single link */ for (lnk_idx = 0; lnk_idx < SMC_LINKS_PER_LGR_MAX; lnk_idx++) { if (lgr->lnk[lnk_idx].link_id != del_llc->link_num) @@ -1616,7 +1616,7 @@ static void smc_llc_process_cli_delete_link(struct smc_link_group *lgr) smc_lgr_terminate_sched(lgr); } out_unlock: - mutex_unlock(&lgr->llc_conf_mutex); + up_write(&lgr->llc_conf_mutex); out: kfree(qentry); } @@ -1652,7 +1652,7 @@ static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr) int active_links; int i; - mutex_lock(&lgr->llc_conf_mutex); + down_write(&lgr->llc_conf_mutex); qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl); lnk = qentry->link; del_llc = &qentry->msg.delete_link; @@ -1708,7 +1708,7 @@ static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr) smc_llc_add_link_local(lnk); } out: - mutex_unlock(&lgr->llc_conf_mutex); + up_write(&lgr->llc_conf_mutex); kfree(qentry); } @@ -2126,7 +2126,7 @@ void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) spin_lock_init(&lgr->llc_flow_lock); init_waitqueue_head(&lgr->llc_flow_waiter); init_waitqueue_head(&lgr->llc_msg_waiter); - mutex_init(&lgr->llc_conf_mutex); + init_rwsem(&lgr->llc_conf_mutex); lgr->llc_testlink_time = READ_ONCE(net->smc.sysctl_smcr_testlink_time); } diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 25fb2fd186e2..11775401df68 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -103,7 +103,7 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) struct smc_pnetentry *pnetelem, *tmp_pe; struct smc_pnettable *pnettable; struct smc_ib_device *ibdev; - struct smcd_dev *smcd_dev; + struct smcd_dev *smcd; struct smc_net *sn; int rc = -ENOENT; int ibport; @@ -162,16 +162,17 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) mutex_unlock(&smc_ib_devices.mutex); /* remove smcd devices */ mutex_lock(&smcd_dev_list.mutex); - list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) { - if (smcd_dev->pnetid_by_user && + list_for_each_entry(smcd, &smcd_dev_list.list, list) { + if (smcd->pnetid_by_user && (!pnet_name || - smc_pnet_match(pnet_name, smcd_dev->pnetid))) { + smc_pnet_match(pnet_name, smcd->pnetid))) { pr_warn_ratelimited("smc: smcd device %s " "erased user defined pnetid " - "%.16s\n", dev_name(&smcd_dev->dev), - smcd_dev->pnetid); - memset(smcd_dev->pnetid, 0, SMC_MAX_PNETID_LEN); - smcd_dev->pnetid_by_user = false; + "%.16s\n", + dev_name(smcd->ops->get_dev(smcd)), + smcd->pnetid); + memset(smcd->pnetid, 0, SMC_MAX_PNETID_LEN); + smcd->pnetid_by_user = false; rc = 0; } } @@ -331,8 +332,8 @@ static struct smcd_dev *smc_pnet_find_smcd(char *smcd_name) mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) { - if (!strncmp(dev_name(&smcd_dev->dev), smcd_name, - IB_DEVICE_NAME_MAX - 1)) + if (!strncmp(dev_name(smcd_dev->ops->get_dev(smcd_dev)), + smcd_name, IB_DEVICE_NAME_MAX - 1)) goto out; } smcd_dev = NULL; @@ -411,7 +412,8 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, struct smc_ib_device *ib_dev; bool smcddev_applied = true; bool ibdev_applied = true; - struct smcd_dev *smcd_dev; + struct smcd_dev *smcd; + struct device *dev; bool new_ibdev; /* try to apply the pnetid to active devices */ @@ -425,14 +427,16 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, ib_port, ib_dev->pnetid[ib_port - 1]); } - smcd_dev = smc_pnet_find_smcd(ib_name); - if (smcd_dev) { - smcddev_applied = smc_pnet_apply_smcd(smcd_dev, pnet_name); - if (smcddev_applied) + smcd = smc_pnet_find_smcd(ib_name); + if (smcd) { + smcddev_applied = smc_pnet_apply_smcd(smcd, pnet_name); + if (smcddev_applied) { + dev = smcd->ops->get_dev(smcd); pr_warn_ratelimited("smc: smcd device %s " "applied user defined pnetid " - "%.16s\n", dev_name(&smcd_dev->dev), - smcd_dev->pnetid); + "%.16s\n", dev_name(dev), + smcd->pnetid); + } } /* Apply fails when a device has a hardware-defined pnetid set, do not * add a pnet table entry in that case. @@ -1181,7 +1185,7 @@ int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port) */ int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev) { - const char *ib_name = dev_name(&smcddev->dev); + const char *ib_name = dev_name(smcddev->ops->get_dev(smcddev)); struct smc_pnettable *pnettable; struct smc_pnetentry *tmp_pe; struct smc_net *sn; diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 17c5aee7ee4f..4380d32f5a5f 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -13,8 +13,10 @@ #include <linux/net.h> #include <linux/rcupdate.h> #include <linux/sched/signal.h> +#include <linux/splice.h> #include <net/sock.h> +#include <trace/events/sock.h> #include "smc.h" #include "smc_core.h" @@ -31,6 +33,8 @@ static void smc_rx_wake_up(struct sock *sk) { struct socket_wq *wq; + trace_sk_data_ready(sk); + /* derived from sock_def_readable() */ /* called already in smc_listen_work() */ rcu_read_lock(); diff --git a/net/socket.c b/net/socket.c index 888cd618a968..77626e4d9690 100644 --- a/net/socket.c +++ b/net/socket.c @@ -106,6 +106,7 @@ #include <net/busy_poll.h> #include <linux/errqueue.h> #include <linux/ptp_clock_kernel.h> +#include <trace/events/sock.h> #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int sysctl_net_busy_read __read_mostly; @@ -709,12 +710,22 @@ INDIRECT_CALLABLE_DECLARE(int inet_sendmsg(struct socket *, struct msghdr *, size_t)); INDIRECT_CALLABLE_DECLARE(int inet6_sendmsg(struct socket *, struct msghdr *, size_t)); + +static noinline void call_trace_sock_send_length(struct sock *sk, int ret, + int flags) +{ + trace_sock_send_length(sk, ret, 0); +} + static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) { int ret = INDIRECT_CALL_INET(sock->ops->sendmsg, inet6_sendmsg, inet_sendmsg, sock, msg, msg_data_left(msg)); BUG_ON(ret == -EIOCBQUEUED); + + if (trace_sock_send_length_enabled()) + call_trace_sock_send_length(sock->sk, ret, 0); return ret; } @@ -989,12 +1000,21 @@ INDIRECT_CALLABLE_DECLARE(int inet_recvmsg(struct socket *, struct msghdr *, size_t, int)); INDIRECT_CALLABLE_DECLARE(int inet6_recvmsg(struct socket *, struct msghdr *, size_t, int)); + +static noinline void call_trace_sock_recv_length(struct sock *sk, int ret, int flags) +{ + trace_sock_recv_length(sk, ret, flags); +} + static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg, int flags) { - return INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg, - inet_recvmsg, sock, msg, msg_data_left(msg), - flags); + int ret = INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg, + inet_recvmsg, sock, msg, + msg_data_left(msg), flags); + if (trace_sock_recv_length_enabled()) + call_trace_sock_recv_length(sock->sk, ret, flags); + return ret; } /** @@ -1044,6 +1064,7 @@ static ssize_t sock_sendpage(struct file *file, struct page *page, { struct socket *sock; int flags; + int ret; sock = file->private_data; @@ -1051,7 +1072,11 @@ static ssize_t sock_sendpage(struct file *file, struct page *page, /* more is a combination of MSG_MORE and MSG_SENDPAGE_NOTLAST */ flags |= more; - return kernel_sendpage(sock, page, offset, size, flags); + ret = kernel_sendpage(sock, page, offset, size, flags); + + if (trace_sock_send_length_enabled()) + call_trace_sock_send_length(sock->sk, ret, 0); + return ret; } static ssize_t sock_splice_read(struct file *file, loff_t *ppos, diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 815baf308236..99eafe87b1d5 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -55,6 +55,7 @@ #include <linux/sunrpc/stats.h> #include <linux/sunrpc/xprt.h> +#include <trace/events/sock.h> #include <trace/events/sunrpc.h> #include "socklib.h" @@ -310,6 +311,8 @@ static void svc_data_ready(struct sock *sk) { struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; + trace_sk_data_ready(sk); + if (svsk) { /* Refer to svc_setup_socket() for details. */ rmb(); @@ -687,6 +690,8 @@ static void svc_tcp_listen_data_ready(struct sock *sk) { struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; + trace_sk_data_ready(sk); + if (svsk) { /* Refer to svc_setup_socket() for details. */ rmb(); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index aaa5b2741b79..adcbedc244d6 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -52,6 +52,7 @@ #include <linux/uio.h> #include <linux/sched/mm.h> +#include <trace/events/sock.h> #include <trace/events/sunrpc.h> #include "socklib.h" @@ -1378,6 +1379,8 @@ static void xs_data_ready(struct sock *sk) { struct rpc_xprt *xprt; + trace_sk_data_ready(sk); + xprt = xprt_from_sock(sk); if (xprt != NULL) { struct sock_xprt *transport = container_of(xprt, diff --git a/net/tipc/socket.c b/net/tipc/socket.c index b35c8701876a..07c9bf5f7f5c 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -37,6 +37,7 @@ #include <linux/rhashtable.h> #include <linux/sched/signal.h> +#include <trace/events/sock.h> #include "core.h" #include "name_table.h" @@ -2130,6 +2131,8 @@ static void tipc_data_ready(struct sock *sk) { struct socket_wq *wq; + trace_sk_data_ready(sk); + rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index 69c88cc03887..8ee0c07d00e9 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -43,6 +43,7 @@ #include "bearer.h" #include <net/sock.h> #include <linux/module.h> +#include <trace/events/sock.h> /* Number of messages to send before rescheduling */ #define MAX_SEND_MSG_COUNT 25 @@ -439,6 +440,8 @@ static void tipc_conn_data_ready(struct sock *sk) { struct tipc_conn *con; + trace_sk_data_ready(sk); + read_lock_bh(&sk->sk_callback_lock); con = sk->sk_user_data; if (connected(con)) { @@ -496,6 +499,8 @@ static void tipc_topsrv_listener_data_ready(struct sock *sk) { struct tipc_topsrv *srv; + trace_sk_data_ready(sk); + read_lock_bh(&sk->sk_callback_lock); srv = sk->sk_user_data; if (srv) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index a83d2b4275fa..6d0a534b7baa 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -43,6 +43,7 @@ #include <net/strparser.h> #include <net/tls.h> +#include <trace/events/sock.h> #include "tls.h" @@ -2284,6 +2285,8 @@ static void tls_data_ready(struct sock *sk) struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct sk_psock *psock; + trace_sk_data_ready(sk); + tls_strp_data_ready(&ctx->strp); psock = sk_psock_get(sk); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index f0c2293f1d3b..0be25e712c28 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -112,6 +112,7 @@ #include <linux/mount.h> #include <net/checksum.h> #include <linux/security.h> +#include <linux/splice.h> #include <linux/freezer.h> #include <linux/file.h> #include <linux/btf_ids.h> @@ -807,23 +808,23 @@ static int unix_count_nr_fds(struct sock *sk) static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) { struct sock *sk = sock->sk; + unsigned char s_state; struct unix_sock *u; - int nr_fds; + int nr_fds = 0; if (sk) { + s_state = READ_ONCE(sk->sk_state); u = unix_sk(sk); - if (sock->type == SOCK_DGRAM) { - nr_fds = atomic_read(&u->scm_stat.nr_fds); - goto out_print; - } - unix_state_lock(sk); - if (sk->sk_state != TCP_LISTEN) + /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their + * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN. + * SOCK_DGRAM is ordinary. So, no lock is needed. + */ + if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED) nr_fds = atomic_read(&u->scm_stat.nr_fds); - else + else if (s_state == TCP_LISTEN) nr_fds = unix_count_nr_fds(sk); - unix_state_unlock(sk); -out_print: + seq_printf(m, "scm_fds: %u\n", nr_fds); } } diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index d593d5b6d4b1..19aea7cba26e 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1861,8 +1861,9 @@ static int vsock_connectible_sendmsg(struct socket *sock, struct msghdr *msg, written = transport->stream_enqueue(vsk, msg, len - total_written); } + if (written < 0) { - err = -ENOMEM; + err = written; goto out_err; } diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index ad64f403536a..28b5a8e8e094 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -42,8 +42,7 @@ struct virtio_vsock { bool tx_run; struct work_struct send_pkt_work; - spinlock_t send_pkt_list_lock; - struct list_head send_pkt_list; + struct sk_buff_head send_pkt_queue; atomic_t queued_replies; @@ -101,41 +100,31 @@ virtio_transport_send_pkt_work(struct work_struct *work) vq = vsock->vqs[VSOCK_VQ_TX]; for (;;) { - struct virtio_vsock_pkt *pkt; struct scatterlist hdr, buf, *sgs[2]; int ret, in_sg = 0, out_sg = 0; + struct sk_buff *skb; bool reply; - spin_lock_bh(&vsock->send_pkt_list_lock); - if (list_empty(&vsock->send_pkt_list)) { - spin_unlock_bh(&vsock->send_pkt_list_lock); + skb = virtio_vsock_skb_dequeue(&vsock->send_pkt_queue); + if (!skb) break; - } - - pkt = list_first_entry(&vsock->send_pkt_list, - struct virtio_vsock_pkt, list); - list_del_init(&pkt->list); - spin_unlock_bh(&vsock->send_pkt_list_lock); - virtio_transport_deliver_tap_pkt(pkt); + virtio_transport_deliver_tap_pkt(skb); + reply = virtio_vsock_skb_reply(skb); - reply = pkt->reply; - - sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr)); + sg_init_one(&hdr, virtio_vsock_hdr(skb), sizeof(*virtio_vsock_hdr(skb))); sgs[out_sg++] = &hdr; - if (pkt->buf) { - sg_init_one(&buf, pkt->buf, pkt->len); + if (skb->len > 0) { + sg_init_one(&buf, skb->data, skb->len); sgs[out_sg++] = &buf; } - ret = virtqueue_add_sgs(vq, sgs, out_sg, in_sg, pkt, GFP_KERNEL); + ret = virtqueue_add_sgs(vq, sgs, out_sg, in_sg, skb, GFP_KERNEL); /* Usually this means that there is no more space available in * the vq */ if (ret < 0) { - spin_lock_bh(&vsock->send_pkt_list_lock); - list_add(&pkt->list, &vsock->send_pkt_list); - spin_unlock_bh(&vsock->send_pkt_list_lock); + virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb); break; } @@ -164,32 +153,32 @@ out: } static int -virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt) +virtio_transport_send_pkt(struct sk_buff *skb) { + struct virtio_vsock_hdr *hdr; struct virtio_vsock *vsock; - int len = pkt->len; + int len = skb->len; + + hdr = virtio_vsock_hdr(skb); rcu_read_lock(); vsock = rcu_dereference(the_virtio_vsock); if (!vsock) { - virtio_transport_free_pkt(pkt); + kfree_skb(skb); len = -ENODEV; goto out_rcu; } - if (le64_to_cpu(pkt->hdr.dst_cid) == vsock->guest_cid) { - virtio_transport_free_pkt(pkt); + if (le64_to_cpu(hdr->dst_cid) == vsock->guest_cid) { + kfree_skb(skb); len = -ENODEV; goto out_rcu; } - if (pkt->reply) + if (virtio_vsock_skb_reply(skb)) atomic_inc(&vsock->queued_replies); - spin_lock_bh(&vsock->send_pkt_list_lock); - list_add_tail(&pkt->list, &vsock->send_pkt_list); - spin_unlock_bh(&vsock->send_pkt_list_lock); - + virtio_vsock_skb_queue_tail(&vsock->send_pkt_queue, skb); queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work); out_rcu: @@ -201,9 +190,7 @@ static int virtio_transport_cancel_pkt(struct vsock_sock *vsk) { struct virtio_vsock *vsock; - struct virtio_vsock_pkt *pkt, *n; int cnt = 0, ret; - LIST_HEAD(freeme); rcu_read_lock(); vsock = rcu_dereference(the_virtio_vsock); @@ -212,20 +199,7 @@ virtio_transport_cancel_pkt(struct vsock_sock *vsk) goto out_rcu; } - spin_lock_bh(&vsock->send_pkt_list_lock); - list_for_each_entry_safe(pkt, n, &vsock->send_pkt_list, list) { - if (pkt->vsk != vsk) - continue; - list_move(&pkt->list, &freeme); - } - spin_unlock_bh(&vsock->send_pkt_list_lock); - - list_for_each_entry_safe(pkt, n, &freeme, list) { - if (pkt->reply) - cnt++; - list_del(&pkt->list); - virtio_transport_free_pkt(pkt); - } + cnt = virtio_transport_purge_skbs(vsk, &vsock->send_pkt_queue); if (cnt) { struct virtqueue *rx_vq = vsock->vqs[VSOCK_VQ_RX]; @@ -246,38 +220,28 @@ out_rcu: static void virtio_vsock_rx_fill(struct virtio_vsock *vsock) { - int buf_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE; - struct virtio_vsock_pkt *pkt; - struct scatterlist hdr, buf, *sgs[2]; + int total_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE + VIRTIO_VSOCK_SKB_HEADROOM; + struct scatterlist pkt, *p; struct virtqueue *vq; + struct sk_buff *skb; int ret; vq = vsock->vqs[VSOCK_VQ_RX]; do { - pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); - if (!pkt) + skb = virtio_vsock_alloc_skb(total_len, GFP_KERNEL); + if (!skb) break; - pkt->buf = kmalloc(buf_len, GFP_KERNEL); - if (!pkt->buf) { - virtio_transport_free_pkt(pkt); + memset(skb->head, 0, VIRTIO_VSOCK_SKB_HEADROOM); + sg_init_one(&pkt, virtio_vsock_hdr(skb), total_len); + p = &pkt; + ret = virtqueue_add_sgs(vq, &p, 0, 1, skb, GFP_KERNEL); + if (ret < 0) { + kfree_skb(skb); break; } - pkt->buf_len = buf_len; - pkt->len = buf_len; - - sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr)); - sgs[0] = &hdr; - - sg_init_one(&buf, pkt->buf, buf_len); - sgs[1] = &buf; - ret = virtqueue_add_sgs(vq, sgs, 0, 2, pkt, GFP_KERNEL); - if (ret) { - virtio_transport_free_pkt(pkt); - break; - } vsock->rx_buf_nr++; } while (vq->num_free); if (vsock->rx_buf_nr > vsock->rx_buf_max_nr) @@ -299,12 +263,12 @@ static void virtio_transport_tx_work(struct work_struct *work) goto out; do { - struct virtio_vsock_pkt *pkt; + struct sk_buff *skb; unsigned int len; virtqueue_disable_cb(vq); - while ((pkt = virtqueue_get_buf(vq, &len)) != NULL) { - virtio_transport_free_pkt(pkt); + while ((skb = virtqueue_get_buf(vq, &len)) != NULL) { + consume_skb(skb); added = true; } } while (!virtqueue_enable_cb(vq)); @@ -529,7 +493,7 @@ static void virtio_transport_rx_work(struct work_struct *work) do { virtqueue_disable_cb(vq); for (;;) { - struct virtio_vsock_pkt *pkt; + struct sk_buff *skb; unsigned int len; if (!virtio_transport_more_replies(vsock)) { @@ -540,23 +504,22 @@ static void virtio_transport_rx_work(struct work_struct *work) goto out; } - pkt = virtqueue_get_buf(vq, &len); - if (!pkt) { + skb = virtqueue_get_buf(vq, &len); + if (!skb) break; - } vsock->rx_buf_nr--; /* Drop short/long packets */ - if (unlikely(len < sizeof(pkt->hdr) || - len > sizeof(pkt->hdr) + pkt->len)) { - virtio_transport_free_pkt(pkt); + if (unlikely(len < sizeof(struct virtio_vsock_hdr) || + len > virtio_vsock_skb_len(skb))) { + kfree_skb(skb); continue; } - pkt->len = len - sizeof(pkt->hdr); - virtio_transport_deliver_tap_pkt(pkt); - virtio_transport_recv_pkt(&virtio_transport, pkt); + virtio_vsock_skb_rx_put(skb); + virtio_transport_deliver_tap_pkt(skb); + virtio_transport_recv_pkt(&virtio_transport, skb); } } while (!virtqueue_enable_cb(vq)); @@ -610,7 +573,7 @@ static int virtio_vsock_vqs_init(struct virtio_vsock *vsock) static void virtio_vsock_vqs_del(struct virtio_vsock *vsock) { struct virtio_device *vdev = vsock->vdev; - struct virtio_vsock_pkt *pkt; + struct sk_buff *skb; /* Reset all connected sockets when the VQs disappear */ vsock_for_each_connected_socket(&virtio_transport.transport, @@ -637,23 +600,16 @@ static void virtio_vsock_vqs_del(struct virtio_vsock *vsock) virtio_reset_device(vdev); mutex_lock(&vsock->rx_lock); - while ((pkt = virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_RX]))) - virtio_transport_free_pkt(pkt); + while ((skb = virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_RX]))) + kfree_skb(skb); mutex_unlock(&vsock->rx_lock); mutex_lock(&vsock->tx_lock); - while ((pkt = virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_TX]))) - virtio_transport_free_pkt(pkt); + while ((skb = virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_TX]))) + kfree_skb(skb); mutex_unlock(&vsock->tx_lock); - spin_lock_bh(&vsock->send_pkt_list_lock); - while (!list_empty(&vsock->send_pkt_list)) { - pkt = list_first_entry(&vsock->send_pkt_list, - struct virtio_vsock_pkt, list); - list_del(&pkt->list); - virtio_transport_free_pkt(pkt); - } - spin_unlock_bh(&vsock->send_pkt_list_lock); + virtio_vsock_skb_queue_purge(&vsock->send_pkt_queue); /* Delete virtqueues and flush outstanding callbacks if any */ vdev->config->del_vqs(vdev); @@ -690,8 +646,7 @@ static int virtio_vsock_probe(struct virtio_device *vdev) mutex_init(&vsock->tx_lock); mutex_init(&vsock->rx_lock); mutex_init(&vsock->event_lock); - spin_lock_init(&vsock->send_pkt_list_lock); - INIT_LIST_HEAD(&vsock->send_pkt_list); + skb_queue_head_init(&vsock->send_pkt_queue); INIT_WORK(&vsock->rx_work, virtio_transport_rx_work); INIT_WORK(&vsock->tx_work, virtio_transport_tx_work); INIT_WORK(&vsock->event_work, virtio_transport_event_work); diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index a9980e9b9304..a1581c77cf84 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -37,53 +37,56 @@ virtio_transport_get_ops(struct vsock_sock *vsk) return container_of(t, struct virtio_transport, transport); } -static struct virtio_vsock_pkt * -virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info, +/* Returns a new packet on success, otherwise returns NULL. + * + * If NULL is returned, errp is set to a negative errno. + */ +static struct sk_buff * +virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info, size_t len, u32 src_cid, u32 src_port, u32 dst_cid, u32 dst_port) { - struct virtio_vsock_pkt *pkt; + const size_t skb_len = VIRTIO_VSOCK_SKB_HEADROOM + len; + struct virtio_vsock_hdr *hdr; + struct sk_buff *skb; + void *payload; int err; - pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); - if (!pkt) + skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL); + if (!skb) return NULL; - pkt->hdr.type = cpu_to_le16(info->type); - pkt->hdr.op = cpu_to_le16(info->op); - pkt->hdr.src_cid = cpu_to_le64(src_cid); - pkt->hdr.dst_cid = cpu_to_le64(dst_cid); - pkt->hdr.src_port = cpu_to_le32(src_port); - pkt->hdr.dst_port = cpu_to_le32(dst_port); - pkt->hdr.flags = cpu_to_le32(info->flags); - pkt->len = len; - pkt->hdr.len = cpu_to_le32(len); - pkt->reply = info->reply; - pkt->vsk = info->vsk; + hdr = virtio_vsock_hdr(skb); + hdr->type = cpu_to_le16(info->type); + hdr->op = cpu_to_le16(info->op); + hdr->src_cid = cpu_to_le64(src_cid); + hdr->dst_cid = cpu_to_le64(dst_cid); + hdr->src_port = cpu_to_le32(src_port); + hdr->dst_port = cpu_to_le32(dst_port); + hdr->flags = cpu_to_le32(info->flags); + hdr->len = cpu_to_le32(len); if (info->msg && len > 0) { - pkt->buf = kmalloc(len, GFP_KERNEL); - if (!pkt->buf) - goto out_pkt; - - pkt->buf_len = len; - - err = memcpy_from_msg(pkt->buf, info->msg, len); + payload = skb_put(skb, len); + err = memcpy_from_msg(payload, info->msg, len); if (err) goto out; if (msg_data_left(info->msg) == 0 && info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) { - pkt->hdr.flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM); + hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM); if (info->msg->msg_flags & MSG_EOR) - pkt->hdr.flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR); + hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR); } } + if (info->reply) + virtio_vsock_skb_set_reply(skb); + trace_virtio_transport_alloc_pkt(src_cid, src_port, dst_cid, dst_port, len, @@ -91,19 +94,18 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info, info->op, info->flags); - return pkt; + return skb; out: - kfree(pkt->buf); -out_pkt: - kfree(pkt); + kfree_skb(skb); return NULL; } /* Packet capture */ static struct sk_buff *virtio_transport_build_skb(void *opaque) { - struct virtio_vsock_pkt *pkt = opaque; + struct virtio_vsock_hdr *pkt_hdr; + struct sk_buff *pkt = opaque; struct af_vsockmon_hdr *hdr; struct sk_buff *skb; size_t payload_len; @@ -113,10 +115,11 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) * the payload length from the header and the buffer pointer taking * care of the offset in the original packet. */ - payload_len = le32_to_cpu(pkt->hdr.len); - payload_buf = pkt->buf + pkt->off; + pkt_hdr = virtio_vsock_hdr(pkt); + payload_len = pkt->len; + payload_buf = pkt->data; - skb = alloc_skb(sizeof(*hdr) + sizeof(pkt->hdr) + payload_len, + skb = alloc_skb(sizeof(*hdr) + sizeof(*pkt_hdr) + payload_len, GFP_ATOMIC); if (!skb) return NULL; @@ -124,16 +127,16 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) hdr = skb_put(skb, sizeof(*hdr)); /* pkt->hdr is little-endian so no need to byteswap here */ - hdr->src_cid = pkt->hdr.src_cid; - hdr->src_port = pkt->hdr.src_port; - hdr->dst_cid = pkt->hdr.dst_cid; - hdr->dst_port = pkt->hdr.dst_port; + hdr->src_cid = pkt_hdr->src_cid; + hdr->src_port = pkt_hdr->src_port; + hdr->dst_cid = pkt_hdr->dst_cid; + hdr->dst_port = pkt_hdr->dst_port; hdr->transport = cpu_to_le16(AF_VSOCK_TRANSPORT_VIRTIO); - hdr->len = cpu_to_le16(sizeof(pkt->hdr)); + hdr->len = cpu_to_le16(sizeof(*pkt_hdr)); memset(hdr->reserved, 0, sizeof(hdr->reserved)); - switch (le16_to_cpu(pkt->hdr.op)) { + switch (le16_to_cpu(pkt_hdr->op)) { case VIRTIO_VSOCK_OP_REQUEST: case VIRTIO_VSOCK_OP_RESPONSE: hdr->op = cpu_to_le16(AF_VSOCK_OP_CONNECT); @@ -154,7 +157,7 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) break; } - skb_put_data(skb, &pkt->hdr, sizeof(pkt->hdr)); + skb_put_data(skb, pkt_hdr, sizeof(*pkt_hdr)); if (payload_len) { skb_put_data(skb, payload_buf, payload_len); @@ -163,13 +166,13 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) return skb; } -void virtio_transport_deliver_tap_pkt(struct virtio_vsock_pkt *pkt) +void virtio_transport_deliver_tap_pkt(struct sk_buff *skb) { - if (pkt->tap_delivered) + if (virtio_vsock_skb_tap_delivered(skb)) return; - vsock_deliver_tap(virtio_transport_build_skb, pkt); - pkt->tap_delivered = true; + vsock_deliver_tap(virtio_transport_build_skb, skb); + virtio_vsock_skb_set_tap_delivered(skb); } EXPORT_SYMBOL_GPL(virtio_transport_deliver_tap_pkt); @@ -192,8 +195,8 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, u32 src_cid, src_port, dst_cid, dst_port; const struct virtio_transport *t_ops; struct virtio_vsock_sock *vvs; - struct virtio_vsock_pkt *pkt; u32 pkt_len = info->pkt_len; + struct sk_buff *skb; info->type = virtio_transport_get_type(sk_vsock(vsk)); @@ -224,42 +227,47 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW) return pkt_len; - pkt = virtio_transport_alloc_pkt(info, pkt_len, + skb = virtio_transport_alloc_skb(info, pkt_len, src_cid, src_port, dst_cid, dst_port); - if (!pkt) { + if (!skb) { virtio_transport_put_credit(vvs, pkt_len); return -ENOMEM; } - virtio_transport_inc_tx_pkt(vvs, pkt); + virtio_transport_inc_tx_pkt(vvs, skb); - return t_ops->send_pkt(pkt); + return t_ops->send_pkt(skb); } static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, - struct virtio_vsock_pkt *pkt) + struct sk_buff *skb) { - if (vvs->rx_bytes + pkt->len > vvs->buf_alloc) + if (vvs->rx_bytes + skb->len > vvs->buf_alloc) return false; - vvs->rx_bytes += pkt->len; + vvs->rx_bytes += skb->len; return true; } static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs, - struct virtio_vsock_pkt *pkt) + struct sk_buff *skb) { - vvs->rx_bytes -= pkt->len; - vvs->fwd_cnt += pkt->len; + int len; + + len = skb_headroom(skb) - sizeof(struct virtio_vsock_hdr) - skb->len; + vvs->rx_bytes -= len; + vvs->fwd_cnt += len; } -void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt) +void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct sk_buff *skb) { + struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); + spin_lock_bh(&vvs->rx_lock); vvs->last_fwd_cnt = vvs->fwd_cnt; - pkt->hdr.fwd_cnt = cpu_to_le32(vvs->fwd_cnt); - pkt->hdr.buf_alloc = cpu_to_le32(vvs->buf_alloc); + hdr->fwd_cnt = cpu_to_le32(vvs->fwd_cnt); + hdr->buf_alloc = cpu_to_le32(vvs->buf_alloc); spin_unlock_bh(&vvs->rx_lock); } EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt); @@ -303,29 +311,29 @@ virtio_transport_stream_do_peek(struct vsock_sock *vsk, size_t len) { struct virtio_vsock_sock *vvs = vsk->trans; - struct virtio_vsock_pkt *pkt; size_t bytes, total = 0, off; + struct sk_buff *skb, *tmp; int err = -EFAULT; spin_lock_bh(&vvs->rx_lock); - list_for_each_entry(pkt, &vvs->rx_queue, list) { - off = pkt->off; + skb_queue_walk_safe(&vvs->rx_queue, skb, tmp) { + off = 0; if (total == len) break; - while (total < len && off < pkt->len) { + while (total < len && off < skb->len) { bytes = len - total; - if (bytes > pkt->len - off) - bytes = pkt->len - off; + if (bytes > skb->len - off) + bytes = skb->len - off; /* sk_lock is held by caller so no one else can dequeue. * Unlock rx_lock since memcpy_to_msg() may sleep. */ spin_unlock_bh(&vvs->rx_lock); - err = memcpy_to_msg(msg, pkt->buf + off, bytes); + err = memcpy_to_msg(msg, skb->data + off, bytes); if (err) goto out; @@ -352,37 +360,38 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, size_t len) { struct virtio_vsock_sock *vvs = vsk->trans; - struct virtio_vsock_pkt *pkt; size_t bytes, total = 0; - u32 free_space; + struct sk_buff *skb; int err = -EFAULT; + u32 free_space; spin_lock_bh(&vvs->rx_lock); - while (total < len && !list_empty(&vvs->rx_queue)) { - pkt = list_first_entry(&vvs->rx_queue, - struct virtio_vsock_pkt, list); + while (total < len && !skb_queue_empty(&vvs->rx_queue)) { + skb = __skb_dequeue(&vvs->rx_queue); bytes = len - total; - if (bytes > pkt->len - pkt->off) - bytes = pkt->len - pkt->off; + if (bytes > skb->len) + bytes = skb->len; /* sk_lock is held by caller so no one else can dequeue. * Unlock rx_lock since memcpy_to_msg() may sleep. */ spin_unlock_bh(&vvs->rx_lock); - err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes); + err = memcpy_to_msg(msg, skb->data, bytes); if (err) goto out; spin_lock_bh(&vvs->rx_lock); total += bytes; - pkt->off += bytes; - if (pkt->off == pkt->len) { - virtio_transport_dec_rx_pkt(vvs, pkt); - list_del(&pkt->list); - virtio_transport_free_pkt(pkt); + skb_pull(skb, bytes); + + if (skb->len == 0) { + virtio_transport_dec_rx_pkt(vvs, skb); + consume_skb(skb); + } else { + __skb_queue_head(&vvs->rx_queue, skb); } } @@ -414,10 +423,10 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, int flags) { struct virtio_vsock_sock *vvs = vsk->trans; - struct virtio_vsock_pkt *pkt; int dequeued_len = 0; size_t user_buf_len = msg_data_left(msg); bool msg_ready = false; + struct sk_buff *skb; spin_lock_bh(&vvs->rx_lock); @@ -427,13 +436,18 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, } while (!msg_ready) { - pkt = list_first_entry(&vvs->rx_queue, struct virtio_vsock_pkt, list); + struct virtio_vsock_hdr *hdr; + + skb = __skb_dequeue(&vvs->rx_queue); + if (!skb) + break; + hdr = virtio_vsock_hdr(skb); if (dequeued_len >= 0) { size_t pkt_len; size_t bytes_to_copy; - pkt_len = (size_t)le32_to_cpu(pkt->hdr.len); + pkt_len = (size_t)le32_to_cpu(hdr->len); bytes_to_copy = min(user_buf_len, pkt_len); if (bytes_to_copy) { @@ -444,7 +458,7 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, */ spin_unlock_bh(&vvs->rx_lock); - err = memcpy_to_msg(msg, pkt->buf, bytes_to_copy); + err = memcpy_to_msg(msg, skb->data, bytes_to_copy); if (err) { /* Copy of message failed. Rest of * fragments will be freed without copy. @@ -452,6 +466,7 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, dequeued_len = err; } else { user_buf_len -= bytes_to_copy; + skb_pull(skb, bytes_to_copy); } spin_lock_bh(&vvs->rx_lock); @@ -461,17 +476,16 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, dequeued_len += pkt_len; } - if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM) { + if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) { msg_ready = true; vvs->msg_count--; - if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR) + if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOR) msg->msg_flags |= MSG_EOR; } - virtio_transport_dec_rx_pkt(vvs, pkt); - list_del(&pkt->list); - virtio_transport_free_pkt(pkt); + virtio_transport_dec_rx_pkt(vvs, skb); + kfree_skb(skb); } spin_unlock_bh(&vvs->rx_lock); @@ -609,7 +623,7 @@ int virtio_transport_do_socket_init(struct vsock_sock *vsk, spin_lock_init(&vvs->rx_lock); spin_lock_init(&vvs->tx_lock); - INIT_LIST_HEAD(&vvs->rx_queue); + skb_queue_head_init(&vvs->rx_queue); return 0; } @@ -806,16 +820,16 @@ void virtio_transport_destruct(struct vsock_sock *vsk) EXPORT_SYMBOL_GPL(virtio_transport_destruct); static int virtio_transport_reset(struct vsock_sock *vsk, - struct virtio_vsock_pkt *pkt) + struct sk_buff *skb) { struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_RST, - .reply = !!pkt, + .reply = !!skb, .vsk = vsk, }; /* Send RST only if the original pkt is not a RST pkt */ - if (pkt && le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST) + if (skb && le16_to_cpu(virtio_vsock_hdr(skb)->op) == VIRTIO_VSOCK_OP_RST) return 0; return virtio_transport_send_pkt_info(vsk, &info); @@ -825,29 +839,30 @@ static int virtio_transport_reset(struct vsock_sock *vsk, * attempt was made to connect to a socket that does not exist. */ static int virtio_transport_reset_no_sock(const struct virtio_transport *t, - struct virtio_vsock_pkt *pkt) + struct sk_buff *skb) { - struct virtio_vsock_pkt *reply; + struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_RST, - .type = le16_to_cpu(pkt->hdr.type), + .type = le16_to_cpu(hdr->type), .reply = true, }; + struct sk_buff *reply; /* Send RST only if the original pkt is not a RST pkt */ - if (le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST) + if (le16_to_cpu(hdr->op) == VIRTIO_VSOCK_OP_RST) return 0; - reply = virtio_transport_alloc_pkt(&info, 0, - le64_to_cpu(pkt->hdr.dst_cid), - le32_to_cpu(pkt->hdr.dst_port), - le64_to_cpu(pkt->hdr.src_cid), - le32_to_cpu(pkt->hdr.src_port)); + reply = virtio_transport_alloc_skb(&info, 0, + le64_to_cpu(hdr->dst_cid), + le32_to_cpu(hdr->dst_port), + le64_to_cpu(hdr->src_cid), + le32_to_cpu(hdr->src_port)); if (!reply) return -ENOMEM; if (!t) { - virtio_transport_free_pkt(reply); + kfree_skb(reply); return -ENOTCONN; } @@ -858,16 +873,11 @@ static int virtio_transport_reset_no_sock(const struct virtio_transport *t, static void virtio_transport_remove_sock(struct vsock_sock *vsk) { struct virtio_vsock_sock *vvs = vsk->trans; - struct virtio_vsock_pkt *pkt, *tmp; /* We don't need to take rx_lock, as the socket is closing and we are * removing it. */ - list_for_each_entry_safe(pkt, tmp, &vvs->rx_queue, list) { - list_del(&pkt->list); - virtio_transport_free_pkt(pkt); - } - + __skb_queue_purge(&vvs->rx_queue); vsock_remove_sock(vsk); } @@ -981,13 +991,14 @@ EXPORT_SYMBOL_GPL(virtio_transport_release); static int virtio_transport_recv_connecting(struct sock *sk, - struct virtio_vsock_pkt *pkt) + struct sk_buff *skb) { + struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct vsock_sock *vsk = vsock_sk(sk); - int err; int skerr; + int err; - switch (le16_to_cpu(pkt->hdr.op)) { + switch (le16_to_cpu(hdr->op)) { case VIRTIO_VSOCK_OP_RESPONSE: sk->sk_state = TCP_ESTABLISHED; sk->sk_socket->state = SS_CONNECTED; @@ -1008,7 +1019,7 @@ virtio_transport_recv_connecting(struct sock *sk, return 0; destroy: - virtio_transport_reset(vsk, pkt); + virtio_transport_reset(vsk, skb); sk->sk_state = TCP_CLOSE; sk->sk_err = skerr; sk_error_report(sk); @@ -1017,34 +1028,37 @@ destroy: static void virtio_transport_recv_enqueue(struct vsock_sock *vsk, - struct virtio_vsock_pkt *pkt) + struct sk_buff *skb) { struct virtio_vsock_sock *vvs = vsk->trans; bool can_enqueue, free_pkt = false; + struct virtio_vsock_hdr *hdr; + u32 len; - pkt->len = le32_to_cpu(pkt->hdr.len); - pkt->off = 0; + hdr = virtio_vsock_hdr(skb); + len = le32_to_cpu(hdr->len); spin_lock_bh(&vvs->rx_lock); - can_enqueue = virtio_transport_inc_rx_pkt(vvs, pkt); + can_enqueue = virtio_transport_inc_rx_pkt(vvs, skb); if (!can_enqueue) { free_pkt = true; goto out; } - if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM) + if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) vvs->msg_count++; /* Try to copy small packets into the buffer of last packet queued, * to avoid wasting memory queueing the entire buffer with a small * payload. */ - if (pkt->len <= GOOD_COPY_LEN && !list_empty(&vvs->rx_queue)) { - struct virtio_vsock_pkt *last_pkt; + if (len <= GOOD_COPY_LEN && !skb_queue_empty(&vvs->rx_queue)) { + struct virtio_vsock_hdr *last_hdr; + struct sk_buff *last_skb; - last_pkt = list_last_entry(&vvs->rx_queue, - struct virtio_vsock_pkt, list); + last_skb = skb_peek_tail(&vvs->rx_queue); + last_hdr = virtio_vsock_hdr(last_skb); /* If there is space in the last packet queued, we copy the * new packet in its buffer. We avoid this if the last packet @@ -1052,35 +1066,35 @@ virtio_transport_recv_enqueue(struct vsock_sock *vsk, * delimiter of SEQPACKET message, so 'pkt' is the first packet * of a new message. */ - if ((pkt->len <= last_pkt->buf_len - last_pkt->len) && - !(le32_to_cpu(last_pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM)) { - memcpy(last_pkt->buf + last_pkt->len, pkt->buf, - pkt->len); - last_pkt->len += pkt->len; + if (skb->len < skb_tailroom(last_skb) && + !(le32_to_cpu(last_hdr->flags) & VIRTIO_VSOCK_SEQ_EOM)) { + memcpy(skb_put(last_skb, skb->len), skb->data, skb->len); free_pkt = true; - last_pkt->hdr.flags |= pkt->hdr.flags; + last_hdr->flags |= hdr->flags; + last_hdr->len = cpu_to_le32(last_skb->len); goto out; } } - list_add_tail(&pkt->list, &vvs->rx_queue); + __skb_queue_tail(&vvs->rx_queue, skb); out: spin_unlock_bh(&vvs->rx_lock); if (free_pkt) - virtio_transport_free_pkt(pkt); + kfree_skb(skb); } static int virtio_transport_recv_connected(struct sock *sk, - struct virtio_vsock_pkt *pkt) + struct sk_buff *skb) { + struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct vsock_sock *vsk = vsock_sk(sk); int err = 0; - switch (le16_to_cpu(pkt->hdr.op)) { + switch (le16_to_cpu(hdr->op)) { case VIRTIO_VSOCK_OP_RW: - virtio_transport_recv_enqueue(vsk, pkt); + virtio_transport_recv_enqueue(vsk, skb); vsock_data_ready(sk); return err; case VIRTIO_VSOCK_OP_CREDIT_REQUEST: @@ -1090,18 +1104,17 @@ virtio_transport_recv_connected(struct sock *sk, sk->sk_write_space(sk); break; case VIRTIO_VSOCK_OP_SHUTDOWN: - if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_RCV) + if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SHUTDOWN_RCV) vsk->peer_shutdown |= RCV_SHUTDOWN; - if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_SEND) + if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SHUTDOWN_SEND) vsk->peer_shutdown |= SEND_SHUTDOWN; if (vsk->peer_shutdown == SHUTDOWN_MASK && vsock_stream_has_data(vsk) <= 0 && !sock_flag(sk, SOCK_DONE)) { (void)virtio_transport_reset(vsk, NULL); - virtio_transport_do_close(vsk, true); } - if (le32_to_cpu(pkt->hdr.flags)) + if (le32_to_cpu(virtio_vsock_hdr(skb)->flags)) sk->sk_state_change(sk); break; case VIRTIO_VSOCK_OP_RST: @@ -1112,28 +1125,30 @@ virtio_transport_recv_connected(struct sock *sk, break; } - virtio_transport_free_pkt(pkt); + kfree_skb(skb); return err; } static void virtio_transport_recv_disconnecting(struct sock *sk, - struct virtio_vsock_pkt *pkt) + struct sk_buff *skb) { + struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct vsock_sock *vsk = vsock_sk(sk); - if (le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST) + if (le16_to_cpu(hdr->op) == VIRTIO_VSOCK_OP_RST) virtio_transport_do_close(vsk, true); } static int virtio_transport_send_response(struct vsock_sock *vsk, - struct virtio_vsock_pkt *pkt) + struct sk_buff *skb) { + struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_RESPONSE, - .remote_cid = le64_to_cpu(pkt->hdr.src_cid), - .remote_port = le32_to_cpu(pkt->hdr.src_port), + .remote_cid = le64_to_cpu(hdr->src_cid), + .remote_port = le32_to_cpu(hdr->src_port), .reply = true, .vsk = vsk, }; @@ -1142,8 +1157,9 @@ virtio_transport_send_response(struct vsock_sock *vsk, } static bool virtio_transport_space_update(struct sock *sk, - struct virtio_vsock_pkt *pkt) + struct sk_buff *skb) { + struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct vsock_sock *vsk = vsock_sk(sk); struct virtio_vsock_sock *vvs = vsk->trans; bool space_available; @@ -1158,8 +1174,8 @@ static bool virtio_transport_space_update(struct sock *sk, /* buf_alloc and fwd_cnt is always included in the hdr */ spin_lock_bh(&vvs->tx_lock); - vvs->peer_buf_alloc = le32_to_cpu(pkt->hdr.buf_alloc); - vvs->peer_fwd_cnt = le32_to_cpu(pkt->hdr.fwd_cnt); + vvs->peer_buf_alloc = le32_to_cpu(hdr->buf_alloc); + vvs->peer_fwd_cnt = le32_to_cpu(hdr->fwd_cnt); space_available = virtio_transport_has_space(vsk); spin_unlock_bh(&vvs->tx_lock); return space_available; @@ -1167,27 +1183,28 @@ static bool virtio_transport_space_update(struct sock *sk, /* Handle server socket */ static int -virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt, +virtio_transport_recv_listen(struct sock *sk, struct sk_buff *skb, struct virtio_transport *t) { + struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct vsock_sock *vsk = vsock_sk(sk); struct vsock_sock *vchild; struct sock *child; int ret; - if (le16_to_cpu(pkt->hdr.op) != VIRTIO_VSOCK_OP_REQUEST) { - virtio_transport_reset_no_sock(t, pkt); + if (le16_to_cpu(hdr->op) != VIRTIO_VSOCK_OP_REQUEST) { + virtio_transport_reset_no_sock(t, skb); return -EINVAL; } if (sk_acceptq_is_full(sk)) { - virtio_transport_reset_no_sock(t, pkt); + virtio_transport_reset_no_sock(t, skb); return -ENOMEM; } child = vsock_create_connected(sk); if (!child) { - virtio_transport_reset_no_sock(t, pkt); + virtio_transport_reset_no_sock(t, skb); return -ENOMEM; } @@ -1198,10 +1215,10 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt, child->sk_state = TCP_ESTABLISHED; vchild = vsock_sk(child); - vsock_addr_init(&vchild->local_addr, le64_to_cpu(pkt->hdr.dst_cid), - le32_to_cpu(pkt->hdr.dst_port)); - vsock_addr_init(&vchild->remote_addr, le64_to_cpu(pkt->hdr.src_cid), - le32_to_cpu(pkt->hdr.src_port)); + vsock_addr_init(&vchild->local_addr, le64_to_cpu(hdr->dst_cid), + le32_to_cpu(hdr->dst_port)); + vsock_addr_init(&vchild->remote_addr, le64_to_cpu(hdr->src_cid), + le32_to_cpu(hdr->src_port)); ret = vsock_assign_transport(vchild, vsk); /* Transport assigned (looking at remote_addr) must be the same @@ -1209,17 +1226,17 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt, */ if (ret || vchild->transport != &t->transport) { release_sock(child); - virtio_transport_reset_no_sock(t, pkt); + virtio_transport_reset_no_sock(t, skb); sock_put(child); return ret; } - if (virtio_transport_space_update(child, pkt)) + if (virtio_transport_space_update(child, skb)) child->sk_write_space(child); vsock_insert_connected(vchild); vsock_enqueue_accept(sk, child); - virtio_transport_send_response(vchild, pkt); + virtio_transport_send_response(vchild, skb); release_sock(child); @@ -1237,29 +1254,30 @@ static bool virtio_transport_valid_type(u16 type) * lock. */ void virtio_transport_recv_pkt(struct virtio_transport *t, - struct virtio_vsock_pkt *pkt) + struct sk_buff *skb) { + struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct sockaddr_vm src, dst; struct vsock_sock *vsk; struct sock *sk; bool space_available; - vsock_addr_init(&src, le64_to_cpu(pkt->hdr.src_cid), - le32_to_cpu(pkt->hdr.src_port)); - vsock_addr_init(&dst, le64_to_cpu(pkt->hdr.dst_cid), - le32_to_cpu(pkt->hdr.dst_port)); + vsock_addr_init(&src, le64_to_cpu(hdr->src_cid), + le32_to_cpu(hdr->src_port)); + vsock_addr_init(&dst, le64_to_cpu(hdr->dst_cid), + le32_to_cpu(hdr->dst_port)); trace_virtio_transport_recv_pkt(src.svm_cid, src.svm_port, dst.svm_cid, dst.svm_port, - le32_to_cpu(pkt->hdr.len), - le16_to_cpu(pkt->hdr.type), - le16_to_cpu(pkt->hdr.op), - le32_to_cpu(pkt->hdr.flags), - le32_to_cpu(pkt->hdr.buf_alloc), - le32_to_cpu(pkt->hdr.fwd_cnt)); - - if (!virtio_transport_valid_type(le16_to_cpu(pkt->hdr.type))) { - (void)virtio_transport_reset_no_sock(t, pkt); + le32_to_cpu(hdr->len), + le16_to_cpu(hdr->type), + le16_to_cpu(hdr->op), + le32_to_cpu(hdr->flags), + le32_to_cpu(hdr->buf_alloc), + le32_to_cpu(hdr->fwd_cnt)); + + if (!virtio_transport_valid_type(le16_to_cpu(hdr->type))) { + (void)virtio_transport_reset_no_sock(t, skb); goto free_pkt; } @@ -1270,13 +1288,13 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, if (!sk) { sk = vsock_find_bound_socket(&dst); if (!sk) { - (void)virtio_transport_reset_no_sock(t, pkt); + (void)virtio_transport_reset_no_sock(t, skb); goto free_pkt; } } - if (virtio_transport_get_type(sk) != le16_to_cpu(pkt->hdr.type)) { - (void)virtio_transport_reset_no_sock(t, pkt); + if (virtio_transport_get_type(sk) != le16_to_cpu(hdr->type)) { + (void)virtio_transport_reset_no_sock(t, skb); sock_put(sk); goto free_pkt; } @@ -1287,13 +1305,13 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, /* Check if sk has been closed before lock_sock */ if (sock_flag(sk, SOCK_DONE)) { - (void)virtio_transport_reset_no_sock(t, pkt); + (void)virtio_transport_reset_no_sock(t, skb); release_sock(sk); sock_put(sk); goto free_pkt; } - space_available = virtio_transport_space_update(sk, pkt); + space_available = virtio_transport_space_update(sk, skb); /* Update CID in case it has changed after a transport reset event */ if (vsk->local_addr.svm_cid != VMADDR_CID_ANY) @@ -1304,23 +1322,23 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, switch (sk->sk_state) { case TCP_LISTEN: - virtio_transport_recv_listen(sk, pkt, t); - virtio_transport_free_pkt(pkt); + virtio_transport_recv_listen(sk, skb, t); + kfree_skb(skb); break; case TCP_SYN_SENT: - virtio_transport_recv_connecting(sk, pkt); - virtio_transport_free_pkt(pkt); + virtio_transport_recv_connecting(sk, skb); + kfree_skb(skb); break; case TCP_ESTABLISHED: - virtio_transport_recv_connected(sk, pkt); + virtio_transport_recv_connected(sk, skb); break; case TCP_CLOSING: - virtio_transport_recv_disconnecting(sk, pkt); - virtio_transport_free_pkt(pkt); + virtio_transport_recv_disconnecting(sk, skb); + kfree_skb(skb); break; default: - (void)virtio_transport_reset_no_sock(t, pkt); - virtio_transport_free_pkt(pkt); + (void)virtio_transport_reset_no_sock(t, skb); + kfree_skb(skb); break; } @@ -1333,16 +1351,42 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, return; free_pkt: - virtio_transport_free_pkt(pkt); + kfree_skb(skb); } EXPORT_SYMBOL_GPL(virtio_transport_recv_pkt); -void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt) +/* Remove skbs found in a queue that have a vsk that matches. + * + * Each skb is freed. + * + * Returns the count of skbs that were reply packets. + */ +int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *queue) { - kvfree(pkt->buf); - kfree(pkt); + struct sk_buff_head freeme; + struct sk_buff *skb, *tmp; + int cnt = 0; + + skb_queue_head_init(&freeme); + + spin_lock_bh(&queue->lock); + skb_queue_walk_safe(queue, skb, tmp) { + if (vsock_sk(skb->sk) != vsk) + continue; + + __skb_unlink(skb, queue); + __skb_queue_tail(&freeme, skb); + + if (virtio_vsock_skb_reply(skb)) + cnt++; + } + spin_unlock_bh(&queue->lock); + + __skb_queue_purge(&freeme); + + return cnt; } -EXPORT_SYMBOL_GPL(virtio_transport_free_pkt); +EXPORT_SYMBOL_GPL(virtio_transport_purge_skbs); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Asias He"); diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c index 169a8cf65b39..671e03240fc5 100644 --- a/net/vmw_vsock/vsock_loopback.c +++ b/net/vmw_vsock/vsock_loopback.c @@ -16,7 +16,7 @@ struct vsock_loopback { struct workqueue_struct *workqueue; spinlock_t pkt_list_lock; /* protects pkt_list */ - struct list_head pkt_list; + struct sk_buff_head pkt_queue; struct work_struct pkt_work; }; @@ -27,13 +27,13 @@ static u32 vsock_loopback_get_local_cid(void) return VMADDR_CID_LOCAL; } -static int vsock_loopback_send_pkt(struct virtio_vsock_pkt *pkt) +static int vsock_loopback_send_pkt(struct sk_buff *skb) { struct vsock_loopback *vsock = &the_vsock_loopback; - int len = pkt->len; + int len = skb->len; spin_lock_bh(&vsock->pkt_list_lock); - list_add_tail(&pkt->list, &vsock->pkt_list); + skb_queue_tail(&vsock->pkt_queue, skb); spin_unlock_bh(&vsock->pkt_list_lock); queue_work(vsock->workqueue, &vsock->pkt_work); @@ -44,21 +44,8 @@ static int vsock_loopback_send_pkt(struct virtio_vsock_pkt *pkt) static int vsock_loopback_cancel_pkt(struct vsock_sock *vsk) { struct vsock_loopback *vsock = &the_vsock_loopback; - struct virtio_vsock_pkt *pkt, *n; - LIST_HEAD(freeme); - spin_lock_bh(&vsock->pkt_list_lock); - list_for_each_entry_safe(pkt, n, &vsock->pkt_list, list) { - if (pkt->vsk != vsk) - continue; - list_move(&pkt->list, &freeme); - } - spin_unlock_bh(&vsock->pkt_list_lock); - - list_for_each_entry_safe(pkt, n, &freeme, list) { - list_del(&pkt->list); - virtio_transport_free_pkt(pkt); - } + virtio_transport_purge_skbs(vsk, &vsock->pkt_queue); return 0; } @@ -121,20 +108,18 @@ static void vsock_loopback_work(struct work_struct *work) { struct vsock_loopback *vsock = container_of(work, struct vsock_loopback, pkt_work); - LIST_HEAD(pkts); + struct sk_buff_head pkts; + struct sk_buff *skb; + + skb_queue_head_init(&pkts); spin_lock_bh(&vsock->pkt_list_lock); - list_splice_init(&vsock->pkt_list, &pkts); + skb_queue_splice_init(&vsock->pkt_queue, &pkts); spin_unlock_bh(&vsock->pkt_list_lock); - while (!list_empty(&pkts)) { - struct virtio_vsock_pkt *pkt; - - pkt = list_first_entry(&pkts, struct virtio_vsock_pkt, list); - list_del_init(&pkt->list); - - virtio_transport_deliver_tap_pkt(pkt); - virtio_transport_recv_pkt(&loopback_transport, pkt); + while ((skb = __skb_dequeue(&pkts))) { + virtio_transport_deliver_tap_pkt(skb); + virtio_transport_recv_pkt(&loopback_transport, skb); } } @@ -148,7 +133,7 @@ static int __init vsock_loopback_init(void) return -ENOMEM; spin_lock_init(&vsock->pkt_list_lock); - INIT_LIST_HEAD(&vsock->pkt_list); + skb_queue_head_init(&vsock->pkt_queue); INIT_WORK(&vsock->pkt_work, vsock_loopback_work); ret = vsock_core_register(&loopback_transport.transport, @@ -166,19 +151,13 @@ out_wq: static void __exit vsock_loopback_exit(void) { struct vsock_loopback *vsock = &the_vsock_loopback; - struct virtio_vsock_pkt *pkt; vsock_core_unregister(&loopback_transport.transport); flush_work(&vsock->pkt_work); spin_lock_bh(&vsock->pkt_list_lock); - while (!list_empty(&vsock->pkt_list)) { - pkt = list_first_entry(&vsock->pkt_list, - struct virtio_vsock_pkt, list); - list_del(&pkt->list); - virtio_transport_free_pkt(pkt); - } + virtio_vsock_skb_queue_purge(&vsock->pkt_queue); spin_unlock_bh(&vsock->pkt_list_lock); destroy_workqueue(vsock->workqueue); diff --git a/net/wireless/core.h b/net/wireless/core.h index af85d8909935..7c61752f6d83 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -278,8 +278,8 @@ struct cfg80211_event { }; struct cfg80211_cached_keys { - struct key_params params[CFG80211_MAX_WEP_KEYS]; - u8 data[CFG80211_MAX_WEP_KEYS][WLAN_KEY_LEN_WEP104]; + struct key_params params[4]; + u8 data[4][WLAN_KEY_LEN_WEP104]; int def; }; diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c index edd062f104f4..e6fdb0b8187d 100644 --- a/net/wireless/ibss.c +++ b/net/wireless/ibss.c @@ -45,8 +45,7 @@ void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid, cfg80211_hold_bss(bss_from_pub(bss)); wdev->u.ibss.current_bss = bss_from_pub(bss); - if (!(wdev->wiphy->flags & WIPHY_FLAG_HAS_STATIC_WEP)) - cfg80211_upload_connect_keys(wdev); + cfg80211_upload_connect_keys(wdev); nl80211_send_ibss_bssid(wiphy_to_rdev(wdev->wiphy), dev, bssid, GFP_KERNEL); @@ -294,7 +293,7 @@ int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev, ck = kmemdup(wdev->wext.keys, sizeof(*ck), GFP_KERNEL); if (!ck) return -ENOMEM; - for (i = 0; i < CFG80211_MAX_WEP_KEYS; i++) + for (i = 0; i < 4; i++) ck->params[i].key = ck->data[i]; } err = __cfg80211_join_ibss(rdev, wdev->netdev, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 33a82ecab9d5..64cf6110ce9d 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -883,7 +883,7 @@ nl80211_rekey_policy[NUM_NL80211_REKEY_DATA] = { }, [NL80211_REKEY_DATA_KCK] = { .type = NLA_BINARY, - .len = NL80211_KCK_EXT_LEN + .len = NL80211_KCK_EXT_LEN_32 }, [NL80211_REKEY_DATA_REPLAY_CTR] = NLA_POLICY_EXACT_LEN(NL80211_REPLAY_CTR_LEN), [NL80211_REKEY_DATA_AKM] = { .type = NLA_U32 }, @@ -13809,7 +13809,9 @@ static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info) return -ERANGE; if (nla_len(tb[NL80211_REKEY_DATA_KCK]) != NL80211_KCK_LEN && !(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK && - nla_len(tb[NL80211_REKEY_DATA_KEK]) == NL80211_KCK_EXT_LEN)) + nla_len(tb[NL80211_REKEY_DATA_KCK]) == NL80211_KCK_EXT_LEN) && + !(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_EXT_KCK_32 && + nla_len(tb[NL80211_REKEY_DATA_KCK]) == NL80211_KCK_EXT_LEN_32)) return -ERANGE; rekey_data.kek = nla_data(tb[NL80211_REKEY_DATA_KEK]); diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 4f3f31244e8b..af65196c916e 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -737,51 +737,9 @@ static bool valid_country(const u8 *data, unsigned int size, } #ifdef CONFIG_CFG80211_REQUIRE_SIGNED_REGDB -static struct key *builtin_regdb_keys; - -static void __init load_keys_from_buffer(const u8 *p, unsigned int buflen) -{ - const u8 *end = p + buflen; - size_t plen; - key_ref_t key; - - while (p < end) { - /* Each cert begins with an ASN.1 SEQUENCE tag and must be more - * than 256 bytes in size. - */ - if (end - p < 4) - goto dodgy_cert; - if (p[0] != 0x30 && - p[1] != 0x82) - goto dodgy_cert; - plen = (p[2] << 8) | p[3]; - plen += 4; - if (plen > end - p) - goto dodgy_cert; - - key = key_create_or_update(make_key_ref(builtin_regdb_keys, 1), - "asymmetric", NULL, p, plen, - ((KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ), - KEY_ALLOC_NOT_IN_QUOTA | - KEY_ALLOC_BUILT_IN | - KEY_ALLOC_BYPASS_RESTRICTION); - if (IS_ERR(key)) { - pr_err("Problem loading in-kernel X.509 certificate (%ld)\n", - PTR_ERR(key)); - } else { - pr_notice("Loaded X.509 cert '%s'\n", - key_ref_to_ptr(key)->description); - key_ref_put(key); - } - p += plen; - } - - return; +#include <keys/asymmetric-type.h> -dodgy_cert: - pr_err("Problem parsing in-kernel X.509 certificate list\n"); -} +static struct key *builtin_regdb_keys; static int __init load_builtin_regdb_keys(void) { @@ -797,11 +755,15 @@ static int __init load_builtin_regdb_keys(void) pr_notice("Loading compiled-in X.509 certificates for regulatory database\n"); #ifdef CONFIG_CFG80211_USE_KERNEL_REGDB_KEYS - load_keys_from_buffer(shipped_regdb_certs, shipped_regdb_certs_len); + x509_load_certificate_list(shipped_regdb_certs, + shipped_regdb_certs_len, + builtin_regdb_keys); #endif #ifdef CONFIG_CFG80211_EXTRA_REGDB_KEYDIR if (CONFIG_CFG80211_EXTRA_REGDB_KEYDIR[0] != '\0') - load_keys_from_buffer(extra_regdb_certs, extra_regdb_certs_len); + x509_load_certificate_list(extra_regdb_certs, + extra_regdb_certs_len, + builtin_regdb_keys); #endif return 0; diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 4b5b6ee0fe01..123248b2c0be 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -855,8 +855,7 @@ void __cfg80211_connect_result(struct net_device *dev, ETH_ALEN); } - if (!(wdev->wiphy->flags & WIPHY_FLAG_HAS_STATIC_WEP)) - cfg80211_upload_connect_keys(wdev); + cfg80211_upload_connect_keys(wdev); rcu_read_lock(); for_each_valid_link(cr, link) { @@ -1462,9 +1461,6 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev, connect->crypto.ciphers_pairwise[0] = cipher; } } - - connect->crypto.wep_keys = connkeys->params; - connect->crypto.wep_tx_key = connkeys->def; } else { if (WARN_ON(connkeys)) return -EINVAL; diff --git a/net/wireless/util.c b/net/wireless/util.c index 8f403f9fe816..38d3b434c18c 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -934,7 +934,7 @@ void cfg80211_upload_connect_keys(struct wireless_dev *wdev) if (!wdev->connect_keys) return; - for (i = 0; i < CFG80211_MAX_WEP_KEYS; i++) { + for (i = 0; i < 4; i++) { if (!wdev->connect_keys->params[i].cipher) continue; if (rdev_add_key(rdev, dev, -1, i, false, NULL, diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 8a24dfca75af..e3acfac7430a 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -439,7 +439,7 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev, GFP_KERNEL); if (!wdev->wext.keys) return -ENOMEM; - for (i = 0; i < CFG80211_MAX_WEP_KEYS; i++) + for (i = 0; i < 4; i++) wdev->wext.keys->params[i].key = wdev->wext.keys->data[i]; } diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index fe8765c4075d..13a72b17248e 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -636,7 +636,15 @@ void wireless_send_event(struct net_device * dev, } EXPORT_SYMBOL(wireless_send_event); +#ifdef CONFIG_CFG80211_WEXT +static void wireless_warn_cfg80211_wext(void) +{ + char name[sizeof(current->comm)]; + pr_warn_ratelimited("warning: `%s' uses wireless extensions that are deprecated for modern drivers; use nl80211\n", + get_task_comm(name, current)); +} +#endif /* IW handlers */ @@ -652,8 +660,12 @@ struct iw_statistics *get_wireless_stats(struct net_device *dev) if (dev->ieee80211_ptr && dev->ieee80211_ptr->wiphy && dev->ieee80211_ptr->wiphy->wext && - dev->ieee80211_ptr->wiphy->wext->get_wireless_stats) + dev->ieee80211_ptr->wiphy->wext->get_wireless_stats) { + wireless_warn_cfg80211_wext(); + if (dev->ieee80211_ptr->wiphy->flags & WIPHY_FLAG_SUPPORTS_MLO) + return NULL; return dev->ieee80211_ptr->wiphy->wext->get_wireless_stats(dev); + } #endif /* not found */ @@ -690,8 +702,12 @@ static iw_handler get_handler(struct net_device *dev, unsigned int cmd) const struct iw_handler_def *handlers = NULL; #ifdef CONFIG_CFG80211_WEXT - if (dev->ieee80211_ptr && dev->ieee80211_ptr->wiphy) + if (dev->ieee80211_ptr && dev->ieee80211_ptr->wiphy) { + wireless_warn_cfg80211_wext(); + if (dev->ieee80211_ptr->wiphy->flags & WIPHY_FLAG_SUPPORTS_MLO) + return NULL; handlers = dev->ieee80211_ptr->wiphy->wext; + } #endif #ifdef CONFIG_WIRELESS_EXT if (dev->wireless_handlers) diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c index 191c6d98c700..f231207ca210 100644 --- a/net/wireless/wext-sme.c +++ b/net/wireless/wext-sme.c @@ -47,7 +47,7 @@ int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev, ck = kmemdup(wdev->wext.keys, sizeof(*ck), GFP_KERNEL); if (!ck) return -ENOMEM; - for (i = 0; i < CFG80211_MAX_WEP_KEYS; i++) + for (i = 0; i < 4; i++) ck->params[i].key = ck->data[i]; } diff --git a/net/xfrm/espintcp.c b/net/xfrm/espintcp.c index 74a54295c164..872b80188e83 100644 --- a/net/xfrm/espintcp.c +++ b/net/xfrm/espintcp.c @@ -6,6 +6,7 @@ #include <net/espintcp.h> #include <linux/skmsg.h> #include <net/inet_common.h> +#include <trace/events/sock.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6_stubs.h> #endif @@ -397,6 +398,8 @@ static void espintcp_data_ready(struct sock *sk) { struct espintcp_ctx *ctx = espintcp_getctx(sk); + trace_sk_data_ready(sk); + strp_data_ready(&ctx->strp); } diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 4aff76c6f12e..95f1436bf6a2 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -309,7 +309,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, else xso->type = XFRM_DEV_OFFLOAD_CRYPTO; - err = dev->xfrmdev_ops->xdo_dev_state_add(x); + err = dev->xfrmdev_ops->xdo_dev_state_add(x, extack); if (err) { xso->dev = NULL; xso->dir = 0; @@ -326,7 +326,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, */ WARN_ON(err == -EOPNOTSUPP && is_packet_offload); if (err != -EOPNOTSUPP || is_packet_offload) { - NL_SET_ERR_MSG(extack, "Device failed to offload this state"); + NL_SET_ERR_MSG_WEAK(extack, "Device failed to offload this state"); return err; } } @@ -383,14 +383,14 @@ int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp, return -EINVAL; } - err = dev->xfrmdev_ops->xdo_dev_policy_add(xp); + err = dev->xfrmdev_ops->xdo_dev_policy_add(xp, extack); if (err) { xdo->dev = NULL; xdo->real_dev = NULL; xdo->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; xdo->dir = 0; netdev_put(dev, &xdo->dev_tracker); - NL_SET_ERR_MSG(extack, "Device failed to offload this policy"); + NL_SET_ERR_MSG_WEAK(extack, "Device failed to offload this policy"); return err; } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 00afe831c71c..2ab3e09e2227 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1274,7 +1274,7 @@ found: xso->real_dev = xdo->real_dev; netdev_tracker_alloc(xso->dev, &xso->dev_tracker, GFP_ATOMIC); - error = xso->dev->xfrmdev_ops->xdo_dev_state_add(x); + error = xso->dev->xfrmdev_ops->xdo_dev_state_add(x, NULL); if (error) { xso->dir = 0; netdev_put(xso->dev, &xso->dev_tracker); |