diff options
Diffstat (limited to 'include/net')
31 files changed, 651 insertions, 186 deletions
diff --git a/include/net/act_api.h b/include/net/act_api.h index 4ae0580b63ca..ea13e1e4a7c2 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -191,7 +191,7 @@ int tcf_idr_create_from_flags(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int bind, u32 flags); -void tcf_idr_insert_many(struct tc_action *actions[]); +void tcf_idr_insert_many(struct tc_action *actions[], int init_res[]); void tcf_idr_cleanup(struct tc_action_net *tn, u32 index); int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index, struct tc_action **a, int bind); diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index e302c0e804d0..535701efc1e5 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -137,7 +137,6 @@ struct vsock_transport { u64 (*stream_rcvhiwat)(struct vsock_sock *); bool (*stream_is_active)(struct vsock_sock *); bool (*stream_allow)(u32 cid, u32 port); - int (*set_rcvlowat)(struct vsock_sock *vsk, int val); /* SEQ_PACKET. */ ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg, @@ -168,6 +167,7 @@ struct vsock_transport { struct vsock_transport_send_notify_data *); /* sk_lock held by the caller */ void (*notify_buffer_size)(struct vsock_sock *, u64 *); + int (*notify_set_rcvlowat)(struct vsock_sock *vsk, int val); /* Shutdown. */ int (*shutdown)(struct vsock_sock *, int); diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 4ecfb06c413d..ac1fb326dcda 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -52,7 +52,7 @@ * such wiphy can have zero, one, or many virtual interfaces associated with * it, which need to be identified as such by pointing the network interface's * @ieee80211_ptr pointer to a &struct wireless_dev which further describes - * the wireless part of the interface, normally this struct is embedded in the + * the wireless part of the interface. Normally this struct is embedded in the * network interface's private data area. Drivers can optionally allow creating * or destroying virtual interfaces on the fly, but without at least one or the * ability to create some the wireless device isn't useful. @@ -977,6 +977,15 @@ cfg80211_chandef_compatible(const struct cfg80211_chan_def *chandef1, const struct cfg80211_chan_def *chandef2); /** + * nl80211_chan_width_to_mhz - get the channel width in MHz + * @chan_width: the channel width from &enum nl80211_chan_width + * + * Return: channel width in MHz if the chan_width from &enum nl80211_chan_width + * is valid. -1 otherwise. + */ +int nl80211_chan_width_to_mhz(enum nl80211_chan_width chan_width); + +/** * cfg80211_chandef_valid - check if a channel definition is valid * @chandef: the channel definition to check * Return: %true if the channel definition is valid. %false otherwise. @@ -1665,6 +1674,21 @@ struct link_station_del_parameters { }; /** + * struct cfg80211_ttlm_params: TID to link mapping parameters + * + * Used for setting a TID to link mapping. + * + * @dlink: Downlink TID to link mapping, as defined in section 9.4.2.314 + * (TID-To-Link Mapping element) in Draft P802.11be_D4.0. + * @ulink: Uplink TID to link mapping, as defined in section 9.4.2.314 + * (TID-To-Link Mapping element) in Draft P802.11be_D4.0. + */ +struct cfg80211_ttlm_params { + u16 dlink[8]; + u16 ulink[8]; +}; + +/** * struct station_parameters - station parameters * * Used to change and create a new station. @@ -2560,7 +2584,7 @@ struct cfg80211_scan_info { * @short_ssid: short ssid to scan for * @bssid: bssid to scan for * @channel_idx: idx of the channel in the channel array in the scan request - * which the above info relvant to + * which the above info is relevant to * @unsolicited_probe: the AP transmits unsolicited probe response every 20 TU * @short_ssid_valid: @short_ssid is valid and can be used * @psc_no_listen: when set, and the channel is a PSC channel, no need to wait @@ -2608,6 +2632,8 @@ struct cfg80211_scan_6ghz_params { * @n_6ghz_params: number of 6 GHz params * @scan_6ghz_params: 6 GHz params * @bssid: BSSID to scan for (most commonly, the wildcard BSSID) + * @tsf_report_link_id: for MLO, indicates the link ID of the BSS that should be + * used for TSF reporting. Can be set to -1 to indicate no preference. */ struct cfg80211_scan_request { struct cfg80211_ssid *ssids; @@ -2636,6 +2662,7 @@ struct cfg80211_scan_request { bool scan_6ghz; u32 n_6ghz_params; struct cfg80211_scan_6ghz_params *scan_6ghz_params; + s8 tsf_report_link_id; /* keep last */ struct ieee80211_channel *channels[] __counted_by(n_channels); @@ -2816,6 +2843,13 @@ enum cfg80211_signal_type { * the BSS that requested the scan in which the beacon/probe was received. * @chains: bitmask for filled values in @chain_signal. * @chain_signal: per-chain signal strength of last received BSS in dBm. + * @restrict_use: restrict usage, if not set, assume @use_for is + * %NL80211_BSS_USE_FOR_NORMAL. + * @use_for: bitmap of possible usage for this BSS, see + * &enum nl80211_bss_use_for + * @cannot_use_reasons: the reasons (bitmap) for not being able to connect, + * if @restrict_use is set and @use_for is zero (empty); may be 0 for + * unspecified reasons; see &enum nl80211_bss_cannot_use_reasons * @drv_data: Data to be passed through to @inform_bss */ struct cfg80211_inform_bss { @@ -2827,6 +2861,9 @@ struct cfg80211_inform_bss { u8 chains; s8 chain_signal[IEEE80211_MAX_CHAINS]; + u8 restrict_use:1, use_for:7; + u8 cannot_use_reasons; + void *drv_data; }; @@ -2878,6 +2915,11 @@ struct cfg80211_bss_ies { * @chain_signal: per-chain signal strength of last received BSS in dBm. * @bssid_index: index in the multiple BSS set * @max_bssid_indicator: max number of members in the BSS set + * @use_for: bitmap of possible usage for this BSS, see + * &enum nl80211_bss_use_for + * @cannot_use_reasons: the reasons (bitmap) for not being able to connect, + * if @restrict_use is set and @use_for is zero (empty); may be 0 for + * unspecified reasons; see &enum nl80211_bss_cannot_use_reasons * @priv: private area for driver use, has at least wiphy->bss_priv_size bytes */ struct cfg80211_bss { @@ -2903,6 +2945,9 @@ struct cfg80211_bss { u8 bssid_index; u8 max_bssid_indicator; + u8 use_for; + u8 cannot_use_reasons; + u8 priv[] __aligned(sizeof(void *)); }; @@ -4493,6 +4538,7 @@ struct mgmt_frame_regs { * @del_link_station: Remove a link of a station. * * @set_hw_timestamp: Enable/disable HW timestamping of TM/FTM frames. + * @set_ttlm: set the TID to link mapping. */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -4852,6 +4898,8 @@ struct cfg80211_ops { struct link_station_del_parameters *params); int (*set_hw_timestamp)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_set_hw_timestamp *hwts); + int (*set_ttlm)(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_ttlm_params *params); }; /* @@ -4910,6 +4958,8 @@ struct cfg80211_ops { * NL80211_REGDOM_SET_BY_DRIVER. * @WIPHY_FLAG_CHANNEL_CHANGE_ON_BEACON: reg_call_notifier() is called if driver * set this flag to update channels on beacon hints. + * @WIPHY_FLAG_SUPPORTS_NSTR_NONPRIMARY: support connection to non-primary link + * of an NSTR mobile AP MLD. */ enum wiphy_flags { WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK = BIT(0), @@ -4923,7 +4973,7 @@ enum wiphy_flags { WIPHY_FLAG_IBSS_RSN = BIT(8), WIPHY_FLAG_MESH_AUTH = BIT(10), WIPHY_FLAG_SUPPORTS_EXT_KCK_32 = BIT(11), - /* use hole at 12 */ + WIPHY_FLAG_SUPPORTS_NSTR_NONPRIMARY = BIT(12), WIPHY_FLAG_SUPPORTS_FW_ROAM = BIT(13), WIPHY_FLAG_AP_UAPSD = BIT(14), WIPHY_FLAG_SUPPORTS_TDLS = BIT(15), @@ -7162,6 +7212,25 @@ cfg80211_inform_bss(struct wiphy *wiphy, } /** + * __cfg80211_get_bss - get a BSS reference + * @wiphy: the wiphy this BSS struct belongs to + * @channel: the channel to search on (or %NULL) + * @bssid: the desired BSSID (or %NULL) + * @ssid: the desired SSID (or %NULL) + * @ssid_len: length of the SSID (or 0) + * @bss_type: type of BSS, see &enum ieee80211_bss_type + * @privacy: privacy filter, see &enum ieee80211_privacy + * @use_for: indicates which use is intended + */ +struct cfg80211_bss *__cfg80211_get_bss(struct wiphy *wiphy, + struct ieee80211_channel *channel, + const u8 *bssid, + const u8 *ssid, size_t ssid_len, + enum ieee80211_bss_type bss_type, + enum ieee80211_privacy privacy, + u32 use_for); + +/** * cfg80211_get_bss - get a BSS reference * @wiphy: the wiphy this BSS struct belongs to * @channel: the channel to search on (or %NULL) @@ -7170,13 +7239,20 @@ cfg80211_inform_bss(struct wiphy *wiphy, * @ssid_len: length of the SSID (or 0) * @bss_type: type of BSS, see &enum ieee80211_bss_type * @privacy: privacy filter, see &enum ieee80211_privacy + * + * This version implies regular usage, %NL80211_BSS_USE_FOR_NORMAL. */ -struct cfg80211_bss *cfg80211_get_bss(struct wiphy *wiphy, - struct ieee80211_channel *channel, - const u8 *bssid, - const u8 *ssid, size_t ssid_len, - enum ieee80211_bss_type bss_type, - enum ieee80211_privacy privacy); +static inline struct cfg80211_bss * +cfg80211_get_bss(struct wiphy *wiphy, struct ieee80211_channel *channel, + const u8 *bssid, const u8 *ssid, size_t ssid_len, + enum ieee80211_bss_type bss_type, + enum ieee80211_privacy privacy) +{ + return __cfg80211_get_bss(wiphy, channel, bssid, ssid, ssid_len, + bss_type, privacy, + NL80211_BSS_USE_FOR_NORMAL); +} + static inline struct cfg80211_bss * cfg80211_get_ibss(struct wiphy *wiphy, struct ieee80211_channel *channel, @@ -7309,7 +7385,7 @@ struct cfg80211_rx_assoc_resp_data { * This function may sleep. The caller must hold the corresponding wdev's mutex. */ void cfg80211_rx_assoc_resp(struct net_device *dev, - struct cfg80211_rx_assoc_resp_data *data); + const struct cfg80211_rx_assoc_resp_data *data); /** * struct cfg80211_assoc_failure - association failure data @@ -7428,7 +7504,7 @@ void cfg80211_notify_new_peer_candidate(struct net_device *dev, * RFkill integration in cfg80211 is almost invisible to drivers, * as cfg80211 automatically registers an rfkill instance for each * wireless device it knows about. Soft kill is also translated - * into disconnecting and turning all interfaces off, drivers are + * into disconnecting and turning all interfaces off. Drivers are * expected to turn off the device when all interfaces are down. * * However, devices may have a hard RFkill line, in which case they @@ -7476,7 +7552,7 @@ static inline void wiphy_rfkill_stop_polling(struct wiphy *wiphy) * the configuration mechanism. * * A driver supporting vendor commands must register them as an array - * in struct wiphy, with handlers for each one, each command has an + * in struct wiphy, with handlers for each one. Each command has an * OUI and sub command ID to identify it. * * Note that this feature should not be (ab)used to implement protocol @@ -7640,7 +7716,7 @@ static inline void cfg80211_vendor_event(struct sk_buff *skb, gfp_t gfp) * interact with driver-specific tools to aid, for instance, * factory programming. * - * This chapter describes how drivers interact with it, for more + * This chapter describes how drivers interact with it. For more * information see the nl80211 book's chapter on it. */ diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 3c70ad53a49c..6d3a20163260 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -85,7 +85,10 @@ FN(IPV6_NDISC_BAD_OPTIONS) \ FN(IPV6_NDISC_NS_OTHERHOST) \ FN(QUEUE_PURGE) \ - FN(TC_ERROR) \ + FN(TC_COOKIE_ERROR) \ + FN(PACKET_SOCK_ERROR) \ + FN(TC_CHAIN_NOTFOUND) \ + FN(TC_RECLASSIFY_LOOP) \ FNe(MAX) /** @@ -376,8 +379,23 @@ enum skb_drop_reason { SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST, /** @SKB_DROP_REASON_QUEUE_PURGE: bulk free. */ SKB_DROP_REASON_QUEUE_PURGE, - /** @SKB_DROP_REASON_TC_ERROR: generic internal tc error. */ - SKB_DROP_REASON_TC_ERROR, + /** + * @SKB_DROP_REASON_TC_COOKIE_ERROR: An error occurred whilst + * processing a tc ext cookie. + */ + SKB_DROP_REASON_TC_COOKIE_ERROR, + /** + * @SKB_DROP_REASON_PACKET_SOCK_ERROR: generic packet socket errors + * after its filter matches an incoming packet. + */ + SKB_DROP_REASON_PACKET_SOCK_ERROR, + /** @SKB_DROP_REASON_TC_CHAIN_NOTFOUND: tc chain lookup failed. */ + SKB_DROP_REASON_TC_CHAIN_NOTFOUND, + /** + * @SKB_DROP_REASON_TC_RECLASSIFY_LOOP: tc exceeded max reclassify loop + * iterations. + */ + SKB_DROP_REASON_TC_RECLASSIFY_LOOP, /** * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which * shouldn't be used as a real 'reason' - only for tracing code gen diff --git a/include/net/genetlink.h b/include/net/genetlink.h index c53244f20437..85c63d4f16dd 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -51,6 +51,9 @@ struct genl_info; * @split_ops: the split do/dump form of operation definition * @n_split_ops: number of entries in @split_ops, not that with split do/dump * ops the number of entries is not the same as number of commands + * @sock_priv_size: the size of per-socket private memory + * @sock_priv_init: the per-socket private memory initializer + * @sock_priv_destroy: the per-socket private memory destructor * * Attribute policies (the combination of @policy and @maxattr fields) * can be attached at the family level or at the operation level. @@ -84,11 +87,17 @@ struct genl_family { const struct genl_multicast_group *mcgrps; struct module *module; + size_t sock_priv_size; + void (*sock_priv_init)(void *priv); + void (*sock_priv_destroy)(void *priv); + /* private: internal use only */ /* protocol family identifier */ int id; /* starting number of multicast group IDs in this family */ unsigned int mcgrp_offset; + /* list of per-socket privs */ + struct xarray *sock_privs; }; /** @@ -298,6 +307,8 @@ static inline bool genl_info_is_ntf(const struct genl_info *info) return !info->nlhdr; } +void *__genl_sk_priv_get(struct genl_family *family, struct sock *sk); +void *genl_sk_priv_get(struct genl_family *family, struct sock *sk); int genl_register_family(struct genl_family *family); int genl_unregister_family(const struct genl_family *family); void genl_notify(const struct genl_family *family, struct sk_buff *skb, @@ -438,6 +449,35 @@ static inline void genlmsg_cancel(struct sk_buff *skb, void *hdr) } /** + * genlmsg_multicast_netns_filtered - multicast a netlink message + * to a specific netns with filter + * function + * @family: the generic netlink family + * @net: the net namespace + * @skb: netlink message as socket buffer + * @portid: own netlink portid to avoid sending to yourself + * @group: offset of multicast group in groups array + * @flags: allocation flags + * @filter: filter function + * @filter_data: filter function private data + * + * Return: 0 on success, negative error code for failure. + */ +static inline int +genlmsg_multicast_netns_filtered(const struct genl_family *family, + struct net *net, struct sk_buff *skb, + u32 portid, unsigned int group, gfp_t flags, + netlink_filter_fn filter, + void *filter_data) +{ + if (WARN_ON_ONCE(group >= family->n_mcgrps)) + return -EINVAL; + group = family->mcgrp_offset + group; + return nlmsg_multicast_filtered(net->genl_sock, skb, portid, group, + flags, filter, filter_data); +} + +/** * genlmsg_multicast_netns - multicast a netlink message to a specific netns * @family: the generic netlink family * @net: the net namespace @@ -450,10 +490,8 @@ static inline int genlmsg_multicast_netns(const struct genl_family *family, struct net *net, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags) { - if (WARN_ON_ONCE(group >= family->n_mcgrps)) - return -EINVAL; - group = family->mcgrp_offset + group; - return nlmsg_multicast(net->genl_sock, skb, portid, group, flags); + return genlmsg_multicast_netns_filtered(family, net, skb, portid, + group, flags, NULL, NULL); } /** diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 74db6d97cae1..aa86453f6b9b 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -234,10 +234,7 @@ struct inet_sock { int uc_index; int mc_index; __be32 mc_addr; - struct { - __u16 lo; - __u16 hi; - } local_port_range; + u32 local_port_range; /* high << 16 | low */ struct ip_mc_socklist __rcu *mc_list; struct inet_cork_full cork; diff --git a/include/net/ip.h b/include/net/ip.h index 1fc4c8d69e33..de0c69c57e3c 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -349,8 +349,14 @@ static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_o } \ } -void inet_get_local_port_range(const struct net *net, int *low, int *high); -void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high); +static inline void inet_get_local_port_range(const struct net *net, int *low, int *high) +{ + u32 range = READ_ONCE(net->ipv4.ip_local_ports.range); + + *low = range & 0xffff; + *high = range >> 16; +} +bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high); #ifdef CONFIG_SYSCTL static inline bool inet_is_local_reserved_port(struct net *net, unsigned short port) diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index f346b4efbc30..2d746f4c9a0a 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -416,6 +416,17 @@ static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph, return 0; } +static inline __be32 ip_tunnel_get_flowlabel(const struct iphdr *iph, + const struct sk_buff *skb) +{ + __be16 payload_protocol = skb_protocol(skb, true); + + if (payload_protocol == htons(ETH_P_IPV6)) + return ip6_flowlabel((const struct ipv6hdr *)iph); + else + return 0; +} + static inline u8 ip_tunnel_get_ttl(const struct iphdr *iph, const struct sk_buff *skb) { diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 580781ff9dcf..77a71b1396b1 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2686,6 +2686,9 @@ struct ieee80211_txq { * @IEEE80211_HW_MLO_MCAST_MULTI_LINK_TX: Hardware/driver handles transmitting * multicast frames on all links, mac80211 should not do that. * + * @IEEE80211_HW_DISALLOW_PUNCTURING: HW requires disabling puncturing in EHT + * and connecting with a lower bandwidth instead + * * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays */ enum ieee80211_hw_flags { @@ -2743,6 +2746,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_SUPPORTS_CONC_MON_RX_DECAP, IEEE80211_HW_DETECTS_COLOR_COLLISION, IEEE80211_HW_MLO_MCAST_MULTI_LINK_TX, + IEEE80211_HW_DISALLOW_PUNCTURING, /* keep last, obviously */ NUM_IEEE80211_HW_FLAGS @@ -5809,12 +5813,11 @@ void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf, * ieee80211_remove_key - remove the given key * @keyconf: the parameter passed with the set key * + * Context: Must be called with the wiphy mutex held. + * * Remove the given key. If the key was uploaded to the hardware at the * time this function is called, it is not deleted in the hardware but * instead assumed to have been removed already. - * - * Note that due to locking considerations this function can (currently) - * only be called during key iteration (ieee80211_iter_keys().) */ void ieee80211_remove_key(struct ieee80211_key_conf *keyconf); @@ -6368,12 +6371,12 @@ ieee80211_txq_airtime_check(struct ieee80211_hw *hw, struct ieee80211_txq *txq); * @iter: iterator function that will be called for each key * @iter_data: custom data to pass to the iterator function * + * Context: Must be called with wiphy mutex held; can sleep. + * * This function can be used to iterate all the keys known to * mac80211, even those that weren't previously programmed into * the device. This is intended for use in WoWLAN if the device - * needs reprogramming of the keys during suspend. Note that due - * to locking reasons, it is also only safe to call this at few - * spots since it must hold the RTNL and be able to sleep. + * needs reprogramming of the keys during suspend. * * The order in which the keys are iterated matches the order * in which they were originally installed and handed to the @@ -7435,6 +7438,9 @@ static inline bool ieee80211_is_tx_data(struct sk_buff *skb) * @vif: interface to set active links on * @active_links: the new active links bitmap * + * Context: Must be called with wiphy mutex held; may sleep; calls + * back into the driver. + * * This changes the active links on an interface. The interface * must be in client mode (in AP mode, all links are always active), * and @active_links must be a subset of the vif's valid_links. @@ -7442,6 +7448,7 @@ static inline bool ieee80211_is_tx_data(struct sk_buff *skb) * If a link is switched off and another is switched on at the same * time (e.g. active_links going from 0x1 to 0x10) then you will get * a sequence of calls like + * * - change_vif_links(0x11) * - unassign_vif_chanctx(link_id=0) * - change_sta_links(0x11) for each affected STA (the AP) @@ -7451,10 +7458,6 @@ static inline bool ieee80211_is_tx_data(struct sk_buff *skb) * - change_sta_links(0x10) for each affected STA (the AP) * - assign_vif_chanctx(link_id=4) * - change_vif_links(0x10) - * - * Note: This function acquires some mac80211 locks and must not - * be called with any driver locks held that could cause a - * lock dependency inversion. Best call it without locks. */ int ieee80211_set_active_links(struct ieee80211_vif *vif, u16 active_links); diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 88b6ef7ce1a6..76f2fd2645b7 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -293,6 +293,7 @@ struct gdma_queue { u32 head; u32 tail; + struct list_head entry; /* Extra fields specific to EQ/CQ. */ union { @@ -328,6 +329,7 @@ struct gdma_queue_spec { void *context; unsigned long log2_throttle_limit; + unsigned int msix_index; } eq; struct { @@ -344,7 +346,9 @@ struct gdma_queue_spec { struct gdma_irq_context { void (*handler)(void *arg); - void *arg; + /* Protect the eq_list */ + spinlock_t lock; + struct list_head eq_list; char name[MANA_IRQ_NAME_SZ]; }; @@ -355,7 +359,6 @@ struct gdma_context { unsigned int max_num_queues; unsigned int max_num_msix; unsigned int num_msix_usable; - struct gdma_resource msix_resource; struct gdma_irq_context *irq_contexts; /* L2 MTU */ diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 6e3e9c1363db..76147feb0d10 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -353,6 +353,25 @@ struct mana_tx_qp { struct mana_ethtool_stats { u64 stop_queue; u64 wake_queue; + u64 hc_rx_discards_no_wqe; + u64 hc_rx_err_vport_disabled; + u64 hc_rx_bytes; + u64 hc_rx_ucast_pkts; + u64 hc_rx_ucast_bytes; + u64 hc_rx_bcast_pkts; + u64 hc_rx_bcast_bytes; + u64 hc_rx_mcast_pkts; + u64 hc_rx_mcast_bytes; + u64 hc_tx_err_gf_disabled; + u64 hc_tx_err_vport_disabled; + u64 hc_tx_err_inval_vportoffset_pkt; + u64 hc_tx_err_vlan_enforcement; + u64 hc_tx_err_eth_type_enforcement; + u64 hc_tx_err_sa_enforcement; + u64 hc_tx_err_sqpdid_enforcement; + u64 hc_tx_err_cqpdid_enforcement; + u64 hc_tx_err_mtu_violation; + u64 hc_tx_err_inval_oob; u64 hc_tx_bytes; u64 hc_tx_ucast_pkts; u64 hc_tx_ucast_bytes; @@ -360,6 +379,7 @@ struct mana_ethtool_stats { u64 hc_tx_bcast_bytes; u64 hc_tx_mcast_pkts; u64 hc_tx_mcast_bytes; + u64 hc_tx_err_gdma; u64 tx_cqe_err; u64 tx_cqe_unknown_type; u64 rx_coalesced_err; @@ -602,8 +622,8 @@ struct mana_query_gf_stat_resp { struct gdma_resp_hdr hdr; u64 reported_stats; /* rx errors/discards */ - u64 discard_rx_nowqe; - u64 err_rx_vport_disabled; + u64 rx_discards_nowqe; + u64 rx_err_vport_disabled; /* rx bytes/packets */ u64 hc_rx_bytes; u64 hc_rx_ucast_pkts; @@ -613,16 +633,16 @@ struct mana_query_gf_stat_resp { u64 hc_rx_mcast_pkts; u64 hc_rx_mcast_bytes; /* tx errors */ - u64 err_tx_gf_disabled; - u64 err_tx_vport_disabled; - u64 err_tx_inval_vport_offset_pkt; - u64 err_tx_vlan_enforcement; - u64 err_tx_ethtype_enforcement; - u64 err_tx_SA_enforecement; - u64 err_tx_SQPDID_enforcement; - u64 err_tx_CQPDID_enforcement; - u64 err_tx_mtu_violation; - u64 err_tx_inval_oob; + u64 tx_err_gf_disabled; + u64 tx_err_vport_disabled; + u64 tx_err_inval_vport_offset_pkt; + u64 tx_err_vlan_enforcement; + u64 tx_err_ethtype_enforcement; + u64 tx_err_SA_enforcement; + u64 tx_err_SQPDID_enforcement; + u64 tx_err_CQPDID_enforcement; + u64 tx_err_mtu_violation; + u64 tx_err_inval_oob; /* tx bytes/packets */ u64 hc_tx_bytes; u64 hc_tx_ucast_pkts; @@ -632,7 +652,7 @@ struct mana_query_gf_stat_resp { u64 hc_tx_mcast_pkts; u64 hc_tx_mcast_bytes; /* tx error */ - u64 err_tx_gdma; + u64 tx_err_gdma; }; /* HW DATA */ /* Configure vPort Rx Steering */ diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h index cdcafb30d437..aa1716fb0e53 100644 --- a/include/net/netdev_rx_queue.h +++ b/include/net/netdev_rx_queue.h @@ -21,6 +21,10 @@ struct netdev_rx_queue { #ifdef CONFIG_XDP_SOCKETS struct xsk_buff_pool *pool; #endif + /* NAPI instance for the queue + * Readers and writers must hold RTNL + */ + struct napi_struct *napi; } ____cacheline_aligned_in_smp; /* diff --git a/include/net/netlink.h b/include/net/netlink.h index 83bdf787aeee..c19ff921b661 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -1011,6 +1011,20 @@ static inline struct sk_buff *nlmsg_new(size_t payload, gfp_t flags) } /** + * nlmsg_new_large - Allocate a new netlink message with non-contiguous + * physical memory + * @payload: size of the message payload + * + * The allocated skb is unable to have frag page for shinfo->frags*, + * as the NULL setting for skb->head in netlink_skb_destructor() will + * bypass most of the handling in skb_release_data() + */ +static inline struct sk_buff *nlmsg_new_large(size_t payload) +{ + return netlink_alloc_large_skb(nlmsg_total_size(payload), 0); +} + +/** * nlmsg_end - Finalize a netlink message * @skb: socket buffer the message is stored in * @nlh: netlink message header @@ -1073,21 +1087,29 @@ static inline void nlmsg_free(struct sk_buff *skb) } /** - * nlmsg_multicast - multicast a netlink message + * nlmsg_multicast_filtered - multicast a netlink message with filter function * @sk: netlink socket to spread messages to * @skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: multicast group id * @flags: allocation flags + * @filter: filter function + * @filter_data: filter function private data + * + * Return: 0 on success, negative error code for failure. */ -static inline int nlmsg_multicast(struct sock *sk, struct sk_buff *skb, - u32 portid, unsigned int group, gfp_t flags) +static inline int nlmsg_multicast_filtered(struct sock *sk, struct sk_buff *skb, + u32 portid, unsigned int group, + gfp_t flags, + netlink_filter_fn filter, + void *filter_data) { int err; NETLINK_CB(skb).dst_group = group; - err = netlink_broadcast(sk, skb, portid, group, flags); + err = netlink_broadcast_filtered(sk, skb, portid, group, flags, + filter, filter_data); if (err > 0) err = 0; @@ -1095,6 +1117,21 @@ static inline int nlmsg_multicast(struct sock *sk, struct sk_buff *skb, } /** + * nlmsg_multicast - multicast a netlink message + * @sk: netlink socket to spread messages to + * @skb: netlink message as socket buffer + * @portid: own netlink portid to avoid sending to yourself + * @group: multicast group id + * @flags: allocation flags + */ +static inline int nlmsg_multicast(struct sock *sk, struct sk_buff *skb, + u32 portid, unsigned int group, gfp_t flags) +{ + return nlmsg_multicast_filtered(sk, skb, portid, group, flags, + NULL, NULL); +} + +/** * nlmsg_unicast - unicast a netlink message * @sk: netlink socket to spread message to * @skb: netlink message as socket buffer @@ -1200,7 +1237,7 @@ static inline void *nla_data(const struct nlattr *nla) * nla_len - length of payload * @nla: netlink attribute */ -static inline int nla_len(const struct nlattr *nla) +static inline u16 nla_len(const struct nlattr *nla) { return nla->nla_len - NLA_HDRLEN; } diff --git a/include/net/netns/core.h b/include/net/netns/core.h index a91ef9f8de60..78214f1b43a2 100644 --- a/include/net/netns/core.h +++ b/include/net/netns/core.h @@ -13,6 +13,7 @@ struct netns_core { struct ctl_table_header *sysctl_hdr; int sysctl_somaxconn; + int sysctl_optmem_max; u8 sysctl_txrehash; #ifdef CONFIG_PROC_FS diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 73f43f699199..c356c458b340 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -19,8 +19,7 @@ struct hlist_head; struct fib_table; struct sock; struct local_ports { - seqlock_t lock; - int range[2]; + u32 range; /* high << 16 | low */ bool warned; }; @@ -42,6 +41,38 @@ struct inet_timewait_death_row { struct tcp_fastopen_context; struct netns_ipv4 { + /* Cacheline organization can be found documented in + * Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst. + * Please update the document when adding new fields. + */ + + /* TX readonly hotpath cache lines */ + __cacheline_group_begin(netns_ipv4_read_tx); + u8 sysctl_tcp_early_retrans; + u8 sysctl_tcp_tso_win_divisor; + u8 sysctl_tcp_tso_rtt_log; + u8 sysctl_tcp_autocorking; + int sysctl_tcp_min_snd_mss; + unsigned int sysctl_tcp_notsent_lowat; + int sysctl_tcp_limit_output_bytes; + int sysctl_tcp_min_rtt_wlen; + int sysctl_tcp_wmem[3]; + u8 sysctl_ip_fwd_use_pmtu; + __cacheline_group_end(netns_ipv4_read_tx); + + /* TXRX readonly hotpath cache lines */ + __cacheline_group_begin(netns_ipv4_read_txrx); + u8 sysctl_tcp_moderate_rcvbuf; + __cacheline_group_end(netns_ipv4_read_txrx); + + /* RX readonly hotpath cache line */ + __cacheline_group_begin(netns_ipv4_read_rx); + u8 sysctl_ip_early_demux; + u8 sysctl_tcp_early_demux; + int sysctl_tcp_reordering; + int sysctl_tcp_rmem[3]; + __cacheline_group_end(netns_ipv4_read_rx); + struct inet_timewait_death_row tcp_death_row; struct udp_table *udp_table; @@ -96,17 +127,14 @@ struct netns_ipv4 { u8 sysctl_ip_default_ttl; u8 sysctl_ip_no_pmtu_disc; - u8 sysctl_ip_fwd_use_pmtu; u8 sysctl_ip_fwd_update_priority; u8 sysctl_ip_nonlocal_bind; u8 sysctl_ip_autobind_reuse; /* Shall we try to damage output packets if routing dev changes? */ u8 sysctl_ip_dynaddr; - u8 sysctl_ip_early_demux; #ifdef CONFIG_NET_L3_MASTER_DEV u8 sysctl_raw_l3mdev_accept; #endif - u8 sysctl_tcp_early_demux; u8 sysctl_udp_early_demux; u8 sysctl_nexthop_compat_mode; @@ -119,7 +147,6 @@ struct netns_ipv4 { u8 sysctl_tcp_mtu_probing; int sysctl_tcp_mtu_probe_floor; int sysctl_tcp_base_mss; - int sysctl_tcp_min_snd_mss; int sysctl_tcp_probe_threshold; u32 sysctl_tcp_probe_interval; @@ -135,17 +162,14 @@ struct netns_ipv4 { u8 sysctl_tcp_backlog_ack_defer; u8 sysctl_tcp_pingpong_thresh; - int sysctl_tcp_reordering; u8 sysctl_tcp_retries1; u8 sysctl_tcp_retries2; u8 sysctl_tcp_orphan_retries; u8 sysctl_tcp_tw_reuse; int sysctl_tcp_fin_timeout; - unsigned int sysctl_tcp_notsent_lowat; u8 sysctl_tcp_sack; u8 sysctl_tcp_window_scaling; u8 sysctl_tcp_timestamps; - u8 sysctl_tcp_early_retrans; u8 sysctl_tcp_recovery; u8 sysctl_tcp_thin_linear_timeouts; u8 sysctl_tcp_slow_start_after_idle; @@ -161,21 +185,13 @@ struct netns_ipv4 { u8 sysctl_tcp_frto; u8 sysctl_tcp_nometrics_save; u8 sysctl_tcp_no_ssthresh_metrics_save; - u8 sysctl_tcp_moderate_rcvbuf; - u8 sysctl_tcp_tso_win_divisor; u8 sysctl_tcp_workaround_signed_windows; - int sysctl_tcp_limit_output_bytes; int sysctl_tcp_challenge_ack_limit; - int sysctl_tcp_min_rtt_wlen; u8 sysctl_tcp_min_tso_segs; - u8 sysctl_tcp_tso_rtt_log; - u8 sysctl_tcp_autocorking; u8 sysctl_tcp_reflect_tos; int sysctl_tcp_invalid_ratelimit; int sysctl_tcp_pacing_ss_ratio; int sysctl_tcp_pacing_ca_ratio; - int sysctl_tcp_wmem[3]; - int sysctl_tcp_rmem[3]; unsigned int sysctl_tcp_child_ehash_entries; unsigned long sysctl_tcp_comp_sack_delay_ns; unsigned long sysctl_tcp_comp_sack_slack_ns; diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 582212ada3ba..fc752a50f91b 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -22,5 +22,7 @@ struct netns_smc { int sysctl_smcr_testlink_time; int sysctl_wmem; int sysctl_rmem; + int sysctl_max_links_per_lgr; + int sysctl_max_conns_per_lgr; }; #endif diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index 4ebd544ae977..1d397c1a0043 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -11,7 +11,7 @@ * The page_pool allocator is optimized for recycling page or page fragment used * by skb packet and xdp frame. * - * Basic use involves replacing and alloc_pages() calls with page_pool_alloc(), + * Basic use involves replacing any alloc_pages() calls with page_pool_alloc(), * which allocate memory with or without page splitting depending on the * requested memory size. * @@ -29,7 +29,7 @@ * page allocated from page pool. Page splitting enables memory saving and thus * avoids TLB/cache miss for data access, but there also is some cost to * implement page splitting, mainly some cache line dirtying/bouncing for - * 'struct page' and atomic operation for page->pp_frag_count. + * 'struct page' and atomic operation for page->pp_ref_count. * * The API keeps track of in-flight pages, in order to let API users know when * it is safe to free a page_pool object, the API users must call @@ -37,15 +37,15 @@ * attach the page_pool object to a page_pool-aware object like skbs marked with * skb_mark_for_recycle(). * - * page_pool_put_page() may be called multi times on the same page if a page is - * split into multi fragments. For the last fragment, it will either recycle the - * page, or in case of page->_refcount > 1, it will release the DMA mapping and - * in-flight state accounting. + * page_pool_put_page() may be called multiple times on the same page if a page + * is split into multiple fragments. For the last fragment, it will either + * recycle the page, or in case of page->_refcount > 1, it will release the DMA + * mapping and in-flight state accounting. * * dma_sync_single_range_for_device() is only called for the last fragment when * page_pool is created with PP_FLAG_DMA_SYNC_DEV flag, so it depends on the * last freed fragment to do the sync_for_device operation for all fragments in - * the same page when a page is split, the API user must setup pool->p.max_len + * the same page when a page is split. The API user must setup pool->p.max_len * and pool->p.offset correctly and ensure that page_pool_put_page() is called * with dma_sync_size being -1 for fragment API. */ @@ -55,16 +55,12 @@ #include <net/page_pool/types.h> #ifdef CONFIG_PAGE_POOL_STATS +/* Deprecated driver-facing API, use netlink instead */ int page_pool_ethtool_stats_get_count(void); u8 *page_pool_ethtool_stats_get_strings(u8 *data); u64 *page_pool_ethtool_stats_get(u64 *data, void *stats); -/* - * Drivers that wish to harvest page pool stats and report them to users - * (perhaps via ethtool, debugfs, or another mechanism) can allocate a - * struct page_pool_stats call page_pool_get_stats to get stats for the specified pool. - */ -bool page_pool_get_stats(struct page_pool *pool, +bool page_pool_get_stats(const struct page_pool *pool, struct page_pool_stats *stats); #else static inline int page_pool_ethtool_stats_get_count(void) @@ -214,69 +210,82 @@ inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool) return pool->p.dma_dir; } -/* pp_frag_count represents the number of writers who can update the page - * either by updating skb->data or via DMA mappings for the device. - * We can't rely on the page refcnt for that as we don't know who might be - * holding page references and we can't reliably destroy or sync DMA mappings - * of the fragments. +/** + * page_pool_fragment_page() - split a fresh page into fragments + * @page: page to split + * @nr: references to set + * + * pp_ref_count represents the number of outstanding references to the page, + * which will be freed using page_pool APIs (rather than page allocator APIs + * like put_page()). Such references are usually held by page_pool-aware + * objects like skbs marked for page pool recycling. * - * When pp_frag_count reaches 0 we can either recycle the page if the page - * refcnt is 1 or return it back to the memory allocator and destroy any - * mappings we have. + * This helper allows the caller to take (set) multiple references to a + * freshly allocated page. The page must be freshly allocated (have a + * pp_ref_count of 1). This is commonly done by drivers and + * "fragment allocators" to save atomic operations - either when they know + * upfront how many references they will need; or to take MAX references and + * return the unused ones with a single atomic dec(), instead of performing + * multiple atomic inc() operations. */ static inline void page_pool_fragment_page(struct page *page, long nr) { - atomic_long_set(&page->pp_frag_count, nr); + atomic_long_set(&page->pp_ref_count, nr); } -static inline long page_pool_defrag_page(struct page *page, long nr) +static inline long page_pool_unref_page(struct page *page, long nr) { long ret; - /* If nr == pp_frag_count then we have cleared all remaining + /* If nr == pp_ref_count then we have cleared all remaining * references to the page: * 1. 'n == 1': no need to actually overwrite it. * 2. 'n != 1': overwrite it with one, which is the rare case - * for pp_frag_count draining. + * for pp_ref_count draining. * * The main advantage to doing this is that not only we avoid a atomic * update, as an atomic_read is generally a much cheaper operation than * an atomic update, especially when dealing with a page that may be - * partitioned into only 2 or 3 pieces; but also unify the pp_frag_count + * referenced by only 2 or 3 users; but also unify the pp_ref_count * handling by ensuring all pages have partitioned into only 1 piece * initially, and only overwrite it when the page is partitioned into * more than one piece. */ - if (atomic_long_read(&page->pp_frag_count) == nr) { + if (atomic_long_read(&page->pp_ref_count) == nr) { /* As we have ensured nr is always one for constant case using * the BUILD_BUG_ON(), only need to handle the non-constant case - * here for pp_frag_count draining, which is a rare case. + * here for pp_ref_count draining, which is a rare case. */ BUILD_BUG_ON(__builtin_constant_p(nr) && nr != 1); if (!__builtin_constant_p(nr)) - atomic_long_set(&page->pp_frag_count, 1); + atomic_long_set(&page->pp_ref_count, 1); return 0; } - ret = atomic_long_sub_return(nr, &page->pp_frag_count); + ret = atomic_long_sub_return(nr, &page->pp_ref_count); WARN_ON(ret < 0); - /* We are the last user here too, reset pp_frag_count back to 1 to + /* We are the last user here too, reset pp_ref_count back to 1 to * ensure all pages have been partitioned into 1 piece initially, * this should be the rare case when the last two fragment users call - * page_pool_defrag_page() currently. + * page_pool_unref_page() currently. */ if (unlikely(!ret)) - atomic_long_set(&page->pp_frag_count, 1); + atomic_long_set(&page->pp_ref_count, 1); return ret; } -static inline bool page_pool_is_last_frag(struct page *page) +static inline void page_pool_ref_page(struct page *page) +{ + atomic_long_inc(&page->pp_ref_count); +} + +static inline bool page_pool_is_last_ref(struct page *page) { - /* If page_pool_defrag_page() returns 0, we were the last user */ - return page_pool_defrag_page(page, 1) == 0; + /* If page_pool_unref_page() returns 0, we were the last user */ + return page_pool_unref_page(page, 1) == 0; } /** @@ -301,10 +310,10 @@ static inline void page_pool_put_page(struct page_pool *pool, * allow registering MEM_TYPE_PAGE_POOL, but shield linker. */ #ifdef CONFIG_PAGE_POOL - if (!page_pool_is_last_frag(page)) + if (!page_pool_is_last_ref(page)) return; - page_pool_put_defragged_page(pool, page, dma_sync_size, allow_direct); + page_pool_put_unrefed_page(pool, page, dma_sync_size, allow_direct); #endif } diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index 6fc5134095ed..76481c465375 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -5,6 +5,7 @@ #include <linux/dma-direction.h> #include <linux/ptr_ring.h> +#include <linux/types.h> #define PP_FLAG_DMA_MAP BIT(0) /* Should page_pool do the DMA * map/unmap @@ -48,24 +49,30 @@ struct pp_alloc_cache { * @pool_size: size of the ptr_ring * @nid: NUMA node id to allocate from pages from * @dev: device, for DMA pre-mapping purposes + * @netdev: netdev this pool will serve (leave as NULL if none or multiple) * @napi: NAPI which is the sole consumer of pages, otherwise NULL * @dma_dir: DMA mapping direction * @max_len: max DMA sync memory size for PP_FLAG_DMA_SYNC_DEV * @offset: DMA sync address offset for PP_FLAG_DMA_SYNC_DEV */ struct page_pool_params { - unsigned int flags; - unsigned int order; - unsigned int pool_size; - int nid; - struct device *dev; - struct napi_struct *napi; - enum dma_data_direction dma_dir; - unsigned int max_len; - unsigned int offset; + struct_group_tagged(page_pool_params_fast, fast, + unsigned int flags; + unsigned int order; + unsigned int pool_size; + int nid; + struct device *dev; + struct napi_struct *napi; + enum dma_data_direction dma_dir; + unsigned int max_len; + unsigned int offset; + ); + struct_group_tagged(page_pool_params_slow, slow, + struct net_device *netdev; /* private: used by test code only */ - void (*init_callback)(struct page *page, void *arg); - void *init_arg; + void (*init_callback)(struct page *page, void *arg); + void *init_arg; + ); }; #ifdef CONFIG_PAGE_POOL_STATS @@ -119,7 +126,9 @@ struct page_pool_stats { #endif struct page_pool { - struct page_pool_params p; + struct page_pool_params_fast p; + + bool has_init_callback; long frag_users; struct page *frag_page; @@ -178,6 +187,16 @@ struct page_pool { refcount_t user_cnt; u64 destroy_cnt; + + /* Slow/Control-path information follows */ + struct page_pool_params_slow slow; + /* User-facing fields, protected by page_pools_lock */ + struct { + struct hlist_node list; + u64 detach_time; + u32 napi_id; + u32 id; + } user; }; struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp); @@ -215,9 +234,9 @@ static inline void page_pool_put_page_bulk(struct page_pool *pool, void **data, } #endif -void page_pool_put_defragged_page(struct page_pool *pool, struct page *page, - unsigned int dma_sync_size, - bool allow_direct); +void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, + bool allow_direct); static inline bool is_page_pool_compiled_in(void) { diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index a76c9171db0e..f308e8268651 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -154,12 +154,6 @@ __cls_set_class(unsigned long *clp, unsigned long cl) return xchg(clp, cl); } -static inline void tcf_set_drop_reason(struct tcf_result *res, - enum skb_drop_reason reason) -{ - res->drop_reason = reason; -} - static inline void __tcf_bind_filter(struct Qdisc *q, struct tcf_result *r, unsigned long base) { diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 9fa1d0794dfa..1e200d9a066d 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -275,24 +275,6 @@ static inline void skb_txtime_consumed(struct sk_buff *skb) skb->tstamp = ktime_set(0, 0); } -struct tc_skb_cb { - struct qdisc_skb_cb qdisc_cb; - - u16 mru; - u8 post_ct:1; - u8 post_ct_snat:1; - u8 post_ct_dnat:1; - u16 zone; /* Only valid if post_ct = true */ -}; - -static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb) -{ - struct tc_skb_cb *cb = (struct tc_skb_cb *)skb->cb; - - BUILD_BUG_ON(sizeof(*cb) > sizeof_field(struct sk_buff, cb)); - return cb; -} - static inline bool tc_qdisc_stats_dump(struct Qdisc *sch, unsigned long cl, struct qdisc_walker *arg) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index dcb9160e6467..1d70c2c1572f 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -332,7 +332,6 @@ struct tcf_result { }; const struct tcf_proto *goto_tp; }; - enum skb_drop_reason drop_reason; }; struct tcf_chain; @@ -1037,6 +1036,37 @@ static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch) return skb; } +struct tc_skb_cb { + struct qdisc_skb_cb qdisc_cb; + u32 drop_reason; + + u16 zone; /* Only valid if post_ct = true */ + u16 mru; + u8 post_ct:1; + u8 post_ct_snat:1; + u8 post_ct_dnat:1; +}; + +static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb) +{ + struct tc_skb_cb *cb = (struct tc_skb_cb *)skb->cb; + + BUILD_BUG_ON(sizeof(*cb) > sizeof_field(struct sk_buff, cb)); + return cb; +} + +static inline enum skb_drop_reason +tcf_get_drop_reason(const struct sk_buff *skb) +{ + return tc_skb_cb(skb)->drop_reason; +} + +static inline void tcf_set_drop_reason(const struct sk_buff *skb, + enum skb_drop_reason reason) +{ + tc_skb_cb(skb)->drop_reason = reason; +} + /* Instead of calling kfree_skb() while root qdisc lock is held, * queue the skb for future freeing at end of __dev_xmit_skb() */ diff --git a/include/net/sock.h b/include/net/sock.h index 0201136b0b9c..ba6642811db4 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2925,7 +2925,6 @@ extern __u32 sysctl_wmem_max; extern __u32 sysctl_rmem_max; extern int sysctl_tstamp_allow_data; -extern int sysctl_optmem_max; extern __u32 sysctl_wmem_default; extern __u32 sysctl_rmem_default; diff --git a/include/net/tcp.h b/include/net/tcp.h index 144ba48bb07b..f5ca4abaee8b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -490,13 +490,14 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb); /* From syncookies.c */ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct dst_entry *dst, u32 tsoff); -int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, - u32 cookie); + struct dst_entry *dst); +int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th); struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb); struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, - const struct tcp_request_sock_ops *af_ops, - struct sock *sk, struct sk_buff *skb); + struct sock *sk, struct sk_buff *skb, + struct tcp_options_received *tcp_opt, + int mss, u32 tsoff); + #ifdef CONFIG_SYN_COOKIES /* Syncookies use a monotonic timer which increments every 60 seconds. @@ -582,12 +583,15 @@ __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss); u64 cookie_init_timestamp(struct request_sock *req, u64 now); bool cookie_timestamp_decode(const struct net *net, struct tcp_options_received *opt); -bool cookie_ecn_ok(const struct tcp_options_received *opt, - const struct net *net, const struct dst_entry *dst); + +static inline bool cookie_ecn_ok(const struct net *net, const struct dst_entry *dst) +{ + return READ_ONCE(net->ipv4.sysctl_tcp_ecn) || + dst_feature(dst, RTAX_FEATURE_ECN); +} /* From net/ipv6/syncookies.c */ -int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th, - u32 cookie); +int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th); struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb); u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph, diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h index 647781080613..0f2dcc9e8d46 100644 --- a/include/net/tcp_ao.h +++ b/include/net/tcp_ao.h @@ -271,8 +271,7 @@ void tcp_ao_established(struct sock *sk); void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb); void tcp_ao_connect_init(struct sock *sk); void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb, - struct tcp_request_sock *treq, - unsigned short int family, int l3index); + struct request_sock *req, unsigned short int family); #else /* CONFIG_TCP_AO */ static inline int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb, @@ -283,8 +282,7 @@ static inline int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb, } static inline void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb, - struct tcp_request_sock *treq, - unsigned short int family, int l3index) + struct request_sock *req, unsigned short int family) { } diff --git a/include/net/tcp_states.h b/include/net/tcp_states.h index cc00118acca1..d60e8148ff4c 100644 --- a/include/net/tcp_states.h +++ b/include/net/tcp_states.h @@ -22,6 +22,7 @@ enum { TCP_LISTEN, TCP_CLOSING, /* Now a valid state */ TCP_NEW_SYN_RECV, + TCP_BOUND_INACTIVE, /* Pseudo-state for inet_diag */ TCP_MAX_STATES /* Leave at the end! */ }; @@ -43,6 +44,7 @@ enum { TCPF_LISTEN = (1 << TCP_LISTEN), TCPF_CLOSING = (1 << TCP_CLOSING), TCPF_NEW_SYN_RECV = (1 << TCP_NEW_SYN_RECV), + TCPF_BOUND_INACTIVE = (1 << TCP_BOUND_INACTIVE), }; #endif /* _LINUX_TCP_STATES_H */ diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 6a9f8a5f387c..33ba6fc151cf 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -210,22 +210,23 @@ struct vxlan_rdst { }; struct vxlan_config { - union vxlan_addr remote_ip; - union vxlan_addr saddr; - __be32 vni; - int remote_ifindex; - int mtu; - __be16 dst_port; - u16 port_min; - u16 port_max; - u8 tos; - u8 ttl; - __be32 label; - u32 flags; - unsigned long age_interval; - unsigned int addrmax; - bool no_share; - enum ifla_vxlan_df df; + union vxlan_addr remote_ip; + union vxlan_addr saddr; + __be32 vni; + int remote_ifindex; + int mtu; + __be16 dst_port; + u16 port_min; + u16 port_max; + u8 tos; + u8 ttl; + __be32 label; + enum ifla_vxlan_label_policy label_policy; + u32 flags; + unsigned long age_interval; + unsigned int addrmax; + bool no_share; + enum ifla_vxlan_df df; }; enum { diff --git a/include/net/xdp.h b/include/net/xdp.h index 349c36fb5fd8..e6770dd40c91 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -16,7 +16,7 @@ * * The XDP RX-queue info (xdp_rxq_info) is associated with the driver * level RX-ring queues. It is information that is specific to how - * the driver have configured a given RX-ring queue. + * the driver has configured a given RX-ring queue. * * Each xdp_buff frame received in the driver carries a (pointer) * reference to this xdp_rxq_info structure. This provides the XDP @@ -32,7 +32,7 @@ * The struct is not directly tied to the XDP prog. A new XDP prog * can be attached as long as it doesn't change the underlying * RX-ring. If the RX-ring does change significantly, the NIC driver - * naturally need to stop the RX-ring before purging and reallocating + * naturally needs to stop the RX-ring before purging and reallocating * memory. In that process the driver MUST call unregister (which * also applies for driver shutdown and unload). The register API is * also mandatory during RX-ring setup. @@ -369,7 +369,12 @@ xdp_data_meta_unsupported(const struct xdp_buff *xdp) static inline bool xdp_metalen_invalid(unsigned long metalen) { - return (metalen & (sizeof(__u32) - 1)) || (metalen > 32); + unsigned long meta_max; + + meta_max = type_max(typeof_member(struct skb_shared_info, meta_len)); + BUILD_BUG_ON(!__builtin_constant_p(meta_max)); + + return !IS_ALIGNED(metalen, sizeof(u32)) || metalen > meta_max; } struct xdp_attachment_info { @@ -399,6 +404,10 @@ void xdp_attachment_setup(struct xdp_attachment_info *info, NETDEV_XDP_RX_METADATA_HASH, \ bpf_xdp_metadata_rx_hash, \ xmo_rx_hash) \ + XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_VLAN_TAG, \ + NETDEV_XDP_RX_METADATA_VLAN_TAG, \ + bpf_xdp_metadata_rx_vlan_tag, \ + xmo_rx_vlan_tag) \ enum xdp_rx_metadata { #define XDP_METADATA_KFUNC(name, _, __, ___) name, @@ -427,6 +436,7 @@ enum xdp_rss_hash_type { XDP_RSS_L4_UDP = BIT(5), XDP_RSS_L4_SCTP = BIT(6), XDP_RSS_L4_IPSEC = BIT(7), /* L4 based hash include IPSEC SPI */ + XDP_RSS_L4_ICMP = BIT(8), /* Second part: RSS hash type combinations used for driver HW mapping */ XDP_RSS_TYPE_NONE = 0, @@ -442,11 +452,13 @@ enum xdp_rss_hash_type { XDP_RSS_TYPE_L4_IPV4_UDP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_UDP, XDP_RSS_TYPE_L4_IPV4_SCTP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_SCTP, XDP_RSS_TYPE_L4_IPV4_IPSEC = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC, + XDP_RSS_TYPE_L4_IPV4_ICMP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_ICMP, XDP_RSS_TYPE_L4_IPV6_TCP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_TCP, XDP_RSS_TYPE_L4_IPV6_UDP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_UDP, XDP_RSS_TYPE_L4_IPV6_SCTP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_SCTP, XDP_RSS_TYPE_L4_IPV6_IPSEC = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC, + XDP_RSS_TYPE_L4_IPV6_ICMP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_ICMP, XDP_RSS_TYPE_L4_IPV6_TCP_EX = XDP_RSS_TYPE_L4_IPV6_TCP | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L4_IPV6_UDP_EX = XDP_RSS_TYPE_L4_IPV6_UDP | XDP_RSS_L3_DYNHDR, @@ -457,6 +469,8 @@ struct xdp_metadata_ops { int (*xmo_rx_timestamp)(const struct xdp_md *ctx, u64 *timestamp); int (*xmo_rx_hash)(const struct xdp_md *ctx, u32 *hash, enum xdp_rss_hash_type *rss_type); + int (*xmo_rx_vlan_tag)(const struct xdp_md *ctx, __be16 *vlan_proto, + u16 *vlan_tci); }; #ifdef CONFIG_NET diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index f83128007fb0..3cb4dc9bd70e 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -30,6 +30,7 @@ struct xdp_umem { struct user_struct *user; refcount_t users; u8 flags; + u8 tx_metadata_len; bool zc; struct page **pgs; int id; @@ -92,12 +93,105 @@ struct xdp_sock { struct xsk_queue *cq_tmp; /* Only as tmp storage before bind */ }; +/* + * AF_XDP TX metadata hooks for network devices. + * The following hooks can be defined; unless noted otherwise, they are + * optional and can be filled with a null pointer. + * + * void (*tmo_request_timestamp)(void *priv) + * Called when AF_XDP frame requested egress timestamp. + * + * u64 (*tmo_fill_timestamp)(void *priv) + * Called when AF_XDP frame, that had requested egress timestamp, + * received a completion. The hook needs to return the actual HW timestamp. + * + * void (*tmo_request_checksum)(u16 csum_start, u16 csum_offset, void *priv) + * Called when AF_XDP frame requested HW checksum offload. csum_start + * indicates position where checksumming should start. + * csum_offset indicates position where checksum should be stored. + * + */ +struct xsk_tx_metadata_ops { + void (*tmo_request_timestamp)(void *priv); + u64 (*tmo_fill_timestamp)(void *priv); + void (*tmo_request_checksum)(u16 csum_start, u16 csum_offset, void *priv); +}; + #ifdef CONFIG_XDP_SOCKETS int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp); void __xsk_map_flush(void); +/** + * xsk_tx_metadata_to_compl - Save enough relevant metadata information + * to perform tx completion in the future. + * @meta: pointer to AF_XDP metadata area + * @compl: pointer to output struct xsk_tx_metadata_to_compl + * + * This function should be called by the networking device when + * it prepares AF_XDP egress packet. The value of @compl should be stored + * and passed to xsk_tx_metadata_complete upon TX completion. + */ +static inline void xsk_tx_metadata_to_compl(struct xsk_tx_metadata *meta, + struct xsk_tx_metadata_compl *compl) +{ + if (!meta) + return; + + if (meta->flags & XDP_TXMD_FLAGS_TIMESTAMP) + compl->tx_timestamp = &meta->completion.tx_timestamp; + else + compl->tx_timestamp = NULL; +} + +/** + * xsk_tx_metadata_request - Evaluate AF_XDP TX metadata at submission + * and call appropriate xsk_tx_metadata_ops operation. + * @meta: pointer to AF_XDP metadata area + * @ops: pointer to struct xsk_tx_metadata_ops + * @priv: pointer to driver-private aread + * + * This function should be called by the networking device when + * it prepares AF_XDP egress packet. + */ +static inline void xsk_tx_metadata_request(const struct xsk_tx_metadata *meta, + const struct xsk_tx_metadata_ops *ops, + void *priv) +{ + if (!meta) + return; + + if (ops->tmo_request_timestamp) + if (meta->flags & XDP_TXMD_FLAGS_TIMESTAMP) + ops->tmo_request_timestamp(priv); + + if (ops->tmo_request_checksum) + if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) + ops->tmo_request_checksum(meta->request.csum_start, + meta->request.csum_offset, priv); +} + +/** + * xsk_tx_metadata_complete - Evaluate AF_XDP TX metadata at completion + * and call appropriate xsk_tx_metadata_ops operation. + * @compl: pointer to completion metadata produced from xsk_tx_metadata_to_compl + * @ops: pointer to struct xsk_tx_metadata_ops + * @priv: pointer to driver-private aread + * + * This function should be called by the networking device upon + * AF_XDP egress completion. + */ +static inline void xsk_tx_metadata_complete(struct xsk_tx_metadata_compl *compl, + const struct xsk_tx_metadata_ops *ops, + void *priv) +{ + if (!compl) + return; + + *compl->tx_timestamp = ops->tmo_fill_timestamp(priv); +} + #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) @@ -114,6 +208,23 @@ static inline void __xsk_map_flush(void) { } +static inline void xsk_tx_metadata_to_compl(struct xsk_tx_metadata *meta, + struct xsk_tx_metadata_compl *compl) +{ +} + +static inline void xsk_tx_metadata_request(struct xsk_tx_metadata *meta, + const struct xsk_tx_metadata_ops *ops, + void *priv) +{ +} + +static inline void xsk_tx_metadata_complete(struct xsk_tx_metadata_compl *compl, + const struct xsk_tx_metadata_ops *ops, + void *priv) +{ +} + #endif /* CONFIG_XDP_SOCKETS */ #if defined(CONFIG_XDP_SOCKETS) && defined(CONFIG_DEBUG_NET) diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 1f6fc8c7a84c..b62bb8525a5f 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -14,6 +14,12 @@ #ifdef CONFIG_XDP_SOCKETS +struct xsk_cb_desc { + void *src; + u8 off; + u8 bytes; +}; + void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries); bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc); u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max); @@ -47,6 +53,12 @@ static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool, xp_set_rxq_info(pool, rxq); } +static inline void xsk_pool_fill_cb(struct xsk_buff_pool *pool, + struct xsk_cb_desc *desc) +{ + xp_fill_cb(pool, desc); +} + static inline unsigned int xsk_pool_get_napi_id(struct xsk_buff_pool *pool) { #ifdef CONFIG_NET_RX_BUSY_POLL @@ -165,6 +177,30 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) return xp_raw_get_data(pool, addr); } +#define XDP_TXMD_FLAGS_VALID ( \ + XDP_TXMD_FLAGS_TIMESTAMP | \ + XDP_TXMD_FLAGS_CHECKSUM | \ + 0) + +static inline bool xsk_buff_valid_tx_metadata(struct xsk_tx_metadata *meta) +{ + return !(meta->flags & ~XDP_TXMD_FLAGS_VALID); +} + +static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) +{ + struct xsk_tx_metadata *meta; + + if (!pool->tx_metadata_len) + return NULL; + + meta = xp_raw_get_data(pool, addr) - pool->tx_metadata_len; + if (unlikely(!xsk_buff_valid_tx_metadata(meta))) + return NULL; /* no way to signal the error to the user */ + + return meta; +} + static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); @@ -250,6 +286,11 @@ static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool, { } +static inline void xsk_pool_fill_cb(struct xsk_buff_pool *pool, + struct xsk_cb_desc *desc) +{ +} + static inline unsigned int xsk_pool_get_napi_id(struct xsk_buff_pool *pool) { return 0; @@ -324,6 +365,16 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) return NULL; } +static inline bool xsk_buff_valid_tx_metadata(struct xsk_tx_metadata *meta) +{ + return false; +} + +static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) +{ + return NULL; +} + static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool) { } diff --git a/include/net/xfrm.h b/include/net/xfrm.h index c9bb0f892f55..1d107241b901 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -2190,4 +2190,13 @@ static inline int register_xfrm_interface_bpf(void) #endif +#if IS_ENABLED(CONFIG_DEBUG_INFO_BTF) +int register_xfrm_state_bpf(void); +#else +static inline int register_xfrm_state_bpf(void) +{ + return 0; +} +#endif + #endif /* _NET_XFRM_H */ diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index b0bdff26fc88..99dd7376df6a 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -12,6 +12,7 @@ struct xsk_buff_pool; struct xdp_rxq_info; +struct xsk_cb_desc; struct xsk_queue; struct xdp_desc; struct xdp_umem; @@ -33,6 +34,7 @@ struct xdp_buff_xsk { }; #define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb)) +#define XSK_TX_COMPL_FITS(t) BUILD_BUG_ON(sizeof(struct xsk_tx_metadata_compl) > sizeof(t)) struct xsk_dma_map { dma_addr_t *dma_pages; @@ -77,10 +79,12 @@ struct xsk_buff_pool { u32 chunk_size; u32 chunk_shift; u32 frame_len; + u8 tx_metadata_len; /* inherited from umem */ u8 cached_need_wakeup; bool uses_need_wakeup; bool dma_need_sync; bool unaligned; + bool tx_sw_csum; void *addrs; /* Mutual exclusion of the completion ring in the SKB mode. Two cases to protect: * NAPI TX thread and sendmsg error paths in the SKB destructor callback and when @@ -132,6 +136,7 @@ static inline void xp_init_xskb_dma(struct xdp_buff_xsk *xskb, struct xsk_buff_p /* AF_XDP ZC drivers, via xdp_sock_buff.h */ void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq); +void xp_fill_cb(struct xsk_buff_pool *pool, struct xsk_cb_desc *desc); int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, unsigned long attrs, struct page **pages, u32 nr_pages); void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs); @@ -233,4 +238,9 @@ static inline u64 xp_get_handle(struct xdp_buff_xsk *xskb) return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); } +static inline bool xp_tx_metadata_enabled(const struct xsk_buff_pool *pool) +{ + return pool->tx_metadata_len > 0; +} + #endif /* XSK_BUFF_POOL_H_ */ |