diff options
Diffstat (limited to 'include/net')
84 files changed, 2058 insertions, 662 deletions
diff --git a/include/net/act_api.h b/include/net/act_api.h index 4ae0580b63ca..77ee0c657e2c 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -137,6 +137,7 @@ struct tc_action_ops { #ifdef CONFIG_NET_CLS_ACT +#define ACT_P_BOUND 0 #define ACT_P_CREATED 1 #define ACT_P_DELETED 1 @@ -191,7 +192,7 @@ int tcf_idr_create_from_flags(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int bind, u32 flags); -void tcf_idr_insert_many(struct tc_action *actions[]); +void tcf_idr_insert_many(struct tc_action *actions[], int init_res[]); void tcf_idr_cleanup(struct tc_action_net *tn, u32 index); int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index, struct tc_action **a, int bind); @@ -200,6 +201,8 @@ int tcf_idr_release(struct tc_action *a, bool bind); int tcf_register_action(struct tc_action_ops *a, struct pernet_operations *ops); int tcf_unregister_action(struct tc_action_ops *a, struct pernet_operations *ops); +#define NET_ACT_ALIAS_PREFIX "net-act-" +#define MODULE_ALIAS_NET_ACT(kind) MODULE_ALIAS(NET_ACT_ALIAS_PREFIX kind) int tcf_action_destroy(struct tc_action *actions[], int bind); int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions, int nr_actions, struct tcf_result *res); @@ -207,8 +210,7 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, struct tc_action *actions[], int init_res[], size_t *attr_size, u32 flags, u32 fl_flags, struct netlink_ext_ack *extack); -struct tc_action_ops *tc_action_load_ops(struct nlattr *nla, bool police, - bool rtnl_held, +struct tc_action_ops *tc_action_load_ops(struct nlattr *nla, u32 flags, struct netlink_ext_ack *extack); struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 82da55101b5a..9d06eb945509 100644 --- a/include/net/addrconf.h +++ 
b/include/net/addrconf.h @@ -8,8 +8,9 @@ #define MIN_VALID_LIFETIME (2*3600) /* 2 hours */ -#define TEMP_VALID_LIFETIME (7*86400) -#define TEMP_PREFERRED_LIFETIME (86400) +#define TEMP_VALID_LIFETIME (7*86400) /* 1 week */ +#define TEMP_PREFERRED_LIFETIME (86400) /* 24 hours */ +#define REGEN_MIN_ADVANCE (2) /* 2 seconds */ #define REGEN_MAX_RETRY (3) #define MAX_DESYNC_FACTOR (600) @@ -31,17 +32,22 @@ struct prefix_info { __u8 length; __u8 prefix_len; + union __packed { + __u8 flags; + struct __packed { #if defined(__BIG_ENDIAN_BITFIELD) - __u8 onlink : 1, + __u8 onlink : 1, autoconf : 1, reserved : 6; #elif defined(__LITTLE_ENDIAN_BITFIELD) - __u8 reserved : 6, + __u8 reserved : 6, autoconf : 1, onlink : 1; #else #error "Please fix <asm/byteorder.h>" #endif + }; + }; __be32 valid; __be32 prefered; __be32 reserved2; @@ -49,6 +55,9 @@ struct prefix_info { struct in6_addr prefix; }; +/* rfc4861 4.6.2: IPv6 PIO is 32 bytes in size */ +static_assert(sizeof(struct prefix_info) == 32); + #include <linux/ipv6.h> #include <linux/netdevice.h> #include <net/if_inet6.h> @@ -408,7 +417,7 @@ static inline bool ip6_ignore_linkdown(const struct net_device *dev) if (unlikely(!idev)) return true; - return !!idev->cnf.ignore_routes_with_linkdown; + return !!READ_ONCE(idev->cnf.ignore_routes_with_linkdown); } void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp); diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 5531dd08061e..0754c463224a 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -15,6 +15,7 @@ struct key; struct sock; struct socket; struct rxrpc_call; +struct rxrpc_peer; enum rxrpc_abort_reason; enum rxrpc_interruptibility { @@ -41,13 +42,14 @@ void rxrpc_kernel_new_call_notification(struct socket *, rxrpc_notify_new_call_t, rxrpc_discard_new_call_t); struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, - struct sockaddr_rxrpc *srx, + struct rxrpc_peer *peer, struct key *key, unsigned long user_call_ID, s64 tx_total_len, u32 
hard_timeout, gfp_t gfp, rxrpc_notify_rx_t notify_rx, + u16 service_id, bool upgrade, enum rxrpc_interruptibility interruptibility, unsigned int debug_id); @@ -60,9 +62,14 @@ bool rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *, u32, int, enum rxrpc_abort_reason); void rxrpc_kernel_shutdown_call(struct socket *sock, struct rxrpc_call *call); void rxrpc_kernel_put_call(struct socket *sock, struct rxrpc_call *call); -void rxrpc_kernel_get_peer(struct socket *, struct rxrpc_call *, - struct sockaddr_rxrpc *); -bool rxrpc_kernel_get_srtt(struct socket *, struct rxrpc_call *, u32 *); +struct rxrpc_peer *rxrpc_kernel_lookup_peer(struct socket *sock, + struct sockaddr_rxrpc *srx, gfp_t gfp); +void rxrpc_kernel_put_peer(struct rxrpc_peer *peer); +struct rxrpc_peer *rxrpc_kernel_get_peer(struct rxrpc_peer *peer); +struct rxrpc_peer *rxrpc_kernel_get_call_peer(struct socket *sock, struct rxrpc_call *call); +const struct sockaddr_rxrpc *rxrpc_kernel_remote_srx(const struct rxrpc_peer *peer); +const struct sockaddr *rxrpc_kernel_remote_addr(const struct rxrpc_peer *peer); +unsigned int rxrpc_kernel_get_srtt(const struct rxrpc_peer *); int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t, rxrpc_user_attach_call_t, unsigned long, gfp_t, unsigned int); diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 824c258143a3..627ea8e2d915 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -8,21 +8,29 @@ #include <linux/refcount.h> #include <net/sock.h> +#if IS_ENABLED(CONFIG_UNIX) +struct unix_sock *unix_get_socket(struct file *filp); +#else +static inline struct unix_sock *unix_get_socket(struct file *filp) +{ + return NULL; +} +#endif + +extern spinlock_t unix_gc_lock; +extern unsigned int unix_tot_inflight; + void unix_inflight(struct user_struct *user, struct file *fp); void unix_notinflight(struct user_struct *user, struct file *fp); -void unix_destruct_scm(struct sk_buff *skb); -void io_uring_destruct_scm(struct sk_buff *skb); 
void unix_gc(void); -void wait_for_unix_gc(void); -struct sock *unix_get_socket(struct file *filp); +void wait_for_unix_gc(struct scm_fp_list *fpl); + struct sock *unix_peer_get(struct sock *sk); #define UNIX_HASH_MOD (256 - 1) #define UNIX_HASH_SIZE (256 * 2) #define UNIX_HASH_BITS 8 -extern unsigned int unix_tot_inflight; - struct unix_address { refcount_t refcnt; int len; @@ -46,12 +54,6 @@ struct scm_stat { #define UNIXCB(skb) (*(struct unix_skb_parms *)&((skb)->cb)) -#define unix_state_lock(s) spin_lock(&unix_sk(s)->lock) -#define unix_state_unlock(s) spin_unlock(&unix_sk(s)->lock) -#define unix_state_lock_nested(s) \ - spin_lock_nested(&unix_sk(s)->lock, \ - SINGLE_DEPTH_NESTING) - /* The AF_UNIX socket */ struct unix_sock { /* WARNING: sk has to be the first member */ @@ -61,7 +63,7 @@ struct unix_sock { struct mutex iolock, bindlock; struct sock *peer; struct list_head link; - atomic_long_t inflight; + unsigned long inflight; spinlock_t lock; unsigned long gc_flags; #define UNIX_GC_CANDIDATE 0 @@ -75,6 +77,21 @@ struct unix_sock { }; #define unix_sk(ptr) container_of_const(ptr, struct unix_sock, sk) +#define unix_peer(sk) (unix_sk(sk)->peer) + +#define unix_state_lock(s) spin_lock(&unix_sk(s)->lock) +#define unix_state_unlock(s) spin_unlock(&unix_sk(s)->lock) +enum unix_socket_lock_class { + U_LOCK_NORMAL, + U_LOCK_SECOND, /* for double locking, see unix_state_double_lock(). */ + U_LOCK_DIAG, /* used while dumping icons, see sk_diag_dump_icons(). 
*/ +}; + +static inline void unix_state_lock_nested(struct sock *sk, + enum unix_socket_lock_class subclass) +{ + spin_lock_nested(&unix_sk(sk)->lock, subclass); +} #define peer_wait peer_wq.wait diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index e302c0e804d0..535701efc1e5 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -137,7 +137,6 @@ struct vsock_transport { u64 (*stream_rcvhiwat)(struct vsock_sock *); bool (*stream_is_active)(struct vsock_sock *); bool (*stream_allow)(u32 cid, u32 port); - int (*set_rcvlowat)(struct vsock_sock *vsk, int val); /* SEQ_PACKET. */ ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg, @@ -168,6 +167,7 @@ struct vsock_transport { struct vsock_transport_send_notify_data *); /* sk_lock held by the caller */ void (*notify_buffer_size)(struct vsock_sock *, u64 *); + int (*notify_set_rcvlowat)(struct vsock_sock *vsk, int val); /* Shutdown. */ int (*shutdown)(struct vsock_sock *, int); diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 20988623c5cc..8f8dd9173714 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -189,6 +189,7 @@ struct blocked_key { struct smp_csrk { bdaddr_t bdaddr; u8 bdaddr_type; + u8 link_type; u8 type; u8 val[16]; }; @@ -198,6 +199,7 @@ struct smp_ltk { struct rcu_head rcu; bdaddr_t bdaddr; u8 bdaddr_type; + u8 link_type; u8 authenticated; u8 type; u8 enc_size; @@ -212,6 +214,7 @@ struct smp_irk { bdaddr_t rpa; bdaddr_t bdaddr; u8 addr_type; + u8 link_type; u8 val[16]; }; @@ -219,6 +222,8 @@ struct link_key { struct list_head list; struct rcu_head rcu; bdaddr_t bdaddr; + u8 bdaddr_type; + u8 link_type; u8 type; u8 val[HCI_LINK_KEY_SIZE]; u8 pin_len; @@ -534,7 +539,6 @@ struct hci_dev { struct work_struct tx_work; struct delayed_work le_scan_disable; - struct delayed_work le_scan_restart; struct sk_buff_head rx_q; struct sk_buff_head raw_q; @@ -952,7 +956,6 @@ void 
hci_inquiry_cache_flush(struct hci_dev *hdev); /* ----- HCI Connections ----- */ enum { HCI_CONN_AUTH_PEND, - HCI_CONN_REAUTH_PEND, HCI_CONN_ENCRYPT_PEND, HCI_CONN_RSWITCH_PEND, HCI_CONN_MODE_CHANGE_PEND, @@ -1227,11 +1230,11 @@ static inline struct hci_conn *hci_conn_hash_lookup_cis(struct hci_dev *hdev, continue; /* Match CIG ID if set */ - if (cig != BT_ISO_QOS_CIG_UNSET && cig != c->iso_qos.ucast.cig) + if (cig != c->iso_qos.ucast.cig) continue; /* Match CIS ID if set */ - if (id != BT_ISO_QOS_CIS_UNSET && id != c->iso_qos.ucast.cis) + if (id != c->iso_qos.ucast.cis) continue; /* Match destination address if set */ @@ -1293,6 +1296,30 @@ static inline struct hci_conn *hci_conn_hash_lookup_big(struct hci_dev *hdev, } static inline struct hci_conn * +hci_conn_hash_lookup_big_state(struct hci_dev *hdev, __u8 handle, __u16 state) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (bacmp(&c->dst, BDADDR_ANY) || c->type != ISO_LINK || + c->state != state) + continue; + + if (handle == c->iso_qos.bcast.big) { + rcu_read_unlock(); + return c; + } + } + + rcu_read_unlock(); + + return NULL; +} + +static inline struct hci_conn * hci_conn_hash_lookup_pa_sync_big_handle(struct hci_dev *hdev, __u8 big) { struct hci_conn_hash *h = &hdev->conn_hash; diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h index c5e57c6bd873..9ce5ac2bfbad 100644 --- a/include/net/bond_3ad.h +++ b/include/net/bond_3ad.h @@ -54,6 +54,8 @@ typedef enum { AD_MUX_DETACHED, /* mux machine */ AD_MUX_WAITING, /* mux machine */ AD_MUX_ATTACHED, /* mux machine */ + AD_MUX_COLLECTING, /* mux machine */ + AD_MUX_DISTRIBUTING, /* mux machine */ AD_MUX_COLLECTING_DISTRIBUTING /* mux machine */ } mux_states_t; diff --git a/include/net/bond_options.h b/include/net/bond_options.h index 69292ecc0325..473a0147769e 100644 --- a/include/net/bond_options.h +++ b/include/net/bond_options.h @@ -76,6 +76,7 @@ enum { 
BOND_OPT_MISSED_MAX, BOND_OPT_NS_TARGETS, BOND_OPT_PRIO, + BOND_OPT_COUPLED_CONTROL, BOND_OPT_LAST }; diff --git a/include/net/bonding.h b/include/net/bonding.h index 5b8b1b644a2d..b61fb1aa3a56 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -148,6 +148,7 @@ struct bond_params { #if IS_ENABLED(CONFIG_IPV6) struct in6_addr ns_targets[BOND_MAX_NS_TARGETS]; #endif + int coupled_control; /* 2 bytes of padding : see ether_addr_equal_64bits() */ u8 ad_actor_system[ETH_ALEN + 2]; @@ -167,6 +168,7 @@ struct slave { u8 backup:1, /* indicates backup slave. Value corresponds with BOND_STATE_ACTIVE and BOND_STATE_BACKUP */ inactive:1, /* indicates inactive slave */ + rx_disabled:1, /* indicates whether slave's Rx is disabled */ should_notify:1, /* indicates whether the state changed */ should_notify_link:1; /* indicates whether the link changed */ u8 duplex; @@ -568,6 +570,14 @@ static inline void bond_set_slave_inactive_flags(struct slave *slave, bond_set_slave_state(slave, BOND_STATE_BACKUP, notify); if (!slave->bond->params.all_slaves_active) slave->inactive = 1; + if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) + slave->rx_disabled = 1; +} + +static inline void bond_set_slave_tx_disabled_flags(struct slave *slave, + bool notify) +{ + bond_set_slave_state(slave, BOND_STATE_BACKUP, notify); } static inline void bond_set_slave_active_flags(struct slave *slave, @@ -575,6 +585,14 @@ static inline void bond_set_slave_active_flags(struct slave *slave, { bond_set_slave_state(slave, BOND_STATE_ACTIVE, notify); slave->inactive = 0; + if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) + slave->rx_disabled = 0; +} + +static inline void bond_set_slave_rx_enabled_flags(struct slave *slave, + bool notify) +{ + slave->rx_disabled = 0; } static inline bool bond_is_slave_inactive(struct slave *slave) @@ -582,6 +600,11 @@ static inline bool bond_is_slave_inactive(struct slave *slave) return slave->inactive; } +static inline bool bond_is_slave_rx_disabled(struct slave *slave) 
+{ + return slave->rx_disabled; +} + static inline void bond_propose_link_state(struct slave *slave, int state) { slave->link_new_state = state; diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 4dabeb6c76d3..9b09acac538e 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -48,6 +48,10 @@ void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, bool prefer_busy_poll, u16 budget); +void napi_busy_loop_rcu(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg, bool prefer_busy_poll, u16 budget); + #else /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long net_busy_loop_on(void) { diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index b137a33a1b68..2e2be4fd2bb6 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -7,7 +7,7 @@ * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2021, 2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation */ #include <linux/ethtool.h> @@ -52,7 +52,7 @@ * such wiphy can have zero, one, or many virtual interfaces associated with * it, which need to be identified as such by pointing the network interface's * @ieee80211_ptr pointer to a &struct wireless_dev which further describes - * the wireless part of the interface, normally this struct is embedded in the + * the wireless part of the interface. Normally this struct is embedded in the * network interface's private data area. Drivers can optionally allow creating * or destroying virtual interfaces on the fly, but without at least one or the * ability to create some the wireless device isn't useful. @@ -117,6 +117,14 @@ struct wiphy; * This may be due to the driver or due to regulatory bandwidth * restrictions. * @IEEE80211_CHAN_NO_EHT: EHT operation is not permitted on this channel. 
+ * @IEEE80211_CHAN_DFS_CONCURRENT: See %NL80211_RRF_DFS_CONCURRENT + * @IEEE80211_CHAN_NO_6GHZ_VLP_CLIENT: Client connection with VLP AP + * not permitted using this channel + * @IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT: Client connection with AFC AP + * not permitted using this channel + * @IEEE80211_CHAN_CAN_MONITOR: This channel can be used for monitor + * mode even in the presence of other (regulatory) restrictions, + * even if it is otherwise disabled. */ enum ieee80211_channel_flags { IEEE80211_CHAN_DISABLED = 1<<0, @@ -140,6 +148,10 @@ enum ieee80211_channel_flags { IEEE80211_CHAN_16MHZ = 1<<18, IEEE80211_CHAN_NO_320MHZ = 1<<19, IEEE80211_CHAN_NO_EHT = 1<<20, + IEEE80211_CHAN_DFS_CONCURRENT = 1<<21, + IEEE80211_CHAN_NO_6GHZ_VLP_CLIENT = 1<<22, + IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT = 1<<23, + IEEE80211_CHAN_CAN_MONITOR = 1<<24, }; #define IEEE80211_CHAN_NO_HT40 \ @@ -800,6 +812,9 @@ struct key_params { * chan will define the primary channel and all other * parameters are ignored. * @freq1_offset: offset from @center_freq1, in KHz + * @punctured: mask of the punctured 20 MHz subchannels, with + * bits turned on being disabled (punctured); numbered + * from lower to higher frequency (like in the spec) */ struct cfg80211_chan_def { struct ieee80211_channel *chan; @@ -808,6 +823,7 @@ struct cfg80211_chan_def { u32 center_freq2; struct ieee80211_edmg edmg; u16 freq1_offset; + u16 punctured; }; /* @@ -948,7 +964,8 @@ cfg80211_chandef_identical(const struct cfg80211_chan_def *chandef1, chandef1->width == chandef2->width && chandef1->center_freq1 == chandef2->center_freq1 && chandef1->freq1_offset == chandef2->freq1_offset && - chandef1->center_freq2 == chandef2->center_freq2); + chandef1->center_freq2 == chandef2->center_freq2 && + chandef1->punctured == chandef2->punctured); } /** @@ -977,6 +994,15 @@ cfg80211_chandef_compatible(const struct cfg80211_chan_def *chandef1, const struct cfg80211_chan_def *chandef2); /** + * nl80211_chan_width_to_mhz - get the channel width in 
MHz + * @chan_width: the channel width from &enum nl80211_chan_width + * + * Return: channel width in MHz if the chan_width from &enum nl80211_chan_width + * is valid. -1 otherwise. + */ +int nl80211_chan_width_to_mhz(enum nl80211_chan_width chan_width); + +/** * cfg80211_chandef_valid - check if a channel definition is valid * @chandef: the channel definition to check * Return: %true if the channel definition is valid. %false otherwise. @@ -1031,6 +1057,20 @@ cfg80211_chandef_dfs_cac_time(struct wiphy *wiphy, const struct cfg80211_chan_def *chandef); /** + * cfg80211_chandef_primary - calculate primary 40/80/160 MHz freq + * @chandef: chandef to calculate for + * @primary_chan_width: primary channel width to calculate center for + * @punctured: punctured sub-channel bitmap, will be recalculated + * according to the new bandwidth, can be %NULL + * + * Returns: the primary 40/80/160 MHz channel center frequency, or -1 + * for errors, updating the punctured bitmap + */ +int cfg80211_chandef_primary(const struct cfg80211_chan_def *chandef, + enum nl80211_chan_width primary_chan_width, + u16 *punctured); + +/** * nl80211_send_chandef - sends the channel definition. * @msg: the msg to send channel definition * @chandef: the channel definition to check @@ -1440,9 +1480,6 @@ struct cfg80211_unsol_bcast_probe_resp { * @fils_discovery: FILS discovery transmission parameters * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters * @mbssid_config: AP settings for multiple bssid - * @punct_bitmap: Preamble puncturing bitmap. Each bit represents - * a 20 MHz channel, lowest bit corresponding to the lowest channel. - * Bit set to 1 indicates that the channel is punctured. 
*/ struct cfg80211_ap_settings { struct cfg80211_chan_def chandef; @@ -1477,7 +1514,6 @@ struct cfg80211_ap_settings { struct cfg80211_fils_discovery fils_discovery; struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp; struct cfg80211_mbssid_config mbssid_config; - u16 punct_bitmap; }; @@ -1511,9 +1547,8 @@ struct cfg80211_ap_update { * @radar_required: whether radar detection is required on the new channel * @block_tx: whether transmissions should be blocked while changing * @count: number of beacons until switch - * @punct_bitmap: Preamble puncturing bitmap. Each bit represents - * a 20 MHz channel, lowest bit corresponding to the lowest channel. - * Bit set to 1 indicates that the channel is punctured. + * @link_id: defines the link on which channel switch is expected during + * MLO. 0 in case of non-MLO. */ struct cfg80211_csa_settings { struct cfg80211_chan_def chandef; @@ -1526,7 +1561,7 @@ struct cfg80211_csa_settings { bool radar_required; bool block_tx; u8 count; - u16 punct_bitmap; + u8 link_id; }; /** @@ -1665,6 +1700,21 @@ struct link_station_del_parameters { }; /** + * struct cfg80211_ttlm_params: TID to link mapping parameters + * + * Used for setting a TID to link mapping. + * + * @dlink: Downlink TID to link mapping, as defined in section 9.4.2.314 + * (TID-To-Link Mapping element) in Draft P802.11be_D4.0. + * @ulink: Uplink TID to link mapping, as defined in section 9.4.2.314 + * (TID-To-Link Mapping element) in Draft P802.11be_D4.0. + */ +struct cfg80211_ttlm_params { + u16 dlink[8]; + u16 ulink[8]; +}; + +/** * struct station_parameters - station parameters * * Used to change and create a new station. 
@@ -1734,11 +1784,15 @@ struct station_parameters { * @subtype: Management frame subtype to use for indicating removal * (10 = Disassociation, 12 = Deauthentication) * @reason_code: Reason code for the Disassociation/Deauthentication frame + * @link_id: Link ID indicating a link that stations to be flushed must be + * using; valid only for MLO, but can also be -1 for MLO to really + * remove all stations. */ struct station_del_parameters { const u8 *mac; u8 subtype; u16 reason_code; + int link_id; }; /** @@ -2560,7 +2614,7 @@ struct cfg80211_scan_info { * @short_ssid: short ssid to scan for * @bssid: bssid to scan for * @channel_idx: idx of the channel in the channel array in the scan request - * which the above info relvant to + * which the above info is relevant to * @unsolicited_probe: the AP transmits unsolicited probe response every 20 TU * @short_ssid_valid: @short_ssid is valid and can be used * @psc_no_listen: when set, and the channel is a PSC channel, no need to wait @@ -2608,6 +2662,8 @@ struct cfg80211_scan_6ghz_params { * @n_6ghz_params: number of 6 GHz params * @scan_6ghz_params: 6 GHz params * @bssid: BSSID to scan for (most commonly, the wildcard BSSID) + * @tsf_report_link_id: for MLO, indicates the link ID of the BSS that should be + * used for TSF reporting. Can be set to -1 to indicate no preference. 
*/ struct cfg80211_scan_request { struct cfg80211_ssid *ssids; @@ -2636,6 +2692,7 @@ struct cfg80211_scan_request { bool scan_6ghz; u32 n_6ghz_params; struct cfg80211_scan_6ghz_params *scan_6ghz_params; + s8 tsf_report_link_id; /* keep last */ struct ieee80211_channel *channels[] __counted_by(n_channels); @@ -2660,19 +2717,11 @@ static inline void get_random_mask_addr(u8 *buf, const u8 *addr, const u8 *mask) * @bssid: BSSID to be matched; may be all-zero BSSID in case of SSID match * or no match (RSSI only) * @rssi_thold: don't report scan results below this threshold (in s32 dBm) - * @per_band_rssi_thold: Minimum rssi threshold for each band to be applied - * for filtering out scan results received. Drivers advertise this support - * of band specific rssi based filtering through the feature capability - * %NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD. These band - * specific rssi thresholds take precedence over rssi_thold, if specified. - * If not specified for any band, it will be assigned with rssi_thold of - * corresponding matchset. */ struct cfg80211_match_set { struct cfg80211_ssid ssid; u8 bssid[ETH_ALEN]; s32 rssi_thold; - s32 per_band_rssi_thold[NUM_NL80211_BANDS]; }; /** @@ -2816,6 +2865,13 @@ enum cfg80211_signal_type { * the BSS that requested the scan in which the beacon/probe was received. * @chains: bitmask for filled values in @chain_signal. * @chain_signal: per-chain signal strength of last received BSS in dBm. + * @restrict_use: restrict usage, if not set, assume @use_for is + * %NL80211_BSS_USE_FOR_NORMAL. 
+ * @use_for: bitmap of possible usage for this BSS, see + * &enum nl80211_bss_use_for + * @cannot_use_reasons: the reasons (bitmap) for not being able to connect, + * if @restrict_use is set and @use_for is zero (empty); may be 0 for + * unspecified reasons; see &enum nl80211_bss_cannot_use_reasons * @drv_data: Data to be passed through to @inform_bss */ struct cfg80211_inform_bss { @@ -2827,6 +2883,9 @@ struct cfg80211_inform_bss { u8 chains; s8 chain_signal[IEEE80211_MAX_CHAINS]; + u8 restrict_use:1, use_for:7; + u8 cannot_use_reasons; + void *drv_data; }; @@ -2865,6 +2924,8 @@ struct cfg80211_bss_ies { * own the beacon_ies, but they're just pointers to the ones from the * @hidden_beacon_bss struct) * @proberesp_ies: the information elements from the last Probe Response frame + * @proberesp_ecsa_stuck: ECSA element is stuck in the Probe Response frame, + * cannot rely on it having valid data * @hidden_beacon_bss: in case this BSS struct represents a probe response from * a BSS that hides the SSID in its beacon, this points to the BSS struct * that holds the beacon data. @beacon_ies is still valid, of course, and @@ -2878,6 +2939,11 @@ struct cfg80211_bss_ies { * @chain_signal: per-chain signal strength of last received BSS in dBm. 
* @bssid_index: index in the multiple BSS set * @max_bssid_indicator: max number of members in the BSS set + * @use_for: bitmap of possible usage for this BSS, see + * &enum nl80211_bss_use_for + * @cannot_use_reasons: the reasons (bitmap) for not being able to connect, + * if @restrict_use is set and @use_for is zero (empty); may be 0 for + * unspecified reasons; see &enum nl80211_bss_cannot_use_reasons * @priv: private area for driver use, has at least wiphy->bss_priv_size bytes */ struct cfg80211_bss { @@ -2900,9 +2966,14 @@ struct cfg80211_bss { u8 chains; s8 chain_signal[IEEE80211_MAX_CHAINS]; + u8 proberesp_ecsa_stuck:1; + u8 bssid_index; u8 max_bssid_indicator; + u8 use_for; + u8 cannot_use_reasons; + u8 priv[] __aligned(sizeof(void *)); }; @@ -3006,6 +3077,7 @@ struct cfg80211_assoc_link { * @CONNECT_REQ_MLO_SUPPORT: Userspace indicates support for handling MLD links. * Drivers shall disable MLO features for the current association if this * flag is not set. + * @ASSOC_REQ_SPP_AMSDU: SPP A-MSDUs will be used on this connection (if any) */ enum cfg80211_assoc_req_flags { ASSOC_REQ_DISABLE_HT = BIT(0), @@ -3015,6 +3087,7 @@ enum cfg80211_assoc_req_flags { ASSOC_REQ_DISABLE_HE = BIT(4), ASSOC_REQ_DISABLE_EHT = BIT(5), CONNECT_REQ_MLO_SUPPORT = BIT(6), + ASSOC_REQ_SPP_AMSDU = BIT(7), }; /** @@ -3180,8 +3253,8 @@ struct cfg80211_ibss_params { * * @behaviour: requested BSS selection behaviour. * @param: parameters for requestion behaviour. - * @band_pref: preferred band for %NL80211_BSS_SELECT_ATTR_BAND_PREF. - * @adjust: parameters for %NL80211_BSS_SELECT_ATTR_RSSI_ADJUST. + * @param.band_pref: preferred band for %NL80211_BSS_SELECT_ATTR_BAND_PREF. + * @param.adjust: parameters for %NL80211_BSS_SELECT_ATTR_RSSI_ADJUST. 
*/ struct cfg80211_bss_selection { enum nl80211_bss_select_attr behaviour; @@ -3539,12 +3612,15 @@ struct cfg80211_wowlan_nd_info { * @tcp_connlost: TCP connection lost or failed to establish * @tcp_nomoretokens: TCP data ran out of tokens * @net_detect: if not %NULL, woke up because of net detect + * @unprot_deauth_disassoc: woke up due to unprotected deauth or + * disassoc frame (in MFP). */ struct cfg80211_wowlan_wakeup { bool disconnect, magic_pkt, gtk_rekey_failure, eap_identity_req, four_way_handshake, rfkill_release, packet_80211, - tcp_match, tcp_connlost, tcp_nomoretokens; + tcp_match, tcp_connlost, tcp_nomoretokens, + unprot_deauth_disassoc; s32 pattern_idx; u32 packet_present_len, packet_len; const void *packet; @@ -4493,6 +4569,7 @@ struct mgmt_frame_regs { * @del_link_station: Remove a link of a station. * * @set_hw_timestamp: Enable/disable HW timestamping of TM/FTM frames. + * @set_ttlm: set the TID to link mapping. */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -4852,6 +4929,8 @@ struct cfg80211_ops { struct link_station_del_parameters *params); int (*set_hw_timestamp)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_set_hw_timestamp *hwts); + int (*set_ttlm)(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_ttlm_params *params); }; /* @@ -4863,7 +4942,7 @@ struct cfg80211_ops { * enum wiphy_flags - wiphy capability flags * * @WIPHY_FLAG_SPLIT_SCAN_6GHZ: if set to true, the scan request will be split - * into two, first for legacy bands and second for UHB. + * into two, first for legacy bands and second for 6 GHz. * @WIPHY_FLAG_NETNS_OK: if not set, do not allow changing the netns of this * wiphy at all * @WIPHY_FLAG_PS_ON_BY_DEFAULT: if set to true, powersave will be enabled @@ -4910,6 +4989,8 @@ struct cfg80211_ops { * NL80211_REGDOM_SET_BY_DRIVER. 
* @WIPHY_FLAG_CHANNEL_CHANGE_ON_BEACON: reg_call_notifier() is called if driver * set this flag to update channels on beacon hints. + * @WIPHY_FLAG_SUPPORTS_NSTR_NONPRIMARY: support connection to non-primary link + * of an NSTR mobile AP MLD. */ enum wiphy_flags { WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK = BIT(0), @@ -4923,7 +5004,7 @@ enum wiphy_flags { WIPHY_FLAG_IBSS_RSN = BIT(8), WIPHY_FLAG_MESH_AUTH = BIT(10), WIPHY_FLAG_SUPPORTS_EXT_KCK_32 = BIT(11), - /* use hole at 12 */ + WIPHY_FLAG_SUPPORTS_NSTR_NONPRIMARY = BIT(12), WIPHY_FLAG_SUPPORTS_FW_ROAM = BIT(13), WIPHY_FLAG_AP_UAPSD = BIT(14), WIPHY_FLAG_SUPPORTS_TDLS = BIT(15), @@ -6013,7 +6094,6 @@ void wiphy_delayed_work_flush(struct wiphy *wiphy, * wireless device if it has no netdev * @u: union containing data specific to @iftype * @connected: indicates if connected or not (STA mode) - * @bssid: (private) Used by the internal configuration code * @wext: (private) Used by the internal wireless extensions compat code * @wext.ibss: (private) IBSS data part of wext handling * @wext.connect: (private) connection handling data @@ -6033,8 +6113,6 @@ void wiphy_delayed_work_flush(struct wiphy *wiphy, * @mgmt_registrations: list of registrations for management frames * @mgmt_registrations_need_update: mgmt registrations were updated, * need to propagate the update to the driver - * @beacon_interval: beacon interval used on this device for transmitting - * beacons, 0 when not valid * @address: The address for this device, valid only if @netdev is %NULL * @is_running: true if this is a non-netdev device that has been started, e.g. * the P2P Device. 
@@ -6145,7 +6223,7 @@ struct wireless_dev { int beacon_interval; struct cfg80211_chan_def preset_chandef; struct cfg80211_chan_def chandef; - u8 id[IEEE80211_MAX_SSID_LEN]; + u8 id[IEEE80211_MAX_MESH_ID_LEN]; u8 id_len, id_up_len; } mesh; struct { @@ -6793,13 +6871,45 @@ cfg80211_find_vendor_ie(unsigned int oui, int oui_type, } /** + * enum cfg80211_rnr_iter_ret - reduced neighbor report iteration state + * @RNR_ITER_CONTINUE: continue iterating with the next entry + * @RNR_ITER_BREAK: break iteration and return success + * @RNR_ITER_ERROR: break iteration and return error + */ +enum cfg80211_rnr_iter_ret { + RNR_ITER_CONTINUE, + RNR_ITER_BREAK, + RNR_ITER_ERROR, +}; + +/** + * cfg80211_iter_rnr - iterate reduced neighbor report entries + * @elems: the frame elements to iterate RNR elements and then + * their entries in + * @elems_len: length of the elements + * @iter: iteration function, see also &enum cfg80211_rnr_iter_ret + * for the return value + * @iter_data: additional data passed to the iteration function + * Return: %true on success (after successfully iterating all entries + * or if the iteration function returned %RNR_ITER_BREAK), + * %false on error (iteration function returned %RNR_ITER_ERROR + * or elements were malformed.) 
+ */ +bool cfg80211_iter_rnr(const u8 *elems, size_t elems_len, + enum cfg80211_rnr_iter_ret + (*iter)(void *data, u8 type, + const struct ieee80211_neighbor_ap_info *info, + const u8 *tbtt_info, u8 tbtt_info_len), + void *iter_data); + +/** * cfg80211_defragment_element - Defrag the given element data into a buffer * * @elem: the element to defragment * @ies: elements where @elem is contained * @ieslen: length of @ies - * @data: buffer to store element data - * @data_len: length of @data + * @data: buffer to store element data, or %NULL to just determine size + * @data_len: length of @data, or 0 * @frag_id: the element ID of fragments * * Return: length of @data, or -EINVAL on error @@ -7097,11 +7207,13 @@ size_t cfg80211_merge_profile(const u8 *ie, size_t ielen, * from a beacon or probe response * @CFG80211_BSS_FTYPE_BEACON: data comes from a beacon * @CFG80211_BSS_FTYPE_PRESP: data comes from a probe response + * @CFG80211_BSS_FTYPE_S1G_BEACON: data comes from an S1G beacon */ enum cfg80211_bss_frame_type { CFG80211_BSS_FTYPE_UNKNOWN, CFG80211_BSS_FTYPE_BEACON, CFG80211_BSS_FTYPE_PRESP, + CFG80211_BSS_FTYPE_S1G_BEACON, }; /** @@ -7116,6 +7228,23 @@ int cfg80211_get_ies_channel_number(const u8 *ie, size_t ielen, enum nl80211_band band); /** + * cfg80211_ssid_eq - compare two SSIDs + * @a: first SSID + * @b: second SSID + * + * Return: %true if SSIDs are equal, %false otherwise. + */ +static inline bool +cfg80211_ssid_eq(struct cfg80211_ssid *a, struct cfg80211_ssid *b) +{ + if (WARN_ON(!a || !b)) + return false; + if (a->ssid_len != b->ssid_len) + return false; + return memcmp(a->ssid, b->ssid, a->ssid_len) ? 
false : true; +} + +/** * cfg80211_inform_bss_data - inform cfg80211 of a new BSS * * @wiphy: the wiphy reporting the BSS @@ -7162,6 +7291,25 @@ cfg80211_inform_bss(struct wiphy *wiphy, } /** + * __cfg80211_get_bss - get a BSS reference + * @wiphy: the wiphy this BSS struct belongs to + * @channel: the channel to search on (or %NULL) + * @bssid: the desired BSSID (or %NULL) + * @ssid: the desired SSID (or %NULL) + * @ssid_len: length of the SSID (or 0) + * @bss_type: type of BSS, see &enum ieee80211_bss_type + * @privacy: privacy filter, see &enum ieee80211_privacy + * @use_for: indicates which use is intended + */ +struct cfg80211_bss *__cfg80211_get_bss(struct wiphy *wiphy, + struct ieee80211_channel *channel, + const u8 *bssid, + const u8 *ssid, size_t ssid_len, + enum ieee80211_bss_type bss_type, + enum ieee80211_privacy privacy, + u32 use_for); + +/** * cfg80211_get_bss - get a BSS reference * @wiphy: the wiphy this BSS struct belongs to * @channel: the channel to search on (or %NULL) @@ -7170,13 +7318,20 @@ cfg80211_inform_bss(struct wiphy *wiphy, * @ssid_len: length of the SSID (or 0) * @bss_type: type of BSS, see &enum ieee80211_bss_type * @privacy: privacy filter, see &enum ieee80211_privacy + * + * This version implies regular usage, %NL80211_BSS_USE_FOR_NORMAL. 
*/ -struct cfg80211_bss *cfg80211_get_bss(struct wiphy *wiphy, - struct ieee80211_channel *channel, - const u8 *bssid, - const u8 *ssid, size_t ssid_len, - enum ieee80211_bss_type bss_type, - enum ieee80211_privacy privacy); +static inline struct cfg80211_bss * +cfg80211_get_bss(struct wiphy *wiphy, struct ieee80211_channel *channel, + const u8 *bssid, const u8 *ssid, size_t ssid_len, + enum ieee80211_bss_type bss_type, + enum ieee80211_privacy privacy) +{ + return __cfg80211_get_bss(wiphy, channel, bssid, ssid, ssid_len, + bss_type, privacy, + NL80211_BSS_USE_FOR_NORMAL); +} + static inline struct cfg80211_bss * cfg80211_get_ibss(struct wiphy *wiphy, struct ieee80211_channel *channel, @@ -7270,8 +7425,6 @@ void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr); /** * struct cfg80211_rx_assoc_resp_data - association response data - * @bss: the BSS that association was requested with, ownership of the pointer - * moves to cfg80211 in the call to cfg80211_rx_assoc_resp() * @buf: (Re)Association Response frame (header + body) * @len: length of the frame data * @uapsd_queues: bitmap of queues configured for uapsd. Same format @@ -7281,6 +7434,8 @@ void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr); * @ap_mld_addr: AP MLD address (in case of MLO) * @links: per-link information indexed by link ID, use links[0] for * non-MLO connections + * @links.bss: the BSS that association was requested with, ownership of the + * pointer moves to cfg80211 in the call to cfg80211_rx_assoc_resp() * @links.status: Set this (along with a BSS pointer) for links that * were rejected by the AP. */ @@ -7309,7 +7464,7 @@ struct cfg80211_rx_assoc_resp_data { * This function may sleep. The caller must hold the corresponding wdev's mutex. 
*/ void cfg80211_rx_assoc_resp(struct net_device *dev, - struct cfg80211_rx_assoc_resp_data *data); + const struct cfg80211_rx_assoc_resp_data *data); /** * struct cfg80211_assoc_failure - association failure data @@ -7428,7 +7583,7 @@ void cfg80211_notify_new_peer_candidate(struct net_device *dev, * RFkill integration in cfg80211 is almost invisible to drivers, * as cfg80211 automatically registers an rfkill instance for each * wireless device it knows about. Soft kill is also translated - * into disconnecting and turning all interfaces off, drivers are + * into disconnecting and turning all interfaces off. Drivers are * expected to turn off the device when all interfaces are down. * * However, devices may have a hard RFkill line, in which case they @@ -7476,7 +7631,7 @@ static inline void wiphy_rfkill_stop_polling(struct wiphy *wiphy) * the configuration mechanism. * * A driver supporting vendor commands must register them as an array - * in struct wiphy, with handlers for each one, each command has an + * in struct wiphy, with handlers for each one. Each command has an * OUI and sub command ID to identify it. * * Note that this feature should not be (ab)used to implement protocol @@ -7640,7 +7795,7 @@ static inline void cfg80211_vendor_event(struct sk_buff *skb, gfp_t gfp) * interact with driver-specific tools to aid, for instance, * factory programming. * - * This chapter describes how drivers interact with it, for more + * This chapter describes how drivers interact with it. For more * information see the nl80211 book's chapter on it. */ @@ -8631,14 +8786,13 @@ bool cfg80211_reg_can_beacon_relax(struct wiphy *wiphy, * @dev: the device which switched channels * @chandef: the new channel definition * @link_id: the link ID for MLO, must be 0 for non-MLO - * @punct_bitmap: the new puncturing bitmap * * Caller must hold wiphy mutex, therefore must only be called from sleepable * driver context! 
*/ void cfg80211_ch_switch_notify(struct net_device *dev, struct cfg80211_chan_def *chandef, - unsigned int link_id, u16 punct_bitmap); + unsigned int link_id); /* * cfg80211_ch_switch_started_notify - notify channel switch start @@ -8647,7 +8801,6 @@ void cfg80211_ch_switch_notify(struct net_device *dev, * @link_id: the link ID for MLO, must be 0 for non-MLO * @count: the number of TBTTs until the channel switch happens * @quiet: whether or not immediate quiet was requested by the AP - * @punct_bitmap: the future puncturing bitmap * * Inform the userspace about the channel switch that has just * started, so that it can take appropriate actions (eg. starting @@ -8656,7 +8809,7 @@ void cfg80211_ch_switch_notify(struct net_device *dev, void cfg80211_ch_switch_started_notify(struct net_device *dev, struct cfg80211_chan_def *chandef, unsigned int link_id, u8 count, - bool quiet, u16 punct_bitmap); + bool quiet); /** * ieee80211_operating_class_to_band - convert operating class to band @@ -8670,6 +8823,19 @@ bool ieee80211_operating_class_to_band(u8 operating_class, enum nl80211_band *band); /** + * ieee80211_operating_class_to_chandef - convert operating class to chandef + * + * @operating_class: the operating class to convert + * @chan: the ieee80211_channel to convert + * @chandef: a pointer to the resulting chandef + * + * Returns %true if the conversion was successful, %false otherwise. + */ +bool ieee80211_operating_class_to_chandef(u8 operating_class, + struct ieee80211_channel *chan, + struct cfg80211_chan_def *chandef); + +/** * ieee80211_chandef_to_operating_class - convert chandef to operation class * * @chandef: the chandef to convert @@ -9275,18 +9441,6 @@ static inline int cfg80211_color_change_notify(struct net_device *dev) } /** - * cfg80211_valid_disable_subchannel_bitmap - validate puncturing bitmap - * @bitmap: bitmap to be validated - * @chandef: channel definition - * - * Validate the puncturing bitmap. - * - * Return: %true if the bitmap is valid. 
%false otherwise. - */ -bool cfg80211_valid_disable_subchannel_bitmap(u16 *bitmap, - const struct cfg80211_chan_def *chandef); - -/** * cfg80211_links_removed - Notify about removed STA MLD setup links. * @dev: network device. * @link_mask: BIT mask of removed STA MLD setup link IDs. @@ -9299,4 +9453,60 @@ bool cfg80211_valid_disable_subchannel_bitmap(u16 *bitmap, */ void cfg80211_links_removed(struct net_device *dev, u16 link_mask); +/** + * cfg80211_schedule_channels_check - schedule regulatory check if needed + * @wdev: the wireless device to check + * + * In case the device supports NO_IR or DFS relaxations, schedule regulatory + * channels check, as previous concurrent operation conditions may not + * hold anymore. + */ +void cfg80211_schedule_channels_check(struct wireless_dev *wdev); + +#ifdef CONFIG_CFG80211_DEBUGFS +/** + * wiphy_locked_debugfs_read - do a locked read in debugfs + * @wiphy: the wiphy to use + * @file: the file being read + * @buf: the buffer to fill and then read from + * @bufsize: size of the buffer + * @userbuf: the user buffer to copy to + * @count: read count + * @ppos: read position + * @handler: the read handler to call (under wiphy lock) + * @data: additional data to pass to the read handler + */ +ssize_t wiphy_locked_debugfs_read(struct wiphy *wiphy, struct file *file, + char *buf, size_t bufsize, + char __user *userbuf, size_t count, + loff_t *ppos, + ssize_t (*handler)(struct wiphy *wiphy, + struct file *file, + char *buf, + size_t bufsize, + void *data), + void *data); + +/** + * wiphy_locked_debugfs_write - do a locked write in debugfs + * @wiphy: the wiphy to use + * @file: the file being written to + * @buf: the buffer to copy the user data to + * @bufsize: size of the buffer + * @userbuf: the user buffer to copy from + * @count: read count + * @handler: the write handler to call (under wiphy lock) + * @data: additional data to pass to the write handler + */ +ssize_t wiphy_locked_debugfs_write(struct wiphy *wiphy, struct file 
*file, + char *buf, size_t bufsize, + const char __user *userbuf, size_t count, + ssize_t (*handler)(struct wiphy *wiphy, + struct file *file, + char *buf, + size_t count, + void *data), + void *data); +#endif + #endif /* __NET_CFG80211_H */ diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 3c70ad53a49c..9707ab54fdd5 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -30,6 +30,7 @@ FN(TCP_AOFAILURE) \ FN(SOCKET_BACKLOG) \ FN(TCP_FLAGS) \ + FN(TCP_ABORT_ON_DATA) \ FN(TCP_ZEROWINDOW) \ FN(TCP_OLD_DATA) \ FN(TCP_OVERWINDOW) \ @@ -37,6 +38,7 @@ FN(TCP_RFC7323_PAWS) \ FN(TCP_OLD_SEQUENCE) \ FN(TCP_INVALID_SEQUENCE) \ + FN(TCP_INVALID_ACK_SEQUENCE) \ FN(TCP_RESET) \ FN(TCP_INVALID_SYN) \ FN(TCP_CLOSE) \ @@ -54,6 +56,7 @@ FN(NEIGH_QUEUEFULL) \ FN(NEIGH_DEAD) \ FN(TC_EGRESS) \ + FN(SECURITY_HOOK) \ FN(QDISC_DROP) \ FN(CPU_BACKLOG) \ FN(XDP) \ @@ -85,7 +88,10 @@ FN(IPV6_NDISC_BAD_OPTIONS) \ FN(IPV6_NDISC_NS_OTHERHOST) \ FN(QUEUE_PURGE) \ - FN(TC_ERROR) \ + FN(TC_COOKIE_ERROR) \ + FN(PACKET_SOCK_ERROR) \ + FN(TC_CHAIN_NOTFOUND) \ + FN(TC_RECLASSIFY_LOOP) \ FNe(MAX) /** @@ -102,7 +108,13 @@ enum skb_drop_reason { SKB_CONSUMED, /** @SKB_DROP_REASON_NOT_SPECIFIED: drop reason is not specified */ SKB_DROP_REASON_NOT_SPECIFIED, - /** @SKB_DROP_REASON_NO_SOCKET: socket not found */ + /** + * @SKB_DROP_REASON_NO_SOCKET: no valid socket that can be used. 
+ * Reason could be one of three cases: + * 1) no established/listening socket found during lookup process + * 2) no valid request socket during 3WHS process + * 3) no valid child socket during 3WHS process + */ SKB_DROP_REASON_NO_SOCKET, /** @SKB_DROP_REASON_PKT_TOO_SMALL: packet size is too small */ SKB_DROP_REASON_PKT_TOO_SMALL, @@ -195,6 +207,11 @@ enum skb_drop_reason { /** @SKB_DROP_REASON_TCP_FLAGS: TCP flags invalid */ SKB_DROP_REASON_TCP_FLAGS, /** + * @SKB_DROP_REASON_TCP_ABORT_ON_DATA: abort on data, corresponding to + * LINUX_MIB_TCPABORTONDATA + */ + SKB_DROP_REASON_TCP_ABORT_ON_DATA, + /** * @SKB_DROP_REASON_TCP_ZEROWINDOW: TCP receive window size is zero, * see LINUX_MIB_TCPZEROWINDOWDROP */ @@ -218,13 +235,19 @@ enum skb_drop_reason { SKB_DROP_REASON_TCP_OFOMERGE, /** * @SKB_DROP_REASON_TCP_RFC7323_PAWS: PAWS check, corresponding to - * LINUX_MIB_PAWSESTABREJECTED + * LINUX_MIB_PAWSESTABREJECTED, LINUX_MIB_PAWSACTIVEREJECTED */ SKB_DROP_REASON_TCP_RFC7323_PAWS, /** @SKB_DROP_REASON_TCP_OLD_SEQUENCE: Old SEQ field (duplicate packet) */ SKB_DROP_REASON_TCP_OLD_SEQUENCE, /** @SKB_DROP_REASON_TCP_INVALID_SEQUENCE: Not acceptable SEQ field */ SKB_DROP_REASON_TCP_INVALID_SEQUENCE, + /** + * @SKB_DROP_REASON_TCP_INVALID_ACK_SEQUENCE: Not acceptable ACK SEQ + * field because ack sequence is not in the window between snd_una + * and snd_nxt + */ + SKB_DROP_REASON_TCP_INVALID_ACK_SEQUENCE, /** @SKB_DROP_REASON_TCP_RESET: Invalid RST packet */ SKB_DROP_REASON_TCP_RESET, /** @@ -268,6 +291,8 @@ enum skb_drop_reason { SKB_DROP_REASON_NEIGH_DEAD, /** @SKB_DROP_REASON_TC_EGRESS: dropped in TC egress HOOK */ SKB_DROP_REASON_TC_EGRESS, + /** @SKB_DROP_REASON_SECURITY_HOOK: dropped due to security HOOK */ + SKB_DROP_REASON_SECURITY_HOOK, /** * @SKB_DROP_REASON_QDISC_DROP: dropped by qdisc when packet outputting ( * failed to enqueue to current qdisc) @@ -376,8 +401,23 @@ enum skb_drop_reason { SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST, /** @SKB_DROP_REASON_QUEUE_PURGE: 
bulk free. */ SKB_DROP_REASON_QUEUE_PURGE, - /** @SKB_DROP_REASON_TC_ERROR: generic internal tc error. */ - SKB_DROP_REASON_TC_ERROR, + /** + * @SKB_DROP_REASON_TC_COOKIE_ERROR: An error occurred whilst + * processing a tc ext cookie. + */ + SKB_DROP_REASON_TC_COOKIE_ERROR, + /** + * @SKB_DROP_REASON_PACKET_SOCK_ERROR: generic packet socket errors + * after its filter matches an incoming packet. + */ + SKB_DROP_REASON_PACKET_SOCK_ERROR, + /** @SKB_DROP_REASON_TC_CHAIN_NOTFOUND: tc chain lookup failed. */ + SKB_DROP_REASON_TC_CHAIN_NOTFOUND, + /** + * @SKB_DROP_REASON_TC_RECLASSIFY_LOOP: tc exceeded max reclassify loop + * iterations. + */ + SKB_DROP_REASON_TC_RECLASSIFY_LOOP, /** * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which * shouldn't be used as a real 'reason' - only for tracing code gen diff --git a/include/net/dsa.h b/include/net/dsa.h index 82135fbdb1e6..7c0da9effe4e 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -991,9 +991,9 @@ struct dsa_switch_ops { * Port's MAC EEE settings */ int (*set_mac_eee)(struct dsa_switch *ds, int port, - struct ethtool_eee *e); + struct ethtool_keee *e); int (*get_mac_eee)(struct dsa_switch *ds, int port, - struct ethtool_eee *e); + struct ethtool_keee *e); /* EEPROM access */ int (*get_eeprom_len)(struct dsa_switch *ds); diff --git a/include/net/dst.h b/include/net/dst.h index f5dfc8fb7b37..0aa331bd2fdb 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -390,7 +390,6 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, void dst_init(struct dst_entry *dst, struct dst_ops *ops, struct net_device *dev, int initial_obsolete, unsigned short flags); -struct dst_entry *dst_destroy(struct dst_entry *dst); void dst_dev_put(struct dst_entry *dst); static inline void dst_confirm(struct dst_entry *dst) diff --git a/include/net/eee.h b/include/net/eee.h new file mode 100644 index 000000000000..84837aba3cd9 --- /dev/null +++ b/include/net/eee.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: 
GPL-2.0-only */ +#ifndef _EEE_H +#define _EEE_H + +#include <linux/types.h> + +struct eee_config { + u32 tx_lpi_timer; + bool tx_lpi_enabled; + bool eee_enabled; +}; + +static inline bool eeecfg_mac_can_tx_lpi(const struct eee_config *eeecfg) +{ + /* eee_enabled is the master on/off */ + if (!eeecfg->eee_enabled || !eeecfg->tx_lpi_enabled) + return false; + + return true; +} + +static inline void eeecfg_to_eee(struct ethtool_keee *eee, + const struct eee_config *eeecfg) +{ + eee->tx_lpi_timer = eeecfg->tx_lpi_timer; + eee->tx_lpi_enabled = eeecfg->tx_lpi_enabled; + eee->eee_enabled = eeecfg->eee_enabled; +} + +static inline void eee_to_eeecfg(struct eee_config *eeecfg, + const struct ethtool_keee *eee) +{ + eeecfg->tx_lpi_timer = eee->tx_lpi_timer; + eeecfg->tx_lpi_enabled = eee->tx_lpi_enabled; + eeecfg->eee_enabled = eee->eee_enabled; +} + +#endif diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 82da359bca03..d17855c52ef9 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -172,8 +172,7 @@ void fib_rules_unregister(struct fib_rules_ops *); int fib_rules_lookup(struct fib_rules_ops *, struct flowi *, int flags, struct fib_lookup_arg *); -int fib_default_rule_add(struct fib_rules_ops *, u32 pref, u32 table, - u32 flags); +int fib_default_rule_add(struct fib_rules_ops *, u32 pref, u32 table); bool fib_rule_matchall(const struct fib_rule *rule); int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, struct netlink_ext_ack *extack); diff --git a/include/net/genetlink.h b/include/net/genetlink.h index e18a4c0d69ee..9ece6e5a3ea8 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -8,10 +8,15 @@ #define GENLMSG_DEFAULT_SIZE (NLMSG_DEFAULT_SIZE - GENL_HDRLEN) +/* Binding to multicast group requires %CAP_NET_ADMIN */ +#define GENL_MCAST_CAP_NET_ADMIN BIT(0) +/* Binding to multicast group requires %CAP_SYS_ADMIN */ +#define GENL_MCAST_CAP_SYS_ADMIN BIT(1) + /** * struct genl_multicast_group - 
generic netlink multicast group * @name: name of the multicast group, names are per-family - * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM) + * @flags: GENL_MCAST_* flags */ struct genl_multicast_group { char name[GENL_NAMSIZ]; @@ -36,6 +41,8 @@ struct genl_info; * do additional, common, filtering and return an error * @post_doit: called after an operation's doit callback, it may * undo operations done by pre_doit, for example release locks + * @bind: called when family multicast group is added to a netlink socket + * @unbind: called when family multicast group is removed from a netlink socket * @module: pointer to the owning module (set to THIS_MODULE) * @mcgrps: multicast groups used by this family * @n_mcgrps: number of multicast groups @@ -49,6 +56,9 @@ struct genl_info; * @split_ops: the split do/dump form of operation definition * @n_split_ops: number of entries in @split_ops, not that with split do/dump * ops the number of entries is not the same as number of commands + * @sock_priv_size: the size of per-socket private memory + * @sock_priv_init: the per-socket private memory initializer + * @sock_priv_destroy: the per-socket private memory destructor * * Attribute policies (the combination of @policy and @maxattr fields) * can be attached at the family level or at the operation level. 
@@ -76,17 +86,25 @@ struct genl_family { void (*post_doit)(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); + int (*bind)(int mcgrp); + void (*unbind)(int mcgrp); const struct genl_ops * ops; const struct genl_small_ops *small_ops; const struct genl_split_ops *split_ops; const struct genl_multicast_group *mcgrps; struct module *module; + size_t sock_priv_size; + void (*sock_priv_init)(void *priv); + void (*sock_priv_destroy)(void *priv); + /* private: internal use only */ /* protocol family identifier */ int id; /* starting number of multicast group IDs in this family */ unsigned int mcgrp_offset; + /* list of per-socket privs */ + struct xarray *sock_privs; }; /** @@ -135,7 +153,7 @@ static inline void *genl_info_userhdr(const struct genl_info *info) /* Report that a root attribute is missing */ #define GENL_REQ_ATTR_CHECK(info, attr) ({ \ - struct genl_info *__info = (info); \ + const struct genl_info *__info = (info); \ \ NL_REQ_ATTR_CHECK(__info->extack, NULL, __info->attrs, (attr)); \ }) @@ -296,6 +314,8 @@ static inline bool genl_info_is_ntf(const struct genl_info *info) return !info->nlhdr; } +void *__genl_sk_priv_get(struct genl_family *family, struct sock *sk); +void *genl_sk_priv_get(struct genl_family *family, struct sock *sk); int genl_register_family(struct genl_family *family); int genl_unregister_family(const struct genl_family *family); void genl_notify(const struct genl_family *family, struct sk_buff *skb, @@ -436,6 +456,35 @@ static inline void genlmsg_cancel(struct sk_buff *skb, void *hdr) } /** + * genlmsg_multicast_netns_filtered - multicast a netlink message + * to a specific netns with filter + * function + * @family: the generic netlink family + * @net: the net namespace + * @skb: netlink message as socket buffer + * @portid: own netlink portid to avoid sending to yourself + * @group: offset of multicast group in groups array + * @flags: allocation flags + * @filter: filter function + * @filter_data: filter 
function private data + * + * Return: 0 on success, negative error code for failure. + */ +static inline int +genlmsg_multicast_netns_filtered(const struct genl_family *family, + struct net *net, struct sk_buff *skb, + u32 portid, unsigned int group, gfp_t flags, + netlink_filter_fn filter, + void *filter_data) +{ + if (WARN_ON_ONCE(group >= family->n_mcgrps)) + return -EINVAL; + group = family->mcgrp_offset + group; + return nlmsg_multicast_filtered(net->genl_sock, skb, portid, group, + flags, filter, filter_data); +} + +/** * genlmsg_multicast_netns - multicast a netlink message to a specific netns * @family: the generic netlink family * @net: the net namespace @@ -448,10 +497,8 @@ static inline int genlmsg_multicast_netns(const struct genl_family *family, struct net *net, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags) { - if (WARN_ON_ONCE(group >= family->n_mcgrps)) - return -EINVAL; - group = family->mcgrp_offset + group; - return nlmsg_multicast(net->genl_sock, skb, portid, group, flags); + return genlmsg_multicast_netns_filtered(family, net, skb, portid, + group, flags, NULL, NULL); } /** diff --git a/include/net/gro.h b/include/net/gro.h index b435f0ddbf64..d6fc8fbd3730 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -9,6 +9,7 @@ #include <net/ip6_checksum.h> #include <linux/skbuff.h> #include <net/udp.h> +#include <net/hotdata.h> struct napi_gro_cb { union { @@ -139,21 +140,16 @@ static inline void skb_gro_pull(struct sk_buff *skb, unsigned int len) NAPI_GRO_CB(skb)->data_offset += len; } -static inline void *skb_gro_header_fast(struct sk_buff *skb, +static inline void *skb_gro_header_fast(const struct sk_buff *skb, unsigned int offset) { return NAPI_GRO_CB(skb)->frag0 + offset; } -static inline int skb_gro_header_hard(struct sk_buff *skb, unsigned int hlen) +static inline bool skb_gro_may_pull(const struct sk_buff *skb, + unsigned int hlen) { - return NAPI_GRO_CB(skb)->frag0_len < hlen; -} - -static inline void 
skb_gro_frag0_invalidate(struct sk_buff *skb) -{ - NAPI_GRO_CB(skb)->frag0 = NULL; - NAPI_GRO_CB(skb)->frag0_len = 0; + return likely(hlen <= NAPI_GRO_CB(skb)->frag0_len); } static inline void *skb_gro_header_slow(struct sk_buff *skb, unsigned int hlen, @@ -162,28 +158,30 @@ static inline void *skb_gro_header_slow(struct sk_buff *skb, unsigned int hlen, if (!pskb_may_pull(skb, hlen)) return NULL; - skb_gro_frag0_invalidate(skb); return skb->data + offset; } -static inline void *skb_gro_header(struct sk_buff *skb, - unsigned int hlen, unsigned int offset) +static inline void *skb_gro_header(struct sk_buff *skb, unsigned int hlen, + unsigned int offset) { void *ptr; ptr = skb_gro_header_fast(skb, offset); - if (skb_gro_header_hard(skb, hlen)) + if (!skb_gro_may_pull(skb, hlen)) ptr = skb_gro_header_slow(skb, hlen, offset); return ptr; } -static inline void *skb_gro_network_header(struct sk_buff *skb) +static inline void *skb_gro_network_header(const struct sk_buff *skb) { - return (NAPI_GRO_CB(skb)->frag0 ?: skb->data) + - skb_network_offset(skb); + if (skb_gro_may_pull(skb, skb_gro_offset(skb))) + return skb_gro_header_fast(skb, skb_network_offset(skb)); + + return skb_network_header(skb); } -static inline __wsum inet_gro_compute_pseudo(struct sk_buff *skb, int proto) +static inline __wsum inet_gro_compute_pseudo(const struct sk_buff *skb, + int proto) { const struct iphdr *iph = skb_gro_network_header(skb); @@ -421,7 +419,8 @@ static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb) return uh; } -static inline __wsum ip6_gro_compute_pseudo(struct sk_buff *skb, int proto) +static inline __wsum ip6_gro_compute_pseudo(const struct sk_buff *skb, + int proto) { const struct ipv6hdr *iph = skb_gro_network_header(skb); @@ -448,7 +447,7 @@ static inline void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, { list_add_tail(&skb->list, &napi->rx_list); napi->rx_count += segs; - if (napi->rx_count >= READ_ONCE(gro_normal_batch)) + if (napi->rx_count >= 
READ_ONCE(net_hotdata.gro_normal_batch)) gro_normal_list(napi); } @@ -495,6 +494,4 @@ static inline void inet6_get_iif_sdif(const struct sk_buff *skb, int *iif, int * #endif } -extern struct list_head offload_base; - #endif /* _NET_IPV6_GRO_H */ diff --git a/include/net/hotdata.h b/include/net/hotdata.h new file mode 100644 index 000000000000..003667a1efd6 --- /dev/null +++ b/include/net/hotdata.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _NET_HOTDATA_H +#define _NET_HOTDATA_H + +#include <linux/types.h> +#include <linux/netdevice.h> +#include <net/protocol.h> + +/* Read mostly data used in network fast paths. */ +struct net_hotdata { +#if IS_ENABLED(CONFIG_INET) + struct packet_offload ip_packet_offload; + struct net_offload tcpv4_offload; + struct net_protocol tcp_protocol; + struct net_offload udpv4_offload; + struct net_protocol udp_protocol; + struct packet_offload ipv6_packet_offload; + struct net_offload tcpv6_offload; +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_protocol tcpv6_protocol; + struct inet6_protocol udpv6_protocol; +#endif + struct net_offload udpv6_offload; +#endif + struct list_head offload_base; + struct list_head ptype_all; + struct kmem_cache *skbuff_cache; + struct kmem_cache *skbuff_fclone_cache; + struct kmem_cache *skb_small_head_cache; +#ifdef CONFIG_RPS + struct rps_sock_flow_table __rcu *rps_sock_flow_table; + u32 rps_cpu_mask; +#endif + int gro_normal_batch; + int netdev_budget; + int netdev_budget_usecs; + int tstamp_prequeue; + int max_backlog; + int dev_tx_weight; + int dev_rx_weight; +}; + +#define inet_ehash_secret net_hotdata.tcp_protocol.secret +#define udp_ehash_secret net_hotdata.udp_protocol.secret +#define inet6_ehash_secret net_hotdata.tcpv6_protocol.secret +#define tcp_ipv6_hash_secret net_hotdata.tcpv6_offload.secret +#define udp6_ehash_secret net_hotdata.udpv6_protocol.secret +#define udp_ipv6_hash_secret net_hotdata.udpv6_offload.secret + +extern struct net_hotdata net_hotdata; + 
+#endif /* _NET_HOTDATA_H */ diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index 3e454c4d7ba6..238ad3349456 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -22,10 +22,6 @@ #define IF_RS_SENT 0x10 #define IF_READY 0x80000000 -/* prefix flags */ -#define IF_PREFIX_ONLINK 0x01 -#define IF_PREFIX_AUTOCONF 0x02 - enum { INET6_IFADDR_STATE_PREDAD, INET6_IFADDR_STATE_DAD, @@ -148,7 +144,7 @@ struct ipv6_ac_socklist { struct ifacaddr6 { struct in6_addr aca_addr; struct fib6_info *aca_rt; - struct ifacaddr6 *aca_next; + struct ifacaddr6 __rcu *aca_next; struct hlist_node aca_addr_lst; int aca_users; refcount_t aca_refcnt; @@ -200,7 +196,7 @@ struct inet6_dev { spinlock_t mc_report_lock; /* mld query report lock */ struct mutex mc_lock; /* mld global lock */ - struct ifacaddr6 *ac_list; + struct ifacaddr6 __rcu *ac_list; rwlock_t lock; refcount_t refcnt; __u32 if_flags; diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index d0a2f827d5f2..9ab4bf704e86 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -357,4 +357,12 @@ static inline bool inet_csk_has_ulp(const struct sock *sk) return inet_test_bit(IS_ICSK, sk) && !!inet_csk(sk)->icsk_ulp_ops; } +static inline void inet_init_csk_locks(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + spin_lock_init(&icsk->icsk_accept_queue.rskq_lock); + spin_lock_init(&icsk->icsk_accept_queue.fastopenq.lock); +} + #endif /* _INET_CONNECTION_SOCK_H */ diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 3ecfeadbfa06..7f1b38458743 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -88,7 +88,7 @@ struct inet_bind_bucket { unsigned short fast_sk_family; bool fast_ipv6_only; struct hlist_node node; - struct hlist_head owners; + struct hlist_head bhash2; }; struct inet_bind2_bucket { @@ -96,22 +96,17 @@ struct inet_bind2_bucket { int l3mdev; unsigned short 
port; #if IS_ENABLED(CONFIG_IPV6) - unsigned short family; -#endif - union { -#if IS_ENABLED(CONFIG_IPV6) - struct in6_addr v6_rcv_saddr; + unsigned short addr_type; + struct in6_addr v6_rcv_saddr; +#define rcv_saddr v6_rcv_saddr.s6_addr32[3] +#else + __be32 rcv_saddr; #endif - __be32 rcv_saddr; - }; /* Node in the bhash2 inet_bind_hashbucket chain */ struct hlist_node node; + struct hlist_node bhash_node; /* List of sockets hashed to this bucket */ struct hlist_head owners; - /* bhash has twsk in owners, but bhash2 has twsk in - * deathrow not to add a member in struct sock_common. - */ - struct hlist_head deathrow; }; static inline struct net *ib_net(const struct inet_bind_bucket *ib) @@ -241,7 +236,7 @@ bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, struct inet_bind2_bucket * inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, - unsigned short port, int l3mdev, + struct inet_bind_bucket *tb, const struct sock *sk); void inet_bind2_bucket_destroy(struct kmem_cache *cachep, diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 74db6d97cae1..f9ddd47dc4f8 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -234,10 +234,7 @@ struct inet_sock { int uc_index; int mc_index; __be32 mc_addr; - struct { - __u16 lo; - __u16 hi; - } local_port_range; + u32 local_port_range; /* high << 16 | low */ struct ip_mc_socklist __rcu *mc_list; struct inet_cork_full cork; @@ -277,6 +274,7 @@ enum { INET_FLAGS_REPFLOW = 27, INET_FLAGS_RTALERT_ISOLATE = 28, INET_FLAGS_SNDFLOW = 29, + INET_FLAGS_RTALERT = 30, }; /* cmsg flags for inet */ @@ -310,11 +308,6 @@ static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet) #define inet_assign_bit(nr, sk, val) \ assign_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags, val) -static inline bool sk_is_inet(struct sock *sk) -{ - return sk->sk_family == AF_INET || sk->sk_family == AF_INET6; -} - /** * sk_to_full_sk - Access to a full socket 
* @sk: pointer to a socket diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index b14999ff55db..f28da08a37b4 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -75,13 +75,9 @@ struct inet_timewait_sock { struct timer_list tw_timer; struct inet_bind_bucket *tw_tb; struct inet_bind2_bucket *tw_tb2; - struct hlist_node tw_bind2_node; }; #define tw_tclass tw_tos -#define twsk_for_each_bound_bhash2(__tw, list) \ - hlist_for_each_entry(__tw, list, tw_bind2_node) - static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk) { return (struct inet_timewait_sock *)sk; diff --git a/include/net/ioam6.h b/include/net/ioam6.h index 781d2d8b2f29..2cbbee6e806a 100644 --- a/include/net/ioam6.h +++ b/include/net/ioam6.h @@ -12,6 +12,7 @@ #include <linux/net.h> #include <linux/ipv6.h> #include <linux/ioam6.h> +#include <linux/ioam6_genl.h> #include <linux/rhashtable-types.h> struct ioam6_namespace { @@ -65,4 +66,7 @@ void ioam6_exit(void); int ioam6_iptunnel_init(void); void ioam6_iptunnel_exit(void); +void ioam6_event(enum ioam6_event_type type, struct net *net, gfp_t gfp, + void *opt, unsigned int opt_len); + #endif /* _NET_IOAM6_H */ diff --git a/include/net/ip.h b/include/net/ip.h index 1fc4c8d69e33..25cb688bdc62 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -349,8 +349,14 @@ static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_o } \ } -void inet_get_local_port_range(const struct net *net, int *low, int *high); -void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high); +static inline void inet_get_local_port_range(const struct net *net, int *low, int *high) +{ + u32 range = READ_ONCE(net->ipv4.ip_local_ports.range); + + *low = range & 0xffff; + *high = range >> 16; +} +bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high); #ifdef CONFIG_SYSCTL static inline bool inet_is_local_reserved_port(struct net *net, unsigned 
short port) @@ -761,7 +767,7 @@ int ip_options_rcv_srr(struct sk_buff *skb, struct net_device *dev); * Functions provided by ip_sockglue.c */ -void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb); +void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb, bool drop_dst); void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, int tlen, int offset); int ip_cmsg_send(struct sock *sk, struct msghdr *msg, diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 1ba9f4ddf2f6..323c94f1845b 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -30,12 +30,6 @@ #define RT6_DEBUG 2 -#if RT6_DEBUG >= 3 -#define RT6_TRACE(x...) pr_debug(x) -#else -#define RT6_TRACE(x...) do { ; } while (0) -#endif - struct rt6_info; struct fib6_info; @@ -250,6 +244,25 @@ static inline bool fib6_requires_src(const struct fib6_info *rt) return rt->fib6_src.plen > 0; } +/* The callers should hold f6i->fib6_table->tb6_lock if a route has ever + * been added to a table before. + */ +static inline void fib6_clean_expires(struct fib6_info *f6i) +{ + f6i->fib6_flags &= ~RTF_EXPIRES; + f6i->expires = 0; +} + +/* The callers should hold f6i->fib6_table->tb6_lock if a route has ever + * been added to a table before. + */ +static inline void fib6_set_expires(struct fib6_info *f6i, + unsigned long expires) +{ + f6i->expires = expires; + f6i->fib6_flags |= RTF_EXPIRES; +} + static inline bool fib6_check_expired(const struct fib6_info *f6i) { if (f6i->fib6_flags & RTF_EXPIRES) @@ -257,11 +270,6 @@ static inline bool fib6_check_expired(const struct fib6_info *f6i) return false; } -static inline bool fib6_has_expires(const struct fib6_info *f6i) -{ - return f6i->fib6_flags & RTF_EXPIRES; -} - /* Function to safely get fn->fn_sernum for passed in rt * and store result in passed in cookie. 
* Return true if we can get cookie safely @@ -328,8 +336,10 @@ static inline bool fib6_info_hold_safe(struct fib6_info *f6i) static inline void fib6_info_release(struct fib6_info *f6i) { - if (f6i && refcount_dec_and_test(&f6i->fib6_ref)) + if (f6i && refcount_dec_and_test(&f6i->fib6_ref)) { + DEBUG_NET_WARN_ON_ONCE(!hlist_unhashed(&f6i->gc_link)); call_rcu(&f6i->rcu, fib6_info_destroy_rcu); + } } enum fib6_walk_state { @@ -500,46 +510,36 @@ void fib6_gc_cleanup(void); int fib6_init(void); -/* fib6_info must be locked by the caller, and fib6_info->fib6_table can be - * NULL. +/* Add the route to the gc list if it is not already there + * + * The callers should hold f6i->fib6_table->tb6_lock. */ -static inline void fib6_set_expires_locked(struct fib6_info *f6i, - unsigned long expires) +static inline void fib6_add_gc_list(struct fib6_info *f6i) { - struct fib6_table *tb6; + /* If fib6_node is null, the f6i is not in (or removed from) the + * table. + * + * There is a gap between finding the f6i from the table and + * calling this function without the protection of the tb6_lock. + * This check makes sure the f6i is not added to the gc list when + * it is not on the table. + */ + if (!rcu_dereference_protected(f6i->fib6_node, + lockdep_is_held(&f6i->fib6_table->tb6_lock))) + return; - tb6 = f6i->fib6_table; - f6i->expires = expires; - if (tb6 && !fib6_has_expires(f6i)) - hlist_add_head(&f6i->gc_link, &tb6->tb6_gc_hlist); - f6i->fib6_flags |= RTF_EXPIRES; + if (hlist_unhashed(&f6i->gc_link)) + hlist_add_head(&f6i->gc_link, &f6i->fib6_table->tb6_gc_hlist); } -/* fib6_info must be locked by the caller, and fib6_info->fib6_table can be - * NULL. If fib6_table is NULL, the fib6_info will no be inserted into the - * list of GC candidates until it is inserted into a table. +/* Remove the route from the gc list if it is on the list. + * + * The callers should hold f6i->fib6_table->tb6_lock. 
*/ -static inline void fib6_set_expires(struct fib6_info *f6i, - unsigned long expires) +static inline void fib6_remove_gc_list(struct fib6_info *f6i) { - spin_lock_bh(&f6i->fib6_table->tb6_lock); - fib6_set_expires_locked(f6i, expires); - spin_unlock_bh(&f6i->fib6_table->tb6_lock); -} - -static inline void fib6_clean_expires_locked(struct fib6_info *f6i) -{ - if (fib6_has_expires(f6i)) + if (!hlist_unhashed(&f6i->gc_link)) hlist_del_init(&f6i->gc_link); - f6i->fib6_flags &= ~RTF_EXPIRES; - f6i->expires = 0; -} - -static inline void fib6_clean_expires(struct fib6_info *f6i) -{ - spin_lock_bh(&f6i->fib6_table->tb6_lock); - fib6_clean_expires_locked(f6i); - spin_unlock_bh(&f6i->fib6_table->tb6_lock); } struct ipv6_route_iter { diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 28b065790261..a30c6aa9e5cf 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -170,7 +170,8 @@ struct fib6_info *rt6_get_dflt_router(struct net *net, struct fib6_info *rt6_add_dflt_router(struct net *net, const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref, - u32 defrtr_usr_metric); + u32 defrtr_usr_metric, + int lifetime); void rt6_purge_dflt_routers(struct net *net); @@ -331,7 +332,7 @@ static inline unsigned int ip6_dst_mtu_maybe_forward(const struct dst_entry *dst rcu_read_lock(); idev = __in6_dev_get(dst->dev); if (idev) - mtu = idev->cnf.mtu6; + mtu = READ_ONCE(idev->cnf.mtu6); rcu_read_unlock(); out: diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index d4667b7797e3..9b2f69ba5e49 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -264,6 +264,7 @@ struct fib_dump_filter { bool filter_set; bool dump_routes; bool dump_exceptions; + bool rtnl_held; unsigned char protocol; unsigned char rt_type; unsigned int flags; diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index f346b4efbc30..5cd64bb2104d 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -284,7 +284,8 @@ int 
ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname); void ip_tunnel_delete_nets(struct list_head *list_net, unsigned int id, - struct rtnl_link_ops *ops); + struct rtnl_link_ops *ops, + struct list_head *dev_to_kill); void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, const u8 protocol); @@ -416,6 +417,17 @@ static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph, return 0; } +static inline __be32 ip_tunnel_get_flowlabel(const struct iphdr *iph, + const struct sk_buff *skb) +{ + __be16 payload_protocol = skb_protocol(skb, true); + + if (payload_protocol == htons(ETH_P_IPV6)) + return ip6_flowlabel((const struct ipv6hdr *)iph); + else + return 0; +} + static inline u8 ip_tunnel_get_ttl(const struct iphdr *iph, const struct sk_buff *skb) { diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 78d38dd88aba..88a8e554f7a1 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -534,13 +534,15 @@ static inline int ipv6_hopopt_jumbo_remove(struct sk_buff *skb) return 0; } -static inline bool ipv6_accept_ra(struct inet6_dev *idev) +static inline bool ipv6_accept_ra(const struct inet6_dev *idev) { + s32 accept_ra = READ_ONCE(idev->cnf.accept_ra); + /* If forwarding is enabled, RA are not accepted unless the special * hybrid mode (accept_ra=2) is enabled. */ - return idev->cnf.forwarding ? idev->cnf.accept_ra == 2 : - idev->cnf.accept_ra; + return READ_ONCE(idev->cnf.forwarding) ? 
accept_ra == 2 : + accept_ra; } #define IPV6_FRAG_HIGH_THRESH (4 * 1024*1024) /* 4194304 */ @@ -784,11 +786,6 @@ static inline bool ipv6_addr_v4mapped(const struct in6_addr *a) cpu_to_be32(0x0000ffff))) == 0UL; } -static inline bool ipv6_addr_v4mapped_any(const struct in6_addr *a) -{ - return ipv6_addr_v4mapped(a) && ipv4_is_zeronet(a->s6_addr32[3]); -} - static inline bool ipv6_addr_v4mapped_loopback(const struct in6_addr *a) { return ipv6_addr_v4mapped(a) && ipv4_is_loopback(a->s6_addr32[3]); diff --git a/include/net/iucv/iucv.h b/include/net/iucv/iucv.h index f9e88401d7da..8b2055d64a6b 100644 --- a/include/net/iucv/iucv.h +++ b/include/net/iucv/iucv.h @@ -80,7 +80,7 @@ struct iucv_array { u32 length; } __attribute__ ((aligned (8))); -extern struct bus_type iucv_bus; +extern const struct bus_type iucv_bus; extern struct device *iucv_root; /* @@ -489,7 +489,7 @@ struct iucv_interface { int (*path_sever)(struct iucv_path *path, u8 userdata[16]); int (*iucv_register)(struct iucv_handler *handler, int smp); void (*iucv_unregister)(struct iucv_handler *handler, int smp); - struct bus_type *bus; + const struct bus_type *bus; struct device *root; }; diff --git a/include/net/llc_pdu.h b/include/net/llc_pdu.h index 7e73f8e5e497..1d55ba7c45be 100644 --- a/include/net/llc_pdu.h +++ b/include/net/llc_pdu.h @@ -262,8 +262,7 @@ static inline void llc_pdu_header_init(struct sk_buff *skb, u8 type, */ static inline void llc_pdu_decode_sa(struct sk_buff *skb, u8 *sa) { - if (skb->protocol == htons(ETH_P_802_2)) - memcpy(sa, eth_hdr(skb)->h_source, ETH_ALEN); + memcpy(sa, eth_hdr(skb)->h_source, ETH_ALEN); } /** @@ -275,8 +274,7 @@ static inline void llc_pdu_decode_sa(struct sk_buff *skb, u8 *sa) */ static inline void llc_pdu_decode_da(struct sk_buff *skb, u8 *da) { - if (skb->protocol == htons(ETH_P_802_2)) - memcpy(da, eth_hdr(skb)->h_dest, ETH_ALEN); + memcpy(da, eth_hdr(skb)->h_dest, ETH_ALEN); } /** diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 
580781ff9dcf..353488ab94a2 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -7,7 +7,7 @@ * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2023 Intel Corporation + * Copyright (C) 2018 - 2024 Intel Corporation */ #ifndef MAC80211_H @@ -214,6 +214,10 @@ struct ieee80211_low_level_stats { * @IEEE80211_CHANCTX_CHANGE_CHANNEL: switched to another operating channel, * this is used only with channel switching with CSA * @IEEE80211_CHANCTX_CHANGE_MIN_WIDTH: The min required channel width changed + * @IEEE80211_CHANCTX_CHANGE_AP: The AP channel definition changed, so (wider + * bandwidth) OFDMA settings need to be changed + * @IEEE80211_CHANCTX_CHANGE_PUNCTURING: The punctured channel(s) bitmap + * was changed. */ enum ieee80211_chanctx_change { IEEE80211_CHANCTX_CHANGE_WIDTH = BIT(0), @@ -221,6 +225,19 @@ enum ieee80211_chanctx_change { IEEE80211_CHANCTX_CHANGE_RADAR = BIT(2), IEEE80211_CHANCTX_CHANGE_CHANNEL = BIT(3), IEEE80211_CHANCTX_CHANGE_MIN_WIDTH = BIT(4), + IEEE80211_CHANCTX_CHANGE_AP = BIT(5), + IEEE80211_CHANCTX_CHANGE_PUNCTURING = BIT(6), +}; + +/** + * struct ieee80211_chan_req - A channel "request" + * @oper: channel definition to use for operation + * @ap: the channel definition of the AP, if any + * (otherwise the chan member is %NULL) + */ +struct ieee80211_chan_req { + struct cfg80211_chan_def oper; + struct cfg80211_chan_def ap; }; /** @@ -231,6 +248,8 @@ enum ieee80211_chanctx_change { * * @def: the channel definition * @min_def: the minimum channel definition currently required. 
+ * @ap: the channel definition the AP actually is operating as, + * for use with (wider bandwidth) OFDMA * @rx_chains_static: The number of RX chains that must always be * active on the channel to receive MIMO transmissions * @rx_chains_dynamic: The number of RX chains that must be enabled @@ -243,6 +262,7 @@ enum ieee80211_chanctx_change { struct ieee80211_chanctx_conf { struct cfg80211_chan_def def; struct cfg80211_chan_def min_def; + struct cfg80211_chan_def ap; u8 rx_chains_static, rx_chains_dynamic; @@ -340,8 +360,8 @@ struct ieee80211_vif_chanctx_switch { * @BSS_CHANGED_FILS_DISCOVERY: FILS discovery status changed. * @BSS_CHANGED_UNSOL_BCAST_PROBE_RESP: Unsolicited broadcast probe response * status changed. - * @BSS_CHANGED_EHT_PUNCTURING: The channel puncturing bitmap changed. * @BSS_CHANGED_MLD_VALID_LINKS: MLD valid links status changed. + * @BSS_CHANGED_MLD_TTLM: TID to link mapping was changed */ enum ieee80211_bss_change { BSS_CHANGED_ASSOC = 1<<0, @@ -376,8 +396,8 @@ enum ieee80211_bss_change { BSS_CHANGED_HE_BSS_COLOR = 1<<29, BSS_CHANGED_FILS_DISCOVERY = 1<<30, BSS_CHANGED_UNSOL_BCAST_PROBE_RESP = 1<<31, - BSS_CHANGED_EHT_PUNCTURING = BIT_ULL(32), BSS_CHANGED_MLD_VALID_LINKS = BIT_ULL(33), + BSS_CHANGED_MLD_TTLM = BIT_ULL(34), /* when adding here, make sure to change ieee80211_reconfig */ }; @@ -476,9 +496,9 @@ struct ieee80211_ba_event { /** * struct ieee80211_event - event to be sent to the driver * @type: The event itself. See &enum ieee80211_event_type. - * @rssi: relevant if &type is %RSSI_EVENT - * @mlme: relevant if &type is %AUTH_EVENT - * @ba: relevant if &type is %BAR_RX_EVENT or %BA_FRAME_TIMEOUT + * @u.rssi: relevant if &type is %RSSI_EVENT + * @u.mlme: relevant if &type is %AUTH_EVENT + * @u.ba: relevant if &type is %BAR_RX_EVENT or %BA_FRAME_TIMEOUT * @u:union holding the fields above */ struct ieee80211_event { @@ -537,12 +557,14 @@ struct ieee80211_fils_discovery { * to that BSS) that can change during the lifetime of the BSS. 
* * @vif: reference to owning VIF + * @bss: the cfg80211 bss descriptor. Valid only for a station, and only + * when associated. Note: This contains information which is not + * necessarily authenticated. For example, information coming from probe + * responses. * @addr: (link) address used locally * @link_id: link ID, or 0 for non-MLO * @htc_trig_based_pkt_ext: default PE in 4us units, if BSS supports HE * @uora_exists: is the UORA element advertised by AP - * @ack_enabled: indicates support to receive a multi-TID that solicits either - * ACK, BACK or both * @uora_ocw_range: UORA element's OCW Range field * @frame_time_rts_th: HE duration RTS threshold, in units of 32us * @he_support: does this BSS support HE @@ -583,7 +605,7 @@ struct ieee80211_fils_discovery { * @mcast_rate: per-band multicast rate index + 1 (0: disabled) * @bssid: The BSSID for this BSS * @enable_beacon: whether beaconing should be enabled or not - * @chandef: Channel definition for this BSS -- the hardware might be + * @chanreq: Channel request for this BSS -- the hardware might be * configured a higher bandwidth than this BSS uses, for example. * @mu_group: VHT MU-MIMO group membership data * @ht_operation_mode: HT operation mode like in &struct ieee80211_ht_operation. @@ -644,9 +666,7 @@ struct ieee80211_fils_discovery { * @tx_pwr_env_num: number of @tx_pwr_env. * @pwr_reduction: power constraint of BSS. * @eht_support: does this BSS support EHT - * @eht_puncturing: bitmap to indicate which channels are punctured in this BSS * @csa_active: marks whether a channel switch is going on. - * @csa_punct_bitmap: new puncturing bitmap for channel switch * @mu_mimo_owner: indicates interface owns MU-MIMO capability * @chanctx_conf: The channel context this interface is assigned to, or %NULL * when it is not assigned. 
This pointer is RCU-protected due to the TX @@ -684,6 +704,7 @@ struct ieee80211_fils_discovery { */ struct ieee80211_bss_conf { struct ieee80211_vif *vif; + struct cfg80211_bss *bss; const u8 *bssid; unsigned int link_id; @@ -716,7 +737,7 @@ struct ieee80211_bss_conf { u32 cqm_rssi_hyst; s32 cqm_rssi_low; s32 cqm_rssi_high; - struct cfg80211_chan_def chandef; + struct ieee80211_chan_req chanreq; struct ieee80211_mu_group_data mu_group; bool qos; bool hidden_ssid; @@ -749,10 +770,8 @@ struct ieee80211_bss_conf { u8 tx_pwr_env_num; u8 pwr_reduction; bool eht_support; - u16 eht_puncturing; bool csa_active; - u16 csa_punct_bitmap; bool mu_mimo_owner; struct ieee80211_chanctx_conf __rcu *chanctx_conf; @@ -1150,11 +1169,6 @@ ieee80211_rate_get_vht_nss(const struct ieee80211_tx_rate *rate) * @ack: union part for pure ACK data * @ack.cookie: cookie for the ACK * @driver_data: array of driver_data pointers - * @ampdu_ack_len: number of acked aggregated frames. - * relevant only if IEEE80211_TX_STAT_AMPDU was set. - * @ampdu_len: number of aggregated frames. - * relevant only if IEEE80211_TX_STAT_AMPDU was set. - * @ack_signal: signal strength of the ACK frame */ struct ieee80211_tx_info { /* common information */ @@ -1362,6 +1376,9 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * the frame. * @RX_FLAG_FAILED_PLCP_CRC: Set this flag if the PCLP check failed on * the frame. + * @RX_FLAG_MACTIME: The timestamp passed in the RX status (@mactime + * field) is valid if this field is non-zero, and the position + * where the timestamp was sampled depends on the value. * @RX_FLAG_MACTIME_START: The timestamp passed in the RX status (@mactime * field) is valid and contains the time the first symbol of the MPDU * was received. This is useful in monitor mode and for proper IBSS @@ -1371,6 +1388,11 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * (including FCS) was received. 
* @RX_FLAG_MACTIME_PLCP_START: The timestamp passed in the RX status (@mactime * field) is valid and contains the time the SYNC preamble was received. + * @RX_FLAG_MACTIME_IS_RTAP_TS64: The timestamp passed in the RX status @mactime + * is only for use in the radiotap timestamp header, not otherwise a valid + * @mactime value. Note this is a separate flag so that we continue to see + * %RX_FLAG_MACTIME as unset. Also note that in this case the timestamp is + * reported to be 64 bits wide, not just 32. * @RX_FLAG_NO_SIGNAL_VAL: The signal strength value is not present. * Valid only for data frames (mainly A-MPDU) * @RX_FLAG_AMPDU_DETAILS: A-MPDU details are known, in particular the reference @@ -1441,12 +1463,12 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) enum mac80211_rx_flags { RX_FLAG_MMIC_ERROR = BIT(0), RX_FLAG_DECRYPTED = BIT(1), - RX_FLAG_MACTIME_PLCP_START = BIT(2), + RX_FLAG_ONLY_MONITOR = BIT(2), RX_FLAG_MMIC_STRIPPED = BIT(3), RX_FLAG_IV_STRIPPED = BIT(4), RX_FLAG_FAILED_FCS_CRC = BIT(5), RX_FLAG_FAILED_PLCP_CRC = BIT(6), - RX_FLAG_MACTIME_START = BIT(7), + RX_FLAG_MACTIME_IS_RTAP_TS64 = BIT(7), RX_FLAG_NO_SIGNAL_VAL = BIT(8), RX_FLAG_AMPDU_DETAILS = BIT(9), RX_FLAG_PN_VALIDATED = BIT(10), @@ -1455,8 +1477,10 @@ enum mac80211_rx_flags { RX_FLAG_AMPDU_IS_LAST = BIT(13), RX_FLAG_AMPDU_DELIM_CRC_ERROR = BIT(14), RX_FLAG_AMPDU_DELIM_CRC_KNOWN = BIT(15), - RX_FLAG_MACTIME_END = BIT(16), - RX_FLAG_ONLY_MONITOR = BIT(17), + RX_FLAG_MACTIME = BIT(16) | BIT(17), + RX_FLAG_MACTIME_PLCP_START = 1 << 16, + RX_FLAG_MACTIME_START = 2 << 16, + RX_FLAG_MACTIME_END = 3 << 16, RX_FLAG_SKIP_MONITOR = BIT(18), RX_FLAG_AMSDU_MORE = BIT(19), RX_FLAG_RADIOTAP_TLV_AT_END = BIT(20), @@ -1739,8 +1763,9 @@ struct ieee80211_conf { * @chandef: the new channel to switch to * @count: the number of TBTT's until the channel switch event * @delay: maximum delay between the time the AP transmitted the last beacon in - * current channel and the expected time of the 
first beacon in the new - * channel, expressed in TU. + * current channel and the expected time of the first beacon in the new + * channel, expressed in TU. + * @link_id: the link ID of the link doing the channel switch, 0 for non-MLO */ struct ieee80211_channel_switch { u64 timestamp; @@ -1748,6 +1773,7 @@ struct ieee80211_channel_switch { bool block_tx; struct cfg80211_chan_def chandef; u8 count; + u8 link_id; u32 delay; }; @@ -1769,6 +1795,10 @@ struct ieee80211_channel_switch { * this is not pure P2P vif. * @IEEE80211_VIF_EML_ACTIVE: The driver indicates that EML operation is * enabled for the interface. + * @IEEE80211_VIF_IGNORE_OFDMA_WIDER_BW: Ignore wider bandwidth OFDMA + * operation on this interface and request a channel context without + * the AP definition. Use this e.g. because the device is able to + * handle OFDMA (downlink and trigger for uplink) on a per-AP basis. */ enum ieee80211_vif_flags { IEEE80211_VIF_BEACON_FILTER = BIT(0), @@ -1776,6 +1806,7 @@ enum ieee80211_vif_flags { IEEE80211_VIF_SUPPORTS_UAPSD = BIT(2), IEEE80211_VIF_GET_NOA_UPDATE = BIT(3), IEEE80211_VIF_EML_ACTIVE = BIT(4), + IEEE80211_VIF_IGNORE_OFDMA_WIDER_BW = BIT(5), }; @@ -1805,9 +1836,11 @@ enum ieee80211_offload_flags { * @ps: power-save mode (STA only). This flag is NOT affected by * offchannel/dynamic_ps operations. * @aid: association ID number, valid only when @assoc is true - * @eml_cap: EML capabilities as described in P802.11be_D2.2 Figure 9-1002k. + * @eml_cap: EML capabilities as described in P802.11be_D4.1 Figure 9-1001j. * @eml_med_sync_delay: Medium Synchronization delay as described in - * P802.11be_D2.2 Figure 9-1002j. + * P802.11be_D4.1 Figure 9-1001i. + * @mld_capa_op: MLD Capabilities and Operations per P802.11be_D4.1 + * Figure 9-1001k * @arp_addr_list: List of IPv4 addresses for hardware ARP filtering. The * may filter ARP queries targeted for other addresses than listed here. 
* The driver must allow ARP queries targeted for all address listed here @@ -1832,6 +1865,7 @@ struct ieee80211_vif_cfg { u16 aid; u16 eml_cap; u16 eml_med_sync_delay; + u16 mld_capa_op; __be32 arp_addr_list[IEEE80211_BSS_ARP_ADDR_LIST_LEN]; int arp_addr_cnt; @@ -1842,6 +1876,35 @@ struct ieee80211_vif_cfg { u8 ap_addr[ETH_ALEN] __aligned(2); }; +#define IEEE80211_TTLM_NUM_TIDS 8 + +/** + * struct ieee80211_neg_ttlm - negotiated TID to link map info + * + * @downlink: bitmap of active links per TID for downlink, or 0 if mapping for + * this TID is not included. + * @uplink: bitmap of active links per TID for uplink, or 0 if mapping for this + * TID is not included. + * @valid: info is valid or not. + */ +struct ieee80211_neg_ttlm { + u16 downlink[IEEE80211_TTLM_NUM_TIDS]; + u16 uplink[IEEE80211_TTLM_NUM_TIDS]; + bool valid; +}; + +/** + * enum ieee80211_neg_ttlm_res - return value for negotiated TTLM handling + * @NEG_TTLM_RES_ACCEPT: accept the request + * @NEG_TTLM_RES_REJECT: reject the request + * @NEG_TTLM_RES_SUGGEST_PREFERRED: reject and suggest a new mapping + */ +enum ieee80211_neg_ttlm_res { + NEG_TTLM_RES_ACCEPT, + NEG_TTLM_RES_REJECT, + NEG_TTLM_RES_SUGGEST_PREFERRED +}; + /** * struct ieee80211_vif - per-interface data * @@ -1860,6 +1923,11 @@ struct ieee80211_vif_cfg { * API calls meant for that purpose. * @dormant_links: bitmap of valid but disabled links, or 0 for non-MLO. * Must be a subset of valid_links. + * @suspended_links: subset of dormant_links representing links that are + * suspended. + * 0 for non-MLO. + * @neg_ttlm: negotiated TID to link mapping info. + * see &struct ieee80211_neg_ttlm. * @addr: address of this interface * @p2p: indicates whether this AP or STA interface is a p2p * interface, i.e. 
a GO or p2p-sta respectively @@ -1897,7 +1965,8 @@ struct ieee80211_vif { struct ieee80211_vif_cfg cfg; struct ieee80211_bss_conf bss_conf; struct ieee80211_bss_conf __rcu *link_conf[IEEE80211_MLD_MAX_NUM_LINKS]; - u16 valid_links, active_links, dormant_links; + u16 valid_links, active_links, dormant_links, suspended_links; + struct ieee80211_neg_ttlm neg_ttlm; u8 addr[ETH_ALEN] __aligned(2); bool p2p; @@ -1944,6 +2013,21 @@ static inline bool ieee80211_vif_is_mld(const struct ieee80211_vif *vif) return vif->valid_links != 0; } +/** + * ieee80211_vif_link_active - check if a given link is active + * @vif: the vif + * @link_id: the link ID to check + * Return: %true if the vif is an MLD and the link is active, or if + * the vif is not an MLD and the link ID is 0; %false otherwise. + */ +static inline bool ieee80211_vif_link_active(const struct ieee80211_vif *vif, + unsigned int link_id) +{ + if (!ieee80211_vif_is_mld(vif)) + return link_id == 0; + return vif->active_links & BIT(link_id); +} + #define for_each_vif_active_link(vif, link, link_id) \ for (link_id = 0; link_id < ARRAY_SIZE((vif)->link_conf); link_id++) \ if ((!(vif)->active_links || \ @@ -2038,6 +2122,8 @@ static inline bool lockdep_vif_wiphy_mutex_held(struct ieee80211_vif *vif) * @IEEE80211_KEY_FLAG_GENERATE_MMIE: This flag should be set by the driver * for a AES_CMAC key to indicate that it requires sequence number * generation only + * @IEEE80211_KEY_FLAG_SPP_AMSDU: SPP A-MSDUs can be used with this key + * (set by mac80211 from the sta->spp_amsdu flag) */ enum ieee80211_key_flags { IEEE80211_KEY_FLAG_GENERATE_IV_MGMT = BIT(0), @@ -2051,6 +2137,7 @@ enum ieee80211_key_flags { IEEE80211_KEY_FLAG_PUT_MIC_SPACE = BIT(8), IEEE80211_KEY_FLAG_NO_AUTO_TX = BIT(9), IEEE80211_KEY_FLAG_GENERATE_MMIE = BIT(10), + IEEE80211_KEY_FLAG_SPP_AMSDU = BIT(11), }; /** @@ -2349,6 +2436,7 @@ struct ieee80211_link_sta { * would be assigned to link[link_id] where link_id is the id assigned * by the AP. 
* @valid_links: bitmap of valid links, or 0 for non-MLO + * @spp_amsdu: indicates whether the STA uses SPP A-MSDU or not. */ struct ieee80211_sta { u8 addr[ETH_ALEN]; @@ -2362,6 +2450,7 @@ struct ieee80211_sta { bool tdls_initiator; bool mfp; bool mlo; + bool spp_amsdu; u8 max_amsdu_subframes; struct ieee80211_sta_aggregates *cur; @@ -2686,6 +2775,14 @@ struct ieee80211_txq { * @IEEE80211_HW_MLO_MCAST_MULTI_LINK_TX: Hardware/driver handles transmitting * multicast frames on all links, mac80211 should not do that. * + * @IEEE80211_HW_DISALLOW_PUNCTURING: HW requires disabling puncturing in EHT + * and connecting with a lower bandwidth instead + * + * @IEEE80211_HW_HANDLES_QUIET_CSA: HW/driver handles quieting for CSA, so + * no need to stop queues. This really should be set by a driver that + * implements MLO, so operation can continue on other links when one + * link is switching. + * * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays */ enum ieee80211_hw_flags { @@ -2743,6 +2840,8 @@ enum ieee80211_hw_flags { IEEE80211_HW_SUPPORTS_CONC_MON_RX_DECAP, IEEE80211_HW_DETECTS_COLOR_COLLISION, IEEE80211_HW_MLO_MCAST_MULTI_LINK_TX, + IEEE80211_HW_DISALLOW_PUNCTURING, + IEEE80211_HW_HANDLES_QUIET_CSA, /* keep last, obviously */ NUM_IEEE80211_HW_FLAGS @@ -2831,8 +2930,6 @@ enum ieee80211_hw_flags { * the default is _GI | _BANDWIDTH. * Use the %IEEE80211_RADIOTAP_VHT_KNOWN_\* values. * - * @radiotap_he: HE radiotap validity flags - * * @radiotap_timestamp: Information for the radiotap timestamp field; if the * @units_pos member is set to a non-negative value then the timestamp * field will be added and populated from the &struct ieee80211_rx_status @@ -4177,7 +4274,7 @@ struct ieee80211_prep_tx_info { * after a channel switch procedure is completed, allowing the * driver to go back to a normal configuration. 
* @abort_channel_switch: This is an optional callback that is called - * when channel switch procedure was completed, allowing the + * when channel switch procedure was aborted, allowing the * driver to go back to a normal configuration. * @channel_switch_rx_beacon: This is an optional callback that is called * when channel switch procedure is in progress and additional beacon with @@ -4267,6 +4364,8 @@ struct ieee80211_prep_tx_info { * disable background CAC/radar detection. * @net_fill_forward_path: Called from .ndo_fill_forward_path in order to * resolve a path for hardware flow offloading + * @can_activate_links: Checks if a specific active_links bitmap is + * supported by the driver. * @change_vif_links: Change the valid links on an interface, note that while * removing the old link information is still valid (link_conf pointer), * but may immediately disappear after the function returns. The old or @@ -4286,6 +4385,10 @@ struct ieee80211_prep_tx_info { * flow offloading for flows originating from the vif. * Note that the driver must not assume that the vif driver_data is valid * at this point, since the callback can be called during netdev teardown. + * @can_neg_ttlm: for managed interface, requests the driver to determine + * if the requested TID-To-Link mapping can be accepted or not. + * If it's not accepted the driver may suggest a preferred mapping and + * modify @ttlm parameter with the suggested TID-to-Link mapping. 
*/ struct ieee80211_ops { void (*tx)(struct ieee80211_hw *hw, @@ -4567,7 +4670,8 @@ struct ieee80211_ops { struct ieee80211_vif *vif, struct ieee80211_bss_conf *link_conf); void (*abort_channel_switch)(struct ieee80211_hw *hw, - struct ieee80211_vif *vif); + struct ieee80211_vif *vif, + struct ieee80211_bss_conf *link_conf); void (*channel_switch_rx_beacon)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_channel_switch *ch_switch); @@ -4647,6 +4751,9 @@ struct ieee80211_ops { struct ieee80211_sta *sta, struct net_device_path_ctx *ctx, struct net_device_path *path); + bool (*can_activate_links)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + u16 active_links); int (*change_vif_links)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u16 old_links, u16 new_links, @@ -4663,6 +4770,9 @@ struct ieee80211_ops { struct net_device *dev, enum tc_setup_type type, void *type_data); + enum ieee80211_neg_ttlm_res + (*can_neg_ttlm)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, + struct ieee80211_neg_ttlm *ttlm); }; /** @@ -5445,6 +5555,7 @@ static inline struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw, /** * ieee80211_beacon_update_cntdwn - request mac80211 to decrement the beacon countdown * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * @link_id: valid link_id during MLO or 0 for non-MLO * * The beacon counter should be updated after each beacon transmission. 
* This function is called implicitly when @@ -5454,7 +5565,8 @@ static inline struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw, * * Return: new countdown value */ -u8 ieee80211_beacon_update_cntdwn(struct ieee80211_vif *vif); +u8 ieee80211_beacon_update_cntdwn(struct ieee80211_vif *vif, + unsigned int link_id); /** * ieee80211_beacon_set_cntdwn - request mac80211 to set beacon countdown @@ -5472,20 +5584,23 @@ void ieee80211_beacon_set_cntdwn(struct ieee80211_vif *vif, u8 counter); /** * ieee80211_csa_finish - notify mac80211 about channel switch * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * @link_id: valid link_id during MLO or 0 for non-MLO * * After a channel switch announcement was scheduled and the counter in this * announcement hits 1, this function must be called by the driver to * notify mac80211 that the channel can be changed. */ -void ieee80211_csa_finish(struct ieee80211_vif *vif); +void ieee80211_csa_finish(struct ieee80211_vif *vif, unsigned int link_id); /** * ieee80211_beacon_cntdwn_is_complete - find out if countdown reached 1 * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * @link_id: valid link_id during MLO or 0 for non-MLO * * This function returns whether the countdown reached zero. */ -bool ieee80211_beacon_cntdwn_is_complete(struct ieee80211_vif *vif); +bool ieee80211_beacon_cntdwn_is_complete(struct ieee80211_vif *vif, + unsigned int link_id); /** * ieee80211_color_change_finish - notify mac80211 about color change @@ -5809,12 +5924,11 @@ void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf, * ieee80211_remove_key - remove the given key * @keyconf: the parameter passed with the set key * + * Context: Must be called with the wiphy mutex held. + * * Remove the given key. If the key was uploaded to the hardware at the * time this function is called, it is not deleted in the hardware but * instead assumed to have been removed already. 
- * - * Note that due to locking considerations this function can (currently) - * only be called during key iteration (ieee80211_iter_keys().) */ void ieee80211_remove_key(struct ieee80211_key_conf *keyconf); @@ -5822,6 +5936,7 @@ void ieee80211_remove_key(struct ieee80211_key_conf *keyconf); * ieee80211_gtk_rekey_add - add a GTK key from rekeying during WoWLAN * @vif: the virtual interface to add the key on * @keyconf: new key data + * @link_id: the link id of the key or -1 for non-MLO * * When GTK rekeying was done while the system was suspended, (a) new * key(s) will be available. These will be needed by mac80211 for proper @@ -5849,7 +5964,8 @@ void ieee80211_remove_key(struct ieee80211_key_conf *keyconf); */ struct ieee80211_key_conf * ieee80211_gtk_rekey_add(struct ieee80211_vif *vif, - struct ieee80211_key_conf *keyconf); + struct ieee80211_key_conf *keyconf, + int link_id); /** * ieee80211_gtk_rekey_notify - notify userspace supplicant of rekeying @@ -6368,12 +6484,12 @@ ieee80211_txq_airtime_check(struct ieee80211_hw *hw, struct ieee80211_txq *txq); * @iter: iterator function that will be called for each key * @iter_data: custom data to pass to the iterator function * + * Context: Must be called with wiphy mutex held; can sleep. + * * This function can be used to iterate all the keys known to * mac80211, even those that weren't previously programmed into * the device. This is intended for use in WoWLAN if the device - * needs reprogramming of the keys during suspend. Note that due - * to locking reasons, it is also only safe to call this at few - * spots since it must hold the RTNL and be able to sleep. + * needs reprogramming of the keys during suspend. * * The order in which the keys are iterated matches the order * in which they were originally installed and handed to the @@ -7407,11 +7523,10 @@ ieee80211_get_unsol_bcast_probe_resp_tmpl(struct ieee80211_hw *hw, * @vif: &struct ieee80211_vif pointer from the add_interface callback. 
* @color_bitmap: a 64 bit bitmap representing the colors that the local BSS is * aware of. - * @gfp: allocation flags */ void ieee80211_obss_color_collision_notify(struct ieee80211_vif *vif, - u64 color_bitmap, gfp_t gfp); + u64 color_bitmap); /** * ieee80211_is_tx_data - check if frame is a data frame @@ -7435,6 +7550,9 @@ static inline bool ieee80211_is_tx_data(struct sk_buff *skb) * @vif: interface to set active links on * @active_links: the new active links bitmap * + * Context: Must be called with wiphy mutex held; may sleep; calls + * back into the driver. + * * This changes the active links on an interface. The interface * must be in client mode (in AP mode, all links are always active), * and @active_links must be a subset of the vif's valid_links. @@ -7442,6 +7560,7 @@ static inline bool ieee80211_is_tx_data(struct sk_buff *skb) * If a link is switched off and another is switched on at the same * time (e.g. active_links going from 0x1 to 0x10) then you will get * a sequence of calls like + * * - change_vif_links(0x11) * - unassign_vif_chanctx(link_id=0) * - change_sta_links(0x11) for each affected STA (the AP) @@ -7451,10 +7570,6 @@ static inline bool ieee80211_is_tx_data(struct sk_buff *skb) * - change_sta_links(0x10) for each affected STA (the AP) * - assign_vif_chanctx(link_id=4) * - change_vif_links(0x10) - * - * Note: This function acquires some mac80211 locks and must not - * be called with any driver locks held that could cause a - * lock dependency inversion. Best call it without locks. */ int ieee80211_set_active_links(struct ieee80211_vif *vif, u16 active_links); @@ -7471,4 +7586,17 @@ int ieee80211_set_active_links(struct ieee80211_vif *vif, u16 active_links); void ieee80211_set_active_links_async(struct ieee80211_vif *vif, u16 active_links); +/* for older drivers - let's not document these ... 
*/ +int ieee80211_emulate_add_chanctx(struct ieee80211_hw *hw, + struct ieee80211_chanctx_conf *ctx); +void ieee80211_emulate_remove_chanctx(struct ieee80211_hw *hw, + struct ieee80211_chanctx_conf *ctx); +void ieee80211_emulate_change_chanctx(struct ieee80211_hw *hw, + struct ieee80211_chanctx_conf *ctx, + u32 changed); +int ieee80211_emulate_switch_vif_chanctx(struct ieee80211_hw *hw, + struct ieee80211_vif_chanctx_switch *vifs, + int n_vifs, + enum ieee80211_chanctx_switch_mode mode); + #endif /* MAC80211_H */ diff --git a/include/net/macsec.h b/include/net/macsec.h index ebf9bc54036a..dbd22180cc5c 100644 --- a/include/net/macsec.h +++ b/include/net/macsec.h @@ -247,6 +247,23 @@ struct macsec_secy { /** * struct macsec_context - MACsec context for hardware offloading + * @netdev: a valid pointer to a struct net_device if @offload == + * MACSEC_OFFLOAD_MAC + * @phydev: a valid pointer to a struct phy_device if @offload == + * MACSEC_OFFLOAD_PHY + * @offload: MACsec offload status + * @secy: pointer to a MACsec SecY + * @rx_sc: pointer to a RX SC + * @update_pn: when updating the SA, update the next PN + * @assoc_num: association number of the target SA + * @key: key of the target SA + * @rx_sa: pointer to an RX SA if a RX SA is added/updated/removed + * @tx_sa: pointer to an TX SA if a TX SA is added/updated/removed + * @tx_sc_stats: pointer to TX SC stats structure + * @tx_sa_stats: pointer to TX SA stats structure + * @rx_sc_stats: pointer to RX SC stats structure + * @rx_sa_stats: pointer to RX SA stats structure + * @dev_stats: pointer to dev stats structure */ struct macsec_context { union { @@ -277,6 +294,33 @@ struct macsec_context { /** * struct macsec_ops - MACsec offloading operations + * @mdo_dev_open: called when the MACsec interface transitions to the up state + * @mdo_dev_stop: called when the MACsec interface transitions to the down + * state + * @mdo_add_secy: called when a new SecY is added + * @mdo_upd_secy: called when the SecY flags are 
changed or the MAC address of + * the MACsec interface is changed + * @mdo_del_secy: called when the hw offload is disabled or the MACsec + * interface is removed + * @mdo_add_rxsc: called when a new RX SC is added + * @mdo_upd_rxsc: called when a certain RX SC is updated + * @mdo_del_rxsc: called when a certain RX SC is removed + * @mdo_add_rxsa: called when a new RX SA is added + * @mdo_upd_rxsa: called when a certain RX SA is updated + * @mdo_del_rxsa: called when a certain RX SA is removed + * @mdo_add_txsa: called when a new TX SA is added + * @mdo_upd_txsa: called when a certain TX SA is updated + * @mdo_del_txsa: called when a certain TX SA is removed + * @mdo_get_dev_stats: called when dev stats are read + * @mdo_get_tx_sc_stats: called when TX SC stats are read + * @mdo_get_tx_sa_stats: called when TX SA stats are read + * @mdo_get_rx_sc_stats: called when RX SC stats are read + * @mdo_get_rx_sa_stats: called when RX SA stats are read + * @mdo_insert_tx_tag: called to insert the TX tag + * @needed_headroom: number of bytes reserved at the beginning of the sk_buff + * for the TX tag + * @needed_tailroom: number of bytes reserved at the end of the sk_buff for the + * TX tag */ struct macsec_ops { /* Device wide */ @@ -303,6 +347,11 @@ struct macsec_ops { int (*mdo_get_tx_sa_stats)(struct macsec_context *ctx); int (*mdo_get_rx_sc_stats)(struct macsec_context *ctx); int (*mdo_get_rx_sa_stats)(struct macsec_context *ctx); + /* Offload tag */ + int (*mdo_insert_tx_tag)(struct phy_device *phydev, + struct sk_buff *skb); + unsigned int needed_headroom; + unsigned int needed_tailroom; }; void macsec_pn_wrapped(struct macsec_secy *secy, struct macsec_tx_sa *tx_sa); @@ -325,4 +374,9 @@ static inline void *macsec_netdev_priv(const struct net_device *dev) return netdev_priv(dev); } +static inline u64 sci_to_cpu(sci_t sci) +{ + return be64_to_cpu((__force __be64)sci); +} + #endif /* _NET_MACSEC_H_ */ diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 
88b6ef7ce1a6..27684135bb4d 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -66,6 +66,7 @@ enum { GDMA_DEVICE_NONE = 0, GDMA_DEVICE_HWC = 1, GDMA_DEVICE_MANA = 2, + GDMA_DEVICE_MANA_IB = 3, }; struct gdma_resource { @@ -149,6 +150,7 @@ struct gdma_general_req { #define GDMA_MESSAGE_V1 1 #define GDMA_MESSAGE_V2 2 +#define GDMA_MESSAGE_V3 3 struct gdma_general_resp { struct gdma_resp_hdr hdr; @@ -293,6 +295,7 @@ struct gdma_queue { u32 head; u32 tail; + struct list_head entry; /* Extra fields specific to EQ/CQ. */ union { @@ -328,6 +331,7 @@ struct gdma_queue_spec { void *context; unsigned long log2_throttle_limit; + unsigned int msix_index; } eq; struct { @@ -344,7 +348,9 @@ struct gdma_queue_spec { struct gdma_irq_context { void (*handler)(void *arg); - void *arg; + /* Protect the eq_list */ + spinlock_t lock; + struct list_head eq_list; char name[MANA_IRQ_NAME_SZ]; }; @@ -355,7 +361,6 @@ struct gdma_context { unsigned int max_num_queues; unsigned int max_num_msix; unsigned int num_msix_usable; - struct gdma_resource msix_resource; struct gdma_irq_context *irq_contexts; /* L2 MTU */ @@ -387,6 +392,9 @@ struct gdma_context { /* Azure network adapter */ struct gdma_dev mana; + + /* Azure RDMA adapter */ + struct gdma_dev mana_ib; }; #define MAX_NUM_GDMA_DEVICES 4 diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 6e3e9c1363db..76147feb0d10 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -353,6 +353,25 @@ struct mana_tx_qp { struct mana_ethtool_stats { u64 stop_queue; u64 wake_queue; + u64 hc_rx_discards_no_wqe; + u64 hc_rx_err_vport_disabled; + u64 hc_rx_bytes; + u64 hc_rx_ucast_pkts; + u64 hc_rx_ucast_bytes; + u64 hc_rx_bcast_pkts; + u64 hc_rx_bcast_bytes; + u64 hc_rx_mcast_pkts; + u64 hc_rx_mcast_bytes; + u64 hc_tx_err_gf_disabled; + u64 hc_tx_err_vport_disabled; + u64 hc_tx_err_inval_vportoffset_pkt; + u64 hc_tx_err_vlan_enforcement; + u64 hc_tx_err_eth_type_enforcement; + u64 hc_tx_err_sa_enforcement; + 
u64 hc_tx_err_sqpdid_enforcement; + u64 hc_tx_err_cqpdid_enforcement; + u64 hc_tx_err_mtu_violation; + u64 hc_tx_err_inval_oob; u64 hc_tx_bytes; u64 hc_tx_ucast_pkts; u64 hc_tx_ucast_bytes; @@ -360,6 +379,7 @@ struct mana_ethtool_stats { u64 hc_tx_bcast_bytes; u64 hc_tx_mcast_pkts; u64 hc_tx_mcast_bytes; + u64 hc_tx_err_gdma; u64 tx_cqe_err; u64 tx_cqe_unknown_type; u64 rx_coalesced_err; @@ -602,8 +622,8 @@ struct mana_query_gf_stat_resp { struct gdma_resp_hdr hdr; u64 reported_stats; /* rx errors/discards */ - u64 discard_rx_nowqe; - u64 err_rx_vport_disabled; + u64 rx_discards_nowqe; + u64 rx_err_vport_disabled; /* rx bytes/packets */ u64 hc_rx_bytes; u64 hc_rx_ucast_pkts; @@ -613,16 +633,16 @@ struct mana_query_gf_stat_resp { u64 hc_rx_mcast_pkts; u64 hc_rx_mcast_bytes; /* tx errors */ - u64 err_tx_gf_disabled; - u64 err_tx_vport_disabled; - u64 err_tx_inval_vport_offset_pkt; - u64 err_tx_vlan_enforcement; - u64 err_tx_ethtype_enforcement; - u64 err_tx_SA_enforecement; - u64 err_tx_SQPDID_enforcement; - u64 err_tx_CQPDID_enforcement; - u64 err_tx_mtu_violation; - u64 err_tx_inval_oob; + u64 tx_err_gf_disabled; + u64 tx_err_vport_disabled; + u64 tx_err_inval_vport_offset_pkt; + u64 tx_err_vlan_enforcement; + u64 tx_err_ethtype_enforcement; + u64 tx_err_SA_enforcement; + u64 tx_err_SQPDID_enforcement; + u64 tx_err_CQPDID_enforcement; + u64 tx_err_mtu_violation; + u64 tx_err_inval_oob; /* tx bytes/packets */ u64 hc_tx_bytes; u64 hc_tx_ucast_pkts; @@ -632,7 +652,7 @@ struct mana_query_gf_stat_resp { u64 hc_tx_mcast_pkts; u64 hc_tx_mcast_bytes; /* tx error */ - u64 err_tx_gdma; + u64 tx_err_gdma; }; /* HW DATA */ /* Configure vPort Rx Steering */ diff --git a/include/net/mctp.h b/include/net/mctp.h index da86e106c91d..7b17c52e8ce2 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -87,7 +87,7 @@ struct mctp_sock { }; /* Key for matching incoming packets to sockets or reassembly contexts. - * Packets are matched on (src,dest,tag). 
+ * Packets are matched on (peer EID, local EID, tag). * * Lifetime / locking requirements: * @@ -133,6 +133,7 @@ struct mctp_sock { * - through an expiry timeout, on a per-socket timer */ struct mctp_sk_key { + unsigned int net; mctp_eid_t peer_addr; mctp_eid_t local_addr; /* MCTP_ADDR_ANY for local owned tags */ __u8 tag; /* incoming tag match; invert TO for local */ @@ -249,12 +250,14 @@ struct mctp_route { struct mctp_route *mctp_route_lookup(struct net *net, unsigned int dnet, mctp_eid_t daddr); +/* always takes ownership of skb */ int mctp_local_output(struct sock *sk, struct mctp_route *rt, struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag); void mctp_key_unref(struct mctp_sk_key *key); struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk, - mctp_eid_t daddr, mctp_eid_t saddr, + unsigned int netid, + mctp_eid_t local, mctp_eid_t peer, bool manual, u8 *tagp); /* routing <--> device interface */ diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 07022bb0d44d..0d28172193fa 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -162,7 +162,7 @@ struct neighbour { struct rcu_head rcu; struct net_device *dev; netdevice_tracker dev_tracker; - u8 primary_key[0]; + u8 primary_key[]; } __randomize_layout; struct neigh_ops { diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 13b3a4e29fdb..20c34bd7a077 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -67,8 +67,6 @@ struct net { */ spinlock_t rules_mod_lock; - atomic_t dev_unreg_count; - unsigned int dev_base_seq; /* protected by rtnl_mutex */ u32 ifindex; @@ -450,6 +448,9 @@ struct pernet_operations { void (*pre_exit)(struct net *net); void (*exit)(struct net *net); void (*exit_batch)(struct list_head *net_exit_list); + /* Following method is called with RTNL held. 
*/ + void (*exit_batch_rtnl)(struct list_head *net_exit_list, + struct list_head *dev_kill_list); unsigned int *id; size_t size; }; diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h index d68b0a483431..1ec408585373 100644 --- a/include/net/netdev_queues.h +++ b/include/net/netdev_queues.h @@ -4,6 +4,62 @@ #include <linux/netdevice.h> +/* See the netdev.yaml spec for definition of each statistic */ +struct netdev_queue_stats_rx { + u64 bytes; + u64 packets; + u64 alloc_fail; +}; + +struct netdev_queue_stats_tx { + u64 bytes; + u64 packets; +}; + +/** + * struct netdev_stat_ops - netdev ops for fine grained stats + * @get_queue_stats_rx: get stats for a given Rx queue + * @get_queue_stats_tx: get stats for a given Tx queue + * @get_base_stats: get base stats (not belonging to any live instance) + * + * Query stats for a given object. The values of the statistics are undefined + * on entry (specifically they are *not* zero-initialized). Drivers should + * assign values only to the statistics they collect. Statistics which are not + * collected must be left undefined. + * + * Queue objects are not necessarily persistent, and only currently active + * queues are queried by the per-queue callbacks. This means that per-queue + * statistics will not generally add up to the total number of events for + * the device. The @get_base_stats callback allows filling in the delta + * between events for currently live queues and overall device history. + * When the statistics for the entire device are queried, first @get_base_stats + * is issued to collect the delta, and then a series of per-queue callbacks. + * Only statistics which are set in @get_base_stats will be reported + * at the device level, meaning that unlike in queue callbacks, setting + * a statistic to zero in @get_base_stats is a legitimate thing to do. + * This is because @get_base_stats has a second function of designating which + * statistics are in fact correct for the entire device (e.g. 
when history + * for some of the events is not maintained, and reliable "total" cannot + * be provided). + * + * Device drivers can assume that when collecting total device stats, + * the @get_base_stats and subsequent per-queue calls are performed + * "atomically" (without releasing the rtnl_lock). + * + * Device drivers are encouraged to reset the per-queue statistics when + * number of queues change. This is because the primary use case for + * per-queue statistics is currently to detect traffic imbalance. + */ +struct netdev_stat_ops { + void (*get_queue_stats_rx)(struct net_device *dev, int idx, + struct netdev_queue_stats_rx *stats); + void (*get_queue_stats_tx)(struct net_device *dev, int idx, + struct netdev_queue_stats_tx *stats); + void (*get_base_stats)(struct net_device *dev, + struct netdev_queue_stats_rx *rx, + struct netdev_queue_stats_tx *tx); +}; + /** * DOC: Lockless queue stopping / waking helpers. * @@ -128,7 +184,7 @@ netdev_txq_completed_mb(struct netdev_queue *dev_queue, netdev_txq_completed_mb(txq, pkts, bytes); \ \ _res = -1; \ - if (pkts && likely(get_desc > start_thrs)) { \ + if (pkts && likely(get_desc >= start_thrs)) { \ _res = 1; \ if (unlikely(netif_tx_queue_stopped(txq)) && \ !(down_cond)) { \ diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h index cdcafb30d437..aa1716fb0e53 100644 --- a/include/net/netdev_rx_queue.h +++ b/include/net/netdev_rx_queue.h @@ -21,6 +21,10 @@ struct netdev_rx_queue { #ifdef CONFIG_XDP_SOCKETS struct xsk_buff_pool *pool; #endif + /* NAPI instance for the queue + * Readers and writers must hold RTNL + */ + struct napi_struct *napi; } ____cacheline_aligned_in_smp; /* diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index fe1507c1db82..a763dd327c6e 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -62,6 +62,8 @@ struct nf_flowtable_type { enum flow_offload_tuple_dir dir, struct nf_flow_rule 
*flow_rule); void (*free)(struct nf_flowtable *ft); + void (*get)(struct nf_flowtable *ft); + void (*put)(struct nf_flowtable *ft); nf_hookfn *hook; struct module *owner; }; @@ -72,12 +74,13 @@ enum nf_flowtable_flags { }; struct nf_flowtable { - struct list_head list; - struct rhashtable rhashtable; - int priority; + unsigned int flags; /* readonly in datapath */ + int priority; /* control path (padding hole) */ + struct rhashtable rhashtable; /* datapath, read-mostly members come first */ + + struct list_head list; /* slowpath parts */ const struct nf_flowtable_type *type; struct delayed_work gc_work; - unsigned int flags; struct flow_block flow_block; struct rw_semaphore flow_block_lock; /* Guards flow_block */ possible_net_t net; @@ -240,6 +243,11 @@ nf_flow_table_offload_add_cb(struct nf_flowtable *flow_table, } list_add_tail(&block_cb->list, &block->cb_list); + up_write(&flow_table->flow_block_lock); + + if (flow_table->type->get) + flow_table->type->get(flow_table); + return 0; unlock: up_write(&flow_table->flow_block_lock); @@ -262,10 +270,13 @@ nf_flow_table_offload_del_cb(struct nf_flowtable *flow_table, WARN_ON(true); } up_write(&flow_table->flow_block_lock); + + if (flow_table->type->put) + flow_table->type->put(flow_table); } void flow_offload_route_init(struct flow_offload *flow, - const struct nf_flow_route *route); + struct nf_flow_route *route); int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow); void flow_offload_refresh(struct nf_flowtable *flow_table, diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index c81021ab07aa..4aeffddb7586 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -35,7 +35,6 @@ struct nf_queue_handler { void nf_register_queue_handler(const struct nf_queue_handler *qh); void nf_unregister_queue_handler(void); -void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict); bool nf_queue_entry_get_refs(struct nf_queue_entry 
*entry); void nf_queue_entry_free(struct nf_queue_entry *entry); diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 3bbd13ab1ecf..e27c28b612e4 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -178,9 +178,9 @@ static inline __be32 nft_reg_load_be32(const u32 *sreg) return *(__force __be32 *)sreg; } -static inline void nft_reg_store64(u32 *dreg, u64 val) +static inline void nft_reg_store64(u64 *dreg, u64 val) { - put_unaligned(val, (u64 *)dreg); + put_unaligned(val, dreg); } static inline u64 nft_reg_load64(const u32 *sreg) @@ -205,6 +205,7 @@ static inline void nft_data_copy(u32 *dst, const struct nft_data *src, * @nla: netlink attributes * @portid: netlink portID of the original message * @seq: netlink sequence number + * @flags: modifiers to new request * @family: protocol family * @level: depth of the chains * @report: notify via unicast netlink message @@ -282,6 +283,7 @@ struct nft_elem_priv { }; * * @key: element key * @key_end: closing element key + * @data: element data * @priv: element private data and extensions */ struct nft_set_elem { @@ -325,10 +327,10 @@ struct nft_set_iter { * @dtype: data type * @dlen: data length * @objtype: object type - * @flags: flags * @size: number of set elements * @policy: set policy * @gc_int: garbage collector interval + * @timeout: element timeout * @field_len: length of each field in concatenation, bytes * @field_count: number of concatenated fields in element * @expr: set must support for expressions @@ -351,9 +353,9 @@ struct nft_set_desc { /** * enum nft_set_class - performance class * - * @NFT_LOOKUP_O_1: constant, O(1) - * @NFT_LOOKUP_O_LOG_N: logarithmic, O(log N) - * @NFT_LOOKUP_O_N: linear, O(N) + * @NFT_SET_CLASS_O_1: constant, O(1) + * @NFT_SET_CLASS_O_LOG_N: logarithmic, O(log N) + * @NFT_SET_CLASS_O_N: linear, O(N) */ enum nft_set_class { NFT_SET_CLASS_O_1, @@ -422,9 +424,13 @@ struct nft_set_ext; * @remove: remove element from set 
* @walk: iterate over all set elements * @get: get set elements + * @commit: commit set elements + * @abort: abort set elements * @privsize: function to return size of set private data + * @estimate: estimate the required memory size and the lookup complexity class * @init: initialize private data of new set instance * @destroy: destroy private data of set instance + * @gc_init: initialize garbage collection * @elemsize: element private size * * Operations lookup, update and delete have simpler interfaces, are faster @@ -540,13 +546,16 @@ struct nft_set_elem_expr { * @policy: set parameterization (see enum nft_set_policies) * @udlen: user data length * @udata: user data - * @expr: stateful expression + * @pending_update: list of pending update set element * @ops: set ops * @flags: set flags * @dead: set will be freed, never cleared * @genmask: generation mask * @klen: key length * @dlen: data length + * @num_exprs: numbers of exprs + * @exprs: stateful expression + * @catchall_list: list of catch-all set element * @data: private set data */ struct nft_set { @@ -692,6 +701,7 @@ extern const struct nft_set_ext_type nft_set_ext_types[]; * * @len: length of extension area * @offset: offsets of individual extension types + * @ext_len: length of the expected extension(used to sanity check) */ struct nft_set_ext_tmpl { u16 len; @@ -798,10 +808,16 @@ static inline struct nft_set_elem_expr *nft_set_ext_expr(const struct nft_set_ex return nft_set_ext(ext, NFT_SET_EXT_EXPRESSIONS); } -static inline bool nft_set_elem_expired(const struct nft_set_ext *ext) +static inline bool __nft_set_elem_expired(const struct nft_set_ext *ext, + u64 tstamp) { return nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION) && - time_is_before_eq_jiffies64(*nft_set_ext_expiration(ext)); + time_after_eq64(tstamp, *nft_set_ext_expiration(ext)); +} + +static inline bool nft_set_elem_expired(const struct nft_set_ext *ext) +{ + return __nft_set_elem_expired(ext, get_jiffies_64()); } static inline struct 
nft_set_ext *nft_set_elem_ext(const struct nft_set *set, @@ -840,6 +856,7 @@ struct nft_expr_ops; * @select_ops: function to select nft_expr_ops * @release_ops: release nft_expr_ops * @ops: default ops, used when no select_ops functions is present + * @inner_ops: inner ops, used for inner packet operation * @list: used internally * @name: Identifier * @owner: module reference @@ -881,14 +898,22 @@ struct nft_offload_ctx; * struct nft_expr_ops - nf_tables expression operations * * @eval: Expression evaluation function + * @clone: Expression clone function * @size: full expression size, including private data size * @init: initialization function * @activate: activate expression in the next generation * @deactivate: deactivate expression in next generation * @destroy: destruction function, called after synchronize_rcu + * @destroy_clone: destruction clone function * @dump: function to dump parameters - * @type: expression type * @validate: validate expression, called during loop detection + * @reduce: reduce expression + * @gc: garbage collection expression + * @offload: hardware offload expression + * @offload_action: function to report true/false to allocate one slot or not in the flow + * offload array + * @offload_stats: function to synchronize hardware stats via updating the counter expression + * @type: expression type * @data: extra data to attach to this expression operation */ struct nft_expr_ops { @@ -1041,14 +1066,21 @@ struct nft_rule_blob { /** * struct nft_chain - nf_tables chain * + * @blob_gen_0: rule blob pointer to the current generation + * @blob_gen_1: rule blob pointer to the future generation * @rules: list of rules in the chain * @list: used internally * @rhlhead: used internally * @table: table that this chain belongs to * @handle: chain handle * @use: number of jump references to this chain - * @flags: bitmask of enum nft_chain_flags + * @flags: bitmask of enum NFTA_CHAIN_FLAGS + * @bound: bind or not + * @genmask: generation mask * @name: 
name of the chain + * @udlen: user data length + * @udata: user data in the chain + * @blob_next: rule blob pointer to the next in the chain */ struct nft_chain { struct nft_rule_blob __rcu *blob_gen_0; @@ -1146,6 +1178,7 @@ struct nft_hook { * @hook_list: list of netfilter hooks (for NFPROTO_NETDEV family) * @type: chain type * @policy: default policy + * @flags: indicate the base chain disabled or not * @stats: per-cpu chain stats * @chain: the chain * @flow_block: flow block (for hardware offload) @@ -1244,6 +1277,12 @@ static inline bool nft_table_has_owner(const struct nft_table *table) return table->flags & NFT_TABLE_F_OWNER; } +static inline bool nft_table_is_orphan(const struct nft_table *table) +{ + return (table->flags & (NFT_TABLE_F_OWNER | NFT_TABLE_F_PERSIST)) == + NFT_TABLE_F_PERSIST; +} + static inline bool nft_base_chain_netdev(int family, u32 hooknum) { return family == NFPROTO_NETDEV || @@ -1274,11 +1313,13 @@ struct nft_object_hash_key { * struct nft_object - nf_tables stateful object * * @list: table stateful object list node - * @key: keys that identify this object * @rhlhead: nft_objname_ht node + * @key: keys that identify this object * @genmask: generation mask * @use: number of references to this stateful object * @handle: unique object handle + * @udlen: length of user data + * @udata: user data * @ops: object operations * @data: object data, layout depends on type */ @@ -1322,6 +1363,7 @@ void nft_obj_notify(struct net *net, const struct nft_table *table, * @type: stateful object numeric type * @owner: module owner * @maxattr: maximum netlink attribute + * @family: address family for AF-specific object types * @policy: netlink attribute policy */ struct nft_object_type { @@ -1331,6 +1373,7 @@ struct nft_object_type { struct list_head list; u32 type; unsigned int maxattr; + u8 family; struct module *owner; const struct nla_policy *policy; }; @@ -1344,6 +1387,7 @@ struct nft_object_type { * @destroy: release existing stateful object * 
@dump: netlink dump stateful object * @update: update stateful object + * @type: pointer to object type */ struct nft_object_ops { void (*eval)(struct nft_object *obj, @@ -1379,9 +1423,8 @@ void nft_unregister_obj(struct nft_object_type *obj_type); * @genmask: generation mask * @use: number of references to this flow table * @handle: unique object handle - * @dev_name: array of device names + * @hook_list: hook list for hooks per net_device in flowtables * @data: rhashtable and garbage collector - * @ops: array of hooks */ struct nft_flowtable { struct list_head list; @@ -1748,6 +1791,7 @@ struct nftables_pernet { struct list_head notify_list; struct mutex commit_mutex; u64 table_handle; + u64 tstamp; unsigned int base_seq; unsigned int gc_seq; u8 validate_state; @@ -1760,6 +1804,11 @@ static inline struct nftables_pernet *nft_pernet(const struct net *net) return net_generic(net, nf_tables_net_id); } +static inline u64 nft_net_tstamp(const struct net *net) +{ + return nft_pernet(net)->tstamp; +} + #define __NFT_REDUCE_READONLY 1UL #define NFT_REDUCE_READONLY (void *)__NFT_REDUCE_READONLY diff --git a/include/net/netfilter/nf_tables_ipv4.h b/include/net/netfilter/nf_tables_ipv4.h index 947973623dc7..60a7d0ce3080 100644 --- a/include/net/netfilter/nf_tables_ipv4.h +++ b/include/net/netfilter/nf_tables_ipv4.h @@ -30,7 +30,7 @@ static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) return -1; len = iph_totlen(pkt->skb, iph); - thoff = iph->ihl * 4; + thoff = skb_network_offset(pkt->skb) + (iph->ihl * 4); if (pkt->skb->len < len) return -1; else if (len < thoff) diff --git a/include/net/netkit.h b/include/net/netkit.h index 0ba2e6b847ca..9ec0163739f4 100644 --- a/include/net/netkit.h +++ b/include/net/netkit.h @@ -10,6 +10,7 @@ int netkit_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog); int netkit_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int netkit_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog); 
int netkit_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); +INDIRECT_CALLABLE_DECLARE(struct net_device *netkit_peer_dev(struct net_device *dev)); #else static inline int netkit_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) @@ -34,5 +35,10 @@ static inline int netkit_prog_query(const union bpf_attr *attr, { return -EINVAL; } + +static inline struct net_device *netkit_peer_dev(struct net_device *dev) +{ + return NULL; +} #endif /* CONFIG_NETKIT */ #endif /* __NET_NETKIT_H */ diff --git a/include/net/netlabel.h b/include/net/netlabel.h index 43ae50337685..f3ab0b8a4b18 100644 --- a/include/net/netlabel.h +++ b/include/net/netlabel.h @@ -145,15 +145,14 @@ struct netlbl_lsm_cache { * processing. * */ -#define NETLBL_CATMAP_MAPTYPE u64 #define NETLBL_CATMAP_MAPCNT 4 -#define NETLBL_CATMAP_MAPSIZE (sizeof(NETLBL_CATMAP_MAPTYPE) * 8) +#define NETLBL_CATMAP_MAPSIZE (sizeof(u64) * 8) #define NETLBL_CATMAP_SIZE (NETLBL_CATMAP_MAPSIZE * \ NETLBL_CATMAP_MAPCNT) -#define NETLBL_CATMAP_BIT (NETLBL_CATMAP_MAPTYPE)0x01 +#define NETLBL_CATMAP_BIT ((u64)0x01) struct netlbl_lsm_catmap { u32 startbit; - NETLBL_CATMAP_MAPTYPE bitmap[NETLBL_CATMAP_MAPCNT]; + u64 bitmap[NETLBL_CATMAP_MAPCNT]; struct netlbl_lsm_catmap *next; }; diff --git a/include/net/netlink.h b/include/net/netlink.h index 83bdf787aeee..c19ff921b661 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -1011,6 +1011,20 @@ static inline struct sk_buff *nlmsg_new(size_t payload, gfp_t flags) } /** + * nlmsg_new_large - Allocate a new netlink message with non-contiguous + * physical memory + * @payload: size of the message payload + * + * The allocated skb is unable to have frag page for shinfo->frags*, + * as the NULL setting for skb->head in netlink_skb_destructor() will + * bypass most of the handling in skb_release_data() + */ +static inline struct sk_buff *nlmsg_new_large(size_t payload) +{ + return netlink_alloc_large_skb(nlmsg_total_size(payload), 0); +} + +/** * 
nlmsg_end - Finalize a netlink message * @skb: socket buffer the message is stored in * @nlh: netlink message header @@ -1073,21 +1087,29 @@ static inline void nlmsg_free(struct sk_buff *skb) } /** - * nlmsg_multicast - multicast a netlink message + * nlmsg_multicast_filtered - multicast a netlink message with filter function * @sk: netlink socket to spread messages to * @skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: multicast group id * @flags: allocation flags + * @filter: filter function + * @filter_data: filter function private data + * + * Return: 0 on success, negative error code for failure. */ -static inline int nlmsg_multicast(struct sock *sk, struct sk_buff *skb, - u32 portid, unsigned int group, gfp_t flags) +static inline int nlmsg_multicast_filtered(struct sock *sk, struct sk_buff *skb, + u32 portid, unsigned int group, + gfp_t flags, + netlink_filter_fn filter, + void *filter_data) { int err; NETLINK_CB(skb).dst_group = group; - err = netlink_broadcast(sk, skb, portid, group, flags); + err = netlink_broadcast_filtered(sk, skb, portid, group, flags, + filter, filter_data); if (err > 0) err = 0; @@ -1095,6 +1117,21 @@ static inline int nlmsg_multicast(struct sock *sk, struct sk_buff *skb, } /** + * nlmsg_multicast - multicast a netlink message + * @sk: netlink socket to spread messages to + * @skb: netlink message as socket buffer + * @portid: own netlink portid to avoid sending to yourself + * @group: multicast group id + * @flags: allocation flags + */ +static inline int nlmsg_multicast(struct sock *sk, struct sk_buff *skb, + u32 portid, unsigned int group, gfp_t flags) +{ + return nlmsg_multicast_filtered(sk, skb, portid, group, flags, + NULL, NULL); +} + +/** * nlmsg_unicast - unicast a netlink message * @sk: netlink socket to spread message to * @skb: netlink message as socket buffer @@ -1200,7 +1237,7 @@ static inline void *nla_data(const struct nlattr *nla) * nla_len - length of payload 
* @nla: netlink attribute */ -static inline int nla_len(const struct nlattr *nla) +static inline u16 nla_len(const struct nlattr *nla) { return nla->nla_len - NLA_HDRLEN; } diff --git a/include/net/netmem.h b/include/net/netmem.h new file mode 100644 index 000000000000..d8b810245c1d --- /dev/null +++ b/include/net/netmem.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Network memory + * + * Author: Mina Almasry <almasrymina@google.com> + */ + +#ifndef _NET_NETMEM_H +#define _NET_NETMEM_H + +/** + * typedef netmem_ref - a nonexistent type marking a reference to generic + * network memory. + * + * A netmem_ref currently is always a reference to a struct page. This + * abstraction is introduced so support for new memory types can be added. + * + * Use the supplied helpers to obtain the underlying memory pointer and fields. + */ +typedef unsigned long __bitwise netmem_ref; + +/* This conversion fails (returns NULL) if the netmem_ref is not struct page + * backed. + * + * Currently struct page is the only possible netmem, and this helper never + * fails. + */ +static inline struct page *netmem_to_page(netmem_ref netmem) +{ + return (__force struct page *)netmem; +} + +/* Converting from page to netmem is always safe, because a page can always be + * a netmem. 
+ */ +static inline netmem_ref page_to_netmem(struct page *page) +{ + return (__force netmem_ref)page; +} + +#endif /* _NET_NETMEM_H */ diff --git a/include/net/netns/core.h b/include/net/netns/core.h index a91ef9f8de60..78214f1b43a2 100644 --- a/include/net/netns/core.h +++ b/include/net/netns/core.h @@ -13,6 +13,7 @@ struct netns_core { struct ctl_table_header *sysctl_hdr; int sysctl_somaxconn; + int sysctl_optmem_max; u8 sysctl_txrehash; #ifdef CONFIG_PROC_FS diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 73f43f699199..c356c458b340 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -19,8 +19,7 @@ struct hlist_head; struct fib_table; struct sock; struct local_ports { - seqlock_t lock; - int range[2]; + u32 range; /* high << 16 | low */ bool warned; }; @@ -42,6 +41,38 @@ struct inet_timewait_death_row { struct tcp_fastopen_context; struct netns_ipv4 { + /* Cacheline organization can be found documented in + * Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst. + * Please update the document when adding new fields. 
+ */ + + /* TX readonly hotpath cache lines */ + __cacheline_group_begin(netns_ipv4_read_tx); + u8 sysctl_tcp_early_retrans; + u8 sysctl_tcp_tso_win_divisor; + u8 sysctl_tcp_tso_rtt_log; + u8 sysctl_tcp_autocorking; + int sysctl_tcp_min_snd_mss; + unsigned int sysctl_tcp_notsent_lowat; + int sysctl_tcp_limit_output_bytes; + int sysctl_tcp_min_rtt_wlen; + int sysctl_tcp_wmem[3]; + u8 sysctl_ip_fwd_use_pmtu; + __cacheline_group_end(netns_ipv4_read_tx); + + /* TXRX readonly hotpath cache lines */ + __cacheline_group_begin(netns_ipv4_read_txrx); + u8 sysctl_tcp_moderate_rcvbuf; + __cacheline_group_end(netns_ipv4_read_txrx); + + /* RX readonly hotpath cache line */ + __cacheline_group_begin(netns_ipv4_read_rx); + u8 sysctl_ip_early_demux; + u8 sysctl_tcp_early_demux; + int sysctl_tcp_reordering; + int sysctl_tcp_rmem[3]; + __cacheline_group_end(netns_ipv4_read_rx); + struct inet_timewait_death_row tcp_death_row; struct udp_table *udp_table; @@ -96,17 +127,14 @@ struct netns_ipv4 { u8 sysctl_ip_default_ttl; u8 sysctl_ip_no_pmtu_disc; - u8 sysctl_ip_fwd_use_pmtu; u8 sysctl_ip_fwd_update_priority; u8 sysctl_ip_nonlocal_bind; u8 sysctl_ip_autobind_reuse; /* Shall we try to damage output packets if routing dev changes? 
*/ u8 sysctl_ip_dynaddr; - u8 sysctl_ip_early_demux; #ifdef CONFIG_NET_L3_MASTER_DEV u8 sysctl_raw_l3mdev_accept; #endif - u8 sysctl_tcp_early_demux; u8 sysctl_udp_early_demux; u8 sysctl_nexthop_compat_mode; @@ -119,7 +147,6 @@ struct netns_ipv4 { u8 sysctl_tcp_mtu_probing; int sysctl_tcp_mtu_probe_floor; int sysctl_tcp_base_mss; - int sysctl_tcp_min_snd_mss; int sysctl_tcp_probe_threshold; u32 sysctl_tcp_probe_interval; @@ -135,17 +162,14 @@ struct netns_ipv4 { u8 sysctl_tcp_backlog_ack_defer; u8 sysctl_tcp_pingpong_thresh; - int sysctl_tcp_reordering; u8 sysctl_tcp_retries1; u8 sysctl_tcp_retries2; u8 sysctl_tcp_orphan_retries; u8 sysctl_tcp_tw_reuse; int sysctl_tcp_fin_timeout; - unsigned int sysctl_tcp_notsent_lowat; u8 sysctl_tcp_sack; u8 sysctl_tcp_window_scaling; u8 sysctl_tcp_timestamps; - u8 sysctl_tcp_early_retrans; u8 sysctl_tcp_recovery; u8 sysctl_tcp_thin_linear_timeouts; u8 sysctl_tcp_slow_start_after_idle; @@ -161,21 +185,13 @@ struct netns_ipv4 { u8 sysctl_tcp_frto; u8 sysctl_tcp_nometrics_save; u8 sysctl_tcp_no_ssthresh_metrics_save; - u8 sysctl_tcp_moderate_rcvbuf; - u8 sysctl_tcp_tso_win_divisor; u8 sysctl_tcp_workaround_signed_windows; - int sysctl_tcp_limit_output_bytes; int sysctl_tcp_challenge_ack_limit; - int sysctl_tcp_min_rtt_wlen; u8 sysctl_tcp_min_tso_segs; - u8 sysctl_tcp_tso_rtt_log; - u8 sysctl_tcp_autocorking; u8 sysctl_tcp_reflect_tos; int sysctl_tcp_invalid_ratelimit; int sysctl_tcp_pacing_ss_ratio; int sysctl_tcp_pacing_ca_ratio; - int sysctl_tcp_wmem[3]; - int sysctl_tcp_rmem[3]; unsigned int sysctl_tcp_child_ehash_entries; unsigned long sysctl_tcp_comp_sack_delay_ns; unsigned long sysctl_tcp_comp_sack_slack_ns; diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 582212ada3ba..fc752a50f91b 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -22,5 +22,7 @@ struct netns_smc { int sysctl_smcr_testlink_time; int sysctl_wmem; int sysctl_rmem; + int sysctl_max_links_per_lgr; + int 
sysctl_max_conns_per_lgr; }; #endif diff --git a/include/net/nexthop.h b/include/net/nexthop.h index d92046a4a078..7ec9cc80f11c 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -47,6 +47,8 @@ struct nh_config { bool nh_grp_res_has_idle_timer; bool nh_grp_res_has_unbalanced_timer; + bool nh_hw_stats; + struct nlattr *nh_encap; u16 nh_encap_type; @@ -95,8 +97,14 @@ struct nh_res_table { struct nh_res_bucket nh_buckets[] __counted_by(num_nh_buckets); }; +struct nh_grp_entry_stats { + u64_stats_t packets; + struct u64_stats_sync syncp; +}; + struct nh_grp_entry { struct nexthop *nh; + struct nh_grp_entry_stats __percpu *stats; u8 weight; union { @@ -114,6 +122,7 @@ struct nh_grp_entry { struct list_head nh_list; struct nexthop *nh_parent; /* nexthop of group with this entry */ + u64 packets_hw; }; struct nh_group { @@ -124,6 +133,7 @@ struct nh_group { bool resilient; bool fdb_nh; bool has_v4; + bool hw_stats; struct nh_res_table __rcu *res_table; struct nh_grp_entry nh_entries[] __counted_by(num_nh); @@ -157,6 +167,7 @@ enum nexthop_event_type { NEXTHOP_EVENT_REPLACE, NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE, NEXTHOP_EVENT_BUCKET_REPLACE, + NEXTHOP_EVENT_HW_STATS_REPORT_DELTA, }; enum nh_notifier_info_type { @@ -164,6 +175,7 @@ enum nh_notifier_info_type { NH_NOTIFIER_INFO_TYPE_GRP, NH_NOTIFIER_INFO_TYPE_RES_TABLE, NH_NOTIFIER_INFO_TYPE_RES_BUCKET, + NH_NOTIFIER_INFO_TYPE_GRP_HW_STATS, }; struct nh_notifier_single_info { @@ -187,6 +199,7 @@ struct nh_notifier_grp_entry_info { struct nh_notifier_grp_info { u16 num_nh; bool is_fdb; + bool hw_stats; struct nh_notifier_grp_entry_info nh_entries[] __counted_by(num_nh); }; @@ -200,9 +213,21 @@ struct nh_notifier_res_bucket_info { struct nh_notifier_res_table_info { u16 num_nh_buckets; + bool hw_stats; struct nh_notifier_single_info nhs[] __counted_by(num_nh_buckets); }; +struct nh_notifier_grp_hw_stats_entry_info { + u32 id; + u64 packets; +}; + +struct nh_notifier_grp_hw_stats_info { + u16 num_nh; + bool 
hw_stats_used; + struct nh_notifier_grp_hw_stats_entry_info stats[] __counted_by(num_nh); +}; + struct nh_notifier_info { struct net *net; struct netlink_ext_ack *extack; @@ -213,17 +238,22 @@ struct nh_notifier_info { struct nh_notifier_grp_info *nh_grp; struct nh_notifier_res_table_info *nh_res_table; struct nh_notifier_res_bucket_info *nh_res_bucket; + struct nh_notifier_grp_hw_stats_info *nh_grp_hw_stats; }; }; int register_nexthop_notifier(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); +int __unregister_nexthop_notifier(struct net *net, struct notifier_block *nb); int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb); void nexthop_set_hw_flags(struct net *net, u32 id, bool offload, bool trap); void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index, bool offload, bool trap); void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets, unsigned long *activity); +void nh_grp_hw_stats_report_delta(struct nh_notifier_grp_hw_stats_info *info, + unsigned int nh_idx, + u64 delta_packets); /* caller is holding rcu or rtnl; no reference taken to nexthop */ struct nexthop *nexthop_find_by_id(struct net *net, u32 id); @@ -316,7 +346,7 @@ static inline int nexthop_mpath_fill_node(struct sk_buff *skb, struct nexthop *nh, u8 rt_family) { - struct nh_group *nhg = rtnl_dereference(nh->nh_grp); + struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp); int i; for (i = 0; i < nhg->num_nh; i++) { diff --git a/include/net/nfc/nfc.h b/include/net/nfc/nfc.h index 5dee575fbe86..3d07abacf08b 100644 --- a/include/net/nfc/nfc.h +++ b/include/net/nfc/nfc.h @@ -196,7 +196,7 @@ struct nfc_dev { }; #define to_nfc_dev(_dev) container_of(_dev, struct nfc_dev, dev) -extern struct class nfc_class; +extern const struct class nfc_class; struct nfc_dev *nfc_allocate_device(const struct nfc_ops *ops, u32 supported_protocols, diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index 
4ebd544ae977..1d397c1a0043 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -11,7 +11,7 @@ * The page_pool allocator is optimized for recycling page or page fragment used * by skb packet and xdp frame. * - * Basic use involves replacing and alloc_pages() calls with page_pool_alloc(), + * Basic use involves replacing any alloc_pages() calls with page_pool_alloc(), * which allocate memory with or without page splitting depending on the * requested memory size. * @@ -29,7 +29,7 @@ * page allocated from page pool. Page splitting enables memory saving and thus * avoids TLB/cache miss for data access, but there also is some cost to * implement page splitting, mainly some cache line dirtying/bouncing for - * 'struct page' and atomic operation for page->pp_frag_count. + * 'struct page' and atomic operation for page->pp_ref_count. * * The API keeps track of in-flight pages, in order to let API users know when * it is safe to free a page_pool object, the API users must call @@ -37,15 +37,15 @@ * attach the page_pool object to a page_pool-aware object like skbs marked with * skb_mark_for_recycle(). * - * page_pool_put_page() may be called multi times on the same page if a page is - * split into multi fragments. For the last fragment, it will either recycle the - * page, or in case of page->_refcount > 1, it will release the DMA mapping and - * in-flight state accounting. + * page_pool_put_page() may be called multiple times on the same page if a page + * is split into multiple fragments. For the last fragment, it will either + * recycle the page, or in case of page->_refcount > 1, it will release the DMA + * mapping and in-flight state accounting. 
* * dma_sync_single_range_for_device() is only called for the last fragment when * page_pool is created with PP_FLAG_DMA_SYNC_DEV flag, so it depends on the * last freed fragment to do the sync_for_device operation for all fragments in - * the same page when a page is split, the API user must setup pool->p.max_len + * the same page when a page is split. The API user must setup pool->p.max_len * and pool->p.offset correctly and ensure that page_pool_put_page() is called * with dma_sync_size being -1 for fragment API. */ @@ -55,16 +55,12 @@ #include <net/page_pool/types.h> #ifdef CONFIG_PAGE_POOL_STATS +/* Deprecated driver-facing API, use netlink instead */ int page_pool_ethtool_stats_get_count(void); u8 *page_pool_ethtool_stats_get_strings(u8 *data); u64 *page_pool_ethtool_stats_get(u64 *data, void *stats); -/* - * Drivers that wish to harvest page pool stats and report them to users - * (perhaps via ethtool, debugfs, or another mechanism) can allocate a - * struct page_pool_stats call page_pool_get_stats to get stats for the specified pool. - */ -bool page_pool_get_stats(struct page_pool *pool, +bool page_pool_get_stats(const struct page_pool *pool, struct page_pool_stats *stats); #else static inline int page_pool_ethtool_stats_get_count(void) @@ -214,69 +210,82 @@ inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool) return pool->p.dma_dir; } -/* pp_frag_count represents the number of writers who can update the page - * either by updating skb->data or via DMA mappings for the device. - * We can't rely on the page refcnt for that as we don't know who might be - * holding page references and we can't reliably destroy or sync DMA mappings - * of the fragments. 
+/** + * page_pool_fragment_page() - split a fresh page into fragments + * @page: page to split + * @nr: references to set + * + * pp_ref_count represents the number of outstanding references to the page, + * which will be freed using page_pool APIs (rather than page allocator APIs + * like put_page()). Such references are usually held by page_pool-aware + * objects like skbs marked for page pool recycling. * - * When pp_frag_count reaches 0 we can either recycle the page if the page - * refcnt is 1 or return it back to the memory allocator and destroy any - * mappings we have. + * This helper allows the caller to take (set) multiple references to a + * freshly allocated page. The page must be freshly allocated (have a + * pp_ref_count of 1). This is commonly done by drivers and + * "fragment allocators" to save atomic operations - either when they know + * upfront how many references they will need; or to take MAX references and + * return the unused ones with a single atomic dec(), instead of performing + * multiple atomic inc() operations. */ static inline void page_pool_fragment_page(struct page *page, long nr) { - atomic_long_set(&page->pp_frag_count, nr); + atomic_long_set(&page->pp_ref_count, nr); } -static inline long page_pool_defrag_page(struct page *page, long nr) +static inline long page_pool_unref_page(struct page *page, long nr) { long ret; - /* If nr == pp_frag_count then we have cleared all remaining + /* If nr == pp_ref_count then we have cleared all remaining * references to the page: * 1. 'n == 1': no need to actually overwrite it. * 2. 'n != 1': overwrite it with one, which is the rare case - * for pp_frag_count draining. + * for pp_ref_count draining. 
* * The main advantage to doing this is that not only we avoid a atomic * update, as an atomic_read is generally a much cheaper operation than * an atomic update, especially when dealing with a page that may be - * partitioned into only 2 or 3 pieces; but also unify the pp_frag_count + * referenced by only 2 or 3 users; but also unify the pp_ref_count * handling by ensuring all pages have partitioned into only 1 piece * initially, and only overwrite it when the page is partitioned into * more than one piece. */ - if (atomic_long_read(&page->pp_frag_count) == nr) { + if (atomic_long_read(&page->pp_ref_count) == nr) { /* As we have ensured nr is always one for constant case using * the BUILD_BUG_ON(), only need to handle the non-constant case - * here for pp_frag_count draining, which is a rare case. + * here for pp_ref_count draining, which is a rare case. */ BUILD_BUG_ON(__builtin_constant_p(nr) && nr != 1); if (!__builtin_constant_p(nr)) - atomic_long_set(&page->pp_frag_count, 1); + atomic_long_set(&page->pp_ref_count, 1); return 0; } - ret = atomic_long_sub_return(nr, &page->pp_frag_count); + ret = atomic_long_sub_return(nr, &page->pp_ref_count); WARN_ON(ret < 0); - /* We are the last user here too, reset pp_frag_count back to 1 to + /* We are the last user here too, reset pp_ref_count back to 1 to * ensure all pages have been partitioned into 1 piece initially, * this should be the rare case when the last two fragment users call - * page_pool_defrag_page() currently. + * page_pool_unref_page() currently. 
*/ if (unlikely(!ret)) - atomic_long_set(&page->pp_frag_count, 1); + atomic_long_set(&page->pp_ref_count, 1); return ret; } -static inline bool page_pool_is_last_frag(struct page *page) +static inline void page_pool_ref_page(struct page *page) +{ + atomic_long_inc(&page->pp_ref_count); +} + +static inline bool page_pool_is_last_ref(struct page *page) { - /* If page_pool_defrag_page() returns 0, we were the last user */ - return page_pool_defrag_page(page, 1) == 0; + /* If page_pool_unref_page() returns 0, we were the last user */ + return page_pool_unref_page(page, 1) == 0; } /** @@ -301,10 +310,10 @@ static inline void page_pool_put_page(struct page_pool *pool, * allow registering MEM_TYPE_PAGE_POOL, but shield linker. */ #ifdef CONFIG_PAGE_POOL - if (!page_pool_is_last_frag(page)) + if (!page_pool_is_last_ref(page)) return; - page_pool_put_defragged_page(pool, page, dma_sync_size, allow_direct); + page_pool_put_unrefed_page(pool, page, dma_sync_size, allow_direct); #endif } diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index 6fc5134095ed..5e43a08d3231 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -5,6 +5,7 @@ #include <linux/dma-direction.h> #include <linux/ptr_ring.h> +#include <linux/types.h> #define PP_FLAG_DMA_MAP BIT(0) /* Should page_pool do the DMA * map/unmap @@ -17,8 +18,9 @@ * Please note DMA-sync-for-CPU is still * device driver responsibility */ -#define PP_FLAG_ALL (PP_FLAG_DMA_MAP |\ - PP_FLAG_DMA_SYNC_DEV) +#define PP_FLAG_SYSTEM_POOL BIT(2) /* Global system page_pool */ +#define PP_FLAG_ALL (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV | \ + PP_FLAG_SYSTEM_POOL) /* * Fast allocation side cache array/stack @@ -48,24 +50,30 @@ struct pp_alloc_cache { * @pool_size: size of the ptr_ring * @nid: NUMA node id to allocate from pages from * @dev: device, for DMA pre-mapping purposes + * @netdev: netdev this pool will serve (leave as NULL if none or multiple) * @napi: NAPI which is the sole 
consumer of pages, otherwise NULL * @dma_dir: DMA mapping direction * @max_len: max DMA sync memory size for PP_FLAG_DMA_SYNC_DEV * @offset: DMA sync address offset for PP_FLAG_DMA_SYNC_DEV */ struct page_pool_params { - unsigned int flags; - unsigned int order; - unsigned int pool_size; - int nid; - struct device *dev; - struct napi_struct *napi; - enum dma_data_direction dma_dir; - unsigned int max_len; - unsigned int offset; + struct_group_tagged(page_pool_params_fast, fast, + unsigned int flags; + unsigned int order; + unsigned int pool_size; + int nid; + struct device *dev; + struct napi_struct *napi; + enum dma_data_direction dma_dir; + unsigned int max_len; + unsigned int offset; + ); + struct_group_tagged(page_pool_params_slow, slow, + struct net_device *netdev; /* private: used by test code only */ - void (*init_callback)(struct page *page, void *arg); - void *init_arg; + void (*init_callback)(struct page *page, void *arg); + void *init_arg; + ); }; #ifdef CONFIG_PAGE_POOL_STATS @@ -119,7 +127,10 @@ struct page_pool_stats { #endif struct page_pool { - struct page_pool_params p; + struct page_pool_params_fast p; + + int cpuid; + bool has_init_callback; long frag_users; struct page *frag_page; @@ -178,27 +189,34 @@ struct page_pool { refcount_t user_cnt; u64 destroy_cnt; + + /* Slow/Control-path information follows */ + struct page_pool_params_slow slow; + /* User-facing fields, protected by page_pools_lock */ + struct { + struct hlist_node list; + u64 detach_time; + u32 napi_id; + u32 id; + } user; }; struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp); struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset, unsigned int size, gfp_t gfp); struct page_pool *page_pool_create(const struct page_pool_params *params); +struct page_pool *page_pool_create_percpu(const struct page_pool_params *params, + int cpuid); struct xdp_mem_info; #ifdef CONFIG_PAGE_POOL -void page_pool_unlink_napi(struct page_pool *pool); void 
page_pool_destroy(struct page_pool *pool); void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), struct xdp_mem_info *mem); void page_pool_put_page_bulk(struct page_pool *pool, void **data, int count); #else -static inline void page_pool_unlink_napi(struct page_pool *pool) -{ -} - static inline void page_pool_destroy(struct page_pool *pool) { } @@ -215,9 +233,9 @@ static inline void page_pool_put_page_bulk(struct page_pool *pool, void **data, } #endif -void page_pool_put_defragged_page(struct page_pool *pool, struct page *page, - unsigned int dma_sync_size, - bool allow_direct); +void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, + bool allow_direct); static inline bool is_page_pool_compiled_in(void) { diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index a76c9171db0e..a4ee43f493bb 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -24,6 +24,8 @@ struct tcf_walker { int register_tcf_proto_ops(struct tcf_proto_ops *ops); void unregister_tcf_proto_ops(struct tcf_proto_ops *ops); +#define NET_CLS_ALIAS_PREFIX "net-cls-" +#define MODULE_ALIAS_NET_CLS(kind) MODULE_ALIAS(NET_CLS_ALIAS_PREFIX kind) struct tcf_block_ext_info { enum flow_block_binder_type binder_type; @@ -154,12 +156,6 @@ __cls_set_class(unsigned long *clp, unsigned long cl) return xchg(clp, cl); } -static inline void tcf_set_drop_reason(struct tcf_result *res, - enum skb_drop_reason reason) -{ - res->drop_reason = reason; -} - static inline void __tcf_bind_filter(struct Qdisc *q, struct tcf_result *r, unsigned long base) { diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 9fa1d0794dfa..d7b7b6cd4aa1 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -100,6 +100,8 @@ struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops, int register_qdisc(struct Qdisc_ops *qops); void unregister_qdisc(struct Qdisc_ops *qops); +#define NET_SCH_ALIAS_PREFIX "net-sch-" 
+#define MODULE_ALIAS_NET_SCH(id) MODULE_ALIAS(NET_SCH_ALIAS_PREFIX id) void qdisc_get_default(char *id, size_t len); int qdisc_set_default(const char *id); @@ -275,24 +277,6 @@ static inline void skb_txtime_consumed(struct sk_buff *skb) skb->tstamp = ktime_set(0, 0); } -struct tc_skb_cb { - struct qdisc_skb_cb qdisc_cb; - - u16 mru; - u8 post_ct:1; - u8 post_ct_snat:1; - u8 post_ct_dnat:1; - u16 zone; /* Only valid if post_ct = true */ -}; - -static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb) -{ - struct tc_skb_cb *cb = (struct tc_skb_cb *)skb->cb; - - BUILD_BUG_ON(sizeof(*cb) > sizeof_field(struct sk_buff, cb)); - return cb; -} - static inline bool tc_qdisc_stats_dump(struct Qdisc *sch, unsigned long cl, struct qdisc_walker *arg) diff --git a/include/net/protocol.h b/include/net/protocol.h index 6aef8cb11cc8..b2499f88f8f8 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -46,6 +46,7 @@ struct net_protocol { * socket lookup? */ icmp_strict_tag_validation:1; + u32 secret; }; #if IS_ENABLED(CONFIG_IPV6) @@ -59,6 +60,7 @@ struct inet6_protocol { __be32 info); unsigned int flags; /* INET6_PROTO_xxx */ + u32 secret; }; #define INET6_PROTO_NOPOLICY 0x1 @@ -68,6 +70,7 @@ struct inet6_protocol { struct net_offload { struct offload_callbacks callbacks; unsigned int flags; /* Flags used by IPv6 for now */ + u32 secret; }; /* This should be set for any extension header which is compatible with GSO. 
*/ #define INET6_PROTO_GSO_EXTHDR 0x1 diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 144c39db9898..8839133d6f6b 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -83,6 +83,45 @@ static inline struct sock *req_to_sk(struct request_sock *req) return (struct sock *)req; } +/** + * skb_steal_sock - steal a socket from an sk_buff + * @skb: sk_buff to steal the socket from + * @refcounted: is set to true if the socket is reference-counted + * @prefetched: is set to true if the socket was assigned from bpf + */ +static inline struct sock *skb_steal_sock(struct sk_buff *skb, + bool *refcounted, bool *prefetched) +{ + struct sock *sk = skb->sk; + + if (!sk) { + *prefetched = false; + *refcounted = false; + return NULL; + } + + *prefetched = skb_sk_is_prefetched(skb); + if (*prefetched) { +#if IS_ENABLED(CONFIG_SYN_COOKIES) + if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) { + struct request_sock *req = inet_reqsk(sk); + + *refcounted = false; + sk = req->rsk_listener; + req->rsk_listener = NULL; + return sk; + } +#endif + *refcounted = sk_is_refcounted(sk); + } else { + *refcounted = true; + } + + skb->destructor = NULL; + skb->sk = NULL; + return sk; +} + static inline struct request_sock * reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, bool attach_listener) diff --git a/include/net/route.h b/include/net/route.h index 980ab474eabd..d4a0147942f1 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -37,9 +37,6 @@ #define RTO_ONLINK 0x01 -#define RT_CONN_FLAGS(sk) (RT_TOS(READ_ONCE(inet_sk(sk)->tos)) | sock_flag(sk, SOCK_LOCALROUTE)) -#define RT_CONN_FLAGS_TOS(sk,tos) (RT_TOS(tos) | sock_flag(sk, SOCK_LOCALROUTE)) - static inline __u8 ip_sock_rt_scope(const struct sock *sk) { if (sock_flag(sk, SOCK_LOCALROUTE)) @@ -163,8 +160,8 @@ static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi __u8 proto, __u8 tos, int oif) { flowi4_init_output(fl4, 
oif, sk ? READ_ONCE(sk->sk_mark) : 0, tos, - RT_SCOPE_UNIVERSE, proto, - sk ? inet_sk_flowi_flags(sk) : 0, + sk ? ip_sock_rt_scope(sk) : RT_SCOPE_UNIVERSE, + proto, sk ? inet_sk_flowi_flags(sk) : 0, daddr, saddr, dport, sport, sock_net_uid(net, sk)); if (sk) security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); diff --git a/include/net/rps.h b/include/net/rps.h new file mode 100644 index 000000000000..7660243e905b --- /dev/null +++ b/include/net/rps.h @@ -0,0 +1,125 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _NET_RPS_H +#define _NET_RPS_H + +#include <linux/types.h> +#include <linux/static_key.h> +#include <net/sock.h> +#include <net/hotdata.h> + +#ifdef CONFIG_RPS + +extern struct static_key_false rps_needed; +extern struct static_key_false rfs_needed; + +/* + * This structure holds an RPS map which can be of variable length. The + * map is an array of CPUs. + */ +struct rps_map { + unsigned int len; + struct rcu_head rcu; + u16 cpus[]; +}; +#define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + ((_num) * sizeof(u16))) + +/* + * The rps_dev_flow structure contains the mapping of a flow to a CPU, the + * tail pointer for that CPU's input queue at the time of last enqueue, and + * a hardware filter index. + */ +struct rps_dev_flow { + u16 cpu; + u16 filter; + unsigned int last_qtail; +}; +#define RPS_NO_FILTER 0xffff + +/* + * The rps_dev_flow_table structure contains a table of flow mappings. + */ +struct rps_dev_flow_table { + unsigned int mask; + struct rcu_head rcu; + struct rps_dev_flow flows[]; +}; +#define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \ + ((_num) * sizeof(struct rps_dev_flow))) + +/* + * The rps_sock_flow_table contains mappings of flows to the last CPU + * on which they were processed by the application (set in recvmsg). + * Each entry is a 32bit value. Upper part is the high-order bits + * of flow hash, lower part is CPU number. 
+ * rps_cpu_mask is used to partition the space, depending on number of + * possible CPUs : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1 + * For example, if 64 CPUs are possible, rps_cpu_mask = 0x3f, + * meaning we use 32-6=26 bits for the hash. + */ +struct rps_sock_flow_table { + u32 mask; + + u32 ents[] ____cacheline_aligned_in_smp; +}; +#define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num])) + +#define RPS_NO_CPU 0xffff + +static inline void rps_record_sock_flow(struct rps_sock_flow_table *table, + u32 hash) +{ + unsigned int index = hash & table->mask; + u32 val = hash & ~net_hotdata.rps_cpu_mask; + + /* We only give a hint, preemption can change CPU under us */ + val |= raw_smp_processor_id(); + + /* The following WRITE_ONCE() is paired with the READ_ONCE() + * here, and another one in get_rps_cpu(). + */ + if (READ_ONCE(table->ents[index]) != val) + WRITE_ONCE(table->ents[index], val); +} + +#endif /* CONFIG_RPS */ + +static inline void sock_rps_record_flow_hash(__u32 hash) +{ +#ifdef CONFIG_RPS + struct rps_sock_flow_table *sock_flow_table; + + if (!hash) + return; + rcu_read_lock(); + sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table); + if (sock_flow_table) + rps_record_sock_flow(sock_flow_table, hash); + rcu_read_unlock(); +#endif +} + +static inline void sock_rps_record_flow(const struct sock *sk) +{ +#ifdef CONFIG_RPS + if (static_branch_unlikely(&rfs_needed)) { + /* Reading sk->sk_rxhash might incur an expensive cache line + * miss. + * + * TCP_ESTABLISHED does cover almost all states where RFS + * might be useful, and is cheaper [1] than testing : + * IPv4: inet_sk(sk)->inet_daddr + * IPv6: ipv6_addr_any(&sk->sk_v6_daddr) + * OR an additional socket flag + * [1] : sk_state and sk_prot are in the same cache line. + */ + if (sk->sk_state == TCP_ESTABLISHED) { + /* This READ_ONCE() is paired with the WRITE_ONCE() + * from sock_rps_save_rxhash() and sock_rps_reset_rxhash(). 
+ */ + sock_rps_record_flow_hash(READ_ONCE(sk->sk_rxhash)); + } + } +#endif +} + +#endif /* _NET_RPS_H */ diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 6506221c5fe3..3bfb80bad173 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -12,6 +12,7 @@ typedef int (*rtnl_dumpit_func)(struct sk_buff *, struct netlink_callback *); enum rtnl_link_flags { RTNL_FLAG_DOIT_UNLOCKED = BIT(0), RTNL_FLAG_BULK_DEL_SUPPORTED = BIT(1), + RTNL_FLAG_DUMP_UNLOCKED = BIT(2), }; enum rtnl_kinds { diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index dcb9160e6467..cefe0c4bdae3 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -19,6 +19,7 @@ #include <net/gen_stats.h> #include <net/rtnetlink.h> #include <net/flow_offload.h> +#include <linux/xarray.h> struct Qdisc_ops; struct qdisc_walker; @@ -237,12 +238,7 @@ static inline bool qdisc_may_bulk(const struct Qdisc *qdisc) static inline int qdisc_avail_bulklimit(const struct netdev_queue *txq) { -#ifdef CONFIG_BQL - /* Non-BQL migrated drivers will return 0, too. */ - return dql_avail(&txq->dql); -#else - return 0; -#endif + return netdev_queue_dql_avail(txq); } struct Qdisc_class_ops { @@ -332,7 +328,6 @@ struct tcf_result { }; const struct tcf_proto *goto_tp; }; - enum skb_drop_reason drop_reason; }; struct tcf_chain; @@ -375,6 +370,10 @@ struct tcf_proto_ops { struct nlattr **tca, struct netlink_ext_ack *extack); void (*tmplt_destroy)(void *tmplt_priv); + void (*tmplt_reoffload)(struct tcf_chain *chain, + bool add, + flow_setup_cb_t *cb, + void *cb_priv); struct tcf_exts * (*get_exts)(const struct tcf_proto *tp, u32 handle); @@ -457,6 +456,7 @@ struct tcf_chain { }; struct tcf_block { + struct xarray ports; /* datapath accessible */ /* Lock protects tcf_block and lifetime-management data of chains * attached to the block (refcnt, action_refcnt, explicitly_created). 
*/ @@ -483,6 +483,8 @@ struct tcf_block { struct mutex proto_destroy_lock; /* Lock for proto_destroy hashtable. */ }; +struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index); + static inline bool lockdep_tcf_chain_is_locked(struct tcf_chain *chain) { return lockdep_is_held(&chain->filter_chain_lock); @@ -1037,6 +1039,37 @@ static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch) return skb; } +struct tc_skb_cb { + struct qdisc_skb_cb qdisc_cb; + u32 drop_reason; + + u16 zone; /* Only valid if post_ct = true */ + u16 mru; + u8 post_ct:1; + u8 post_ct_snat:1; + u8 post_ct_dnat:1; +}; + +static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb) +{ + struct tc_skb_cb *cb = (struct tc_skb_cb *)skb->cb; + + BUILD_BUG_ON(sizeof(*cb) > sizeof_field(struct sk_buff, cb)); + return cb; +} + +static inline enum skb_drop_reason +tcf_get_drop_reason(const struct sk_buff *skb) +{ + return tc_skb_cb(skb)->drop_reason; +} + +static inline void tcf_set_drop_reason(const struct sk_buff *skb, + enum skb_drop_reason reason) +{ + tc_skb_cb(skb)->drop_reason = reason; +} + /* Instead of calling kfree_skb() while root qdisc lock is held, * queue the skb for future freeing at end of __dev_xmit_skb() */ diff --git a/include/net/scm.h b/include/net/scm.h index e8c76b4be2fe..92276a2c5543 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -5,6 +5,7 @@ #include <linux/limits.h> #include <linux/net.h> #include <linux/cred.h> +#include <linux/file.h> #include <linux/security.h> #include <linux/pid.h> #include <linux/nsproxy.h> @@ -24,6 +25,7 @@ struct scm_creds { struct scm_fp_list { short count; + short count_unix; short max; struct user_struct *user; struct file *fp[SCM_MAX_FD]; @@ -208,5 +210,13 @@ static inline void scm_recv_unix(struct socket *sock, struct msghdr *msg, scm_destroy_cred(scm); } +static inline int scm_recv_one_fd(struct file *f, int __user *ufd, + unsigned int flags) +{ + if (!ufd) + return -EFAULT; + return receive_fd(f, ufd, flags); 
+} + #endif /* __LINUX_NET_SCM_H */ diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 5a24d6d8522a..f24a1bbcb3ef 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -242,10 +242,7 @@ struct sctp_sock { int do_auto_asconf; }; -static inline struct sctp_sock *sctp_sk(const struct sock *sk) -{ - return (struct sctp_sock *)sk; -} +#define sctp_sk(ptr) container_of_const(ptr, struct sctp_sock, inet.sk) static inline struct sock *sctp_opt2sk(const struct sctp_sock *sp) { diff --git a/include/net/smc.h b/include/net/smc.h index a002552be29c..c9dcb30e3fd9 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -52,9 +52,14 @@ struct smcd_dmb { struct smcd_dev; struct ism_client; +struct smcd_gid { + u64 gid; + u64 gid_ext; +}; + struct smcd_ops { - int (*query_remote_gid)(struct smcd_dev *dev, u64 rgid, u32 vid_valid, - u32 vid); + int (*query_remote_gid)(struct smcd_dev *dev, struct smcd_gid *rgid, + u32 vid_valid, u32 vid); int (*register_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb, struct ism_client *client); int (*unregister_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb); @@ -62,14 +67,13 @@ struct smcd_ops { int (*del_vlan_id)(struct smcd_dev *dev, u64 vlan_id); int (*set_vlan_required)(struct smcd_dev *dev); int (*reset_vlan_required)(struct smcd_dev *dev); - int (*signal_event)(struct smcd_dev *dev, u64 rgid, u32 trigger_irq, - u32 event_code, u64 info); + int (*signal_event)(struct smcd_dev *dev, struct smcd_gid *rgid, + u32 trigger_irq, u32 event_code, u64 info); int (*move_data)(struct smcd_dev *dev, u64 dmb_tok, unsigned int idx, bool sf, unsigned int offset, void *data, unsigned int size); int (*supports_v2)(void); - u8* (*get_system_eid)(void); - u64 (*get_local_gid)(struct smcd_dev *dev); + void (*get_local_gid)(struct smcd_dev *dev, struct smcd_gid *gid); u16 (*get_chid)(struct smcd_dev *dev); struct device* (*get_dev)(struct smcd_dev *dev); }; diff --git a/include/net/sock.h b/include/net/sock.h index 
1d6931caf0c3..b5e00702acc1 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -76,19 +76,6 @@ * the other protocols. */ -/* Define this to get the SOCK_DBG debugging facility. */ -#define SOCK_DEBUGGING -#ifdef SOCK_DEBUGGING -#define SOCK_DEBUG(sk, msg...) do { if ((sk) && sock_flag((sk), SOCK_DBG)) \ - printk(KERN_DEBUG msg); } while (0) -#else -/* Validate arguments and do nothing */ -static inline __printf(2, 3) -void SOCK_DEBUG(const struct sock *sk, const char *msg, ...) -{ -} -#endif - /* This is the per-socket lock. The spinlock provides a synchronization * between user contexts and software interrupt processing, whereas the * mini-semaphore synchronizes multiple users amongst themselves. @@ -277,8 +264,6 @@ struct sk_filter; * @sk_pacing_status: Pacing status (requested, handled by sch_fq) * @sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE) * @sk_sndbuf: size of send buffer in bytes - * @__sk_flags_offset: empty field used to determine location of bitfield - * @sk_padding: unused element for alignment * @sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets * @sk_no_check_rx: allow zero checksum in RX packets * @sk_route_caps: route capabilities (e.g. 
%NETIF_F_TSO) @@ -352,7 +337,6 @@ struct sk_filter; * @sk_txtime_report_errors: set report errors mode for SO_TXTIME * @sk_txtime_unused: unused txtime flags * @ns_tracker: tracker for netns reference - * @sk_bind2_node: bind node in the bhash2 table */ struct sock { /* @@ -394,14 +378,10 @@ struct sock { #define sk_flags __sk_common.skc_flags #define sk_rxhash __sk_common.skc_rxhash - /* early demux fields */ - struct dst_entry __rcu *sk_rx_dst; - int sk_rx_dst_ifindex; - u32 sk_rx_dst_cookie; + __cacheline_group_begin(sock_write_rx); - socket_lock_t sk_lock; atomic_t sk_drops; - int sk_rcvlowat; + __s32 sk_peek_off; struct sk_buff_head sk_error_queue; struct sk_buff_head sk_receive_queue; /* @@ -418,18 +398,24 @@ struct sock { struct sk_buff *head; struct sk_buff *tail; } sk_backlog; - #define sk_rmem_alloc sk_backlog.rmem_alloc - int sk_forward_alloc; - u32 sk_reserved_mem; + __cacheline_group_end(sock_write_rx); + + __cacheline_group_begin(sock_read_rx); + /* early demux fields */ + struct dst_entry __rcu *sk_rx_dst; + int sk_rx_dst_ifindex; + u32 sk_rx_dst_cookie; + #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int sk_ll_usec; - /* ===== mostly read cache line ===== */ unsigned int sk_napi_id; + u16 sk_busy_poll_budget; + u8 sk_prefer_busy_poll; #endif + u8 sk_userlocks; int sk_rcvbuf; - int sk_disconnects; struct sk_filter __rcu *sk_filter; union { @@ -438,15 +424,33 @@ struct sock { struct socket_wq *sk_wq_raw; /* public: */ }; + + void (*sk_data_ready)(struct sock *sk); + long sk_rcvtimeo; + int sk_rcvlowat; + __cacheline_group_end(sock_read_rx); + + __cacheline_group_begin(sock_read_rxtx); + int sk_err; + struct socket *sk_socket; + struct mem_cgroup *sk_memcg; #ifdef CONFIG_XFRM struct xfrm_policy __rcu *sk_policy[2]; #endif + __cacheline_group_end(sock_read_rxtx); - struct dst_entry __rcu *sk_dst_cache; + __cacheline_group_begin(sock_write_rxtx); + socket_lock_t sk_lock; + u32 sk_reserved_mem; + int sk_forward_alloc; + u32 sk_tsflags; + 
__cacheline_group_end(sock_write_rxtx); + + __cacheline_group_begin(sock_write_tx); + int sk_write_pending; atomic_t sk_omem_alloc; int sk_sndbuf; - /* ===== cache line for TX ===== */ int sk_wmem_queued; refcount_t sk_wmem_alloc; unsigned long sk_tsq_flags; @@ -455,22 +459,36 @@ struct sock { struct rb_root tcp_rtx_queue; }; struct sk_buff_head sk_write_queue; - __s32 sk_peek_off; - int sk_write_pending; - __u32 sk_dst_pending_confirm; + u32 sk_dst_pending_confirm; u32 sk_pacing_status; /* see enum sk_pacing */ - long sk_sndtimeo; + struct page_frag sk_frag; struct timer_list sk_timer; - __u32 sk_priority; - __u32 sk_mark; + unsigned long sk_pacing_rate; /* bytes per second */ + atomic_t sk_zckey; + atomic_t sk_tskey; + __cacheline_group_end(sock_write_tx); + + __cacheline_group_begin(sock_read_tx); unsigned long sk_max_pacing_rate; - struct page_frag sk_frag; + long sk_sndtimeo; + u32 sk_priority; + u32 sk_mark; + struct dst_entry __rcu *sk_dst_cache; netdev_features_t sk_route_caps; - int sk_gso_type; +#ifdef CONFIG_SOCK_VALIDATE_XMIT + struct sk_buff* (*sk_validate_xmit_skb)(struct sock *sk, + struct net_device *dev, + struct sk_buff *skb); +#endif + u16 sk_gso_type; + u16 sk_gso_max_segs; unsigned int sk_gso_max_size; gfp_t sk_allocation; - __u32 sk_txhash; + u32 sk_txhash; + u8 sk_pacing_shift; + bool sk_use_task_frag; + __cacheline_group_end(sock_read_tx); /* * Because of non atomicity rules, all @@ -479,64 +497,44 @@ struct sock { u8 sk_gso_disabled : 1, sk_kern_sock : 1, sk_no_check_tx : 1, - sk_no_check_rx : 1, - sk_userlocks : 4; - u8 sk_pacing_shift; + sk_no_check_rx : 1; + u8 sk_shutdown; u16 sk_type; u16 sk_protocol; - u16 sk_gso_max_segs; unsigned long sk_lingertime; struct proto *sk_prot_creator; rwlock_t sk_callback_lock; - int sk_err, - sk_err_soft; + int sk_err_soft; u32 sk_ack_backlog; u32 sk_max_ack_backlog; kuid_t sk_uid; - u8 sk_txrehash; -#ifdef CONFIG_NET_RX_BUSY_POLL - u8 sk_prefer_busy_poll; - u16 sk_busy_poll_budget; -#endif spinlock_t 
sk_peer_lock; int sk_bind_phc; struct pid *sk_peer_pid; const struct cred *sk_peer_cred; - long sk_rcvtimeo; ktime_t sk_stamp; #if BITS_PER_LONG==32 seqlock_t sk_stamp_seq; #endif - atomic_t sk_tskey; - atomic_t sk_zckey; - u32 sk_tsflags; - u8 sk_shutdown; + int sk_disconnects; + u8 sk_txrehash; u8 sk_clockid; u8 sk_txtime_deadline_mode : 1, sk_txtime_report_errors : 1, sk_txtime_unused : 6; - bool sk_use_task_frag; - struct socket *sk_socket; void *sk_user_data; #ifdef CONFIG_SECURITY void *sk_security; #endif struct sock_cgroup_data sk_cgrp_data; - struct mem_cgroup *sk_memcg; void (*sk_state_change)(struct sock *sk); - void (*sk_data_ready)(struct sock *sk); void (*sk_write_space)(struct sock *sk); void (*sk_error_report)(struct sock *sk); int (*sk_backlog_rcv)(struct sock *sk, struct sk_buff *skb); -#ifdef CONFIG_SOCK_VALIDATE_XMIT - struct sk_buff* (*sk_validate_xmit_skb)(struct sock *sk, - struct net_device *dev, - struct sk_buff *skb); -#endif void (*sk_destruct)(struct sock *sk); struct sock_reuseport __rcu *sk_reuseport_cb; #ifdef CONFIG_BPF_SYSCALL @@ -544,7 +542,6 @@ struct sock { #endif struct rcu_head sk_rcu; netns_tracker ns_tracker; - struct hlist_node sk_bind2_node; }; enum sk_pacing { @@ -873,16 +870,6 @@ static inline void sk_add_bind_node(struct sock *sk, hlist_add_head(&sk->sk_bind_node, list); } -static inline void __sk_del_bind2_node(struct sock *sk) -{ - __hlist_del(&sk->sk_bind2_node); -} - -static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list) -{ - hlist_add_head(&sk->sk_bind2_node, list); -} - #define sk_for_each(__sk, list) \ hlist_for_each_entry(__sk, list, sk_node) #define sk_for_each_rcu(__sk, list) \ @@ -900,8 +887,6 @@ static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list) hlist_for_each_entry_safe(__sk, tmp, list, sk_node) #define sk_for_each_bound(__sk, list) \ hlist_for_each_entry(__sk, list, sk_bind_node) -#define sk_for_each_bound_bhash2(__sk, list) \ - hlist_for_each_entry(__sk, 
list, sk_bind2_node) /** * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset @@ -1132,41 +1117,6 @@ static inline void sk_incoming_cpu_update(struct sock *sk) WRITE_ONCE(sk->sk_incoming_cpu, cpu); } -static inline void sock_rps_record_flow_hash(__u32 hash) -{ -#ifdef CONFIG_RPS - struct rps_sock_flow_table *sock_flow_table; - - rcu_read_lock(); - sock_flow_table = rcu_dereference(rps_sock_flow_table); - rps_record_sock_flow(sock_flow_table, hash); - rcu_read_unlock(); -#endif -} - -static inline void sock_rps_record_flow(const struct sock *sk) -{ -#ifdef CONFIG_RPS - if (static_branch_unlikely(&rfs_needed)) { - /* Reading sk->sk_rxhash might incur an expensive cache line - * miss. - * - * TCP_ESTABLISHED does cover almost all states where RFS - * might be useful, and is cheaper [1] than testing : - * IPv4: inet_sk(sk)->inet_daddr - * IPv6: ipv6_addr_any(&sk->sk_v6_daddr) - * OR an additional socket flag - * [1] : sk_state and sk_prot are in the same cache line. - */ - if (sk->sk_state == TCP_ESTABLISHED) { - /* This READ_ONCE() is paired with the WRITE_ONCE() - * from sock_rps_save_rxhash() and sock_rps_reset_rxhash(). 
- */ - sock_rps_record_flow_hash(READ_ONCE(sk->sk_rxhash)); - } - } -#endif -} static inline void sock_rps_save_rxhash(struct sock *sk, const struct sk_buff *skb) @@ -1458,6 +1408,7 @@ sk_memory_allocated(const struct sock *sk) /* 1 MB per cpu, in page units */ #define SK_MEMORY_PCPU_RESERVE (1 << (20 - PAGE_SHIFT)) +extern int sysctl_mem_pcpu_rsv; static inline void sk_memory_allocated_add(struct sock *sk, int amt) @@ -1466,7 +1417,7 @@ sk_memory_allocated_add(struct sock *sk, int amt) preempt_disable(); local_reserve = __this_cpu_add_return(*sk->sk_prot->per_cpu_fw_alloc, amt); - if (local_reserve >= SK_MEMORY_PCPU_RESERVE) { + if (local_reserve >= READ_ONCE(sysctl_mem_pcpu_rsv)) { __this_cpu_sub(*sk->sk_prot->per_cpu_fw_alloc, local_reserve); atomic_long_add(local_reserve, sk->sk_prot->memory_allocated); } @@ -1480,7 +1431,7 @@ sk_memory_allocated_sub(struct sock *sk, int amt) preempt_disable(); local_reserve = __this_cpu_sub_return(*sk->sk_prot->per_cpu_fw_alloc, amt); - if (local_reserve <= -SK_MEMORY_PCPU_RESERVE) { + if (local_reserve <= -READ_ONCE(sysctl_mem_pcpu_rsv)) { __this_cpu_sub(*sk->sk_prot->per_cpu_fw_alloc, local_reserve); atomic_long_add(local_reserve, sk->sk_prot->memory_allocated); } @@ -2794,9 +2745,30 @@ static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags) &skb_shinfo(skb)->tskey); } +static inline bool sk_is_inet(const struct sock *sk) +{ + int family = READ_ONCE(sk->sk_family); + + return family == AF_INET || family == AF_INET6; +} + static inline bool sk_is_tcp(const struct sock *sk) { - return sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP; + return sk_is_inet(sk) && + sk->sk_type == SOCK_STREAM && + sk->sk_protocol == IPPROTO_TCP; +} + +static inline bool sk_is_udp(const struct sock *sk) +{ + return sk_is_inet(sk) && + sk->sk_type == SOCK_DGRAM && + sk->sk_protocol == IPPROTO_UDP; +} + +static inline bool sk_is_stream_unix(const struct sock *sk) +{ + return sk->sk_family == AF_UNIX && sk->sk_type == 
SOCK_STREAM; } /** @@ -2838,31 +2810,6 @@ sk_is_refcounted(struct sock *sk) return !sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE); } -/** - * skb_steal_sock - steal a socket from an sk_buff - * @skb: sk_buff to steal the socket from - * @refcounted: is set to true if the socket is reference-counted - * @prefetched: is set to true if the socket was assigned from bpf - */ -static inline struct sock * -skb_steal_sock(struct sk_buff *skb, bool *refcounted, bool *prefetched) -{ - if (skb->sk) { - struct sock *sk = skb->sk; - - *refcounted = true; - *prefetched = skb_sk_is_prefetched(skb); - if (*prefetched) - *refcounted = sk_is_refcounted(sk); - skb->destructor = NULL; - skb->sk = NULL; - return sk; - } - *prefetched = false; - *refcounted = false; - return NULL; -} - /* Checks if this SKB belongs to an HW offloaded socket * and whether any SW fallbacks are required based on dev. * Check decrypted mark in case skb_orphan() cleared socket. @@ -2920,7 +2867,6 @@ extern __u32 sysctl_wmem_max; extern __u32 sysctl_rmem_max; extern int sysctl_tstamp_allow_data; -extern int sysctl_optmem_max; extern __u32 sysctl_wmem_default; extern __u32 sysctl_rmem_default; diff --git a/include/net/switchdev.h b/include/net/switchdev.h index a43062d4c734..8346b0d29542 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -308,6 +308,9 @@ void switchdev_deferred_process(void); int switchdev_port_attr_set(struct net_device *dev, const struct switchdev_attr *attr, struct netlink_ext_ack *extack); +bool switchdev_port_obj_act_is_deferred(struct net_device *dev, + enum switchdev_notifier_type nt, + const struct switchdev_obj *obj); int switchdev_port_obj_add(struct net_device *dev, const struct switchdev_obj *obj, struct netlink_ext_ack *extack); diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h index 8a6dbfb23336..77f87c622a2e 100644 --- a/include/net/tc_act/tc_ct.h +++ b/include/net/tc_act/tc_ct.h @@ -58,6 +58,11 @@ static inline struct nf_flowtable 
*tcf_ct_ft(const struct tc_action *a) return to_ct_params(a)->nf_ft; } +static inline struct nf_conntrack_helper *tcf_ct_helper(const struct tc_action *a) +{ + return to_ct_params(a)->helper; +} + #else static inline uint16_t tcf_ct_zone(const struct tc_action *a) { return 0; } static inline int tcf_ct_action(const struct tc_action *a) { return 0; } @@ -65,6 +70,10 @@ static inline struct nf_flowtable *tcf_ct_ft(const struct tc_action *a) { return NULL; } +static inline struct nf_conntrack_helper *tcf_ct_helper(const struct tc_action *a) +{ + return NULL; +} #endif /* CONFIG_NF_CONNTRACK */ #if IS_ENABLED(CONFIG_NET_ACT_CT) diff --git a/include/net/tc_act/tc_ipt.h b/include/net/tc_act/tc_ipt.h deleted file mode 100644 index 4225fcb1c6ba..000000000000 --- a/include/net/tc_act/tc_ipt.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __NET_TC_IPT_H -#define __NET_TC_IPT_H - -#include <net/act_api.h> - -struct xt_entry_target; - -struct tcf_ipt { - struct tc_action common; - u32 tcfi_hook; - char *tcfi_tname; - struct xt_entry_target *tcfi_t; -}; -#define to_ipt(a) ((struct tcf_ipt *)a) - -#endif /* __NET_TC_IPT_H */ diff --git a/include/net/tc_act/tc_mirred.h b/include/net/tc_act/tc_mirred.h index 32ce8ea36950..75722d967bf2 100644 --- a/include/net/tc_act/tc_mirred.h +++ b/include/net/tc_act/tc_mirred.h @@ -8,6 +8,7 @@ struct tcf_mirred { struct tc_action common; int tcfm_eaction; + u32 tcfm_blockid; bool tcfm_mac_header_xmit; struct net_device __rcu *tcfm_dev; netdevice_tracker tcfm_dev_tracker; diff --git a/include/net/tc_wrapper.h b/include/net/tc_wrapper.h index a6d481b5bcbc..a608546bcefc 100644 --- a/include/net/tc_wrapper.h +++ b/include/net/tc_wrapper.h @@ -117,10 +117,6 @@ static inline int tc_act(struct sk_buff *skb, const struct tc_action *a, if (a->ops->act == tcf_ife_act) return tcf_ife_act(skb, a, res); #endif -#if IS_BUILTIN(CONFIG_NET_ACT_IPT) - if (a->ops->act == tcf_ipt_act) - return tcf_ipt_act(skb, a, res); -#endif 
#if IS_BUILTIN(CONFIG_NET_ACT_SIMP) if (a->ops->act == tcf_simp_act) return tcf_simp_act(skb, a, res); diff --git a/include/net/tcp.h b/include/net/tcp.h index d2f0736b76b8..6ae35199d3b3 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -348,7 +348,7 @@ void tcp_wfree(struct sk_buff *skb); void tcp_write_timer_handler(struct sock *sk); void tcp_delack_timer_handler(struct sock *sk); int tcp_ioctl(struct sock *sk, int cmd, int *karg); -int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb); +enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb); void tcp_rcv_established(struct sock *sk, struct sk_buff *skb); void tcp_rcv_space_adjust(struct sock *sk); int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); @@ -396,8 +396,8 @@ enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, bool fastopen, bool *lost_race); -int tcp_child_process(struct sock *parent, struct sock *child, - struct sk_buff *skb); +enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child, + struct sk_buff *skb); void tcp_enter_loss(struct sock *sk); void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, int flag); void tcp_clear_retrans(struct tcp_sock *tp); @@ -490,13 +490,30 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb); /* From syncookies.c */ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct dst_entry *dst, u32 tsoff); -int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, - u32 cookie); + struct dst_entry *dst); +int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th); struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb); struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, - const struct tcp_request_sock_ops *af_ops, - struct 
sock *sk, struct sk_buff *skb); + struct sock *sk, struct sk_buff *skb, + struct tcp_options_received *tcp_opt, + int mss, u32 tsoff); + +#if IS_ENABLED(CONFIG_BPF) +struct bpf_tcp_req_attrs { + u32 rcv_tsval; + u32 rcv_tsecr; + u16 mss; + u8 rcv_wscale; + u8 snd_wscale; + u8 ecn_ok; + u8 wscale_ok; + u8 sack_ok; + u8 tstamp_ok; + u8 usec_ts_ok; + u8 reserved[3]; +}; +#endif + #ifdef CONFIG_SYN_COOKIES /* Syncookies use a monotonic timer which increments every 60 seconds. @@ -576,18 +593,50 @@ static inline u32 tcp_cookie_time(void) return val; } +/* Convert one nsec 64bit timestamp to ts (ms or usec resolution) */ +static inline u64 tcp_ns_to_ts(bool usec_ts, u64 val) +{ + if (usec_ts) + return div_u64(val, NSEC_PER_USEC); + + return div_u64(val, NSEC_PER_MSEC); +} + u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, u16 *mssp); __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss); u64 cookie_init_timestamp(struct request_sock *req, u64 now); bool cookie_timestamp_decode(const struct net *net, struct tcp_options_received *opt); -bool cookie_ecn_ok(const struct tcp_options_received *opt, - const struct net *net, const struct dst_entry *dst); + +static inline bool cookie_ecn_ok(const struct net *net, const struct dst_entry *dst) +{ + return READ_ONCE(net->ipv4.sysctl_tcp_ecn) || + dst_feature(dst, RTAX_FEATURE_ECN); +} + +#if IS_ENABLED(CONFIG_BPF) +static inline bool cookie_bpf_ok(struct sk_buff *skb) +{ + return skb->sk; +} + +struct request_sock *cookie_bpf_check(struct sock *sk, struct sk_buff *skb); +#else +static inline bool cookie_bpf_ok(struct sk_buff *skb) +{ + return false; +} + +static inline struct request_sock *cookie_bpf_check(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + return NULL; +} +#endif /* From net/ipv6/syncookies.c */ -int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th, - u32 cookie); +int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th); 
struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb); u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph, @@ -1514,17 +1563,22 @@ static inline int tcp_full_space(const struct sock *sk) return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf)); } -static inline void tcp_adjust_rcv_ssthresh(struct sock *sk) +static inline void __tcp_adjust_rcv_ssthresh(struct sock *sk, u32 new_ssthresh) { int unused_mem = sk_unused_reserved_mem(sk); struct tcp_sock *tp = tcp_sk(sk); - tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); + tp->rcv_ssthresh = min(tp->rcv_ssthresh, new_ssthresh); if (unused_mem) tp->rcv_ssthresh = max_t(u32, tp->rcv_ssthresh, tcp_win_from_space(sk, unused_mem)); } +static inline void tcp_adjust_rcv_ssthresh(struct sock *sk) +{ + __tcp_adjust_rcv_ssthresh(sk, 4U * tcp_sk(sk)->advmss); +} + void tcp_cleanup_rbuf(struct sock *sk, int copied); void __tcp_cleanup_rbuf(struct sock *sk, int copied); @@ -1783,8 +1837,6 @@ struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, const struct sock *addr_sk); #ifdef CONFIG_TCP_MD5SIG -#include <linux/jump_label.h> -extern struct static_key_false_deferred tcp_md5_needed; struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, int family, bool any_l3index); @@ -2499,7 +2551,7 @@ struct tcp_ulp_ops { /* cleanup ulp */ void (*release)(struct sock *sk); /* diagnostic */ - int (*get_info)(const struct sock *sk, struct sk_buff *skb); + int (*get_info)(struct sock *sk, struct sk_buff *skb); size_t (*get_info_size)(const struct sock *sk); /* clone ulp */ void (*clone)(const struct request_sock *req, struct sock *newsk, diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h index b56be10838f0..471e177362b4 100644 --- a/include/net/tcp_ao.h +++ b/include/net/tcp_ao.h @@ -62,11 +62,17 @@ static inline int tcp_ao_maclen(const struct tcp_ao_key *key) return key->maclen; } +/* Use tcp_ao_len_aligned() for TCP header calculations */ static 
inline int tcp_ao_len(const struct tcp_ao_key *key) { return tcp_ao_maclen(key) + sizeof(struct tcp_ao_hdr); } +static inline int tcp_ao_len_aligned(const struct tcp_ao_key *key) +{ + return round_up(tcp_ao_len(key), 4); +} + static inline unsigned int tcp_ao_digest_size(struct tcp_ao_key *key) { return key->digest_size; @@ -121,12 +127,35 @@ struct tcp_ao_info { struct rcu_head rcu; }; +#ifdef CONFIG_TCP_MD5SIG +#include <linux/jump_label.h> +extern struct static_key_false_deferred tcp_md5_needed; +#define static_branch_tcp_md5() static_branch_unlikely(&tcp_md5_needed.key) +#else +#define static_branch_tcp_md5() false +#endif +#ifdef CONFIG_TCP_AO +/* TCP-AO structures and functions */ +#include <linux/jump_label.h> +extern struct static_key_false_deferred tcp_ao_needed; +#define static_branch_tcp_ao() static_branch_unlikely(&tcp_ao_needed.key) +#else +#define static_branch_tcp_ao() false +#endif + +static inline bool tcp_hash_should_produce_warnings(void) +{ + return static_branch_tcp_md5() || static_branch_tcp_ao(); +} + #define tcp_hash_fail(msg, family, skb, fmt, ...) 
\ do { \ const struct tcphdr *th = tcp_hdr(skb); \ char hdr_flags[6]; \ char *f = hdr_flags; \ \ + if (!tcp_hash_should_produce_warnings()) \ + break; \ if (th->fin) \ *f++ = 'F'; \ if (th->syn) \ @@ -153,9 +182,6 @@ do { \ #ifdef CONFIG_TCP_AO /* TCP-AO structures and functions */ -#include <linux/jump_label.h> -extern struct static_key_false_deferred tcp_ao_needed; - struct tcp4_ao_context { __be32 saddr; __be32 daddr; @@ -265,8 +291,7 @@ void tcp_ao_established(struct sock *sk); void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb); void tcp_ao_connect_init(struct sock *sk); void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb, - struct tcp_request_sock *treq, - unsigned short int family, int l3index); + struct request_sock *req, unsigned short int family); #else /* CONFIG_TCP_AO */ static inline int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb, @@ -277,8 +302,7 @@ static inline int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb, } static inline void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb, - struct tcp_request_sock *treq, - unsigned short int family, int l3index) + struct request_sock *req, unsigned short int family) { } diff --git a/include/net/tcp_states.h b/include/net/tcp_states.h index cc00118acca1..d60e8148ff4c 100644 --- a/include/net/tcp_states.h +++ b/include/net/tcp_states.h @@ -22,6 +22,7 @@ enum { TCP_LISTEN, TCP_CLOSING, /* Now a valid state */ TCP_NEW_SYN_RECV, + TCP_BOUND_INACTIVE, /* Pseudo-state for inet_diag */ TCP_MAX_STATES /* Leave at the end! 
*/ }; @@ -43,6 +44,7 @@ enum { TCPF_LISTEN = (1 << TCP_LISTEN), TCPF_CLOSING = (1 << TCP_CLOSING), TCPF_NEW_SYN_RECV = (1 << TCP_NEW_SYN_RECV), + TCPF_BOUND_INACTIVE = (1 << TCP_BOUND_INACTIVE), }; #endif /* _LINUX_TCP_STATES_H */ diff --git a/include/net/tls.h b/include/net/tls.h index 962f0c501111..340ad43971e4 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -97,9 +97,6 @@ struct tls_sw_context_tx { struct tls_rec *open_rec; struct list_head tx_list; atomic_t encrypt_pending; - /* protect crypto_wait with encrypt_pending */ - spinlock_t encrypt_compl_lock; - int async_notify; u8 async_capable:1; #define BIT_TX_SCHEDULED 0 @@ -136,8 +133,6 @@ struct tls_sw_context_rx { struct tls_strparser strp; atomic_t decrypt_pending; - /* protect crypto_wait with decrypt_pending*/ - spinlock_t decrypt_compl_lock; struct sk_buff_head async_hold; struct wait_queue_head wq; }; diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 6a9f8a5f387c..33ba6fc151cf 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -210,22 +210,23 @@ struct vxlan_rdst { }; struct vxlan_config { - union vxlan_addr remote_ip; - union vxlan_addr saddr; - __be32 vni; - int remote_ifindex; - int mtu; - __be16 dst_port; - u16 port_min; - u16 port_max; - u8 tos; - u8 ttl; - __be32 label; - u32 flags; - unsigned long age_interval; - unsigned int addrmax; - bool no_share; - enum ifla_vxlan_df df; + union vxlan_addr remote_ip; + union vxlan_addr saddr; + __be32 vni; + int remote_ifindex; + int mtu; + __be16 dst_port; + u16 port_min; + u16 port_max; + u8 tos; + u8 ttl; + __be32 label; + enum ifla_vxlan_label_policy label_policy; + u32 flags; + unsigned long age_interval; + unsigned int addrmax; + bool no_share; + enum ifla_vxlan_df df; }; enum { diff --git a/include/net/xdp.h b/include/net/xdp.h index 349c36fb5fd8..e6770dd40c91 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -16,7 +16,7 @@ * * The XDP RX-queue info (xdp_rxq_info) is associated with the driver * level RX-ring 
queues. It is information that is specific to how - * the driver have configured a given RX-ring queue. + * the driver has configured a given RX-ring queue. * * Each xdp_buff frame received in the driver carries a (pointer) * reference to this xdp_rxq_info structure. This provides the XDP @@ -32,7 +32,7 @@ * The struct is not directly tied to the XDP prog. A new XDP prog * can be attached as long as it doesn't change the underlying * RX-ring. If the RX-ring does change significantly, the NIC driver - * naturally need to stop the RX-ring before purging and reallocating + * naturally needs to stop the RX-ring before purging and reallocating * memory. In that process the driver MUST call unregister (which * also applies for driver shutdown and unload). The register API is * also mandatory during RX-ring setup. @@ -369,7 +369,12 @@ xdp_data_meta_unsupported(const struct xdp_buff *xdp) static inline bool xdp_metalen_invalid(unsigned long metalen) { - return (metalen & (sizeof(__u32) - 1)) || (metalen > 32); + unsigned long meta_max; + + meta_max = type_max(typeof_member(struct skb_shared_info, meta_len)); + BUILD_BUG_ON(!__builtin_constant_p(meta_max)); + + return !IS_ALIGNED(metalen, sizeof(u32)) || metalen > meta_max; } struct xdp_attachment_info { @@ -399,6 +404,10 @@ void xdp_attachment_setup(struct xdp_attachment_info *info, NETDEV_XDP_RX_METADATA_HASH, \ bpf_xdp_metadata_rx_hash, \ xmo_rx_hash) \ + XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_VLAN_TAG, \ + NETDEV_XDP_RX_METADATA_VLAN_TAG, \ + bpf_xdp_metadata_rx_vlan_tag, \ + xmo_rx_vlan_tag) \ enum xdp_rx_metadata { #define XDP_METADATA_KFUNC(name, _, __, ___) name, @@ -427,6 +436,7 @@ enum xdp_rss_hash_type { XDP_RSS_L4_UDP = BIT(5), XDP_RSS_L4_SCTP = BIT(6), XDP_RSS_L4_IPSEC = BIT(7), /* L4 based hash include IPSEC SPI */ + XDP_RSS_L4_ICMP = BIT(8), /* Second part: RSS hash type combinations used for driver HW mapping */ XDP_RSS_TYPE_NONE = 0, @@ -442,11 +452,13 @@ enum xdp_rss_hash_type { XDP_RSS_TYPE_L4_IPV4_UDP 
= XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_UDP, XDP_RSS_TYPE_L4_IPV4_SCTP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_SCTP, XDP_RSS_TYPE_L4_IPV4_IPSEC = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC, + XDP_RSS_TYPE_L4_IPV4_ICMP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_ICMP, XDP_RSS_TYPE_L4_IPV6_TCP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_TCP, XDP_RSS_TYPE_L4_IPV6_UDP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_UDP, XDP_RSS_TYPE_L4_IPV6_SCTP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_SCTP, XDP_RSS_TYPE_L4_IPV6_IPSEC = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC, + XDP_RSS_TYPE_L4_IPV6_ICMP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_ICMP, XDP_RSS_TYPE_L4_IPV6_TCP_EX = XDP_RSS_TYPE_L4_IPV6_TCP | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L4_IPV6_UDP_EX = XDP_RSS_TYPE_L4_IPV6_UDP | XDP_RSS_L3_DYNHDR, @@ -457,6 +469,8 @@ struct xdp_metadata_ops { int (*xmo_rx_timestamp)(const struct xdp_md *ctx, u64 *timestamp); int (*xmo_rx_hash)(const struct xdp_md *ctx, u32 *hash, enum xdp_rss_hash_type *rss_type); + int (*xmo_rx_vlan_tag)(const struct xdp_md *ctx, __be16 *vlan_proto, + u16 *vlan_tci); }; #ifdef CONFIG_NET diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index f83128007fb0..3cb4dc9bd70e 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -30,6 +30,7 @@ struct xdp_umem { struct user_struct *user; refcount_t users; u8 flags; + u8 tx_metadata_len; bool zc; struct page **pgs; int id; @@ -92,12 +93,105 @@ struct xdp_sock { struct xsk_queue *cq_tmp; /* Only as tmp storage before bind */ }; +/* + * AF_XDP TX metadata hooks for network devices. + * The following hooks can be defined; unless noted otherwise, they are + * optional and can be filled with a null pointer. + * + * void (*tmo_request_timestamp)(void *priv) + * Called when AF_XDP frame requested egress timestamp. + * + * u64 (*tmo_fill_timestamp)(void *priv) + * Called when AF_XDP frame, that had requested egress timestamp, + * received a completion. 
The hook needs to return the actual HW timestamp. + * + * void (*tmo_request_checksum)(u16 csum_start, u16 csum_offset, void *priv) + * Called when AF_XDP frame requested HW checksum offload. csum_start + * indicates position where checksumming should start. + * csum_offset indicates position where checksum should be stored. + * + */ +struct xsk_tx_metadata_ops { + void (*tmo_request_timestamp)(void *priv); + u64 (*tmo_fill_timestamp)(void *priv); + void (*tmo_request_checksum)(u16 csum_start, u16 csum_offset, void *priv); +}; + #ifdef CONFIG_XDP_SOCKETS int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp); void __xsk_map_flush(void); +/** + * xsk_tx_metadata_to_compl - Save enough relevant metadata information + * to perform tx completion in the future. + * @meta: pointer to AF_XDP metadata area + * @compl: pointer to output struct xsk_tx_metadata_compl + * + * This function should be called by the networking device when + * it prepares AF_XDP egress packet. The value of @compl should be stored + * and passed to xsk_tx_metadata_complete upon TX completion. + */ +static inline void xsk_tx_metadata_to_compl(struct xsk_tx_metadata *meta, + struct xsk_tx_metadata_compl *compl) +{ + if (!meta) + return; + + if (meta->flags & XDP_TXMD_FLAGS_TIMESTAMP) + compl->tx_timestamp = &meta->completion.tx_timestamp; + else + compl->tx_timestamp = NULL; +} + +/** + * xsk_tx_metadata_request - Evaluate AF_XDP TX metadata at submission + * and call appropriate xsk_tx_metadata_ops operation. + * @meta: pointer to AF_XDP metadata area + * @ops: pointer to struct xsk_tx_metadata_ops + * @priv: pointer to driver-private area + * + * This function should be called by the networking device when + * it prepares AF_XDP egress packet.
+ */ +static inline void xsk_tx_metadata_request(const struct xsk_tx_metadata *meta, + const struct xsk_tx_metadata_ops *ops, + void *priv) +{ + if (!meta) + return; + + if (ops->tmo_request_timestamp) + if (meta->flags & XDP_TXMD_FLAGS_TIMESTAMP) + ops->tmo_request_timestamp(priv); + + if (ops->tmo_request_checksum) + if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) + ops->tmo_request_checksum(meta->request.csum_start, + meta->request.csum_offset, priv); +} + +/** + * xsk_tx_metadata_complete - Evaluate AF_XDP TX metadata at completion + * and call appropriate xsk_tx_metadata_ops operation. + * @compl: pointer to completion metadata produced from xsk_tx_metadata_to_compl + * @ops: pointer to struct xsk_tx_metadata_ops + * @priv: pointer to driver-private area + * + * This function should be called by the networking device upon + * AF_XDP egress completion. No-op if no TX timestamp was requested. + */ +static inline void xsk_tx_metadata_complete(struct xsk_tx_metadata_compl *compl, + const struct xsk_tx_metadata_ops *ops, + void *priv) +{ + if (!compl || !compl->tx_timestamp) + return; + + *compl->tx_timestamp = ops->tmo_fill_timestamp(priv); +} + #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) @@ -114,6 +208,23 @@ static inline void __xsk_map_flush(void) { } +static inline void xsk_tx_metadata_to_compl(struct xsk_tx_metadata *meta, + struct xsk_tx_metadata_compl *compl) +{ +} + +static inline void xsk_tx_metadata_request(const struct xsk_tx_metadata *meta, + const struct xsk_tx_metadata_ops *ops, + void *priv) +{ +} + +static inline void xsk_tx_metadata_complete(struct xsk_tx_metadata_compl *compl, + const struct xsk_tx_metadata_ops *ops, + void *priv) +{ +} + #endif /* CONFIG_XDP_SOCKETS */ #if defined(CONFIG_XDP_SOCKETS) && defined(CONFIG_DEBUG_NET) diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 1f6fc8c7a84c..c9aec9ab6191 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -12,6 +12,12 @@ #define XDP_UMEM_MIN_CHUNK_SHIFT 11 #define
XDP_UMEM_MIN_CHUNK_SIZE (1 << XDP_UMEM_MIN_CHUNK_SHIFT) +struct xsk_cb_desc { + void *src; + u8 off; + u8 bytes; +}; + #ifdef CONFIG_XDP_SOCKETS void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries); @@ -47,6 +53,12 @@ static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool, xp_set_rxq_info(pool, rxq); } +static inline void xsk_pool_fill_cb(struct xsk_buff_pool *pool, + struct xsk_cb_desc *desc) +{ + xp_fill_cb(pool, desc); +} + static inline unsigned int xsk_pool_get_napi_id(struct xsk_buff_pool *pool) { #ifdef CONFIG_NET_RX_BUSY_POLL @@ -147,11 +159,29 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) return ret; } +static inline void xsk_buff_del_tail(struct xdp_buff *tail) +{ + struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp); + + list_del(&xskb->xskb_list_node); +} + +static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) +{ + struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); + struct xdp_buff_xsk *frag; + + frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk, + xskb_list_node); + return &frag->xdp; +} + static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) { xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM; xdp->data_meta = xdp->data; xdp->data_end = xdp->data + size; + xdp->flags = 0; } static inline dma_addr_t xsk_buff_raw_get_dma(struct xsk_buff_pool *pool, @@ -165,6 +195,30 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) return xp_raw_get_data(pool, addr); } +#define XDP_TXMD_FLAGS_VALID ( \ + XDP_TXMD_FLAGS_TIMESTAMP | \ + XDP_TXMD_FLAGS_CHECKSUM | \ + 0) + +static inline bool xsk_buff_valid_tx_metadata(struct xsk_tx_metadata *meta) +{ + return !(meta->flags & ~XDP_TXMD_FLAGS_VALID); +} + +static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) +{ + struct xsk_tx_metadata *meta; + + if (!pool->tx_metadata_len) + return NULL; + + 
meta = xp_raw_get_data(pool, addr) - pool->tx_metadata_len; + if (unlikely(!xsk_buff_valid_tx_metadata(meta))) + return NULL; /* no way to signal the error to the user */ + + return meta; +} + static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); @@ -250,6 +304,11 @@ static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool, { } +static inline void xsk_pool_fill_cb(struct xsk_buff_pool *pool, + struct xsk_cb_desc *desc) +{ +} + static inline unsigned int xsk_pool_get_napi_id(struct xsk_buff_pool *pool) { return 0; @@ -309,6 +368,15 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) return NULL; } +static inline void xsk_buff_del_tail(struct xdp_buff *tail) +{ +} + +static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) +{ + return NULL; +} + static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) { } @@ -324,6 +392,16 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) return NULL; } +static inline bool xsk_buff_valid_tx_metadata(struct xsk_tx_metadata *meta) +{ + return false; +} + +static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) +{ + return NULL; +} + static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool) { } diff --git a/include/net/xfrm.h b/include/net/xfrm.h index c9bb0f892f55..57c743b7e4fe 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -51,8 +51,10 @@ #ifdef CONFIG_XFRM_STATISTICS #define XFRM_INC_STATS(net, field) SNMP_INC_STATS((net)->mib.xfrm_statistics, field) +#define XFRM_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.xfrm_statistics, field, val) #else #define XFRM_INC_STATS(net, field) ((void)(net)) +#define XFRM_ADD_STATS(net, field, val) ((void)(net)) #endif @@ -1577,22 +1579,20 @@ struct xfrm_state *xfrm_stateonly_find(struct net 
*net, u32 mark, u32 if_id, struct xfrm_state *xfrm_state_lookup_byspi(struct net *net, __be32 spi, unsigned short family); int xfrm_state_check_expire(struct xfrm_state *x); +void xfrm_state_update_stats(struct net *net); #ifdef CONFIG_XFRM_OFFLOAD -static inline void xfrm_dev_state_update_curlft(struct xfrm_state *x) +static inline void xfrm_dev_state_update_stats(struct xfrm_state *x) { struct xfrm_dev_offload *xdo = &x->xso; struct net_device *dev = xdo->dev; - if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) - return; - if (dev && dev->xfrmdev_ops && - dev->xfrmdev_ops->xdo_dev_state_update_curlft) - dev->xfrmdev_ops->xdo_dev_state_update_curlft(x); + dev->xfrmdev_ops->xdo_dev_state_update_stats) + dev->xfrmdev_ops->xdo_dev_state_update_stats(x); } #else -static inline void xfrm_dev_state_update_curlft(struct xfrm_state *x) {} +static inline void xfrm_dev_state_update_stats(struct xfrm_state *x) {} #endif void xfrm_state_insert(struct xfrm_state *x); int xfrm_state_add(struct xfrm_state *x); @@ -2190,4 +2190,13 @@ static inline int register_xfrm_interface_bpf(void) #endif +#if IS_ENABLED(CONFIG_DEBUG_INFO_BTF) +int register_xfrm_state_bpf(void); +#else +static inline int register_xfrm_state_bpf(void) +{ + return 0; +} +#endif + #endif /* _NET_XFRM_H */ diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index b0bdff26fc88..99dd7376df6a 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -12,6 +12,7 @@ struct xsk_buff_pool; struct xdp_rxq_info; +struct xsk_cb_desc; struct xsk_queue; struct xdp_desc; struct xdp_umem; @@ -33,6 +34,7 @@ struct xdp_buff_xsk { }; #define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb)) +#define XSK_TX_COMPL_FITS(t) BUILD_BUG_ON(sizeof(struct xsk_tx_metadata_compl) > sizeof(t)) struct xsk_dma_map { dma_addr_t *dma_pages; @@ -77,10 +79,12 @@ struct xsk_buff_pool { u32 chunk_size; u32 chunk_shift; u32 frame_len; + u8 tx_metadata_len; /* inherited from umem */ 
u8 cached_need_wakeup; bool uses_need_wakeup; bool dma_need_sync; bool unaligned; + bool tx_sw_csum; void *addrs; /* Mutual exclusion of the completion ring in the SKB mode. Two cases to protect: * NAPI TX thread and sendmsg error paths in the SKB destructor callback and when @@ -132,6 +136,7 @@ static inline void xp_init_xskb_dma(struct xdp_buff_xsk *xskb, struct xsk_buff_p /* AF_XDP ZC drivers, via xdp_sock_buff.h */ void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq); +void xp_fill_cb(struct xsk_buff_pool *pool, struct xsk_cb_desc *desc); int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, unsigned long attrs, struct page **pages, u32 nr_pages); void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs); @@ -233,4 +238,9 @@ static inline u64 xp_get_handle(struct xdp_buff_xsk *xskb) return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); } +static inline bool xp_tx_metadata_enabled(const struct xsk_buff_pool *pool) +{ + return pool->tx_metadata_len > 0; +} + #endif /* XSK_BUFF_POOL_H_ */ |