summaryrefslogtreecommitdiff
path: root/include/net
diff options
context:
space:
mode:
Diffstat (limited to 'include/net')
-rw-r--r--include/net/act_api.h25
-rw-r--r--include/net/af_unix.h2
-rw-r--r--include/net/aligned_data.h22
-rw-r--r--include/net/bluetooth/bluetooth.h11
-rw-r--r--include/net/bluetooth/hci.h10
-rw-r--r--include/net/bluetooth/hci_core.h41
-rw-r--r--include/net/bond_options.h1
-rw-r--r--include/net/bonding.h3
-rw-r--r--include/net/cfg80211.h221
-rw-r--r--include/net/devlink.h18
-rw-r--r--include/net/dropreason-core.h39
-rw-r--r--include/net/dsa.h2
-rw-r--r--include/net/dst.h38
-rw-r--r--include/net/gro.h6
-rw-r--r--include/net/inet6_hashtables.h2
-rw-r--r--include/net/inet_hashtables.h8
-rw-r--r--include/net/ip.h15
-rw-r--r--include/net/ip6_route.h4
-rw-r--r--include/net/ip6_tunnel.h5
-rw-r--r--include/net/ip_tunnels.h2
-rw-r--r--include/net/libeth/rx.h28
-rw-r--r--include/net/libeth/tx.h36
-rw-r--r--include/net/libeth/types.h106
-rw-r--r--include/net/libeth/xdp.h1879
-rw-r--r--include/net/libeth/xsk.h685
-rw-r--r--include/net/lwtunnel.h8
-rw-r--r--include/net/mac80211.h69
-rw-r--r--include/net/mana/gdma.h27
-rw-r--r--include/net/mana/mana.h173
-rw-r--r--include/net/mctp.h57
-rw-r--r--include/net/ndisc.h9
-rw-r--r--include/net/neighbour.h22
-rw-r--r--include/net/netdev_queues.h9
-rw-r--r--include/net/netfilter/ipv4/nf_conntrack_ipv4.h3
-rw-r--r--include/net/netfilter/nf_conntrack.h2
-rw-r--r--include/net/netfilter/nf_conntrack_l4proto.h13
-rw-r--r--include/net/netfilter/nf_log.h3
-rw-r--r--include/net/netfilter/nf_reject.h1
-rw-r--r--include/net/netfilter/nf_tables.h19
-rw-r--r--include/net/netfilter/nf_tables_core.h50
-rw-r--r--include/net/netlink.h14
-rw-r--r--include/net/netmem.h181
-rw-r--r--include/net/netns/conntrack.h13
-rw-r--r--include/net/netns/mctp.h20
-rw-r--r--include/net/page_pool/helpers.h14
-rw-r--r--include/net/pfcp.h2
-rw-r--r--include/net/request_sock.h4
-rw-r--r--include/net/route.h6
-rw-r--r--include/net/sctp/structs.h2
-rw-r--r--include/net/sock.h23
-rw-r--r--include/net/tc_act/tc_connmark.h1
-rw-r--r--include/net/tc_act/tc_csum.h10
-rw-r--r--include/net/tc_act/tc_ct.h11
-rw-r--r--include/net/tc_act/tc_ctinfo.h7
-rw-r--r--include/net/tc_act/tc_gate.h9
-rw-r--r--include/net/tc_act/tc_mpls.h10
-rw-r--r--include/net/tc_act/tc_nat.h1
-rw-r--r--include/net/tc_act/tc_pedit.h1
-rw-r--r--include/net/tc_act/tc_police.h12
-rw-r--r--include/net/tc_act/tc_sample.h9
-rw-r--r--include/net/tc_act/tc_skbedit.h1
-rw-r--r--include/net/tc_act/tc_vlan.h9
-rw-r--r--include/net/tcp.h11
-rw-r--r--include/net/tcx.h1
-rw-r--r--include/net/udp.h1
-rw-r--r--include/net/udp_tunnel.h103
-rw-r--r--include/net/vxlan.h5
-rw-r--r--include/net/x25.h1
-rw-r--r--include/net/xdp_sock.h1
69 files changed, 3761 insertions, 396 deletions
diff --git a/include/net/act_api.h b/include/net/act_api.h
index 404df8557f6a..2894cfff2da3 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -76,19 +76,24 @@ static inline void tcf_lastuse_update(struct tcf_t *tm)
{
unsigned long now = jiffies;
- if (tm->lastuse != now)
- tm->lastuse = now;
- if (unlikely(!tm->firstuse))
- tm->firstuse = now;
+ if (READ_ONCE(tm->lastuse) != now)
+ WRITE_ONCE(tm->lastuse, now);
+ if (unlikely(!READ_ONCE(tm->firstuse)))
+ WRITE_ONCE(tm->firstuse, now);
}
static inline void tcf_tm_dump(struct tcf_t *dtm, const struct tcf_t *stm)
{
- dtm->install = jiffies_to_clock_t(jiffies - stm->install);
- dtm->lastuse = jiffies_to_clock_t(jiffies - stm->lastuse);
- dtm->firstuse = stm->firstuse ?
- jiffies_to_clock_t(jiffies - stm->firstuse) : 0;
- dtm->expires = jiffies_to_clock_t(stm->expires);
+ unsigned long firstuse, now = jiffies;
+
+ dtm->install = jiffies_to_clock_t(now - READ_ONCE(stm->install));
+ dtm->lastuse = jiffies_to_clock_t(now - READ_ONCE(stm->lastuse));
+
+ firstuse = READ_ONCE(stm->firstuse);
+ dtm->firstuse = firstuse ?
+ jiffies_to_clock_t(now - firstuse) : 0;
+
+ dtm->expires = jiffies_to_clock_t(READ_ONCE(stm->expires));
}
static inline enum flow_action_hw_stats tc_act_hw_stats(u8 hw_stats)
@@ -170,14 +175,12 @@ static inline void tc_action_net_exit(struct list_head *net_list,
{
struct net *net;
- rtnl_lock();
list_for_each_entry(net, net_list, exit_list) {
struct tc_action_net *tn = net_generic(net, id);
tcf_idrinfo_destroy(tn->ops, tn->idrinfo);
kfree(tn->idrinfo);
}
- rtnl_unlock();
}
int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb,
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 1af1841b7601..34f53dde65ce 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -47,6 +47,8 @@ struct unix_sock {
#define peer_wait peer_wq.wait
wait_queue_entry_t peer_wake;
struct scm_stat scm_stat;
+ int inq_len;
+ bool recvmsg_inq;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
struct sk_buff *oob_skb;
#endif
diff --git a/include/net/aligned_data.h b/include/net/aligned_data.h
new file mode 100644
index 000000000000..e1a1c8aedc79
--- /dev/null
+++ b/include/net/aligned_data.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _NET_ALIGNED_DATA_H
+#define _NET_ALIGNED_DATA_H
+
+#include <linux/atomic.h>
+#include <linux/types.h>
+
+/* Structure holding cacheline aligned fields on SMP builds.
+ * Each field or group should have an ____cacheline_aligned_in_smp
+ * attribute to ensure no accidental false sharing can happen.
+ */
+struct net_aligned_data {
+ atomic64_t net_cookie ____cacheline_aligned_in_smp;
+#if defined(CONFIG_INET)
+ atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp;
+ atomic_long_t udp_memory_allocated ____cacheline_aligned_in_smp;
+#endif
+};
+
+extern struct net_aligned_data net_aligned_data;
+
+#endif /* _NET_ALIGNED_DATA_H */
diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h
index 114299bd8b98..ada5b56a4413 100644
--- a/include/net/bluetooth/bluetooth.h
+++ b/include/net/bluetooth/bluetooth.h
@@ -244,6 +244,12 @@ struct bt_codecs {
#define BT_ISO_BASE 20
+/* Socket option value 21 reserved */
+
+#define BT_PKT_SEQNUM 22
+
+#define BT_SCM_PKT_SEQNUM 0x05
+
__printf(1, 2)
void bt_info(const char *fmt, ...);
__printf(1, 2)
@@ -391,7 +397,8 @@ struct bt_sock {
enum {
BT_SK_DEFER_SETUP,
BT_SK_SUSPEND,
- BT_SK_PKT_STATUS
+ BT_SK_PKT_STATUS,
+ BT_SK_PKT_SEQNUM,
};
struct bt_sock_list {
@@ -475,6 +482,7 @@ struct bt_skb_cb {
u8 pkt_type;
u8 force_active;
u16 expect;
+ u16 pkt_seqnum;
u8 incoming:1;
u8 pkt_status:2;
union {
@@ -488,6 +496,7 @@ struct bt_skb_cb {
#define hci_skb_pkt_type(skb) bt_cb((skb))->pkt_type
#define hci_skb_pkt_status(skb) bt_cb((skb))->pkt_status
+#define hci_skb_pkt_seqnum(skb) bt_cb((skb))->pkt_seqnum
#define hci_skb_expect(skb) bt_cb((skb))->expect
#define hci_skb_opcode(skb) bt_cb((skb))->hci.opcode
#define hci_skb_event(skb) bt_cb((skb))->hci.req_event
diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index c79901f2dc2a..df1847b74e55 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -562,6 +562,7 @@ enum {
#define LE_LINK 0x80
#define CIS_LINK 0x82
#define BIS_LINK 0x83
+#define PA_LINK 0x84
#define INVALID_LINK 0xff
/* LMP features */
@@ -2634,6 +2635,7 @@ struct hci_ev_le_conn_complete {
#define LE_EXT_ADV_DIRECT_IND 0x0004
#define LE_EXT_ADV_SCAN_RSP 0x0008
#define LE_EXT_ADV_LEGACY_PDU 0x0010
+#define LE_EXT_ADV_DATA_STATUS_MASK 0x0060
#define LE_EXT_ADV_EVT_TYPE_MASK 0x007f
#define ADDR_LE_DEV_PUBLIC 0x00
@@ -2837,7 +2839,7 @@ struct hci_evt_le_create_big_complete {
} __packed;
#define HCI_EVT_LE_BIG_SYNC_ESTABLISHED 0x1d
-struct hci_evt_le_big_sync_estabilished {
+struct hci_evt_le_big_sync_established {
__u8 status;
__u8 handle;
__u8 latency[3];
@@ -2851,6 +2853,12 @@ struct hci_evt_le_big_sync_estabilished {
__le16 bis[];
} __packed;
+#define HCI_EVT_LE_BIG_SYNC_LOST 0x1e
+struct hci_evt_le_big_sync_lost {
+ __u8 handle;
+ __u8 reason;
+} __packed;
+
#define HCI_EVT_LE_BIG_INFO_ADV_REPORT 0x22
struct hci_evt_le_big_info_adv_report {
__le16 sync_handle;
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index f79f59e67114..4dc11c66f7b8 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -29,6 +29,7 @@
#include <linux/idr.h>
#include <linux/leds.h>
#include <linux/rculist.h>
+#include <linux/spinlock.h>
#include <linux/srcu.h>
#include <net/bluetooth/hci.h>
@@ -94,6 +95,7 @@ struct discovery_state {
u16 uuid_count;
u8 (*uuids)[16];
unsigned long name_resolve_timeout;
+ spinlock_t lock;
};
#define SUSPEND_NOTIFIER_TIMEOUT msecs_to_jiffies(2000) /* 2 seconds */
@@ -889,6 +891,7 @@ static inline void iso_recv(struct hci_conn *hcon, struct sk_buff *skb,
static inline void discovery_init(struct hci_dev *hdev)
{
+ spin_lock_init(&hdev->discovery.lock);
hdev->discovery.state = DISCOVERY_STOPPED;
INIT_LIST_HEAD(&hdev->discovery.all);
INIT_LIST_HEAD(&hdev->discovery.unknown);
@@ -903,8 +906,11 @@ static inline void hci_discovery_filter_clear(struct hci_dev *hdev)
hdev->discovery.report_invalid_rssi = true;
hdev->discovery.rssi = HCI_RSSI_INVALID;
hdev->discovery.uuid_count = 0;
+
+ spin_lock(&hdev->discovery.lock);
kfree(hdev->discovery.uuids);
hdev->discovery.uuids = NULL;
+ spin_unlock(&hdev->discovery.lock);
}
bool hci_discovery_active(struct hci_dev *hdev);
@@ -1009,6 +1015,7 @@ static inline void hci_conn_hash_add(struct hci_dev *hdev, struct hci_conn *c)
break;
case CIS_LINK:
case BIS_LINK:
+ case PA_LINK:
h->iso_num++;
break;
}
@@ -1036,6 +1043,7 @@ static inline void hci_conn_hash_del(struct hci_dev *hdev, struct hci_conn *c)
break;
case CIS_LINK:
case BIS_LINK:
+ case PA_LINK:
h->iso_num--;
break;
}
@@ -1054,6 +1062,7 @@ static inline unsigned int hci_conn_num(struct hci_dev *hdev, __u8 type)
return h->sco_num;
case CIS_LINK:
case BIS_LINK:
+ case PA_LINK:
return h->iso_num;
default:
return 0;
@@ -1136,7 +1145,7 @@ hci_conn_hash_lookup_create_pa_sync(struct hci_dev *hdev)
rcu_read_lock();
list_for_each_entry_rcu(c, &h->list, list) {
- if (c->type != BIS_LINK)
+ if (c->type != PA_LINK)
continue;
if (!test_bit(HCI_CONN_CREATE_PA_SYNC, &c->flags))
@@ -1331,7 +1340,7 @@ hci_conn_hash_lookup_big_sync_pend(struct hci_dev *hdev,
rcu_read_lock();
list_for_each_entry_rcu(c, &h->list, list) {
- if (c->type != BIS_LINK)
+ if (c->type != PA_LINK)
continue;
if (handle == c->iso_qos.bcast.big && num_bis == c->num_bis) {
@@ -1346,7 +1355,8 @@ hci_conn_hash_lookup_big_sync_pend(struct hci_dev *hdev,
}
static inline struct hci_conn *
-hci_conn_hash_lookup_big_state(struct hci_dev *hdev, __u8 handle, __u16 state)
+hci_conn_hash_lookup_big_state(struct hci_dev *hdev, __u8 handle, __u16 state,
+ __u8 role)
{
struct hci_conn_hash *h = &hdev->conn_hash;
struct hci_conn *c;
@@ -1354,7 +1364,7 @@ hci_conn_hash_lookup_big_state(struct hci_dev *hdev, __u8 handle, __u16 state)
rcu_read_lock();
list_for_each_entry_rcu(c, &h->list, list) {
- if (c->type != BIS_LINK || c->state != state)
+ if (c->type != BIS_LINK || c->state != state || c->role != role)
continue;
if (handle == c->iso_qos.bcast.big) {
@@ -1400,7 +1410,7 @@ hci_conn_hash_lookup_pa_sync_handle(struct hci_dev *hdev, __u16 sync_handle)
rcu_read_lock();
list_for_each_entry_rcu(c, &h->list, list) {
- if (c->type != BIS_LINK)
+ if (c->type != PA_LINK)
continue;
/* Ignore the listen hcon, we are looking
@@ -1420,26 +1430,6 @@ hci_conn_hash_lookup_pa_sync_handle(struct hci_dev *hdev, __u16 sync_handle)
return NULL;
}
-static inline struct hci_conn *hci_conn_hash_lookup_state(struct hci_dev *hdev,
- __u8 type, __u16 state)
-{
- struct hci_conn_hash *h = &hdev->conn_hash;
- struct hci_conn *c;
-
- rcu_read_lock();
-
- list_for_each_entry_rcu(c, &h->list, list) {
- if (c->type == type && c->state == state) {
- rcu_read_unlock();
- return c;
- }
- }
-
- rcu_read_unlock();
-
- return NULL;
-}
-
typedef void (*hci_conn_func_t)(struct hci_conn *conn, void *data);
static inline void hci_conn_hash_list_state(struct hci_dev *hdev,
hci_conn_func_t func, __u8 type,
@@ -2019,6 +2009,7 @@ static inline int hci_proto_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr,
case CIS_LINK:
case BIS_LINK:
+ case PA_LINK:
return iso_connect_ind(hdev, bdaddr, flags);
default:
diff --git a/include/net/bond_options.h b/include/net/bond_options.h
index 18687ccf0638..022b122a9fb6 100644
--- a/include/net/bond_options.h
+++ b/include/net/bond_options.h
@@ -77,6 +77,7 @@ enum {
BOND_OPT_NS_TARGETS,
BOND_OPT_PRIO,
BOND_OPT_COUPLED_CONTROL,
+ BOND_OPT_BROADCAST_NEIGH,
BOND_OPT_LAST
};
diff --git a/include/net/bonding.h b/include/net/bonding.h
index 95f67b308c19..e06f0d63b2c1 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -115,6 +115,8 @@ static inline int is_netpoll_tx_blocked(struct net_device *dev)
#define is_netpoll_tx_blocked(dev) (0)
#endif
+DECLARE_STATIC_KEY_FALSE(bond_bcast_neigh_enabled);
+
struct bond_params {
int mode;
int xmit_policy;
@@ -149,6 +151,7 @@ struct bond_params {
struct in6_addr ns_targets[BOND_MAX_NS_TARGETS];
#endif
int coupled_control;
+ int broadcast_neighbor;
/* 2 bytes of padding : see ether_addr_equal_64bits() */
u8 ad_actor_system[ETH_ALEN + 2];
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 10248d527616..406626ff6cc8 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -560,7 +560,7 @@ struct ieee80211_sta_s1g_cap {
* @vht_cap: VHT capabilities in this band
* @s1g_cap: S1G capabilities in this band
* @edmg_cap: EDMG capabilities in this band
- * @s1g_cap: S1G capabilities in this band (S1B band only, of course)
+ * @s1g_cap: S1G capabilities in this band (S1G band only, of course)
* @n_iftype_data: number of iftype data entries
* @iftype_data: interface type data entries. Note that the bits in
* @types_mask inside this structure cannot overlap (i.e. only
@@ -633,7 +633,7 @@ ieee80211_get_sband_iftype_data(const struct ieee80211_supported_band *sband,
const struct ieee80211_sband_iftype_data *data;
int i;
- if (WARN_ON(iftype >= NL80211_IFTYPE_MAX))
+ if (WARN_ON(iftype >= NUM_NL80211_IFTYPES))
return NULL;
if (iftype == NL80211_IFTYPE_AP_VLAN)
@@ -1424,6 +1424,23 @@ struct cfg80211_unsol_bcast_probe_resp {
};
/**
+ * struct cfg80211_s1g_short_beacon - S1G short beacon data.
+ *
+ * @update: Set to true if the feature configuration should be updated.
+ * @short_head: Short beacon head.
+ * @short_tail: Short beacon tail.
+ * @short_head_len: Short beacon head len.
+ * @short_tail_len: Short beacon tail len.
+ */
+struct cfg80211_s1g_short_beacon {
+ bool update;
+ const u8 *short_head;
+ const u8 *short_tail;
+ size_t short_head_len;
+ size_t short_tail_len;
+};
+
+/**
* struct cfg80211_ap_settings - AP configuration
*
* Used to configure an AP interface.
@@ -1463,6 +1480,8 @@ struct cfg80211_unsol_bcast_probe_resp {
* @fils_discovery: FILS discovery transmission parameters
* @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters
* @mbssid_config: AP settings for multiple bssid
+ * @s1g_long_beacon_period: S1G long beacon period
+ * @s1g_short_beacon: S1G short beacon data
*/
struct cfg80211_ap_settings {
struct cfg80211_chan_def chandef;
@@ -1496,6 +1515,8 @@ struct cfg80211_ap_settings {
struct cfg80211_fils_discovery fils_discovery;
struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp;
struct cfg80211_mbssid_config mbssid_config;
+ u8 s1g_long_beacon_period;
+ struct cfg80211_s1g_short_beacon s1g_short_beacon;
};
@@ -1507,11 +1528,13 @@ struct cfg80211_ap_settings {
* @beacon: beacon data
* @fils_discovery: FILS discovery transmission parameters
* @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters
+ * @s1g_short_beacon: S1G short beacon data
*/
struct cfg80211_ap_update {
struct cfg80211_beacon_data beacon;
struct cfg80211_fils_discovery fils_discovery;
struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp;
+ struct cfg80211_s1g_short_beacon s1g_short_beacon;
};
/**
@@ -1526,6 +1549,7 @@ struct cfg80211_ap_update {
* @n_counter_offsets_beacon: number of csa counters the beacon (tail)
* @n_counter_offsets_presp: number of csa counters in the probe response
* @beacon_after: beacon data to be used on the new channel
+ * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters
* @radar_required: whether radar detection is required on the new channel
* @block_tx: whether transmissions should be blocked while changing
* @count: number of beacons until switch
@@ -1540,6 +1564,7 @@ struct cfg80211_csa_settings {
unsigned int n_counter_offsets_beacon;
unsigned int n_counter_offsets_presp;
struct cfg80211_beacon_data beacon_after;
+ struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp;
bool radar_required;
bool block_tx;
u8 count;
@@ -1555,6 +1580,7 @@ struct cfg80211_csa_settings {
* @counter_offset_beacon: offsets of the counters within the beacon (tail)
* @counter_offset_presp: offsets of the counters within the probe response
* @beacon_next: beacon data to be used after the color change
+ * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters
* @count: number of beacons until the color change
* @color: the color used after the change
* @link_id: defines the link on which color change is expected during MLO.
@@ -1565,6 +1591,7 @@ struct cfg80211_color_change_settings {
u16 counter_offset_beacon;
u16 counter_offset_presp;
struct cfg80211_beacon_data beacon_next;
+ struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp;
u8 count;
u8 color;
u8 link_id;
@@ -1653,6 +1680,7 @@ struct sta_txpwr {
* @he_6ghz_capa: HE 6 GHz Band capabilities of station
* @eht_capa: EHT capabilities of station
* @eht_capa_len: the length of the EHT capabilities
+ * @s1g_capa: S1G capabilities of station
*/
struct link_station_parameters {
const u8 *mld_mac;
@@ -1671,6 +1699,7 @@ struct link_station_parameters {
const struct ieee80211_he_6ghz_capa *he_6ghz_capa;
const struct ieee80211_eht_cap_elem *eht_capa;
u8 eht_capa_len;
+ const struct ieee80211_s1g_cap *s1g_capa;
};
/**
@@ -2018,6 +2047,99 @@ struct cfg80211_tid_stats {
#define IEEE80211_MAX_CHAINS 4
/**
+ * struct link_station_info - link station information
+ *
+ * Link station information filled by driver for get_station() and
+ * dump_station().
+ * @filled: bit flag of flags using the bits of &enum nl80211_sta_info to
+ * indicate the relevant values in this struct for them
+ * @connected_time: time(in secs) since a link of station is last connected
+ * @inactive_time: time since last activity for link station(tx/rx)
+ * in milliseconds
+ * @assoc_at: bootime (ns) of the last association of link of station
+ * @rx_bytes: bytes (size of MPDUs) received from this link of station
+ * @tx_bytes: bytes (size of MPDUs) transmitted to this link of station
+ * @signal: The signal strength, type depends on the wiphy's signal_type.
+ * For CFG80211_SIGNAL_TYPE_MBM, value is expressed in _dBm_.
+ * @signal_avg: Average signal strength, type depends on the wiphy's
+ * signal_type. For CFG80211_SIGNAL_TYPE_MBM, value is expressed in _dBm_
+ * @chains: bitmask for filled values in @chain_signal, @chain_signal_avg
+ * @chain_signal: per-chain signal strength of last received packet in dBm
+ * @chain_signal_avg: per-chain signal strength average in dBm
+ * @txrate: current unicast bitrate from this link of station
+ * @rxrate: current unicast bitrate to this link of station
+ * @rx_packets: packets (MSDUs & MMPDUs) received from this link of station
+ * @tx_packets: packets (MSDUs & MMPDUs) transmitted to this link of station
+ * @tx_retries: cumulative retry counts (MPDUs) for this link of station
+ * @tx_failed: number of failed transmissions (MPDUs) (retries exceeded, no ACK)
+ * @rx_dropped_misc: Dropped for un-specified reason.
+ * @bss_param: current BSS parameters
+ * @beacon_loss_count: Number of times beacon loss event has triggered.
+ * @expected_throughput: expected throughput in kbps (including 802.11 headers)
+ * towards this station.
+ * @rx_beacon: number of beacons received from this peer
+ * @rx_beacon_signal_avg: signal strength average (in dBm) for beacons received
+ * from this peer
+ * @rx_duration: aggregate PPDU duration(usecs) for all the frames from a peer
+ * @tx_duration: aggregate PPDU duration(usecs) for all the frames to a peer
+ * @airtime_weight: current airtime scheduling weight
+ * @pertid: per-TID statistics, see &struct cfg80211_tid_stats, using the last
+ * (IEEE80211_NUM_TIDS) index for MSDUs not encapsulated in QoS-MPDUs.
+ * Note that this doesn't use the @filled bit, but is used if non-NULL.
+ * @ack_signal: signal strength (in dBm) of the last ACK frame.
+ * @avg_ack_signal: average rssi value of ack packet for the no of msdu's has
+ * been sent.
+ * @rx_mpdu_count: number of MPDUs received from this station
+ * @fcs_err_count: number of packets (MPDUs) received from this station with
+ * an FCS error. This counter should be incremented only when TA of the
+ * received packet with an FCS error matches the peer MAC address.
+ * @addr: For MLO STA connection, filled with address of the link of station.
+ */
+struct link_station_info {
+ u64 filled;
+ u32 connected_time;
+ u32 inactive_time;
+ u64 assoc_at;
+ u64 rx_bytes;
+ u64 tx_bytes;
+ s8 signal;
+ s8 signal_avg;
+
+ u8 chains;
+ s8 chain_signal[IEEE80211_MAX_CHAINS];
+ s8 chain_signal_avg[IEEE80211_MAX_CHAINS];
+
+ struct rate_info txrate;
+ struct rate_info rxrate;
+ u32 rx_packets;
+ u32 tx_packets;
+ u32 tx_retries;
+ u32 tx_failed;
+ u32 rx_dropped_misc;
+ struct sta_bss_parameters bss_param;
+
+ u32 beacon_loss_count;
+
+ u32 expected_throughput;
+
+ u64 tx_duration;
+ u64 rx_duration;
+ u64 rx_beacon;
+ u8 rx_beacon_signal_avg;
+
+ u16 airtime_weight;
+
+ s8 ack_signal;
+ s8 avg_ack_signal;
+ struct cfg80211_tid_stats *pertid;
+
+ u32 rx_mpdu_count;
+ u32 fcs_err_count;
+
+ u8 addr[ETH_ALEN] __aligned(2);
+};
+
+/**
* struct station_info - station information
*
* Station information filled by driver for get_station() and dump_station.
@@ -2101,6 +2223,11 @@ struct cfg80211_tid_stats {
* dump_station() callbacks. User space needs this information to determine
* the accepted and rejected affiliated links of the connected station.
* @assoc_resp_ies_len: Length of @assoc_resp_ies buffer in octets.
+ * @valid_links: bitmap of valid links, or 0 for non-MLO. Drivers fill this
+ * information in cfg80211_new_sta(), cfg80211_del_sta_sinfo(),
+ * get_station() and dump_station() callbacks.
+ * @links: reference to Link sta entries for MLO STA, all link specific
+ * information is accessed through links[link_id].
*/
struct station_info {
u64 filled;
@@ -2165,6 +2292,9 @@ struct station_info {
u8 mld_addr[ETH_ALEN] __aligned(2);
const u8 *assoc_resp_ies;
size_t assoc_resp_ies_len;
+
+ u16 valid_links;
+ struct link_station_info *links[IEEE80211_MLD_MAX_NUM_LINKS];
};
/**
@@ -2645,15 +2775,16 @@ struct cfg80211_scan_6ghz_params {
* @wiphy: the wiphy this was for
* @scan_start: time (in jiffies) when the scan started
* @wdev: the wireless device to scan for
- * @info: (internal) information about completed scan
- * @notified: (internal) scan request was notified as done or aborted
* @no_cck: used to send probe requests at non CCK rate in 2GHz band
* @mac_addr: MAC address used with randomisation
* @mac_addr_mask: MAC address mask used with randomisation, bits that
* are 0 in the mask should be randomised, bits that are 1 should
* be taken from the @mac_addr
* @scan_6ghz: relevant for split scan request only,
- * true if this is the second scan request
+ * true if this is a 6 GHz scan request
+ * @first_part: %true if this is the first part of a split scan request or a
+ * scan that was not split. May be %true for a @scan_6ghz scan if no other
+ * channels were requested
* @n_6ghz_params: number of 6 GHz params
* @scan_6ghz_params: 6 GHz params
* @bssid: BSSID to scan for (most commonly, the wildcard BSSID)
@@ -2677,14 +2808,11 @@ struct cfg80211_scan_request {
u8 mac_addr[ETH_ALEN] __aligned(2);
u8 mac_addr_mask[ETH_ALEN] __aligned(2);
u8 bssid[ETH_ALEN] __aligned(2);
-
- /* internal */
struct wiphy *wiphy;
unsigned long scan_start;
- struct cfg80211_scan_info info;
- bool notified;
bool no_cck;
bool scan_6ghz;
+ bool first_part;
u32 n_6ghz_params;
struct cfg80211_scan_6ghz_params *scan_6ghz_params;
s8 tsf_report_link_id;
@@ -4752,12 +4880,14 @@ struct cfg80211_ops {
int (*set_mcast_rate)(struct wiphy *wiphy, struct net_device *dev,
int rate[NUM_NL80211_BANDS]);
- int (*set_wiphy_params)(struct wiphy *wiphy, u32 changed);
+ int (*set_wiphy_params)(struct wiphy *wiphy, int radio_idx,
+ u32 changed);
int (*set_tx_power)(struct wiphy *wiphy, struct wireless_dev *wdev,
+ int radio_idx,
enum nl80211_tx_power_setting type, int mbm);
int (*get_tx_power)(struct wiphy *wiphy, struct wireless_dev *wdev,
- unsigned int link_id, int *dbm);
+ int radio_idx, unsigned int link_id, int *dbm);
void (*rfkill_poll)(struct wiphy *wiphy);
@@ -4819,8 +4949,10 @@ struct cfg80211_ops {
struct wireless_dev *wdev,
struct mgmt_frame_regs *upd);
- int (*set_antenna)(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant);
- int (*get_antenna)(struct wiphy *wiphy, u32 *tx_ant, u32 *rx_ant);
+ int (*set_antenna)(struct wiphy *wiphy, int radio_idx,
+ u32 tx_ant, u32 rx_ant);
+ int (*get_antenna)(struct wiphy *wiphy, int radio_idx,
+ u32 *tx_ant, u32 *rx_ant);
int (*sched_scan_start)(struct wiphy *wiphy,
struct net_device *dev,
@@ -5443,6 +5575,18 @@ struct wiphy_iftype_akm_suites {
};
/**
+ * struct wiphy_radio_cfg - physical radio config of a wiphy
+ * This structure describes the configurations of a physical radio in a
+ * wiphy. It is used to denote per-radio attributes belonging to a wiphy.
+ *
+ * @rts_threshold: RTS threshold (dot11RTSThreshold);
+ * -1 (default) = RTS/CTS disabled
+ */
+struct wiphy_radio_cfg {
+ u32 rts_threshold;
+};
+
+/**
* struct wiphy_radio_freq_range - wiphy frequency range
* @start_freq: start range edge frequency (kHz)
* @end_freq: end range edge frequency (kHz)
@@ -5697,6 +5841,10 @@ struct wiphy_radio {
* supports enabling HW timestamping for all peers (i.e. no need to
* specify a mac address).
*
+ * @radio_cfg: configuration of radios belonging to a muli-radio wiphy. This
+ * struct contains a list of all radio specific attributes and should be
+ * used only for multi-radio wiphy.
+ *
* @radio: radios belonging to this wiphy
* @n_radio: number of radios
*/
@@ -5786,6 +5934,8 @@ struct wiphy {
void (*reg_notifier)(struct wiphy *wiphy,
struct regulatory_request *request);
+ struct wiphy_radio_cfg *radio_cfg;
+
/* fields below are read-only, assigned by cfg80211 */
const struct ieee80211_regdomain __rcu *regd;
@@ -8466,6 +8616,17 @@ void cfg80211_tx_mgmt_expired(struct wireless_dev *wdev, u64 cookie,
int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp);
/**
+ * cfg80211_link_sinfo_alloc_tid_stats - allocate per-tid statistics.
+ *
+ * @link_sinfo: the link station information
+ * @gfp: allocation flags
+ *
+ * Return: 0 on success. Non-zero on error.
+ */
+int cfg80211_link_sinfo_alloc_tid_stats(struct link_station_info *link_sinfo,
+ gfp_t gfp);
+
+/**
* cfg80211_sinfo_release_content - release contents of station info
* @sinfo: the station information
*
@@ -8476,6 +8637,13 @@ int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp);
static inline void cfg80211_sinfo_release_content(struct station_info *sinfo)
{
kfree(sinfo->pertid);
+
+ for (int link_id = 0; link_id < ARRAY_SIZE(sinfo->links); link_id++) {
+ if (sinfo->links[link_id]) {
+ kfree(sinfo->links[link_id]->pertid);
+ kfree(sinfo->links[link_id]);
+ }
+ }
}
/**
@@ -8880,6 +9048,7 @@ void cfg80211_pmksa_candidate_notify(struct net_device *dev, int index,
/**
* cfg80211_rx_spurious_frame - inform userspace about a spurious frame
* @dev: The device the frame matched to
+ * @link_id: the link the frame was received on, -1 if not applicable or unknown
* @addr: the transmitter address
* @gfp: context flags
*
@@ -8889,13 +9058,14 @@ void cfg80211_pmksa_candidate_notify(struct net_device *dev, int index,
* Return: %true if the frame was passed to userspace (or this failed
* for a reason other than not having a subscription.)
*/
-bool cfg80211_rx_spurious_frame(struct net_device *dev,
- const u8 *addr, gfp_t gfp);
+bool cfg80211_rx_spurious_frame(struct net_device *dev, const u8 *addr,
+ int link_id, gfp_t gfp);
/**
* cfg80211_rx_unexpected_4addr_frame - inform about unexpected WDS frame
* @dev: The device the frame matched to
* @addr: the transmitter address
+ * @link_id: the link the frame was received on, -1 if not applicable or unknown
* @gfp: context flags
*
* This function is used in AP mode (only!) to inform userspace that
@@ -8905,8 +9075,8 @@ bool cfg80211_rx_spurious_frame(struct net_device *dev,
* Return: %true if the frame was passed to userspace (or this failed
* for a reason other than not having a subscription.)
*/
-bool cfg80211_rx_unexpected_4addr_frame(struct net_device *dev,
- const u8 *addr, gfp_t gfp);
+bool cfg80211_rx_unexpected_4addr_frame(struct net_device *dev, const u8 *addr,
+ int link_id, gfp_t gfp);
/**
* cfg80211_probe_status - notify userspace about probe status
@@ -9372,6 +9542,17 @@ int cfg80211_iter_combinations(struct wiphy *wiphy,
void (*iter)(const struct ieee80211_iface_combination *c,
void *data),
void *data);
+/**
+ * cfg80211_get_radio_idx_by_chan - get the radio index by the channel
+ *
+ * @wiphy: the wiphy
+ * @chan: channel for which the supported radio index is required
+ *
+ * Return: radio index on success or a negative error code
+ */
+int cfg80211_get_radio_idx_by_chan(struct wiphy *wiphy,
+ const struct ieee80211_channel *chan);
+
/**
* cfg80211_stop_iface - trigger interface disconnection
@@ -9736,6 +9917,11 @@ void cfg80211_links_removed(struct net_device *dev, u16 link_mask);
* struct cfg80211_mlo_reconf_done_data - MLO reconfiguration data
* @buf: MLO Reconfiguration Response frame (header + body)
* @len: length of the frame data
+ * @driver_initiated: Indicates whether the add links request is initiated by
+ * driver. This is set to true when the link reconfiguration request
+ * initiated by driver due to AP link recommendation requests
+ * (Ex: BTM (BSS Transition Management) request) handling offloaded to
+ * driver.
* @added_links: BIT mask of links successfully added to the association
* @links: per-link information indexed by link ID
* @links.bss: the BSS that MLO reconfiguration was requested for, ownership of
@@ -9748,6 +9934,7 @@ void cfg80211_links_removed(struct net_device *dev, u16 link_mask);
struct cfg80211_mlo_reconf_done_data {
const u8 *buf;
size_t len;
+ bool driver_initiated;
u16 added_links;
struct {
struct cfg80211_bss *bss;
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 0091f23a40f7..93640a29427c 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -118,6 +118,8 @@ struct devlink_rate {
u32 tx_priority;
u32 tx_weight;
+
+ u32 tc_bw[DEVLINK_RATE_TCS_MAX];
};
struct devlink_port {
@@ -423,6 +425,7 @@ enum devlink_param_type {
DEVLINK_PARAM_TYPE_U8 = DEVLINK_VAR_ATTR_TYPE_U8,
DEVLINK_PARAM_TYPE_U16 = DEVLINK_VAR_ATTR_TYPE_U16,
DEVLINK_PARAM_TYPE_U32 = DEVLINK_VAR_ATTR_TYPE_U32,
+ DEVLINK_PARAM_TYPE_U64 = DEVLINK_VAR_ATTR_TYPE_U64,
DEVLINK_PARAM_TYPE_STRING = DEVLINK_VAR_ATTR_TYPE_STRING,
DEVLINK_PARAM_TYPE_BOOL = DEVLINK_VAR_ATTR_TYPE_FLAG,
};
@@ -431,6 +434,7 @@ union devlink_param_value {
u8 vu8;
u16 vu16;
u32 vu32;
+ u64 vu64;
char vstr[__DEVLINK_PARAM_MAX_STRING_VALUE];
bool vbool;
};
@@ -520,6 +524,8 @@ enum devlink_param_generic_id {
DEVLINK_PARAM_GENERIC_ID_ENABLE_IWARP,
DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE,
DEVLINK_PARAM_GENERIC_ID_EVENT_EQ_SIZE,
+ DEVLINK_PARAM_GENERIC_ID_ENABLE_PHC,
+ DEVLINK_PARAM_GENERIC_ID_CLOCK_ID,
/* add new param generic ids above here*/
__DEVLINK_PARAM_GENERIC_ID_MAX,
@@ -578,6 +584,12 @@ enum devlink_param_generic_id {
#define DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_NAME "event_eq_size"
#define DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_TYPE DEVLINK_PARAM_TYPE_U32
+#define DEVLINK_PARAM_GENERIC_ENABLE_PHC_NAME "enable_phc"
+#define DEVLINK_PARAM_GENERIC_ENABLE_PHC_TYPE DEVLINK_PARAM_TYPE_BOOL
+
+#define DEVLINK_PARAM_GENERIC_CLOCK_ID_NAME "clock_id"
+#define DEVLINK_PARAM_GENERIC_CLOCK_ID_TYPE DEVLINK_PARAM_TYPE_U64
+
#define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \
{ \
.id = DEVLINK_PARAM_GENERIC_ID_##_id, \
@@ -1482,6 +1494,9 @@ struct devlink_ops {
u32 tx_priority, struct netlink_ext_ack *extack);
int (*rate_leaf_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv,
u32 tx_weight, struct netlink_ext_ack *extack);
+ int (*rate_leaf_tc_bw_set)(struct devlink_rate *devlink_rate,
+ void *priv, u32 *tc_bw,
+ struct netlink_ext_ack *extack);
int (*rate_node_tx_share_set)(struct devlink_rate *devlink_rate, void *priv,
u64 tx_share, struct netlink_ext_ack *extack);
int (*rate_node_tx_max_set)(struct devlink_rate *devlink_rate, void *priv,
@@ -1490,6 +1505,9 @@ struct devlink_ops {
u32 tx_priority, struct netlink_ext_ack *extack);
int (*rate_node_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv,
u32 tx_weight, struct netlink_ext_ack *extack);
+ int (*rate_node_tc_bw_set)(struct devlink_rate *devlink_rate,
+ void *priv, u32 *tc_bw,
+ struct netlink_ext_ack *extack);
int (*rate_node_new)(struct devlink_rate *rate_node, void **priv,
struct netlink_ext_ack *extack);
int (*rate_node_del)(struct devlink_rate *rate_node, void *priv,
diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
index bcf9d7467e1a..d8ff24a33459 100644
--- a/include/net/dropreason-core.h
+++ b/include/net/dropreason-core.h
@@ -45,6 +45,7 @@
FN(TCP_LISTEN_OVERFLOW) \
FN(TCP_OLD_SEQUENCE) \
FN(TCP_INVALID_SEQUENCE) \
+ FN(TCP_INVALID_END_SEQUENCE) \
FN(TCP_INVALID_ACK_SEQUENCE) \
FN(TCP_RESET) \
FN(TCP_INVALID_SYN) \
@@ -121,6 +122,11 @@
FN(ARP_PVLAN_DISABLE) \
FN(MAC_IEEE_MAC_CONTROL) \
FN(BRIDGE_INGRESS_STP_STATE) \
+ FN(CAN_RX_INVALID_FRAME) \
+ FN(CANFD_RX_INVALID_FRAME) \
+ FN(CANXL_RX_INVALID_FRAME) \
+ FN(PFMEMALLOC) \
+ FN(DUALPI2_STEP_DROP) \
FNe(MAX)
/**
@@ -300,9 +306,15 @@ enum skb_drop_reason {
SKB_DROP_REASON_TCP_LISTEN_OVERFLOW,
/** @SKB_DROP_REASON_TCP_OLD_SEQUENCE: Old SEQ field (duplicate packet) */
SKB_DROP_REASON_TCP_OLD_SEQUENCE,
- /** @SKB_DROP_REASON_TCP_INVALID_SEQUENCE: Not acceptable SEQ field */
+ /** @SKB_DROP_REASON_TCP_INVALID_SEQUENCE: Not acceptable SEQ field. */
SKB_DROP_REASON_TCP_INVALID_SEQUENCE,
/**
+ * @SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE:
+ * Not acceptable END_SEQ field.
+ * Corresponds to LINUX_MIB_BEYOND_WINDOW.
+ */
+ SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE,
+ /**
* @SKB_DROP_REASON_TCP_INVALID_ACK_SEQUENCE: Not acceptable ACK SEQ
* field because ack sequence is not in the window between snd_una
* and snd_nxt
@@ -574,6 +586,31 @@ enum skb_drop_reason {
*/
SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE,
/**
+ * @SKB_DROP_REASON_CAN_RX_INVALID_FRAME: received
+ * non conform CAN frame (or device is unable to receive CAN frames)
+ */
+ SKB_DROP_REASON_CAN_RX_INVALID_FRAME,
+ /**
+ * @SKB_DROP_REASON_CANFD_RX_INVALID_FRAME: received
+ * non conform CAN-FD frame (or device is unable to receive CAN frames)
+ */
+ SKB_DROP_REASON_CANFD_RX_INVALID_FRAME,
+ /**
+ * @SKB_DROP_REASON_CANXL_RX_INVALID_FRAME: received
+ * non conform CAN-XL frame (or device is unable to receive CAN frames)
+ */
+ SKB_DROP_REASON_CANXL_RX_INVALID_FRAME,
+ /**
+ * @SKB_DROP_REASON_PFMEMALLOC: packet allocated from memory reserve
+ * reached a path or socket not eligible for use of memory reserves
+ */
+ SKB_DROP_REASON_PFMEMALLOC,
+ /**
+ * @SKB_DROP_REASON_DUALPI2_STEP_DROP: dropped by the step drop
+ * threshold of DualPI2 qdisc.
+ */
+ SKB_DROP_REASON_DUALPI2_STEP_DROP,
+ /**
* @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which
* shouldn't be used as a real 'reason' - only for tracing code gen
*/
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 55e2d97f247e..d73ea0880066 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -54,11 +54,13 @@ struct tc_action;
#define DSA_TAG_PROTO_RZN1_A5PSW_VALUE 26
#define DSA_TAG_PROTO_LAN937X_VALUE 27
#define DSA_TAG_PROTO_VSC73XX_8021Q_VALUE 28
+#define DSA_TAG_PROTO_BRCM_LEGACY_FCS_VALUE 29
enum dsa_tag_protocol {
DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE,
DSA_TAG_PROTO_BRCM = DSA_TAG_PROTO_BRCM_VALUE,
DSA_TAG_PROTO_BRCM_LEGACY = DSA_TAG_PROTO_BRCM_LEGACY_VALUE,
+ DSA_TAG_PROTO_BRCM_LEGACY_FCS = DSA_TAG_PROTO_BRCM_LEGACY_FCS_VALUE,
DSA_TAG_PROTO_BRCM_PREPEND = DSA_TAG_PROTO_BRCM_PREPEND_VALUE,
DSA_TAG_PROTO_DSA = DSA_TAG_PROTO_DSA_VALUE,
DSA_TAG_PROTO_EDSA = DSA_TAG_PROTO_EDSA_VALUE,
diff --git a/include/net/dst.h b/include/net/dst.h
index 78c78cdce0e9..00467c1b5093 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -240,9 +240,9 @@ static inline void dst_hold(struct dst_entry *dst)
static inline void dst_use_noref(struct dst_entry *dst, unsigned long time)
{
- if (unlikely(time != dst->lastuse)) {
+ if (unlikely(time != READ_ONCE(dst->lastuse))) {
dst->__use++;
- dst->lastuse = time;
+ WRITE_ONCE(dst->lastuse, time);
}
}
@@ -431,13 +431,15 @@ static inline void dst_link_failure(struct sk_buff *skb)
static inline void dst_set_expires(struct dst_entry *dst, int timeout)
{
- unsigned long expires = jiffies + timeout;
+ unsigned long old, expires = jiffies + timeout;
if (expires == 0)
expires = 1;
- if (dst->expires == 0 || time_before(expires, dst->expires))
- dst->expires = expires;
+ old = READ_ONCE(dst->expires);
+
+ if (!old || time_before(expires, old))
+ WRITE_ONCE(dst->expires, expires);
}
static inline unsigned int dst_dev_overhead(struct dst_entry *dst,
@@ -456,7 +458,7 @@ INDIRECT_CALLABLE_DECLARE(int ip_output(struct net *, struct sock *,
/* Output packet to network from transport. */
static inline int dst_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- return INDIRECT_CALL_INET(skb_dst(skb)->output,
+ return INDIRECT_CALL_INET(READ_ONCE(skb_dst(skb)->output),
ip6_output, ip_output,
net, sk, skb);
}
@@ -466,7 +468,7 @@ INDIRECT_CALLABLE_DECLARE(int ip_local_deliver(struct sk_buff *));
/* Input packet from network to transport. */
static inline int dst_input(struct sk_buff *skb)
{
- return INDIRECT_CALL_INET(skb_dst(skb)->input,
+ return INDIRECT_CALL_INET(READ_ONCE(skb_dst(skb)->input),
ip6_input, ip_local_deliver, skb);
}
@@ -476,7 +478,7 @@ INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
u32));
static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie)
{
- if (dst->obsolete)
+ if (READ_ONCE(dst->obsolete))
dst = INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check,
ipv4_dst_check, dst, cookie);
return dst;
@@ -561,6 +563,26 @@ static inline void skb_dst_update_pmtu_no_confirm(struct sk_buff *skb, u32 mtu)
dst->ops->update_pmtu(dst, NULL, skb, mtu, false);
}
+static inline struct net_device *dst_dev(const struct dst_entry *dst)
+{
+ return READ_ONCE(dst->dev);
+}
+
+static inline struct net_device *skb_dst_dev(const struct sk_buff *skb)
+{
+ return dst_dev(skb_dst(skb));
+}
+
+static inline struct net *skb_dst_dev_net(const struct sk_buff *skb)
+{
+ return dev_net(skb_dst_dev(skb));
+}
+
+static inline struct net *skb_dst_dev_net_rcu(const struct sk_buff *skb)
+{
+ return dev_net_rcu(skb_dst_dev(skb));
+}
+
struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie);
void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb, u32 mtu, bool confirm_neigh);
diff --git a/include/net/gro.h b/include/net/gro.h
index 22d3a69e4404..a0fca7ac6e7e 100644
--- a/include/net/gro.h
+++ b/include/net/gro.h
@@ -534,6 +534,12 @@ static inline void gro_normal_list(struct gro_node *gro)
gro->rx_count = 0;
}
+static inline void gro_flush_normal(struct gro_node *gro, bool flush_old)
+{
+ gro_flush(gro, flush_old);
+ gro_normal_list(gro);
+}
+
/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded,
* pass the whole batch up to the stack.
*/
diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index c32878c69179..ab3929a2a956 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -150,7 +150,7 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo,
int iif, int sdif,
bool *refcounted)
{
- struct net *net = dev_net_rcu(skb_dst(skb)->dev);
+ struct net *net = skb_dst_dev_net_rcu(skb);
const struct ipv6hdr *ip6h = ipv6_hdr(skb);
struct sock *sk;
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 4564b5d348b1..19dbd9081d5a 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -202,12 +202,6 @@ static inline spinlock_t *inet_ehash_lockp(
int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo);
-static inline void inet_hashinfo2_free_mod(struct inet_hashinfo *h)
-{
- kfree(h->lhash2);
- h->lhash2 = NULL;
-}
-
static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo)
{
kvfree(hashinfo->ehash_locks);
@@ -487,7 +481,7 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo,
const int sdif,
bool *refcounted)
{
- struct net *net = dev_net_rcu(skb_dst(skb)->dev);
+ struct net *net = skb_dst_dev_net_rcu(skb);
const struct iphdr *iph = ip_hdr(skb);
struct sock *sk;
diff --git a/include/net/ip.h b/include/net/ip.h
index 47ed6d23853d..befcba575129 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -59,6 +59,7 @@ struct inet_skb_parm {
#define IPSKB_L3SLAVE BIT(7)
#define IPSKB_NOPOLICY BIT(8)
#define IPSKB_MULTIPATH BIT(9)
+#define IPSKB_MCROUTE BIT(10)
u16 frag_max_size;
};
@@ -167,6 +168,7 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt,
int ip_local_deliver(struct sk_buff *skb);
void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int proto);
int ip_mr_input(struct sk_buff *skb);
+int ip_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb);
int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb);
int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb);
int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
@@ -470,12 +472,12 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
rcu_read_lock();
- net = dev_net_rcu(dst->dev);
+ net = dev_net_rcu(dst_dev(dst));
if (READ_ONCE(net->ipv4.sysctl_ip_fwd_use_pmtu) ||
ip_mtu_locked(dst) ||
!forwarding) {
mtu = rt->rt_pmtu;
- if (mtu && time_before(jiffies, rt->dst.expires))
+ if (mtu && time_before(jiffies, READ_ONCE(rt->dst.expires)))
goto out;
}
@@ -484,7 +486,7 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
if (mtu)
goto out;
- mtu = READ_ONCE(dst->dev->mtu);
+ mtu = READ_ONCE(dst_dev(dst)->mtu);
if (unlikely(ip_mtu_locked(dst))) {
if (rt->rt_uses_gateway && mtu > 576)
@@ -504,16 +506,17 @@ out:
static inline unsigned int ip_skb_dst_mtu(struct sock *sk,
const struct sk_buff *skb)
{
+ const struct dst_entry *dst = skb_dst(skb);
unsigned int mtu;
if (!sk || !sk_fullsock(sk) || ip_sk_use_pmtu(sk)) {
bool forwarding = IPCB(skb)->flags & IPSKB_FORWARDED;
- return ip_dst_mtu_maybe_forward(skb_dst(skb), forwarding);
+ return ip_dst_mtu_maybe_forward(dst, forwarding);
}
- mtu = min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU);
- return mtu - lwtunnel_headroom(skb_dst(skb)->lwtstate, mtu);
+ mtu = min(READ_ONCE(dst_dev(dst)->mtu), IP_MAX_MTU);
+ return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
struct dst_metrics *ip_fib_metrics_init(struct nlattr *fc_mx, int fc_mx_len,
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 6dbdf60b342f..9255f21818ee 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -274,7 +274,7 @@ static inline unsigned int ip6_skb_dst_mtu(const struct sk_buff *skb)
unsigned int mtu;
if (np && READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE) {
- mtu = READ_ONCE(dst->dev->mtu);
+ mtu = READ_ONCE(dst_dev(dst)->mtu);
mtu -= lwtunnel_headroom(dst->lwtstate, mtu);
} else {
mtu = dst_mtu(dst);
@@ -337,7 +337,7 @@ static inline unsigned int ip6_dst_mtu_maybe_forward(const struct dst_entry *dst
mtu = IPV6_MIN_MTU;
rcu_read_lock();
- idev = __in6_dev_get(dst->dev);
+ idev = __in6_dev_get(dst_dev(dst));
if (idev)
mtu = READ_ONCE(idev->cnf.mtu6);
rcu_read_unlock();
diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h
index 399592405c72..120db2865811 100644
--- a/include/net/ip6_tunnel.h
+++ b/include/net/ip6_tunnel.h
@@ -152,13 +152,14 @@ int ip6_tnl_get_iflink(const struct net_device *dev);
int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu);
static inline void ip6tunnel_xmit(struct sock *sk, struct sk_buff *skb,
- struct net_device *dev)
+ struct net_device *dev, u16 ip6cb_flags)
{
int pkt_len, err;
memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
+ IP6CB(skb)->flags = ip6cb_flags;
pkt_len = skb->len - skb_inner_network_offset(skb);
- err = ip6_local_out(dev_net(skb_dst(skb)->dev), sk, skb);
+ err = ip6_local_out(skb_dst_dev_net(skb), sk, skb);
if (dev) {
if (unlikely(net_xmit_eval(err)))
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 0c3d571a04a1..8cf1380f3656 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -603,7 +603,7 @@ static inline int iptunnel_pull_header(struct sk_buff *skb, int hdr_len,
void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
__be32 src, __be32 dst, u8 proto,
- u8 tos, u8 ttl, __be16 df, bool xnet);
+ u8 tos, u8 ttl, __be16 df, bool xnet, u16 ipcb_flags);
struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
gfp_t flags);
int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst,
diff --git a/include/net/libeth/rx.h b/include/net/libeth/rx.h
index ab05024be518..5d991404845e 100644
--- a/include/net/libeth/rx.h
+++ b/include/net/libeth/rx.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0-only */
-/* Copyright (C) 2024 Intel Corporation */
+/* Copyright (C) 2024-2025 Intel Corporation */
#ifndef __LIBETH_RX_H
#define __LIBETH_RX_H
@@ -13,8 +13,10 @@
/* Space reserved in front of each frame */
#define LIBETH_SKB_HEADROOM (NET_SKB_PAD + NET_IP_ALIGN)
+#define LIBETH_XDP_HEADROOM (ALIGN(XDP_PACKET_HEADROOM, NET_SKB_PAD) + \
+ NET_IP_ALIGN)
/* Maximum headroom for worst-case calculations */
-#define LIBETH_MAX_HEADROOM LIBETH_SKB_HEADROOM
+#define LIBETH_MAX_HEADROOM LIBETH_XDP_HEADROOM
/* Link layer / L2 overhead: Ethernet, 2 VLAN tags (C + S), FCS */
#define LIBETH_RX_LL_LEN (ETH_HLEN + 2 * VLAN_HLEN + ETH_FCS_LEN)
/* Maximum supported L2-L4 header length */
@@ -31,7 +33,7 @@
/**
* struct libeth_fqe - structure representing an Rx buffer (fill queue element)
- * @page: page holding the buffer
+ * @netmem: network memory reference holding the buffer
* @offset: offset from the page start (to the headroom)
* @truesize: total space occupied by the buffer (w/ headroom and tailroom)
*
@@ -40,7 +42,7 @@
* former, @offset is always 0 and @truesize is always ```PAGE_SIZE```.
*/
struct libeth_fqe {
- struct page *page;
+ netmem_ref netmem;
u32 offset;
u32 truesize;
} __aligned_largest;
@@ -66,6 +68,7 @@ enum libeth_fqe_type {
* @count: number of descriptors/buffers the queue has
* @type: type of the buffers this queue has
* @hsplit: flag whether header split is enabled
+ * @xdp: flag indicating whether XDP is enabled
* @buf_len: HW-writeable length per each buffer
* @nid: ID of the closest NUMA node with memory
*/
@@ -81,6 +84,7 @@ struct libeth_fq {
/* Cold fields */
enum libeth_fqe_type type:2;
bool hsplit:1;
+ bool xdp:1;
u32 buf_len;
int nid;
@@ -102,15 +106,16 @@ static inline dma_addr_t libeth_rx_alloc(const struct libeth_fq_fp *fq, u32 i)
struct libeth_fqe *buf = &fq->fqes[i];
buf->truesize = fq->truesize;
- buf->page = page_pool_dev_alloc(fq->pp, &buf->offset, &buf->truesize);
- if (unlikely(!buf->page))
+ buf->netmem = page_pool_dev_alloc_netmem(fq->pp, &buf->offset,
+ &buf->truesize);
+ if (unlikely(!buf->netmem))
return DMA_MAPPING_ERROR;
- return page_pool_get_dma_addr(buf->page) + buf->offset +
+ return page_pool_get_dma_addr_netmem(buf->netmem) + buf->offset +
fq->pp->p.offset;
}
-void libeth_rx_recycle_slow(struct page *page);
+void libeth_rx_recycle_slow(netmem_ref netmem);
/**
* libeth_rx_sync_for_cpu - synchronize or recycle buffer post DMA
@@ -126,18 +131,19 @@ void libeth_rx_recycle_slow(struct page *page);
static inline bool libeth_rx_sync_for_cpu(const struct libeth_fqe *fqe,
u32 len)
{
- struct page *page = fqe->page;
+ netmem_ref netmem = fqe->netmem;
/* Very rare, but possible case. The most common reason:
* the last fragment contained FCS only, which was then
* stripped by the HW.
*/
if (unlikely(!len)) {
- libeth_rx_recycle_slow(page);
+ libeth_rx_recycle_slow(netmem);
return false;
}
- page_pool_dma_sync_for_cpu(page->pp, page, fqe->offset, len);
+ page_pool_dma_sync_netmem_for_cpu(netmem_get_pp(netmem), netmem,
+ fqe->offset, len);
return true;
}
diff --git a/include/net/libeth/tx.h b/include/net/libeth/tx.h
index 35614f9523f6..c3db5c6f1641 100644
--- a/include/net/libeth/tx.h
+++ b/include/net/libeth/tx.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0-only */
-/* Copyright (C) 2024 Intel Corporation */
+/* Copyright (C) 2024-2025 Intel Corporation */
#ifndef __LIBETH_TX_H
#define __LIBETH_TX_H
@@ -12,11 +12,17 @@
/**
* enum libeth_sqe_type - type of &libeth_sqe to act on Tx completion
- * @LIBETH_SQE_EMPTY: unused/empty, no action required
+ * @LIBETH_SQE_EMPTY: unused/empty OR XDP_TX/XSk frame, no action required
* @LIBETH_SQE_CTX: context descriptor with empty SQE, no action required
* @LIBETH_SQE_SLAB: kmalloc-allocated buffer, unmap and kfree()
* @LIBETH_SQE_FRAG: mapped skb frag, only unmap DMA
* @LIBETH_SQE_SKB: &sk_buff, unmap and napi_consume_skb(), update stats
+ * @__LIBETH_SQE_XDP_START: separator between skb and XDP types
+ * @LIBETH_SQE_XDP_TX: &skb_shared_info, libeth_xdp_return_buff_bulk(), stats
+ * @LIBETH_SQE_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame_bulk(), stats
+ * @LIBETH_SQE_XDP_XMIT_FRAG: &xdp_frame frag, only unmap DMA
+ * @LIBETH_SQE_XSK_TX: &libeth_xdp_buff on XSk queue, xsk_buff_free(), stats
+ * @LIBETH_SQE_XSK_TX_FRAG: &libeth_xdp_buff frag on XSk queue, xsk_buff_free()
*/
enum libeth_sqe_type {
LIBETH_SQE_EMPTY = 0U,
@@ -24,6 +30,13 @@ enum libeth_sqe_type {
LIBETH_SQE_SLAB,
LIBETH_SQE_FRAG,
LIBETH_SQE_SKB,
+
+ __LIBETH_SQE_XDP_START,
+ LIBETH_SQE_XDP_TX = __LIBETH_SQE_XDP_START,
+ LIBETH_SQE_XDP_XMIT,
+ LIBETH_SQE_XDP_XMIT_FRAG,
+ LIBETH_SQE_XSK_TX,
+ LIBETH_SQE_XSK_TX_FRAG,
};
/**
@@ -32,6 +45,9 @@ enum libeth_sqe_type {
* @rs_idx: index of the last buffer from the batch this one was sent in
* @raw: slab buffer to free via kfree()
* @skb: &sk_buff to consume
+ * @sinfo: skb shared info of an XDP_TX frame
+ * @xdpf: XDP frame from ::ndo_xdp_xmit()
+ * @xsk: XSk Rx frame from XDP_TX action
* @dma: DMA address to unmap
* @len: length of the mapped region to unmap
* @nr_frags: number of frags in the frame this buffer belongs to
@@ -46,6 +62,9 @@ struct libeth_sqe {
union {
void *raw;
struct sk_buff *skb;
+ struct skb_shared_info *sinfo;
+ struct xdp_frame *xdpf;
+ struct libeth_xdp_buff *xsk;
};
DEFINE_DMA_UNMAP_ADDR(dma);
@@ -71,7 +90,10 @@ struct libeth_sqe {
/**
* struct libeth_cq_pp - completion queue poll params
* @dev: &device to perform DMA unmapping
+ * @bq: XDP frame bulk to combine return operations
* @ss: onstack NAPI stats to fill
+ * @xss: onstack XDPSQ NAPI stats to fill
+ * @xdp_tx: number of XDP-not-XSk frames processed
* @napi: whether it's called from the NAPI context
*
* libeth uses this structure to access objects needed for performing full
@@ -80,7 +102,13 @@ struct libeth_sqe {
*/
struct libeth_cq_pp {
struct device *dev;
- struct libeth_sq_napi_stats *ss;
+ struct xdp_frame_bulk *bq;
+
+ union {
+ struct libeth_sq_napi_stats *ss;
+ struct libeth_xdpsq_napi_stats *xss;
+ };
+ u32 xdp_tx;
bool napi;
};
@@ -126,4 +154,6 @@ static inline void libeth_tx_complete(struct libeth_sqe *sqe,
sqe->type = LIBETH_SQE_EMPTY;
}
+void libeth_tx_complete_any(struct libeth_sqe *sqe, struct libeth_cq_pp *cp);
+
#endif /* __LIBETH_TX_H */
diff --git a/include/net/libeth/types.h b/include/net/libeth/types.h
index 603825e45133..cf1d78a9dc38 100644
--- a/include/net/libeth/types.h
+++ b/include/net/libeth/types.h
@@ -1,10 +1,32 @@
/* SPDX-License-Identifier: GPL-2.0-only */
-/* Copyright (C) 2024 Intel Corporation */
+/* Copyright (C) 2024-2025 Intel Corporation */
#ifndef __LIBETH_TYPES_H
#define __LIBETH_TYPES_H
-#include <linux/types.h>
+#include <linux/workqueue.h>
+
+/* Stats */
+
+/**
+ * struct libeth_rq_napi_stats - "hot" counters to update in Rx polling loop
+ * @packets: received frames counter
+ * @bytes: sum of bytes of received frames above
+ * @fragments: sum of fragments of received S/G frames
+ * @hsplit: number of frames the device performed the header split for
+ * @raw: alias to access all the fields as an array
+ */
+struct libeth_rq_napi_stats {
+ union {
+ struct {
+ u32 packets;
+ u32 bytes;
+ u32 fragments;
+ u32 hsplit;
+ };
+ DECLARE_FLEX_ARRAY(u32, raw);
+ };
+};
/**
* struct libeth_sq_napi_stats - "hot" counters to update in Tx completion loop
@@ -22,4 +44,84 @@ struct libeth_sq_napi_stats {
};
};
+/**
+ * struct libeth_xdpsq_napi_stats - "hot" counters to update in XDP Tx
+ * completion loop
+ * @packets: completed frames counter
+ * @bytes: sum of bytes of completed frames above
+ * @fragments: sum of fragments of completed S/G frames
+ * @raw: alias to access all the fields as an array
+ */
+struct libeth_xdpsq_napi_stats {
+ union {
+ struct {
+ u32 packets;
+ u32 bytes;
+ u32 fragments;
+ };
+ DECLARE_FLEX_ARRAY(u32, raw);
+ };
+};
+
+/* XDP */
+
+/*
+ * The following structures should be embedded into driver's queue structure
+ * and passed to the libeth_xdp helpers, never used directly.
+ */
+
+/* XDPSQ sharing */
+
+/**
+ * struct libeth_xdpsq_lock - locking primitive for sharing XDPSQs
+ * @lock: spinlock for locking the queue
+ * @share: whether this particular queue is shared
+ */
+struct libeth_xdpsq_lock {
+ spinlock_t lock;
+ bool share;
+};
+
+/* XDPSQ clean-up timers */
+
+/**
+ * struct libeth_xdpsq_timer - timer for cleaning up XDPSQs w/o interrupts
+ * @xdpsq: queue this timer belongs to
+ * @lock: lock for the queue
+ * @dwork: work performing cleanups
+ *
+ * XDPSQs not using interrupts but lazy cleaning, i.e. only when there's no
+ * space for sending the current queued frame/bulk, must fire up timers to
+ * make sure there are no stale buffers to free.
+ */
+struct libeth_xdpsq_timer {
+ void *xdpsq;
+ struct libeth_xdpsq_lock *lock;
+
+ struct delayed_work dwork;
+};
+
+/* Rx polling path */
+
+/**
+ * struct libeth_xdp_buff_stash - struct for stashing &xdp_buff onto a queue
+ * @data: pointer to the start of the frame, xdp_buff.data
+ * @headroom: frame headroom, xdp_buff.data - xdp_buff.data_hard_start
+ * @len: frame linear space length, xdp_buff.data_end - xdp_buff.data
+ * @frame_sz: truesize occupied by the frame, xdp_buff.frame_sz
+ * @flags: xdp_buff.flags
+ *
+ * &xdp_buff is 56 bytes long on x64, &libeth_xdp_buff is 64 bytes. This
+ * structure carries only necessary fields to save/restore a partially built
+ * frame on the queue structure to finish it during the next NAPI poll.
+ */
+struct libeth_xdp_buff_stash {
+ void *data;
+ u16 headroom;
+ u16 len;
+
+ u32 frame_sz:24;
+ u32 flags:8;
+} __aligned_largest;
+
#endif /* __LIBETH_TYPES_H */
diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h
new file mode 100644
index 000000000000..f4880b50e804
--- /dev/null
+++ b/include/net/libeth/xdp.h
@@ -0,0 +1,1879 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (C) 2025 Intel Corporation */
+
+#ifndef __LIBETH_XDP_H
+#define __LIBETH_XDP_H
+
+#include <linux/bpf_trace.h>
+#include <linux/unroll.h>
+
+#include <net/libeth/rx.h>
+#include <net/libeth/tx.h>
+#include <net/xsk_buff_pool.h>
+
+/*
+ * Defined as bits to be able to use them as a mask on Rx.
+ * Also used as internal return values on Tx.
+ */
+enum {
+ LIBETH_XDP_PASS = 0U,
+ LIBETH_XDP_DROP = BIT(0),
+ LIBETH_XDP_ABORTED = BIT(1),
+ LIBETH_XDP_TX = BIT(2),
+ LIBETH_XDP_REDIRECT = BIT(3),
+};
+
+/*
+ * &xdp_buff_xsk is the largest structure &libeth_xdp_buff gets casted to,
+ * pick maximum pointer-compatible alignment.
+ */
+#define __LIBETH_XDP_BUFF_ALIGN \
+ (IS_ALIGNED(sizeof(struct xdp_buff_xsk), 16) ? 16 : \
+ IS_ALIGNED(sizeof(struct xdp_buff_xsk), 8) ? 8 : \
+ sizeof(long))
+
+/**
+ * struct libeth_xdp_buff - libeth extension over &xdp_buff
+ * @base: main &xdp_buff
+ * @data: shortcut for @base.data
+ * @desc: RQ descriptor containing metadata for this buffer
+ * @priv: driver-private scratchspace
+ *
+ * The main reason for this is to have a pointer to the descriptor to be able
+ * to quickly get frame metadata from xdpmo and driver buff-to-xdp callbacks
+ * (as well as bigger alignment).
+ * Pointer/layout-compatible with &xdp_buff and &xdp_buff_xsk.
+ */
+struct libeth_xdp_buff {
+ union {
+ struct xdp_buff base;
+ void *data;
+ };
+
+ const void *desc;
+ unsigned long priv[]
+ __aligned(__LIBETH_XDP_BUFF_ALIGN);
+} __aligned(__LIBETH_XDP_BUFF_ALIGN);
+static_assert(offsetof(struct libeth_xdp_buff, data) ==
+ offsetof(struct xdp_buff_xsk, xdp.data));
+static_assert(offsetof(struct libeth_xdp_buff, desc) ==
+ offsetof(struct xdp_buff_xsk, cb));
+static_assert(IS_ALIGNED(sizeof(struct xdp_buff_xsk),
+ __alignof(struct libeth_xdp_buff)));
+
+/**
+ * __LIBETH_XDP_ONSTACK_BUFF - declare a &libeth_xdp_buff on the stack
+ * @name: name of the variable to declare
+ * @...: sizeof() of the driver-private data
+ */
+#define __LIBETH_XDP_ONSTACK_BUFF(name, ...) \
+ ___LIBETH_XDP_ONSTACK_BUFF(name, ##__VA_ARGS__)
+/**
+ * LIBETH_XDP_ONSTACK_BUFF - declare a &libeth_xdp_buff on the stack
+ * @name: name of the variable to declare
+ * @...: type or variable name of the driver-private data
+ */
+#define LIBETH_XDP_ONSTACK_BUFF(name, ...) \
+ __LIBETH_XDP_ONSTACK_BUFF(name, __libeth_xdp_priv_sz(__VA_ARGS__))
+
+#define ___LIBETH_XDP_ONSTACK_BUFF(name, ...) \
+ __DEFINE_FLEX(struct libeth_xdp_buff, name, priv, \
+ LIBETH_XDP_PRIV_SZ(__VA_ARGS__ + 0), \
+ __uninitialized); \
+ LIBETH_XDP_ASSERT_PRIV_SZ(__VA_ARGS__ + 0)
+
+#define __libeth_xdp_priv_sz(...) \
+ CONCATENATE(__libeth_xdp_psz, COUNT_ARGS(__VA_ARGS__))(__VA_ARGS__)
+
+#define __libeth_xdp_psz0(...)
+#define __libeth_xdp_psz1(...) sizeof(__VA_ARGS__)
+
+#define LIBETH_XDP_PRIV_SZ(sz) \
+ (ALIGN(sz, __alignof(struct libeth_xdp_buff)) / sizeof(long))
+
+/* Performs XSK_CHECK_PRIV_TYPE() */
+#define LIBETH_XDP_ASSERT_PRIV_SZ(sz) \
+ static_assert(offsetofend(struct xdp_buff_xsk, cb) >= \
+ struct_size_t(struct libeth_xdp_buff, priv, \
+ LIBETH_XDP_PRIV_SZ(sz)))
+
+/* XDPSQ sharing */
+
+DECLARE_STATIC_KEY_FALSE(libeth_xdpsq_share);
+
+/**
+ * libeth_xdpsq_num - calculate optimal number of XDPSQs for this device + sys
+ * @rxq: current number of active Rx queues
+ * @txq: current number of active Tx queues
+ * @max: maximum number of Tx queues
+ *
+ * Each RQ must have its own XDPSQ for XSk pairs, each CPU must have own XDPSQ
+ * for lockless sending (``XDP_TX``, .ndo_xdp_xmit()). Cap the maximum of these
+ * two with the number of SQs the device can have (minus used ones).
+ *
+ * Return: number of XDP Tx queues the device needs to use.
+ */
+static inline u32 libeth_xdpsq_num(u32 rxq, u32 txq, u32 max)
+{
+ return min(max(nr_cpu_ids, rxq), max - txq);
+}
+
+/**
+ * libeth_xdpsq_shared - whether XDPSQs can be shared between several CPUs
+ * @num: number of active XDPSQs
+ *
+ * Return: true if there's no 1:1 XDPSQ/CPU association, false otherwise.
+ */
+static inline bool libeth_xdpsq_shared(u32 num)
+{
+ return num < nr_cpu_ids;
+}
+
+/**
+ * libeth_xdpsq_id - get XDPSQ index corresponding to this CPU
+ * @num: number of active XDPSQs
+ *
+ * Helper for libeth_xdp routines, do not use in drivers directly.
+ *
+ * Return: XDPSQ index needs to be used on this CPU.
+ */
+static inline u32 libeth_xdpsq_id(u32 num)
+{
+ u32 ret = raw_smp_processor_id();
+
+ if (static_branch_unlikely(&libeth_xdpsq_share) &&
+ libeth_xdpsq_shared(num))
+ ret %= num;
+
+ return ret;
+}
+
+void __libeth_xdpsq_get(struct libeth_xdpsq_lock *lock,
+ const struct net_device *dev);
+void __libeth_xdpsq_put(struct libeth_xdpsq_lock *lock,
+ const struct net_device *dev);
+
+/**
+ * libeth_xdpsq_get - initialize &libeth_xdpsq_lock
+ * @lock: lock to initialize
+ * @dev: netdev which this lock belongs to
+ * @share: whether XDPSQs can be shared
+ *
+ * Tracks the current XDPSQ association and enables the static lock
+ * if needed.
+ */
+static inline void libeth_xdpsq_get(struct libeth_xdpsq_lock *lock,
+ const struct net_device *dev,
+ bool share)
+{
+ if (unlikely(share))
+ __libeth_xdpsq_get(lock, dev);
+}
+
+/**
+ * libeth_xdpsq_put - deinitialize &libeth_xdpsq_lock
+ * @lock: lock to deinitialize
+ * @dev: netdev which this lock belongs to
+ *
+ * Tracks the current XDPSQ association and disables the static lock
+ * if needed.
+ */
+static inline void libeth_xdpsq_put(struct libeth_xdpsq_lock *lock,
+ const struct net_device *dev)
+{
+ if (static_branch_unlikely(&libeth_xdpsq_share) && lock->share)
+ __libeth_xdpsq_put(lock, dev);
+}
+
+void __libeth_xdpsq_lock(struct libeth_xdpsq_lock *lock);
+void __libeth_xdpsq_unlock(struct libeth_xdpsq_lock *lock);
+
+/**
+ * libeth_xdpsq_lock - grab &libeth_xdpsq_lock if needed
+ * @lock: lock to take
+ *
+ * Touches the underlying spinlock only if the static key is enabled
+ * and the queue itself is marked as shareable.
+ */
+static inline void libeth_xdpsq_lock(struct libeth_xdpsq_lock *lock)
+{
+ if (static_branch_unlikely(&libeth_xdpsq_share) && lock->share)
+ __libeth_xdpsq_lock(lock);
+}
+
+/**
+ * libeth_xdpsq_unlock - free &libeth_xdpsq_lock if needed
+ * @lock: lock to free
+ *
+ * Touches the underlying spinlock only if the static key is enabled
+ * and the queue itself is marked as shareable.
+ */
+static inline void libeth_xdpsq_unlock(struct libeth_xdpsq_lock *lock)
+{
+ if (static_branch_unlikely(&libeth_xdpsq_share) && lock->share)
+ __libeth_xdpsq_unlock(lock);
+}
+
+/* XDPSQ clean-up timers */
+
+void libeth_xdpsq_init_timer(struct libeth_xdpsq_timer *timer, void *xdpsq,
+ struct libeth_xdpsq_lock *lock,
+ void (*poll)(struct work_struct *work));
+
+/**
+ * libeth_xdpsq_deinit_timer - deinitialize &libeth_xdpsq_timer
+ * @timer: timer to deinitialize
+ *
+ * Flush and disable the underlying workqueue.
+ */
+static inline void libeth_xdpsq_deinit_timer(struct libeth_xdpsq_timer *timer)
+{
+ cancel_delayed_work_sync(&timer->dwork);
+}
+
+/**
+ * libeth_xdpsq_queue_timer - run &libeth_xdpsq_timer
+ * @timer: timer to queue
+ *
+ * Should be called after the queue was filled and the transmission was run
+ * to complete the pending buffers if no further sending will be done in a
+ * second (-> lazy cleaning won't happen).
+ * If the timer was already run, it will be requeued back to one second
+ * timeout again.
+ */
+static inline void libeth_xdpsq_queue_timer(struct libeth_xdpsq_timer *timer)
+{
+ mod_delayed_work_on(raw_smp_processor_id(), system_bh_highpri_wq,
+ &timer->dwork, HZ);
+}
+
+/**
+ * libeth_xdpsq_run_timer - wrapper to run a queue clean-up on a timer event
+ * @work: workqueue belonging to the corresponding timer
+ * @poll: driver-specific completion queue poll function
+ *
+ * Run the polling function on the locked queue and requeue the timer if
+ * there's more work to do.
+ * Designed to be used via LIBETH_XDP_DEFINE_TIMER() below.
+ */
+static __always_inline void
+libeth_xdpsq_run_timer(struct work_struct *work,
+ u32 (*poll)(void *xdpsq, u32 budget))
+{
+ struct libeth_xdpsq_timer *timer = container_of(work, typeof(*timer),
+ dwork.work);
+
+ libeth_xdpsq_lock(timer->lock);
+
+ if (poll(timer->xdpsq, U32_MAX))
+ libeth_xdpsq_queue_timer(timer);
+
+ libeth_xdpsq_unlock(timer->lock);
+}
+
+/* Common Tx bits */
+
+/**
+ * enum - libeth_xdp internal Tx flags
+ * @LIBETH_XDP_TX_BULK: one bulk size at which it will be flushed to the queue
+ * @LIBETH_XDP_TX_BATCH: batch size for which the queue fill loop is unrolled
+ * @LIBETH_XDP_TX_DROP: indicates the send function must drop frames not sent
+ * @LIBETH_XDP_TX_NDO: whether the send function is called from .ndo_xdp_xmit()
+ * @LIBETH_XDP_TX_XSK: whether the function is called for ``XDP_TX`` for XSk
+ */
+enum {
+ LIBETH_XDP_TX_BULK = DEV_MAP_BULK_SIZE,
+ LIBETH_XDP_TX_BATCH = 8,
+
+ LIBETH_XDP_TX_DROP = BIT(0),
+ LIBETH_XDP_TX_NDO = BIT(1),
+ LIBETH_XDP_TX_XSK = BIT(2),
+};
+
+/**
+ * enum - &libeth_xdp_tx_frame and &libeth_xdp_tx_desc flags
+ * @LIBETH_XDP_TX_LEN: only for ``XDP_TX``, [15:0] of ::len_fl is actual length
+ * @LIBETH_XDP_TX_CSUM: for XSk xmit, enable checksum offload
+ * @LIBETH_XDP_TX_XSKMD: for XSk xmit, mask of the metadata bits
+ * @LIBETH_XDP_TX_FIRST: indicates the frag is the first one of the frame
+ * @LIBETH_XDP_TX_LAST: whether the frag is the last one of the frame
+ * @LIBETH_XDP_TX_MULTI: whether the frame contains several frags
+ * @LIBETH_XDP_TX_FLAGS: only for ``XDP_TX``, [31:16] of ::len_fl is flags
+ */
+enum {
+ LIBETH_XDP_TX_LEN = GENMASK(15, 0),
+
+ LIBETH_XDP_TX_CSUM = XDP_TXMD_FLAGS_CHECKSUM,
+ LIBETH_XDP_TX_XSKMD = LIBETH_XDP_TX_LEN,
+
+ LIBETH_XDP_TX_FIRST = BIT(16),
+ LIBETH_XDP_TX_LAST = BIT(17),
+ LIBETH_XDP_TX_MULTI = BIT(18),
+
+ LIBETH_XDP_TX_FLAGS = GENMASK(31, 16),
+};
+
+/**
+ * struct libeth_xdp_tx_frame - represents one XDP Tx element
+ * @data: frame start pointer for ``XDP_TX``
+ * @len_fl: ``XDP_TX``, combined flags [31:16] and len [15:0] field for speed
+ * @soff: ``XDP_TX``, offset from @data to the start of &skb_shared_info
+ * @frag: one (non-head) frag for ``XDP_TX``
+ * @xdpf: &xdp_frame for the head frag for .ndo_xdp_xmit()
+ * @dma: DMA address of the non-head frag for .ndo_xdp_xmit()
+ * @xsk: ``XDP_TX`` for XSk, XDP buffer for any frag
+ * @len: frag length for XSk ``XDP_TX`` and .ndo_xdp_xmit()
+ * @flags: Tx flags for the above
+ * @opts: combined @len + @flags for the above for speed
+ * @desc: XSk xmit descriptor for direct casting
+ */
+struct libeth_xdp_tx_frame {
+ union {
+ /* ``XDP_TX`` */
+ struct {
+ void *data;
+ u32 len_fl;
+ u32 soff;
+ };
+
+ /* ``XDP_TX`` frag */
+ skb_frag_t frag;
+
+ /* .ndo_xdp_xmit(), XSk ``XDP_TX`` */
+ struct {
+ union {
+ struct xdp_frame *xdpf;
+ dma_addr_t dma;
+
+ struct libeth_xdp_buff *xsk;
+ };
+ union {
+ struct {
+ u32 len;
+ u32 flags;
+ };
+ aligned_u64 opts;
+ };
+ };
+
+ /* XSk xmit */
+ struct xdp_desc desc;
+ };
+} __aligned(sizeof(struct xdp_desc));
+static_assert(offsetof(struct libeth_xdp_tx_frame, frag.len) ==
+ offsetof(struct libeth_xdp_tx_frame, len_fl));
+static_assert(sizeof(struct libeth_xdp_tx_frame) == sizeof(struct xdp_desc));
+
+/**
+ * struct libeth_xdp_tx_bulk - XDP Tx frame bulk for bulk sending
+ * @prog: corresponding active XDP program, %NULL for .ndo_xdp_xmit()
+ * @dev: &net_device which the frames are transmitted on
+ * @xdpsq: shortcut to the corresponding driver-specific XDPSQ structure
+ * @act_mask: Rx only, mask of all the XDP prog verdicts for that NAPI session
+ * @count: current number of frames in @bulk
+ * @bulk: array of queued frames for bulk Tx
+ *
+ * All XDP Tx operations except XSk xmit queue each frame to the bulk first
+ * and flush it when @count reaches the array end. Bulk is always placed on
+ * the stack for performance. One bulk element contains all the data necessary
+ * for sending a frame and then freeing it on completion.
+ * For XSk xmit, Tx descriptor array from &xsk_buff_pool is casted directly
+ * to &libeth_xdp_tx_frame as they are compatible and the bulk structure is
+ * not used.
+ */
+struct libeth_xdp_tx_bulk {
+ const struct bpf_prog *prog;
+ struct net_device *dev;
+ void *xdpsq;
+
+ u32 act_mask;
+ u32 count;
+ struct libeth_xdp_tx_frame bulk[LIBETH_XDP_TX_BULK];
+} __aligned(sizeof(struct libeth_xdp_tx_frame));
+
+/**
+ * LIBETH_XDP_ONSTACK_BULK - declare &libeth_xdp_tx_bulk on the stack
+ * @bq: name of the variable to declare
+ *
+ * Helper to declare a bulk on the stack with a compiler hint that it should
+ * not be initialized automatically (with `CONFIG_INIT_STACK_ALL_*`) for
+ * performance reasons.
+ */
+#define LIBETH_XDP_ONSTACK_BULK(bq) \
+ struct libeth_xdp_tx_bulk bq __uninitialized
+
+/**
+ * struct libeth_xdpsq - abstraction for an XDPSQ
+ * @pool: XSk buffer pool for XSk ``XDP_TX`` and xmit
+ * @sqes: array of Tx buffers from the actual queue struct
+ * @descs: opaque pointer to the HW descriptor array
+ * @ntu: pointer to the next free descriptor index
+ * @count: number of descriptors on that queue
+ * @pending: pointer to the number of sent-not-completed descs on that queue
+ * @xdp_tx: pointer to the above, but only for non-XSk-xmit frames
+ * @lock: corresponding XDPSQ lock
+ *
+ * Abstraction for driver-independent implementation of Tx. Placed on the stack
+ * and filled by the driver before the transmission, so that the generic
+ * functions can access and modify driver-specific resources.
+ */
+struct libeth_xdpsq {
+ struct xsk_buff_pool *pool;
+ struct libeth_sqe *sqes;
+ void *descs;
+
+ u32 *ntu;
+ u32 count;
+
+ u32 *pending;
+ u32 *xdp_tx;
+ struct libeth_xdpsq_lock *lock;
+};
+
+/**
+ * struct libeth_xdp_tx_desc - abstraction for an XDP Tx descriptor
+ * @addr: DMA address of the frame
+ * @len: length of the frame
+ * @flags: XDP Tx flags
+ * @opts: combined @len + @flags for speed
+ *
+ * Filled by the generic functions and then passed to driver-specific functions
+ * to fill a HW Tx descriptor, always placed on the [function] stack.
+ */
+struct libeth_xdp_tx_desc {
+ dma_addr_t addr;
+ union {
+ struct {
+ u32 len;
+ u32 flags;
+ };
+ aligned_u64 opts;
+ };
+} __aligned_largest;
+
+/**
+ * libeth_xdp_ptr_to_priv - convert pointer to a libeth_xdp u64 priv
+ * @ptr: pointer to convert
+ *
+ * The main sending function passes private data as the largest scalar, u64.
+ * Use this helper when you want to pass a pointer there.
+ */
+#define libeth_xdp_ptr_to_priv(ptr) ({ \
+ typecheck_pointer(ptr); \
+ ((u64)(uintptr_t)(ptr)); \
+})
+/**
+ * libeth_xdp_priv_to_ptr - convert libeth_xdp u64 priv to a pointer
+ * @priv: private data to convert
+ *
+ * The main sending function passes private data as the largest scalar, u64.
+ * Use this helper when your callback takes this u64 and you want to convert
+ * it back to a pointer.
+ */
+#define libeth_xdp_priv_to_ptr(priv) ({ \
+ static_assert(__same_type(priv, u64)); \
+ ((const void *)(uintptr_t)(priv)); \
+})
+
+/*
+ * On 64-bit systems, assigning one u64 is faster than two u32s. When ::len
+ * occupies lowest 32 bits (LE), whole ::opts can be assigned directly instead.
+ */
+#ifdef __LITTLE_ENDIAN
+#define __LIBETH_WORD_ACCESS 1
+#endif
+#ifdef __LIBETH_WORD_ACCESS
+#define __libeth_xdp_tx_len(flen, ...) \
+ .opts = ((flen) | FIELD_PREP(GENMASK_ULL(63, 32), (__VA_ARGS__ + 0)))
+#else
+#define __libeth_xdp_tx_len(flen, ...) \
+ .len = (flen), .flags = (__VA_ARGS__ + 0)
+#endif
+
+/**
+ * libeth_xdp_tx_xmit_bulk - main XDP Tx function
+ * @bulk: array of frames to send
+ * @xdpsq: pointer to the driver-specific XDPSQ struct
+ * @n: number of frames to send
+ * @unroll: whether to unroll the queue filling loop for speed
+ * @priv: driver-specific private data
+ * @prep: callback for cleaning the queue and filling abstract &libeth_xdpsq
+ * @fill: internal callback for filling &libeth_sqe and &libeth_xdp_tx_desc
+ * @xmit: callback for filling a HW descriptor with the frame info
+ *
+ * Internal abstraction for placing @n XDP Tx frames on the HW XDPSQ. Used for
+ * all types of frames: ``XDP_TX``, .ndo_xdp_xmit(), XSk ``XDP_TX``, and XSk
+ * xmit.
+ * @prep must lock the queue as this function releases it at the end. @unroll
+ * greatly increases the object code size, but also greatly increases XSk xmit
+ * performance; for other types of frames, it's not enabled.
+ * The compilers inline all those onstack abstractions to direct data accesses.
+ *
+ * Return: number of frames actually placed on the queue, <= @n. The function
+ * can't fail, but can send less frames if there's no enough free descriptors
+ * available. The actual free space is returned by @prep from the driver.
+ */
+static __always_inline u32
+libeth_xdp_tx_xmit_bulk(const struct libeth_xdp_tx_frame *bulk, void *xdpsq,
+ u32 n, bool unroll, u64 priv,
+ u32 (*prep)(void *xdpsq, struct libeth_xdpsq *sq),
+ struct libeth_xdp_tx_desc
+ (*fill)(struct libeth_xdp_tx_frame frm, u32 i,
+ const struct libeth_xdpsq *sq, u64 priv),
+ void (*xmit)(struct libeth_xdp_tx_desc desc, u32 i,
+ const struct libeth_xdpsq *sq, u64 priv))
+{
+ struct libeth_xdpsq sq __uninitialized;
+ u32 this, batched, off = 0;
+ u32 ntu, i = 0;
+
+ n = min(n, prep(xdpsq, &sq));
+ if (unlikely(!n))
+ goto unlock;
+
+ ntu = *sq.ntu;
+
+ this = sq.count - ntu;
+ if (likely(this > n))
+ this = n;
+
+again:
+ if (!unroll)
+ goto linear;
+
+ batched = ALIGN_DOWN(this, LIBETH_XDP_TX_BATCH);
+
+ for ( ; i < off + batched; i += LIBETH_XDP_TX_BATCH) {
+ u32 base = ntu + i - off;
+
+ unrolled_count(LIBETH_XDP_TX_BATCH)
+ for (u32 j = 0; j < LIBETH_XDP_TX_BATCH; j++)
+ xmit(fill(bulk[i + j], base + j, &sq, priv),
+ base + j, &sq, priv);
+ }
+
+ if (batched < this) {
+linear:
+ for ( ; i < off + this; i++)
+ xmit(fill(bulk[i], ntu + i - off, &sq, priv),
+ ntu + i - off, &sq, priv);
+ }
+
+ ntu += this;
+ if (likely(ntu < sq.count))
+ goto out;
+
+ ntu = 0;
+
+ if (i < n) {
+ this = n - i;
+ off = i;
+
+ goto again;
+ }
+
+out:
+ *sq.ntu = ntu;
+ *sq.pending += n;
+ if (sq.xdp_tx)
+ *sq.xdp_tx += n;
+
+unlock:
+ libeth_xdpsq_unlock(sq.lock);
+
+ return n;
+}
+
+/* ``XDP_TX`` bulking */
+
+void libeth_xdp_return_buff_slow(struct libeth_xdp_buff *xdp);
+
+/**
+ * libeth_xdp_tx_queue_head - internal helper for queueing one ``XDP_TX`` head
+ * @bq: XDP Tx bulk to queue the head frag to
+ * @xdp: XDP buffer with the head to queue
+ *
+ * Return: false if it's the only frag of the frame, true if it's an S/G frame.
+ */
+static inline bool libeth_xdp_tx_queue_head(struct libeth_xdp_tx_bulk *bq,
+ const struct libeth_xdp_buff *xdp)
+{
+ const struct xdp_buff *base = &xdp->base;
+
+ bq->bulk[bq->count++] = (typeof(*bq->bulk)){
+ .data = xdp->data,
+ .len_fl = (base->data_end - xdp->data) | LIBETH_XDP_TX_FIRST,
+ .soff = xdp_data_hard_end(base) - xdp->data,
+ };
+
+ if (!xdp_buff_has_frags(base))
+ return false;
+
+ bq->bulk[bq->count - 1].len_fl |= LIBETH_XDP_TX_MULTI;
+
+ return true;
+}
+
+/**
+ * libeth_xdp_tx_queue_frag - internal helper for queueing one ``XDP_TX`` frag
+ * @bq: XDP Tx bulk to queue the frag to
+ * @frag: frag to queue
+ */
+static inline void libeth_xdp_tx_queue_frag(struct libeth_xdp_tx_bulk *bq,
+ const skb_frag_t *frag)
+{
+ bq->bulk[bq->count++].frag = *frag;
+}
+
+/**
+ * libeth_xdp_tx_queue_bulk - internal helper for queueing one ``XDP_TX`` frame
+ * @bq: XDP Tx bulk to queue the frame to
+ * @xdp: XDP buffer to queue
+ * @flush_bulk: driver callback to flush the bulk to the HW queue
+ *
+ * Return: true on success, false on flush error.
+ */
+static __always_inline bool
+libeth_xdp_tx_queue_bulk(struct libeth_xdp_tx_bulk *bq,
+ struct libeth_xdp_buff *xdp,
+ bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq,
+ u32 flags))
+{
+ const struct skb_shared_info *sinfo;
+ bool ret = true;
+ u32 nr_frags;
+
+ if (unlikely(bq->count == LIBETH_XDP_TX_BULK) &&
+ unlikely(!flush_bulk(bq, 0))) {
+ libeth_xdp_return_buff_slow(xdp);
+ return false;
+ }
+
+ if (!libeth_xdp_tx_queue_head(bq, xdp))
+ goto out;
+
+ sinfo = xdp_get_shared_info_from_buff(&xdp->base);
+ nr_frags = sinfo->nr_frags;
+
+ for (u32 i = 0; i < nr_frags; i++) {
+ if (unlikely(bq->count == LIBETH_XDP_TX_BULK) &&
+ unlikely(!flush_bulk(bq, 0))) {
+ ret = false;
+ break;
+ }
+
+ libeth_xdp_tx_queue_frag(bq, &sinfo->frags[i]);
+ }
+
+out:
+ bq->bulk[bq->count - 1].len_fl |= LIBETH_XDP_TX_LAST;
+ xdp->data = NULL;
+
+ return ret;
+}
+
+/**
+ * libeth_xdp_tx_fill_stats - fill &libeth_sqe with ``XDP_TX`` frame stats
+ * @sqe: SQ element to fill
+ * @desc: libeth_xdp Tx descriptor
+ * @sinfo: &skb_shared_info for this frame
+ *
+ * Internal helper for filling an SQE with the frame stats, do not use in
+ * drivers. Fills the number of frags and bytes for this frame.
+ */
+#define libeth_xdp_tx_fill_stats(sqe, desc, sinfo) \
+ __libeth_xdp_tx_fill_stats(sqe, desc, sinfo, __UNIQUE_ID(sqe_), \
+ __UNIQUE_ID(desc_), __UNIQUE_ID(sinfo_))
+
+#define __libeth_xdp_tx_fill_stats(sqe, desc, sinfo, ue, ud, us) do { \
+ const struct libeth_xdp_tx_desc *ud = (desc); \
+ const struct skb_shared_info *us; \
+ struct libeth_sqe *ue = (sqe); \
+ \
+ ue->nr_frags = 1; \
+ ue->bytes = ud->len; \
+ \
+ if (ud->flags & LIBETH_XDP_TX_MULTI) { \
+ us = (sinfo); \
+ ue->nr_frags += us->nr_frags; \
+ ue->bytes += us->xdp_frags_size; \
+ } \
+} while (0)
+
+/**
+ * libeth_xdp_tx_fill_buf - internal helper to fill one ``XDP_TX`` &libeth_sqe
+ * @frm: XDP Tx frame from the bulk
+ * @i: index on the HW queue
+ * @sq: XDPSQ abstraction for the queue
+ * @priv: private data
+ *
+ * Return: XDP Tx descriptor with the synced DMA and other info to pass to
+ * the driver callback.
+ */
+static inline struct libeth_xdp_tx_desc
+libeth_xdp_tx_fill_buf(struct libeth_xdp_tx_frame frm, u32 i,
+ const struct libeth_xdpsq *sq, u64 priv)
+{
+ struct libeth_xdp_tx_desc desc;
+ struct skb_shared_info *sinfo;
+ skb_frag_t *frag = &frm.frag;
+ struct libeth_sqe *sqe;
+ netmem_ref netmem;
+
+ if (frm.len_fl & LIBETH_XDP_TX_FIRST) {
+ sinfo = frm.data + frm.soff;
+ skb_frag_fill_netmem_desc(frag, virt_to_netmem(frm.data),
+ offset_in_page(frm.data),
+ frm.len_fl);
+ } else {
+ sinfo = NULL;
+ }
+
+ netmem = skb_frag_netmem(frag);
+ desc = (typeof(desc)){
+ .addr = page_pool_get_dma_addr_netmem(netmem) +
+ skb_frag_off(frag),
+ .len = skb_frag_size(frag) & LIBETH_XDP_TX_LEN,
+ .flags = skb_frag_size(frag) & LIBETH_XDP_TX_FLAGS,
+ };
+
+ dma_sync_single_for_device(__netmem_get_pp(netmem)->p.dev, desc.addr,
+ desc.len, DMA_BIDIRECTIONAL);
+
+ if (!sinfo)
+ return desc;
+
+ sqe = &sq->sqes[i];
+ sqe->type = LIBETH_SQE_XDP_TX;
+ sqe->sinfo = sinfo;
+ libeth_xdp_tx_fill_stats(sqe, &desc, sinfo);
+
+ return desc;
+}
+
+void libeth_xdp_tx_exception(struct libeth_xdp_tx_bulk *bq, u32 sent,
+ u32 flags);
+
+/**
+ * __libeth_xdp_tx_flush_bulk - internal helper to flush one XDP Tx bulk
+ * @bq: bulk to flush
+ * @flags: XDP TX flags (.ndo_xdp_xmit(), XSk etc.)
+ * @prep: driver-specific callback to prepare the queue for sending
+ * @fill: libeth_xdp callback to fill &libeth_sqe and &libeth_xdp_tx_desc
+ * @xmit: driver callback to fill a HW descriptor
+ *
+ * Internal abstraction to create bulk flush functions for drivers. Used for
+ * everything except XSk xmit.
+ *
+ * Return: true if anything was sent, false otherwise.
+ */
+static __always_inline bool
+__libeth_xdp_tx_flush_bulk(struct libeth_xdp_tx_bulk *bq, u32 flags,
+ u32 (*prep)(void *xdpsq, struct libeth_xdpsq *sq),
+ struct libeth_xdp_tx_desc
+ (*fill)(struct libeth_xdp_tx_frame frm, u32 i,
+ const struct libeth_xdpsq *sq, u64 priv),
+ void (*xmit)(struct libeth_xdp_tx_desc desc, u32 i,
+ const struct libeth_xdpsq *sq,
+ u64 priv))
+{
+ u32 sent, drops;
+ int err = 0;
+
+ sent = libeth_xdp_tx_xmit_bulk(bq->bulk, bq->xdpsq,
+ min(bq->count, LIBETH_XDP_TX_BULK),
+ false, 0, prep, fill, xmit);
+ drops = bq->count - sent;
+
+ if (unlikely(drops)) {
+ libeth_xdp_tx_exception(bq, sent, flags);
+ err = -ENXIO;
+ } else {
+ bq->count = 0;
+ }
+
+ trace_xdp_bulk_tx(bq->dev, sent, drops, err);
+
+ return likely(sent);
+}
+
+/**
+ * libeth_xdp_tx_flush_bulk - wrapper to define flush of one ``XDP_TX`` bulk
+ * @bq: bulk to flush
+ * @flags: Tx flags, see above
+ * @prep: driver callback to prepare the queue
+ * @xmit: driver callback to fill a HW descriptor
+ *
+ * Use via LIBETH_XDP_DEFINE_FLUSH_TX() to define an ``XDP_TX`` driver
+ * callback.
+ */
+#define libeth_xdp_tx_flush_bulk(bq, flags, prep, xmit) \
+ __libeth_xdp_tx_flush_bulk(bq, flags, prep, libeth_xdp_tx_fill_buf, \
+ xmit)
+
+/* .ndo_xdp_xmit() implementation */
+
+/**
+ * libeth_xdp_xmit_init_bulk - internal helper to initialize bulk for XDP xmit
+ * @bq: bulk to initialize
+ * @dev: target &net_device
+ * @xdpsqs: array of driver-specific XDPSQ structs
+ * @num: number of active XDPSQs (the above array length)
+ */
+#define libeth_xdp_xmit_init_bulk(bq, dev, xdpsqs, num) \
+ __libeth_xdp_xmit_init_bulk(bq, dev, (xdpsqs)[libeth_xdpsq_id(num)])
+
+static inline void __libeth_xdp_xmit_init_bulk(struct libeth_xdp_tx_bulk *bq,
+ struct net_device *dev,
+ void *xdpsq)
+{
+ bq->dev = dev;
+ bq->xdpsq = xdpsq;
+ bq->count = 0;
+}
+
+/**
+ * libeth_xdp_xmit_frame_dma - internal helper to access DMA of an &xdp_frame
+ * @xf: pointer to the XDP frame
+ *
+ * There's no place in &libeth_xdp_tx_frame to store DMA address for an
+ * &xdp_frame head. The headroom is used then, the address is placed right
+ * after the frame struct, naturally aligned.
+ *
+ * Return: pointer to the DMA address to use.
+ */
+#define libeth_xdp_xmit_frame_dma(xf) \
+ _Generic((xf), \
+ const struct xdp_frame *: \
+ (const dma_addr_t *)__libeth_xdp_xmit_frame_dma(xf), \
+ struct xdp_frame *: \
+ (dma_addr_t *)__libeth_xdp_xmit_frame_dma(xf) \
+ )
+
+static inline void *__libeth_xdp_xmit_frame_dma(const struct xdp_frame *xdpf)
+{
+ void *addr = (void *)(xdpf + 1);
+
+ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
+ __alignof(*xdpf) < sizeof(dma_addr_t))
+ addr = PTR_ALIGN(addr, sizeof(dma_addr_t));
+
+ return addr;
+}
+
+/**
+ * libeth_xdp_xmit_queue_head - internal helper for queueing one XDP xmit head
+ * @bq: XDP Tx bulk to queue the head frag to
+ * @xdpf: XDP frame with the head to queue
+ * @dev: device to perform DMA mapping
+ *
+ * Return: ``LIBETH_XDP_DROP`` on DMA mapping error,
+ * ``LIBETH_XDP_PASS`` if it's the only frag in the frame,
+ * ``LIBETH_XDP_TX`` if it's an S/G frame.
+ */
+static inline u32 libeth_xdp_xmit_queue_head(struct libeth_xdp_tx_bulk *bq,
+ struct xdp_frame *xdpf,
+ struct device *dev)
+{
+ dma_addr_t dma;
+
+ dma = dma_map_single(dev, xdpf->data, xdpf->len, DMA_TO_DEVICE);
+ if (dma_mapping_error(dev, dma))
+ return LIBETH_XDP_DROP;
+
+ *libeth_xdp_xmit_frame_dma(xdpf) = dma;
+
+ bq->bulk[bq->count++] = (typeof(*bq->bulk)){
+ .xdpf = xdpf,
+ __libeth_xdp_tx_len(xdpf->len, LIBETH_XDP_TX_FIRST),
+ };
+
+ if (!xdp_frame_has_frags(xdpf))
+ return LIBETH_XDP_PASS;
+
+ bq->bulk[bq->count - 1].flags |= LIBETH_XDP_TX_MULTI;
+
+ return LIBETH_XDP_TX;
+}
+
+/**
+ * libeth_xdp_xmit_queue_frag - internal helper for queueing one XDP xmit frag
+ * @bq: XDP Tx bulk to queue the frag to
+ * @frag: frag to queue
+ * @dev: device to perform DMA mapping
+ *
+ * Return: true on success, false on DMA mapping error.
+ */
+static inline bool libeth_xdp_xmit_queue_frag(struct libeth_xdp_tx_bulk *bq,
+ const skb_frag_t *frag,
+ struct device *dev)
+{
+ dma_addr_t dma;
+
+ dma = skb_frag_dma_map(dev, frag);
+ if (dma_mapping_error(dev, dma))
+ return false;
+
+ bq->bulk[bq->count++] = (typeof(*bq->bulk)){
+ .dma = dma,
+ __libeth_xdp_tx_len(skb_frag_size(frag)),
+ };
+
+ return true;
+}
+
+/**
+ * libeth_xdp_xmit_queue_bulk - internal helper for queueing one XDP xmit frame
+ * @bq: XDP Tx bulk to queue the frame to
+ * @xdpf: XDP frame to queue
+ * @flush_bulk: driver callback to flush the bulk to the HW queue
+ *
+ * Return: ``LIBETH_XDP_TX`` on success,
+ * ``LIBETH_XDP_DROP`` if the frame should be dropped by the stack,
+ * ``LIBETH_XDP_ABORTED`` if the frame will be dropped by libeth_xdp.
+ */
+static __always_inline u32
+libeth_xdp_xmit_queue_bulk(struct libeth_xdp_tx_bulk *bq,
+ struct xdp_frame *xdpf,
+ bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq,
+ u32 flags))
+{
+ u32 head, nr_frags, i, ret = LIBETH_XDP_TX;
+ struct device *dev = bq->dev->dev.parent;
+ const struct skb_shared_info *sinfo;
+
+ if (unlikely(bq->count == LIBETH_XDP_TX_BULK) &&
+ unlikely(!flush_bulk(bq, LIBETH_XDP_TX_NDO)))
+ return LIBETH_XDP_DROP;
+
+ head = libeth_xdp_xmit_queue_head(bq, xdpf, dev);
+ if (head == LIBETH_XDP_PASS)
+ goto out;
+ else if (head == LIBETH_XDP_DROP)
+ return LIBETH_XDP_DROP;
+
+ sinfo = xdp_get_shared_info_from_frame(xdpf);
+ nr_frags = sinfo->nr_frags;
+
+ for (i = 0; i < nr_frags; i++) {
+ if (unlikely(bq->count == LIBETH_XDP_TX_BULK) &&
+ unlikely(!flush_bulk(bq, LIBETH_XDP_TX_NDO)))
+ break;
+
+ if (!libeth_xdp_xmit_queue_frag(bq, &sinfo->frags[i], dev))
+ break;
+ }
+
+ if (unlikely(i < nr_frags))
+ ret = LIBETH_XDP_ABORTED;
+
+out:
+ bq->bulk[bq->count - 1].flags |= LIBETH_XDP_TX_LAST;
+
+ return ret;
+}
+
+/**
+ * libeth_xdp_xmit_fill_buf - internal helper to fill one XDP xmit &libeth_sqe
+ * @frm: XDP Tx frame from the bulk
+ * @i: index on the HW queue
+ * @sq: XDPSQ abstraction for the queue
+ * @priv: private data
+ *
+ * Return: XDP Tx descriptor with the mapped DMA and other info to pass to
+ * the driver callback.
+ */
+static inline struct libeth_xdp_tx_desc
+libeth_xdp_xmit_fill_buf(struct libeth_xdp_tx_frame frm, u32 i,
+ const struct libeth_xdpsq *sq, u64 priv)
+{
+ struct libeth_xdp_tx_desc desc;
+ struct libeth_sqe *sqe;
+ struct xdp_frame *xdpf;
+
+ if (frm.flags & LIBETH_XDP_TX_FIRST) {
+ xdpf = frm.xdpf;
+ desc.addr = *libeth_xdp_xmit_frame_dma(xdpf);
+ } else {
+ xdpf = NULL;
+ desc.addr = frm.dma;
+ }
+ desc.opts = frm.opts;
+
+ sqe = &sq->sqes[i];
+ dma_unmap_addr_set(sqe, dma, desc.addr);
+ dma_unmap_len_set(sqe, len, desc.len);
+
+ if (!xdpf) {
+ sqe->type = LIBETH_SQE_XDP_XMIT_FRAG;
+ return desc;
+ }
+
+ sqe->type = LIBETH_SQE_XDP_XMIT;
+ sqe->xdpf = xdpf;
+ libeth_xdp_tx_fill_stats(sqe, &desc,
+ xdp_get_shared_info_from_frame(xdpf));
+
+ return desc;
+}
+
+/**
+ * libeth_xdp_xmit_flush_bulk - wrapper to define flush of one XDP xmit bulk
+ * @bq: bulk to flush
+ * @flags: Tx flags, see __libeth_xdp_tx_flush_bulk()
+ * @prep: driver callback to prepare the queue
+ * @xmit: driver callback to fill a HW descriptor
+ *
+ * Use via LIBETH_XDP_DEFINE_FLUSH_XMIT() to define an XDP xmit driver
+ * callback.
+ */
+#define libeth_xdp_xmit_flush_bulk(bq, flags, prep, xmit) \
+ __libeth_xdp_tx_flush_bulk(bq, (flags) | LIBETH_XDP_TX_NDO, prep, \
+ libeth_xdp_xmit_fill_buf, xmit)
+
+u32 libeth_xdp_xmit_return_bulk(const struct libeth_xdp_tx_frame *bq,
+ u32 count, const struct net_device *dev);
+
+/**
+ * __libeth_xdp_xmit_do_bulk - internal function to implement .ndo_xdp_xmit()
+ * @bq: XDP Tx bulk to queue frames to
+ * @frames: XDP frames passed by the stack
+ * @n: number of frames
+ * @flags: flags passed by the stack
+ * @flush_bulk: driver callback to flush an XDP xmit bulk
+ * @finalize: driver callback to finalize sending XDP Tx frames on the queue
+ *
+ * Perform common checks, map the frags and queue them to the bulk, then flush
+ * the bulk to the XDPSQ. If requested by the stack, finalize the queue.
+ *
+ * Return: number of frames send or -errno on error.
+ */
+static __always_inline int
+__libeth_xdp_xmit_do_bulk(struct libeth_xdp_tx_bulk *bq,
+ struct xdp_frame **frames, u32 n, u32 flags,
+ bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq,
+ u32 flags),
+ void (*finalize)(void *xdpsq, bool sent, bool flush))
+{
+ u32 nxmit = 0;
+
+ if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
+ return -EINVAL;
+
+ for (u32 i = 0; likely(i < n); i++) {
+ u32 ret;
+
+ ret = libeth_xdp_xmit_queue_bulk(bq, frames[i], flush_bulk);
+ if (unlikely(ret != LIBETH_XDP_TX)) {
+ nxmit += ret == LIBETH_XDP_ABORTED;
+ break;
+ }
+
+ nxmit++;
+ }
+
+ if (bq->count) {
+ flush_bulk(bq, LIBETH_XDP_TX_NDO);
+ if (unlikely(bq->count))
+ nxmit -= libeth_xdp_xmit_return_bulk(bq->bulk,
+ bq->count,
+ bq->dev);
+ }
+
+ finalize(bq->xdpsq, nxmit, flags & XDP_XMIT_FLUSH);
+
+ return nxmit;
+}
+
+/**
+ * libeth_xdp_xmit_do_bulk - implement full .ndo_xdp_xmit() in driver
+ * @dev: target &net_device
+ * @n: number of frames to send
+ * @fr: XDP frames to send
+ * @f: flags passed by the stack
+ * @xqs: array of XDPSQs driver structs
+ * @nqs: number of active XDPSQs, the above array length
+ * @fl: driver callback to flush an XDP xmit bulk
+ * @fin: driver cabback to finalize the queue
+ *
+ * If the driver has active XDPSQs, perform common checks and send the frames.
+ * Finalize the queue, if requested.
+ *
+ * Return: number of frames sent or -errno on error.
+ */
+#define libeth_xdp_xmit_do_bulk(dev, n, fr, f, xqs, nqs, fl, fin) \
+ _libeth_xdp_xmit_do_bulk(dev, n, fr, f, xqs, nqs, fl, fin, \
+ __UNIQUE_ID(bq_), __UNIQUE_ID(ret_), \
+ __UNIQUE_ID(nqs_))
+
+#define _libeth_xdp_xmit_do_bulk(d, n, fr, f, xqs, nqs, fl, fin, ub, ur, un) \
+({ \
+ u32 un = (nqs); \
+ int ur; \
+ \
+ if (likely(un)) { \
+ LIBETH_XDP_ONSTACK_BULK(ub); \
+ \
+ libeth_xdp_xmit_init_bulk(&ub, d, xqs, un); \
+ ur = __libeth_xdp_xmit_do_bulk(&ub, fr, n, f, fl, fin); \
+ } else { \
+ ur = -ENXIO; \
+ } \
+ \
+ ur; \
+})
+
+/* Rx polling path */
+
+/**
+ * libeth_xdp_tx_init_bulk - initialize an XDP Tx bulk for Rx NAPI poll
+ * @bq: bulk to initialize
+ * @prog: RCU pointer to the XDP program (can be %NULL)
+ * @dev: target &net_device
+ * @xdpsqs: array of driver XDPSQ structs
+ * @num: number of active XDPSQs, the above array length
+ *
+ * Should be called on an onstack XDP Tx bulk before the NAPI polling loop.
+ * Initializes all the needed fields to run libeth_xdp functions. If @num == 0,
+ * assumes XDP is not enabled.
+ * Do not use for XSk, it has its own optimized helper.
+ */
+#define libeth_xdp_tx_init_bulk(bq, prog, dev, xdpsqs, num) \
+ __libeth_xdp_tx_init_bulk(bq, prog, dev, xdpsqs, num, false, \
+ __UNIQUE_ID(bq_), __UNIQUE_ID(nqs_))
+
+#define __libeth_xdp_tx_init_bulk(bq, pr, d, xdpsqs, num, xsk, ub, un) do { \
+ typeof(bq) ub = (bq); \
+ u32 un = (num); \
+ \
+ rcu_read_lock(); \
+ \
+ if (un || (xsk)) { \
+ ub->prog = rcu_dereference(pr); \
+ ub->dev = (d); \
+ ub->xdpsq = (xdpsqs)[libeth_xdpsq_id(un)]; \
+ } else { \
+ ub->prog = NULL; \
+ } \
+ \
+ ub->act_mask = 0; \
+ ub->count = 0; \
+} while (0)
+
+void libeth_xdp_load_stash(struct libeth_xdp_buff *dst,
+ const struct libeth_xdp_buff_stash *src);
+void libeth_xdp_save_stash(struct libeth_xdp_buff_stash *dst,
+ const struct libeth_xdp_buff *src);
+void __libeth_xdp_return_stash(struct libeth_xdp_buff_stash *stash);
+
+/**
+ * libeth_xdp_init_buff - initialize a &libeth_xdp_buff for Rx NAPI poll
+ * @dst: onstack buffer to initialize
+ * @src: XDP buffer stash placed on the queue
+ * @rxq: registered &xdp_rxq_info corresponding to this queue
+ *
+ * Should be called before the main NAPI polling loop. Loads the content of
+ * the previously saved stash or initializes the buffer from scratch.
+ * Do not use for XSk.
+ */
+static inline void
+libeth_xdp_init_buff(struct libeth_xdp_buff *dst,
+ const struct libeth_xdp_buff_stash *src,
+ struct xdp_rxq_info *rxq)
+{
+ if (likely(!src->data))
+ dst->data = NULL;
+ else
+ libeth_xdp_load_stash(dst, src);
+
+ dst->base.rxq = rxq;
+}
+
+/**
+ * libeth_xdp_save_buff - save a partially built buffer on a queue
+ * @dst: XDP buffer stash placed on the queue
+ * @src: onstack buffer to save
+ *
+ * Should be called after the main NAPI polling loop. If the loop exited before
+ * the buffer was finished, saves its content on the queue, so that it can be
+ * completed during the next poll. Otherwise, clears the stash.
+ */
+static inline void libeth_xdp_save_buff(struct libeth_xdp_buff_stash *dst,
+ const struct libeth_xdp_buff *src)
+{
+ if (likely(!src->data))
+ dst->data = NULL;
+ else
+ libeth_xdp_save_stash(dst, src);
+}
+
+/**
+ * libeth_xdp_return_stash - free an XDP buffer stash from a queue
+ * @stash: stash to free
+ *
+ * If the queue is about to be destroyed, but it still has an incompleted
+ * buffer stash, this helper should be called to free it.
+ */
+static inline void libeth_xdp_return_stash(struct libeth_xdp_buff_stash *stash)
+{
+ if (stash->data)
+ __libeth_xdp_return_stash(stash);
+}
+
+static inline void libeth_xdp_return_va(const void *data, bool napi)
+{
+ netmem_ref netmem = virt_to_netmem(data);
+
+ page_pool_put_full_netmem(__netmem_get_pp(netmem), netmem, napi);
+}
+
+static inline void libeth_xdp_return_frags(const struct skb_shared_info *sinfo,
+ bool napi)
+{
+ for (u32 i = 0; i < sinfo->nr_frags; i++) {
+ netmem_ref netmem = skb_frag_netmem(&sinfo->frags[i]);
+
+ page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, napi);
+ }
+}
+
+/**
+ * libeth_xdp_return_buff - free/recycle &libeth_xdp_buff
+ * @xdp: buffer to free
+ *
+ * Hotpath helper to free &libeth_xdp_buff. Comparing to xdp_return_buff(),
+ * it's faster as it gets inlined and always assumes order-0 pages and safe
+ * direct recycling. Zeroes @xdp->data to avoid UAFs.
+ */
+#define libeth_xdp_return_buff(xdp) __libeth_xdp_return_buff(xdp, true)
+
+static inline void __libeth_xdp_return_buff(struct libeth_xdp_buff *xdp,
+ bool napi)
+{
+ if (!xdp_buff_has_frags(&xdp->base))
+ goto out;
+
+ libeth_xdp_return_frags(xdp_get_shared_info_from_buff(&xdp->base),
+ napi);
+
+out:
+ libeth_xdp_return_va(xdp->data, napi);
+ xdp->data = NULL;
+}
+
+bool libeth_xdp_buff_add_frag(struct libeth_xdp_buff *xdp,
+ const struct libeth_fqe *fqe,
+ u32 len);
+
+/**
+ * libeth_xdp_prepare_buff - fill &libeth_xdp_buff with head FQE data
+ * @xdp: XDP buffer to attach the head to
+ * @fqe: FQE containing the head buffer
+ * @len: buffer len passed from HW
+ *
+ * Internal, use libeth_xdp_process_buff() instead. Initializes XDP buffer
+ * head with the Rx buffer data: data pointer, length, headroom, and
+ * truesize/tailroom. Zeroes the flags.
+ * Uses faster single u64 write instead of per-field access.
+ */
+static inline void libeth_xdp_prepare_buff(struct libeth_xdp_buff *xdp,
+ const struct libeth_fqe *fqe,
+ u32 len)
+{
+ const struct page *page = __netmem_to_page(fqe->netmem);
+
+#ifdef __LIBETH_WORD_ACCESS
+ static_assert(offsetofend(typeof(xdp->base), flags) -
+ offsetof(typeof(xdp->base), frame_sz) ==
+ sizeof(u64));
+
+ *(u64 *)&xdp->base.frame_sz = fqe->truesize;
+#else
+ xdp_init_buff(&xdp->base, fqe->truesize, xdp->base.rxq);
+#endif
+ xdp_prepare_buff(&xdp->base, page_address(page) + fqe->offset,
+ pp_page_to_nmdesc(page)->pp->p.offset, len, true);
+}
+
+/**
+ * libeth_xdp_process_buff - attach Rx buffer to &libeth_xdp_buff
+ * @xdp: XDP buffer to attach the Rx buffer to
+ * @fqe: Rx buffer to process
+ * @len: received data length from the descriptor
+ *
+ * If the XDP buffer is empty, attaches the Rx buffer as head and initializes
+ * the required fields. Otherwise, attaches the buffer as a frag.
+ * Already performs DMA sync-for-CPU and frame start prefetch
+ * (for head buffers only).
+ *
+ * Return: true on success, false if the descriptor must be skipped (empty or
+ * no space for a new frag).
+ */
+static inline bool libeth_xdp_process_buff(struct libeth_xdp_buff *xdp,
+ const struct libeth_fqe *fqe,
+ u32 len)
+{
+ if (!libeth_rx_sync_for_cpu(fqe, len))
+ return false;
+
+ if (xdp->data)
+ return libeth_xdp_buff_add_frag(xdp, fqe, len);
+
+ libeth_xdp_prepare_buff(xdp, fqe, len);
+
+ prefetch(xdp->data);
+
+ return true;
+}
+
+/**
+ * libeth_xdp_buff_stats_frags - update onstack RQ stats with XDP frags info
+ * @ss: onstack stats to update
+ * @xdp: buffer to account
+ *
+ * Internal helper used by __libeth_xdp_run_pass(), do not call directly.
+ * Adds buffer's frags count and total len to the onstack stats.
+ */
+static inline void
+libeth_xdp_buff_stats_frags(struct libeth_rq_napi_stats *ss,
+ const struct libeth_xdp_buff *xdp)
+{
+ const struct skb_shared_info *sinfo;
+
+ sinfo = xdp_get_shared_info_from_buff(&xdp->base);
+ ss->bytes += sinfo->xdp_frags_size;
+ ss->fragments += sinfo->nr_frags + 1;
+}
+
+u32 libeth_xdp_prog_exception(const struct libeth_xdp_tx_bulk *bq,
+ struct libeth_xdp_buff *xdp,
+ enum xdp_action act, int ret);
+
+/**
+ * __libeth_xdp_run_prog - run XDP program on an XDP buffer
+ * @xdp: XDP buffer to run the prog on
+ * @bq: buffer bulk for ``XDP_TX`` queueing
+ *
+ * Internal inline abstraction to run XDP program. Handles ``XDP_DROP``
+ * and ``XDP_REDIRECT`` only, the rest is processed levels up.
+ * Reports an XDP prog exception on errors.
+ *
+ * Return: libeth_xdp prog verdict depending on the prog's verdict.
+ */
+static __always_inline u32
+__libeth_xdp_run_prog(struct libeth_xdp_buff *xdp,
+ const struct libeth_xdp_tx_bulk *bq)
+{
+ enum xdp_action act;
+
+ act = bpf_prog_run_xdp(bq->prog, &xdp->base);
+ if (unlikely(act < XDP_DROP || act > XDP_REDIRECT))
+ goto out;
+
+ switch (act) {
+ case XDP_PASS:
+ return LIBETH_XDP_PASS;
+ case XDP_DROP:
+ libeth_xdp_return_buff(xdp);
+
+ return LIBETH_XDP_DROP;
+ case XDP_TX:
+ return LIBETH_XDP_TX;
+ case XDP_REDIRECT:
+ if (unlikely(xdp_do_redirect(bq->dev, &xdp->base, bq->prog)))
+ break;
+
+ xdp->data = NULL;
+
+ return LIBETH_XDP_REDIRECT;
+ default:
+ break;
+ }
+
+out:
+ return libeth_xdp_prog_exception(bq, xdp, act, 0);
+}
+
+/**
+ * __libeth_xdp_run_flush - run XDP program and handle ``XDP_TX`` verdict
+ * @xdp: XDP buffer to run the prog on
+ * @bq: buffer bulk for ``XDP_TX`` queueing
+ * @run: internal callback for running XDP program
+ * @queue: internal callback for queuing ``XDP_TX`` frame
+ * @flush_bulk: driver callback for flushing a bulk
+ *
+ * Internal inline abstraction to run XDP program and additionally handle
+ * ``XDP_TX`` verdict. Used by both XDP and XSk, hence @run and @queue.
+ * Do not use directly.
+ *
+ * Return: libeth_xdp prog verdict depending on the prog's verdict.
+ */
+static __always_inline u32
+__libeth_xdp_run_flush(struct libeth_xdp_buff *xdp,
+ struct libeth_xdp_tx_bulk *bq,
+ u32 (*run)(struct libeth_xdp_buff *xdp,
+ const struct libeth_xdp_tx_bulk *bq),
+ bool (*queue)(struct libeth_xdp_tx_bulk *bq,
+ struct libeth_xdp_buff *xdp,
+ bool (*flush_bulk)
+ (struct libeth_xdp_tx_bulk *bq,
+ u32 flags)),
+ bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq,
+ u32 flags))
+{
+ u32 act;
+
+ act = run(xdp, bq);
+ if (act == LIBETH_XDP_TX && unlikely(!queue(bq, xdp, flush_bulk)))
+ act = LIBETH_XDP_DROP;
+
+ bq->act_mask |= act;
+
+ return act;
+}
+
+/**
+ * libeth_xdp_run_prog - run XDP program (non-XSk path) and handle all verdicts
+ * @xdp: XDP buffer to process
+ * @bq: XDP Tx bulk to queue ``XDP_TX`` buffers
+ * @fl: driver ``XDP_TX`` bulk flush callback
+ *
+ * Run the attached XDP program and handle all possible verdicts. XSk has its
+ * own version.
+ * Prefer using it via LIBETH_XDP_DEFINE_RUN{,_PASS,_PROG}().
+ *
+ * Return: true if the buffer should be passed up the stack, false if the poll
+ * should go to the next buffer.
+ */
+#define libeth_xdp_run_prog(xdp, bq, fl) \
+ (__libeth_xdp_run_flush(xdp, bq, __libeth_xdp_run_prog, \
+ libeth_xdp_tx_queue_bulk, \
+ fl) == LIBETH_XDP_PASS)
+
+/**
+ * __libeth_xdp_run_pass - helper to run XDP program and handle the result
+ * @xdp: XDP buffer to process
+ * @bq: XDP Tx bulk to queue ``XDP_TX`` frames
+ * @napi: NAPI to build an skb and pass it up the stack
+ * @rs: onstack libeth RQ stats
+ * @md: metadata that should be filled to the XDP buffer
+ * @prep: callback for filling the metadata
+ * @run: driver wrapper to run XDP program
+ * @populate: driver callback to populate an skb with the HW descriptor data
+ *
+ * Inline abstraction that does the following (non-XSk path):
+ * 1) adds frame size and frag number (if needed) to the onstack stats;
+ * 2) fills the descriptor metadata to the onstack &libeth_xdp_buff
+ * 3) runs XDP program if present;
+ * 4) handles all possible verdicts;
+ * 5) on ``XDP_PASS`, builds an skb from the buffer;
+ * 6) populates it with the descriptor metadata;
+ * 7) passes it up the stack.
+ *
+ * In most cases, number 2 means just writing the pointer to the HW descriptor
+ * to the XDP buffer. If so, please use LIBETH_XDP_DEFINE_RUN{,_PASS}()
+ * wrappers to build a driver function.
+ */
+static __always_inline void
+__libeth_xdp_run_pass(struct libeth_xdp_buff *xdp,
+ struct libeth_xdp_tx_bulk *bq, struct napi_struct *napi,
+ struct libeth_rq_napi_stats *rs, const void *md,
+ void (*prep)(struct libeth_xdp_buff *xdp,
+ const void *md),
+ bool (*run)(struct libeth_xdp_buff *xdp,
+ struct libeth_xdp_tx_bulk *bq),
+ bool (*populate)(struct sk_buff *skb,
+ const struct libeth_xdp_buff *xdp,
+ struct libeth_rq_napi_stats *rs))
+{
+ struct sk_buff *skb;
+
+ rs->bytes += xdp->base.data_end - xdp->data;
+ rs->packets++;
+
+ if (xdp_buff_has_frags(&xdp->base))
+ libeth_xdp_buff_stats_frags(rs, xdp);
+
+ if (prep && (!__builtin_constant_p(!!md) || md))
+ prep(xdp, md);
+
+ if (!bq || !run || !bq->prog)
+ goto build;
+
+ if (!run(xdp, bq))
+ return;
+
+build:
+ skb = xdp_build_skb_from_buff(&xdp->base);
+ if (unlikely(!skb)) {
+ libeth_xdp_return_buff_slow(xdp);
+ return;
+ }
+
+ xdp->data = NULL;
+
+ if (unlikely(!populate(skb, xdp, rs))) {
+ napi_consume_skb(skb, true);
+ return;
+ }
+
+ napi_gro_receive(napi, skb);
+}
+
+static inline void libeth_xdp_prep_desc(struct libeth_xdp_buff *xdp,
+ const void *desc)
+{
+ xdp->desc = desc;
+}
+
+/**
+ * libeth_xdp_run_pass - helper to run XDP program and handle the result
+ * @xdp: XDP buffer to process
+ * @bq: XDP Tx bulk to queue ``XDP_TX`` frames
+ * @napi: NAPI to build an skb and pass it up the stack
+ * @ss: onstack libeth RQ stats
+ * @desc: pointer to the HW descriptor for that frame
+ * @run: driver wrapper to run XDP program
+ * @populate: driver callback to populate an skb with the HW descriptor data
+ *
+ * Wrapper around the underscored version when "fill the descriptor metadata"
+ * means just writing the pointer to the HW descriptor as @xdp->desc.
+ */
+#define libeth_xdp_run_pass(xdp, bq, napi, ss, desc, run, populate) \
+ __libeth_xdp_run_pass(xdp, bq, napi, ss, desc, libeth_xdp_prep_desc, \
+ run, populate)
+
+/**
+ * libeth_xdp_finalize_rx - finalize XDPSQ after a NAPI polling loop (non-XSk)
+ * @bq: ``XDP_TX`` frame bulk
+ * @flush: driver callback to flush the bulk
+ * @finalize: driver callback to start sending the frames and run the timer
+ *
+ * Flush the bulk if there are frames left to send, kick the queue and flush
+ * the XDP maps.
+ */
+#define libeth_xdp_finalize_rx(bq, flush, finalize) \
+ __libeth_xdp_finalize_rx(bq, 0, flush, finalize)
+
+static __always_inline void
+__libeth_xdp_finalize_rx(struct libeth_xdp_tx_bulk *bq, u32 flags,
+ bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq,
+ u32 flags),
+ void (*finalize)(void *xdpsq, bool sent, bool flush))
+{
+ if (bq->act_mask & LIBETH_XDP_TX) {
+ if (bq->count)
+ flush_bulk(bq, flags | LIBETH_XDP_TX_DROP);
+ finalize(bq->xdpsq, true, true);
+ }
+ if (bq->act_mask & LIBETH_XDP_REDIRECT)
+ xdp_do_flush();
+
+ rcu_read_unlock();
+}
+
+/*
+ * Helpers to reduce boilerplate code in drivers.
+ *
+ * Typical driver Rx flow would be (excl. bulk and buff init, frag attach):
+ *
+ * LIBETH_XDP_DEFINE_START();
+ * LIBETH_XDP_DEFINE_FLUSH_TX(static driver_xdp_flush_tx, driver_xdp_tx_prep,
+ * driver_xdp_xmit);
+ * LIBETH_XDP_DEFINE_RUN(static driver_xdp_run, driver_xdp_run_prog,
+ * driver_xdp_flush_tx, driver_populate_skb);
+ * LIBETH_XDP_DEFINE_FINALIZE(static driver_xdp_finalize_rx,
+ * driver_xdp_flush_tx, driver_xdp_finalize_sq);
+ * LIBETH_XDP_DEFINE_END();
+ *
+ * This will build a set of 4 static functions. The compiler is free to decide
+ * whether to inline them.
+ * Then, in the NAPI polling function:
+ *
+ * while (packets < budget) {
+ * // ...
+ * driver_xdp_run(xdp, &bq, napi, &rs, desc);
+ * }
+ * driver_xdp_finalize_rx(&bq);
+ */
+
+#define LIBETH_XDP_DEFINE_START() \
+ __diag_push(); \
+ __diag_ignore(GCC, 8, "-Wold-style-declaration", \
+ "Allow specifying \'static\' after the return type")
+
+/**
+ * LIBETH_XDP_DEFINE_TIMER - define a driver XDPSQ cleanup timer callback
+ * @name: name of the function to define
+ * @poll: Tx polling/completion function
+ */
+#define LIBETH_XDP_DEFINE_TIMER(name, poll) \
+void name(struct work_struct *work) \
+{ \
+ libeth_xdpsq_run_timer(work, poll); \
+}
+
+/**
+ * LIBETH_XDP_DEFINE_FLUSH_TX - define a driver ``XDP_TX`` bulk flush function
+ * @name: name of the function to define
+ * @prep: driver callback to clean an XDPSQ
+ * @xmit: driver callback to write a HW Tx descriptor
+ */
+#define LIBETH_XDP_DEFINE_FLUSH_TX(name, prep, xmit) \
+ __LIBETH_XDP_DEFINE_FLUSH_TX(name, prep, xmit, xdp)
+
+#define __LIBETH_XDP_DEFINE_FLUSH_TX(name, prep, xmit, pfx) \
+bool name(struct libeth_xdp_tx_bulk *bq, u32 flags) \
+{ \
+ return libeth_##pfx##_tx_flush_bulk(bq, flags, prep, xmit); \
+}
+
+/**
+ * LIBETH_XDP_DEFINE_FLUSH_XMIT - define a driver XDP xmit bulk flush function
+ * @name: name of the function to define
+ * @prep: driver callback to clean an XDPSQ
+ * @xmit: driver callback to write a HW Tx descriptor
+ */
+#define LIBETH_XDP_DEFINE_FLUSH_XMIT(name, prep, xmit) \
+bool name(struct libeth_xdp_tx_bulk *bq, u32 flags) \
+{ \
+ return libeth_xdp_xmit_flush_bulk(bq, flags, prep, xmit); \
+}
+
+/**
+ * LIBETH_XDP_DEFINE_RUN_PROG - define a driver XDP program run function
+ * @name: name of the function to define
+ * @flush: driver callback to flush an ``XDP_TX`` bulk
+ */
+#define LIBETH_XDP_DEFINE_RUN_PROG(name, flush) \
+ bool __LIBETH_XDP_DEFINE_RUN_PROG(name, flush, xdp)
+
+#define __LIBETH_XDP_DEFINE_RUN_PROG(name, flush, pfx) \
+name(struct libeth_xdp_buff *xdp, struct libeth_xdp_tx_bulk *bq) \
+{ \
+ return libeth_##pfx##_run_prog(xdp, bq, flush); \
+}
+
+/**
+ * LIBETH_XDP_DEFINE_RUN_PASS - define a driver buffer process + pass function
+ * @name: name of the function to define
+ * @run: driver callback to run XDP program (above)
+ * @populate: driver callback to fill an skb with HW descriptor info
+ */
+#define LIBETH_XDP_DEFINE_RUN_PASS(name, run, populate) \
+ void __LIBETH_XDP_DEFINE_RUN_PASS(name, run, populate, xdp)
+
+#define __LIBETH_XDP_DEFINE_RUN_PASS(name, run, populate, pfx) \
+name(struct libeth_xdp_buff *xdp, struct libeth_xdp_tx_bulk *bq, \
+ struct napi_struct *napi, struct libeth_rq_napi_stats *ss, \
+ const void *desc) \
+{ \
+ return libeth_##pfx##_run_pass(xdp, bq, napi, ss, desc, run, \
+ populate); \
+}
+
+/**
+ * LIBETH_XDP_DEFINE_RUN - define a driver buffer process, run + pass function
+ * @name: name of the function to define
+ * @run: name of the XDP prog run function to define
+ * @flush: driver callback to flush an ``XDP_TX`` bulk
+ * @populate: driver callback to fill an skb with HW descriptor info
+ */
+#define LIBETH_XDP_DEFINE_RUN(name, run, flush, populate) \
+ __LIBETH_XDP_DEFINE_RUN(name, run, flush, populate, XDP)
+
+#define __LIBETH_XDP_DEFINE_RUN(name, run, flush, populate, pfx) \
+ LIBETH_##pfx##_DEFINE_RUN_PROG(static run, flush); \
+ LIBETH_##pfx##_DEFINE_RUN_PASS(name, run, populate)
+
+/**
+ * LIBETH_XDP_DEFINE_FINALIZE - define a driver Rx NAPI poll finalize function
+ * @name: name of the function to define
+ * @flush: driver callback to flush an ``XDP_TX`` bulk
+ * @finalize: driver callback to finalize an XDPSQ and run the timer
+ */
+#define LIBETH_XDP_DEFINE_FINALIZE(name, flush, finalize) \
+ __LIBETH_XDP_DEFINE_FINALIZE(name, flush, finalize, xdp)
+
+#define __LIBETH_XDP_DEFINE_FINALIZE(name, flush, finalize, pfx) \
+void name(struct libeth_xdp_tx_bulk *bq) \
+{ \
+ libeth_##pfx##_finalize_rx(bq, flush, finalize); \
+}
+
+#define LIBETH_XDP_DEFINE_END() __diag_pop()
+
+/* XMO */
+
+/**
+ * libeth_xdp_buff_to_rq - get RQ pointer from an XDP buffer pointer
+ * @xdp: &libeth_xdp_buff corresponding to the queue
+ * @type: typeof() of the driver Rx queue structure
+ * @member: name of &xdp_rxq_info inside @type
+ *
+ * Often times, pointer to the RQ is needed when reading/filling metadata from
+ * HW descriptors. The helper can be used to quickly jump from an XDP buffer
+ * to the queue corresponding to its &xdp_rxq_info without introducing
+ * additional fields (&libeth_xdp_buff is precisely 1 cacheline long on x64).
+ */
+#define libeth_xdp_buff_to_rq(xdp, type, member) \
+ container_of_const((xdp)->base.rxq, type, member)
+
+/**
+ * libeth_xdpmo_rx_hash - convert &libeth_rx_pt to an XDP RSS hash metadata
+ * @hash: pointer to the variable to write the hash to
+ * @rss_type: pointer to the variable to write the hash type to
+ * @val: hash value from the HW descriptor
+ * @pt: libeth parsed packet type
+ *
+ * Handle zeroed/non-available hash and convert libeth parsed packet type to
+ * the corresponding XDP RSS hash type. To be called at the end of
+ * xdp_metadata_ops idpf_xdpmo::xmo_rx_hash() implementation.
+ * Note that if the driver doesn't use a constant packet type lookup table but
+ * generates it at runtime, it must call libeth_rx_pt_gen_hash_type(pt) to
+ * generate XDP RSS hash type for each packet type.
+ *
+ * Return: 0 on success, -ENODATA when the hash is not available.
+ */
+static inline int libeth_xdpmo_rx_hash(u32 *hash,
+ enum xdp_rss_hash_type *rss_type,
+ u32 val, struct libeth_rx_pt pt)
+{
+ if (unlikely(!val))
+ return -ENODATA;
+
+ *hash = val;
+ *rss_type = pt.hash_type;
+
+ return 0;
+}
+
+/* Tx buffer completion */
+
+void libeth_xdp_return_buff_bulk(const struct skb_shared_info *sinfo,
+ struct xdp_frame_bulk *bq, bool frags);
+void libeth_xsk_buff_free_slow(struct libeth_xdp_buff *xdp);
+
+/**
+ * __libeth_xdp_complete_tx - complete sent XDPSQE
+ * @sqe: SQ element / Tx buffer to complete
+ * @cp: Tx polling/completion params
+ * @bulk: internal callback to bulk-free ``XDP_TX`` buffers
+ * @xsk: internal callback to free XSk ``XDP_TX`` buffers
+ *
+ * Use the non-underscored version in drivers instead. This one is shared
+ * internally with libeth_tx_complete_any().
+ * Complete an XDPSQE of any type of XDP frame. This includes DMA unmapping
+ * when needed, buffer freeing, stats update, and SQE invalidation.
+ */
+static __always_inline void
+__libeth_xdp_complete_tx(struct libeth_sqe *sqe, struct libeth_cq_pp *cp,
+ typeof(libeth_xdp_return_buff_bulk) bulk,
+ typeof(libeth_xsk_buff_free_slow) xsk)
+{
+ enum libeth_sqe_type type = sqe->type;
+
+ switch (type) {
+ case LIBETH_SQE_EMPTY:
+ return;
+ case LIBETH_SQE_XDP_XMIT:
+ case LIBETH_SQE_XDP_XMIT_FRAG:
+ dma_unmap_page(cp->dev, dma_unmap_addr(sqe, dma),
+ dma_unmap_len(sqe, len), DMA_TO_DEVICE);
+ break;
+ default:
+ break;
+ }
+
+ switch (type) {
+ case LIBETH_SQE_XDP_TX:
+ bulk(sqe->sinfo, cp->bq, sqe->nr_frags != 1);
+ break;
+ case LIBETH_SQE_XDP_XMIT:
+ xdp_return_frame_bulk(sqe->xdpf, cp->bq);
+ break;
+ case LIBETH_SQE_XSK_TX:
+ case LIBETH_SQE_XSK_TX_FRAG:
+ xsk(sqe->xsk);
+ break;
+ default:
+ break;
+ }
+
+ switch (type) {
+ case LIBETH_SQE_XDP_TX:
+ case LIBETH_SQE_XDP_XMIT:
+ case LIBETH_SQE_XSK_TX:
+ cp->xdp_tx -= sqe->nr_frags;
+
+ cp->xss->packets++;
+ cp->xss->bytes += sqe->bytes;
+ break;
+ default:
+ break;
+ }
+
+ sqe->type = LIBETH_SQE_EMPTY;
+}
+
+static inline void libeth_xdp_complete_tx(struct libeth_sqe *sqe,
+ struct libeth_cq_pp *cp)
+{
+ __libeth_xdp_complete_tx(sqe, cp, libeth_xdp_return_buff_bulk,
+ libeth_xsk_buff_free_slow);
+}
+
+/* Misc */
+
+u32 libeth_xdp_queue_threshold(u32 count);
+
+void __libeth_xdp_set_features(struct net_device *dev,
+ const struct xdp_metadata_ops *xmo,
+ u32 zc_segs,
+ const struct xsk_tx_metadata_ops *tmo);
+void libeth_xdp_set_redirect(struct net_device *dev, bool enable);
+
+/**
+ * libeth_xdp_set_features - set XDP features for netdev
+ * @dev: &net_device to configure
+ * @...: optional params, see __libeth_xdp_set_features()
+ *
+ * Set all the features libeth_xdp supports, including .ndo_xdp_xmit(). That
+ * said, it should be used only when XDPSQs are always available regardless
+ * of whether an XDP prog is attached to @dev.
+ */
+#define libeth_xdp_set_features(dev, ...) \
+ CONCATENATE(__libeth_xdp_feat, \
+ COUNT_ARGS(__VA_ARGS__))(dev, ##__VA_ARGS__)
+
+#define __libeth_xdp_feat0(dev) \
+ __libeth_xdp_set_features(dev, NULL, 0, NULL)
+#define __libeth_xdp_feat1(dev, xmo) \
+ __libeth_xdp_set_features(dev, xmo, 0, NULL)
+#define __libeth_xdp_feat2(dev, xmo, zc_segs) \
+ __libeth_xdp_set_features(dev, xmo, zc_segs, NULL)
+#define __libeth_xdp_feat3(dev, xmo, zc_segs, tmo) \
+ __libeth_xdp_set_features(dev, xmo, zc_segs, tmo)
+
+/**
+ * libeth_xdp_set_features_noredir - enable all libeth_xdp features w/o redir
+ * @dev: target &net_device
+ * @...: optional params, see __libeth_xdp_set_features()
+ *
+ * Enable everything except the .ndo_xdp_xmit() feature, use when XDPSQs are
+ * not available right after netdev registration.
+ */
+#define libeth_xdp_set_features_noredir(dev, ...) \
+ __libeth_xdp_set_features_noredir(dev, __UNIQUE_ID(dev_), \
+ ##__VA_ARGS__)
+
+#define __libeth_xdp_set_features_noredir(dev, ud, ...) do { \
+ struct net_device *ud = (dev); \
+ \
+ libeth_xdp_set_features(ud, ##__VA_ARGS__); \
+ libeth_xdp_set_redirect(ud, false); \
+} while (0)
+
+#define libeth_xsktmo ((const void *)GOLDEN_RATIO_PRIME)
+
+#endif /* __LIBETH_XDP_H */
diff --git a/include/net/libeth/xsk.h b/include/net/libeth/xsk.h
new file mode 100644
index 000000000000..481a7b28e6f2
--- /dev/null
+++ b/include/net/libeth/xsk.h
@@ -0,0 +1,685 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (C) 2025 Intel Corporation */
+
+#ifndef __LIBETH_XSK_H
+#define __LIBETH_XSK_H
+
+#include <net/libeth/xdp.h>
+#include <net/xdp_sock_drv.h>
+
+/* ``XDP_TXMD_FLAGS_VALID`` is defined only under ``CONFIG_XDP_SOCKETS`` */
+#ifdef XDP_TXMD_FLAGS_VALID
+static_assert(XDP_TXMD_FLAGS_VALID <= LIBETH_XDP_TX_XSKMD);
+#endif
+
+/* ``XDP_TX`` bulking */
+
+/**
+ * libeth_xsk_tx_queue_head - internal helper for queueing XSk ``XDP_TX`` head
+ * @bq: XDP Tx bulk to queue the head frag to
+ * @xdp: XSk buffer with the head to queue
+ *
+ * Return: false if it's the only frag of the frame, true if it's an S/G frame.
+ */
+static inline bool libeth_xsk_tx_queue_head(struct libeth_xdp_tx_bulk *bq,
+ struct libeth_xdp_buff *xdp)
+{
+ bq->bulk[bq->count++] = (typeof(*bq->bulk)){
+ .xsk = xdp,
+ __libeth_xdp_tx_len(xdp->base.data_end - xdp->data,
+ LIBETH_XDP_TX_FIRST),
+ };
+
+ if (likely(!xdp_buff_has_frags(&xdp->base)))
+ return false;
+
+ bq->bulk[bq->count - 1].flags |= LIBETH_XDP_TX_MULTI;
+
+ return true;
+}
+
+/**
+ * libeth_xsk_tx_queue_frag - internal helper for queueing XSk ``XDP_TX`` frag
+ * @bq: XDP Tx bulk to queue the frag to
+ * @frag: XSk frag to queue
+ */
+static inline void libeth_xsk_tx_queue_frag(struct libeth_xdp_tx_bulk *bq,
+ struct libeth_xdp_buff *frag)
+{
+ bq->bulk[bq->count++] = (typeof(*bq->bulk)){
+ .xsk = frag,
+ __libeth_xdp_tx_len(frag->base.data_end - frag->data),
+ };
+}
+
+/**
+ * libeth_xsk_tx_queue_bulk - internal helper for queueing XSk ``XDP_TX`` frame
+ * @bq: XDP Tx bulk to queue the frame to
+ * @xdp: XSk buffer to queue
+ * @flush_bulk: driver callback to flush the bulk to the HW queue
+ *
+ * Return: true on success, false on flush error.
+ */
+static __always_inline bool
+libeth_xsk_tx_queue_bulk(struct libeth_xdp_tx_bulk *bq,
+ struct libeth_xdp_buff *xdp,
+ bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq,
+ u32 flags))
+{
+ bool ret = true;
+
+ if (unlikely(bq->count == LIBETH_XDP_TX_BULK) &&
+ unlikely(!flush_bulk(bq, LIBETH_XDP_TX_XSK))) {
+ libeth_xsk_buff_free_slow(xdp);
+ return false;
+ }
+
+ if (!libeth_xsk_tx_queue_head(bq, xdp))
+ goto out;
+
+ for (const struct libeth_xdp_buff *head = xdp; ; ) {
+ xdp = container_of(xsk_buff_get_frag(&head->base),
+ typeof(*xdp), base);
+ if (!xdp)
+ break;
+
+ if (unlikely(bq->count == LIBETH_XDP_TX_BULK) &&
+ unlikely(!flush_bulk(bq, LIBETH_XDP_TX_XSK))) {
+ ret = false;
+ break;
+ }
+
+ libeth_xsk_tx_queue_frag(bq, xdp);
+ }
+
+out:
+ bq->bulk[bq->count - 1].flags |= LIBETH_XDP_TX_LAST;
+
+ return ret;
+}
+
+/**
+ * libeth_xsk_tx_fill_buf - internal helper to fill XSk ``XDP_TX`` &libeth_sqe
+ * @frm: XDP Tx frame from the bulk
+ * @i: index on the HW queue
+ * @sq: XDPSQ abstraction for the queue
+ * @priv: private data
+ *
+ * Return: XDP Tx descriptor with the synced DMA and other info to pass to
+ * the driver callback.
+ */
+static inline struct libeth_xdp_tx_desc
+libeth_xsk_tx_fill_buf(struct libeth_xdp_tx_frame frm, u32 i,
+ const struct libeth_xdpsq *sq, u64 priv)
+{
+ struct libeth_xdp_buff *xdp = frm.xsk;
+ struct libeth_xdp_tx_desc desc = {
+ .addr = xsk_buff_xdp_get_dma(&xdp->base),
+ .opts = frm.opts,
+ };
+ struct libeth_sqe *sqe;
+
+ xsk_buff_raw_dma_sync_for_device(sq->pool, desc.addr, desc.len);
+
+ sqe = &sq->sqes[i];
+ sqe->xsk = xdp;
+
+ if (!(desc.flags & LIBETH_XDP_TX_FIRST)) {
+ sqe->type = LIBETH_SQE_XSK_TX_FRAG;
+ return desc;
+ }
+
+ sqe->type = LIBETH_SQE_XSK_TX;
+ libeth_xdp_tx_fill_stats(sqe, &desc,
+ xdp_get_shared_info_from_buff(&xdp->base));
+
+ return desc;
+}
+
+/**
+ * libeth_xsk_tx_flush_bulk - wrapper to define flush of XSk ``XDP_TX`` bulk
+ * @bq: bulk to flush
+ * @flags: Tx flags, see __libeth_xdp_tx_flush_bulk()
+ * @prep: driver callback to prepare the queue
+ * @xmit: driver callback to fill a HW descriptor
+ *
+ * Use via LIBETH_XSK_DEFINE_FLUSH_TX() to define an XSk ``XDP_TX`` driver
+ * callback.
+ */
+#define libeth_xsk_tx_flush_bulk(bq, flags, prep, xmit) \
+ __libeth_xdp_tx_flush_bulk(bq, (flags) | LIBETH_XDP_TX_XSK, prep, \
+ libeth_xsk_tx_fill_buf, xmit)
+
+/* XSk TMO */
+
+/**
+ * libeth_xsktmo_req_csum - XSk Tx metadata op to request checksum offload
+ * @csum_start: unused
+ * @csum_offset: unused
+ * @priv: &libeth_xdp_tx_desc from the filling helper
+ *
+ * Generic implementation of ::tmo_request_checksum. Works only when HW doesn't
+ * require filling checksum offsets and other parameters beside the checksum
+ * request bit.
+ * Consider using within @libeth_xsktmo unless the driver requires HW-specific
+ * callbacks.
+ */
+static inline void libeth_xsktmo_req_csum(u16 csum_start, u16 csum_offset,
+ void *priv)
+{
+ ((struct libeth_xdp_tx_desc *)priv)->flags |= LIBETH_XDP_TX_CSUM;
+}
+
+/* Only to inline the callbacks below, use @libeth_xsktmo in drivers instead */
+static const struct xsk_tx_metadata_ops __libeth_xsktmo = {
+ .tmo_request_checksum = libeth_xsktmo_req_csum,
+};
+
+/**
+ * __libeth_xsk_xmit_fill_buf_md - internal helper to prepare XSk xmit w/meta
+ * @xdesc: &xdp_desc from the XSk buffer pool
+ * @sq: XDPSQ abstraction for the queue
+ * @priv: XSk Tx metadata ops
+ *
+ * Same as __libeth_xsk_xmit_fill_buf(), but requests metadata pointer and
+ * fills additional fields in &libeth_xdp_tx_desc to ask for metadata offload.
+ *
+ * Return: XDP Tx descriptor with the DMA, metadata request bits, and other
+ * info to pass to the driver callback.
+ */
+static __always_inline struct libeth_xdp_tx_desc
+__libeth_xsk_xmit_fill_buf_md(const struct xdp_desc *xdesc,
+ const struct libeth_xdpsq *sq,
+ u64 priv)
+{
+ const struct xsk_tx_metadata_ops *tmo = libeth_xdp_priv_to_ptr(priv);
+ struct libeth_xdp_tx_desc desc;
+ struct xdp_desc_ctx ctx;
+
+ ctx = xsk_buff_raw_get_ctx(sq->pool, xdesc->addr);
+ desc = (typeof(desc)){
+ .addr = ctx.dma,
+ __libeth_xdp_tx_len(xdesc->len),
+ };
+
+ BUILD_BUG_ON(!__builtin_constant_p(tmo == libeth_xsktmo));
+ tmo = tmo == libeth_xsktmo ? &__libeth_xsktmo : tmo;
+
+ xsk_tx_metadata_request(ctx.meta, tmo, &desc);
+
+ return desc;
+}
+
+/* XSk xmit implementation */
+
+/**
+ * __libeth_xsk_xmit_fill_buf - internal helper to prepare XSk xmit w/o meta
+ * @xdesc: &xdp_desc from the XSk buffer pool
+ * @sq: XDPSQ abstraction for the queue
+ *
+ * Return: XDP Tx descriptor with the DMA and other info to pass to
+ * the driver callback.
+ */
+static inline struct libeth_xdp_tx_desc
+__libeth_xsk_xmit_fill_buf(const struct xdp_desc *xdesc,
+ const struct libeth_xdpsq *sq)
+{
+ return (struct libeth_xdp_tx_desc){
+ .addr = xsk_buff_raw_get_dma(sq->pool, xdesc->addr),
+ __libeth_xdp_tx_len(xdesc->len),
+ };
+}
+
+/**
+ * libeth_xsk_xmit_fill_buf - internal helper to prepare an XSk xmit
+ * @frm: &xdp_desc from the XSk buffer pool
+ * @i: index on the HW queue
+ * @sq: XDPSQ abstraction for the queue
+ * @priv: XSk Tx metadata ops
+ *
+ * Depending on the metadata ops presence (determined at compile time), calls
+ * the quickest helper to build a libeth XDP Tx descriptor.
+ *
+ * Return: XDP Tx descriptor with the synced DMA, metadata request bits,
+ * and other info to pass to the driver callback.
+ */
+static __always_inline struct libeth_xdp_tx_desc
+libeth_xsk_xmit_fill_buf(struct libeth_xdp_tx_frame frm, u32 i,
+ const struct libeth_xdpsq *sq, u64 priv)
+{
+ struct libeth_xdp_tx_desc desc;
+
+ if (priv)
+ desc = __libeth_xsk_xmit_fill_buf_md(&frm.desc, sq, priv);
+ else
+ desc = __libeth_xsk_xmit_fill_buf(&frm.desc, sq);
+
+ desc.flags |= xsk_is_eop_desc(&frm.desc) ? LIBETH_XDP_TX_LAST : 0;
+
+ xsk_buff_raw_dma_sync_for_device(sq->pool, desc.addr, desc.len);
+
+ return desc;
+}
+
+/**
+ * libeth_xsk_xmit_do_bulk - send XSk xmit frames
+ * @pool: XSk buffer pool containing the frames to send
+ * @xdpsq: opaque pointer to driver's XDPSQ struct
+ * @budget: maximum number of frames can be sent
+ * @tmo: optional XSk Tx metadata ops
+ * @prep: driver callback to build a &libeth_xdpsq
+ * @xmit: driver callback to put frames to a HW queue
+ * @finalize: driver callback to start a transmission
+ *
+ * Implements generic XSk xmit. Always turns on XSk Tx wakeup as it's assumed
+ * lazy cleaning is used and interrupts are disabled for the queue.
+ * HW descriptor filling is unrolled by ``LIBETH_XDP_TX_BATCH`` to optimize
+ * writes.
+ * Note that unlike other XDP Tx ops, the queue must be locked and cleaned
+ * prior to calling this function to already know available @budget.
+ * @prepare must only build a &libeth_xdpsq and return ``U32_MAX``.
+ *
+ * Return: false if @budget was exhausted, true otherwise.
+ */
+static __always_inline bool
+libeth_xsk_xmit_do_bulk(struct xsk_buff_pool *pool, void *xdpsq, u32 budget,
+ const struct xsk_tx_metadata_ops *tmo,
+ u32 (*prep)(void *xdpsq, struct libeth_xdpsq *sq),
+ void (*xmit)(struct libeth_xdp_tx_desc desc, u32 i,
+ const struct libeth_xdpsq *sq, u64 priv),
+ void (*finalize)(void *xdpsq, bool sent, bool flush))
+{
+ const struct libeth_xdp_tx_frame *bulk;
+ bool wake;
+ u32 n;
+
+ wake = xsk_uses_need_wakeup(pool);
+ if (wake)
+ xsk_clear_tx_need_wakeup(pool);
+
+ n = xsk_tx_peek_release_desc_batch(pool, budget);
+ bulk = container_of(&pool->tx_descs[0], typeof(*bulk), desc);
+
+ libeth_xdp_tx_xmit_bulk(bulk, xdpsq, n, true,
+ libeth_xdp_ptr_to_priv(tmo), prep,
+ libeth_xsk_xmit_fill_buf, xmit);
+ finalize(xdpsq, n, true);
+
+ if (wake)
+ xsk_set_tx_need_wakeup(pool);
+
+ return n < budget;
+}
+
+/* Rx polling path */
+
+/**
+ * libeth_xsk_tx_init_bulk - initialize XDP Tx bulk for an XSk Rx NAPI poll
+ * @bq: bulk to initialize
+ * @prog: RCU pointer to the XDP program (never %NULL)
+ * @dev: target &net_device
+ * @xdpsqs: array of driver XDPSQ structs
+ * @num: number of active XDPSQs, the above array length
+ *
+ * Should be called on an onstack XDP Tx bulk before the XSk NAPI polling loop.
+ * Initializes all the needed fields to run libeth_xdp functions.
+ * Never checks if @prog is %NULL or @num == 0 as XDP must always be enabled
+ * when hitting this path.
+ */
+#define libeth_xsk_tx_init_bulk(bq, prog, dev, xdpsqs, num) \
+ __libeth_xdp_tx_init_bulk(bq, prog, dev, xdpsqs, num, true, \
+ __UNIQUE_ID(bq_), __UNIQUE_ID(nqs_))
+
+struct libeth_xdp_buff *libeth_xsk_buff_add_frag(struct libeth_xdp_buff *head,
+ struct libeth_xdp_buff *xdp);
+
+/**
+ * libeth_xsk_process_buff - attach XSk Rx buffer to &libeth_xdp_buff
+ * @head: head XSk buffer to attach the XSk buffer to (or %NULL)
+ * @xdp: XSk buffer to process
+ * @len: received data length from the descriptor
+ *
+ * If @head == %NULL, treats the XSk buffer as head and initializes
+ * the required fields. Otherwise, attaches the buffer as a frag.
+ * Already performs DMA sync-for-CPU and frame start prefetch
+ * (for head buffers only).
+ *
+ * Return: head XSk buffer on success or if the descriptor must be skipped
+ * (empty), %NULL if there is no space for a new frag.
+ */
+static inline struct libeth_xdp_buff *
+libeth_xsk_process_buff(struct libeth_xdp_buff *head,
+ struct libeth_xdp_buff *xdp, u32 len)
+{
+ if (unlikely(!len)) {
+ libeth_xsk_buff_free_slow(xdp);
+ return head;
+ }
+
+ xsk_buff_set_size(&xdp->base, len);
+ xsk_buff_dma_sync_for_cpu(&xdp->base);
+
+ if (head)
+ return libeth_xsk_buff_add_frag(head, xdp);
+
+ prefetch(xdp->data);
+
+ return xdp;
+}
+
+void libeth_xsk_buff_stats_frags(struct libeth_rq_napi_stats *rs,
+ const struct libeth_xdp_buff *xdp);
+
+u32 __libeth_xsk_run_prog_slow(struct libeth_xdp_buff *xdp,
+ const struct libeth_xdp_tx_bulk *bq,
+ enum xdp_action act, int ret);
+
+/**
+ * __libeth_xsk_run_prog - run XDP program on XSk buffer
+ * @xdp: XSk buffer to run the prog on
+ * @bq: buffer bulk for ``XDP_TX`` queueing
+ *
+ * Internal inline abstraction to run XDP program on XSk Rx path. Handles
+ * only the most common ``XDP_REDIRECT`` inline, the rest is processed
+ * externally.
+ * Reports an XDP prog exception on errors.
+ *
+ * Return: libeth_xdp prog verdict depending on the prog's verdict.
+ */
+static __always_inline u32
+__libeth_xsk_run_prog(struct libeth_xdp_buff *xdp,
+ const struct libeth_xdp_tx_bulk *bq)
+{
+ enum xdp_action act;
+ int ret = 0;
+
+ act = bpf_prog_run_xdp(bq->prog, &xdp->base);
+ if (unlikely(act != XDP_REDIRECT))
+rest:
+ return __libeth_xsk_run_prog_slow(xdp, bq, act, ret);
+
+ ret = xdp_do_redirect(bq->dev, &xdp->base, bq->prog);
+ if (unlikely(ret))
+ goto rest;
+
+ return LIBETH_XDP_REDIRECT;
+}
+
+/**
+ * libeth_xsk_run_prog - run XDP program on XSk path and handle all verdicts
+ * @xdp: XSk buffer to process
+ * @bq: XDP Tx bulk to queue ``XDP_TX`` buffers
+ * @fl: driver ``XDP_TX`` bulk flush callback
+ *
+ * Run the attached XDP program and handle all possible verdicts.
+ * Prefer using it via LIBETH_XSK_DEFINE_RUN{,_PASS,_PROG}().
+ *
+ * Return: libeth_xdp prog verdict depending on the prog's verdict.
+ */
+#define libeth_xsk_run_prog(xdp, bq, fl) \
+ __libeth_xdp_run_flush(xdp, bq, __libeth_xsk_run_prog, \
+ libeth_xsk_tx_queue_bulk, fl)
+
+/**
+ * __libeth_xsk_run_pass - helper to run XDP program and handle the result
+ * @xdp: XSk buffer to process
+ * @bq: XDP Tx bulk to queue ``XDP_TX`` frames
+ * @napi: NAPI to build an skb and pass it up the stack
+ * @rs: onstack libeth RQ stats
+ * @md: metadata that should be filled to the XSk buffer
+ * @prep: callback for filling the metadata
+ * @run: driver wrapper to run XDP program
+ * @populate: driver callback to populate an skb with the HW descriptor data
+ *
+ * Inline abstraction, XSk's counterpart of __libeth_xdp_run_pass(), see its
+ * doc for details.
+ *
+ * Return: false if the polling loop must be exited due to lack of free
+ * buffers, true otherwise.
+ */
+static __always_inline bool
+__libeth_xsk_run_pass(struct libeth_xdp_buff *xdp,
+ struct libeth_xdp_tx_bulk *bq, struct napi_struct *napi,
+ struct libeth_rq_napi_stats *rs, const void *md,
+ void (*prep)(struct libeth_xdp_buff *xdp,
+ const void *md),
+ u32 (*run)(struct libeth_xdp_buff *xdp,
+ struct libeth_xdp_tx_bulk *bq),
+ bool (*populate)(struct sk_buff *skb,
+ const struct libeth_xdp_buff *xdp,
+ struct libeth_rq_napi_stats *rs))
+{
+ struct sk_buff *skb;
+ u32 act;
+
+ rs->bytes += xdp->base.data_end - xdp->data;
+ rs->packets++;
+
+ if (unlikely(xdp_buff_has_frags(&xdp->base)))
+ libeth_xsk_buff_stats_frags(rs, xdp);
+
+ if (prep && (!__builtin_constant_p(!!md) || md))
+ prep(xdp, md);
+
+ act = run(xdp, bq);
+ if (likely(act == LIBETH_XDP_REDIRECT))
+ return true;
+
+ if (act != LIBETH_XDP_PASS)
+ return act != LIBETH_XDP_ABORTED;
+
+ skb = xdp_build_skb_from_zc(&xdp->base);
+ if (unlikely(!skb)) {
+ libeth_xsk_buff_free_slow(xdp);
+ return true;
+ }
+
+ if (unlikely(!populate(skb, xdp, rs))) {
+ napi_consume_skb(skb, true);
+ return true;
+ }
+
+ napi_gro_receive(napi, skb);
+
+ return true;
+}
+
+/**
+ * libeth_xsk_run_pass - helper to run XDP program and handle the result
+ * @xdp: XSk buffer to process
+ * @bq: XDP Tx bulk to queue ``XDP_TX`` frames
+ * @napi: NAPI to build an skb and pass it up the stack
+ * @rs: onstack libeth RQ stats
+ * @desc: pointer to the HW descriptor for that frame
+ * @run: driver wrapper to run XDP program
+ * @populate: driver callback to populate an skb with the HW descriptor data
+ *
+ * Wrapper around the underscored version when "fill the descriptor metadata"
+ * means just writing the pointer to the HW descriptor as @xdp->desc.
+ */
+#define libeth_xsk_run_pass(xdp, bq, napi, rs, desc, run, populate) \
+ __libeth_xsk_run_pass(xdp, bq, napi, rs, desc, libeth_xdp_prep_desc, \
+ run, populate)
+
+/**
+ * libeth_xsk_finalize_rx - finalize XDPSQ after an XSk NAPI polling loop
+ * @bq: ``XDP_TX`` frame bulk
+ * @flush: driver callback to flush the bulk
+ * @finalize: driver callback to start sending the frames and run the timer
+ *
+ * Flush the bulk if there are frames left to send, kick the queue and flush
+ * the XDP maps.
+ */
+#define libeth_xsk_finalize_rx(bq, flush, finalize) \
+ __libeth_xdp_finalize_rx(bq, LIBETH_XDP_TX_XSK, flush, finalize)
+
+/*
+ * Helpers to reduce boilerplate code in drivers.
+ *
+ * Typical driver XSk Rx flow would be (excl. bulk and buff init, frag attach):
+ *
+ * LIBETH_XDP_DEFINE_START();
+ * LIBETH_XSK_DEFINE_FLUSH_TX(static driver_xsk_flush_tx, driver_xsk_tx_prep,
+ * driver_xdp_xmit);
+ * LIBETH_XSK_DEFINE_RUN(static driver_xsk_run, driver_xsk_run_prog,
+ * driver_xsk_flush_tx, driver_populate_skb);
+ * LIBETH_XSK_DEFINE_FINALIZE(static driver_xsk_finalize_rx,
+ * driver_xsk_flush_tx, driver_xdp_finalize_sq);
+ * LIBETH_XDP_DEFINE_END();
+ *
+ * This will build a set of 4 static functions. The compiler is free to decide
+ * whether to inline them.
+ * Then, in the NAPI polling function:
+ *
+ * while (packets < budget) {
+ * // ...
+ * if (!driver_xsk_run(xdp, &bq, napi, &rs, desc))
+ * break;
+ * }
+ * driver_xsk_finalize_rx(&bq);
+ */
+
+/**
+ * LIBETH_XSK_DEFINE_FLUSH_TX - define a driver XSk ``XDP_TX`` flush function
+ * @name: name of the function to define
+ * @prep: driver callback to clean an XDPSQ
+ * @xmit: driver callback to write a HW Tx descriptor
+ */
+#define LIBETH_XSK_DEFINE_FLUSH_TX(name, prep, xmit) \
+ __LIBETH_XDP_DEFINE_FLUSH_TX(name, prep, xmit, xsk)
+
+/**
+ * LIBETH_XSK_DEFINE_RUN_PROG - define a driver XDP program run function
+ * @name: name of the function to define
+ * @flush: driver callback to flush an XSk ``XDP_TX`` bulk
+ */
+#define LIBETH_XSK_DEFINE_RUN_PROG(name, flush) \
+ u32 __LIBETH_XDP_DEFINE_RUN_PROG(name, flush, xsk)
+
+/**
+ * LIBETH_XSK_DEFINE_RUN_PASS - define a driver buffer process + pass function
+ * @name: name of the function to define
+ * @run: driver callback to run XDP program (above)
+ * @populate: driver callback to fill an skb with HW descriptor info
+ */
+#define LIBETH_XSK_DEFINE_RUN_PASS(name, run, populate) \
+ bool __LIBETH_XDP_DEFINE_RUN_PASS(name, run, populate, xsk)
+
+/**
+ * LIBETH_XSK_DEFINE_RUN - define a driver buffer process, run + pass function
+ * @name: name of the function to define
+ * @run: name of the XDP prog run function to define
+ * @flush: driver callback to flush an XSk ``XDP_TX`` bulk
+ * @populate: driver callback to fill an skb with HW descriptor info
+ */
+#define LIBETH_XSK_DEFINE_RUN(name, run, flush, populate) \
+ __LIBETH_XDP_DEFINE_RUN(name, run, flush, populate, XSK)
+
+/**
+ * LIBETH_XSK_DEFINE_FINALIZE - define a driver XSk NAPI poll finalize function
+ * @name: name of the function to define
+ * @flush: driver callback to flush an XSk ``XDP_TX`` bulk
+ * @finalize: driver callback to finalize an XDPSQ and run the timer
+ */
+#define LIBETH_XSK_DEFINE_FINALIZE(name, flush, finalize) \
+ __LIBETH_XDP_DEFINE_FINALIZE(name, flush, finalize, xsk)
+
+/* Refilling */
+
+/**
+ * struct libeth_xskfq - structure representing an XSk buffer (fill) queue
+ * @fp: hotpath part of the structure
+ * @pool: &xsk_buff_pool for buffer management
+ * @fqes: array of XSk buffer pointers
+ * @descs: opaque pointer to the HW descriptor array
+ * @ntu: index of the next buffer to poll
+ * @count: number of descriptors/buffers the queue has
+ * @pending: current number of XSkFQEs to refill
+ * @thresh: threshold below which the queue is refilled
+ * @buf_len: HW-writeable length per each buffer
+ * @nid: ID of the closest NUMA node with memory
+ */
+struct libeth_xskfq {
+ struct_group_tagged(libeth_xskfq_fp, fp,
+ struct xsk_buff_pool *pool;
+ struct libeth_xdp_buff **fqes;
+ void *descs;
+
+ u32 ntu;
+ u32 count;
+ );
+
+ /* Cold fields */
+ u32 pending;
+ u32 thresh;
+
+ u32 buf_len;
+ int nid;
+};
+
+int libeth_xskfq_create(struct libeth_xskfq *fq);
+void libeth_xskfq_destroy(struct libeth_xskfq *fq);
+
+/**
+ * libeth_xsk_buff_xdp_get_dma - get DMA address of XSk &libeth_xdp_buff
+ * @xdp: buffer to get the DMA addr for
+ */
+#define libeth_xsk_buff_xdp_get_dma(xdp) \
+ xsk_buff_xdp_get_dma(&(xdp)->base)
+
+/**
+ * libeth_xskfqe_alloc - allocate @n XSk Rx buffers
+ * @fq: hotpath part of the XSkFQ, usually onstack
+ * @n: number of buffers to allocate
+ * @fill: driver callback to write DMA addresses to HW descriptors
+ *
+ * Note that @fq->ntu gets updated, but ::pending must be recalculated
+ * by the caller.
+ *
+ * Return: number of buffers refilled.
+ */
+static __always_inline u32
+libeth_xskfqe_alloc(struct libeth_xskfq_fp *fq, u32 n,
+ void (*fill)(const struct libeth_xskfq_fp *fq, u32 i))
+{
+ u32 this, ret, done = 0;
+ struct xdp_buff **xskb;
+
+ this = fq->count - fq->ntu;
+ if (likely(this > n))
+ this = n;
+
+again:
+ xskb = (typeof(xskb))&fq->fqes[fq->ntu];
+ ret = xsk_buff_alloc_batch(fq->pool, xskb, this);
+
+ for (u32 i = 0, ntu = fq->ntu; likely(i < ret); i++)
+ fill(fq, ntu + i);
+
+ done += ret;
+ fq->ntu += ret;
+
+ if (likely(fq->ntu < fq->count) || unlikely(ret < this))
+ goto out;
+
+ fq->ntu = 0;
+
+ if (this < n) {
+ this = n - this;
+ goto again;
+ }
+
+out:
+ return done;
+}
+
+/* .ndo_xsk_wakeup */
+
+void libeth_xsk_init_wakeup(call_single_data_t *csd, struct napi_struct *napi);
+void libeth_xsk_wakeup(call_single_data_t *csd, u32 qid);
+
+/* Pool setup */
+
+int libeth_xsk_setup_pool(struct net_device *dev, u32 qid, bool enable);
+
+#endif /* __LIBETH_XSK_H */
diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index c306ebe379a0..26232f603e33 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -138,12 +138,12 @@ int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
static inline void lwtunnel_set_redirect(struct dst_entry *dst)
{
if (lwtunnel_output_redirect(dst->lwtstate)) {
- dst->lwtstate->orig_output = dst->output;
- dst->output = lwtunnel_output;
+ dst->lwtstate->orig_output = READ_ONCE(dst->output);
+ WRITE_ONCE(dst->output, lwtunnel_output);
}
if (lwtunnel_input_redirect(dst->lwtstate)) {
- dst->lwtstate->orig_input = dst->input;
- dst->input = lwtunnel_input;
+ dst->lwtstate->orig_input = READ_ONCE(dst->input);
+ WRITE_ONCE(dst->input, lwtunnel_input);
}
}
#else
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 82617579d910..a45e4bee65d4 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -758,6 +758,8 @@ struct ieee80211_parsed_tpe {
* be updated to 1, even if bss_param_ch_cnt didn't change. This allows
* the link to know that it heard the latest value from its own beacon
* (as opposed to hearing its value from another link's beacon).
+ * @s1g_long_beacon_period: number of beacon intervals between each long
+ * beacon transmission.
*/
struct ieee80211_bss_conf {
struct ieee80211_vif *vif;
@@ -857,6 +859,8 @@ struct ieee80211_bss_conf {
u8 bss_param_ch_cnt;
u8 bss_param_ch_cnt_link_id;
+
+ u8 s1g_long_beacon_period;
};
/**
@@ -2428,6 +2432,7 @@ struct ieee80211_sta_aggregates {
* @he_cap: HE capabilities of this STA
* @he_6ghz_capa: on 6 GHz, holds the HE 6 GHz band capabilities
* @eht_cap: EHT capabilities of this STA
+ * @s1g_cap: S1G capabilities of this STA
* @agg: per-link data for multi-link aggregation
* @bandwidth: current bandwidth the station can receive with
* @rx_nss: in HT/VHT, the maximum number of spatial streams the
@@ -2450,6 +2455,7 @@ struct ieee80211_link_sta {
struct ieee80211_sta_he_cap he_cap;
struct ieee80211_he_6ghz_capa he_6ghz_capa;
struct ieee80211_sta_eht_cap eht_cap;
+ struct ieee80211_sta_s1g_cap s1g_cap;
struct ieee80211_sta_aggregates agg;
@@ -2850,8 +2856,6 @@ struct ieee80211_txq {
*
* @IEEE80211_HW_DISALLOW_PUNCTURING: HW requires disabling puncturing in EHT
* and connecting with a lower bandwidth instead
- * @IEEE80211_HW_DISALLOW_PUNCTURING_5GHZ: HW requires disabling puncturing in
- * EHT in 5 GHz and connecting with a lower bandwidth instead
*
* @IEEE80211_HW_HANDLES_QUIET_CSA: HW/driver handles quieting for CSA, so
* no need to stop queues. This really should be set by a driver that
@@ -2921,7 +2925,6 @@ enum ieee80211_hw_flags {
IEEE80211_HW_DETECTS_COLOR_COLLISION,
IEEE80211_HW_MLO_MCAST_MULTI_LINK_TX,
IEEE80211_HW_DISALLOW_PUNCTURING,
- IEEE80211_HW_DISALLOW_PUNCTURING_5GHZ,
IEEE80211_HW_HANDLES_QUIET_CSA,
IEEE80211_HW_STRICT,
@@ -4133,6 +4136,15 @@ struct ieee80211_prep_tx_info {
* Statistics that the driver doesn't fill will be filled by mac80211.
* The callback can sleep.
*
+ * @link_sta_statistics: Get link statistics for this station. For example with
+ * beacon filtering, the statistics kept by mac80211 might not be
+ * accurate, so let the driver pre-fill the statistics. The driver can
+ * fill most of the values (indicating which by setting the filled
+ * bitmap), but not all of them make sense - see the source for which
+ * ones are possible.
+ * Statistics that the driver doesn't fill will be filled by mac80211.
+ * The callback can sleep.
+ *
* @conf_tx: Configure TX queue parameters (EDCF (aifs, cw_min, cw_max),
* bursting) for a hardware TX queue.
* Returns a negative error code on failure.
@@ -4302,6 +4314,8 @@ struct ieee80211_prep_tx_info {
* @mgd_complete_tx: Notify the driver that the response frame for a previously
* transmitted frame announced with @mgd_prepare_tx was received, the data
* is filled similarly to @mgd_prepare_tx though the duration is not used.
+ * Note that this isn't always called for each mgd_prepare_tx() call, for
+ * example for SAE the 'confirm' messages can be on the air in any order.
*
* @mgd_protect_tdls_discover: Protect a TDLS discovery session. After sending
* a TDLS discovery-request, we expect a reply to arrive on the AP's
@@ -4466,6 +4480,8 @@ struct ieee80211_prep_tx_info {
* new links bitmaps may be 0 if going from/to a non-MLO situation.
* The @old array contains pointers to the old bss_conf structures
* that were already removed, in case they're needed.
+ * Note that removal of link should always succeed, so the return value
+ * will be ignored in a removal only case.
* This callback can sleep.
* @change_sta_links: Change the valid links of a station, similar to
* @change_vif_links. This callback can sleep.
@@ -4508,7 +4524,7 @@ struct ieee80211_ops {
enum nl80211_iftype new_type, bool p2p);
void (*remove_interface)(struct ieee80211_hw *hw,
struct ieee80211_vif *vif);
- int (*config)(struct ieee80211_hw *hw, u32 changed);
+ int (*config)(struct ieee80211_hw *hw, int radio_idx, u32 changed);
void (*bss_info_changed)(struct ieee80211_hw *hw,
struct ieee80211_vif *vif,
struct ieee80211_bss_conf *info,
@@ -4571,8 +4587,10 @@ struct ieee80211_ops {
void (*get_key_seq)(struct ieee80211_hw *hw,
struct ieee80211_key_conf *key,
struct ieee80211_key_seq *seq);
- int (*set_frag_threshold)(struct ieee80211_hw *hw, u32 value);
- int (*set_rts_threshold)(struct ieee80211_hw *hw, u32 value);
+ int (*set_frag_threshold)(struct ieee80211_hw *hw, int radio_idx,
+ u32 value);
+ int (*set_rts_threshold)(struct ieee80211_hw *hw, int radio_idx,
+ u32 value);
int (*sta_add)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
struct ieee80211_sta *sta);
int (*sta_remove)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
@@ -4627,6 +4645,10 @@ struct ieee80211_ops {
s64 offset);
void (*reset_tsf)(struct ieee80211_hw *hw, struct ieee80211_vif *vif);
int (*tx_last_beacon)(struct ieee80211_hw *hw);
+ void (*link_sta_statistics)(struct ieee80211_hw *hw,
+ struct ieee80211_vif *vif,
+ struct ieee80211_link_sta *link_sta,
+ struct link_station_info *link_sinfo);
/**
* @ampdu_action:
@@ -4665,7 +4687,8 @@ struct ieee80211_ops {
int (*get_survey)(struct ieee80211_hw *hw, int idx,
struct survey_info *survey);
void (*rfkill_poll)(struct ieee80211_hw *hw);
- void (*set_coverage_class)(struct ieee80211_hw *hw, s16 coverage_class);
+ void (*set_coverage_class)(struct ieee80211_hw *hw, int radio_idx,
+ s16 coverage_class);
#ifdef CONFIG_NL80211_TESTMODE
int (*testmode_cmd)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
void *data, int len);
@@ -4680,8 +4703,10 @@ struct ieee80211_ops {
void (*channel_switch)(struct ieee80211_hw *hw,
struct ieee80211_vif *vif,
struct ieee80211_channel_switch *ch_switch);
- int (*set_antenna)(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant);
- int (*get_antenna)(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant);
+ int (*set_antenna)(struct ieee80211_hw *hw, int radio_idx,
+ u32 tx_ant, u32 rx_ant);
+ int (*get_antenna)(struct ieee80211_hw *hw, int radio_idx,
+ u32 *tx_ant, u32 *rx_ant);
int (*remain_on_channel)(struct ieee80211_hw *hw,
struct ieee80211_vif *vif,
@@ -6008,21 +6033,12 @@ void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf,
int tid, struct ieee80211_key_seq *seq);
/**
- * ieee80211_remove_key - remove the given key
- * @keyconf: the parameter passed with the set key
- *
- * Context: Must be called with the wiphy mutex held.
- *
- * Remove the given key. If the key was uploaded to the hardware at the
- * time this function is called, it is not deleted in the hardware but
- * instead assumed to have been removed already.
- */
-void ieee80211_remove_key(struct ieee80211_key_conf *keyconf);
-
-/**
* ieee80211_gtk_rekey_add - add a GTK key from rekeying during WoWLAN
* @vif: the virtual interface to add the key on
- * @keyconf: new key data
+ * @idx: the keyidx of the key
+ * @key_data: the key data
+ * @key_len: the key data. Might be bigger than the actual key length,
+ * but not smaller (for the driver convinence)
* @link_id: the link id of the key or -1 for non-MLO
*
* When GTK rekeying was done while the system was suspended, (a) new
@@ -6045,13 +6061,11 @@ void ieee80211_remove_key(struct ieee80211_key_conf *keyconf);
* for the new key for each TID to set up sequence counters properly.
*
* IMPORTANT: If this replaces a key that is present in the hardware,
- * then it will attempt to remove it during this call. In many cases
- * this isn't what you want, so call ieee80211_remove_key() first for
- * the key that's being replaced.
+ * then it will attempt to remove it during this call.
*/
struct ieee80211_key_conf *
ieee80211_gtk_rekey_add(struct ieee80211_vif *vif,
- struct ieee80211_key_conf *keyconf,
+ u8 idx, u8 *key_data, u8 key_len,
int link_id);
/**
@@ -7242,13 +7256,14 @@ void ieee80211_disable_rssi_reports(struct ieee80211_vif *vif);
* ieee80211_ave_rssi - report the average RSSI for the specified interface
*
* @vif: the specified virtual interface
+ * @link_id: the link ID for MLO, or -1 for non-MLO
*
* Note: This function assumes that the given vif is valid.
*
* Return: The average RSSI value for the requested interface, or 0 if not
* applicable.
*/
-int ieee80211_ave_rssi(struct ieee80211_vif *vif);
+int ieee80211_ave_rssi(struct ieee80211_vif *vif, int link_id);
/**
* ieee80211_report_wowlan_wakeup - report WoWLAN wakeup
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index 3ce56a816425..57df78cfbf82 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -10,6 +10,7 @@
#include "shm_channel.h"
#define GDMA_STATUS_MORE_ENTRIES 0x00000105
+#define GDMA_STATUS_CMD_UNSUPPORTED 0xffffffff
/* Structures labeled with "HW DATA" are exchanged with the hardware. All of
* them are naturally aligned and hence don't need __packed.
@@ -58,9 +59,10 @@ enum gdma_eqe_type {
GDMA_EQE_HWC_INIT_EQ_ID_DB = 129,
GDMA_EQE_HWC_INIT_DATA = 130,
GDMA_EQE_HWC_INIT_DONE = 131,
- GDMA_EQE_HWC_SOC_RECONFIG = 132,
+ GDMA_EQE_HWC_FPGA_RECONFIG = 132,
GDMA_EQE_HWC_SOC_RECONFIG_DATA = 133,
GDMA_EQE_HWC_SOC_SERVICE = 134,
+ GDMA_EQE_HWC_RESET_REQUEST = 135,
GDMA_EQE_RNIC_QP_FATAL = 176,
};
@@ -388,7 +390,7 @@ struct gdma_context {
unsigned int max_num_queues;
unsigned int max_num_msix;
unsigned int num_msix_usable;
- struct gdma_irq_context *irq_contexts;
+ struct xarray irq_contexts;
/* L2 MTU */
u16 adapter_mtu;
@@ -403,6 +405,8 @@ struct gdma_context {
u32 test_event_eq_id;
bool is_pf;
+ bool in_service;
+
phys_addr_t bar0_pa;
void __iomem *bar0_va;
void __iomem *shm_base;
@@ -578,12 +582,24 @@ enum {
/* Driver can handle holes (zeros) in the device list */
#define GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP BIT(11)
+/* Driver supports dynamic MSI-X vector allocation */
+#define GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT BIT(13)
+
+/* Driver can self reset on EQE notification */
+#define GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE BIT(14)
+
+/* Driver can self reset on FPGA Reconfig EQE notification */
+#define GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE BIT(17)
+
#define GDMA_DRV_CAP_FLAGS1 \
(GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT | \
GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX | \
GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG | \
GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT | \
- GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP)
+ GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP | \
+ GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT | \
+ GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE | \
+ GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE)
#define GDMA_DRV_CAP_FLAGS2 0
@@ -910,4 +926,9 @@ void mana_unregister_debugfs(void);
int mana_rdma_service_event(struct gdma_context *gc, enum gdma_service_type event);
+int mana_gd_suspend(struct pci_dev *pdev, pm_message_t state);
+int mana_gd_resume(struct pci_dev *pdev);
+
+bool mana_need_log(struct gdma_context *gc, int err);
+
#endif /* _GDMA_H */
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 9abb66461211..e1030a7d2daa 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -5,6 +5,7 @@
#define _MANA_H
#include <net/xdp.h>
+#include <net/net_shaper.h>
#include "gdma.h"
#include "hw_channel.h"
@@ -404,6 +405,65 @@ struct mana_ethtool_stats {
u64 rx_cqe_unknown_type;
};
+struct mana_ethtool_phy_stats {
+ /* Drop Counters */
+ u64 rx_pkt_drop_phy;
+ u64 tx_pkt_drop_phy;
+
+ /* Per TC traffic Counters */
+ u64 rx_pkt_tc0_phy;
+ u64 tx_pkt_tc0_phy;
+ u64 rx_pkt_tc1_phy;
+ u64 tx_pkt_tc1_phy;
+ u64 rx_pkt_tc2_phy;
+ u64 tx_pkt_tc2_phy;
+ u64 rx_pkt_tc3_phy;
+ u64 tx_pkt_tc3_phy;
+ u64 rx_pkt_tc4_phy;
+ u64 tx_pkt_tc4_phy;
+ u64 rx_pkt_tc5_phy;
+ u64 tx_pkt_tc5_phy;
+ u64 rx_pkt_tc6_phy;
+ u64 tx_pkt_tc6_phy;
+ u64 rx_pkt_tc7_phy;
+ u64 tx_pkt_tc7_phy;
+
+ u64 rx_byte_tc0_phy;
+ u64 tx_byte_tc0_phy;
+ u64 rx_byte_tc1_phy;
+ u64 tx_byte_tc1_phy;
+ u64 rx_byte_tc2_phy;
+ u64 tx_byte_tc2_phy;
+ u64 rx_byte_tc3_phy;
+ u64 tx_byte_tc3_phy;
+ u64 rx_byte_tc4_phy;
+ u64 tx_byte_tc4_phy;
+ u64 rx_byte_tc5_phy;
+ u64 tx_byte_tc5_phy;
+ u64 rx_byte_tc6_phy;
+ u64 tx_byte_tc6_phy;
+ u64 rx_byte_tc7_phy;
+ u64 tx_byte_tc7_phy;
+
+ /* Per TC pause Counters */
+ u64 rx_pause_tc0_phy;
+ u64 tx_pause_tc0_phy;
+ u64 rx_pause_tc1_phy;
+ u64 tx_pause_tc1_phy;
+ u64 rx_pause_tc2_phy;
+ u64 tx_pause_tc2_phy;
+ u64 rx_pause_tc3_phy;
+ u64 tx_pause_tc3_phy;
+ u64 rx_pause_tc4_phy;
+ u64 tx_pause_tc4_phy;
+ u64 rx_pause_tc5_phy;
+ u64 tx_pause_tc5_phy;
+ u64 rx_pause_tc6_phy;
+ u64 tx_pause_tc6_phy;
+ u64 rx_pause_tc7_phy;
+ u64 tx_pause_tc7_phy;
+};
+
struct mana_context {
struct gdma_dev *gdma_dev;
@@ -467,13 +527,22 @@ struct mana_port_context {
struct mutex vport_mutex;
int vport_use_count;
+ /* Net shaper handle*/
+ struct net_shaper_handle handle;
+
u16 port_idx;
+ /* Currently configured speed (mbps) */
+ u32 speed;
+ /* Maximum speed supported by the SKU (mbps) */
+ u32 max_speed;
bool port_is_up;
bool port_st_save; /* Saved port state */
struct mana_ethtool_stats eth_stats;
+ struct mana_ethtool_phy_stats phy_stats;
+
/* Debugfs */
struct dentry *mana_port_debugfs;
};
@@ -501,6 +570,10 @@ struct bpf_prog *mana_xdp_get(struct mana_port_context *apc);
void mana_chn_setxdp(struct mana_port_context *apc, struct bpf_prog *prog);
int mana_bpf(struct net_device *ndev, struct netdev_bpf *bpf);
void mana_query_gf_stats(struct mana_port_context *apc);
+int mana_query_link_cfg(struct mana_port_context *apc);
+int mana_set_bw_clamp(struct mana_port_context *apc, u32 speed,
+ int enable_clamping);
+void mana_query_phy_stats(struct mana_port_context *apc);
int mana_pre_alloc_rxbufs(struct mana_port_context *apc, int mtu, int num_queues);
void mana_pre_dealloc_rxbufs(struct mana_port_context *apc);
@@ -527,6 +600,9 @@ enum mana_command_code {
MANA_FENCE_RQ = 0x20006,
MANA_CONFIG_VPORT_RX = 0x20007,
MANA_QUERY_VPORT_CONFIG = 0x20008,
+ MANA_QUERY_LINK_CONFIG = 0x2000A,
+ MANA_SET_BW_CLAMP = 0x2000B,
+ MANA_QUERY_PHY_STAT = 0x2000c,
/* Privileged commands for the PF mode */
MANA_REGISTER_FILTER = 0x28000,
@@ -535,6 +611,35 @@ enum mana_command_code {
MANA_DEREGISTER_HW_PORT = 0x28004,
};
+/* Query Link Configuration*/
+struct mana_query_link_config_req {
+ struct gdma_req_hdr hdr;
+ mana_handle_t vport;
+}; /* HW DATA */
+
+struct mana_query_link_config_resp {
+ struct gdma_resp_hdr hdr;
+ u32 qos_speed_mbps;
+ u8 qos_unconfigured;
+ u8 reserved1[3];
+ u32 link_speed_mbps;
+ u8 reserved2[4];
+}; /* HW DATA */
+
+/* Set Bandwidth Clamp*/
+struct mana_set_bw_clamp_req {
+ struct gdma_req_hdr hdr;
+ mana_handle_t vport;
+ enum TRI_STATE enable_clamping;
+ u32 link_speed_mbps;
+}; /* HW DATA */
+
+struct mana_set_bw_clamp_resp {
+ struct gdma_resp_hdr hdr;
+ u8 qos_unconfigured;
+ u8 reserved[7];
+}; /* HW DATA */
+
/* Query Device Configuration */
struct mana_query_device_cfg_req {
struct gdma_req_hdr hdr;
@@ -689,6 +794,74 @@ struct mana_query_gf_stat_resp {
u64 tx_err_gdma;
}; /* HW DATA */
+/* Query phy stats */
+struct mana_query_phy_stat_req {
+ struct gdma_req_hdr hdr;
+ u64 req_stats;
+}; /* HW DATA */
+
+struct mana_query_phy_stat_resp {
+ struct gdma_resp_hdr hdr;
+ u64 reported_stats;
+
+ /* Aggregate Drop Counters */
+ u64 rx_pkt_drop_phy;
+ u64 tx_pkt_drop_phy;
+
+ /* Per TC(Traffic class) traffic Counters */
+ u64 rx_pkt_tc0_phy;
+ u64 tx_pkt_tc0_phy;
+ u64 rx_pkt_tc1_phy;
+ u64 tx_pkt_tc1_phy;
+ u64 rx_pkt_tc2_phy;
+ u64 tx_pkt_tc2_phy;
+ u64 rx_pkt_tc3_phy;
+ u64 tx_pkt_tc3_phy;
+ u64 rx_pkt_tc4_phy;
+ u64 tx_pkt_tc4_phy;
+ u64 rx_pkt_tc5_phy;
+ u64 tx_pkt_tc5_phy;
+ u64 rx_pkt_tc6_phy;
+ u64 tx_pkt_tc6_phy;
+ u64 rx_pkt_tc7_phy;
+ u64 tx_pkt_tc7_phy;
+
+ u64 rx_byte_tc0_phy;
+ u64 tx_byte_tc0_phy;
+ u64 rx_byte_tc1_phy;
+ u64 tx_byte_tc1_phy;
+ u64 rx_byte_tc2_phy;
+ u64 tx_byte_tc2_phy;
+ u64 rx_byte_tc3_phy;
+ u64 tx_byte_tc3_phy;
+ u64 rx_byte_tc4_phy;
+ u64 tx_byte_tc4_phy;
+ u64 rx_byte_tc5_phy;
+ u64 tx_byte_tc5_phy;
+ u64 rx_byte_tc6_phy;
+ u64 tx_byte_tc6_phy;
+ u64 rx_byte_tc7_phy;
+ u64 tx_byte_tc7_phy;
+
+ /* Per TC(Traffic Class) pause Counters */
+ u64 rx_pause_tc0_phy;
+ u64 tx_pause_tc0_phy;
+ u64 rx_pause_tc1_phy;
+ u64 tx_pause_tc1_phy;
+ u64 rx_pause_tc2_phy;
+ u64 tx_pause_tc2_phy;
+ u64 rx_pause_tc3_phy;
+ u64 tx_pause_tc3_phy;
+ u64 rx_pause_tc4_phy;
+ u64 tx_pause_tc4_phy;
+ u64 rx_pause_tc5_phy;
+ u64 tx_pause_tc5_phy;
+ u64 rx_pause_tc6_phy;
+ u64 tx_pause_tc6_phy;
+ u64 rx_pause_tc7_phy;
+ u64 tx_pause_tc7_phy;
+}; /* HW DATA */
+
/* Configure vPort Rx Steering */
struct mana_cfg_rx_steer_req_v2 {
struct gdma_req_hdr hdr;
diff --git a/include/net/mctp.h b/include/net/mctp.h
index 07d458990113..c3207ce98f07 100644
--- a/include/net/mctp.h
+++ b/include/net/mctp.h
@@ -69,7 +69,10 @@ struct mctp_sock {
/* bind() params */
unsigned int bind_net;
- mctp_eid_t bind_addr;
+ mctp_eid_t bind_local_addr;
+ mctp_eid_t bind_peer_addr;
+ unsigned int bind_peer_net;
+ bool bind_peer_set;
__u8 bind_type;
/* sendmsg()/recvmsg() uses struct sockaddr_mctp_ext */
@@ -183,8 +186,8 @@ struct mctp_sk_key {
struct mctp_skb_cb {
unsigned int magic;
unsigned int net;
- int ifindex; /* extended/direct addressing if set */
- mctp_eid_t src;
+ /* fields below provide extended addressing for ingress to recvmsg() */
+ int ifindex;
unsigned char halen;
unsigned char haddr[MAX_ADDR_LEN];
};
@@ -222,6 +225,8 @@ struct mctp_flow {
struct mctp_sk_key *key;
};
+struct mctp_dst;
+
/* Route definition.
*
* These are held in the pernet->mctp.routes list, with RCU protection for
@@ -229,16 +234,25 @@ struct mctp_flow {
* dropped on NETDEV_UNREGISTER events.
*
* Updates to the route table are performed under rtnl; all reads under RCU,
- * so routes cannot be referenced over a RCU grace period. Specifically: A
- * caller cannot block between mctp_route_lookup and mctp_route_release()
+ * so routes cannot be referenced over a RCU grace period.
*/
struct mctp_route {
mctp_eid_t min, max;
unsigned char type;
+
unsigned int mtu;
- struct mctp_dev *dev;
- int (*output)(struct mctp_route *route,
+
+ enum {
+ MCTP_ROUTE_DIRECT,
+ MCTP_ROUTE_GATEWAY,
+ } dst_type;
+ union {
+ struct mctp_dev *dev;
+ struct mctp_fq_addr gateway;
+ };
+
+ int (*output)(struct mctp_dst *dst,
struct sk_buff *skb);
struct list_head list;
@@ -246,12 +260,35 @@ struct mctp_route {
struct rcu_head rcu;
};
+/* Route lookup result: dst. Represents the results of a routing decision,
+ * but is only held over the individual routing operation.
+ *
+ * Will typically be stored on the caller stack, and must be released after
+ * usage.
+ */
+struct mctp_dst {
+ struct mctp_dev *dev;
+ unsigned int mtu;
+ mctp_eid_t nexthop;
+
+ /* set for direct addressing */
+ unsigned char halen;
+ unsigned char haddr[MAX_ADDR_LEN];
+
+ int (*output)(struct mctp_dst *dst, struct sk_buff *skb);
+};
+
+int mctp_dst_from_extaddr(struct mctp_dst *dst, struct net *net, int ifindex,
+ unsigned char halen, const unsigned char *haddr);
+
/* route interfaces */
-struct mctp_route *mctp_route_lookup(struct net *net, unsigned int dnet,
- mctp_eid_t daddr);
+int mctp_route_lookup(struct net *net, unsigned int dnet,
+ mctp_eid_t daddr, struct mctp_dst *dst);
+
+void mctp_dst_release(struct mctp_dst *dst);
/* always takes ownership of skb */
-int mctp_local_output(struct sock *sk, struct mctp_route *rt,
+int mctp_local_output(struct sock *sk, struct mctp_dst *dst,
struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag);
void mctp_key_unref(struct mctp_sk_key *key);
diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index 3c88d5bc5eed..d38783a2ce57 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -60,15 +60,6 @@ enum {
#include <net/neighbour.h>
-/* Set to 3 to get tracing... */
-#define ND_DEBUG 1
-
-#define ND_PRINTK(val, level, fmt, ...) \
-do { \
- if (val <= ND_DEBUG) \
- net_##level##_ratelimited(fmt, ##__VA_ARGS__); \
-} while (0)
-
struct ctl_table;
struct inet6_dev;
struct net_device;
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 9a832cab5b1d..4a30bd458c5a 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -176,12 +176,17 @@ struct neigh_ops {
};
struct pneigh_entry {
- struct pneigh_entry *next;
+ struct pneigh_entry __rcu *next;
possible_net_t net;
struct net_device *dev;
netdevice_tracker dev_tracker;
+ union {
+ struct list_head free_node;
+ struct rcu_head rcu;
+ };
u32 flags;
u8 protocol;
+ bool permanent;
u32 key[];
};
@@ -235,7 +240,8 @@ struct neigh_table {
unsigned long last_rand;
struct neigh_statistics __percpu *stats;
struct neigh_hash_table __rcu *nht;
- struct pneigh_entry **phash_buckets;
+ struct mutex phash_lock;
+ struct pneigh_entry __rcu **phash_buckets;
};
static inline int neigh_parms_family(struct neigh_parms *p)
@@ -260,13 +266,15 @@ static inline void *neighbour_priv(const struct neighbour *n)
#define NEIGH_UPDATE_F_EXT_LEARNED BIT(5)
#define NEIGH_UPDATE_F_ISROUTER BIT(6)
#define NEIGH_UPDATE_F_ADMIN BIT(7)
+#define NEIGH_UPDATE_F_EXT_VALIDATED BIT(8)
/* In-kernel representation for NDA_FLAGS_EXT flags: */
#define NTF_OLD_MASK 0xff
#define NTF_EXT_SHIFT 8
-#define NTF_EXT_MASK (NTF_EXT_MANAGED)
+#define NTF_EXT_MASK (NTF_EXT_MANAGED | NTF_EXT_EXT_VALIDATED)
#define NTF_MANAGED (NTF_EXT_MANAGED << NTF_EXT_SHIFT)
+#define NTF_EXT_VALIDATED (NTF_EXT_EXT_VALIDATED << NTF_EXT_SHIFT)
extern const struct nla_policy nda_policy[];
@@ -373,10 +381,10 @@ unsigned long neigh_rand_reach_time(unsigned long base);
void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
struct sk_buff *skb);
struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net,
- const void *key, struct net_device *dev,
- int creat);
-struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, struct net *net,
- const void *key, struct net_device *dev);
+ const void *key, struct net_device *dev);
+int pneigh_create(struct neigh_table *tbl, struct net *net, const void *key,
+ struct net_device *dev, u32 flags, u8 protocol,
+ bool permanent);
int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *key,
struct net_device *dev);
diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h
index ba2eaf39089b..6e835972abd1 100644
--- a/include/net/netdev_queues.h
+++ b/include/net/netdev_queues.h
@@ -294,6 +294,15 @@ netdev_txq_completed_mb(struct netdev_queue *dev_queue,
netif_txq_try_stop(_txq, get_desc, start_thrs); \
})
+static inline void netif_subqueue_sent(const struct net_device *dev,
+ unsigned int idx, unsigned int bytes)
+{
+ struct netdev_queue *txq;
+
+ txq = netdev_get_tx_queue(dev, idx);
+ netdev_tx_sent_queue(txq, bytes);
+}
+
#define netif_subqueue_maybe_stop(dev, idx, get_desc, stop_thrs, start_thrs) \
({ \
struct netdev_queue *_txq; \
diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
index 2c8c2b023848..8d65ffbf57de 100644
--- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
+++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
@@ -13,9 +13,6 @@
extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp;
extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp;
extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp;
-#ifdef CONFIG_NF_CT_PROTO_DCCP
-extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp;
-#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp;
#endif
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index ca26274196b9..aa0a7c82199e 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -18,7 +18,6 @@
#include <linux/netfilter/nf_conntrack_common.h>
#include <linux/netfilter/nf_conntrack_tcp.h>
-#include <linux/netfilter/nf_conntrack_dccp.h>
#include <linux/netfilter/nf_conntrack_sctp.h>
#include <linux/netfilter/nf_conntrack_proto_gre.h>
@@ -31,7 +30,6 @@ struct nf_ct_udp {
/* per conntrack: protocol private data */
union nf_conntrack_proto {
/* insert conntrack proto private data here */
- struct nf_ct_dccp dccp;
struct ip_ct_sctp sctp;
struct ip_ct_tcp tcp;
struct nf_ct_udp udp;
diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
index 1f47bef51722..6929f8daf1ed 100644
--- a/include/net/netfilter/nf_conntrack_l4proto.h
+++ b/include/net/netfilter/nf_conntrack_l4proto.h
@@ -117,11 +117,6 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
unsigned int dataoff,
enum ip_conntrack_info ctinfo,
const struct nf_hook_state *state);
-int nf_conntrack_dccp_packet(struct nf_conn *ct,
- struct sk_buff *skb,
- unsigned int dataoff,
- enum ip_conntrack_info ctinfo,
- const struct nf_hook_state *state);
int nf_conntrack_sctp_packet(struct nf_conn *ct,
struct sk_buff *skb,
unsigned int dataoff,
@@ -137,7 +132,6 @@ void nf_conntrack_generic_init_net(struct net *net);
void nf_conntrack_tcp_init_net(struct net *net);
void nf_conntrack_udp_init_net(struct net *net);
void nf_conntrack_gre_init_net(struct net *net);
-void nf_conntrack_dccp_init_net(struct net *net);
void nf_conntrack_sctp_init_net(struct net *net);
void nf_conntrack_icmp_init_net(struct net *net);
void nf_conntrack_icmpv6_init_net(struct net *net);
@@ -223,13 +217,6 @@ static inline bool nf_conntrack_tcp_established(const struct nf_conn *ct)
}
#endif
-#ifdef CONFIG_NF_CT_PROTO_DCCP
-static inline struct nf_dccp_net *nf_dccp_pernet(struct net *net)
-{
- return &net->ct.nf_ct_proto.dccp;
-}
-#endif
-
#ifdef CONFIG_NF_CT_PROTO_SCTP
static inline struct nf_sctp_net *nf_sctp_pernet(struct net *net)
{
diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h
index e55eedc84ed7..00506792a06d 100644
--- a/include/net/netfilter/nf_log.h
+++ b/include/net/netfilter/nf_log.h
@@ -59,6 +59,9 @@ extern int sysctl_nf_log_all_netns;
int nf_log_register(u_int8_t pf, struct nf_logger *logger);
void nf_log_unregister(struct nf_logger *logger);
+/* Check if any logger is registered for a given protocol family. */
+bool nf_log_is_registered(u_int8_t pf);
+
int nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger);
void nf_log_unset(struct net *net, const struct nf_logger *logger);
diff --git a/include/net/netfilter/nf_reject.h b/include/net/netfilter/nf_reject.h
index 7c669792fb9c..f1db33bc6bf8 100644
--- a/include/net/netfilter/nf_reject.h
+++ b/include/net/netfilter/nf_reject.h
@@ -34,7 +34,6 @@ static inline bool nf_reject_verify_csum(struct sk_buff *skb, int dataoff,
/* Protocols with partial checksums. */
case IPPROTO_UDPLITE:
- case IPPROTO_DCCP:
return false;
}
return true;
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 5e49619ae49c..891e43a01bdc 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -459,19 +459,13 @@ struct nft_set_ext;
* control plane functions.
*/
struct nft_set_ops {
- bool (*lookup)(const struct net *net,
+ const struct nft_set_ext * (*lookup)(const struct net *net,
const struct nft_set *set,
+ const u32 *key);
+ const struct nft_set_ext * (*update)(struct nft_set *set,
const u32 *key,
- const struct nft_set_ext **ext);
- bool (*update)(struct nft_set *set,
- const u32 *key,
- struct nft_elem_priv *
- (*new)(struct nft_set *,
- const struct nft_expr *,
- struct nft_regs *),
const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_set_ext **ext);
+ struct nft_regs *regs);
bool (*delete)(const struct nft_set *set,
const u32 *key);
@@ -1939,11 +1933,6 @@ static inline u64 nft_net_tstamp(const struct net *net)
#define __NFT_REDUCE_READONLY 1UL
#define NFT_REDUCE_READONLY (void *)__NFT_REDUCE_READONLY
-static inline bool nft_reduce_is_readonly(const struct nft_expr *expr)
-{
- return expr->ops->reduce == NFT_REDUCE_READONLY;
-}
-
void nft_reg_track_update(struct nft_regs_track *track,
const struct nft_expr *expr, u8 dreg, u8 len);
void nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg, u8 len);
diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
index 03b6165756fc..6c2f483d9828 100644
--- a/include/net/netfilter/nf_tables_core.h
+++ b/include/net/netfilter/nf_tables_core.h
@@ -94,34 +94,41 @@ extern const struct nft_set_type nft_set_pipapo_type;
extern const struct nft_set_type nft_set_pipapo_avx2_type;
#ifdef CONFIG_MITIGATION_RETPOLINE
-bool nft_rhash_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext);
-bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext);
-bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext);
-bool nft_hash_lookup_fast(const struct net *net,
- const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext);
-bool nft_hash_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext);
-bool nft_set_do_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext);
+const struct nft_set_ext *
+nft_rhash_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key);
+const struct nft_set_ext *
+nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key);
+const struct nft_set_ext *
+nft_bitmap_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key);
+const struct nft_set_ext *
+nft_hash_lookup_fast(const struct net *net, const struct nft_set *set,
+ const u32 *key);
+const struct nft_set_ext *
+nft_hash_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key);
+const struct nft_set_ext *
+nft_set_do_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key);
#else
-static inline bool
+static inline const struct nft_set_ext *
nft_set_do_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+ const u32 *key)
{
- return set->ops->lookup(net, set, key, ext);
+ return set->ops->lookup(net, set, key);
}
#endif
/* called from nft_pipapo_avx2.c */
-bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext);
+const struct nft_set_ext *
+nft_pipapo_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key);
/* called from nft_set_pipapo.c */
-bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext);
+const struct nft_set_ext *
+nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key);
void nft_counter_init_seqcount(void);
@@ -181,4 +188,7 @@ void nft_objref_eval(const struct nft_expr *expr, struct nft_regs *regs,
const struct nft_pktinfo *pkt);
void nft_objref_map_eval(const struct nft_expr *expr, struct nft_regs *regs,
const struct nft_pktinfo *pkt);
+struct nft_elem_priv *nft_dynset_new(struct nft_set *set,
+ const struct nft_expr *expr,
+ struct nft_regs *regs);
#endif /* _NET_NF_TABLES_CORE_H */
diff --git a/include/net/netlink.h b/include/net/netlink.h
index 90a560dc167a..1a8356ca4b78 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -68,6 +68,8 @@
* nlmsg_for_each_msg() loop over all messages
* nlmsg_validate() validate netlink message incl. attrs
* nlmsg_for_each_attr() loop over all attributes
+ * nlmsg_for_each_attr_type() loop over all attributes with the
+ * given type
*
* Misc:
* nlmsg_report() report back to application?
@@ -967,6 +969,18 @@ static inline u32 nlmsg_seq(const struct nlmsghdr *nlh)
nlmsg_attrlen(nlh, hdrlen), rem)
/**
+ * nlmsg_for_each_attr_type - iterate over a stream of attributes
+ * @pos: loop counter, set to the current attribute
+ * @type: required attribute type for @pos
+ * @nlh: netlink message header
+ * @hdrlen: length of the family specific header
+ * @rem: initialized to len, holds bytes currently remaining in stream
+ */
+#define nlmsg_for_each_attr_type(pos, type, nlh, hdrlen, rem) \
+ nlmsg_for_each_attr(pos, nlh, hdrlen, rem) \
+ if (nla_type(pos) == type)
+
+/**
* nlmsg_put - Add a new netlink message to an skb
* @skb: socket buffer to store message in
* @portid: netlink PORTID of requesting application
diff --git a/include/net/netmem.h b/include/net/netmem.h
index 386164fb9c18..f7dacc9e75fd 100644
--- a/include/net/netmem.h
+++ b/include/net/netmem.h
@@ -12,6 +12,50 @@
#include <linux/mm.h>
#include <net/net_debug.h>
+/* These fields in struct page are used by the page_pool and net stack:
+ *
+ * struct {
+ * unsigned long pp_magic;
+ * struct page_pool *pp;
+ * unsigned long _pp_mapping_pad;
+ * unsigned long dma_addr;
+ * atomic_long_t pp_ref_count;
+ * };
+ *
+ * We mirror the page_pool fields here so the page_pool can access these
+ * fields without worrying whether the underlying fields belong to a
+ * page or netmem_desc.
+ *
+ * CAUTION: Do not update the fields in netmem_desc without also
+ * updating the anonymous aliasing union in struct net_iov.
+ */
+struct netmem_desc {
+ unsigned long _flags;
+ unsigned long pp_magic;
+ struct page_pool *pp;
+ unsigned long _pp_mapping_pad;
+ unsigned long dma_addr;
+ atomic_long_t pp_ref_count;
+};
+
+#define NETMEM_DESC_ASSERT_OFFSET(pg, desc) \
+ static_assert(offsetof(struct page, pg) == \
+ offsetof(struct netmem_desc, desc))
+NETMEM_DESC_ASSERT_OFFSET(flags, _flags);
+NETMEM_DESC_ASSERT_OFFSET(pp_magic, pp_magic);
+NETMEM_DESC_ASSERT_OFFSET(pp, pp);
+NETMEM_DESC_ASSERT_OFFSET(_pp_mapping_pad, _pp_mapping_pad);
+NETMEM_DESC_ASSERT_OFFSET(dma_addr, dma_addr);
+NETMEM_DESC_ASSERT_OFFSET(pp_ref_count, pp_ref_count);
+#undef NETMEM_DESC_ASSERT_OFFSET
+
+/*
+ * Since struct netmem_desc uses the space in struct page, the size
+ * should be checked, until struct netmem_desc has its own instance from
+ * slab, to avoid conflicting with other members within struct page.
+ */
+static_assert(sizeof(struct netmem_desc) <= offsetof(struct page, _refcount));
+
/* net_iov */
DECLARE_STATIC_KEY_FALSE(page_pool_mem_providers);
@@ -30,13 +74,48 @@ enum net_iov_type {
NET_IOV_MAX = ULONG_MAX
};
+/* A memory descriptor representing abstract networking I/O vectors,
+ * generally for non-pages memory that doesn't have its corresponding
+ * struct page and needs to be explicitly allocated through slab.
+ *
+ * net_iovs are allocated and used by networking code, and the size of
+ * the chunk is PAGE_SIZE.
+ *
+ * This memory can be any form of non-struct paged memory. Examples
+ * include imported dmabuf memory and imported io_uring memory. See
+ * net_iov_type for all the supported types.
+ *
+ * @pp_magic: pp field, similar to the one in struct page/struct
+ * netmem_desc.
+ * @pp: the pp this net_iov belongs to, if any.
+ * @dma_addr: the dma addrs of the net_iov. Needed for the network
+ * card to send/receive this net_iov.
+ * @pp_ref_count: the pp ref count of this net_iov, exactly the same
+ * usage as struct page/struct netmem_desc.
+ * @owner: the net_iov_area this net_iov belongs to, if any.
+ * @type: the type of the memory. Different types of net_iovs are
+ * supported.
+ */
struct net_iov {
- enum net_iov_type type;
- unsigned long pp_magic;
- struct page_pool *pp;
+ union {
+ struct netmem_desc desc;
+
+ /* XXX: The following part should be removed once all
+ * the references to them are converted so as to be
+ * accessed via netmem_desc e.g. niov->desc.pp instead
+ * of niov->pp.
+ */
+ struct {
+ unsigned long _flags;
+ unsigned long pp_magic;
+ struct page_pool *pp;
+ unsigned long _pp_mapping_pad;
+ unsigned long dma_addr;
+ atomic_long_t pp_ref_count;
+ };
+ };
struct net_iov_area *owner;
- unsigned long dma_addr;
- atomic_long_t pp_ref_count;
+ enum net_iov_type type;
};
struct net_iov_area {
@@ -48,27 +127,22 @@ struct net_iov_area {
unsigned long base_virtual;
};
-/* These fields in struct page are used by the page_pool and net stack:
- *
- * struct {
- * unsigned long pp_magic;
- * struct page_pool *pp;
- * unsigned long _pp_mapping_pad;
- * unsigned long dma_addr;
- * atomic_long_t pp_ref_count;
- * };
- *
- * We mirror the page_pool fields here so the page_pool can access these fields
- * without worrying whether the underlying fields belong to a page or net_iov.
+/* net_iov is union'ed with struct netmem_desc mirroring struct page, so
+ * the page_pool can access these fields without worrying whether the
+ * underlying fields are accessed via netmem_desc or directly via
+ * net_iov, until all the references to them are converted so as to be
+ * accessed via netmem_desc e.g. niov->desc.pp instead of niov->pp.
*
- * The non-net stack fields of struct page are private to the mm stack and must
- * never be mirrored to net_iov.
+ * The non-net stack fields of struct page are private to the mm stack
+ * and must never be mirrored to net_iov.
*/
-#define NET_IOV_ASSERT_OFFSET(pg, iov) \
- static_assert(offsetof(struct page, pg) == \
+#define NET_IOV_ASSERT_OFFSET(desc, iov) \
+ static_assert(offsetof(struct netmem_desc, desc) == \
offsetof(struct net_iov, iov))
+NET_IOV_ASSERT_OFFSET(_flags, _flags);
NET_IOV_ASSERT_OFFSET(pp_magic, pp_magic);
NET_IOV_ASSERT_OFFSET(pp, pp);
+NET_IOV_ASSERT_OFFSET(_pp_mapping_pad, _pp_mapping_pad);
NET_IOV_ASSERT_OFFSET(dma_addr, dma_addr);
NET_IOV_ASSERT_OFFSET(pp_ref_count, pp_ref_count);
#undef NET_IOV_ASSERT_OFFSET
@@ -89,8 +163,7 @@ static inline unsigned int net_iov_idx(const struct net_iov *niov)
* typedef netmem_ref - a nonexistent type marking a reference to generic
* network memory.
*
- * A netmem_ref currently is always a reference to a struct page. This
- * abstraction is introduced so support for new memory types can be added.
+ * A netmem_ref can be a struct page* or a struct net_iov* underneath.
*
* Use the supplied helpers to obtain the underlying memory pointer and fields.
*/
@@ -117,9 +190,6 @@ static inline struct page *__netmem_to_page(netmem_ref netmem)
return (__force struct page *)netmem;
}
-/* This conversion fails (returns NULL) if the netmem_ref is not struct page
- * backed.
- */
static inline struct page *netmem_to_page(netmem_ref netmem)
{
if (WARN_ON_ONCE(netmem_is_net_iov(netmem)))
@@ -143,10 +213,9 @@ static inline netmem_ref net_iov_to_netmem(struct net_iov *niov)
return (__force netmem_ref)((unsigned long)niov | NET_IOV);
}
-static inline netmem_ref page_to_netmem(struct page *page)
-{
- return (__force netmem_ref)page;
-}
+#define page_to_netmem(p) (_Generic((p), \
+ const struct page * : (__force const netmem_ref)(p), \
+ struct page * : (__force netmem_ref)(p)))
/**
* virt_to_netmem - convert virtual memory pointer to a netmem reference
@@ -178,11 +247,61 @@ static inline unsigned long netmem_pfn_trace(netmem_ref netmem)
return page_to_pfn(netmem_to_page(netmem));
}
+/**
+ * __netmem_to_nmdesc - unsafely get pointer to the &netmem_desc backing
+ * @netmem
+ * @netmem: netmem reference to convert
+ *
+ * Unsafe version that can be used only when @netmem is always backed by
+ * system memory, performs faster and generates smaller object code (no
+ * check for the LSB, no WARN). When @netmem points to IOV, provokes
+ * undefined behaviour.
+ *
+ * Return: pointer to the &netmem_desc (garbage if @netmem is not backed
+ * by system memory).
+ */
+static inline struct netmem_desc *__netmem_to_nmdesc(netmem_ref netmem)
+{
+ return (__force struct netmem_desc *)netmem;
+}
+
+/* __netmem_clear_lsb - convert netmem_ref to struct net_iov * for access to
+ * common fields.
+ * @netmem: netmem reference to extract as net_iov.
+ *
+ * All the sub types of netmem_ref (page, net_iov) have the same pp, pp_magic,
+ * dma_addr, and pp_ref_count fields at the same offsets. Thus, we can access
+ * these fields without a type check to make sure that the underlying mem is
+ * net_iov or page.
+ *
+ * The resulting value of this function can only be used to access the fields
+ * that are NET_IOV_ASSERT_OFFSET'd. Accessing any other fields will result in
+ * undefined behavior.
+ *
+ * Return: the netmem_ref cast to net_iov* regardless of its underlying type.
+ */
static inline struct net_iov *__netmem_clear_lsb(netmem_ref netmem)
{
return (struct net_iov *)((__force unsigned long)netmem & ~NET_IOV);
}
+/* XXX: How to extract netmem_desc from page must be changed, once
+ * netmem_desc no longer overlays on page and will be allocated through
+ * slab.
+ */
+#define __pp_page_to_nmdesc(p) (_Generic((p), \
+ const struct page * : (const struct netmem_desc *)(p), \
+ struct page * : (struct netmem_desc *)(p)))
+
+/* CAUTION: Check if the page is a pp page before calling this helper or
+ * know it's a pp page.
+ */
+#define pp_page_to_nmdesc(p) \
+({ \
+ DEBUG_NET_WARN_ON_ONCE(!page_pool_page_is_pp(p)); \
+ __pp_page_to_nmdesc(p); \
+})
+
/**
* __netmem_get_pp - unsafely get pointer to the &page_pool backing @netmem
* @netmem: netmem reference to get the pointer from
@@ -196,7 +315,7 @@ static inline struct net_iov *__netmem_clear_lsb(netmem_ref netmem)
*/
static inline struct page_pool *__netmem_get_pp(netmem_ref netmem)
{
- return __netmem_to_page(netmem)->pp;
+ return __netmem_to_nmdesc(netmem)->pp;
}
static inline struct page_pool *netmem_get_pp(netmem_ref netmem)
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index bae914815aa3..ab74b5ed0b01 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -7,9 +7,6 @@
#include <linux/atomic.h>
#include <linux/workqueue.h>
#include <linux/netfilter/nf_conntrack_tcp.h>
-#ifdef CONFIG_NF_CT_PROTO_DCCP
-#include <linux/netfilter/nf_conntrack_dccp.h>
-#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
#include <linux/netfilter/nf_conntrack_sctp.h>
#endif
@@ -50,13 +47,6 @@ struct nf_icmp_net {
unsigned int timeout;
};
-#ifdef CONFIG_NF_CT_PROTO_DCCP
-struct nf_dccp_net {
- u8 dccp_loose;
- unsigned int dccp_timeout[CT_DCCP_MAX + 1];
-};
-#endif
-
#ifdef CONFIG_NF_CT_PROTO_SCTP
struct nf_sctp_net {
unsigned int timeouts[SCTP_CONNTRACK_MAX];
@@ -82,9 +72,6 @@ struct nf_ip_net {
struct nf_udp_net udp;
struct nf_icmp_net icmp;
struct nf_icmp_net icmpv6;
-#ifdef CONFIG_NF_CT_PROTO_DCCP
- struct nf_dccp_net dccp;
-#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
struct nf_sctp_net sctp;
#endif
diff --git a/include/net/netns/mctp.h b/include/net/netns/mctp.h
index 1db8f9aaddb4..89555f90b97b 100644
--- a/include/net/netns/mctp.h
+++ b/include/net/netns/mctp.h
@@ -6,19 +6,25 @@
#ifndef __NETNS_MCTP_H__
#define __NETNS_MCTP_H__
+#include <linux/hash.h>
+#include <linux/hashtable.h>
#include <linux/mutex.h>
#include <linux/types.h>
+#define MCTP_BINDS_BITS 7
+
struct netns_mctp {
/* Only updated under RTNL, entries freed via RCU */
struct list_head routes;
- /* Bound sockets: list of sockets bound by type.
- * This list is updated from non-atomic contexts (under bind_lock),
- * and read (under rcu) in packet rx
+ /* Bound sockets: hash table of sockets, keyed by
+ * (type, src_eid, dest_eid).
+ * Specific src_eid/dest_eid entries also have an entry for
+ * MCTP_ADDR_ANY. This list is updated from non-atomic contexts
+ * (under bind_lock), and read (under rcu) in packet rx.
*/
struct mutex bind_lock;
- struct hlist_head binds;
+ DECLARE_HASHTABLE(binds, MCTP_BINDS_BITS);
/* tag allocations. This list is read and updated from atomic contexts,
* but elements are free()ed after a RCU grace-period
@@ -34,4 +40,10 @@ struct netns_mctp {
struct list_head neighbours;
};
+static inline u32 mctp_bind_hash(u8 type, u8 local_addr, u8 peer_addr)
+{
+ return hash_32(type | (u32)local_addr << 8 | (u32)peer_addr << 16,
+ MCTP_BINDS_BITS);
+}
+
#endif /* __NETNS_MCTP_H__ */
diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h
index 93f2c31baf9b..db180626be06 100644
--- a/include/net/page_pool/helpers.h
+++ b/include/net/page_pool/helpers.h
@@ -153,6 +153,13 @@ static inline netmem_ref page_pool_dev_alloc_netmem(struct page_pool *pool,
return page_pool_alloc_netmem(pool, offset, size, gfp);
}
+static inline netmem_ref page_pool_dev_alloc_netmems(struct page_pool *pool)
+{
+ gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
+
+ return page_pool_alloc_netmems(pool, gfp);
+}
+
static inline struct page *page_pool_alloc(struct page_pool *pool,
unsigned int *offset,
unsigned int *size, gfp_t gfp)
@@ -437,12 +444,7 @@ static inline dma_addr_t page_pool_get_dma_addr_netmem(netmem_ref netmem)
*/
static inline dma_addr_t page_pool_get_dma_addr(const struct page *page)
{
- dma_addr_t ret = page->dma_addr;
-
- if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA)
- ret <<= PAGE_SHIFT;
-
- return ret;
+ return page_pool_get_dma_addr_netmem(page_to_netmem(page));
}
static inline void __page_pool_dma_sync_for_cpu(const struct page_pool *pool,
diff --git a/include/net/pfcp.h b/include/net/pfcp.h
index af14f970b80e..639553797d3e 100644
--- a/include/net/pfcp.h
+++ b/include/net/pfcp.h
@@ -45,7 +45,7 @@ struct pfcphdr_session {
reserved:4;
#elif defined(__BIG_ENDIAN_BITFIELD)
u8 reserved:4,
- message_priprity:4;
+ message_priority:4;
#else
#error "Please fix <asm/byteorder>"
#endif
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index b07b1cd14e9f..6a5ec1418e85 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -30,8 +30,6 @@ struct request_sock_ops {
unsigned int obj_size;
struct kmem_cache *slab;
char *slab_name;
- int (*rtx_syn_ack)(const struct sock *sk,
- struct request_sock *req);
void (*send_ack)(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req);
void (*send_reset)(const struct sock *sk,
@@ -41,8 +39,6 @@ struct request_sock_ops {
void (*syn_ack_timeout)(const struct request_sock *req);
};
-int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req);
-
struct saved_syn {
u32 mac_hdrlen;
u32 network_hdrlen;
diff --git a/include/net/route.h b/include/net/route.h
index 8e39aa822cf9..7ea840daa775 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -153,7 +153,7 @@ static inline void inet_sk_init_flowi4(const struct inet_sock *inet,
ip_sock_rt_tos(sk), ip_sock_rt_scope(sk),
sk->sk_protocol, inet_sk_flowi_flags(sk), daddr,
inet->inet_saddr, inet->inet_dport,
- inet->inet_sport, sk->sk_uid);
+ inet->inet_sport, sk_uid(sk));
security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4));
}
@@ -331,7 +331,7 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst,
flowi4_init_output(fl4, oif, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk),
ip_sock_rt_scope(sk), protocol, flow_flags, dst,
- src, dport, sport, sk->sk_uid);
+ src, dport, sport, sk_uid(sk));
}
static inline struct rtable *ip_route_connect(struct flowi4 *fl4, __be32 dst,
@@ -390,7 +390,7 @@ static inline int ip4_dst_hoplimit(const struct dst_entry *dst)
const struct net *net;
rcu_read_lock();
- net = dev_net_rcu(dst->dev);
+ net = dev_net_rcu(dst_dev(dst));
hoplimit = READ_ONCE(net->ipv4.sysctl_ip_default_ttl);
rcu_read_unlock();
}
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 1ad7ce71d0a7..8a540ad9b509 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -51,9 +51,9 @@
* We should wean ourselves off this.
*/
union sctp_addr {
+ struct sockaddr_inet sa; /* Large enough for both address families */
struct sockaddr_in v4;
struct sockaddr_in6 v6;
- struct sockaddr sa;
};
/* Forward declarations for data structures. */
diff --git a/include/net/sock.h b/include/net/sock.h
index 4c37015b7cf7..c8a4b283df6f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1553,7 +1553,7 @@ __sk_rmem_schedule(struct sock *sk, int size, bool pfmemalloc)
}
static inline bool
-sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size)
+sk_rmem_schedule(struct sock *sk, const struct sk_buff *skb, int size)
{
return __sk_rmem_schedule(sk, size, skb_pfmemalloc(skb));
}
@@ -2076,6 +2076,7 @@ static inline void sock_orphan(struct sock *sk)
sock_set_flag(sk, SOCK_DEAD);
sk_set_socket(sk, NULL);
sk->sk_wq = NULL;
+ /* Note: sk_uid is unchanged. */
write_unlock_bh(&sk->sk_callback_lock);
}
@@ -2086,18 +2087,23 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
rcu_assign_pointer(sk->sk_wq, &parent->wq);
parent->sk = sk;
sk_set_socket(sk, parent);
- sk->sk_uid = SOCK_INODE(parent)->i_uid;
+ WRITE_ONCE(sk->sk_uid, SOCK_INODE(parent)->i_uid);
security_sock_graft(sk, parent);
write_unlock_bh(&sk->sk_callback_lock);
}
-kuid_t sock_i_uid(struct sock *sk);
+static inline kuid_t sk_uid(const struct sock *sk)
+{
+ /* Paired with WRITE_ONCE() in sockfs_setattr() */
+ return READ_ONCE(sk->sk_uid);
+}
+
unsigned long __sock_i_ino(struct sock *sk);
unsigned long sock_i_ino(struct sock *sk);
static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk)
{
- return sk ? sk->sk_uid : make_kuid(net->user_ns, 0);
+ return sk ? sk_uid(sk) : make_kuid(net->user_ns, 0);
}
static inline u32 net_tx_rndhash(void)
@@ -2590,12 +2596,12 @@ static inline gfp_t gfp_memcg_charge(void)
static inline long sock_rcvtimeo(const struct sock *sk, bool noblock)
{
- return noblock ? 0 : sk->sk_rcvtimeo;
+ return noblock ? 0 : READ_ONCE(sk->sk_rcvtimeo);
}
static inline long sock_sndtimeo(const struct sock *sk, bool noblock)
{
- return noblock ? 0 : sk->sk_sndtimeo;
+ return noblock ? 0 : READ_ONCE(sk->sk_sndtimeo);
}
static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len)
@@ -2677,6 +2683,10 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk,
struct sk_buff *skb);
+bool skb_has_tx_timestamp(struct sk_buff *skb, const struct sock *sk);
+int skb_get_tx_timestamp(struct sk_buff *skb, struct sock *sk,
+ struct timespec64 *ts);
+
static inline void
sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
{
@@ -2982,7 +2992,6 @@ void sock_set_timestamp(struct sock *sk, int optname, bool valbool);
int sock_set_timestamping(struct sock *sk, int optname,
struct so_timestamping timestamping);
-void sock_enable_timestamps(struct sock *sk);
#if defined(CONFIG_CGROUP_BPF)
void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op);
#else
diff --git a/include/net/tc_act/tc_connmark.h b/include/net/tc_act/tc_connmark.h
index e8dd77a96748..a5ce83f3eea4 100644
--- a/include/net/tc_act/tc_connmark.h
+++ b/include/net/tc_act/tc_connmark.h
@@ -7,6 +7,7 @@
struct tcf_connmark_parms {
struct net *net;
u16 zone;
+ int action;
struct rcu_head rcu;
};
diff --git a/include/net/tc_act/tc_csum.h b/include/net/tc_act/tc_csum.h
index 68269e4581b7..8d0c7a9f9345 100644
--- a/include/net/tc_act/tc_csum.h
+++ b/include/net/tc_act/tc_csum.h
@@ -8,6 +8,7 @@
struct tcf_csum_params {
u32 update_flags;
+ int action;
struct rcu_head rcu;
};
@@ -18,15 +19,6 @@ struct tcf_csum {
};
#define to_tcf_csum(a) ((struct tcf_csum *)a)
-static inline bool is_tcf_csum(const struct tc_action *a)
-{
-#ifdef CONFIG_NET_CLS_ACT
- if (a->ops && a->ops->id == TCA_ID_CSUM)
- return true;
-#endif
- return false;
-}
-
static inline u32 tcf_csum_update_flags(const struct tc_action *a)
{
u32 update_flags;
diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h
index 77f87c622a2e..8b90c86c0b0d 100644
--- a/include/net/tc_act/tc_ct.h
+++ b/include/net/tc_act/tc_ct.h
@@ -13,7 +13,7 @@ struct tcf_ct_params {
struct nf_conntrack_helper *helper;
struct nf_conn *tmpl;
u16 zone;
-
+ int action;
u32 mark;
u32 mark_mask;
@@ -92,13 +92,4 @@ static inline void
tcf_ct_flow_table_restore_skb(struct sk_buff *skb, unsigned long cookie) { }
#endif
-static inline bool is_tcf_ct(const struct tc_action *a)
-{
-#if defined(CONFIG_NET_CLS_ACT) && IS_ENABLED(CONFIG_NF_CONNTRACK)
- if (a->ops && a->ops->id == TCA_ID_CT)
- return true;
-#endif
- return false;
-}
-
#endif /* __NET_TC_CT_H */
diff --git a/include/net/tc_act/tc_ctinfo.h b/include/net/tc_act/tc_ctinfo.h
index f071c1d70a25..7fe01ab236da 100644
--- a/include/net/tc_act/tc_ctinfo.h
+++ b/include/net/tc_act/tc_ctinfo.h
@@ -7,6 +7,7 @@
struct tcf_ctinfo_params {
struct rcu_head rcu;
struct net *net;
+ int action;
u32 dscpmask;
u32 dscpstatemask;
u32 cpmarkmask;
@@ -18,9 +19,9 @@ struct tcf_ctinfo_params {
struct tcf_ctinfo {
struct tc_action common;
struct tcf_ctinfo_params __rcu *params;
- u64 stats_dscp_set;
- u64 stats_dscp_error;
- u64 stats_cpmark_set;
+ atomic64_t stats_dscp_set;
+ atomic64_t stats_dscp_error;
+ atomic64_t stats_cpmark_set;
};
enum {
diff --git a/include/net/tc_act/tc_gate.h b/include/net/tc_act/tc_gate.h
index c8fa11ebb397..c1a67149c6b6 100644
--- a/include/net/tc_act/tc_gate.h
+++ b/include/net/tc_act/tc_gate.h
@@ -51,15 +51,6 @@ struct tcf_gate {
#define to_gate(a) ((struct tcf_gate *)a)
-static inline bool is_tcf_gate(const struct tc_action *a)
-{
-#ifdef CONFIG_NET_CLS_ACT
- if (a->ops && a->ops->id == TCA_ID_GATE)
- return true;
-#endif
- return false;
-}
-
static inline s32 tcf_gate_prio(const struct tc_action *a)
{
s32 tcfg_prio;
diff --git a/include/net/tc_act/tc_mpls.h b/include/net/tc_act/tc_mpls.h
index 721de4f5733a..dd067bd4018d 100644
--- a/include/net/tc_act/tc_mpls.h
+++ b/include/net/tc_act/tc_mpls.h
@@ -10,6 +10,7 @@
struct tcf_mpls_params {
int tcfm_action;
u32 tcfm_label;
+ int action; /* tcf_action */
u8 tcfm_tc;
u8 tcfm_ttl;
u8 tcfm_bos;
@@ -27,15 +28,6 @@ struct tcf_mpls {
};
#define to_mpls(a) ((struct tcf_mpls *)a)
-static inline bool is_tcf_mpls(const struct tc_action *a)
-{
-#ifdef CONFIG_NET_CLS_ACT
- if (a->ops && a->ops->id == TCA_ID_MPLS)
- return true;
-#endif
- return false;
-}
-
static inline u32 tcf_mpls_action(const struct tc_action *a)
{
u32 tcfm_action;
diff --git a/include/net/tc_act/tc_nat.h b/include/net/tc_act/tc_nat.h
index c869274ac529..ae35f4009445 100644
--- a/include/net/tc_act/tc_nat.h
+++ b/include/net/tc_act/tc_nat.h
@@ -6,6 +6,7 @@
#include <net/act_api.h>
struct tcf_nat_parms {
+ int action;
__be32 old_addr;
__be32 new_addr;
__be32 mask;
diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h
index 83fe39931781..f58ee15cd858 100644
--- a/include/net/tc_act/tc_pedit.h
+++ b/include/net/tc_act/tc_pedit.h
@@ -14,6 +14,7 @@ struct tcf_pedit_key_ex {
struct tcf_pedit_parms {
struct tc_pedit_key *tcfp_keys;
struct tcf_pedit_key_ex *tcfp_keys_ex;
+ int action;
u32 tcfp_off_max_hint;
unsigned char tcfp_nkeys;
unsigned char tcfp_flags;
diff --git a/include/net/tc_act/tc_police.h b/include/net/tc_act/tc_police.h
index 283bde711a42..a89fc8e68b1e 100644
--- a/include/net/tc_act/tc_police.h
+++ b/include/net/tc_act/tc_police.h
@@ -5,10 +5,11 @@
#include <net/act_api.h>
struct tcf_police_params {
+ int action;
int tcfp_result;
u32 tcfp_ewma_rate;
- s64 tcfp_burst;
u32 tcfp_mtu;
+ s64 tcfp_burst;
s64 tcfp_mtu_ptoks;
s64 tcfp_pkt_burst;
struct psched_ratecfg rate;
@@ -44,15 +45,6 @@ struct tc_police_compat {
struct tc_ratespec peakrate;
};
-static inline bool is_tcf_police(const struct tc_action *act)
-{
-#ifdef CONFIG_NET_CLS_ACT
- if (act->ops && act->ops->id == TCA_ID_POLICE)
- return true;
-#endif
- return false;
-}
-
static inline u64 tcf_police_rate_bytes_ps(const struct tc_action *act)
{
struct tcf_police *police = to_police(act);
diff --git a/include/net/tc_act/tc_sample.h b/include/net/tc_act/tc_sample.h
index b5d76305e854..abd163ca1864 100644
--- a/include/net/tc_act/tc_sample.h
+++ b/include/net/tc_act/tc_sample.h
@@ -17,15 +17,6 @@ struct tcf_sample {
};
#define to_sample(a) ((struct tcf_sample *)a)
-static inline bool is_tcf_sample(const struct tc_action *a)
-{
-#ifdef CONFIG_NET_CLS_ACT
- return a->ops && a->ops->id == TCA_ID_SAMPLE;
-#else
- return false;
-#endif
-}
-
static inline __u32 tcf_sample_rate(const struct tc_action *a)
{
return to_sample(a)->rate;
diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h
index 9649600fb3dc..31b2cd0bebb5 100644
--- a/include/net/tc_act/tc_skbedit.h
+++ b/include/net/tc_act/tc_skbedit.h
@@ -12,6 +12,7 @@
#include <linux/tc_act/tc_skbedit.h>
struct tcf_skbedit_params {
+ int action;
u32 flags;
u32 priority;
u32 mark;
diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h
index 904eddfc1826..3f5e9242b5e8 100644
--- a/include/net/tc_act/tc_vlan.h
+++ b/include/net/tc_act/tc_vlan.h
@@ -26,15 +26,6 @@ struct tcf_vlan {
};
#define to_vlan(a) ((struct tcf_vlan *)a)
-static inline bool is_tcf_vlan(const struct tc_action *a)
-{
-#ifdef CONFIG_NET_CLS_ACT
- if (a->ops && a->ops->id == TCA_ID_VLAN)
- return true;
-#endif
- return false;
-}
-
static inline u32 tcf_vlan_action(const struct tc_action *a)
{
u32 tcfv_action;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 5078ad868fee..b3815d104340 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -267,7 +267,6 @@ extern long sysctl_tcp_mem[3];
#define TCP_RACK_STATIC_REO_WND 0x2 /* Use static RACK reo wnd */
#define TCP_RACK_NO_DUPTHRESH 0x4 /* Do not use DUPACK threshold in RACK */
-extern atomic_long_t tcp_memory_allocated;
DECLARE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc);
extern struct percpu_counter tcp_sockets_allocated;
@@ -321,7 +320,7 @@ extern struct proto tcp_prot;
#define TCP_DEC_STATS(net, field) SNMP_DEC_STATS((net)->mib.tcp_statistics, field)
#define TCP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val)
-void tcp_tasklet_init(void);
+void tcp_tsq_work_init(void);
int tcp_v4_err(struct sk_buff *skb, u32);
@@ -1560,7 +1559,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
enum skb_drop_reason *reason);
-int tcp_filter(struct sock *sk, struct sk_buff *skb);
+int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason);
void tcp_set_state(struct sock *sk, int state);
void tcp_done(struct sock *sk);
int tcp_abort(struct sock *sk, int err);
@@ -1811,14 +1810,8 @@ static inline void tcp_mib_init(struct net *net)
}
/* from STCP */
-static inline void tcp_clear_retrans_hints_partial(struct tcp_sock *tp)
-{
- tp->lost_skb_hint = NULL;
-}
-
static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp)
{
- tcp_clear_retrans_hints_partial(tp);
tp->retransmit_skb_hint = NULL;
}
diff --git a/include/net/tcx.h b/include/net/tcx.h
index 5ce0ce9e0c02..23a61af13547 100644
--- a/include/net/tcx.h
+++ b/include/net/tcx.h
@@ -20,7 +20,6 @@ struct tcx_entry {
struct tcx_link {
struct bpf_link link;
struct net_device *dev;
- u32 location;
};
static inline void tcx_set_ingress(struct sk_buff *skb, bool ingress)
diff --git a/include/net/udp.h b/include/net/udp.h
index a772510b2aa5..f8ae2c4ade14 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -205,7 +205,6 @@ static inline void udp_hash4_dec(struct udp_hslot *hslot2)
extern struct proto udp_prot;
-extern atomic_long_t udp_memory_allocated;
DECLARE_PER_CPU(int, udp_memory_per_cpu_fw_alloc);
/* sysctl variables for udp */
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index 2df3b8344eb5..9acef2fbd2fd 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -130,35 +130,20 @@ void udp_tunnel_drop_rx_port(struct net_device *dev, struct socket *sock,
void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type);
void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type);
-static inline void udp_tunnel_get_rx_info(struct net_device *dev)
-{
- ASSERT_RTNL();
- if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT))
- return;
- call_netdevice_notifiers(NETDEV_UDP_TUNNEL_PUSH_INFO, dev);
-}
-
-static inline void udp_tunnel_drop_rx_info(struct net_device *dev)
-{
- ASSERT_RTNL();
- if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT))
- return;
- call_netdevice_notifiers(NETDEV_UDP_TUNNEL_DROP_INFO, dev);
-}
-
/* Transmit the skb using UDP encapsulation. */
void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
__be32 src, __be32 dst, __u8 tos, __u8 ttl,
__be16 df, __be16 src_port, __be16 dst_port,
- bool xnet, bool nocheck);
+ bool xnet, bool nocheck, u16 ipcb_flags);
-int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb,
- struct net_device *dev,
- const struct in6_addr *saddr,
- const struct in6_addr *daddr,
- __u8 prio, __u8 ttl, __be32 label,
- __be16 src_port, __be16 dst_port, bool nocheck);
+void udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb,
+ struct net_device *dev,
+ const struct in6_addr *saddr,
+ const struct in6_addr *daddr,
+ __u8 prio, __u8 ttl, __be32 label,
+ __be16 src_port, __be16 dst_port, bool nocheck,
+ u16 ip6cb_flags);
void udp_tunnel_sock_release(struct socket *sock);
@@ -221,19 +206,17 @@ static inline void udp_tunnel_encap_enable(struct sock *sk)
#define UDP_TUNNEL_NIC_MAX_TABLES 4
enum udp_tunnel_nic_info_flags {
- /* Device callbacks may sleep */
- UDP_TUNNEL_NIC_INFO_MAY_SLEEP = BIT(0),
/* Device only supports offloads when it's open, all ports
* will be removed before close and re-added after open.
*/
- UDP_TUNNEL_NIC_INFO_OPEN_ONLY = BIT(1),
+ UDP_TUNNEL_NIC_INFO_OPEN_ONLY = BIT(0),
/* Device supports only IPv4 tunnels */
- UDP_TUNNEL_NIC_INFO_IPV4_ONLY = BIT(2),
+ UDP_TUNNEL_NIC_INFO_IPV4_ONLY = BIT(1),
/* Device has hard-coded the IANA VXLAN port (4789) as VXLAN.
* This port must not be counted towards n_entries of any table.
* Driver will not receive any callback associated with port 4789.
*/
- UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN = BIT(3),
+ UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN = BIT(2),
};
struct udp_tunnel_nic;
@@ -324,6 +307,9 @@ struct udp_tunnel_nic_ops {
size_t (*dump_size)(struct net_device *dev, unsigned int table);
int (*dump_write)(struct net_device *dev, unsigned int table,
struct sk_buff *skb);
+ void (*assert_locked)(struct net_device *dev);
+ void (*lock)(struct net_device *dev);
+ void (*unlock)(struct net_device *dev);
};
#ifdef CONFIG_INET
@@ -352,8 +338,28 @@ static inline void
udp_tunnel_nic_set_port_priv(struct net_device *dev, unsigned int table,
unsigned int idx, u8 priv)
{
- if (udp_tunnel_nic_ops)
+ if (udp_tunnel_nic_ops) {
+ udp_tunnel_nic_ops->assert_locked(dev);
udp_tunnel_nic_ops->set_port_priv(dev, table, idx, priv);
+ }
+}
+
+static inline void udp_tunnel_nic_assert_locked(struct net_device *dev)
+{
+ if (udp_tunnel_nic_ops)
+ udp_tunnel_nic_ops->assert_locked(dev);
+}
+
+static inline void udp_tunnel_nic_lock(struct net_device *dev)
+{
+ if (udp_tunnel_nic_ops)
+ udp_tunnel_nic_ops->lock(dev);
+}
+
+static inline void udp_tunnel_nic_unlock(struct net_device *dev)
+{
+ if (udp_tunnel_nic_ops)
+ udp_tunnel_nic_ops->unlock(dev);
}
static inline void
@@ -395,17 +401,50 @@ static inline void udp_tunnel_nic_reset_ntf(struct net_device *dev)
static inline size_t
udp_tunnel_nic_dump_size(struct net_device *dev, unsigned int table)
{
+ size_t ret;
+
if (!udp_tunnel_nic_ops)
return 0;
- return udp_tunnel_nic_ops->dump_size(dev, table);
+
+ udp_tunnel_nic_ops->lock(dev);
+ ret = udp_tunnel_nic_ops->dump_size(dev, table);
+ udp_tunnel_nic_ops->unlock(dev);
+
+ return ret;
}
static inline int
udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table,
struct sk_buff *skb)
{
+ int ret;
+
if (!udp_tunnel_nic_ops)
return 0;
- return udp_tunnel_nic_ops->dump_write(dev, table, skb);
+
+ udp_tunnel_nic_ops->lock(dev);
+ ret = udp_tunnel_nic_ops->dump_write(dev, table, skb);
+ udp_tunnel_nic_ops->unlock(dev);
+
+ return ret;
+}
+
+static inline void udp_tunnel_get_rx_info(struct net_device *dev)
+{
+ ASSERT_RTNL();
+ if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT))
+ return;
+ udp_tunnel_nic_assert_locked(dev);
+ call_netdevice_notifiers(NETDEV_UDP_TUNNEL_PUSH_INFO, dev);
}
+
+static inline void udp_tunnel_drop_rx_info(struct net_device *dev)
+{
+ ASSERT_RTNL();
+ if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT))
+ return;
+ udp_tunnel_nic_assert_locked(dev);
+ call_netdevice_notifiers(NETDEV_UDP_TUNNEL_DROP_INFO, dev);
+}
+
#endif
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index e2f7ca045d3e..0ee50785f4f1 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -332,6 +332,7 @@ struct vxlan_dev {
#define VXLAN_F_VNIFILTER 0x20000
#define VXLAN_F_MDB 0x40000
#define VXLAN_F_LOCALBYPASS 0x80000
+#define VXLAN_F_MC_ROUTE 0x100000
/* Flags that are used in the receive path. These flags must match in
* order for a socket to be shareable
@@ -353,7 +354,9 @@ struct vxlan_dev {
VXLAN_F_UDP_ZERO_CSUM6_RX | \
VXLAN_F_COLLECT_METADATA | \
VXLAN_F_VNIFILTER | \
- VXLAN_F_LOCALBYPASS)
+ VXLAN_F_LOCALBYPASS | \
+ VXLAN_F_MC_ROUTE | \
+ 0)
struct net_device *vxlan_dev_create(struct net *net, const char *name,
u8 name_assign_type, struct vxlan_config *conf);
diff --git a/include/net/x25.h b/include/net/x25.h
index 5e833cfc864e..414f3fd99345 100644
--- a/include/net/x25.h
+++ b/include/net/x25.h
@@ -203,7 +203,6 @@ void x25_send_frame(struct sk_buff *, struct x25_neigh *);
int x25_lapb_receive_frame(struct sk_buff *, struct net_device *,
struct packet_type *, struct net_device *);
void x25_establish_link(struct x25_neigh *);
-void x25_terminate_link(struct x25_neigh *);
/* x25_facilities.c */
int x25_parse_facilities(struct sk_buff *, struct x25_facilities *,
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index e8bd6ddb7b12..ce587a225661 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -84,6 +84,7 @@ struct xdp_sock {
struct list_head map_list;
/* Protects map_list */
spinlock_t map_list_lock;
+ u32 max_tx_budget;
/* Protects multiple processes in the control path */
struct mutex mutex;
struct xsk_queue *fq_tmp; /* Only as tmp storage before bind */