diff options
Diffstat (limited to 'include/net')
106 files changed, 4508 insertions, 850 deletions
diff --git a/include/net/act_api.h b/include/net/act_api.h index 404df8557f6a..2894cfff2da3 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -76,19 +76,24 @@ static inline void tcf_lastuse_update(struct tcf_t *tm) { unsigned long now = jiffies; - if (tm->lastuse != now) - tm->lastuse = now; - if (unlikely(!tm->firstuse)) - tm->firstuse = now; + if (READ_ONCE(tm->lastuse) != now) + WRITE_ONCE(tm->lastuse, now); + if (unlikely(!READ_ONCE(tm->firstuse))) + WRITE_ONCE(tm->firstuse, now); } static inline void tcf_tm_dump(struct tcf_t *dtm, const struct tcf_t *stm) { - dtm->install = jiffies_to_clock_t(jiffies - stm->install); - dtm->lastuse = jiffies_to_clock_t(jiffies - stm->lastuse); - dtm->firstuse = stm->firstuse ? - jiffies_to_clock_t(jiffies - stm->firstuse) : 0; - dtm->expires = jiffies_to_clock_t(stm->expires); + unsigned long firstuse, now = jiffies; + + dtm->install = jiffies_to_clock_t(now - READ_ONCE(stm->install)); + dtm->lastuse = jiffies_to_clock_t(now - READ_ONCE(stm->lastuse)); + + firstuse = READ_ONCE(stm->firstuse); + dtm->firstuse = firstuse ? + jiffies_to_clock_t(now - firstuse) : 0; + + dtm->expires = jiffies_to_clock_t(READ_ONCE(stm->expires)); } static inline enum flow_action_hw_stats tc_act_hw_stats(u8 hw_stats) @@ -170,14 +175,12 @@ static inline void tc_action_net_exit(struct list_head *net_list, { struct net *net; - rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { struct tc_action_net *tn = net_generic(net, id); tcf_idrinfo_destroy(tn->ops, tn->idrinfo); kfree(tn->idrinfo); } - rtnl_unlock(); } int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index cf793d18e5df..0fb4c41c9bbf 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -16,6 +16,7 @@ struct sock; struct socket; struct rxrpc_call; struct rxrpc_peer; +struct krb5_buffer; enum rxrpc_abort_reason; enum rxrpc_interruptibility { @@ -24,23 +25,33 @@ enum rxrpc_interruptibility { RXRPC_UNINTERRUPTIBLE, /* Call should not be interruptible at all */ }; +enum rxrpc_oob_type { + RXRPC_OOB_CHALLENGE, /* Security challenge for a connection */ +}; + /* * Debug ID counter for tracing. */ extern atomic_t rxrpc_debug_id; +/* + * Operations table for rxrpc to call out to a kernel application (e.g. kAFS). + */ +struct rxrpc_kernel_ops { + void (*notify_new_call)(struct sock *sk, struct rxrpc_call *call, + unsigned long user_call_ID); + void (*discard_new_call)(struct rxrpc_call *call, unsigned long user_call_ID); + void (*user_attach_call)(struct rxrpc_call *call, unsigned long user_call_ID); + void (*notify_oob)(struct sock *sk, struct sk_buff *oob); +}; + typedef void (*rxrpc_notify_rx_t)(struct sock *, struct rxrpc_call *, unsigned long); typedef void (*rxrpc_notify_end_tx_t)(struct sock *, struct rxrpc_call *, unsigned long); -typedef void (*rxrpc_notify_new_call_t)(struct sock *, struct rxrpc_call *, - unsigned long); -typedef void (*rxrpc_discard_new_call_t)(struct rxrpc_call *, unsigned long); -typedef void (*rxrpc_user_attach_call_t)(struct rxrpc_call *, unsigned long); -void rxrpc_kernel_new_call_notification(struct socket *, - rxrpc_notify_new_call_t, - rxrpc_discard_new_call_t); +void rxrpc_kernel_set_notifications(struct socket *sock, + const struct rxrpc_kernel_ops *app_ops); struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, struct rxrpc_peer *peer, struct key *key, @@ -72,16 +83,33 @@ const struct sockaddr *rxrpc_kernel_remote_addr(const struct rxrpc_peer *peer); unsigned long rxrpc_kernel_set_peer_data(struct rxrpc_peer *peer, unsigned long app_data); unsigned long rxrpc_kernel_get_peer_data(const struct rxrpc_peer *peer); unsigned int rxrpc_kernel_get_srtt(const struct rxrpc_peer *); -int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t, - rxrpc_user_attach_call_t, unsigned long, gfp_t, - unsigned int); +int rxrpc_kernel_charge_accept(struct socket *sock, rxrpc_notify_rx_t notify_rx, + unsigned long user_call_ID, gfp_t gfp, + unsigned int debug_id); void rxrpc_kernel_set_tx_length(struct socket *, struct rxrpc_call *, s64); bool rxrpc_kernel_check_life(const struct socket *, const struct rxrpc_call *); -u32 rxrpc_kernel_get_epoch(struct socket *, struct rxrpc_call *); -void rxrpc_kernel_set_max_life(struct socket *, struct rxrpc_call *, - unsigned long); int rxrpc_sock_set_min_security_level(struct sock *sk, unsigned int val); int rxrpc_sock_set_security_keyring(struct sock *, struct key *); +int rxrpc_sock_set_manage_response(struct sock *sk, bool set); + +enum rxrpc_oob_type rxrpc_kernel_query_oob(struct sk_buff *oob, + struct rxrpc_peer **_peer, + unsigned long *_peer_appdata); +struct sk_buff *rxrpc_kernel_dequeue_oob(struct socket *sock, + enum rxrpc_oob_type *_type); +void rxrpc_kernel_free_oob(struct sk_buff *oob); +void rxrpc_kernel_query_challenge(struct sk_buff *challenge, + struct rxrpc_peer **_peer, + unsigned long *_peer_appdata, + u16 *_service_id, u8 *_security_index); +int rxrpc_kernel_reject_challenge(struct sk_buff *challenge, u32 abort_code, + int error, enum rxrpc_abort_reason why); +int rxkad_kernel_respond_to_challenge(struct sk_buff *challenge); +u32 rxgk_kernel_query_challenge(struct sk_buff *challenge); +int rxgk_kernel_respond_to_challenge(struct sk_buff *challenge, + struct krb5_buffer *appdata); +u8 rxrpc_kernel_query_call_security(struct rxrpc_call *call, + u16 *_service_id, u32 *_enctype); #endif /* _NET_RXRPC_H */ diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 1af1841b7601..34f53dde65ce 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -47,6 +47,8 @@ struct unix_sock { #define peer_wait peer_wq.wait wait_queue_entry_t peer_wake; struct scm_stat scm_stat; + int inq_len; + bool recvmsg_inq; #if IS_ENABLED(CONFIG_AF_UNIX_OOB) struct sk_buff *oob_skb; #endif diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 9e85424c8343..d40e978126e3 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -221,6 +221,7 @@ void vsock_for_each_connected_socket(struct vsock_transport *transport, void (*fn)(struct sock *sk)); int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk); bool vsock_find_cid(unsigned int cid); +void vsock_linger(struct sock *sk); /**** TAP ****/ @@ -242,8 +243,8 @@ int __vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); -#ifdef CONFIG_BPF_SYSCALL extern struct proto vsock_proto; +#ifdef CONFIG_BPF_SYSCALL int vsock_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); void __init vsock_bpf_build_proto(void); #else diff --git a/include/net/aligned_data.h b/include/net/aligned_data.h new file mode 100644 index 000000000000..e1a1c8aedc79 --- /dev/null +++ b/include/net/aligned_data.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _NET_ALIGNED_DATA_H +#define _NET_ALIGNED_DATA_H + +#include <linux/atomic.h> +#include <linux/types.h> + +/* Structure holding cacheline aligned fields on SMP builds. + * Each field or group should have an ____cacheline_aligned_in_smp + * attribute to ensure no accidental false sharing can happen. + */ +struct net_aligned_data { + atomic64_t net_cookie ____cacheline_aligned_in_smp; +#if defined(CONFIG_INET) + atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp; + atomic_long_t udp_memory_allocated ____cacheline_aligned_in_smp; +#endif +}; + +extern struct net_aligned_data net_aligned_data; + +#endif /* _NET_ALIGNED_DATA_H */ diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index bbefde319f95..ada5b56a4413 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -29,6 +29,7 @@ #include <linux/poll.h> #include <net/sock.h> #include <linux/seq_file.h> +#include <linux/ethtool.h> #define BT_SUBSYS_VERSION 2 #define BT_SUBSYS_REVISION 22 @@ -243,6 +244,12 @@ struct bt_codecs { #define BT_ISO_BASE 20 +/* Socket option value 21 reserved */ + +#define BT_PKT_SEQNUM 22 + +#define BT_SCM_PKT_SEQNUM 0x05 + __printf(1, 2) void bt_info(const char *fmt, ...); __printf(1, 2) @@ -390,7 +397,8 @@ struct bt_sock { enum { BT_SK_DEFER_SETUP, BT_SK_SUSPEND, - BT_SK_PKT_STATUS + BT_SK_PKT_STATUS, + BT_SK_PKT_SEQNUM, }; struct bt_sock_list { @@ -448,6 +456,9 @@ void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status, hci_req_complete_t *req_complete, hci_req_complete_skb_t *req_complete_skb); +int hci_ethtool_ts_info(unsigned int index, int sk_proto, + struct kernel_ethtool_ts_info *ts_info); + #define HCI_REQ_START BIT(0) #define HCI_REQ_SKB BIT(1) @@ -471,6 +482,7 @@ struct bt_skb_cb { u8 pkt_type; u8 force_active; u16 expect; + u16 pkt_seqnum; u8 incoming:1; u8 pkt_status:2; union { @@ -484,6 +496,7 @@ struct bt_skb_cb { #define hci_skb_pkt_type(skb) bt_cb((skb))->pkt_type #define hci_skb_pkt_status(skb) bt_cb((skb))->pkt_status +#define hci_skb_pkt_seqnum(skb) bt_cb((skb))->pkt_seqnum #define hci_skb_expect(skb) bt_cb((skb))->expect #define hci_skb_opcode(skb) bt_cb((skb))->hci.opcode #define hci_skb_event(skb) bt_cb((skb))->hci.req_event diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index a8586c3058c7..df1847b74e55 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -377,6 +377,8 @@ enum { * This quirk must be set before hci_register_dev is called. */ HCI_QUIRK_BROKEN_READ_PAGE_SCAN_TYPE, + + __HCI_NUM_QUIRKS, }; /* HCI device flags */ @@ -494,6 +496,7 @@ enum { #define HCI_EVENT_PKT 0x04 #define HCI_ISODATA_PKT 0x05 #define HCI_DIAG_PKT 0xf0 +#define HCI_DRV_PKT 0xf1 #define HCI_VENDOR_PKT 0xff /* HCI packet types */ @@ -557,7 +560,9 @@ enum { #define ESCO_LINK 0x02 /* Low Energy links do not have defined link type. Use invented one */ #define LE_LINK 0x80 -#define ISO_LINK 0x82 +#define CIS_LINK 0x82 +#define BIS_LINK 0x83 +#define PA_LINK 0x84 #define INVALID_LINK 0xff /* LMP features */ @@ -1931,6 +1936,8 @@ struct hci_cp_le_pa_create_sync { __u8 sync_cte_type; } __packed; +#define HCI_OP_LE_PA_CREATE_SYNC_CANCEL 0x2045 + #define HCI_OP_LE_PA_TERM_SYNC 0x2046 struct hci_cp_le_pa_term_sync { __le16 handle; @@ -2628,6 +2635,7 @@ struct hci_ev_le_conn_complete { #define LE_EXT_ADV_DIRECT_IND 0x0004 #define LE_EXT_ADV_SCAN_RSP 0x0008 #define LE_EXT_ADV_LEGACY_PDU 0x0010 +#define LE_EXT_ADV_DATA_STATUS_MASK 0x0060 #define LE_EXT_ADV_EVT_TYPE_MASK 0x007f #define ADDR_LE_DEV_PUBLIC 0x00 @@ -2830,8 +2838,8 @@ struct hci_evt_le_create_big_complete { __le16 bis_handle[]; } __packed; -#define HCI_EVT_LE_BIG_SYNC_ESTABILISHED 0x1d -struct hci_evt_le_big_sync_estabilished { +#define HCI_EVT_LE_BIG_SYNC_ESTABLISHED 0x1d +struct hci_evt_le_big_sync_established { __u8 status; __u8 handle; __u8 latency[3]; @@ -2845,6 +2853,12 @@ struct hci_evt_le_big_sync_estabilished { __le16 bis[]; } __packed; +#define HCI_EVT_LE_BIG_SYNC_LOST 0x1e +struct hci_evt_le_big_sync_lost { + __u8 handle; + __u8 reason; +} __packed; + #define HCI_EVT_LE_BIG_INFO_ADV_REPORT 0x22 struct hci_evt_le_big_info_adv_report { __le16 sync_handle; diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 5115da34f881..4dc11c66f7b8 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -29,8 +29,11 @@ #include <linux/idr.h> #include <linux/leds.h> #include <linux/rculist.h> +#include <linux/spinlock.h> +#include <linux/srcu.h> #include <net/bluetooth/hci.h> +#include <net/bluetooth/hci_drv.h> #include <net/bluetooth/hci_sync.h> #include <net/bluetooth/hci_sock.h> #include <net/bluetooth/coredump.h> @@ -92,6 +95,7 @@ struct discovery_state { u16 uuid_count; u8 (*uuids)[16]; unsigned long name_resolve_timeout; + spinlock_t lock; }; #define SUSPEND_NOTIFIER_TIMEOUT msecs_to_jiffies(2000) /* 2 seconds */ @@ -241,6 +245,7 @@ struct adv_info { __u8 mesh; __u8 instance; __u8 handle; + __u8 sid; __u32 flags; __u16 timeout; __u16 remaining_time; @@ -345,6 +350,7 @@ struct adv_monitor { struct hci_dev { struct list_head list; + struct srcu_struct srcu; struct mutex lock; struct ida unset_handle_ida; @@ -460,7 +466,7 @@ struct hci_dev { unsigned int auto_accept_delay; - unsigned long quirks; + DECLARE_BITMAP(quirk_flags, __HCI_NUM_QUIRKS); atomic_t cmd_cnt; unsigned int acl_cnt; @@ -545,6 +551,7 @@ struct hci_dev { struct hci_conn_hash conn_hash; struct list_head mesh_pending; + struct mutex mgmt_pending_lock; struct list_head mgmt_pending; struct list_head reject_list; struct list_head accept_list; @@ -613,6 +620,8 @@ struct hci_dev { struct list_head monitored_devices; bool advmon_pend_notify; + struct hci_drv *hci_drv; + #if IS_ENABLED(CONFIG_BT_LEDS) struct led_trigger *power_led; #endif @@ -649,6 +658,10 @@ struct hci_dev { u8 (*classify_pkt_type)(struct hci_dev *hdev, struct sk_buff *skb); }; +#define hci_set_quirk(hdev, nr) set_bit((nr), (hdev)->quirk_flags) +#define hci_clear_quirk(hdev, nr) clear_bit((nr), (hdev)->quirk_flags) +#define hci_test_quirk(hdev, nr) test_bit((nr), (hdev)->quirk_flags) + #define HCI_PHY_HANDLE(handle) (handle & 0xff) enum conn_reasons { @@ -822,20 +835,20 @@ extern struct mutex hci_cb_list_lock; #define hci_dev_test_and_clear_flag(hdev, nr) test_and_clear_bit((nr), (hdev)->dev_flags) #define hci_dev_test_and_change_flag(hdev, nr) test_and_change_bit((nr), (hdev)->dev_flags) -#define hci_dev_clear_volatile_flags(hdev) \ - do { \ - hci_dev_clear_flag(hdev, HCI_LE_SCAN); \ - hci_dev_clear_flag(hdev, HCI_LE_ADV); \ - hci_dev_clear_flag(hdev, HCI_LL_RPA_RESOLUTION);\ - hci_dev_clear_flag(hdev, HCI_PERIODIC_INQ); \ - hci_dev_clear_flag(hdev, HCI_QUALITY_REPORT); \ +#define hci_dev_clear_volatile_flags(hdev) \ + do { \ + hci_dev_clear_flag((hdev), HCI_LE_SCAN); \ + hci_dev_clear_flag((hdev), HCI_LE_ADV); \ + hci_dev_clear_flag((hdev), HCI_LL_RPA_RESOLUTION); \ + hci_dev_clear_flag((hdev), HCI_PERIODIC_INQ); \ + hci_dev_clear_flag((hdev), HCI_QUALITY_REPORT); \ } while (0) #define hci_dev_le_state_simultaneous(hdev) \ - (!test_bit(HCI_QUIRK_BROKEN_LE_STATES, &hdev->quirks) && \ - (hdev->le_states[4] & 0x08) && /* Central */ \ - (hdev->le_states[4] & 0x40) && /* Peripheral */ \ - (hdev->le_states[3] & 0x10)) /* Simultaneous */ + (!hci_test_quirk((hdev), HCI_QUIRK_BROKEN_LE_STATES) && \ + ((hdev)->le_states[4] & 0x08) && /* Central */ \ + ((hdev)->le_states[4] & 0x40) && /* Peripheral */ \ + ((hdev)->le_states[3] & 0x10)) /* Simultaneous */ /* ----- HCI interface to upper protocols ----- */ int l2cap_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr); @@ -878,6 +891,7 @@ static inline void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, static inline void discovery_init(struct hci_dev *hdev) { + spin_lock_init(&hdev->discovery.lock); hdev->discovery.state = DISCOVERY_STOPPED; INIT_LIST_HEAD(&hdev->discovery.all); INIT_LIST_HEAD(&hdev->discovery.unknown); @@ -892,8 +906,11 @@ static inline void hci_discovery_filter_clear(struct hci_dev *hdev) hdev->discovery.report_invalid_rssi = true; hdev->discovery.rssi = HCI_RSSI_INVALID; hdev->discovery.uuid_count = 0; + + spin_lock(&hdev->discovery.lock); kfree(hdev->discovery.uuids); hdev->discovery.uuids = NULL; + spin_unlock(&hdev->discovery.lock); } bool hci_discovery_active(struct hci_dev *hdev); @@ -996,7 +1013,9 @@ static inline void hci_conn_hash_add(struct hci_dev *hdev, struct hci_conn *c) case ESCO_LINK: h->sco_num++; break; - case ISO_LINK: + case CIS_LINK: + case BIS_LINK: + case PA_LINK: h->iso_num++; break; } @@ -1022,7 +1041,9 @@ static inline void hci_conn_hash_del(struct hci_dev *hdev, struct hci_conn *c) case ESCO_LINK: h->sco_num--; break; - case ISO_LINK: + case CIS_LINK: + case BIS_LINK: + case PA_LINK: h->iso_num--; break; } @@ -1039,7 +1060,9 @@ static inline unsigned int hci_conn_num(struct hci_dev *hdev, __u8 type) case SCO_LINK: case ESCO_LINK: return h->sco_num; - case ISO_LINK: + case CIS_LINK: + case BIS_LINK: + case PA_LINK: return h->iso_num; default: return 0; @@ -1100,7 +1123,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_bis(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (bacmp(&c->dst, ba) || c->type != ISO_LINK) + if (bacmp(&c->dst, ba) || c->type != BIS_LINK) continue; if (c->iso_qos.bcast.bis == bis) { @@ -1113,10 +1136,8 @@ static inline struct hci_conn *hci_conn_hash_lookup_bis(struct hci_dev *hdev, return NULL; } -static inline struct hci_conn *hci_conn_hash_lookup_sid(struct hci_dev *hdev, - __u8 sid, - bdaddr_t *dst, - __u8 dst_type) +static inline struct hci_conn * +hci_conn_hash_lookup_create_pa_sync(struct hci_dev *hdev) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; @@ -1124,8 +1145,10 @@ static inline struct hci_conn *hci_conn_hash_lookup_sid(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK || bacmp(&c->dst, dst) || - c->dst_type != dst_type || c->sid != sid) + if (c->type != PA_LINK) + continue; + + if (!test_bit(HCI_CONN_CREATE_PA_SYNC, &c->flags)) continue; rcu_read_unlock(); @@ -1148,8 +1171,8 @@ hci_conn_hash_lookup_per_adv_bis(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (bacmp(&c->dst, ba) || c->type != ISO_LINK || - !test_bit(HCI_CONN_PER_ADV, &c->flags)) + if (bacmp(&c->dst, ba) || c->type != BIS_LINK || + !test_bit(HCI_CONN_PER_ADV, &c->flags)) continue; if (c->iso_qos.bcast.big == big && @@ -1238,7 +1261,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_cis(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK || !bacmp(&c->dst, BDADDR_ANY)) + if (c->type != CIS_LINK) continue; /* Match CIG ID if set */ @@ -1270,7 +1293,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_cig(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK || !bacmp(&c->dst, BDADDR_ANY)) + if (c->type != CIS_LINK) continue; if (handle == c->iso_qos.ucast.cig) { @@ -1293,17 +1316,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_big(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK) - continue; - - /* An ISO_LINK hcon with BDADDR_ANY as destination - * address is a Broadcast connection. A Broadcast - * slave connection is associated with a PA train, - * so the sync_handle can be used to differentiate - * from unicast. - */ - if (bacmp(&c->dst, BDADDR_ANY) && - c->sync_handle == HCI_SYNC_HANDLE_INVALID) + if (c->type != BIS_LINK) continue; if (handle == c->iso_qos.bcast.big) { @@ -1327,7 +1340,7 @@ hci_conn_hash_lookup_big_sync_pend(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK) + if (c->type != PA_LINK) continue; if (handle == c->iso_qos.bcast.big && num_bis == c->num_bis) { @@ -1342,7 +1355,8 @@ hci_conn_hash_lookup_big_sync_pend(struct hci_dev *hdev, } static inline struct hci_conn * -hci_conn_hash_lookup_big_state(struct hci_dev *hdev, __u8 handle, __u16 state) +hci_conn_hash_lookup_big_state(struct hci_dev *hdev, __u8 handle, __u16 state, + __u8 role) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; @@ -1350,8 +1364,7 @@ hci_conn_hash_lookup_big_state(struct hci_dev *hdev, __u8 handle, __u16 state) rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (bacmp(&c->dst, BDADDR_ANY) || c->type != ISO_LINK || - c->state != state) + if (c->type != BIS_LINK || c->state != state || c->role != role) continue; if (handle == c->iso_qos.bcast.big) { @@ -1374,8 +1387,8 @@ hci_conn_hash_lookup_pa_sync_big_handle(struct hci_dev *hdev, __u8 big) rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK || - !test_bit(HCI_CONN_PA_SYNC, &c->flags)) + if (c->type != BIS_LINK || + !test_bit(HCI_CONN_PA_SYNC, &c->flags)) continue; if (c->iso_qos.bcast.big == big) { @@ -1397,7 +1410,7 @@ hci_conn_hash_lookup_pa_sync_handle(struct hci_dev *hdev, __u16 sync_handle) rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK) + if (c->type != PA_LINK) continue; /* Ignore the listen hcon, we are looking @@ -1417,26 +1430,6 @@ hci_conn_hash_lookup_pa_sync_handle(struct hci_dev *hdev, __u16 sync_handle) return NULL; } -static inline struct hci_conn *hci_conn_hash_lookup_state(struct hci_dev *hdev, - __u8 type, __u16 state) -{ - struct hci_conn_hash *h = &hdev->conn_hash; - struct hci_conn *c; - - rcu_read_lock(); - - list_for_each_entry_rcu(c, &h->list, list) { - if (c->type == type && c->state == state) { - rcu_read_unlock(); - return c; - } - } - - rcu_read_unlock(); - - return NULL; -} - typedef void (*hci_conn_func_t)(struct hci_conn *conn, void *data); static inline void hci_conn_hash_list_state(struct hci_dev *hdev, hci_conn_func_t func, __u8 type, @@ -1524,8 +1517,6 @@ bool hci_setup_sync(struct hci_conn *conn, __u16 handle); void hci_sco_setup(struct hci_conn *conn, __u8 status); bool hci_iso_setup_path(struct hci_conn *conn); int hci_le_create_cis_pending(struct hci_dev *hdev); -int hci_pa_create_sync_pending(struct hci_dev *hdev); -int hci_le_big_create_sync_pending(struct hci_dev *hdev); int hci_conn_check_create_cis(struct hci_conn *conn); struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, @@ -1556,19 +1547,20 @@ struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst, u16 timeout); struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, struct bt_iso_qos *qos); -struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, +struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, struct bt_iso_qos *qos, __u8 base_len, __u8 *base); struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, struct bt_iso_qos *qos); struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, - __u8 dst_type, struct bt_iso_qos *qos, + __u8 dst_type, __u8 sid, + struct bt_iso_qos *qos, __u8 data_len, __u8 *data); struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, __u8 sid, struct bt_iso_qos *qos); -int hci_le_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon, - struct bt_iso_qos *qos, - __u16 sync_handle, __u8 num_bis, __u8 bis[]); +int hci_conn_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon, + struct bt_iso_qos *qos, __u16 sync_handle, + __u8 num_bis, __u8 bis[]); int hci_conn_check_link_mode(struct hci_conn *conn); int hci_conn_check_secure(struct hci_conn *conn, __u8 sec_level); int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type, @@ -1800,6 +1792,7 @@ struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list, void hci_uuids_clear(struct hci_dev *hdev); void hci_link_keys_clear(struct hci_dev *hdev); +u8 *hci_conn_key_enc_size(struct hci_conn *conn); struct link_key *hci_find_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr); struct link_key *hci_add_link_key(struct hci_dev *hdev, struct hci_conn *conn, bdaddr_t *bdaddr, u8 *val, u8 type, @@ -1836,6 +1829,7 @@ int hci_remove_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr, void hci_adv_instances_clear(struct hci_dev *hdev); struct adv_info *hci_find_adv_instance(struct hci_dev *hdev, u8 instance); +struct adv_info *hci_find_adv_sid(struct hci_dev *hdev, u8 sid); struct adv_info *hci_get_next_instance(struct hci_dev *hdev, u8 instance); struct adv_info *hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, u16 adv_data_len, u8 *adv_data, @@ -1843,7 +1837,7 @@ struct adv_info *hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u16 timeout, u16 duration, s8 tx_power, u32 min_interval, u32 max_interval, u8 mesh_handle); -struct adv_info *hci_add_per_instance(struct hci_dev *hdev, u8 instance, +struct adv_info *hci_add_per_instance(struct hci_dev *hdev, u8 instance, u8 sid, u32 flags, u8 data_len, u8 *data, u32 min_interval, u32 max_interval); int hci_set_adv_instance_data(struct hci_dev *hdev, u8 instance, @@ -1931,8 +1925,8 @@ void hci_conn_del_sysfs(struct hci_conn *conn); ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_2M)) #define le_coded_capable(dev) (((dev)->le_features[1] & HCI_LE_PHY_CODED) && \ - !test_bit(HCI_QUIRK_BROKEN_LE_CODED, \ - &(dev)->quirks)) + !hci_test_quirk((dev), \ + HCI_QUIRK_BROKEN_LE_CODED)) #define scan_coded(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_CODED) || \ ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_CODED)) @@ -1940,31 +1934,31 @@ void hci_conn_del_sysfs(struct hci_conn *conn); #define ll_privacy_capable(dev) ((dev)->le_features[0] & HCI_LE_LL_PRIVACY) #define privacy_mode_capable(dev) (ll_privacy_capable(dev) && \ - (hdev->commands[39] & 0x04)) + ((dev)->commands[39] & 0x04)) #define read_key_size_capable(dev) \ ((dev)->commands[20] & 0x10 && \ - !test_bit(HCI_QUIRK_BROKEN_READ_ENC_KEY_SIZE, &hdev->quirks)) + !hci_test_quirk((dev), HCI_QUIRK_BROKEN_READ_ENC_KEY_SIZE)) #define read_voice_setting_capable(dev) \ ((dev)->commands[9] & 0x04 && \ - !test_bit(HCI_QUIRK_BROKEN_READ_VOICE_SETTING, &(dev)->quirks)) + !hci_test_quirk((dev), HCI_QUIRK_BROKEN_READ_VOICE_SETTING)) /* Use enhanced synchronous connection if command is supported and its quirk * has not been set. */ #define enhanced_sync_conn_capable(dev) \ (((dev)->commands[29] & 0x08) && \ - !test_bit(HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN, &(dev)->quirks)) + !hci_test_quirk((dev), HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN)) /* Use ext scanning if set ext scan param and ext scan enable is supported */ #define use_ext_scan(dev) (((dev)->commands[37] & 0x20) && \ ((dev)->commands[37] & 0x40) && \ - !test_bit(HCI_QUIRK_BROKEN_EXT_SCAN, &(dev)->quirks)) + !hci_test_quirk((dev), HCI_QUIRK_BROKEN_EXT_SCAN)) /* Use ext create connection if command is supported */ #define use_ext_conn(dev) (((dev)->commands[37] & 0x80) && \ - !test_bit(HCI_QUIRK_BROKEN_EXT_CREATE_CONN, &(dev)->quirks)) + !hci_test_quirk((dev), HCI_QUIRK_BROKEN_EXT_CREATE_CONN)) /* Extended advertising support */ #define ext_adv_capable(dev) (((dev)->le_features[1] & HCI_LE_EXT_ADV)) @@ -1979,8 +1973,8 @@ void hci_conn_del_sysfs(struct hci_conn *conn); */ #define use_enhanced_conn_complete(dev) ((ll_privacy_capable(dev) || \ ext_adv_capable(dev)) && \ - !test_bit(HCI_QUIRK_BROKEN_EXT_CREATE_CONN, \ - &(dev)->quirks)) + !hci_test_quirk((dev), \ + HCI_QUIRK_BROKEN_EXT_CREATE_CONN)) /* Periodic advertising support */ #define per_adv_capable(dev) (((dev)->le_features[1] & HCI_LE_PERIODIC_ADV)) @@ -1997,7 +1991,7 @@ void hci_conn_del_sysfs(struct hci_conn *conn); #define sync_recv_capable(dev) ((dev)->le_features[3] & HCI_LE_ISO_SYNC_RECEIVER) #define mws_transport_config_capable(dev) (((dev)->commands[30] & 0x08) && \ - (!test_bit(HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG, &(dev)->quirks))) + (!hci_test_quirk((dev), HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG))) /* ----- HCI protocols ----- */ #define HCI_PROTO_DEFER 0x01 @@ -2013,7 +2007,9 @@ static inline int hci_proto_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, case ESCO_LINK: return sco_connect_ind(hdev, bdaddr, flags); - case ISO_LINK: + case CIS_LINK: + case BIS_LINK: + case PA_LINK: return iso_connect_ind(hdev, bdaddr, flags); default: @@ -2404,7 +2400,6 @@ void mgmt_advertising_added(struct sock *sk, struct hci_dev *hdev, u8 instance); void mgmt_advertising_removed(struct sock *sk, struct hci_dev *hdev, u8 instance); -void mgmt_adv_monitor_removed(struct hci_dev *hdev, u16 handle); int mgmt_phy_configuration_changed(struct hci_dev *hdev, struct sock *skip); void mgmt_adv_monitor_device_lost(struct hci_dev *hdev, u16 handle, bdaddr_t *bdaddr, u8 addr_type); diff --git a/include/net/bluetooth/hci_drv.h b/include/net/bluetooth/hci_drv.h new file mode 100644 index 000000000000..2f01c44f05ec --- /dev/null +++ b/include/net/bluetooth/hci_drv.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2025 Google Corporation + */ + +#ifndef __HCI_DRV_H +#define __HCI_DRV_H + +#include <linux/types.h> + +#include <net/bluetooth/bluetooth.h> +#include <net/bluetooth/hci.h> + +struct hci_drv_cmd_hdr { + __le16 opcode; + __le16 len; +} __packed; + +struct hci_drv_ev_hdr { + __le16 opcode; + __le16 len; +} __packed; + +#define HCI_DRV_EV_CMD_STATUS 0x0000 +struct hci_drv_ev_cmd_status { + __le16 opcode; + __u8 status; +} __packed; + +#define HCI_DRV_EV_CMD_COMPLETE 0x0001 +struct hci_drv_ev_cmd_complete { + __le16 opcode; + __u8 status; + __u8 data[]; +} __packed; + +#define HCI_DRV_STATUS_SUCCESS 0x00 +#define HCI_DRV_STATUS_UNSPECIFIED_ERROR 0x01 +#define HCI_DRV_STATUS_UNKNOWN_COMMAND 0x02 +#define HCI_DRV_STATUS_INVALID_PARAMETERS 0x03 + +#define HCI_DRV_MAX_DRIVER_NAME_LENGTH 32 + +/* Common commands that make sense on all drivers start from 0x0000 */ +#define HCI_DRV_OP_READ_INFO 0x0000 +#define HCI_DRV_READ_INFO_SIZE 0 +struct hci_drv_rp_read_info { + __u8 driver_name[HCI_DRV_MAX_DRIVER_NAME_LENGTH]; + __le16 num_supported_commands; + __le16 supported_commands[]; +} __packed; + +/* Driver specific OGF (Opcode Group Field) + * Commands in this group may have different meanings across different drivers. + */ +#define HCI_DRV_OGF_DRIVER_SPECIFIC 0x01 + +int hci_drv_cmd_status(struct hci_dev *hdev, u16 cmd, u8 status); +int hci_drv_cmd_complete(struct hci_dev *hdev, u16 cmd, u8 status, void *rp, + size_t rp_len); +int hci_drv_process_cmd(struct hci_dev *hdev, struct sk_buff *cmd_skb); + +struct hci_drv_handler { + int (*func)(struct hci_dev *hdev, void *data, u16 data_len); + size_t data_len; +}; + +struct hci_drv { + size_t common_handler_count; + const struct hci_drv_handler *common_handlers; + + size_t specific_handler_count; + const struct hci_drv_handler *specific_handlers; +}; + +#endif /* __HCI_DRV_H */ diff --git a/include/net/bluetooth/hci_mon.h b/include/net/bluetooth/hci_mon.h index 082f89531b88..bbd752494ef9 100644 --- a/include/net/bluetooth/hci_mon.h +++ b/include/net/bluetooth/hci_mon.h @@ -51,6 +51,8 @@ struct hci_mon_hdr { #define HCI_MON_CTRL_EVENT 17 #define HCI_MON_ISO_TX_PKT 18 #define HCI_MON_ISO_RX_PKT 19 +#define HCI_MON_DRV_TX_PKT 20 +#define HCI_MON_DRV_RX_PKT 21 struct hci_mon_new_index { __u8 type; diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h index 7e2cf0cca939..5224f57f6af2 100644 --- a/include/net/bluetooth/hci_sync.h +++ b/include/net/bluetooth/hci_sync.h @@ -115,8 +115,8 @@ int hci_enable_ext_advertising_sync(struct hci_dev *hdev, u8 instance); int hci_enable_advertising_sync(struct hci_dev *hdev); int hci_enable_advertising(struct hci_dev *hdev); -int hci_start_per_adv_sync(struct hci_dev *hdev, u8 instance, u8 data_len, - u8 *data, u32 flags, u16 min_interval, +int hci_start_per_adv_sync(struct hci_dev *hdev, u8 instance, u8 sid, + u8 data_len, u8 *data, u32 flags, u16 min_interval, u16 max_interval, u16 sync_interval); int hci_disable_per_advertising_sync(struct hci_dev *hdev, u8 instance); @@ -185,3 +185,6 @@ int hci_connect_le_sync(struct hci_dev *hdev, struct hci_conn *conn); int hci_cancel_connect_sync(struct hci_dev *hdev, struct hci_conn *conn); int hci_le_conn_update_sync(struct hci_dev *hdev, struct hci_conn *conn, struct hci_conn_params *params); + +int hci_connect_pa_sync(struct hci_dev *hdev, struct hci_conn *conn); +int hci_connect_big_sync(struct hci_dev *hdev, struct hci_conn *conn); diff --git a/include/net/bond_options.h b/include/net/bond_options.h index 18687ccf0638..022b122a9fb6 100644 --- a/include/net/bond_options.h +++ b/include/net/bond_options.h @@ -77,6 +77,7 @@ enum { BOND_OPT_NS_TARGETS, BOND_OPT_PRIO, BOND_OPT_COUPLED_CONTROL, + BOND_OPT_BROADCAST_NEIGH, BOND_OPT_LAST }; diff --git a/include/net/bonding.h b/include/net/bonding.h index 95f67b308c19..e06f0d63b2c1 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -115,6 +115,8 @@ static inline int is_netpoll_tx_blocked(struct net_device *dev) #define is_netpoll_tx_blocked(dev) (0) #endif +DECLARE_STATIC_KEY_FALSE(bond_bcast_neigh_enabled); + struct bond_params { int mode; int xmit_policy; @@ -149,6 +151,7 @@ struct bond_params { struct in6_addr ns_targets[BOND_MAX_NS_TARGETS]; #endif int coupled_control; + int broadcast_neighbor; /* 2 bytes of padding : see ether_addr_equal_64bits() */ u8 ad_actor_system[ETH_ALEN + 2]; diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index efbd79c67be2..406626ff6cc8 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -560,7 +560,7 @@ struct ieee80211_sta_s1g_cap { * @vht_cap: VHT capabilities in this band * @s1g_cap: S1G capabilities in this band * @edmg_cap: EDMG capabilities in this band - * @s1g_cap: S1G capabilities in this band (S1B band only, of course) + * @s1g_cap: S1G capabilities in this band (S1G band only, of course) * @n_iftype_data: number of iftype data entries * @iftype_data: interface type data entries. Note that the bits in * @types_mask inside this structure cannot overlap (i.e. only @@ -633,7 +633,7 @@ ieee80211_get_sband_iftype_data(const struct ieee80211_supported_band *sband, const struct ieee80211_sband_iftype_data *data; int i; - if (WARN_ON(iftype >= NL80211_IFTYPE_MAX)) + if (WARN_ON(iftype >= NUM_NL80211_IFTYPES)) return NULL; if (iftype == NL80211_IFTYPE_AP_VLAN) @@ -1097,43 +1097,6 @@ int cfg80211_chandef_primary(const struct cfg80211_chan_def *chandef, int nl80211_send_chandef(struct sk_buff *msg, const struct cfg80211_chan_def *chandef); /** - * ieee80211_chanwidth_rate_flags - return rate flags for channel width - * @width: the channel width of the channel - * - * In some channel types, not all rates may be used - for example CCK - * rates may not be used in 5/10 MHz channels. - * - * Returns: rate flags which apply for this channel width - */ -static inline enum ieee80211_rate_flags -ieee80211_chanwidth_rate_flags(enum nl80211_chan_width width) -{ - switch (width) { - case NL80211_CHAN_WIDTH_5: - return IEEE80211_RATE_SUPPORTS_5MHZ; - case NL80211_CHAN_WIDTH_10: - return IEEE80211_RATE_SUPPORTS_10MHZ; - default: - break; - } - return 0; -} - -/** - * ieee80211_chandef_rate_flags - returns rate flags for a channel - * @chandef: channel definition for the channel - * - * See ieee80211_chanwidth_rate_flags(). - * - * Returns: rate flags which apply for this channel - */ -static inline enum ieee80211_rate_flags -ieee80211_chandef_rate_flags(struct cfg80211_chan_def *chandef) -{ - return ieee80211_chanwidth_rate_flags(chandef->width); -} - -/** * ieee80211_chandef_max_power - maximum transmission power for the chandef * * In some regulations, the transmit power may depend on the configured channel @@ -1300,11 +1263,13 @@ struct cfg80211_crypto_settings { * struct cfg80211_mbssid_config - AP settings for multi bssid * * @tx_wdev: pointer to the transmitted interface in the MBSSID set + * @tx_link_id: link ID of the transmitted profile in an MLD. * @index: index of this AP in the multi bssid group. * @ema: set to true if the beacons should be sent out in EMA mode. */ struct cfg80211_mbssid_config { struct wireless_dev *tx_wdev; + u8 tx_link_id; u8 index; bool ema; }; @@ -1459,6 +1424,23 @@ struct cfg80211_unsol_bcast_probe_resp { }; /** + * struct cfg80211_s1g_short_beacon - S1G short beacon data. + * + * @update: Set to true if the feature configuration should be updated. + * @short_head: Short beacon head. + * @short_tail: Short beacon tail. + * @short_head_len: Short beacon head len. + * @short_tail_len: Short beacon tail len. + */ +struct cfg80211_s1g_short_beacon { + bool update; + const u8 *short_head; + const u8 *short_tail; + size_t short_head_len; + size_t short_tail_len; +}; + +/** * struct cfg80211_ap_settings - AP configuration * * Used to configure an AP interface. @@ -1498,6 +1480,8 @@ struct cfg80211_unsol_bcast_probe_resp { * @fils_discovery: FILS discovery transmission parameters * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters * @mbssid_config: AP settings for multiple bssid + * @s1g_long_beacon_period: S1G long beacon period + * @s1g_short_beacon: S1G short beacon data */ struct cfg80211_ap_settings { struct cfg80211_chan_def chandef; @@ -1531,6 +1515,8 @@ struct cfg80211_ap_settings { struct cfg80211_fils_discovery fils_discovery; struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp; struct cfg80211_mbssid_config mbssid_config; + u8 s1g_long_beacon_period; + struct cfg80211_s1g_short_beacon s1g_short_beacon; }; @@ -1542,11 +1528,13 @@ struct cfg80211_ap_settings { * @beacon: beacon data * @fils_discovery: FILS discovery transmission parameters * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters + * @s1g_short_beacon: S1G short beacon data */ struct cfg80211_ap_update { struct cfg80211_beacon_data beacon; struct cfg80211_fils_discovery fils_discovery; struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp; + struct cfg80211_s1g_short_beacon s1g_short_beacon; }; /** @@ -1561,6 +1549,7 @@ struct cfg80211_ap_update { * @n_counter_offsets_beacon: number of csa counters the beacon (tail) * @n_counter_offsets_presp: number of csa counters in the probe response * @beacon_after: beacon data to be used on the new channel + * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters * @radar_required: whether radar detection is required on the new channel * @block_tx: whether transmissions should be blocked while changing * @count: number of beacons until switch @@ -1575,6 +1564,7 @@ struct cfg80211_csa_settings { unsigned int n_counter_offsets_beacon; unsigned int n_counter_offsets_presp; struct cfg80211_beacon_data beacon_after; + struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp; bool radar_required; bool block_tx; u8 count; @@ -1590,6 +1580,7 @@ struct cfg80211_csa_settings { * @counter_offset_beacon: offsets of the counters within the beacon (tail) * @counter_offset_presp: offsets of the counters within the probe response * @beacon_next: beacon data to be used after the color change + * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters * @count: number of beacons until the color change * @color: the color used after the change * @link_id: defines the link on which color change is expected during MLO. @@ -1600,6 +1591,7 @@ struct cfg80211_color_change_settings { u16 counter_offset_beacon; u16 counter_offset_presp; struct cfg80211_beacon_data beacon_next; + struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp; u8 count; u8 color; u8 link_id; @@ -1688,6 +1680,7 @@ struct sta_txpwr { * @he_6ghz_capa: HE 6 GHz Band capabilities of station * @eht_capa: EHT capabilities of station * @eht_capa_len: the length of the EHT capabilities + * @s1g_capa: S1G capabilities of station */ struct link_station_parameters { const u8 *mld_mac; @@ -1706,6 +1699,7 @@ struct link_station_parameters { const struct ieee80211_he_6ghz_capa *he_6ghz_capa; const struct ieee80211_eht_cap_elem *eht_capa; u8 eht_capa_len; + const struct ieee80211_s1g_cap *s1g_capa; }; /** @@ -1770,6 +1764,9 @@ struct cfg80211_ttlm_params { * @supported_oper_classes_len: number of supported operating classes * @support_p2p_ps: information if station supports P2P PS mechanism * @airtime_weight: airtime scheduler weight for this station + * @eml_cap_present: Specifies if EML capabilities field (@eml_cap) is + * present/updated + * @eml_cap: EML capabilities of this station * @link_sta_params: link related params. */ struct station_parameters { @@ -1794,6 +1791,8 @@ struct station_parameters { u8 supported_oper_classes_len; int support_p2p_ps; u16 airtime_weight; + bool eml_cap_present; + u16 eml_cap; struct link_station_parameters link_sta_params; }; @@ -2048,6 +2047,99 @@ struct cfg80211_tid_stats { #define IEEE80211_MAX_CHAINS 4 /** + * struct link_station_info - link station information + * + * Link station information filled by driver for get_station() and + * dump_station(). + * @filled: bit flag of flags using the bits of &enum nl80211_sta_info to + * indicate the relevant values in this struct for them + * @connected_time: time(in secs) since a link of station is last connected + * @inactive_time: time since last activity for link station(tx/rx) + * in milliseconds + * @assoc_at: bootime (ns) of the last association of link of station + * @rx_bytes: bytes (size of MPDUs) received from this link of station + * @tx_bytes: bytes (size of MPDUs) transmitted to this link of station + * @signal: The signal strength, type depends on the wiphy's signal_type. + * For CFG80211_SIGNAL_TYPE_MBM, value is expressed in _dBm_. + * @signal_avg: Average signal strength, type depends on the wiphy's + * signal_type. For CFG80211_SIGNAL_TYPE_MBM, value is expressed in _dBm_ + * @chains: bitmask for filled values in @chain_signal, @chain_signal_avg + * @chain_signal: per-chain signal strength of last received packet in dBm + * @chain_signal_avg: per-chain signal strength average in dBm + * @txrate: current unicast bitrate from this link of station + * @rxrate: current unicast bitrate to this link of station + * @rx_packets: packets (MSDUs & MMPDUs) received from this link of station + * @tx_packets: packets (MSDUs & MMPDUs) transmitted to this link of station + * @tx_retries: cumulative retry counts (MPDUs) for this link of station + * @tx_failed: number of failed transmissions (MPDUs) (retries exceeded, no ACK) + * @rx_dropped_misc: Dropped for un-specified reason. + * @bss_param: current BSS parameters + * @beacon_loss_count: Number of times beacon loss event has triggered. + * @expected_throughput: expected throughput in kbps (including 802.11 headers) + * towards this station. + * @rx_beacon: number of beacons received from this peer + * @rx_beacon_signal_avg: signal strength average (in dBm) for beacons received + * from this peer + * @rx_duration: aggregate PPDU duration(usecs) for all the frames from a peer + * @tx_duration: aggregate PPDU duration(usecs) for all the frames to a peer + * @airtime_weight: current airtime scheduling weight + * @pertid: per-TID statistics, see &struct cfg80211_tid_stats, using the last + * (IEEE80211_NUM_TIDS) index for MSDUs not encapsulated in QoS-MPDUs. + * Note that this doesn't use the @filled bit, but is used if non-NULL. + * @ack_signal: signal strength (in dBm) of the last ACK frame. + * @avg_ack_signal: average rssi value of ack packet for the no of msdu's has + * been sent. + * @rx_mpdu_count: number of MPDUs received from this station + * @fcs_err_count: number of packets (MPDUs) received from this station with + * an FCS error. This counter should be incremented only when TA of the + * received packet with an FCS error matches the peer MAC address. + * @addr: For MLO STA connection, filled with address of the link of station. + */ +struct link_station_info { + u64 filled; + u32 connected_time; + u32 inactive_time; + u64 assoc_at; + u64 rx_bytes; + u64 tx_bytes; + s8 signal; + s8 signal_avg; + + u8 chains; + s8 chain_signal[IEEE80211_MAX_CHAINS]; + s8 chain_signal_avg[IEEE80211_MAX_CHAINS]; + + struct rate_info txrate; + struct rate_info rxrate; + u32 rx_packets; + u32 tx_packets; + u32 tx_retries; + u32 tx_failed; + u32 rx_dropped_misc; + struct sta_bss_parameters bss_param; + + u32 beacon_loss_count; + + u32 expected_throughput; + + u64 tx_duration; + u64 rx_duration; + u64 rx_beacon; + u8 rx_beacon_signal_avg; + + u16 airtime_weight; + + s8 ack_signal; + s8 avg_ack_signal; + struct cfg80211_tid_stats *pertid; + + u32 rx_mpdu_count; + u32 fcs_err_count; + + u8 addr[ETH_ALEN] __aligned(2); +}; + +/** * struct station_info - station information * * Station information filled by driver for get_station() and dump_station. @@ -2131,6 +2223,11 @@ struct cfg80211_tid_stats { * dump_station() callbacks. User space needs this information to determine * the accepted and rejected affiliated links of the connected station. * @assoc_resp_ies_len: Length of @assoc_resp_ies buffer in octets. + * @valid_links: bitmap of valid links, or 0 for non-MLO. Drivers fill this + * information in cfg80211_new_sta(), cfg80211_del_sta_sinfo(), + * get_station() and dump_station() callbacks. + * @links: reference to Link sta entries for MLO STA, all link specific + * information is accessed through links[link_id]. */ struct station_info { u64 filled; @@ -2195,6 +2292,9 @@ struct station_info { u8 mld_addr[ETH_ALEN] __aligned(2); const u8 *assoc_resp_ies; size_t assoc_resp_ies_len; + + u16 valid_links; + struct link_station_info *links[IEEE80211_MLD_MAX_NUM_LINKS]; }; /** @@ -2675,15 +2775,16 @@ struct cfg80211_scan_6ghz_params { * @wiphy: the wiphy this was for * @scan_start: time (in jiffies) when the scan started * @wdev: the wireless device to scan for - * @info: (internal) information about completed scan - * @notified: (internal) scan request was notified as done or aborted * @no_cck: used to send probe requests at non CCK rate in 2GHz band * @mac_addr: MAC address used with randomisation * @mac_addr_mask: MAC address mask used with randomisation, bits that * are 0 in the mask should be randomised, bits that are 1 should * be taken from the @mac_addr * @scan_6ghz: relevant for split scan request only, - * true if this is the second scan request + * true if this is a 6 GHz scan request + * @first_part: %true if this is the first part of a split scan request or a + * scan that was not split. May be %true for a @scan_6ghz scan if no other + * channels were requested * @n_6ghz_params: number of 6 GHz params * @scan_6ghz_params: 6 GHz params * @bssid: BSSID to scan for (most commonly, the wildcard BSSID) @@ -2707,20 +2808,17 @@ struct cfg80211_scan_request { u8 mac_addr[ETH_ALEN] __aligned(2); u8 mac_addr_mask[ETH_ALEN] __aligned(2); u8 bssid[ETH_ALEN] __aligned(2); - - /* internal */ struct wiphy *wiphy; unsigned long scan_start; - struct cfg80211_scan_info info; - bool notified; bool no_cck; bool scan_6ghz; + bool first_part; u32 n_6ghz_params; struct cfg80211_scan_6ghz_params *scan_6ghz_params; s8 tsf_report_link_id; /* keep last */ - struct ieee80211_channel *channels[] __counted_by(n_channels); + struct ieee80211_channel *channels[]; }; static inline void get_random_mask_addr(u8 *buf, const u8 *addr, const u8 *mask) @@ -4782,12 +4880,14 @@ struct cfg80211_ops { int (*set_mcast_rate)(struct wiphy *wiphy, struct net_device *dev, int rate[NUM_NL80211_BANDS]); - int (*set_wiphy_params)(struct wiphy *wiphy, u32 changed); + int (*set_wiphy_params)(struct wiphy *wiphy, int radio_idx, + u32 changed); int (*set_tx_power)(struct wiphy *wiphy, struct wireless_dev *wdev, + int radio_idx, enum nl80211_tx_power_setting type, int mbm); int (*get_tx_power)(struct wiphy *wiphy, struct wireless_dev *wdev, - unsigned int link_id, int *dbm); + int radio_idx, unsigned int link_id, int *dbm); void (*rfkill_poll)(struct wiphy *wiphy); @@ -4849,8 +4949,10 @@ struct cfg80211_ops { struct wireless_dev *wdev, struct mgmt_frame_regs *upd); - int (*set_antenna)(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant); - int (*get_antenna)(struct wiphy *wiphy, u32 *tx_ant, u32 *rx_ant); + int (*set_antenna)(struct wiphy *wiphy, int radio_idx, + u32 tx_ant, u32 rx_ant); + int (*get_antenna)(struct wiphy *wiphy, int radio_idx, + u32 *tx_ant, u32 *rx_ant); int (*sched_scan_start)(struct wiphy *wiphy, struct net_device *dev, @@ -5473,6 +5575,18 @@ struct wiphy_iftype_akm_suites { }; /** + * struct wiphy_radio_cfg - physical radio config of a wiphy + * This structure describes the configurations of a physical radio in a + * wiphy. It is used to denote per-radio attributes belonging to a wiphy. + * + * @rts_threshold: RTS threshold (dot11RTSThreshold); + * -1 (default) = RTS/CTS disabled + */ +struct wiphy_radio_cfg { + u32 rts_threshold; +}; + +/** * struct wiphy_radio_freq_range - wiphy frequency range * @start_freq: start range edge frequency (kHz) * @end_freq: end range edge frequency (kHz) @@ -5727,6 +5841,10 @@ struct wiphy_radio { * supports enabling HW timestamping for all peers (i.e. no need to * specify a mac address). * + * @radio_cfg: configuration of radios belonging to a muli-radio wiphy. This + * struct contains a list of all radio specific attributes and should be + * used only for multi-radio wiphy. + * * @radio: radios belonging to this wiphy * @n_radio: number of radios */ @@ -5816,6 +5934,8 @@ struct wiphy { void (*reg_notifier)(struct wiphy *wiphy, struct regulatory_request *request); + struct wiphy_radio_cfg *radio_cfg; + /* fields below are read-only, assigned by cfg80211 */ const struct ieee80211_regdomain __rcu *regd; @@ -8496,6 +8616,17 @@ void cfg80211_tx_mgmt_expired(struct wireless_dev *wdev, u64 cookie, int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp); /** + * cfg80211_link_sinfo_alloc_tid_stats - allocate per-tid statistics. + * + * @link_sinfo: the link station information + * @gfp: allocation flags + * + * Return: 0 on success. Non-zero on error. + */ +int cfg80211_link_sinfo_alloc_tid_stats(struct link_station_info *link_sinfo, + gfp_t gfp); + +/** * cfg80211_sinfo_release_content - release contents of station info * @sinfo: the station information * @@ -8506,6 +8637,13 @@ int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp); static inline void cfg80211_sinfo_release_content(struct station_info *sinfo) { kfree(sinfo->pertid); + + for (int link_id = 0; link_id < ARRAY_SIZE(sinfo->links); link_id++) { + if (sinfo->links[link_id]) { + kfree(sinfo->links[link_id]->pertid); + kfree(sinfo->links[link_id]); + } + } } /** @@ -8910,6 +9048,7 @@ void cfg80211_pmksa_candidate_notify(struct net_device *dev, int index, /** * cfg80211_rx_spurious_frame - inform userspace about a spurious frame * @dev: The device the frame matched to + * @link_id: the link the frame was received on, -1 if not applicable or unknown * @addr: the transmitter address * @gfp: context flags * @@ -8919,13 +9058,14 @@ void cfg80211_pmksa_candidate_notify(struct net_device *dev, int index, * Return: %true if the frame was passed to userspace (or this failed * for a reason other than not having a subscription.) */ -bool cfg80211_rx_spurious_frame(struct net_device *dev, - const u8 *addr, gfp_t gfp); +bool cfg80211_rx_spurious_frame(struct net_device *dev, const u8 *addr, + int link_id, gfp_t gfp); /** * cfg80211_rx_unexpected_4addr_frame - inform about unexpected WDS frame * @dev: The device the frame matched to * @addr: the transmitter address + * @link_id: the link the frame was received on, -1 if not applicable or unknown * @gfp: context flags * * This function is used in AP mode (only!) to inform userspace that @@ -8935,8 +9075,8 @@ bool cfg80211_rx_spurious_frame(struct net_device *dev, * Return: %true if the frame was passed to userspace (or this failed * for a reason other than not having a subscription.) */ -bool cfg80211_rx_unexpected_4addr_frame(struct net_device *dev, - const u8 *addr, gfp_t gfp); +bool cfg80211_rx_unexpected_4addr_frame(struct net_device *dev, const u8 *addr, + int link_id, gfp_t gfp); /** * cfg80211_probe_status - notify userspace about probe status @@ -9402,6 +9542,17 @@ int cfg80211_iter_combinations(struct wiphy *wiphy, void (*iter)(const struct ieee80211_iface_combination *c, void *data), void *data); +/** + * cfg80211_get_radio_idx_by_chan - get the radio index by the channel + * + * @wiphy: the wiphy + * @chan: channel for which the supported radio index is required + * + * Return: radio index on success or a negative error code + */ +int cfg80211_get_radio_idx_by_chan(struct wiphy *wiphy, + const struct ieee80211_channel *chan); + /** * cfg80211_stop_iface - trigger interface disconnection @@ -9766,6 +9917,11 @@ void cfg80211_links_removed(struct net_device *dev, u16 link_mask); * struct cfg80211_mlo_reconf_done_data - MLO reconfiguration data * @buf: MLO Reconfiguration Response frame (header + body) * @len: length of the frame data + * @driver_initiated: Indicates whether the add links request is initiated by + * driver. This is set to true when the link reconfiguration request + * initiated by driver due to AP link recommendation requests + * (Ex: BTM (BSS Transition Management) request) handling offloaded to + * driver. * @added_links: BIT mask of links successfully added to the association * @links: per-link information indexed by link ID * @links.bss: the BSS that MLO reconfiguration was requested for, ownership of @@ -9778,6 +9934,7 @@ void cfg80211_links_removed(struct net_device *dev, u16 link_mask); struct cfg80211_mlo_reconf_done_data { const u8 *buf; size_t len; + bool driver_initiated; u16 added_links; struct { struct cfg80211_bss *bss; diff --git a/include/net/checksum.h b/include/net/checksum.h index 243f972267b8..3cbab35de5ab 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -99,12 +99,6 @@ csum_block_add(__wsum csum, __wsum csum2, int offset) } static __always_inline __wsum -csum_block_add_ext(__wsum csum, __wsum csum2, int offset, int len) -{ - return csum_block_add(csum, csum2, offset); -} - -static __always_inline __wsum csum_block_sub(__wsum csum, __wsum csum2, int offset) { return csum_block_add(csum, ~csum2, offset); @@ -115,12 +109,6 @@ static __always_inline __wsum csum_unfold(__sum16 n) return (__force __wsum)n; } -static __always_inline -__wsum csum_partial_ext(const void *buff, int len, __wsum sum) -{ - return csum_partial(buff, len, sum); -} - #define CSUM_MANGLED_0 ((__force __sum16)0xffff) static __always_inline void csum_replace_by_diff(__sum16 *sum, __wsum diff) @@ -164,7 +152,7 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, const __be32 *from, const __be32 *to, bool pseudohdr); void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb, - __wsum diff, bool pseudohdr); + __wsum diff, bool pseudohdr, bool ipv6); static __always_inline void inet_proto_csum_replace2(__sum16 *sum, struct sk_buff *skb, diff --git a/include/net/devlink.h b/include/net/devlink.h index b8783126c1ed..93640a29427c 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -118,6 +118,8 @@ struct devlink_rate { u32 tx_priority; u32 tx_weight; + + u32 tc_bw[DEVLINK_RATE_TCS_MAX]; }; struct devlink_port { @@ -420,17 +422,19 @@ typedef u64 devlink_resource_occ_get_t(void *priv); #define __DEVLINK_PARAM_MAX_STRING_VALUE 32 enum devlink_param_type { - DEVLINK_PARAM_TYPE_U8, - DEVLINK_PARAM_TYPE_U16, - DEVLINK_PARAM_TYPE_U32, - DEVLINK_PARAM_TYPE_STRING, - DEVLINK_PARAM_TYPE_BOOL, + DEVLINK_PARAM_TYPE_U8 = DEVLINK_VAR_ATTR_TYPE_U8, + DEVLINK_PARAM_TYPE_U16 = DEVLINK_VAR_ATTR_TYPE_U16, + DEVLINK_PARAM_TYPE_U32 = DEVLINK_VAR_ATTR_TYPE_U32, + DEVLINK_PARAM_TYPE_U64 = DEVLINK_VAR_ATTR_TYPE_U64, + DEVLINK_PARAM_TYPE_STRING = DEVLINK_VAR_ATTR_TYPE_STRING, + DEVLINK_PARAM_TYPE_BOOL = DEVLINK_VAR_ATTR_TYPE_FLAG, }; union devlink_param_value { u8 vu8; u16 vu16; u32 vu32; + u64 vu64; char vstr[__DEVLINK_PARAM_MAX_STRING_VALUE]; bool vbool; }; @@ -520,6 +524,8 @@ enum devlink_param_generic_id { DEVLINK_PARAM_GENERIC_ID_ENABLE_IWARP, DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE, DEVLINK_PARAM_GENERIC_ID_EVENT_EQ_SIZE, + DEVLINK_PARAM_GENERIC_ID_ENABLE_PHC, + DEVLINK_PARAM_GENERIC_ID_CLOCK_ID, /* add new param generic ids above here*/ __DEVLINK_PARAM_GENERIC_ID_MAX, @@ -578,6 +584,12 @@ enum devlink_param_generic_id { #define DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_NAME "event_eq_size" #define DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_TYPE DEVLINK_PARAM_TYPE_U32 +#define DEVLINK_PARAM_GENERIC_ENABLE_PHC_NAME "enable_phc" +#define DEVLINK_PARAM_GENERIC_ENABLE_PHC_TYPE DEVLINK_PARAM_TYPE_BOOL + +#define DEVLINK_PARAM_GENERIC_CLOCK_ID_NAME "clock_id" +#define DEVLINK_PARAM_GENERIC_CLOCK_ID_TYPE DEVLINK_PARAM_TYPE_U64 + #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \ { \ .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ @@ -1482,6 +1494,9 @@ struct devlink_ops { u32 tx_priority, struct netlink_ext_ack *extack); int (*rate_leaf_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv, u32 tx_weight, struct netlink_ext_ack *extack); + int (*rate_leaf_tc_bw_set)(struct devlink_rate *devlink_rate, + void *priv, u32 *tc_bw, + struct netlink_ext_ack *extack); int (*rate_node_tx_share_set)(struct devlink_rate *devlink_rate, void *priv, u64 tx_share, struct netlink_ext_ack *extack); int (*rate_node_tx_max_set)(struct devlink_rate *devlink_rate, void *priv, @@ -1490,6 +1505,9 @@ struct devlink_ops { u32 tx_priority, struct netlink_ext_ack *extack); int (*rate_node_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv, u32 tx_weight, struct netlink_ext_ack *extack); + int (*rate_node_tc_bw_set)(struct devlink_rate *devlink_rate, + void *priv, u32 *tc_bw, + struct netlink_ext_ack *extack); int (*rate_node_new)(struct devlink_rate *rate_node, void **priv, struct netlink_ext_ack *extack); int (*rate_node_del)(struct devlink_rate *rate_node, void *priv, diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index e4fdc6b54cef..d8ff24a33459 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -40,10 +40,12 @@ FN(TCP_OFOMERGE) \ FN(TCP_RFC7323_PAWS) \ FN(TCP_RFC7323_PAWS_ACK) \ + FN(TCP_RFC7323_TW_PAWS) \ FN(TCP_RFC7323_TSECR) \ FN(TCP_LISTEN_OVERFLOW) \ FN(TCP_OLD_SEQUENCE) \ FN(TCP_INVALID_SEQUENCE) \ + FN(TCP_INVALID_END_SEQUENCE) \ FN(TCP_INVALID_ACK_SEQUENCE) \ FN(TCP_RESET) \ FN(TCP_INVALID_SYN) \ @@ -61,6 +63,7 @@ FN(NEIGH_FAILED) \ FN(NEIGH_QUEUEFULL) \ FN(NEIGH_DEAD) \ + FN(NEIGH_HH_FILLFAIL) \ FN(TC_EGRESS) \ FN(SECURITY_HOOK) \ FN(QDISC_DROP) \ @@ -119,6 +122,11 @@ FN(ARP_PVLAN_DISABLE) \ FN(MAC_IEEE_MAC_CONTROL) \ FN(BRIDGE_INGRESS_STP_STATE) \ + FN(CAN_RX_INVALID_FRAME) \ + FN(CANFD_RX_INVALID_FRAME) \ + FN(CANXL_RX_INVALID_FRAME) \ + FN(PFMEMALLOC) \ + FN(DUALPI2_STEP_DROP) \ FNe(MAX) /** @@ -284,6 +292,12 @@ enum skb_drop_reason { */ SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK, /** + * @SKB_DROP_REASON_TCP_RFC7323_TW_PAWS: PAWS check, socket is in + * TIME_WAIT state. + * Corresponds to LINUX_MIB_PAWS_TW_REJECTED. + */ + SKB_DROP_REASON_TCP_RFC7323_TW_PAWS, + /** * @SKB_DROP_REASON_TCP_RFC7323_TSECR: PAWS check, invalid TSEcr. * Corresponds to LINUX_MIB_TSECRREJECTED. */ @@ -292,9 +306,15 @@ enum skb_drop_reason { SKB_DROP_REASON_TCP_LISTEN_OVERFLOW, /** @SKB_DROP_REASON_TCP_OLD_SEQUENCE: Old SEQ field (duplicate packet) */ SKB_DROP_REASON_TCP_OLD_SEQUENCE, - /** @SKB_DROP_REASON_TCP_INVALID_SEQUENCE: Not acceptable SEQ field */ + /** @SKB_DROP_REASON_TCP_INVALID_SEQUENCE: Not acceptable SEQ field. */ SKB_DROP_REASON_TCP_INVALID_SEQUENCE, /** + * @SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE: + * Not acceptable END_SEQ field. + * Corresponds to LINUX_MIB_BEYOND_WINDOW. + */ + SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE, + /** * @SKB_DROP_REASON_TCP_INVALID_ACK_SEQUENCE: Not acceptable ACK SEQ * field because ack sequence is not in the window between snd_una * and snd_nxt @@ -341,6 +361,8 @@ enum skb_drop_reason { SKB_DROP_REASON_NEIGH_QUEUEFULL, /** @SKB_DROP_REASON_NEIGH_DEAD: neigh entry is dead */ SKB_DROP_REASON_NEIGH_DEAD, + /** @SKB_DROP_REASON_NEIGH_HH_FILLFAIL: failed to fill the device hard header */ + SKB_DROP_REASON_NEIGH_HH_FILLFAIL, /** @SKB_DROP_REASON_TC_EGRESS: dropped in TC egress HOOK */ SKB_DROP_REASON_TC_EGRESS, /** @SKB_DROP_REASON_SECURITY_HOOK: dropped due to security HOOK */ @@ -564,6 +586,31 @@ enum skb_drop_reason { */ SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE, /** + * @SKB_DROP_REASON_CAN_RX_INVALID_FRAME: received + * non conform CAN frame (or device is unable to receive CAN frames) + */ + SKB_DROP_REASON_CAN_RX_INVALID_FRAME, + /** + * @SKB_DROP_REASON_CANFD_RX_INVALID_FRAME: received + * non conform CAN-FD frame (or device is unable to receive CAN frames) + */ + SKB_DROP_REASON_CANFD_RX_INVALID_FRAME, + /** + * @SKB_DROP_REASON_CANXL_RX_INVALID_FRAME: received + * non conform CAN-XL frame (or device is unable to receive CAN frames) + */ + SKB_DROP_REASON_CANXL_RX_INVALID_FRAME, + /** + * @SKB_DROP_REASON_PFMEMALLOC: packet allocated from memory reserve + * reached a path or socket not eligible for use of memory reserves + */ + SKB_DROP_REASON_PFMEMALLOC, + /** + * @SKB_DROP_REASON_DUALPI2_STEP_DROP: dropped by the step drop + * threshold of DualPI2 qdisc. + */ + SKB_DROP_REASON_DUALPI2_STEP_DROP, + /** * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which * shouldn't be used as a real 'reason' - only for tracing code gen */ diff --git a/include/net/dsa.h b/include/net/dsa.h index a0a9481c52c2..d73ea0880066 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -54,11 +54,13 @@ struct tc_action; #define DSA_TAG_PROTO_RZN1_A5PSW_VALUE 26 #define DSA_TAG_PROTO_LAN937X_VALUE 27 #define DSA_TAG_PROTO_VSC73XX_8021Q_VALUE 28 +#define DSA_TAG_PROTO_BRCM_LEGACY_FCS_VALUE 29 enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE, DSA_TAG_PROTO_BRCM = DSA_TAG_PROTO_BRCM_VALUE, DSA_TAG_PROTO_BRCM_LEGACY = DSA_TAG_PROTO_BRCM_LEGACY_VALUE, + DSA_TAG_PROTO_BRCM_LEGACY_FCS = DSA_TAG_PROTO_BRCM_LEGACY_FCS_VALUE, DSA_TAG_PROTO_BRCM_PREPEND = DSA_TAG_PROTO_BRCM_PREPEND_VALUE, DSA_TAG_PROTO_DSA = DSA_TAG_PROTO_DSA_VALUE, DSA_TAG_PROTO_EDSA = DSA_TAG_PROTO_EDSA_VALUE, @@ -1131,9 +1133,10 @@ struct dsa_switch_ops { * PTP functionality */ int (*port_hwtstamp_get)(struct dsa_switch *ds, int port, - struct ifreq *ifr); + struct kernel_hwtstamp_config *config); int (*port_hwtstamp_set)(struct dsa_switch *ds, int port, - struct ifreq *ifr); + struct kernel_hwtstamp_config *config, + struct netlink_ext_ack *extack); void (*port_txtstamp)(struct dsa_switch *ds, int port, struct sk_buff *skb); bool (*port_rxtstamp)(struct dsa_switch *ds, int port, diff --git a/include/net/dst.h b/include/net/dst.h index 78c78cdce0e9..00467c1b5093 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -240,9 +240,9 @@ static inline void dst_hold(struct dst_entry *dst) static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) { - if (unlikely(time != dst->lastuse)) { + if (unlikely(time != READ_ONCE(dst->lastuse))) { dst->__use++; - dst->lastuse = time; + WRITE_ONCE(dst->lastuse, time); } } @@ -431,13 +431,15 @@ static inline void dst_link_failure(struct sk_buff *skb) static inline void dst_set_expires(struct dst_entry *dst, int timeout) { - unsigned long expires = jiffies + timeout; + unsigned long old, expires = jiffies + timeout; if (expires == 0) expires = 1; - if (dst->expires == 0 || time_before(expires, dst->expires)) - dst->expires = expires; + old = READ_ONCE(dst->expires); + + if (!old || time_before(expires, old)) + WRITE_ONCE(dst->expires, expires); } static inline unsigned int dst_dev_overhead(struct dst_entry *dst, @@ -456,7 +458,7 @@ INDIRECT_CALLABLE_DECLARE(int ip_output(struct net *, struct sock *, /* Output packet to network from transport. */ static inline int dst_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - return INDIRECT_CALL_INET(skb_dst(skb)->output, + return INDIRECT_CALL_INET(READ_ONCE(skb_dst(skb)->output), ip6_output, ip_output, net, sk, skb); } @@ -466,7 +468,7 @@ INDIRECT_CALLABLE_DECLARE(int ip_local_deliver(struct sk_buff *)); /* Input packet from network to transport. */ static inline int dst_input(struct sk_buff *skb) { - return INDIRECT_CALL_INET(skb_dst(skb)->input, + return INDIRECT_CALL_INET(READ_ONCE(skb_dst(skb)->input), ip6_input, ip_local_deliver, skb); } @@ -476,7 +478,7 @@ INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, u32)); static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie) { - if (dst->obsolete) + if (READ_ONCE(dst->obsolete)) dst = INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, dst, cookie); return dst; @@ -561,6 +563,26 @@ static inline void skb_dst_update_pmtu_no_confirm(struct sk_buff *skb, u32 mtu) dst->ops->update_pmtu(dst, NULL, skb, mtu, false); } +static inline struct net_device *dst_dev(const struct dst_entry *dst) +{ + return READ_ONCE(dst->dev); +} + +static inline struct net_device *skb_dst_dev(const struct sk_buff *skb) +{ + return dst_dev(skb_dst(skb)); +} + +static inline struct net *skb_dst_dev_net(const struct sk_buff *skb) +{ + return dev_net(skb_dst_dev(skb)); +} + +static inline struct net *skb_dst_dev_net_rcu(const struct sk_buff *skb) +{ + return dev_net_rcu(skb_dst_dev(skb)); +} + struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie); void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh); diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 5927910ec06e..6e68e359ad18 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -45,6 +45,8 @@ struct fib_rule { struct fib_rule_port_range dport_range; u16 sport_mask; u16 dport_mask; + u8 iif_is_l3_master; + u8 oif_is_l3_master; struct rcu_head rcu; }; diff --git a/include/net/flow.h b/include/net/flow.h index 335bbc52171c..a1839c278d87 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -38,6 +38,8 @@ struct flowi_common { __u8 flowic_flags; #define FLOWI_FLAG_ANYSRC 0x01 #define FLOWI_FLAG_KNOWN_NH 0x02 +#define FLOWI_FLAG_L3MDEV_OIF 0x04 +#define FLOWI_FLAG_ANY_SPORT 0x08 __u32 flowic_secid; kuid_t flowic_uid; __u32 flowic_multipath_hash; diff --git a/include/net/gro.h b/include/net/gro.h index 22d3a69e4404..a0fca7ac6e7e 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -534,6 +534,12 @@ static inline void gro_normal_list(struct gro_node *gro) gro->rx_count = 0; } +static inline void gro_flush_normal(struct gro_node *gro, bool flush_old) +{ + gro_flush(gro, flush_old); + gro_normal_list(gro); +} + /* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, * pass the whole batch up to the stack. */ diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index c32878c69179..ab3929a2a956 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -150,7 +150,7 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, int iif, int sdif, bool *refcounted) { - struct net *net = dev_net_rcu(skb_dst(skb)->dev); + struct net *net = skb_dst_dev_net_rcu(skb); const struct ipv6hdr *ip6h = ipv6_hdr(skb); struct sock *sk; diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 949641e92539..19dbd9081d5a 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -175,14 +175,9 @@ struct inet_hashinfo { bool pernet; } ____cacheline_aligned_in_smp; -static inline struct inet_hashinfo *tcp_or_dccp_get_hashinfo(const struct sock *sk) +static inline struct inet_hashinfo *tcp_get_hashinfo(const struct sock *sk) { -#if IS_ENABLED(CONFIG_IP_DCCP) - return sk->sk_prot->h.hashinfo ? : - sock_net(sk)->ipv4.tcp_death_row.hashinfo; -#else return sock_net(sk)->ipv4.tcp_death_row.hashinfo; -#endif } static inline struct inet_listen_hashbucket * @@ -207,12 +202,6 @@ static inline spinlock_t *inet_ehash_lockp( int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo); -static inline void inet_hashinfo2_free_mod(struct inet_hashinfo *h) -{ - kfree(h->lhash2); - h->lhash2 = NULL; -} - static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo) { kvfree(hashinfo->ehash_locks); @@ -492,7 +481,7 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, const int sdif, bool *refcounted) { - struct net *net = dev_net_rcu(skb_dst(skb)->dev); + struct net *net = skb_dst_dev_net_rcu(skb); const struct iphdr *iph = ip_hdr(skb); struct sock *sk; diff --git a/include/net/ip.h b/include/net/ip.h index 8a48ade24620..befcba575129 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -59,6 +59,7 @@ struct inet_skb_parm { #define IPSKB_L3SLAVE BIT(7) #define IPSKB_NOPOLICY BIT(8) #define IPSKB_MULTIPATH BIT(9) +#define IPSKB_MCROUTE BIT(10) u16 frag_max_size; }; @@ -167,6 +168,7 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt, int ip_local_deliver(struct sk_buff *skb); void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int proto); int ip_mr_input(struct sk_buff *skb); +int ip_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb); int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb); int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb); int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, @@ -470,12 +472,12 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, rcu_read_lock(); - net = dev_net_rcu(dst->dev); + net = dev_net_rcu(dst_dev(dst)); if (READ_ONCE(net->ipv4.sysctl_ip_fwd_use_pmtu) || ip_mtu_locked(dst) || !forwarding) { mtu = rt->rt_pmtu; - if (mtu && time_before(jiffies, rt->dst.expires)) + if (mtu && time_before(jiffies, READ_ONCE(rt->dst.expires))) goto out; } @@ -484,7 +486,7 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, if (mtu) goto out; - mtu = READ_ONCE(dst->dev->mtu); + mtu = READ_ONCE(dst_dev(dst)->mtu); if (unlikely(ip_mtu_locked(dst))) { if (rt->rt_uses_gateway && mtu > 576) @@ -504,16 +506,17 @@ out: static inline unsigned int ip_skb_dst_mtu(struct sock *sk, const struct sk_buff *skb) { + const struct dst_entry *dst = skb_dst(skb); unsigned int mtu; if (!sk || !sk_fullsock(sk) || ip_sk_use_pmtu(sk)) { bool forwarding = IPCB(skb)->flags & IPSKB_FORWARDED; - return ip_dst_mtu_maybe_forward(skb_dst(skb), forwarding); + return ip_dst_mtu_maybe_forward(dst, forwarding); } - mtu = min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU); - return mtu - lwtunnel_headroom(skb_dst(skb)->lwtstate, mtu); + mtu = min(READ_ONCE(dst_dev(dst)->mtu), IP_MAX_MTU); + return mtu - lwtunnel_headroom(dst->lwtstate, mtu); } struct dst_metrics *ip_fib_metrics_init(struct nlattr *fc_mx, int fc_mx_len, @@ -667,14 +670,6 @@ static inline void ip_ipgre_mc_map(__be32 naddr, const unsigned char *broadcast, memcpy(buf, &naddr, sizeof(naddr)); } -#if IS_MODULE(CONFIG_IPV6) -#define EXPORT_IPV6_MOD(X) EXPORT_SYMBOL(X) -#define EXPORT_IPV6_MOD_GPL(X) EXPORT_SYMBOL_GPL(X) -#else -#define EXPORT_IPV6_MOD(X) -#define EXPORT_IPV6_MOD_GPL(X) -#endif - #if IS_ENABLED(CONFIG_IPV6) #include <linux/ipv6.h> #endif @@ -694,6 +689,14 @@ static __inline__ void inet_reset_saddr(struct sock *sk) #endif +#if IS_MODULE(CONFIG_IPV6) +#define EXPORT_IPV6_MOD(X) EXPORT_SYMBOL(X) +#define EXPORT_IPV6_MOD_GPL(X) EXPORT_SYMBOL_GPL(X) +#else +#define EXPORT_IPV6_MOD(X) +#define EXPORT_IPV6_MOD_GPL(X) +#endif + static inline unsigned int ipv4_addr_hash(__be32 ip) { return (__force unsigned int) ip; diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 7c87873ae211..88b0dd4d8e09 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -198,6 +198,7 @@ struct fib6_info { fib6_destroying:1, unused:4; + struct list_head purge_link; struct rcu_head rcu; struct nexthop *nh; struct fib6_nh fib6_nh[]; diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 6dbdf60b342f..9255f21818ee 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -274,7 +274,7 @@ static inline unsigned int ip6_skb_dst_mtu(const struct sk_buff *skb) unsigned int mtu; if (np && READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE) { - mtu = READ_ONCE(dst->dev->mtu); + mtu = READ_ONCE(dst_dev(dst)->mtu); mtu -= lwtunnel_headroom(dst->lwtstate, mtu); } else { mtu = dst_mtu(dst); @@ -337,7 +337,7 @@ static inline unsigned int ip6_dst_mtu_maybe_forward(const struct dst_entry *dst mtu = IPV6_MIN_MTU; rcu_read_lock(); - idev = __in6_dev_get(dst->dev); + idev = __in6_dev_get(dst_dev(dst)); if (idev) mtu = READ_ONCE(idev->cnf.mtu6); rcu_read_unlock(); diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 399592405c72..120db2865811 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -152,13 +152,14 @@ int ip6_tnl_get_iflink(const struct net_device *dev); int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu); static inline void ip6tunnel_xmit(struct sock *sk, struct sk_buff *skb, - struct net_device *dev) + struct net_device *dev, u16 ip6cb_flags) { int pkt_len, err; memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); + IP6CB(skb)->flags = ip6cb_flags; pkt_len = skb->len - skb_inner_network_offset(skb); - err = ip6_local_out(dev_net(skb_dst(skb)->dev), sk, skb); + err = ip6_local_out(skb_dst_dev_net(skb), sk, skb); if (dev) { if (unlikely(net_xmit_eval(err))) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index e3864b74e92a..48bb3cf41469 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -574,7 +574,8 @@ static inline u32 fib_multipath_hash_from_keys(const struct net *net, int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope, struct netlink_ext_ack *extack); -void fib_select_multipath(struct fib_result *res, int hash); +void fib_select_multipath(struct fib_result *res, int hash, + const struct flowi4 *fl4); void fib_select_path(struct net *net, struct fib_result *res, struct flowi4 *fl4, const struct sk_buff *skb); diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index a36a335cef9f..8cf1380f3656 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -377,10 +377,9 @@ struct net *ip_tunnel_get_link_net(const struct net_device *dev); int ip_tunnel_get_iflink(const struct net_device *dev); int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname); - -void ip_tunnel_delete_nets(struct list_head *list_net, unsigned int id, - struct rtnl_link_ops *ops, - struct list_head *dev_to_kill); +void ip_tunnel_delete_net(struct net *net, unsigned int id, + struct rtnl_link_ops *ops, + struct list_head *dev_to_kill); void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, const u8 protocol); @@ -604,7 +603,7 @@ static inline int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, u8 proto, - u8 tos, u8 ttl, __be16 df, bool xnet); + u8 tos, u8 ttl, __be16 df, bool xnet, u16 ipcb_flags); struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, gfp_t flags); int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst, diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index f7fe796e8429..1eb8dad18f7e 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -59,6 +59,20 @@ int l3mdev_ifindex_lookup_by_table_id(enum l3mdev_type l3type, struct net *net, int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, struct fib_lookup_arg *arg); +static inline +bool l3mdev_fib_rule_iif_match(const struct flowi *fl, int iifindex) +{ + return !(fl->flowi_flags & FLOWI_FLAG_L3MDEV_OIF) && + fl->flowi_l3mdev == iifindex; +} + +static inline +bool l3mdev_fib_rule_oif_match(const struct flowi *fl, int oifindex) +{ + return fl->flowi_flags & FLOWI_FLAG_L3MDEV_OIF && + fl->flowi_l3mdev == oifindex; +} + void l3mdev_update_flow(struct net *net, struct flowi *fl); int l3mdev_master_ifindex_rcu(const struct net_device *dev); @@ -327,6 +341,19 @@ int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, { return 1; } + +static inline +bool l3mdev_fib_rule_iif_match(const struct flowi *fl, int iifindex) +{ + return false; +} + +static inline +bool l3mdev_fib_rule_oif_match(const struct flowi *fl, int oifindex) +{ + return false; +} + static inline void l3mdev_update_flow(struct net *net, struct flowi *fl) { diff --git a/include/net/libeth/rx.h b/include/net/libeth/rx.h index ab05024be518..5d991404845e 100644 --- a/include/net/libeth/rx.h +++ b/include/net/libeth/rx.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* Copyright (C) 2024 Intel Corporation */ +/* Copyright (C) 2024-2025 Intel Corporation */ #ifndef __LIBETH_RX_H #define __LIBETH_RX_H @@ -13,8 +13,10 @@ /* Space reserved in front of each frame */ #define LIBETH_SKB_HEADROOM (NET_SKB_PAD + NET_IP_ALIGN) +#define LIBETH_XDP_HEADROOM (ALIGN(XDP_PACKET_HEADROOM, NET_SKB_PAD) + \ + NET_IP_ALIGN) /* Maximum headroom for worst-case calculations */ -#define LIBETH_MAX_HEADROOM LIBETH_SKB_HEADROOM +#define LIBETH_MAX_HEADROOM LIBETH_XDP_HEADROOM /* Link layer / L2 overhead: Ethernet, 2 VLAN tags (C + S), FCS */ #define LIBETH_RX_LL_LEN (ETH_HLEN + 2 * VLAN_HLEN + ETH_FCS_LEN) /* Maximum supported L2-L4 header length */ @@ -31,7 +33,7 @@ /** * struct libeth_fqe - structure representing an Rx buffer (fill queue element) - * @page: page holding the buffer + * @netmem: network memory reference holding the buffer * @offset: offset from the page start (to the headroom) * @truesize: total space occupied by the buffer (w/ headroom and tailroom) * @@ -40,7 +42,7 @@ * former, @offset is always 0 and @truesize is always ```PAGE_SIZE```. */ struct libeth_fqe { - struct page *page; + netmem_ref netmem; u32 offset; u32 truesize; } __aligned_largest; @@ -66,6 +68,7 @@ enum libeth_fqe_type { * @count: number of descriptors/buffers the queue has * @type: type of the buffers this queue has * @hsplit: flag whether header split is enabled + * @xdp: flag indicating whether XDP is enabled * @buf_len: HW-writeable length per each buffer * @nid: ID of the closest NUMA node with memory */ @@ -81,6 +84,7 @@ struct libeth_fq { /* Cold fields */ enum libeth_fqe_type type:2; bool hsplit:1; + bool xdp:1; u32 buf_len; int nid; @@ -102,15 +106,16 @@ static inline dma_addr_t libeth_rx_alloc(const struct libeth_fq_fp *fq, u32 i) struct libeth_fqe *buf = &fq->fqes[i]; buf->truesize = fq->truesize; - buf->page = page_pool_dev_alloc(fq->pp, &buf->offset, &buf->truesize); - if (unlikely(!buf->page)) + buf->netmem = page_pool_dev_alloc_netmem(fq->pp, &buf->offset, + &buf->truesize); + if (unlikely(!buf->netmem)) return DMA_MAPPING_ERROR; - return page_pool_get_dma_addr(buf->page) + buf->offset + + return page_pool_get_dma_addr_netmem(buf->netmem) + buf->offset + fq->pp->p.offset; } -void libeth_rx_recycle_slow(struct page *page); +void libeth_rx_recycle_slow(netmem_ref netmem); /** * libeth_rx_sync_for_cpu - synchronize or recycle buffer post DMA @@ -126,18 +131,19 @@ void libeth_rx_recycle_slow(struct page *page); static inline bool libeth_rx_sync_for_cpu(const struct libeth_fqe *fqe, u32 len) { - struct page *page = fqe->page; + netmem_ref netmem = fqe->netmem; /* Very rare, but possible case. The most common reason: * the last fragment contained FCS only, which was then * stripped by the HW. */ if (unlikely(!len)) { - libeth_rx_recycle_slow(page); + libeth_rx_recycle_slow(netmem); return false; } - page_pool_dma_sync_for_cpu(page->pp, page, fqe->offset, len); + page_pool_dma_sync_netmem_for_cpu(netmem_get_pp(netmem), netmem, + fqe->offset, len); return true; } diff --git a/include/net/libeth/tx.h b/include/net/libeth/tx.h index 35614f9523f6..c3db5c6f1641 100644 --- a/include/net/libeth/tx.h +++ b/include/net/libeth/tx.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* Copyright (C) 2024 Intel Corporation */ +/* Copyright (C) 2024-2025 Intel Corporation */ #ifndef __LIBETH_TX_H #define __LIBETH_TX_H @@ -12,11 +12,17 @@ /** * enum libeth_sqe_type - type of &libeth_sqe to act on Tx completion - * @LIBETH_SQE_EMPTY: unused/empty, no action required + * @LIBETH_SQE_EMPTY: unused/empty OR XDP_TX/XSk frame, no action required * @LIBETH_SQE_CTX: context descriptor with empty SQE, no action required * @LIBETH_SQE_SLAB: kmalloc-allocated buffer, unmap and kfree() * @LIBETH_SQE_FRAG: mapped skb frag, only unmap DMA * @LIBETH_SQE_SKB: &sk_buff, unmap and napi_consume_skb(), update stats + * @__LIBETH_SQE_XDP_START: separator between skb and XDP types + * @LIBETH_SQE_XDP_TX: &skb_shared_info, libeth_xdp_return_buff_bulk(), stats + * @LIBETH_SQE_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame_bulk(), stats + * @LIBETH_SQE_XDP_XMIT_FRAG: &xdp_frame frag, only unmap DMA + * @LIBETH_SQE_XSK_TX: &libeth_xdp_buff on XSk queue, xsk_buff_free(), stats + * @LIBETH_SQE_XSK_TX_FRAG: &libeth_xdp_buff frag on XSk queue, xsk_buff_free() */ enum libeth_sqe_type { LIBETH_SQE_EMPTY = 0U, @@ -24,6 +30,13 @@ enum libeth_sqe_type { LIBETH_SQE_SLAB, LIBETH_SQE_FRAG, LIBETH_SQE_SKB, + + __LIBETH_SQE_XDP_START, + LIBETH_SQE_XDP_TX = __LIBETH_SQE_XDP_START, + LIBETH_SQE_XDP_XMIT, + LIBETH_SQE_XDP_XMIT_FRAG, + LIBETH_SQE_XSK_TX, + LIBETH_SQE_XSK_TX_FRAG, }; /** @@ -32,6 +45,9 @@ enum libeth_sqe_type { * @rs_idx: index of the last buffer from the batch this one was sent in * @raw: slab buffer to free via kfree() * @skb: &sk_buff to consume + * @sinfo: skb shared info of an XDP_TX frame + * @xdpf: XDP frame from ::ndo_xdp_xmit() + * @xsk: XSk Rx frame from XDP_TX action * @dma: DMA address to unmap * @len: length of the mapped region to unmap * @nr_frags: number of frags in the frame this buffer belongs to @@ -46,6 +62,9 @@ struct libeth_sqe { union { void *raw; struct sk_buff *skb; + struct skb_shared_info *sinfo; + struct xdp_frame *xdpf; + struct libeth_xdp_buff *xsk; }; DEFINE_DMA_UNMAP_ADDR(dma); @@ -71,7 +90,10 @@ struct libeth_sqe { /** * struct libeth_cq_pp - completion queue poll params * @dev: &device to perform DMA unmapping + * @bq: XDP frame bulk to combine return operations * @ss: onstack NAPI stats to fill + * @xss: onstack XDPSQ NAPI stats to fill + * @xdp_tx: number of XDP-not-XSk frames processed * @napi: whether it's called from the NAPI context * * libeth uses this structure to access objects needed for performing full @@ -80,7 +102,13 @@ struct libeth_sqe { */ struct libeth_cq_pp { struct device *dev; - struct libeth_sq_napi_stats *ss; + struct xdp_frame_bulk *bq; + + union { + struct libeth_sq_napi_stats *ss; + struct libeth_xdpsq_napi_stats *xss; + }; + u32 xdp_tx; bool napi; }; @@ -126,4 +154,6 @@ static inline void libeth_tx_complete(struct libeth_sqe *sqe, sqe->type = LIBETH_SQE_EMPTY; } +void libeth_tx_complete_any(struct libeth_sqe *sqe, struct libeth_cq_pp *cp); + #endif /* __LIBETH_TX_H */ diff --git a/include/net/libeth/types.h b/include/net/libeth/types.h index 603825e45133..cf1d78a9dc38 100644 --- a/include/net/libeth/types.h +++ b/include/net/libeth/types.h @@ -1,10 +1,32 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* Copyright (C) 2024 Intel Corporation */ +/* Copyright (C) 2024-2025 Intel Corporation */ #ifndef __LIBETH_TYPES_H #define __LIBETH_TYPES_H -#include <linux/types.h> +#include <linux/workqueue.h> + +/* Stats */ + +/** + * struct libeth_rq_napi_stats - "hot" counters to update in Rx polling loop + * @packets: received frames counter + * @bytes: sum of bytes of received frames above + * @fragments: sum of fragments of received S/G frames + * @hsplit: number of frames the device performed the header split for + * @raw: alias to access all the fields as an array + */ +struct libeth_rq_napi_stats { + union { + struct { + u32 packets; + u32 bytes; + u32 fragments; + u32 hsplit; + }; + DECLARE_FLEX_ARRAY(u32, raw); + }; +}; /** * struct libeth_sq_napi_stats - "hot" counters to update in Tx completion loop @@ -22,4 +44,84 @@ struct libeth_sq_napi_stats { }; }; +/** + * struct libeth_xdpsq_napi_stats - "hot" counters to update in XDP Tx + * completion loop + * @packets: completed frames counter + * @bytes: sum of bytes of completed frames above + * @fragments: sum of fragments of completed S/G frames + * @raw: alias to access all the fields as an array + */ +struct libeth_xdpsq_napi_stats { + union { + struct { + u32 packets; + u32 bytes; + u32 fragments; + }; + DECLARE_FLEX_ARRAY(u32, raw); + }; +}; + +/* XDP */ + +/* + * The following structures should be embedded into driver's queue structure + * and passed to the libeth_xdp helpers, never used directly. + */ + +/* XDPSQ sharing */ + +/** + * struct libeth_xdpsq_lock - locking primitive for sharing XDPSQs + * @lock: spinlock for locking the queue + * @share: whether this particular queue is shared + */ +struct libeth_xdpsq_lock { + spinlock_t lock; + bool share; +}; + +/* XDPSQ clean-up timers */ + +/** + * struct libeth_xdpsq_timer - timer for cleaning up XDPSQs w/o interrupts + * @xdpsq: queue this timer belongs to + * @lock: lock for the queue + * @dwork: work performing cleanups + * + * XDPSQs not using interrupts but lazy cleaning, i.e. only when there's no + * space for sending the current queued frame/bulk, must fire up timers to + * make sure there are no stale buffers to free. + */ +struct libeth_xdpsq_timer { + void *xdpsq; + struct libeth_xdpsq_lock *lock; + + struct delayed_work dwork; +}; + +/* Rx polling path */ + +/** + * struct libeth_xdp_buff_stash - struct for stashing &xdp_buff onto a queue + * @data: pointer to the start of the frame, xdp_buff.data + * @headroom: frame headroom, xdp_buff.data - xdp_buff.data_hard_start + * @len: frame linear space length, xdp_buff.data_end - xdp_buff.data + * @frame_sz: truesize occupied by the frame, xdp_buff.frame_sz + * @flags: xdp_buff.flags + * + * &xdp_buff is 56 bytes long on x64, &libeth_xdp_buff is 64 bytes. This + * structure carries only necessary fields to save/restore a partially built + * frame on the queue structure to finish it during the next NAPI poll. + */ +struct libeth_xdp_buff_stash { + void *data; + u16 headroom; + u16 len; + + u32 frame_sz:24; + u32 flags:8; +} __aligned_largest; + #endif /* __LIBETH_TYPES_H */ diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h new file mode 100644 index 000000000000..f4880b50e804 --- /dev/null +++ b/include/net/libeth/xdp.h @@ -0,0 +1,1879 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (C) 2025 Intel Corporation */ + +#ifndef __LIBETH_XDP_H +#define __LIBETH_XDP_H + +#include <linux/bpf_trace.h> +#include <linux/unroll.h> + +#include <net/libeth/rx.h> +#include <net/libeth/tx.h> +#include <net/xsk_buff_pool.h> + +/* + * Defined as bits to be able to use them as a mask on Rx. + * Also used as internal return values on Tx. + */ +enum { + LIBETH_XDP_PASS = 0U, + LIBETH_XDP_DROP = BIT(0), + LIBETH_XDP_ABORTED = BIT(1), + LIBETH_XDP_TX = BIT(2), + LIBETH_XDP_REDIRECT = BIT(3), +}; + +/* + * &xdp_buff_xsk is the largest structure &libeth_xdp_buff gets casted to, + * pick maximum pointer-compatible alignment. + */ +#define __LIBETH_XDP_BUFF_ALIGN \ + (IS_ALIGNED(sizeof(struct xdp_buff_xsk), 16) ? 16 : \ + IS_ALIGNED(sizeof(struct xdp_buff_xsk), 8) ? 8 : \ + sizeof(long)) + +/** + * struct libeth_xdp_buff - libeth extension over &xdp_buff + * @base: main &xdp_buff + * @data: shortcut for @base.data + * @desc: RQ descriptor containing metadata for this buffer + * @priv: driver-private scratchspace + * + * The main reason for this is to have a pointer to the descriptor to be able + * to quickly get frame metadata from xdpmo and driver buff-to-xdp callbacks + * (as well as bigger alignment). + * Pointer/layout-compatible with &xdp_buff and &xdp_buff_xsk. + */ +struct libeth_xdp_buff { + union { + struct xdp_buff base; + void *data; + }; + + const void *desc; + unsigned long priv[] + __aligned(__LIBETH_XDP_BUFF_ALIGN); +} __aligned(__LIBETH_XDP_BUFF_ALIGN); +static_assert(offsetof(struct libeth_xdp_buff, data) == + offsetof(struct xdp_buff_xsk, xdp.data)); +static_assert(offsetof(struct libeth_xdp_buff, desc) == + offsetof(struct xdp_buff_xsk, cb)); +static_assert(IS_ALIGNED(sizeof(struct xdp_buff_xsk), + __alignof(struct libeth_xdp_buff))); + +/** + * __LIBETH_XDP_ONSTACK_BUFF - declare a &libeth_xdp_buff on the stack + * @name: name of the variable to declare + * @...: sizeof() of the driver-private data + */ +#define __LIBETH_XDP_ONSTACK_BUFF(name, ...) \ + ___LIBETH_XDP_ONSTACK_BUFF(name, ##__VA_ARGS__) +/** + * LIBETH_XDP_ONSTACK_BUFF - declare a &libeth_xdp_buff on the stack + * @name: name of the variable to declare + * @...: type or variable name of the driver-private data + */ +#define LIBETH_XDP_ONSTACK_BUFF(name, ...) \ + __LIBETH_XDP_ONSTACK_BUFF(name, __libeth_xdp_priv_sz(__VA_ARGS__)) + +#define ___LIBETH_XDP_ONSTACK_BUFF(name, ...) \ + __DEFINE_FLEX(struct libeth_xdp_buff, name, priv, \ + LIBETH_XDP_PRIV_SZ(__VA_ARGS__ + 0), \ + __uninitialized); \ + LIBETH_XDP_ASSERT_PRIV_SZ(__VA_ARGS__ + 0) + +#define __libeth_xdp_priv_sz(...) \ + CONCATENATE(__libeth_xdp_psz, COUNT_ARGS(__VA_ARGS__))(__VA_ARGS__) + +#define __libeth_xdp_psz0(...) +#define __libeth_xdp_psz1(...) sizeof(__VA_ARGS__) + +#define LIBETH_XDP_PRIV_SZ(sz) \ + (ALIGN(sz, __alignof(struct libeth_xdp_buff)) / sizeof(long)) + +/* Performs XSK_CHECK_PRIV_TYPE() */ +#define LIBETH_XDP_ASSERT_PRIV_SZ(sz) \ + static_assert(offsetofend(struct xdp_buff_xsk, cb) >= \ + struct_size_t(struct libeth_xdp_buff, priv, \ + LIBETH_XDP_PRIV_SZ(sz))) + +/* XDPSQ sharing */ + +DECLARE_STATIC_KEY_FALSE(libeth_xdpsq_share); + +/** + * libeth_xdpsq_num - calculate optimal number of XDPSQs for this device + sys + * @rxq: current number of active Rx queues + * @txq: current number of active Tx queues + * @max: maximum number of Tx queues + * + * Each RQ must have its own XDPSQ for XSk pairs, each CPU must have own XDPSQ + * for lockless sending (``XDP_TX``, .ndo_xdp_xmit()). Cap the maximum of these + * two with the number of SQs the device can have (minus used ones). + * + * Return: number of XDP Tx queues the device needs to use. + */ +static inline u32 libeth_xdpsq_num(u32 rxq, u32 txq, u32 max) +{ + return min(max(nr_cpu_ids, rxq), max - txq); +} + +/** + * libeth_xdpsq_shared - whether XDPSQs can be shared between several CPUs + * @num: number of active XDPSQs + * + * Return: true if there's no 1:1 XDPSQ/CPU association, false otherwise. + */ +static inline bool libeth_xdpsq_shared(u32 num) +{ + return num < nr_cpu_ids; +} + +/** + * libeth_xdpsq_id - get XDPSQ index corresponding to this CPU + * @num: number of active XDPSQs + * + * Helper for libeth_xdp routines, do not use in drivers directly. + * + * Return: XDPSQ index needs to be used on this CPU. + */ +static inline u32 libeth_xdpsq_id(u32 num) +{ + u32 ret = raw_smp_processor_id(); + + if (static_branch_unlikely(&libeth_xdpsq_share) && + libeth_xdpsq_shared(num)) + ret %= num; + + return ret; +} + +void __libeth_xdpsq_get(struct libeth_xdpsq_lock *lock, + const struct net_device *dev); +void __libeth_xdpsq_put(struct libeth_xdpsq_lock *lock, + const struct net_device *dev); + +/** + * libeth_xdpsq_get - initialize &libeth_xdpsq_lock + * @lock: lock to initialize + * @dev: netdev which this lock belongs to + * @share: whether XDPSQs can be shared + * + * Tracks the current XDPSQ association and enables the static lock + * if needed. + */ +static inline void libeth_xdpsq_get(struct libeth_xdpsq_lock *lock, + const struct net_device *dev, + bool share) +{ + if (unlikely(share)) + __libeth_xdpsq_get(lock, dev); +} + +/** + * libeth_xdpsq_put - deinitialize &libeth_xdpsq_lock + * @lock: lock to deinitialize + * @dev: netdev which this lock belongs to + * + * Tracks the current XDPSQ association and disables the static lock + * if needed. + */ +static inline void libeth_xdpsq_put(struct libeth_xdpsq_lock *lock, + const struct net_device *dev) +{ + if (static_branch_unlikely(&libeth_xdpsq_share) && lock->share) + __libeth_xdpsq_put(lock, dev); +} + +void __libeth_xdpsq_lock(struct libeth_xdpsq_lock *lock); +void __libeth_xdpsq_unlock(struct libeth_xdpsq_lock *lock); + +/** + * libeth_xdpsq_lock - grab &libeth_xdpsq_lock if needed + * @lock: lock to take + * + * Touches the underlying spinlock only if the static key is enabled + * and the queue itself is marked as shareable. + */ +static inline void libeth_xdpsq_lock(struct libeth_xdpsq_lock *lock) +{ + if (static_branch_unlikely(&libeth_xdpsq_share) && lock->share) + __libeth_xdpsq_lock(lock); +} + +/** + * libeth_xdpsq_unlock - free &libeth_xdpsq_lock if needed + * @lock: lock to free + * + * Touches the underlying spinlock only if the static key is enabled + * and the queue itself is marked as shareable. + */ +static inline void libeth_xdpsq_unlock(struct libeth_xdpsq_lock *lock) +{ + if (static_branch_unlikely(&libeth_xdpsq_share) && lock->share) + __libeth_xdpsq_unlock(lock); +} + +/* XDPSQ clean-up timers */ + +void libeth_xdpsq_init_timer(struct libeth_xdpsq_timer *timer, void *xdpsq, + struct libeth_xdpsq_lock *lock, + void (*poll)(struct work_struct *work)); + +/** + * libeth_xdpsq_deinit_timer - deinitialize &libeth_xdpsq_timer + * @timer: timer to deinitialize + * + * Flush and disable the underlying workqueue. + */ +static inline void libeth_xdpsq_deinit_timer(struct libeth_xdpsq_timer *timer) +{ + cancel_delayed_work_sync(&timer->dwork); +} + +/** + * libeth_xdpsq_queue_timer - run &libeth_xdpsq_timer + * @timer: timer to queue + * + * Should be called after the queue was filled and the transmission was run + * to complete the pending buffers if no further sending will be done in a + * second (-> lazy cleaning won't happen). + * If the timer was already run, it will be requeued back to one second + * timeout again. + */ +static inline void libeth_xdpsq_queue_timer(struct libeth_xdpsq_timer *timer) +{ + mod_delayed_work_on(raw_smp_processor_id(), system_bh_highpri_wq, + &timer->dwork, HZ); +} + +/** + * libeth_xdpsq_run_timer - wrapper to run a queue clean-up on a timer event + * @work: workqueue belonging to the corresponding timer + * @poll: driver-specific completion queue poll function + * + * Run the polling function on the locked queue and requeue the timer if + * there's more work to do. + * Designed to be used via LIBETH_XDP_DEFINE_TIMER() below. + */ +static __always_inline void +libeth_xdpsq_run_timer(struct work_struct *work, + u32 (*poll)(void *xdpsq, u32 budget)) +{ + struct libeth_xdpsq_timer *timer = container_of(work, typeof(*timer), + dwork.work); + + libeth_xdpsq_lock(timer->lock); + + if (poll(timer->xdpsq, U32_MAX)) + libeth_xdpsq_queue_timer(timer); + + libeth_xdpsq_unlock(timer->lock); +} + +/* Common Tx bits */ + +/** + * enum - libeth_xdp internal Tx flags + * @LIBETH_XDP_TX_BULK: one bulk size at which it will be flushed to the queue + * @LIBETH_XDP_TX_BATCH: batch size for which the queue fill loop is unrolled + * @LIBETH_XDP_TX_DROP: indicates the send function must drop frames not sent + * @LIBETH_XDP_TX_NDO: whether the send function is called from .ndo_xdp_xmit() + * @LIBETH_XDP_TX_XSK: whether the function is called for ``XDP_TX`` for XSk + */ +enum { + LIBETH_XDP_TX_BULK = DEV_MAP_BULK_SIZE, + LIBETH_XDP_TX_BATCH = 8, + + LIBETH_XDP_TX_DROP = BIT(0), + LIBETH_XDP_TX_NDO = BIT(1), + LIBETH_XDP_TX_XSK = BIT(2), +}; + +/** + * enum - &libeth_xdp_tx_frame and &libeth_xdp_tx_desc flags + * @LIBETH_XDP_TX_LEN: only for ``XDP_TX``, [15:0] of ::len_fl is actual length + * @LIBETH_XDP_TX_CSUM: for XSk xmit, enable checksum offload + * @LIBETH_XDP_TX_XSKMD: for XSk xmit, mask of the metadata bits + * @LIBETH_XDP_TX_FIRST: indicates the frag is the first one of the frame + * @LIBETH_XDP_TX_LAST: whether the frag is the last one of the frame + * @LIBETH_XDP_TX_MULTI: whether the frame contains several frags + * @LIBETH_XDP_TX_FLAGS: only for ``XDP_TX``, [31:16] of ::len_fl is flags + */ +enum { + LIBETH_XDP_TX_LEN = GENMASK(15, 0), + + LIBETH_XDP_TX_CSUM = XDP_TXMD_FLAGS_CHECKSUM, + LIBETH_XDP_TX_XSKMD = LIBETH_XDP_TX_LEN, + + LIBETH_XDP_TX_FIRST = BIT(16), + LIBETH_XDP_TX_LAST = BIT(17), + LIBETH_XDP_TX_MULTI = BIT(18), + + LIBETH_XDP_TX_FLAGS = GENMASK(31, 16), +}; + +/** + * struct libeth_xdp_tx_frame - represents one XDP Tx element + * @data: frame start pointer for ``XDP_TX`` + * @len_fl: ``XDP_TX``, combined flags [31:16] and len [15:0] field for speed + * @soff: ``XDP_TX``, offset from @data to the start of &skb_shared_info + * @frag: one (non-head) frag for ``XDP_TX`` + * @xdpf: &xdp_frame for the head frag for .ndo_xdp_xmit() + * @dma: DMA address of the non-head frag for .ndo_xdp_xmit() + * @xsk: ``XDP_TX`` for XSk, XDP buffer for any frag + * @len: frag length for XSk ``XDP_TX`` and .ndo_xdp_xmit() + * @flags: Tx flags for the above + * @opts: combined @len + @flags for the above for speed + * @desc: XSk xmit descriptor for direct casting + */ +struct libeth_xdp_tx_frame { + union { + /* ``XDP_TX`` */ + struct { + void *data; + u32 len_fl; + u32 soff; + }; + + /* ``XDP_TX`` frag */ + skb_frag_t frag; + + /* .ndo_xdp_xmit(), XSk ``XDP_TX`` */ + struct { + union { + struct xdp_frame *xdpf; + dma_addr_t dma; + + struct libeth_xdp_buff *xsk; + }; + union { + struct { + u32 len; + u32 flags; + }; + aligned_u64 opts; + }; + }; + + /* XSk xmit */ + struct xdp_desc desc; + }; +} __aligned(sizeof(struct xdp_desc)); +static_assert(offsetof(struct libeth_xdp_tx_frame, frag.len) == + offsetof(struct libeth_xdp_tx_frame, len_fl)); +static_assert(sizeof(struct libeth_xdp_tx_frame) == sizeof(struct xdp_desc)); + +/** + * struct libeth_xdp_tx_bulk - XDP Tx frame bulk for bulk sending + * @prog: corresponding active XDP program, %NULL for .ndo_xdp_xmit() + * @dev: &net_device which the frames are transmitted on + * @xdpsq: shortcut to the corresponding driver-specific XDPSQ structure + * @act_mask: Rx only, mask of all the XDP prog verdicts for that NAPI session + * @count: current number of frames in @bulk + * @bulk: array of queued frames for bulk Tx + * + * All XDP Tx operations except XSk xmit queue each frame to the bulk first + * and flush it when @count reaches the array end. Bulk is always placed on + * the stack for performance. One bulk element contains all the data necessary + * for sending a frame and then freeing it on completion. + * For XSk xmit, Tx descriptor array from &xsk_buff_pool is casted directly + * to &libeth_xdp_tx_frame as they are compatible and the bulk structure is + * not used. + */ +struct libeth_xdp_tx_bulk { + const struct bpf_prog *prog; + struct net_device *dev; + void *xdpsq; + + u32 act_mask; + u32 count; + struct libeth_xdp_tx_frame bulk[LIBETH_XDP_TX_BULK]; +} __aligned(sizeof(struct libeth_xdp_tx_frame)); + +/** + * LIBETH_XDP_ONSTACK_BULK - declare &libeth_xdp_tx_bulk on the stack + * @bq: name of the variable to declare + * + * Helper to declare a bulk on the stack with a compiler hint that it should + * not be initialized automatically (with `CONFIG_INIT_STACK_ALL_*`) for + * performance reasons. + */ +#define LIBETH_XDP_ONSTACK_BULK(bq) \ + struct libeth_xdp_tx_bulk bq __uninitialized + +/** + * struct libeth_xdpsq - abstraction for an XDPSQ + * @pool: XSk buffer pool for XSk ``XDP_TX`` and xmit + * @sqes: array of Tx buffers from the actual queue struct + * @descs: opaque pointer to the HW descriptor array + * @ntu: pointer to the next free descriptor index + * @count: number of descriptors on that queue + * @pending: pointer to the number of sent-not-completed descs on that queue + * @xdp_tx: pointer to the above, but only for non-XSk-xmit frames + * @lock: corresponding XDPSQ lock + * + * Abstraction for driver-independent implementation of Tx. Placed on the stack + * and filled by the driver before the transmission, so that the generic + * functions can access and modify driver-specific resources. + */ +struct libeth_xdpsq { + struct xsk_buff_pool *pool; + struct libeth_sqe *sqes; + void *descs; + + u32 *ntu; + u32 count; + + u32 *pending; + u32 *xdp_tx; + struct libeth_xdpsq_lock *lock; +}; + +/** + * struct libeth_xdp_tx_desc - abstraction for an XDP Tx descriptor + * @addr: DMA address of the frame + * @len: length of the frame + * @flags: XDP Tx flags + * @opts: combined @len + @flags for speed + * + * Filled by the generic functions and then passed to driver-specific functions + * to fill a HW Tx descriptor, always placed on the [function] stack. + */ +struct libeth_xdp_tx_desc { + dma_addr_t addr; + union { + struct { + u32 len; + u32 flags; + }; + aligned_u64 opts; + }; +} __aligned_largest; + +/** + * libeth_xdp_ptr_to_priv - convert pointer to a libeth_xdp u64 priv + * @ptr: pointer to convert + * + * The main sending function passes private data as the largest scalar, u64. + * Use this helper when you want to pass a pointer there. + */ +#define libeth_xdp_ptr_to_priv(ptr) ({ \ + typecheck_pointer(ptr); \ + ((u64)(uintptr_t)(ptr)); \ +}) +/** + * libeth_xdp_priv_to_ptr - convert libeth_xdp u64 priv to a pointer + * @priv: private data to convert + * + * The main sending function passes private data as the largest scalar, u64. + * Use this helper when your callback takes this u64 and you want to convert + * it back to a pointer. + */ +#define libeth_xdp_priv_to_ptr(priv) ({ \ + static_assert(__same_type(priv, u64)); \ + ((const void *)(uintptr_t)(priv)); \ +}) + +/* + * On 64-bit systems, assigning one u64 is faster than two u32s. When ::len + * occupies lowest 32 bits (LE), whole ::opts can be assigned directly instead. + */ +#ifdef __LITTLE_ENDIAN +#define __LIBETH_WORD_ACCESS 1 +#endif +#ifdef __LIBETH_WORD_ACCESS +#define __libeth_xdp_tx_len(flen, ...) \ + .opts = ((flen) | FIELD_PREP(GENMASK_ULL(63, 32), (__VA_ARGS__ + 0))) +#else +#define __libeth_xdp_tx_len(flen, ...) \ + .len = (flen), .flags = (__VA_ARGS__ + 0) +#endif + +/** + * libeth_xdp_tx_xmit_bulk - main XDP Tx function + * @bulk: array of frames to send + * @xdpsq: pointer to the driver-specific XDPSQ struct + * @n: number of frames to send + * @unroll: whether to unroll the queue filling loop for speed + * @priv: driver-specific private data + * @prep: callback for cleaning the queue and filling abstract &libeth_xdpsq + * @fill: internal callback for filling &libeth_sqe and &libeth_xdp_tx_desc + * @xmit: callback for filling a HW descriptor with the frame info + * + * Internal abstraction for placing @n XDP Tx frames on the HW XDPSQ. Used for + * all types of frames: ``XDP_TX``, .ndo_xdp_xmit(), XSk ``XDP_TX``, and XSk + * xmit. + * @prep must lock the queue as this function releases it at the end. @unroll + * greatly increases the object code size, but also greatly increases XSk xmit + * performance; for other types of frames, it's not enabled. + * The compilers inline all those onstack abstractions to direct data accesses. + * + * Return: number of frames actually placed on the queue, <= @n. The function + * can't fail, but can send less frames if there's no enough free descriptors + * available. The actual free space is returned by @prep from the driver. + */ +static __always_inline u32 +libeth_xdp_tx_xmit_bulk(const struct libeth_xdp_tx_frame *bulk, void *xdpsq, + u32 n, bool unroll, u64 priv, + u32 (*prep)(void *xdpsq, struct libeth_xdpsq *sq), + struct libeth_xdp_tx_desc + (*fill)(struct libeth_xdp_tx_frame frm, u32 i, + const struct libeth_xdpsq *sq, u64 priv), + void (*xmit)(struct libeth_xdp_tx_desc desc, u32 i, + const struct libeth_xdpsq *sq, u64 priv)) +{ + struct libeth_xdpsq sq __uninitialized; + u32 this, batched, off = 0; + u32 ntu, i = 0; + + n = min(n, prep(xdpsq, &sq)); + if (unlikely(!n)) + goto unlock; + + ntu = *sq.ntu; + + this = sq.count - ntu; + if (likely(this > n)) + this = n; + +again: + if (!unroll) + goto linear; + + batched = ALIGN_DOWN(this, LIBETH_XDP_TX_BATCH); + + for ( ; i < off + batched; i += LIBETH_XDP_TX_BATCH) { + u32 base = ntu + i - off; + + unrolled_count(LIBETH_XDP_TX_BATCH) + for (u32 j = 0; j < LIBETH_XDP_TX_BATCH; j++) + xmit(fill(bulk[i + j], base + j, &sq, priv), + base + j, &sq, priv); + } + + if (batched < this) { +linear: + for ( ; i < off + this; i++) + xmit(fill(bulk[i], ntu + i - off, &sq, priv), + ntu + i - off, &sq, priv); + } + + ntu += this; + if (likely(ntu < sq.count)) + goto out; + + ntu = 0; + + if (i < n) { + this = n - i; + off = i; + + goto again; + } + +out: + *sq.ntu = ntu; + *sq.pending += n; + if (sq.xdp_tx) + *sq.xdp_tx += n; + +unlock: + libeth_xdpsq_unlock(sq.lock); + + return n; +} + +/* ``XDP_TX`` bulking */ + +void libeth_xdp_return_buff_slow(struct libeth_xdp_buff *xdp); + +/** + * libeth_xdp_tx_queue_head - internal helper for queueing one ``XDP_TX`` head + * @bq: XDP Tx bulk to queue the head frag to + * @xdp: XDP buffer with the head to queue + * + * Return: false if it's the only frag of the frame, true if it's an S/G frame. + */ +static inline bool libeth_xdp_tx_queue_head(struct libeth_xdp_tx_bulk *bq, + const struct libeth_xdp_buff *xdp) +{ + const struct xdp_buff *base = &xdp->base; + + bq->bulk[bq->count++] = (typeof(*bq->bulk)){ + .data = xdp->data, + .len_fl = (base->data_end - xdp->data) | LIBETH_XDP_TX_FIRST, + .soff = xdp_data_hard_end(base) - xdp->data, + }; + + if (!xdp_buff_has_frags(base)) + return false; + + bq->bulk[bq->count - 1].len_fl |= LIBETH_XDP_TX_MULTI; + + return true; +} + +/** + * libeth_xdp_tx_queue_frag - internal helper for queueing one ``XDP_TX`` frag + * @bq: XDP Tx bulk to queue the frag to + * @frag: frag to queue + */ +static inline void libeth_xdp_tx_queue_frag(struct libeth_xdp_tx_bulk *bq, + const skb_frag_t *frag) +{ + bq->bulk[bq->count++].frag = *frag; +} + +/** + * libeth_xdp_tx_queue_bulk - internal helper for queueing one ``XDP_TX`` frame + * @bq: XDP Tx bulk to queue the frame to + * @xdp: XDP buffer to queue + * @flush_bulk: driver callback to flush the bulk to the HW queue + * + * Return: true on success, false on flush error. + */ +static __always_inline bool +libeth_xdp_tx_queue_bulk(struct libeth_xdp_tx_bulk *bq, + struct libeth_xdp_buff *xdp, + bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq, + u32 flags)) +{ + const struct skb_shared_info *sinfo; + bool ret = true; + u32 nr_frags; + + if (unlikely(bq->count == LIBETH_XDP_TX_BULK) && + unlikely(!flush_bulk(bq, 0))) { + libeth_xdp_return_buff_slow(xdp); + return false; + } + + if (!libeth_xdp_tx_queue_head(bq, xdp)) + goto out; + + sinfo = xdp_get_shared_info_from_buff(&xdp->base); + nr_frags = sinfo->nr_frags; + + for (u32 i = 0; i < nr_frags; i++) { + if (unlikely(bq->count == LIBETH_XDP_TX_BULK) && + unlikely(!flush_bulk(bq, 0))) { + ret = false; + break; + } + + libeth_xdp_tx_queue_frag(bq, &sinfo->frags[i]); + } + +out: + bq->bulk[bq->count - 1].len_fl |= LIBETH_XDP_TX_LAST; + xdp->data = NULL; + + return ret; +} + +/** + * libeth_xdp_tx_fill_stats - fill &libeth_sqe with ``XDP_TX`` frame stats + * @sqe: SQ element to fill + * @desc: libeth_xdp Tx descriptor + * @sinfo: &skb_shared_info for this frame + * + * Internal helper for filling an SQE with the frame stats, do not use in + * drivers. Fills the number of frags and bytes for this frame. + */ +#define libeth_xdp_tx_fill_stats(sqe, desc, sinfo) \ + __libeth_xdp_tx_fill_stats(sqe, desc, sinfo, __UNIQUE_ID(sqe_), \ + __UNIQUE_ID(desc_), __UNIQUE_ID(sinfo_)) + +#define __libeth_xdp_tx_fill_stats(sqe, desc, sinfo, ue, ud, us) do { \ + const struct libeth_xdp_tx_desc *ud = (desc); \ + const struct skb_shared_info *us; \ + struct libeth_sqe *ue = (sqe); \ + \ + ue->nr_frags = 1; \ + ue->bytes = ud->len; \ + \ + if (ud->flags & LIBETH_XDP_TX_MULTI) { \ + us = (sinfo); \ + ue->nr_frags += us->nr_frags; \ + ue->bytes += us->xdp_frags_size; \ + } \ +} while (0) + +/** + * libeth_xdp_tx_fill_buf - internal helper to fill one ``XDP_TX`` &libeth_sqe + * @frm: XDP Tx frame from the bulk + * @i: index on the HW queue + * @sq: XDPSQ abstraction for the queue + * @priv: private data + * + * Return: XDP Tx descriptor with the synced DMA and other info to pass to + * the driver callback. + */ +static inline struct libeth_xdp_tx_desc +libeth_xdp_tx_fill_buf(struct libeth_xdp_tx_frame frm, u32 i, + const struct libeth_xdpsq *sq, u64 priv) +{ + struct libeth_xdp_tx_desc desc; + struct skb_shared_info *sinfo; + skb_frag_t *frag = &frm.frag; + struct libeth_sqe *sqe; + netmem_ref netmem; + + if (frm.len_fl & LIBETH_XDP_TX_FIRST) { + sinfo = frm.data + frm.soff; + skb_frag_fill_netmem_desc(frag, virt_to_netmem(frm.data), + offset_in_page(frm.data), + frm.len_fl); + } else { + sinfo = NULL; + } + + netmem = skb_frag_netmem(frag); + desc = (typeof(desc)){ + .addr = page_pool_get_dma_addr_netmem(netmem) + + skb_frag_off(frag), + .len = skb_frag_size(frag) & LIBETH_XDP_TX_LEN, + .flags = skb_frag_size(frag) & LIBETH_XDP_TX_FLAGS, + }; + + dma_sync_single_for_device(__netmem_get_pp(netmem)->p.dev, desc.addr, + desc.len, DMA_BIDIRECTIONAL); + + if (!sinfo) + return desc; + + sqe = &sq->sqes[i]; + sqe->type = LIBETH_SQE_XDP_TX; + sqe->sinfo = sinfo; + libeth_xdp_tx_fill_stats(sqe, &desc, sinfo); + + return desc; +} + +void libeth_xdp_tx_exception(struct libeth_xdp_tx_bulk *bq, u32 sent, + u32 flags); + +/** + * __libeth_xdp_tx_flush_bulk - internal helper to flush one XDP Tx bulk + * @bq: bulk to flush + * @flags: XDP TX flags (.ndo_xdp_xmit(), XSk etc.) + * @prep: driver-specific callback to prepare the queue for sending + * @fill: libeth_xdp callback to fill &libeth_sqe and &libeth_xdp_tx_desc + * @xmit: driver callback to fill a HW descriptor + * + * Internal abstraction to create bulk flush functions for drivers. Used for + * everything except XSk xmit. + * + * Return: true if anything was sent, false otherwise. + */ +static __always_inline bool +__libeth_xdp_tx_flush_bulk(struct libeth_xdp_tx_bulk *bq, u32 flags, + u32 (*prep)(void *xdpsq, struct libeth_xdpsq *sq), + struct libeth_xdp_tx_desc + (*fill)(struct libeth_xdp_tx_frame frm, u32 i, + const struct libeth_xdpsq *sq, u64 priv), + void (*xmit)(struct libeth_xdp_tx_desc desc, u32 i, + const struct libeth_xdpsq *sq, + u64 priv)) +{ + u32 sent, drops; + int err = 0; + + sent = libeth_xdp_tx_xmit_bulk(bq->bulk, bq->xdpsq, + min(bq->count, LIBETH_XDP_TX_BULK), + false, 0, prep, fill, xmit); + drops = bq->count - sent; + + if (unlikely(drops)) { + libeth_xdp_tx_exception(bq, sent, flags); + err = -ENXIO; + } else { + bq->count = 0; + } + + trace_xdp_bulk_tx(bq->dev, sent, drops, err); + + return likely(sent); +} + +/** + * libeth_xdp_tx_flush_bulk - wrapper to define flush of one ``XDP_TX`` bulk + * @bq: bulk to flush + * @flags: Tx flags, see above + * @prep: driver callback to prepare the queue + * @xmit: driver callback to fill a HW descriptor + * + * Use via LIBETH_XDP_DEFINE_FLUSH_TX() to define an ``XDP_TX`` driver + * callback. + */ +#define libeth_xdp_tx_flush_bulk(bq, flags, prep, xmit) \ + __libeth_xdp_tx_flush_bulk(bq, flags, prep, libeth_xdp_tx_fill_buf, \ + xmit) + +/* .ndo_xdp_xmit() implementation */ + +/** + * libeth_xdp_xmit_init_bulk - internal helper to initialize bulk for XDP xmit + * @bq: bulk to initialize + * @dev: target &net_device + * @xdpsqs: array of driver-specific XDPSQ structs + * @num: number of active XDPSQs (the above array length) + */ +#define libeth_xdp_xmit_init_bulk(bq, dev, xdpsqs, num) \ + __libeth_xdp_xmit_init_bulk(bq, dev, (xdpsqs)[libeth_xdpsq_id(num)]) + +static inline void __libeth_xdp_xmit_init_bulk(struct libeth_xdp_tx_bulk *bq, + struct net_device *dev, + void *xdpsq) +{ + bq->dev = dev; + bq->xdpsq = xdpsq; + bq->count = 0; +} + +/** + * libeth_xdp_xmit_frame_dma - internal helper to access DMA of an &xdp_frame + * @xf: pointer to the XDP frame + * + * There's no place in &libeth_xdp_tx_frame to store DMA address for an + * &xdp_frame head. The headroom is used then, the address is placed right + * after the frame struct, naturally aligned. + * + * Return: pointer to the DMA address to use. + */ +#define libeth_xdp_xmit_frame_dma(xf) \ + _Generic((xf), \ + const struct xdp_frame *: \ + (const dma_addr_t *)__libeth_xdp_xmit_frame_dma(xf), \ + struct xdp_frame *: \ + (dma_addr_t *)__libeth_xdp_xmit_frame_dma(xf) \ + ) + +static inline void *__libeth_xdp_xmit_frame_dma(const struct xdp_frame *xdpf) +{ + void *addr = (void *)(xdpf + 1); + + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && + __alignof(*xdpf) < sizeof(dma_addr_t)) + addr = PTR_ALIGN(addr, sizeof(dma_addr_t)); + + return addr; +} + +/** + * libeth_xdp_xmit_queue_head - internal helper for queueing one XDP xmit head + * @bq: XDP Tx bulk to queue the head frag to + * @xdpf: XDP frame with the head to queue + * @dev: device to perform DMA mapping + * + * Return: ``LIBETH_XDP_DROP`` on DMA mapping error, + * ``LIBETH_XDP_PASS`` if it's the only frag in the frame, + * ``LIBETH_XDP_TX`` if it's an S/G frame. + */ +static inline u32 libeth_xdp_xmit_queue_head(struct libeth_xdp_tx_bulk *bq, + struct xdp_frame *xdpf, + struct device *dev) +{ + dma_addr_t dma; + + dma = dma_map_single(dev, xdpf->data, xdpf->len, DMA_TO_DEVICE); + if (dma_mapping_error(dev, dma)) + return LIBETH_XDP_DROP; + + *libeth_xdp_xmit_frame_dma(xdpf) = dma; + + bq->bulk[bq->count++] = (typeof(*bq->bulk)){ + .xdpf = xdpf, + __libeth_xdp_tx_len(xdpf->len, LIBETH_XDP_TX_FIRST), + }; + + if (!xdp_frame_has_frags(xdpf)) + return LIBETH_XDP_PASS; + + bq->bulk[bq->count - 1].flags |= LIBETH_XDP_TX_MULTI; + + return LIBETH_XDP_TX; +} + +/** + * libeth_xdp_xmit_queue_frag - internal helper for queueing one XDP xmit frag + * @bq: XDP Tx bulk to queue the frag to + * @frag: frag to queue + * @dev: device to perform DMA mapping + * + * Return: true on success, false on DMA mapping error. + */ +static inline bool libeth_xdp_xmit_queue_frag(struct libeth_xdp_tx_bulk *bq, + const skb_frag_t *frag, + struct device *dev) +{ + dma_addr_t dma; + + dma = skb_frag_dma_map(dev, frag); + if (dma_mapping_error(dev, dma)) + return false; + + bq->bulk[bq->count++] = (typeof(*bq->bulk)){ + .dma = dma, + __libeth_xdp_tx_len(skb_frag_size(frag)), + }; + + return true; +} + +/** + * libeth_xdp_xmit_queue_bulk - internal helper for queueing one XDP xmit frame + * @bq: XDP Tx bulk to queue the frame to + * @xdpf: XDP frame to queue + * @flush_bulk: driver callback to flush the bulk to the HW queue + * + * Return: ``LIBETH_XDP_TX`` on success, + * ``LIBETH_XDP_DROP`` if the frame should be dropped by the stack, + * ``LIBETH_XDP_ABORTED`` if the frame will be dropped by libeth_xdp. + */ +static __always_inline u32 +libeth_xdp_xmit_queue_bulk(struct libeth_xdp_tx_bulk *bq, + struct xdp_frame *xdpf, + bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq, + u32 flags)) +{ + u32 head, nr_frags, i, ret = LIBETH_XDP_TX; + struct device *dev = bq->dev->dev.parent; + const struct skb_shared_info *sinfo; + + if (unlikely(bq->count == LIBETH_XDP_TX_BULK) && + unlikely(!flush_bulk(bq, LIBETH_XDP_TX_NDO))) + return LIBETH_XDP_DROP; + + head = libeth_xdp_xmit_queue_head(bq, xdpf, dev); + if (head == LIBETH_XDP_PASS) + goto out; + else if (head == LIBETH_XDP_DROP) + return LIBETH_XDP_DROP; + + sinfo = xdp_get_shared_info_from_frame(xdpf); + nr_frags = sinfo->nr_frags; + + for (i = 0; i < nr_frags; i++) { + if (unlikely(bq->count == LIBETH_XDP_TX_BULK) && + unlikely(!flush_bulk(bq, LIBETH_XDP_TX_NDO))) + break; + + if (!libeth_xdp_xmit_queue_frag(bq, &sinfo->frags[i], dev)) + break; + } + + if (unlikely(i < nr_frags)) + ret = LIBETH_XDP_ABORTED; + +out: + bq->bulk[bq->count - 1].flags |= LIBETH_XDP_TX_LAST; + + return ret; +} + +/** + * libeth_xdp_xmit_fill_buf - internal helper to fill one XDP xmit &libeth_sqe + * @frm: XDP Tx frame from the bulk + * @i: index on the HW queue + * @sq: XDPSQ abstraction for the queue + * @priv: private data + * + * Return: XDP Tx descriptor with the mapped DMA and other info to pass to + * the driver callback. + */ +static inline struct libeth_xdp_tx_desc +libeth_xdp_xmit_fill_buf(struct libeth_xdp_tx_frame frm, u32 i, + const struct libeth_xdpsq *sq, u64 priv) +{ + struct libeth_xdp_tx_desc desc; + struct libeth_sqe *sqe; + struct xdp_frame *xdpf; + + if (frm.flags & LIBETH_XDP_TX_FIRST) { + xdpf = frm.xdpf; + desc.addr = *libeth_xdp_xmit_frame_dma(xdpf); + } else { + xdpf = NULL; + desc.addr = frm.dma; + } + desc.opts = frm.opts; + + sqe = &sq->sqes[i]; + dma_unmap_addr_set(sqe, dma, desc.addr); + dma_unmap_len_set(sqe, len, desc.len); + + if (!xdpf) { + sqe->type = LIBETH_SQE_XDP_XMIT_FRAG; + return desc; + } + + sqe->type = LIBETH_SQE_XDP_XMIT; + sqe->xdpf = xdpf; + libeth_xdp_tx_fill_stats(sqe, &desc, + xdp_get_shared_info_from_frame(xdpf)); + + return desc; +} + +/** + * libeth_xdp_xmit_flush_bulk - wrapper to define flush of one XDP xmit bulk + * @bq: bulk to flush + * @flags: Tx flags, see __libeth_xdp_tx_flush_bulk() + * @prep: driver callback to prepare the queue + * @xmit: driver callback to fill a HW descriptor + * + * Use via LIBETH_XDP_DEFINE_FLUSH_XMIT() to define an XDP xmit driver + * callback. + */ +#define libeth_xdp_xmit_flush_bulk(bq, flags, prep, xmit) \ + __libeth_xdp_tx_flush_bulk(bq, (flags) | LIBETH_XDP_TX_NDO, prep, \ + libeth_xdp_xmit_fill_buf, xmit) + +u32 libeth_xdp_xmit_return_bulk(const struct libeth_xdp_tx_frame *bq, + u32 count, const struct net_device *dev); + +/** + * __libeth_xdp_xmit_do_bulk - internal function to implement .ndo_xdp_xmit() + * @bq: XDP Tx bulk to queue frames to + * @frames: XDP frames passed by the stack + * @n: number of frames + * @flags: flags passed by the stack + * @flush_bulk: driver callback to flush an XDP xmit bulk + * @finalize: driver callback to finalize sending XDP Tx frames on the queue + * + * Perform common checks, map the frags and queue them to the bulk, then flush + * the bulk to the XDPSQ. If requested by the stack, finalize the queue. + * + * Return: number of frames send or -errno on error. + */ +static __always_inline int +__libeth_xdp_xmit_do_bulk(struct libeth_xdp_tx_bulk *bq, + struct xdp_frame **frames, u32 n, u32 flags, + bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq, + u32 flags), + void (*finalize)(void *xdpsq, bool sent, bool flush)) +{ + u32 nxmit = 0; + + if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) + return -EINVAL; + + for (u32 i = 0; likely(i < n); i++) { + u32 ret; + + ret = libeth_xdp_xmit_queue_bulk(bq, frames[i], flush_bulk); + if (unlikely(ret != LIBETH_XDP_TX)) { + nxmit += ret == LIBETH_XDP_ABORTED; + break; + } + + nxmit++; + } + + if (bq->count) { + flush_bulk(bq, LIBETH_XDP_TX_NDO); + if (unlikely(bq->count)) + nxmit -= libeth_xdp_xmit_return_bulk(bq->bulk, + bq->count, + bq->dev); + } + + finalize(bq->xdpsq, nxmit, flags & XDP_XMIT_FLUSH); + + return nxmit; +} + +/** + * libeth_xdp_xmit_do_bulk - implement full .ndo_xdp_xmit() in driver + * @dev: target &net_device + * @n: number of frames to send + * @fr: XDP frames to send + * @f: flags passed by the stack + * @xqs: array of XDPSQs driver structs + * @nqs: number of active XDPSQs, the above array length + * @fl: driver callback to flush an XDP xmit bulk + * @fin: driver cabback to finalize the queue + * + * If the driver has active XDPSQs, perform common checks and send the frames. + * Finalize the queue, if requested. + * + * Return: number of frames sent or -errno on error. + */ +#define libeth_xdp_xmit_do_bulk(dev, n, fr, f, xqs, nqs, fl, fin) \ + _libeth_xdp_xmit_do_bulk(dev, n, fr, f, xqs, nqs, fl, fin, \ + __UNIQUE_ID(bq_), __UNIQUE_ID(ret_), \ + __UNIQUE_ID(nqs_)) + +#define _libeth_xdp_xmit_do_bulk(d, n, fr, f, xqs, nqs, fl, fin, ub, ur, un) \ +({ \ + u32 un = (nqs); \ + int ur; \ + \ + if (likely(un)) { \ + LIBETH_XDP_ONSTACK_BULK(ub); \ + \ + libeth_xdp_xmit_init_bulk(&ub, d, xqs, un); \ + ur = __libeth_xdp_xmit_do_bulk(&ub, fr, n, f, fl, fin); \ + } else { \ + ur = -ENXIO; \ + } \ + \ + ur; \ +}) + +/* Rx polling path */ + +/** + * libeth_xdp_tx_init_bulk - initialize an XDP Tx bulk for Rx NAPI poll + * @bq: bulk to initialize + * @prog: RCU pointer to the XDP program (can be %NULL) + * @dev: target &net_device + * @xdpsqs: array of driver XDPSQ structs + * @num: number of active XDPSQs, the above array length + * + * Should be called on an onstack XDP Tx bulk before the NAPI polling loop. + * Initializes all the needed fields to run libeth_xdp functions. If @num == 0, + * assumes XDP is not enabled. + * Do not use for XSk, it has its own optimized helper. + */ +#define libeth_xdp_tx_init_bulk(bq, prog, dev, xdpsqs, num) \ + __libeth_xdp_tx_init_bulk(bq, prog, dev, xdpsqs, num, false, \ + __UNIQUE_ID(bq_), __UNIQUE_ID(nqs_)) + +#define __libeth_xdp_tx_init_bulk(bq, pr, d, xdpsqs, num, xsk, ub, un) do { \ + typeof(bq) ub = (bq); \ + u32 un = (num); \ + \ + rcu_read_lock(); \ + \ + if (un || (xsk)) { \ + ub->prog = rcu_dereference(pr); \ + ub->dev = (d); \ + ub->xdpsq = (xdpsqs)[libeth_xdpsq_id(un)]; \ + } else { \ + ub->prog = NULL; \ + } \ + \ + ub->act_mask = 0; \ + ub->count = 0; \ +} while (0) + +void libeth_xdp_load_stash(struct libeth_xdp_buff *dst, + const struct libeth_xdp_buff_stash *src); +void libeth_xdp_save_stash(struct libeth_xdp_buff_stash *dst, + const struct libeth_xdp_buff *src); +void __libeth_xdp_return_stash(struct libeth_xdp_buff_stash *stash); + +/** + * libeth_xdp_init_buff - initialize a &libeth_xdp_buff for Rx NAPI poll + * @dst: onstack buffer to initialize + * @src: XDP buffer stash placed on the queue + * @rxq: registered &xdp_rxq_info corresponding to this queue + * + * Should be called before the main NAPI polling loop. Loads the content of + * the previously saved stash or initializes the buffer from scratch. + * Do not use for XSk. + */ +static inline void +libeth_xdp_init_buff(struct libeth_xdp_buff *dst, + const struct libeth_xdp_buff_stash *src, + struct xdp_rxq_info *rxq) +{ + if (likely(!src->data)) + dst->data = NULL; + else + libeth_xdp_load_stash(dst, src); + + dst->base.rxq = rxq; +} + +/** + * libeth_xdp_save_buff - save a partially built buffer on a queue + * @dst: XDP buffer stash placed on the queue + * @src: onstack buffer to save + * + * Should be called after the main NAPI polling loop. If the loop exited before + * the buffer was finished, saves its content on the queue, so that it can be + * completed during the next poll. Otherwise, clears the stash. + */ +static inline void libeth_xdp_save_buff(struct libeth_xdp_buff_stash *dst, + const struct libeth_xdp_buff *src) +{ + if (likely(!src->data)) + dst->data = NULL; + else + libeth_xdp_save_stash(dst, src); +} + +/** + * libeth_xdp_return_stash - free an XDP buffer stash from a queue + * @stash: stash to free + * + * If the queue is about to be destroyed, but it still has an incompleted + * buffer stash, this helper should be called to free it. + */ +static inline void libeth_xdp_return_stash(struct libeth_xdp_buff_stash *stash) +{ + if (stash->data) + __libeth_xdp_return_stash(stash); +} + +static inline void libeth_xdp_return_va(const void *data, bool napi) +{ + netmem_ref netmem = virt_to_netmem(data); + + page_pool_put_full_netmem(__netmem_get_pp(netmem), netmem, napi); +} + +static inline void libeth_xdp_return_frags(const struct skb_shared_info *sinfo, + bool napi) +{ + for (u32 i = 0; i < sinfo->nr_frags; i++) { + netmem_ref netmem = skb_frag_netmem(&sinfo->frags[i]); + + page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, napi); + } +} + +/** + * libeth_xdp_return_buff - free/recycle &libeth_xdp_buff + * @xdp: buffer to free + * + * Hotpath helper to free &libeth_xdp_buff. Comparing to xdp_return_buff(), + * it's faster as it gets inlined and always assumes order-0 pages and safe + * direct recycling. Zeroes @xdp->data to avoid UAFs. + */ +#define libeth_xdp_return_buff(xdp) __libeth_xdp_return_buff(xdp, true) + +static inline void __libeth_xdp_return_buff(struct libeth_xdp_buff *xdp, + bool napi) +{ + if (!xdp_buff_has_frags(&xdp->base)) + goto out; + + libeth_xdp_return_frags(xdp_get_shared_info_from_buff(&xdp->base), + napi); + +out: + libeth_xdp_return_va(xdp->data, napi); + xdp->data = NULL; +} + +bool libeth_xdp_buff_add_frag(struct libeth_xdp_buff *xdp, + const struct libeth_fqe *fqe, + u32 len); + +/** + * libeth_xdp_prepare_buff - fill &libeth_xdp_buff with head FQE data + * @xdp: XDP buffer to attach the head to + * @fqe: FQE containing the head buffer + * @len: buffer len passed from HW + * + * Internal, use libeth_xdp_process_buff() instead. Initializes XDP buffer + * head with the Rx buffer data: data pointer, length, headroom, and + * truesize/tailroom. Zeroes the flags. + * Uses faster single u64 write instead of per-field access. + */ +static inline void libeth_xdp_prepare_buff(struct libeth_xdp_buff *xdp, + const struct libeth_fqe *fqe, + u32 len) +{ + const struct page *page = __netmem_to_page(fqe->netmem); + +#ifdef __LIBETH_WORD_ACCESS + static_assert(offsetofend(typeof(xdp->base), flags) - + offsetof(typeof(xdp->base), frame_sz) == + sizeof(u64)); + + *(u64 *)&xdp->base.frame_sz = fqe->truesize; +#else + xdp_init_buff(&xdp->base, fqe->truesize, xdp->base.rxq); +#endif + xdp_prepare_buff(&xdp->base, page_address(page) + fqe->offset, + pp_page_to_nmdesc(page)->pp->p.offset, len, true); +} + +/** + * libeth_xdp_process_buff - attach Rx buffer to &libeth_xdp_buff + * @xdp: XDP buffer to attach the Rx buffer to + * @fqe: Rx buffer to process + * @len: received data length from the descriptor + * + * If the XDP buffer is empty, attaches the Rx buffer as head and initializes + * the required fields. Otherwise, attaches the buffer as a frag. + * Already performs DMA sync-for-CPU and frame start prefetch + * (for head buffers only). + * + * Return: true on success, false if the descriptor must be skipped (empty or + * no space for a new frag). + */ +static inline bool libeth_xdp_process_buff(struct libeth_xdp_buff *xdp, + const struct libeth_fqe *fqe, + u32 len) +{ + if (!libeth_rx_sync_for_cpu(fqe, len)) + return false; + + if (xdp->data) + return libeth_xdp_buff_add_frag(xdp, fqe, len); + + libeth_xdp_prepare_buff(xdp, fqe, len); + + prefetch(xdp->data); + + return true; +} + +/** + * libeth_xdp_buff_stats_frags - update onstack RQ stats with XDP frags info + * @ss: onstack stats to update + * @xdp: buffer to account + * + * Internal helper used by __libeth_xdp_run_pass(), do not call directly. + * Adds buffer's frags count and total len to the onstack stats. + */ +static inline void +libeth_xdp_buff_stats_frags(struct libeth_rq_napi_stats *ss, + const struct libeth_xdp_buff *xdp) +{ + const struct skb_shared_info *sinfo; + + sinfo = xdp_get_shared_info_from_buff(&xdp->base); + ss->bytes += sinfo->xdp_frags_size; + ss->fragments += sinfo->nr_frags + 1; +} + +u32 libeth_xdp_prog_exception(const struct libeth_xdp_tx_bulk *bq, + struct libeth_xdp_buff *xdp, + enum xdp_action act, int ret); + +/** + * __libeth_xdp_run_prog - run XDP program on an XDP buffer + * @xdp: XDP buffer to run the prog on + * @bq: buffer bulk for ``XDP_TX`` queueing + * + * Internal inline abstraction to run XDP program. Handles ``XDP_DROP`` + * and ``XDP_REDIRECT`` only, the rest is processed levels up. + * Reports an XDP prog exception on errors. + * + * Return: libeth_xdp prog verdict depending on the prog's verdict. + */ +static __always_inline u32 +__libeth_xdp_run_prog(struct libeth_xdp_buff *xdp, + const struct libeth_xdp_tx_bulk *bq) +{ + enum xdp_action act; + + act = bpf_prog_run_xdp(bq->prog, &xdp->base); + if (unlikely(act < XDP_DROP || act > XDP_REDIRECT)) + goto out; + + switch (act) { + case XDP_PASS: + return LIBETH_XDP_PASS; + case XDP_DROP: + libeth_xdp_return_buff(xdp); + + return LIBETH_XDP_DROP; + case XDP_TX: + return LIBETH_XDP_TX; + case XDP_REDIRECT: + if (unlikely(xdp_do_redirect(bq->dev, &xdp->base, bq->prog))) + break; + + xdp->data = NULL; + + return LIBETH_XDP_REDIRECT; + default: + break; + } + +out: + return libeth_xdp_prog_exception(bq, xdp, act, 0); +} + +/** + * __libeth_xdp_run_flush - run XDP program and handle ``XDP_TX`` verdict + * @xdp: XDP buffer to run the prog on + * @bq: buffer bulk for ``XDP_TX`` queueing + * @run: internal callback for running XDP program + * @queue: internal callback for queuing ``XDP_TX`` frame + * @flush_bulk: driver callback for flushing a bulk + * + * Internal inline abstraction to run XDP program and additionally handle + * ``XDP_TX`` verdict. Used by both XDP and XSk, hence @run and @queue. + * Do not use directly. + * + * Return: libeth_xdp prog verdict depending on the prog's verdict. + */ +static __always_inline u32 +__libeth_xdp_run_flush(struct libeth_xdp_buff *xdp, + struct libeth_xdp_tx_bulk *bq, + u32 (*run)(struct libeth_xdp_buff *xdp, + const struct libeth_xdp_tx_bulk *bq), + bool (*queue)(struct libeth_xdp_tx_bulk *bq, + struct libeth_xdp_buff *xdp, + bool (*flush_bulk) + (struct libeth_xdp_tx_bulk *bq, + u32 flags)), + bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq, + u32 flags)) +{ + u32 act; + + act = run(xdp, bq); + if (act == LIBETH_XDP_TX && unlikely(!queue(bq, xdp, flush_bulk))) + act = LIBETH_XDP_DROP; + + bq->act_mask |= act; + + return act; +} + +/** + * libeth_xdp_run_prog - run XDP program (non-XSk path) and handle all verdicts + * @xdp: XDP buffer to process + * @bq: XDP Tx bulk to queue ``XDP_TX`` buffers + * @fl: driver ``XDP_TX`` bulk flush callback + * + * Run the attached XDP program and handle all possible verdicts. XSk has its + * own version. + * Prefer using it via LIBETH_XDP_DEFINE_RUN{,_PASS,_PROG}(). + * + * Return: true if the buffer should be passed up the stack, false if the poll + * should go to the next buffer. + */ +#define libeth_xdp_run_prog(xdp, bq, fl) \ + (__libeth_xdp_run_flush(xdp, bq, __libeth_xdp_run_prog, \ + libeth_xdp_tx_queue_bulk, \ + fl) == LIBETH_XDP_PASS) + +/** + * __libeth_xdp_run_pass - helper to run XDP program and handle the result + * @xdp: XDP buffer to process + * @bq: XDP Tx bulk to queue ``XDP_TX`` frames + * @napi: NAPI to build an skb and pass it up the stack + * @rs: onstack libeth RQ stats + * @md: metadata that should be filled to the XDP buffer + * @prep: callback for filling the metadata + * @run: driver wrapper to run XDP program + * @populate: driver callback to populate an skb with the HW descriptor data + * + * Inline abstraction that does the following (non-XSk path): + * 1) adds frame size and frag number (if needed) to the onstack stats; + * 2) fills the descriptor metadata to the onstack &libeth_xdp_buff + * 3) runs XDP program if present; + * 4) handles all possible verdicts; + * 5) on ``XDP_PASS`, builds an skb from the buffer; + * 6) populates it with the descriptor metadata; + * 7) passes it up the stack. + * + * In most cases, number 2 means just writing the pointer to the HW descriptor + * to the XDP buffer. If so, please use LIBETH_XDP_DEFINE_RUN{,_PASS}() + * wrappers to build a driver function. + */ +static __always_inline void +__libeth_xdp_run_pass(struct libeth_xdp_buff *xdp, + struct libeth_xdp_tx_bulk *bq, struct napi_struct *napi, + struct libeth_rq_napi_stats *rs, const void *md, + void (*prep)(struct libeth_xdp_buff *xdp, + const void *md), + bool (*run)(struct libeth_xdp_buff *xdp, + struct libeth_xdp_tx_bulk *bq), + bool (*populate)(struct sk_buff *skb, + const struct libeth_xdp_buff *xdp, + struct libeth_rq_napi_stats *rs)) +{ + struct sk_buff *skb; + + rs->bytes += xdp->base.data_end - xdp->data; + rs->packets++; + + if (xdp_buff_has_frags(&xdp->base)) + libeth_xdp_buff_stats_frags(rs, xdp); + + if (prep && (!__builtin_constant_p(!!md) || md)) + prep(xdp, md); + + if (!bq || !run || !bq->prog) + goto build; + + if (!run(xdp, bq)) + return; + +build: + skb = xdp_build_skb_from_buff(&xdp->base); + if (unlikely(!skb)) { + libeth_xdp_return_buff_slow(xdp); + return; + } + + xdp->data = NULL; + + if (unlikely(!populate(skb, xdp, rs))) { + napi_consume_skb(skb, true); + return; + } + + napi_gro_receive(napi, skb); +} + +static inline void libeth_xdp_prep_desc(struct libeth_xdp_buff *xdp, + const void *desc) +{ + xdp->desc = desc; +} + +/** + * libeth_xdp_run_pass - helper to run XDP program and handle the result + * @xdp: XDP buffer to process + * @bq: XDP Tx bulk to queue ``XDP_TX`` frames + * @napi: NAPI to build an skb and pass it up the stack + * @ss: onstack libeth RQ stats + * @desc: pointer to the HW descriptor for that frame + * @run: driver wrapper to run XDP program + * @populate: driver callback to populate an skb with the HW descriptor data + * + * Wrapper around the underscored version when "fill the descriptor metadata" + * means just writing the pointer to the HW descriptor as @xdp->desc. + */ +#define libeth_xdp_run_pass(xdp, bq, napi, ss, desc, run, populate) \ + __libeth_xdp_run_pass(xdp, bq, napi, ss, desc, libeth_xdp_prep_desc, \ + run, populate) + +/** + * libeth_xdp_finalize_rx - finalize XDPSQ after a NAPI polling loop (non-XSk) + * @bq: ``XDP_TX`` frame bulk + * @flush: driver callback to flush the bulk + * @finalize: driver callback to start sending the frames and run the timer + * + * Flush the bulk if there are frames left to send, kick the queue and flush + * the XDP maps. + */ +#define libeth_xdp_finalize_rx(bq, flush, finalize) \ + __libeth_xdp_finalize_rx(bq, 0, flush, finalize) + +static __always_inline void +__libeth_xdp_finalize_rx(struct libeth_xdp_tx_bulk *bq, u32 flags, + bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq, + u32 flags), + void (*finalize)(void *xdpsq, bool sent, bool flush)) +{ + if (bq->act_mask & LIBETH_XDP_TX) { + if (bq->count) + flush_bulk(bq, flags | LIBETH_XDP_TX_DROP); + finalize(bq->xdpsq, true, true); + } + if (bq->act_mask & LIBETH_XDP_REDIRECT) + xdp_do_flush(); + + rcu_read_unlock(); +} + +/* + * Helpers to reduce boilerplate code in drivers. + * + * Typical driver Rx flow would be (excl. bulk and buff init, frag attach): + * + * LIBETH_XDP_DEFINE_START(); + * LIBETH_XDP_DEFINE_FLUSH_TX(static driver_xdp_flush_tx, driver_xdp_tx_prep, + * driver_xdp_xmit); + * LIBETH_XDP_DEFINE_RUN(static driver_xdp_run, driver_xdp_run_prog, + * driver_xdp_flush_tx, driver_populate_skb); + * LIBETH_XDP_DEFINE_FINALIZE(static driver_xdp_finalize_rx, + * driver_xdp_flush_tx, driver_xdp_finalize_sq); + * LIBETH_XDP_DEFINE_END(); + * + * This will build a set of 4 static functions. The compiler is free to decide + * whether to inline them. + * Then, in the NAPI polling function: + * + * while (packets < budget) { + * // ... + * driver_xdp_run(xdp, &bq, napi, &rs, desc); + * } + * driver_xdp_finalize_rx(&bq); + */ + +#define LIBETH_XDP_DEFINE_START() \ + __diag_push(); \ + __diag_ignore(GCC, 8, "-Wold-style-declaration", \ + "Allow specifying \'static\' after the return type") + +/** + * LIBETH_XDP_DEFINE_TIMER - define a driver XDPSQ cleanup timer callback + * @name: name of the function to define + * @poll: Tx polling/completion function + */ +#define LIBETH_XDP_DEFINE_TIMER(name, poll) \ +void name(struct work_struct *work) \ +{ \ + libeth_xdpsq_run_timer(work, poll); \ +} + +/** + * LIBETH_XDP_DEFINE_FLUSH_TX - define a driver ``XDP_TX`` bulk flush function + * @name: name of the function to define + * @prep: driver callback to clean an XDPSQ + * @xmit: driver callback to write a HW Tx descriptor + */ +#define LIBETH_XDP_DEFINE_FLUSH_TX(name, prep, xmit) \ + __LIBETH_XDP_DEFINE_FLUSH_TX(name, prep, xmit, xdp) + +#define __LIBETH_XDP_DEFINE_FLUSH_TX(name, prep, xmit, pfx) \ +bool name(struct libeth_xdp_tx_bulk *bq, u32 flags) \ +{ \ + return libeth_##pfx##_tx_flush_bulk(bq, flags, prep, xmit); \ +} + +/** + * LIBETH_XDP_DEFINE_FLUSH_XMIT - define a driver XDP xmit bulk flush function + * @name: name of the function to define + * @prep: driver callback to clean an XDPSQ + * @xmit: driver callback to write a HW Tx descriptor + */ +#define LIBETH_XDP_DEFINE_FLUSH_XMIT(name, prep, xmit) \ +bool name(struct libeth_xdp_tx_bulk *bq, u32 flags) \ +{ \ + return libeth_xdp_xmit_flush_bulk(bq, flags, prep, xmit); \ +} + +/** + * LIBETH_XDP_DEFINE_RUN_PROG - define a driver XDP program run function + * @name: name of the function to define + * @flush: driver callback to flush an ``XDP_TX`` bulk + */ +#define LIBETH_XDP_DEFINE_RUN_PROG(name, flush) \ + bool __LIBETH_XDP_DEFINE_RUN_PROG(name, flush, xdp) + +#define __LIBETH_XDP_DEFINE_RUN_PROG(name, flush, pfx) \ +name(struct libeth_xdp_buff *xdp, struct libeth_xdp_tx_bulk *bq) \ +{ \ + return libeth_##pfx##_run_prog(xdp, bq, flush); \ +} + +/** + * LIBETH_XDP_DEFINE_RUN_PASS - define a driver buffer process + pass function + * @name: name of the function to define + * @run: driver callback to run XDP program (above) + * @populate: driver callback to fill an skb with HW descriptor info + */ +#define LIBETH_XDP_DEFINE_RUN_PASS(name, run, populate) \ + void __LIBETH_XDP_DEFINE_RUN_PASS(name, run, populate, xdp) + +#define __LIBETH_XDP_DEFINE_RUN_PASS(name, run, populate, pfx) \ +name(struct libeth_xdp_buff *xdp, struct libeth_xdp_tx_bulk *bq, \ + struct napi_struct *napi, struct libeth_rq_napi_stats *ss, \ + const void *desc) \ +{ \ + return libeth_##pfx##_run_pass(xdp, bq, napi, ss, desc, run, \ + populate); \ +} + +/** + * LIBETH_XDP_DEFINE_RUN - define a driver buffer process, run + pass function + * @name: name of the function to define + * @run: name of the XDP prog run function to define + * @flush: driver callback to flush an ``XDP_TX`` bulk + * @populate: driver callback to fill an skb with HW descriptor info + */ +#define LIBETH_XDP_DEFINE_RUN(name, run, flush, populate) \ + __LIBETH_XDP_DEFINE_RUN(name, run, flush, populate, XDP) + +#define __LIBETH_XDP_DEFINE_RUN(name, run, flush, populate, pfx) \ + LIBETH_##pfx##_DEFINE_RUN_PROG(static run, flush); \ + LIBETH_##pfx##_DEFINE_RUN_PASS(name, run, populate) + +/** + * LIBETH_XDP_DEFINE_FINALIZE - define a driver Rx NAPI poll finalize function + * @name: name of the function to define + * @flush: driver callback to flush an ``XDP_TX`` bulk + * @finalize: driver callback to finalize an XDPSQ and run the timer + */ +#define LIBETH_XDP_DEFINE_FINALIZE(name, flush, finalize) \ + __LIBETH_XDP_DEFINE_FINALIZE(name, flush, finalize, xdp) + +#define __LIBETH_XDP_DEFINE_FINALIZE(name, flush, finalize, pfx) \ +void name(struct libeth_xdp_tx_bulk *bq) \ +{ \ + libeth_##pfx##_finalize_rx(bq, flush, finalize); \ +} + +#define LIBETH_XDP_DEFINE_END() __diag_pop() + +/* XMO */ + +/** + * libeth_xdp_buff_to_rq - get RQ pointer from an XDP buffer pointer + * @xdp: &libeth_xdp_buff corresponding to the queue + * @type: typeof() of the driver Rx queue structure + * @member: name of &xdp_rxq_info inside @type + * + * Often times, pointer to the RQ is needed when reading/filling metadata from + * HW descriptors. The helper can be used to quickly jump from an XDP buffer + * to the queue corresponding to its &xdp_rxq_info without introducing + * additional fields (&libeth_xdp_buff is precisely 1 cacheline long on x64). + */ +#define libeth_xdp_buff_to_rq(xdp, type, member) \ + container_of_const((xdp)->base.rxq, type, member) + +/** + * libeth_xdpmo_rx_hash - convert &libeth_rx_pt to an XDP RSS hash metadata + * @hash: pointer to the variable to write the hash to + * @rss_type: pointer to the variable to write the hash type to + * @val: hash value from the HW descriptor + * @pt: libeth parsed packet type + * + * Handle zeroed/non-available hash and convert libeth parsed packet type to + * the corresponding XDP RSS hash type. To be called at the end of + * xdp_metadata_ops idpf_xdpmo::xmo_rx_hash() implementation. + * Note that if the driver doesn't use a constant packet type lookup table but + * generates it at runtime, it must call libeth_rx_pt_gen_hash_type(pt) to + * generate XDP RSS hash type for each packet type. + * + * Return: 0 on success, -ENODATA when the hash is not available. + */ +static inline int libeth_xdpmo_rx_hash(u32 *hash, + enum xdp_rss_hash_type *rss_type, + u32 val, struct libeth_rx_pt pt) +{ + if (unlikely(!val)) + return -ENODATA; + + *hash = val; + *rss_type = pt.hash_type; + + return 0; +} + +/* Tx buffer completion */ + +void libeth_xdp_return_buff_bulk(const struct skb_shared_info *sinfo, + struct xdp_frame_bulk *bq, bool frags); +void libeth_xsk_buff_free_slow(struct libeth_xdp_buff *xdp); + +/** + * __libeth_xdp_complete_tx - complete sent XDPSQE + * @sqe: SQ element / Tx buffer to complete + * @cp: Tx polling/completion params + * @bulk: internal callback to bulk-free ``XDP_TX`` buffers + * @xsk: internal callback to free XSk ``XDP_TX`` buffers + * + * Use the non-underscored version in drivers instead. This one is shared + * internally with libeth_tx_complete_any(). + * Complete an XDPSQE of any type of XDP frame. This includes DMA unmapping + * when needed, buffer freeing, stats update, and SQE invalidation. + */ +static __always_inline void +__libeth_xdp_complete_tx(struct libeth_sqe *sqe, struct libeth_cq_pp *cp, + typeof(libeth_xdp_return_buff_bulk) bulk, + typeof(libeth_xsk_buff_free_slow) xsk) +{ + enum libeth_sqe_type type = sqe->type; + + switch (type) { + case LIBETH_SQE_EMPTY: + return; + case LIBETH_SQE_XDP_XMIT: + case LIBETH_SQE_XDP_XMIT_FRAG: + dma_unmap_page(cp->dev, dma_unmap_addr(sqe, dma), + dma_unmap_len(sqe, len), DMA_TO_DEVICE); + break; + default: + break; + } + + switch (type) { + case LIBETH_SQE_XDP_TX: + bulk(sqe->sinfo, cp->bq, sqe->nr_frags != 1); + break; + case LIBETH_SQE_XDP_XMIT: + xdp_return_frame_bulk(sqe->xdpf, cp->bq); + break; + case LIBETH_SQE_XSK_TX: + case LIBETH_SQE_XSK_TX_FRAG: + xsk(sqe->xsk); + break; + default: + break; + } + + switch (type) { + case LIBETH_SQE_XDP_TX: + case LIBETH_SQE_XDP_XMIT: + case LIBETH_SQE_XSK_TX: + cp->xdp_tx -= sqe->nr_frags; + + cp->xss->packets++; + cp->xss->bytes += sqe->bytes; + break; + default: + break; + } + + sqe->type = LIBETH_SQE_EMPTY; +} + +static inline void libeth_xdp_complete_tx(struct libeth_sqe *sqe, + struct libeth_cq_pp *cp) +{ + __libeth_xdp_complete_tx(sqe, cp, libeth_xdp_return_buff_bulk, + libeth_xsk_buff_free_slow); +} + +/* Misc */ + +u32 libeth_xdp_queue_threshold(u32 count); + +void __libeth_xdp_set_features(struct net_device *dev, + const struct xdp_metadata_ops *xmo, + u32 zc_segs, + const struct xsk_tx_metadata_ops *tmo); +void libeth_xdp_set_redirect(struct net_device *dev, bool enable); + +/** + * libeth_xdp_set_features - set XDP features for netdev + * @dev: &net_device to configure + * @...: optional params, see __libeth_xdp_set_features() + * + * Set all the features libeth_xdp supports, including .ndo_xdp_xmit(). That + * said, it should be used only when XDPSQs are always available regardless + * of whether an XDP prog is attached to @dev. + */ +#define libeth_xdp_set_features(dev, ...) \ + CONCATENATE(__libeth_xdp_feat, \ + COUNT_ARGS(__VA_ARGS__))(dev, ##__VA_ARGS__) + +#define __libeth_xdp_feat0(dev) \ + __libeth_xdp_set_features(dev, NULL, 0, NULL) +#define __libeth_xdp_feat1(dev, xmo) \ + __libeth_xdp_set_features(dev, xmo, 0, NULL) +#define __libeth_xdp_feat2(dev, xmo, zc_segs) \ + __libeth_xdp_set_features(dev, xmo, zc_segs, NULL) +#define __libeth_xdp_feat3(dev, xmo, zc_segs, tmo) \ + __libeth_xdp_set_features(dev, xmo, zc_segs, tmo) + +/** + * libeth_xdp_set_features_noredir - enable all libeth_xdp features w/o redir + * @dev: target &net_device + * @...: optional params, see __libeth_xdp_set_features() + * + * Enable everything except the .ndo_xdp_xmit() feature, use when XDPSQs are + * not available right after netdev registration. + */ +#define libeth_xdp_set_features_noredir(dev, ...) \ + __libeth_xdp_set_features_noredir(dev, __UNIQUE_ID(dev_), \ + ##__VA_ARGS__) + +#define __libeth_xdp_set_features_noredir(dev, ud, ...) do { \ + struct net_device *ud = (dev); \ + \ + libeth_xdp_set_features(ud, ##__VA_ARGS__); \ + libeth_xdp_set_redirect(ud, false); \ +} while (0) + +#define libeth_xsktmo ((const void *)GOLDEN_RATIO_PRIME) + +#endif /* __LIBETH_XDP_H */ diff --git a/include/net/libeth/xsk.h b/include/net/libeth/xsk.h new file mode 100644 index 000000000000..481a7b28e6f2 --- /dev/null +++ b/include/net/libeth/xsk.h @@ -0,0 +1,685 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (C) 2025 Intel Corporation */ + +#ifndef __LIBETH_XSK_H +#define __LIBETH_XSK_H + +#include <net/libeth/xdp.h> +#include <net/xdp_sock_drv.h> + +/* ``XDP_TXMD_FLAGS_VALID`` is defined only under ``CONFIG_XDP_SOCKETS`` */ +#ifdef XDP_TXMD_FLAGS_VALID +static_assert(XDP_TXMD_FLAGS_VALID <= LIBETH_XDP_TX_XSKMD); +#endif + +/* ``XDP_TX`` bulking */ + +/** + * libeth_xsk_tx_queue_head - internal helper for queueing XSk ``XDP_TX`` head + * @bq: XDP Tx bulk to queue the head frag to + * @xdp: XSk buffer with the head to queue + * + * Return: false if it's the only frag of the frame, true if it's an S/G frame. + */ +static inline bool libeth_xsk_tx_queue_head(struct libeth_xdp_tx_bulk *bq, + struct libeth_xdp_buff *xdp) +{ + bq->bulk[bq->count++] = (typeof(*bq->bulk)){ + .xsk = xdp, + __libeth_xdp_tx_len(xdp->base.data_end - xdp->data, + LIBETH_XDP_TX_FIRST), + }; + + if (likely(!xdp_buff_has_frags(&xdp->base))) + return false; + + bq->bulk[bq->count - 1].flags |= LIBETH_XDP_TX_MULTI; + + return true; +} + +/** + * libeth_xsk_tx_queue_frag - internal helper for queueing XSk ``XDP_TX`` frag + * @bq: XDP Tx bulk to queue the frag to + * @frag: XSk frag to queue + */ +static inline void libeth_xsk_tx_queue_frag(struct libeth_xdp_tx_bulk *bq, + struct libeth_xdp_buff *frag) +{ + bq->bulk[bq->count++] = (typeof(*bq->bulk)){ + .xsk = frag, + __libeth_xdp_tx_len(frag->base.data_end - frag->data), + }; +} + +/** + * libeth_xsk_tx_queue_bulk - internal helper for queueing XSk ``XDP_TX`` frame + * @bq: XDP Tx bulk to queue the frame to + * @xdp: XSk buffer to queue + * @flush_bulk: driver callback to flush the bulk to the HW queue + * + * Return: true on success, false on flush error. + */ +static __always_inline bool +libeth_xsk_tx_queue_bulk(struct libeth_xdp_tx_bulk *bq, + struct libeth_xdp_buff *xdp, + bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq, + u32 flags)) +{ + bool ret = true; + + if (unlikely(bq->count == LIBETH_XDP_TX_BULK) && + unlikely(!flush_bulk(bq, LIBETH_XDP_TX_XSK))) { + libeth_xsk_buff_free_slow(xdp); + return false; + } + + if (!libeth_xsk_tx_queue_head(bq, xdp)) + goto out; + + for (const struct libeth_xdp_buff *head = xdp; ; ) { + xdp = container_of(xsk_buff_get_frag(&head->base), + typeof(*xdp), base); + if (!xdp) + break; + + if (unlikely(bq->count == LIBETH_XDP_TX_BULK) && + unlikely(!flush_bulk(bq, LIBETH_XDP_TX_XSK))) { + ret = false; + break; + } + + libeth_xsk_tx_queue_frag(bq, xdp); + } + +out: + bq->bulk[bq->count - 1].flags |= LIBETH_XDP_TX_LAST; + + return ret; +} + +/** + * libeth_xsk_tx_fill_buf - internal helper to fill XSk ``XDP_TX`` &libeth_sqe + * @frm: XDP Tx frame from the bulk + * @i: index on the HW queue + * @sq: XDPSQ abstraction for the queue + * @priv: private data + * + * Return: XDP Tx descriptor with the synced DMA and other info to pass to + * the driver callback. + */ +static inline struct libeth_xdp_tx_desc +libeth_xsk_tx_fill_buf(struct libeth_xdp_tx_frame frm, u32 i, + const struct libeth_xdpsq *sq, u64 priv) +{ + struct libeth_xdp_buff *xdp = frm.xsk; + struct libeth_xdp_tx_desc desc = { + .addr = xsk_buff_xdp_get_dma(&xdp->base), + .opts = frm.opts, + }; + struct libeth_sqe *sqe; + + xsk_buff_raw_dma_sync_for_device(sq->pool, desc.addr, desc.len); + + sqe = &sq->sqes[i]; + sqe->xsk = xdp; + + if (!(desc.flags & LIBETH_XDP_TX_FIRST)) { + sqe->type = LIBETH_SQE_XSK_TX_FRAG; + return desc; + } + + sqe->type = LIBETH_SQE_XSK_TX; + libeth_xdp_tx_fill_stats(sqe, &desc, + xdp_get_shared_info_from_buff(&xdp->base)); + + return desc; +} + +/** + * libeth_xsk_tx_flush_bulk - wrapper to define flush of XSk ``XDP_TX`` bulk + * @bq: bulk to flush + * @flags: Tx flags, see __libeth_xdp_tx_flush_bulk() + * @prep: driver callback to prepare the queue + * @xmit: driver callback to fill a HW descriptor + * + * Use via LIBETH_XSK_DEFINE_FLUSH_TX() to define an XSk ``XDP_TX`` driver + * callback. + */ +#define libeth_xsk_tx_flush_bulk(bq, flags, prep, xmit) \ + __libeth_xdp_tx_flush_bulk(bq, (flags) | LIBETH_XDP_TX_XSK, prep, \ + libeth_xsk_tx_fill_buf, xmit) + +/* XSk TMO */ + +/** + * libeth_xsktmo_req_csum - XSk Tx metadata op to request checksum offload + * @csum_start: unused + * @csum_offset: unused + * @priv: &libeth_xdp_tx_desc from the filling helper + * + * Generic implementation of ::tmo_request_checksum. Works only when HW doesn't + * require filling checksum offsets and other parameters beside the checksum + * request bit. + * Consider using within @libeth_xsktmo unless the driver requires HW-specific + * callbacks. + */ +static inline void libeth_xsktmo_req_csum(u16 csum_start, u16 csum_offset, + void *priv) +{ + ((struct libeth_xdp_tx_desc *)priv)->flags |= LIBETH_XDP_TX_CSUM; +} + +/* Only to inline the callbacks below, use @libeth_xsktmo in drivers instead */ +static const struct xsk_tx_metadata_ops __libeth_xsktmo = { + .tmo_request_checksum = libeth_xsktmo_req_csum, +}; + +/** + * __libeth_xsk_xmit_fill_buf_md - internal helper to prepare XSk xmit w/meta + * @xdesc: &xdp_desc from the XSk buffer pool + * @sq: XDPSQ abstraction for the queue + * @priv: XSk Tx metadata ops + * + * Same as __libeth_xsk_xmit_fill_buf(), but requests metadata pointer and + * fills additional fields in &libeth_xdp_tx_desc to ask for metadata offload. + * + * Return: XDP Tx descriptor with the DMA, metadata request bits, and other + * info to pass to the driver callback. + */ +static __always_inline struct libeth_xdp_tx_desc +__libeth_xsk_xmit_fill_buf_md(const struct xdp_desc *xdesc, + const struct libeth_xdpsq *sq, + u64 priv) +{ + const struct xsk_tx_metadata_ops *tmo = libeth_xdp_priv_to_ptr(priv); + struct libeth_xdp_tx_desc desc; + struct xdp_desc_ctx ctx; + + ctx = xsk_buff_raw_get_ctx(sq->pool, xdesc->addr); + desc = (typeof(desc)){ + .addr = ctx.dma, + __libeth_xdp_tx_len(xdesc->len), + }; + + BUILD_BUG_ON(!__builtin_constant_p(tmo == libeth_xsktmo)); + tmo = tmo == libeth_xsktmo ? &__libeth_xsktmo : tmo; + + xsk_tx_metadata_request(ctx.meta, tmo, &desc); + + return desc; +} + +/* XSk xmit implementation */ + +/** + * __libeth_xsk_xmit_fill_buf - internal helper to prepare XSk xmit w/o meta + * @xdesc: &xdp_desc from the XSk buffer pool + * @sq: XDPSQ abstraction for the queue + * + * Return: XDP Tx descriptor with the DMA and other info to pass to + * the driver callback. + */ +static inline struct libeth_xdp_tx_desc +__libeth_xsk_xmit_fill_buf(const struct xdp_desc *xdesc, + const struct libeth_xdpsq *sq) +{ + return (struct libeth_xdp_tx_desc){ + .addr = xsk_buff_raw_get_dma(sq->pool, xdesc->addr), + __libeth_xdp_tx_len(xdesc->len), + }; +} + +/** + * libeth_xsk_xmit_fill_buf - internal helper to prepare an XSk xmit + * @frm: &xdp_desc from the XSk buffer pool + * @i: index on the HW queue + * @sq: XDPSQ abstraction for the queue + * @priv: XSk Tx metadata ops + * + * Depending on the metadata ops presence (determined at compile time), calls + * the quickest helper to build a libeth XDP Tx descriptor. + * + * Return: XDP Tx descriptor with the synced DMA, metadata request bits, + * and other info to pass to the driver callback. + */ +static __always_inline struct libeth_xdp_tx_desc +libeth_xsk_xmit_fill_buf(struct libeth_xdp_tx_frame frm, u32 i, + const struct libeth_xdpsq *sq, u64 priv) +{ + struct libeth_xdp_tx_desc desc; + + if (priv) + desc = __libeth_xsk_xmit_fill_buf_md(&frm.desc, sq, priv); + else + desc = __libeth_xsk_xmit_fill_buf(&frm.desc, sq); + + desc.flags |= xsk_is_eop_desc(&frm.desc) ? LIBETH_XDP_TX_LAST : 0; + + xsk_buff_raw_dma_sync_for_device(sq->pool, desc.addr, desc.len); + + return desc; +} + +/** + * libeth_xsk_xmit_do_bulk - send XSk xmit frames + * @pool: XSk buffer pool containing the frames to send + * @xdpsq: opaque pointer to driver's XDPSQ struct + * @budget: maximum number of frames can be sent + * @tmo: optional XSk Tx metadata ops + * @prep: driver callback to build a &libeth_xdpsq + * @xmit: driver callback to put frames to a HW queue + * @finalize: driver callback to start a transmission + * + * Implements generic XSk xmit. Always turns on XSk Tx wakeup as it's assumed + * lazy cleaning is used and interrupts are disabled for the queue. + * HW descriptor filling is unrolled by ``LIBETH_XDP_TX_BATCH`` to optimize + * writes. + * Note that unlike other XDP Tx ops, the queue must be locked and cleaned + * prior to calling this function to already know available @budget. + * @prepare must only build a &libeth_xdpsq and return ``U32_MAX``. + * + * Return: false if @budget was exhausted, true otherwise. + */ +static __always_inline bool +libeth_xsk_xmit_do_bulk(struct xsk_buff_pool *pool, void *xdpsq, u32 budget, + const struct xsk_tx_metadata_ops *tmo, + u32 (*prep)(void *xdpsq, struct libeth_xdpsq *sq), + void (*xmit)(struct libeth_xdp_tx_desc desc, u32 i, + const struct libeth_xdpsq *sq, u64 priv), + void (*finalize)(void *xdpsq, bool sent, bool flush)) +{ + const struct libeth_xdp_tx_frame *bulk; + bool wake; + u32 n; + + wake = xsk_uses_need_wakeup(pool); + if (wake) + xsk_clear_tx_need_wakeup(pool); + + n = xsk_tx_peek_release_desc_batch(pool, budget); + bulk = container_of(&pool->tx_descs[0], typeof(*bulk), desc); + + libeth_xdp_tx_xmit_bulk(bulk, xdpsq, n, true, + libeth_xdp_ptr_to_priv(tmo), prep, + libeth_xsk_xmit_fill_buf, xmit); + finalize(xdpsq, n, true); + + if (wake) + xsk_set_tx_need_wakeup(pool); + + return n < budget; +} + +/* Rx polling path */ + +/** + * libeth_xsk_tx_init_bulk - initialize XDP Tx bulk for an XSk Rx NAPI poll + * @bq: bulk to initialize + * @prog: RCU pointer to the XDP program (never %NULL) + * @dev: target &net_device + * @xdpsqs: array of driver XDPSQ structs + * @num: number of active XDPSQs, the above array length + * + * Should be called on an onstack XDP Tx bulk before the XSk NAPI polling loop. + * Initializes all the needed fields to run libeth_xdp functions. + * Never checks if @prog is %NULL or @num == 0 as XDP must always be enabled + * when hitting this path. + */ +#define libeth_xsk_tx_init_bulk(bq, prog, dev, xdpsqs, num) \ + __libeth_xdp_tx_init_bulk(bq, prog, dev, xdpsqs, num, true, \ + __UNIQUE_ID(bq_), __UNIQUE_ID(nqs_)) + +struct libeth_xdp_buff *libeth_xsk_buff_add_frag(struct libeth_xdp_buff *head, + struct libeth_xdp_buff *xdp); + +/** + * libeth_xsk_process_buff - attach XSk Rx buffer to &libeth_xdp_buff + * @head: head XSk buffer to attach the XSk buffer to (or %NULL) + * @xdp: XSk buffer to process + * @len: received data length from the descriptor + * + * If @head == %NULL, treats the XSk buffer as head and initializes + * the required fields. Otherwise, attaches the buffer as a frag. + * Already performs DMA sync-for-CPU and frame start prefetch + * (for head buffers only). + * + * Return: head XSk buffer on success or if the descriptor must be skipped + * (empty), %NULL if there is no space for a new frag. + */ +static inline struct libeth_xdp_buff * +libeth_xsk_process_buff(struct libeth_xdp_buff *head, + struct libeth_xdp_buff *xdp, u32 len) +{ + if (unlikely(!len)) { + libeth_xsk_buff_free_slow(xdp); + return head; + } + + xsk_buff_set_size(&xdp->base, len); + xsk_buff_dma_sync_for_cpu(&xdp->base); + + if (head) + return libeth_xsk_buff_add_frag(head, xdp); + + prefetch(xdp->data); + + return xdp; +} + +void libeth_xsk_buff_stats_frags(struct libeth_rq_napi_stats *rs, + const struct libeth_xdp_buff *xdp); + +u32 __libeth_xsk_run_prog_slow(struct libeth_xdp_buff *xdp, + const struct libeth_xdp_tx_bulk *bq, + enum xdp_action act, int ret); + +/** + * __libeth_xsk_run_prog - run XDP program on XSk buffer + * @xdp: XSk buffer to run the prog on + * @bq: buffer bulk for ``XDP_TX`` queueing + * + * Internal inline abstraction to run XDP program on XSk Rx path. Handles + * only the most common ``XDP_REDIRECT`` inline, the rest is processed + * externally. + * Reports an XDP prog exception on errors. + * + * Return: libeth_xdp prog verdict depending on the prog's verdict. + */ +static __always_inline u32 +__libeth_xsk_run_prog(struct libeth_xdp_buff *xdp, + const struct libeth_xdp_tx_bulk *bq) +{ + enum xdp_action act; + int ret = 0; + + act = bpf_prog_run_xdp(bq->prog, &xdp->base); + if (unlikely(act != XDP_REDIRECT)) +rest: + return __libeth_xsk_run_prog_slow(xdp, bq, act, ret); + + ret = xdp_do_redirect(bq->dev, &xdp->base, bq->prog); + if (unlikely(ret)) + goto rest; + + return LIBETH_XDP_REDIRECT; +} + +/** + * libeth_xsk_run_prog - run XDP program on XSk path and handle all verdicts + * @xdp: XSk buffer to process + * @bq: XDP Tx bulk to queue ``XDP_TX`` buffers + * @fl: driver ``XDP_TX`` bulk flush callback + * + * Run the attached XDP program and handle all possible verdicts. + * Prefer using it via LIBETH_XSK_DEFINE_RUN{,_PASS,_PROG}(). + * + * Return: libeth_xdp prog verdict depending on the prog's verdict. + */ +#define libeth_xsk_run_prog(xdp, bq, fl) \ + __libeth_xdp_run_flush(xdp, bq, __libeth_xsk_run_prog, \ + libeth_xsk_tx_queue_bulk, fl) + +/** + * __libeth_xsk_run_pass - helper to run XDP program and handle the result + * @xdp: XSk buffer to process + * @bq: XDP Tx bulk to queue ``XDP_TX`` frames + * @napi: NAPI to build an skb and pass it up the stack + * @rs: onstack libeth RQ stats + * @md: metadata that should be filled to the XSk buffer + * @prep: callback for filling the metadata + * @run: driver wrapper to run XDP program + * @populate: driver callback to populate an skb with the HW descriptor data + * + * Inline abstraction, XSk's counterpart of __libeth_xdp_run_pass(), see its + * doc for details. + * + * Return: false if the polling loop must be exited due to lack of free + * buffers, true otherwise. + */ +static __always_inline bool +__libeth_xsk_run_pass(struct libeth_xdp_buff *xdp, + struct libeth_xdp_tx_bulk *bq, struct napi_struct *napi, + struct libeth_rq_napi_stats *rs, const void *md, + void (*prep)(struct libeth_xdp_buff *xdp, + const void *md), + u32 (*run)(struct libeth_xdp_buff *xdp, + struct libeth_xdp_tx_bulk *bq), + bool (*populate)(struct sk_buff *skb, + const struct libeth_xdp_buff *xdp, + struct libeth_rq_napi_stats *rs)) +{ + struct sk_buff *skb; + u32 act; + + rs->bytes += xdp->base.data_end - xdp->data; + rs->packets++; + + if (unlikely(xdp_buff_has_frags(&xdp->base))) + libeth_xsk_buff_stats_frags(rs, xdp); + + if (prep && (!__builtin_constant_p(!!md) || md)) + prep(xdp, md); + + act = run(xdp, bq); + if (likely(act == LIBETH_XDP_REDIRECT)) + return true; + + if (act != LIBETH_XDP_PASS) + return act != LIBETH_XDP_ABORTED; + + skb = xdp_build_skb_from_zc(&xdp->base); + if (unlikely(!skb)) { + libeth_xsk_buff_free_slow(xdp); + return true; + } + + if (unlikely(!populate(skb, xdp, rs))) { + napi_consume_skb(skb, true); + return true; + } + + napi_gro_receive(napi, skb); + + return true; +} + +/** + * libeth_xsk_run_pass - helper to run XDP program and handle the result + * @xdp: XSk buffer to process + * @bq: XDP Tx bulk to queue ``XDP_TX`` frames + * @napi: NAPI to build an skb and pass it up the stack + * @rs: onstack libeth RQ stats + * @desc: pointer to the HW descriptor for that frame + * @run: driver wrapper to run XDP program + * @populate: driver callback to populate an skb with the HW descriptor data + * + * Wrapper around the underscored version when "fill the descriptor metadata" + * means just writing the pointer to the HW descriptor as @xdp->desc. + */ +#define libeth_xsk_run_pass(xdp, bq, napi, rs, desc, run, populate) \ + __libeth_xsk_run_pass(xdp, bq, napi, rs, desc, libeth_xdp_prep_desc, \ + run, populate) + +/** + * libeth_xsk_finalize_rx - finalize XDPSQ after an XSk NAPI polling loop + * @bq: ``XDP_TX`` frame bulk + * @flush: driver callback to flush the bulk + * @finalize: driver callback to start sending the frames and run the timer + * + * Flush the bulk if there are frames left to send, kick the queue and flush + * the XDP maps. + */ +#define libeth_xsk_finalize_rx(bq, flush, finalize) \ + __libeth_xdp_finalize_rx(bq, LIBETH_XDP_TX_XSK, flush, finalize) + +/* + * Helpers to reduce boilerplate code in drivers. + * + * Typical driver XSk Rx flow would be (excl. bulk and buff init, frag attach): + * + * LIBETH_XDP_DEFINE_START(); + * LIBETH_XSK_DEFINE_FLUSH_TX(static driver_xsk_flush_tx, driver_xsk_tx_prep, + * driver_xdp_xmit); + * LIBETH_XSK_DEFINE_RUN(static driver_xsk_run, driver_xsk_run_prog, + * driver_xsk_flush_tx, driver_populate_skb); + * LIBETH_XSK_DEFINE_FINALIZE(static driver_xsk_finalize_rx, + * driver_xsk_flush_tx, driver_xdp_finalize_sq); + * LIBETH_XDP_DEFINE_END(); + * + * This will build a set of 4 static functions. The compiler is free to decide + * whether to inline them. + * Then, in the NAPI polling function: + * + * while (packets < budget) { + * // ... + * if (!driver_xsk_run(xdp, &bq, napi, &rs, desc)) + * break; + * } + * driver_xsk_finalize_rx(&bq); + */ + +/** + * LIBETH_XSK_DEFINE_FLUSH_TX - define a driver XSk ``XDP_TX`` flush function + * @name: name of the function to define + * @prep: driver callback to clean an XDPSQ + * @xmit: driver callback to write a HW Tx descriptor + */ +#define LIBETH_XSK_DEFINE_FLUSH_TX(name, prep, xmit) \ + __LIBETH_XDP_DEFINE_FLUSH_TX(name, prep, xmit, xsk) + +/** + * LIBETH_XSK_DEFINE_RUN_PROG - define a driver XDP program run function + * @name: name of the function to define + * @flush: driver callback to flush an XSk ``XDP_TX`` bulk + */ +#define LIBETH_XSK_DEFINE_RUN_PROG(name, flush) \ + u32 __LIBETH_XDP_DEFINE_RUN_PROG(name, flush, xsk) + +/** + * LIBETH_XSK_DEFINE_RUN_PASS - define a driver buffer process + pass function + * @name: name of the function to define + * @run: driver callback to run XDP program (above) + * @populate: driver callback to fill an skb with HW descriptor info + */ +#define LIBETH_XSK_DEFINE_RUN_PASS(name, run, populate) \ + bool __LIBETH_XDP_DEFINE_RUN_PASS(name, run, populate, xsk) + +/** + * LIBETH_XSK_DEFINE_RUN - define a driver buffer process, run + pass function + * @name: name of the function to define + * @run: name of the XDP prog run function to define + * @flush: driver callback to flush an XSk ``XDP_TX`` bulk + * @populate: driver callback to fill an skb with HW descriptor info + */ +#define LIBETH_XSK_DEFINE_RUN(name, run, flush, populate) \ + __LIBETH_XDP_DEFINE_RUN(name, run, flush, populate, XSK) + +/** + * LIBETH_XSK_DEFINE_FINALIZE - define a driver XSk NAPI poll finalize function + * @name: name of the function to define + * @flush: driver callback to flush an XSk ``XDP_TX`` bulk + * @finalize: driver callback to finalize an XDPSQ and run the timer + */ +#define LIBETH_XSK_DEFINE_FINALIZE(name, flush, finalize) \ + __LIBETH_XDP_DEFINE_FINALIZE(name, flush, finalize, xsk) + +/* Refilling */ + +/** + * struct libeth_xskfq - structure representing an XSk buffer (fill) queue + * @fp: hotpath part of the structure + * @pool: &xsk_buff_pool for buffer management + * @fqes: array of XSk buffer pointers + * @descs: opaque pointer to the HW descriptor array + * @ntu: index of the next buffer to poll + * @count: number of descriptors/buffers the queue has + * @pending: current number of XSkFQEs to refill + * @thresh: threshold below which the queue is refilled + * @buf_len: HW-writeable length per each buffer + * @nid: ID of the closest NUMA node with memory + */ +struct libeth_xskfq { + struct_group_tagged(libeth_xskfq_fp, fp, + struct xsk_buff_pool *pool; + struct libeth_xdp_buff **fqes; + void *descs; + + u32 ntu; + u32 count; + ); + + /* Cold fields */ + u32 pending; + u32 thresh; + + u32 buf_len; + int nid; +}; + +int libeth_xskfq_create(struct libeth_xskfq *fq); +void libeth_xskfq_destroy(struct libeth_xskfq *fq); + +/** + * libeth_xsk_buff_xdp_get_dma - get DMA address of XSk &libeth_xdp_buff + * @xdp: buffer to get the DMA addr for + */ +#define libeth_xsk_buff_xdp_get_dma(xdp) \ + xsk_buff_xdp_get_dma(&(xdp)->base) + +/** + * libeth_xskfqe_alloc - allocate @n XSk Rx buffers + * @fq: hotpath part of the XSkFQ, usually onstack + * @n: number of buffers to allocate + * @fill: driver callback to write DMA addresses to HW descriptors + * + * Note that @fq->ntu gets updated, but ::pending must be recalculated + * by the caller. + * + * Return: number of buffers refilled. + */ +static __always_inline u32 +libeth_xskfqe_alloc(struct libeth_xskfq_fp *fq, u32 n, + void (*fill)(const struct libeth_xskfq_fp *fq, u32 i)) +{ + u32 this, ret, done = 0; + struct xdp_buff **xskb; + + this = fq->count - fq->ntu; + if (likely(this > n)) + this = n; + +again: + xskb = (typeof(xskb))&fq->fqes[fq->ntu]; + ret = xsk_buff_alloc_batch(fq->pool, xskb, this); + + for (u32 i = 0, ntu = fq->ntu; likely(i < ret); i++) + fill(fq, ntu + i); + + done += ret; + fq->ntu += ret; + + if (likely(fq->ntu < fq->count) || unlikely(ret < this)) + goto out; + + fq->ntu = 0; + + if (this < n) { + this = n - this; + goto again; + } + +out: + return done; +} + +/* .ndo_xsk_wakeup */ + +void libeth_xsk_init_wakeup(call_single_data_t *csd, struct napi_struct *napi); +void libeth_xsk_wakeup(call_single_data_t *csd, u32 qid); + +/* Pool setup */ + +int libeth_xsk_setup_pool(struct net_device *dev, u32 qid, bool enable); + +#endif /* __LIBETH_XSK_H */ diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index 39cd50300a18..26232f603e33 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -116,11 +116,9 @@ int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, unsigned int num); int lwtunnel_valid_encap_type(u16 encap_type, - struct netlink_ext_ack *extack, - bool rtnl_is_held); + struct netlink_ext_ack *extack); int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len, - struct netlink_ext_ack *extack, - bool rtnl_is_held); + struct netlink_ext_ack *extack); int lwtunnel_build_state(struct net *net, u16 encap_type, struct nlattr *encap, unsigned int family, const void *cfg, @@ -140,12 +138,12 @@ int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, static inline void lwtunnel_set_redirect(struct dst_entry *dst) { if (lwtunnel_output_redirect(dst->lwtstate)) { - dst->lwtstate->orig_output = dst->output; - dst->output = lwtunnel_output; + dst->lwtstate->orig_output = READ_ONCE(dst->output); + WRITE_ONCE(dst->output, lwtunnel_output); } if (lwtunnel_input_redirect(dst->lwtstate)) { - dst->lwtstate->orig_input = dst->input; - dst->input = lwtunnel_input; + dst->lwtstate->orig_input = READ_ONCE(dst->input); + WRITE_ONCE(dst->input, lwtunnel_input); } } #else @@ -203,15 +201,14 @@ static inline int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, } static inline int lwtunnel_valid_encap_type(u16 encap_type, - struct netlink_ext_ack *extack, - bool rtnl_is_held) + struct netlink_ext_ack *extack) { NL_SET_ERR_MSG(extack, "CONFIG_LWTUNNEL is not enabled in this kernel"); return -EOPNOTSUPP; } + static inline int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len, - struct netlink_ext_ack *extack, - bool rtnl_is_held) + struct netlink_ext_ack *extack) { /* return 0 since we are not walking attr looking for * RTA_ENCAP_TYPE attribute on nexthops. diff --git a/include/net/mac80211.h b/include/net/mac80211.h index c498f685d01f..a45e4bee65d4 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -682,6 +682,9 @@ struct ieee80211_parsed_tpe { * responder functionality. * @ftmr_params: configurable lci/civic parameter when enabling FTM responder. * @nontransmitted: this BSS is a nontransmitted BSS profile + * @tx_bss_conf: Pointer to the BSS configuration of transmitting interface + * if MBSSID is enabled. This pointer is RCU-protected due to CSA finish + * and BSS color change flows accessing it. * @transmitter_bssid: the address of transmitter AP * @bssid_index: index inside the multiple BSSID set * @bssid_indicator: 2^bssid_indicator is the maximum number of APs in set @@ -741,6 +744,7 @@ struct ieee80211_parsed_tpe { * @eht_80mhz_full_bw_ul_mumimo: in AP-mode, does this BSS support the * reception of an EHT TB PPDU on an RU that spans the entire PPDU * bandwidth + * @eht_disable_mcs15: disable EHT-MCS 15 reception capability. * @bss_param_ch_cnt: in BSS-mode, the BSS params change count. This * information is the latest known value. It can come from this link's * beacon or from a beacon sent by another link. @@ -754,6 +758,8 @@ struct ieee80211_parsed_tpe { * be updated to 1, even if bss_param_ch_cnt didn't change. This allows * the link to know that it heard the latest value from its own beacon * (as opposed to hearing its value from another link's beacon). + * @s1g_long_beacon_period: number of beacon intervals between each long + * beacon transmission. */ struct ieee80211_bss_conf { struct ieee80211_vif *vif; @@ -804,6 +810,7 @@ struct ieee80211_bss_conf { struct ieee80211_ftm_responder_params *ftmr_params; /* Multiple BSSID data */ bool nontransmitted; + struct ieee80211_bss_conf __rcu *tx_bss_conf; u8 transmitter_bssid[ETH_ALEN]; u8 bssid_index; u8 bssid_indicator; @@ -848,8 +855,12 @@ struct ieee80211_bss_conf { bool eht_su_beamformee; bool eht_mu_beamformer; bool eht_80mhz_full_bw_ul_mumimo; + bool eht_disable_mcs15; + u8 bss_param_ch_cnt; u8 bss_param_ch_cnt_link_id; + + u8 s1g_long_beacon_period; }; /** @@ -2023,7 +2034,6 @@ enum ieee80211_neg_ttlm_res { * @txq: the multicast data TX queue * @offload_flags: 802.3 -> 802.11 enapsulation offload flags, see * &enum ieee80211_offload_flags. - * @mbssid_tx_vif: Pointer to the transmitting interface if MBSSID is enabled. */ struct ieee80211_vif { enum nl80211_iftype type; @@ -2052,8 +2062,6 @@ struct ieee80211_vif { bool probe_req_reg; bool rx_mcast_action_reg; - struct ieee80211_vif *mbssid_tx_vif; - /* must be last */ u8 drv_priv[] __aligned(sizeof(void *)); }; @@ -2424,6 +2432,7 @@ struct ieee80211_sta_aggregates { * @he_cap: HE capabilities of this STA * @he_6ghz_capa: on 6 GHz, holds the HE 6 GHz band capabilities * @eht_cap: EHT capabilities of this STA + * @s1g_cap: S1G capabilities of this STA * @agg: per-link data for multi-link aggregation * @bandwidth: current bandwidth the station can receive with * @rx_nss: in HT/VHT, the maximum number of spatial streams the @@ -2446,6 +2455,7 @@ struct ieee80211_link_sta { struct ieee80211_sta_he_cap he_cap; struct ieee80211_he_6ghz_capa he_6ghz_capa; struct ieee80211_sta_eht_cap eht_cap; + struct ieee80211_sta_s1g_cap s1g_cap; struct ieee80211_sta_aggregates agg; @@ -2488,6 +2498,7 @@ struct ieee80211_link_sta { * @max_amsdu_subframes: indicates the maximal number of MSDUs in a single * A-MSDU. Taken from the Extended Capabilities element. 0 means * unlimited. + * @eml_cap: EML capabilities of this MLO station * @cur: currently valid data as aggregated from the active links * For non MLO STA it will point to the deflink data. For MLO STA * ieee80211_sta_recalc_aggregates() must be called to update it. @@ -2522,6 +2533,7 @@ struct ieee80211_sta { bool mlo; bool spp_amsdu; u8 max_amsdu_subframes; + u16 eml_cap; struct ieee80211_sta_aggregates *cur; @@ -2844,8 +2856,6 @@ struct ieee80211_txq { * * @IEEE80211_HW_DISALLOW_PUNCTURING: HW requires disabling puncturing in EHT * and connecting with a lower bandwidth instead - * @IEEE80211_HW_DISALLOW_PUNCTURING_5GHZ: HW requires disabling puncturing in - * EHT in 5 GHz and connecting with a lower bandwidth instead * * @IEEE80211_HW_HANDLES_QUIET_CSA: HW/driver handles quieting for CSA, so * no need to stop queues. This really should be set by a driver that @@ -2915,7 +2925,6 @@ enum ieee80211_hw_flags { IEEE80211_HW_DETECTS_COLOR_COLLISION, IEEE80211_HW_MLO_MCAST_MULTI_LINK_TX, IEEE80211_HW_DISALLOW_PUNCTURING, - IEEE80211_HW_DISALLOW_PUNCTURING_5GHZ, IEEE80211_HW_HANDLES_QUIET_CSA, IEEE80211_HW_STRICT, @@ -4127,6 +4136,15 @@ struct ieee80211_prep_tx_info { * Statistics that the driver doesn't fill will be filled by mac80211. * The callback can sleep. * + * @link_sta_statistics: Get link statistics for this station. For example with + * beacon filtering, the statistics kept by mac80211 might not be + * accurate, so let the driver pre-fill the statistics. The driver can + * fill most of the values (indicating which by setting the filled + * bitmap), but not all of them make sense - see the source for which + * ones are possible. + * Statistics that the driver doesn't fill will be filled by mac80211. + * The callback can sleep. + * * @conf_tx: Configure TX queue parameters (EDCF (aifs, cw_min, cw_max), * bursting) for a hardware TX queue. * Returns a negative error code on failure. @@ -4296,6 +4314,8 @@ struct ieee80211_prep_tx_info { * @mgd_complete_tx: Notify the driver that the response frame for a previously * transmitted frame announced with @mgd_prepare_tx was received, the data * is filled similarly to @mgd_prepare_tx though the duration is not used. + * Note that this isn't always called for each mgd_prepare_tx() call, for + * example for SAE the 'confirm' messages can be on the air in any order. * * @mgd_protect_tdls_discover: Protect a TDLS discovery session. After sending * a TDLS discovery-request, we expect a reply to arrive on the AP's @@ -4460,6 +4480,8 @@ struct ieee80211_prep_tx_info { * new links bitmaps may be 0 if going from/to a non-MLO situation. * The @old array contains pointers to the old bss_conf structures * that were already removed, in case they're needed. + * Note that removal of link should always succeed, so the return value + * will be ignored in a removal only case. * This callback can sleep. * @change_sta_links: Change the valid links of a station, similar to * @change_vif_links. This callback can sleep. @@ -4502,7 +4524,7 @@ struct ieee80211_ops { enum nl80211_iftype new_type, bool p2p); void (*remove_interface)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); - int (*config)(struct ieee80211_hw *hw, u32 changed); + int (*config)(struct ieee80211_hw *hw, int radio_idx, u32 changed); void (*bss_info_changed)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *info, @@ -4565,8 +4587,10 @@ struct ieee80211_ops { void (*get_key_seq)(struct ieee80211_hw *hw, struct ieee80211_key_conf *key, struct ieee80211_key_seq *seq); - int (*set_frag_threshold)(struct ieee80211_hw *hw, u32 value); - int (*set_rts_threshold)(struct ieee80211_hw *hw, u32 value); + int (*set_frag_threshold)(struct ieee80211_hw *hw, int radio_idx, + u32 value); + int (*set_rts_threshold)(struct ieee80211_hw *hw, int radio_idx, + u32 value); int (*sta_add)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); int (*sta_remove)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, @@ -4621,6 +4645,10 @@ struct ieee80211_ops { s64 offset); void (*reset_tsf)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); int (*tx_last_beacon)(struct ieee80211_hw *hw); + void (*link_sta_statistics)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_link_sta *link_sta, + struct link_station_info *link_sinfo); /** * @ampdu_action: @@ -4659,7 +4687,8 @@ struct ieee80211_ops { int (*get_survey)(struct ieee80211_hw *hw, int idx, struct survey_info *survey); void (*rfkill_poll)(struct ieee80211_hw *hw); - void (*set_coverage_class)(struct ieee80211_hw *hw, s16 coverage_class); + void (*set_coverage_class)(struct ieee80211_hw *hw, int radio_idx, + s16 coverage_class); #ifdef CONFIG_NL80211_TESTMODE int (*testmode_cmd)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, void *data, int len); @@ -4674,8 +4703,10 @@ struct ieee80211_ops { void (*channel_switch)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_channel_switch *ch_switch); - int (*set_antenna)(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant); - int (*get_antenna)(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant); + int (*set_antenna)(struct ieee80211_hw *hw, int radio_idx, + u32 tx_ant, u32 rx_ant); + int (*get_antenna)(struct ieee80211_hw *hw, int radio_idx, + u32 *tx_ant, u32 *rx_ant); int (*remain_on_channel)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, @@ -5347,22 +5378,6 @@ void ieee80211_get_tx_rates(struct ieee80211_vif *vif, int max_rates); /** - * ieee80211_sta_set_expected_throughput - set the expected tpt for a station - * - * Call this function to notify mac80211 about a change in expected throughput - * to a station. A driver for a device that does rate control in firmware can - * call this function when the expected throughput estimate towards a station - * changes. The information is used to tune the CoDel AQM applied to traffic - * going towards that station (which can otherwise be too aggressive and cause - * slow stations to starve). - * - * @pubsta: the station to set throughput for. - * @thr: the current expected throughput in kbps. - */ -void ieee80211_sta_set_expected_throughput(struct ieee80211_sta *pubsta, - u32 thr); - -/** * ieee80211_tx_rate_update - transmit rate update callback * * Drivers should call this functions with a non-NULL pub sta @@ -6018,21 +6033,12 @@ void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf, int tid, struct ieee80211_key_seq *seq); /** - * ieee80211_remove_key - remove the given key - * @keyconf: the parameter passed with the set key - * - * Context: Must be called with the wiphy mutex held. - * - * Remove the given key. If the key was uploaded to the hardware at the - * time this function is called, it is not deleted in the hardware but - * instead assumed to have been removed already. - */ -void ieee80211_remove_key(struct ieee80211_key_conf *keyconf); - -/** * ieee80211_gtk_rekey_add - add a GTK key from rekeying during WoWLAN * @vif: the virtual interface to add the key on - * @keyconf: new key data + * @idx: the keyidx of the key + * @key_data: the key data + * @key_len: the key data. Might be bigger than the actual key length, + * but not smaller (for the driver convinence) * @link_id: the link id of the key or -1 for non-MLO * * When GTK rekeying was done while the system was suspended, (a) new @@ -6055,13 +6061,11 @@ void ieee80211_remove_key(struct ieee80211_key_conf *keyconf); * for the new key for each TID to set up sequence counters properly. * * IMPORTANT: If this replaces a key that is present in the hardware, - * then it will attempt to remove it during this call. In many cases - * this isn't what you want, so call ieee80211_remove_key() first for - * the key that's being replaced. + * then it will attempt to remove it during this call. */ struct ieee80211_key_conf * ieee80211_gtk_rekey_add(struct ieee80211_vif *vif, - struct ieee80211_key_conf *keyconf, + u8 idx, u8 *key_data, u8 key_len, int link_id); /** @@ -7252,13 +7256,14 @@ void ieee80211_disable_rssi_reports(struct ieee80211_vif *vif); * ieee80211_ave_rssi - report the average RSSI for the specified interface * * @vif: the specified virtual interface + * @link_id: the link ID for MLO, or -1 for non-MLO * * Note: This function assumes that the given vif is valid. * * Return: The average RSSI value for the requested interface, or 0 if not * applicable. */ -int ieee80211_ave_rssi(struct ieee80211_vif *vif); +int ieee80211_ave_rssi(struct ieee80211_vif *vif, int link_id); /** * ieee80211_report_wowlan_wakeup - report WoWLAN wakeup diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 228603bf03f2..57df78cfbf82 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -10,6 +10,7 @@ #include "shm_channel.h" #define GDMA_STATUS_MORE_ENTRIES 0x00000105 +#define GDMA_STATUS_CMD_UNSUPPORTED 0xffffffff /* Structures labeled with "HW DATA" are exchanged with the hardware. All of * them are naturally aligned and hence don't need __packed. @@ -58,8 +59,10 @@ enum gdma_eqe_type { GDMA_EQE_HWC_INIT_EQ_ID_DB = 129, GDMA_EQE_HWC_INIT_DATA = 130, GDMA_EQE_HWC_INIT_DONE = 131, - GDMA_EQE_HWC_SOC_RECONFIG = 132, + GDMA_EQE_HWC_FPGA_RECONFIG = 132, GDMA_EQE_HWC_SOC_RECONFIG_DATA = 133, + GDMA_EQE_HWC_SOC_SERVICE = 134, + GDMA_EQE_HWC_RESET_REQUEST = 135, GDMA_EQE_RNIC_QP_FATAL = 176, }; @@ -70,6 +73,18 @@ enum { GDMA_DEVICE_MANA_IB = 3, }; +enum gdma_service_type { + GDMA_SERVICE_TYPE_NONE = 0, + GDMA_SERVICE_TYPE_RDMA_SUSPEND = 1, + GDMA_SERVICE_TYPE_RDMA_RESUME = 2, +}; + +struct mana_service_work { + struct work_struct work; + struct gdma_dev *gdma_dev; + enum gdma_service_type event; +}; + struct gdma_resource { /* Protect the bitmap */ spinlock_t lock; @@ -224,6 +239,8 @@ struct gdma_dev { void *driver_data; struct auxiliary_device *adev; + bool is_suspended; + bool rdma_teardown; }; /* MANA_PAGE_SIZE is the DMA unit */ @@ -373,7 +390,7 @@ struct gdma_context { unsigned int max_num_queues; unsigned int max_num_msix; unsigned int num_msix_usable; - struct gdma_irq_context *irq_contexts; + struct xarray irq_contexts; /* L2 MTU */ u16 adapter_mtu; @@ -388,6 +405,8 @@ struct gdma_context { u32 test_event_eq_id; bool is_pf; + bool in_service; + phys_addr_t bar0_pa; void __iomem *bar0_va; void __iomem *shm_base; @@ -407,6 +426,10 @@ struct gdma_context { /* Azure RDMA adapter */ struct gdma_dev mana_ib; + + u64 pf_cap_flags1; + + struct workqueue_struct *service_wq; }; static inline bool mana_gd_is_mana(struct gdma_dev *gd) @@ -553,17 +576,30 @@ enum { */ #define GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX BIT(2) #define GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG BIT(3) +#define GDMA_DRV_CAP_FLAG_1_GDMA_PAGES_4MB_1GB_2GB BIT(4) #define GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT BIT(5) /* Driver can handle holes (zeros) in the device list */ #define GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP BIT(11) +/* Driver supports dynamic MSI-X vector allocation */ +#define GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT BIT(13) + +/* Driver can self reset on EQE notification */ +#define GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE BIT(14) + +/* Driver can self reset on FPGA Reconfig EQE notification */ +#define GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE BIT(17) + #define GDMA_DRV_CAP_FLAGS1 \ (GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT | \ GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX | \ GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG | \ GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT | \ - GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP) + GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP | \ + GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT | \ + GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE | \ + GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE) #define GDMA_DRV_CAP_FLAGS2 0 @@ -707,20 +743,6 @@ struct gdma_query_hwc_timeout_resp { u32 reserved; }; -enum atb_page_size { - ATB_PAGE_SIZE_4K, - ATB_PAGE_SIZE_8K, - ATB_PAGE_SIZE_16K, - ATB_PAGE_SIZE_32K, - ATB_PAGE_SIZE_64K, - ATB_PAGE_SIZE_128K, - ATB_PAGE_SIZE_256K, - ATB_PAGE_SIZE_512K, - ATB_PAGE_SIZE_1M, - ATB_PAGE_SIZE_2M, - ATB_PAGE_SIZE_MAX, -}; - enum gdma_mr_access_flags { GDMA_ACCESS_FLAG_LOCAL_READ = BIT_ULL(0), GDMA_ACCESS_FLAG_LOCAL_WRITE = BIT_ULL(1), @@ -815,6 +837,8 @@ enum gdma_mr_type { * address that is set up in the MST */ GDMA_MR_TYPE_GVA = 2, + /* Guest zero-based address MRs */ + GDMA_MR_TYPE_ZBVA = 4, }; struct gdma_create_mr_params { @@ -826,6 +850,10 @@ struct gdma_create_mr_params { u64 virtual_address; enum gdma_mr_access_flags access_flags; } gva; + struct { + u64 dma_region_handle; + enum gdma_mr_access_flags access_flags; + } zbva; }; }; @@ -841,7 +869,10 @@ struct gdma_create_mr_request { u64 virtual_address; enum gdma_mr_access_flags access_flags; } gva; - + struct { + u64 dma_region_handle; + enum gdma_mr_access_flags access_flags; + } zbva; }; u32 reserved_2; };/* HW DATA */ @@ -893,4 +924,11 @@ int mana_gd_destroy_dma_region(struct gdma_context *gc, u64 dma_region_handle); void mana_register_debugfs(void); void mana_unregister_debugfs(void); +int mana_rdma_service_event(struct gdma_context *gc, enum gdma_service_type event); + +int mana_gd_suspend(struct pci_dev *pdev, pm_message_t state); +int mana_gd_resume(struct pci_dev *pdev); + +bool mana_need_log(struct gdma_context *gc, int err); + #endif /* _GDMA_H */ diff --git a/include/net/mana/hw_channel.h b/include/net/mana/hw_channel.h index 158b125692c2..83cf93338eb3 100644 --- a/include/net/mana/hw_channel.h +++ b/include/net/mana/hw_channel.h @@ -49,6 +49,15 @@ union hwc_init_type_data { }; }; /* HW DATA */ +union hwc_init_soc_service_type { + u32 as_uint32; + + struct { + u32 value : 28; + u32 type : 4; + }; +}; /* HW DATA */ + struct hwc_rx_oob { u32 type : 6; u32 eom : 1; diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 0f78065de8fe..e1030a7d2daa 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -5,6 +5,7 @@ #define _MANA_H #include <net/xdp.h> +#include <net/net_shaper.h> #include "gdma.h" #include "hw_channel.h" @@ -404,10 +405,70 @@ struct mana_ethtool_stats { u64 rx_cqe_unknown_type; }; +struct mana_ethtool_phy_stats { + /* Drop Counters */ + u64 rx_pkt_drop_phy; + u64 tx_pkt_drop_phy; + + /* Per TC traffic Counters */ + u64 rx_pkt_tc0_phy; + u64 tx_pkt_tc0_phy; + u64 rx_pkt_tc1_phy; + u64 tx_pkt_tc1_phy; + u64 rx_pkt_tc2_phy; + u64 tx_pkt_tc2_phy; + u64 rx_pkt_tc3_phy; + u64 tx_pkt_tc3_phy; + u64 rx_pkt_tc4_phy; + u64 tx_pkt_tc4_phy; + u64 rx_pkt_tc5_phy; + u64 tx_pkt_tc5_phy; + u64 rx_pkt_tc6_phy; + u64 tx_pkt_tc6_phy; + u64 rx_pkt_tc7_phy; + u64 tx_pkt_tc7_phy; + + u64 rx_byte_tc0_phy; + u64 tx_byte_tc0_phy; + u64 rx_byte_tc1_phy; + u64 tx_byte_tc1_phy; + u64 rx_byte_tc2_phy; + u64 tx_byte_tc2_phy; + u64 rx_byte_tc3_phy; + u64 tx_byte_tc3_phy; + u64 rx_byte_tc4_phy; + u64 tx_byte_tc4_phy; + u64 rx_byte_tc5_phy; + u64 tx_byte_tc5_phy; + u64 rx_byte_tc6_phy; + u64 tx_byte_tc6_phy; + u64 rx_byte_tc7_phy; + u64 tx_byte_tc7_phy; + + /* Per TC pause Counters */ + u64 rx_pause_tc0_phy; + u64 tx_pause_tc0_phy; + u64 rx_pause_tc1_phy; + u64 tx_pause_tc1_phy; + u64 rx_pause_tc2_phy; + u64 tx_pause_tc2_phy; + u64 rx_pause_tc3_phy; + u64 tx_pause_tc3_phy; + u64 rx_pause_tc4_phy; + u64 tx_pause_tc4_phy; + u64 rx_pause_tc5_phy; + u64 tx_pause_tc5_phy; + u64 rx_pause_tc6_phy; + u64 tx_pause_tc6_phy; + u64 rx_pause_tc7_phy; + u64 tx_pause_tc7_phy; +}; + struct mana_context { struct gdma_dev *gdma_dev; u16 num_ports; + u8 bm_hostmode; struct mana_eq *eqs; struct dentry *mana_eqs_debugfs; @@ -466,13 +527,22 @@ struct mana_port_context { struct mutex vport_mutex; int vport_use_count; + /* Net shaper handle*/ + struct net_shaper_handle handle; + u16 port_idx; + /* Currently configured speed (mbps) */ + u32 speed; + /* Maximum speed supported by the SKU (mbps) */ + u32 max_speed; bool port_is_up; bool port_st_save; /* Saved port state */ struct mana_ethtool_stats eth_stats; + struct mana_ethtool_phy_stats phy_stats; + /* Debugfs */ struct dentry *mana_port_debugfs; }; @@ -488,6 +558,9 @@ int mana_detach(struct net_device *ndev, bool from_close); int mana_probe(struct gdma_dev *gd, bool resuming); void mana_remove(struct gdma_dev *gd, bool suspending); +int mana_rdma_probe(struct gdma_dev *gd); +void mana_rdma_remove(struct gdma_dev *gd); + void mana_xdp_tx(struct sk_buff *skb, struct net_device *ndev); int mana_xdp_xmit(struct net_device *ndev, int n, struct xdp_frame **frames, u32 flags); @@ -497,6 +570,10 @@ struct bpf_prog *mana_xdp_get(struct mana_port_context *apc); void mana_chn_setxdp(struct mana_port_context *apc, struct bpf_prog *prog); int mana_bpf(struct net_device *ndev, struct netdev_bpf *bpf); void mana_query_gf_stats(struct mana_port_context *apc); +int mana_query_link_cfg(struct mana_port_context *apc); +int mana_set_bw_clamp(struct mana_port_context *apc, u32 speed, + int enable_clamping); +void mana_query_phy_stats(struct mana_port_context *apc); int mana_pre_alloc_rxbufs(struct mana_port_context *apc, int mtu, int num_queues); void mana_pre_dealloc_rxbufs(struct mana_port_context *apc); @@ -523,6 +600,9 @@ enum mana_command_code { MANA_FENCE_RQ = 0x20006, MANA_CONFIG_VPORT_RX = 0x20007, MANA_QUERY_VPORT_CONFIG = 0x20008, + MANA_QUERY_LINK_CONFIG = 0x2000A, + MANA_SET_BW_CLAMP = 0x2000B, + MANA_QUERY_PHY_STAT = 0x2000c, /* Privileged commands for the PF mode */ MANA_REGISTER_FILTER = 0x28000, @@ -531,6 +611,35 @@ enum mana_command_code { MANA_DEREGISTER_HW_PORT = 0x28004, }; +/* Query Link Configuration*/ +struct mana_query_link_config_req { + struct gdma_req_hdr hdr; + mana_handle_t vport; +}; /* HW DATA */ + +struct mana_query_link_config_resp { + struct gdma_resp_hdr hdr; + u32 qos_speed_mbps; + u8 qos_unconfigured; + u8 reserved1[3]; + u32 link_speed_mbps; + u8 reserved2[4]; +}; /* HW DATA */ + +/* Set Bandwidth Clamp*/ +struct mana_set_bw_clamp_req { + struct gdma_req_hdr hdr; + mana_handle_t vport; + enum TRI_STATE enable_clamping; + u32 link_speed_mbps; +}; /* HW DATA */ + +struct mana_set_bw_clamp_resp { + struct gdma_resp_hdr hdr; + u8 qos_unconfigured; + u8 reserved[7]; +}; /* HW DATA */ + /* Query Device Configuration */ struct mana_query_device_cfg_req { struct gdma_req_hdr hdr; @@ -557,7 +666,8 @@ struct mana_query_device_cfg_resp { u64 pf_cap_flags4; u16 max_num_vports; - u16 reserved; + u8 bm_hostmode; /* response v3: Bare Metal Host Mode */ + u8 reserved; u32 max_num_eqs; /* response v2: */ @@ -684,6 +794,74 @@ struct mana_query_gf_stat_resp { u64 tx_err_gdma; }; /* HW DATA */ +/* Query phy stats */ +struct mana_query_phy_stat_req { + struct gdma_req_hdr hdr; + u64 req_stats; +}; /* HW DATA */ + +struct mana_query_phy_stat_resp { + struct gdma_resp_hdr hdr; + u64 reported_stats; + + /* Aggregate Drop Counters */ + u64 rx_pkt_drop_phy; + u64 tx_pkt_drop_phy; + + /* Per TC(Traffic class) traffic Counters */ + u64 rx_pkt_tc0_phy; + u64 tx_pkt_tc0_phy; + u64 rx_pkt_tc1_phy; + u64 tx_pkt_tc1_phy; + u64 rx_pkt_tc2_phy; + u64 tx_pkt_tc2_phy; + u64 rx_pkt_tc3_phy; + u64 tx_pkt_tc3_phy; + u64 rx_pkt_tc4_phy; + u64 tx_pkt_tc4_phy; + u64 rx_pkt_tc5_phy; + u64 tx_pkt_tc5_phy; + u64 rx_pkt_tc6_phy; + u64 tx_pkt_tc6_phy; + u64 rx_pkt_tc7_phy; + u64 tx_pkt_tc7_phy; + + u64 rx_byte_tc0_phy; + u64 tx_byte_tc0_phy; + u64 rx_byte_tc1_phy; + u64 tx_byte_tc1_phy; + u64 rx_byte_tc2_phy; + u64 tx_byte_tc2_phy; + u64 rx_byte_tc3_phy; + u64 tx_byte_tc3_phy; + u64 rx_byte_tc4_phy; + u64 tx_byte_tc4_phy; + u64 rx_byte_tc5_phy; + u64 tx_byte_tc5_phy; + u64 rx_byte_tc6_phy; + u64 tx_byte_tc6_phy; + u64 rx_byte_tc7_phy; + u64 tx_byte_tc7_phy; + + /* Per TC(Traffic Class) pause Counters */ + u64 rx_pause_tc0_phy; + u64 tx_pause_tc0_phy; + u64 rx_pause_tc1_phy; + u64 tx_pause_tc1_phy; + u64 rx_pause_tc2_phy; + u64 tx_pause_tc2_phy; + u64 rx_pause_tc3_phy; + u64 tx_pause_tc3_phy; + u64 rx_pause_tc4_phy; + u64 tx_pause_tc4_phy; + u64 rx_pause_tc5_phy; + u64 tx_pause_tc5_phy; + u64 rx_pause_tc6_phy; + u64 tx_pause_tc6_phy; + u64 rx_pause_tc7_phy; + u64 tx_pause_tc7_phy; +}; /* HW DATA */ + /* Configure vPort Rx Steering */ struct mana_cfg_rx_steer_req_v2 { struct gdma_req_hdr hdr; diff --git a/include/net/mctp.h b/include/net/mctp.h index 07d458990113..c3207ce98f07 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -69,7 +69,10 @@ struct mctp_sock { /* bind() params */ unsigned int bind_net; - mctp_eid_t bind_addr; + mctp_eid_t bind_local_addr; + mctp_eid_t bind_peer_addr; + unsigned int bind_peer_net; + bool bind_peer_set; __u8 bind_type; /* sendmsg()/recvmsg() uses struct sockaddr_mctp_ext */ @@ -183,8 +186,8 @@ struct mctp_sk_key { struct mctp_skb_cb { unsigned int magic; unsigned int net; - int ifindex; /* extended/direct addressing if set */ - mctp_eid_t src; + /* fields below provide extended addressing for ingress to recvmsg() */ + int ifindex; unsigned char halen; unsigned char haddr[MAX_ADDR_LEN]; }; @@ -222,6 +225,8 @@ struct mctp_flow { struct mctp_sk_key *key; }; +struct mctp_dst; + /* Route definition. * * These are held in the pernet->mctp.routes list, with RCU protection for @@ -229,16 +234,25 @@ struct mctp_flow { * dropped on NETDEV_UNREGISTER events. * * Updates to the route table are performed under rtnl; all reads under RCU, - * so routes cannot be referenced over a RCU grace period. Specifically: A - * caller cannot block between mctp_route_lookup and mctp_route_release() + * so routes cannot be referenced over a RCU grace period. */ struct mctp_route { mctp_eid_t min, max; unsigned char type; + unsigned int mtu; - struct mctp_dev *dev; - int (*output)(struct mctp_route *route, + + enum { + MCTP_ROUTE_DIRECT, + MCTP_ROUTE_GATEWAY, + } dst_type; + union { + struct mctp_dev *dev; + struct mctp_fq_addr gateway; + }; + + int (*output)(struct mctp_dst *dst, struct sk_buff *skb); struct list_head list; @@ -246,12 +260,35 @@ struct mctp_route { struct rcu_head rcu; }; +/* Route lookup result: dst. Represents the results of a routing decision, + * but is only held over the individual routing operation. + * + * Will typically be stored on the caller stack, and must be released after + * usage. + */ +struct mctp_dst { + struct mctp_dev *dev; + unsigned int mtu; + mctp_eid_t nexthop; + + /* set for direct addressing */ + unsigned char halen; + unsigned char haddr[MAX_ADDR_LEN]; + + int (*output)(struct mctp_dst *dst, struct sk_buff *skb); +}; + +int mctp_dst_from_extaddr(struct mctp_dst *dst, struct net *net, int ifindex, + unsigned char halen, const unsigned char *haddr); + /* route interfaces */ -struct mctp_route *mctp_route_lookup(struct net *net, unsigned int dnet, - mctp_eid_t daddr); +int mctp_route_lookup(struct net *net, unsigned int dnet, + mctp_eid_t daddr, struct mctp_dst *dst); + +void mctp_dst_release(struct mctp_dst *dst); /* always takes ownership of skb */ -int mctp_local_output(struct sock *sk, struct mctp_route *rt, +int mctp_local_output(struct sock *sk, struct mctp_dst *dst, struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag); void mctp_key_unref(struct mctp_sk_key *key); diff --git a/include/net/mptcp.h b/include/net/mptcp.h index bfbad695951c..f7263fe2a2e4 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -101,18 +101,9 @@ struct mptcp_out_options { #define MPTCP_SCHED_MAX 128 #define MPTCP_SCHED_BUF_MAX (MPTCP_SCHED_NAME_MAX * MPTCP_SCHED_MAX) -#define MPTCP_SUBFLOWS_MAX 8 - -struct mptcp_sched_data { - u8 subflows; - struct mptcp_subflow_context *contexts[MPTCP_SUBFLOWS_MAX]; -}; - struct mptcp_sched_ops { - int (*get_send)(struct mptcp_sock *msk, - struct mptcp_sched_data *data); - int (*get_retrans)(struct mptcp_sock *msk, - struct mptcp_sched_data *data); + int (*get_send)(struct mptcp_sock *msk); + int (*get_retrans)(struct mptcp_sock *msk); char name[MPTCP_SCHED_NAME_MAX]; struct module *owner; diff --git a/include/net/ndisc.h b/include/net/ndisc.h index 3c88d5bc5eed..d38783a2ce57 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -60,15 +60,6 @@ enum { #include <net/neighbour.h> -/* Set to 3 to get tracing... */ -#define ND_DEBUG 1 - -#define ND_PRINTK(val, level, fmt, ...) \ -do { \ - if (val <= ND_DEBUG) \ - net_##level##_ratelimited(fmt, ##__VA_ARGS__); \ -} while (0) - struct ctl_table; struct inet6_dev; struct net_device; diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 9a832cab5b1d..4a30bd458c5a 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -176,12 +176,17 @@ struct neigh_ops { }; struct pneigh_entry { - struct pneigh_entry *next; + struct pneigh_entry __rcu *next; possible_net_t net; struct net_device *dev; netdevice_tracker dev_tracker; + union { + struct list_head free_node; + struct rcu_head rcu; + }; u32 flags; u8 protocol; + bool permanent; u32 key[]; }; @@ -235,7 +240,8 @@ struct neigh_table { unsigned long last_rand; struct neigh_statistics __percpu *stats; struct neigh_hash_table __rcu *nht; - struct pneigh_entry **phash_buckets; + struct mutex phash_lock; + struct pneigh_entry __rcu **phash_buckets; }; static inline int neigh_parms_family(struct neigh_parms *p) @@ -260,13 +266,15 @@ static inline void *neighbour_priv(const struct neighbour *n) #define NEIGH_UPDATE_F_EXT_LEARNED BIT(5) #define NEIGH_UPDATE_F_ISROUTER BIT(6) #define NEIGH_UPDATE_F_ADMIN BIT(7) +#define NEIGH_UPDATE_F_EXT_VALIDATED BIT(8) /* In-kernel representation for NDA_FLAGS_EXT flags: */ #define NTF_OLD_MASK 0xff #define NTF_EXT_SHIFT 8 -#define NTF_EXT_MASK (NTF_EXT_MANAGED) +#define NTF_EXT_MASK (NTF_EXT_MANAGED | NTF_EXT_EXT_VALIDATED) #define NTF_MANAGED (NTF_EXT_MANAGED << NTF_EXT_SHIFT) +#define NTF_EXT_VALIDATED (NTF_EXT_EXT_VALIDATED << NTF_EXT_SHIFT) extern const struct nla_policy nda_policy[]; @@ -373,10 +381,10 @@ unsigned long neigh_rand_reach_time(unsigned long base); void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb); struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, - const void *key, struct net_device *dev, - int creat); -struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, struct net *net, - const void *key, struct net_device *dev); + const void *key, struct net_device *dev); +int pneigh_create(struct neigh_table *tbl, struct net *net, const void *key, + struct net_device *dev, u32 flags, u8 protocol, + bool permanent); int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index bd57d8fb54f1..025a7574b275 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -475,8 +475,8 @@ struct pernet_operations { void (*exit)(struct net *net); void (*exit_batch)(struct list_head *net_exit_list); /* Following method is called with RTNL held. */ - void (*exit_batch_rtnl)(struct list_head *net_exit_list, - struct list_head *dev_kill_list); + void (*exit_rtnl)(struct net *net, + struct list_head *dev_kill_list); unsigned int * const id; const size_t size; }; diff --git a/include/net/netdev_lock.h b/include/net/netdev_lock.h index 1c0c9a94cc22..3d3aef80beac 100644 --- a/include/net/netdev_lock.h +++ b/include/net/netdev_lock.h @@ -48,6 +48,22 @@ static inline void netdev_unlock_ops(struct net_device *dev) netdev_unlock(dev); } +static inline void netdev_lock_ops_to_full(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_assert_locked(dev); + else + netdev_lock(dev); +} + +static inline void netdev_unlock_full_to_ops(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_assert_locked(dev); + else + netdev_unlock(dev); +} + static inline void netdev_ops_assert_locked(const struct net_device *dev) { if (netdev_need_ops_lock(dev)) @@ -64,19 +80,34 @@ netdev_ops_assert_locked_or_invisible(const struct net_device *dev) netdev_ops_assert_locked(dev); } +static inline void netdev_lock_ops_compat(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_lock(dev); + else + rtnl_lock(); +} + +static inline void netdev_unlock_ops_compat(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_unlock(dev); + else + rtnl_unlock(); +} + static inline int netdev_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b) { - /* Only lower devices currently grab the instance lock, so no - * real ordering issues can occur. In the near future, only - * hardware devices will grab instance lock which also does not - * involve any ordering. Suppress lockdep ordering warnings - * until (if) we start grabbing instance lock on pure SW - * devices (bond/team/veth/etc). - */ if (a == b) return 0; - return -1; + + /* Allow locking multiple devices only under rtnl_lock, + * the exact order doesn't matter. + * Note that upper devices don't lock their ops, so nesting + * mostly happens in batched device removal for now. + */ + return lockdep_rtnl_is_held() ? -1 : 1; } #define netdev_lockdep_set_classes(dev) \ @@ -98,4 +129,10 @@ static inline int netdev_lock_cmp_fn(const struct lockdep_map *a, &qdisc_xmit_lock_key); \ } +#define netdev_lock_dereference(p, dev) \ + rcu_dereference_protected(p, lockdep_is_held(&(dev)->lock)) + +int netdev_debug_event(struct notifier_block *nb, unsigned long event, + void *ptr); + #endif diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h index 825141d675e5..6e835972abd1 100644 --- a/include/net/netdev_queues.h +++ b/include/net/netdev_queues.h @@ -85,9 +85,11 @@ struct netdev_queue_stats_tx { * for some of the events is not maintained, and reliable "total" cannot * be provided). * + * Ops are called under the instance lock if netdev_need_ops_lock() + * returns true, otherwise under rtnl_lock. * Device drivers can assume that when collecting total device stats, * the @get_base_stats and subsequent per-queue calls are performed - * "atomically" (without releasing the rtnl_lock). + * "atomically" (without releasing the relevant lock). * * Device drivers are encouraged to reset the per-queue statistics when * number of queues change. This is because the primary use case for @@ -103,6 +105,12 @@ struct netdev_stat_ops { struct netdev_queue_stats_tx *tx); }; +void netdev_stat_queue_sum(struct net_device *netdev, + int rx_start, int rx_end, + struct netdev_queue_stats_rx *rx_sum, + int tx_start, int tx_end, + struct netdev_queue_stats_tx *tx_sum); + /** * struct netdev_queue_mgmt_ops - netdev ops for queue management * @@ -280,27 +288,36 @@ netdev_txq_completed_mb(struct netdev_queue *dev_queue, #define netif_subqueue_try_stop(dev, idx, get_desc, start_thrs) \ ({ \ - struct netdev_queue *txq; \ + struct netdev_queue *_txq; \ \ - txq = netdev_get_tx_queue(dev, idx); \ - netif_txq_try_stop(txq, get_desc, start_thrs); \ + _txq = netdev_get_tx_queue(dev, idx); \ + netif_txq_try_stop(_txq, get_desc, start_thrs); \ }) +static inline void netif_subqueue_sent(const struct net_device *dev, + unsigned int idx, unsigned int bytes) +{ + struct netdev_queue *txq; + + txq = netdev_get_tx_queue(dev, idx); + netdev_tx_sent_queue(txq, bytes); +} + #define netif_subqueue_maybe_stop(dev, idx, get_desc, stop_thrs, start_thrs) \ ({ \ - struct netdev_queue *txq; \ + struct netdev_queue *_txq; \ \ - txq = netdev_get_tx_queue(dev, idx); \ - netif_txq_maybe_stop(txq, get_desc, stop_thrs, start_thrs); \ + _txq = netdev_get_tx_queue(dev, idx); \ + netif_txq_maybe_stop(_txq, get_desc, stop_thrs, start_thrs); \ }) #define netif_subqueue_completed_wake(dev, idx, pkts, bytes, \ get_desc, start_thrs) \ ({ \ - struct netdev_queue *txq; \ + struct netdev_queue *_txq; \ \ - txq = netdev_get_tx_queue(dev, idx); \ - netif_txq_completed_wake(txq, pkts, bytes, \ + _txq = netdev_get_tx_queue(dev, idx); \ + netif_txq_completed_wake(_txq, pkts, bytes, \ get_desc, start_thrs); \ }) diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h index b2238b551dce..8cdcd138b33f 100644 --- a/include/net/netdev_rx_queue.h +++ b/include/net/netdev_rx_queue.h @@ -20,12 +20,12 @@ struct netdev_rx_queue { struct net_device *dev; netdevice_tracker dev_tracker; + /* All fields below are "ops protected", + * see comment about net_device::lock + */ #ifdef CONFIG_XDP_SOCKETS struct xsk_buff_pool *pool; #endif - /* NAPI instance for the queue - * "ops protected", see comment about net_device::lock - */ struct napi_struct *napi; struct pp_memory_provider_params mp_params; } ____cacheline_aligned_in_smp; diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h index 2c8c2b023848..8d65ffbf57de 100644 --- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h +++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h @@ -13,9 +13,6 @@ extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp; extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp; extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp; -#ifdef CONFIG_NF_CT_PROTO_DCCP -extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp; -#endif #ifdef CONFIG_NF_CT_PROTO_SCTP extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp; #endif diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 3f02a45773e8..aa0a7c82199e 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -18,7 +18,6 @@ #include <linux/netfilter/nf_conntrack_common.h> #include <linux/netfilter/nf_conntrack_tcp.h> -#include <linux/netfilter/nf_conntrack_dccp.h> #include <linux/netfilter/nf_conntrack_sctp.h> #include <linux/netfilter/nf_conntrack_proto_gre.h> @@ -31,7 +30,6 @@ struct nf_ct_udp { /* per conntrack: protocol private data */ union nf_conntrack_proto { /* insert conntrack proto private data here */ - struct nf_ct_dccp dccp; struct ip_ct_sctp sctp; struct ip_ct_tcp tcp; struct nf_ct_udp udp; @@ -306,8 +304,19 @@ static inline bool nf_ct_is_expired(const struct nf_conn *ct) /* use after obtaining a reference count */ static inline bool nf_ct_should_gc(const struct nf_conn *ct) { - return nf_ct_is_expired(ct) && nf_ct_is_confirmed(ct) && - !nf_ct_is_dying(ct); + if (!nf_ct_is_confirmed(ct)) + return false; + + /* load ct->timeout after is_confirmed() test. + * Pairs with __nf_conntrack_confirm() which: + * 1. Increases ct->timeout value + * 2. Inserts ct into rcu hlist + * 3. Sets the confirmed bit + * 4. Unlocks the hlist lock + */ + smp_acquire__after_ctrl_dep(); + + return nf_ct_is_expired(ct) && !nf_ct_is_dying(ct); } #define NF_CT_DAY (86400 * HZ) diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 1f47bef51722..6929f8daf1ed 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -117,11 +117,6 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); -int nf_conntrack_dccp_packet(struct nf_conn *ct, - struct sk_buff *skb, - unsigned int dataoff, - enum ip_conntrack_info ctinfo, - const struct nf_hook_state *state); int nf_conntrack_sctp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, @@ -137,7 +132,6 @@ void nf_conntrack_generic_init_net(struct net *net); void nf_conntrack_tcp_init_net(struct net *net); void nf_conntrack_udp_init_net(struct net *net); void nf_conntrack_gre_init_net(struct net *net); -void nf_conntrack_dccp_init_net(struct net *net); void nf_conntrack_sctp_init_net(struct net *net); void nf_conntrack_icmp_init_net(struct net *net); void nf_conntrack_icmpv6_init_net(struct net *net); @@ -223,13 +217,6 @@ static inline bool nf_conntrack_tcp_established(const struct nf_conn *ct) } #endif -#ifdef CONFIG_NF_CT_PROTO_DCCP -static inline struct nf_dccp_net *nf_dccp_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.dccp; -} -#endif - #ifdef CONFIG_NF_CT_PROTO_SCTP static inline struct nf_sctp_net *nf_sctp_pernet(struct net *net) { diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index d711642e78b5..c003cd194fa2 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -370,7 +370,7 @@ static inline __be16 __nf_flow_pppoe_proto(const struct sk_buff *skb) static inline bool nf_flow_pppoe_proto(struct sk_buff *skb, __be16 *inner_proto) { - if (!pskb_may_pull(skb, PPPOE_SES_HLEN)) + if (!pskb_may_pull(skb, ETH_HLEN + PPPOE_SES_HLEN)) return false; *inner_proto = __nf_flow_pppoe_proto(skb); diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h index e55eedc84ed7..00506792a06d 100644 --- a/include/net/netfilter/nf_log.h +++ b/include/net/netfilter/nf_log.h @@ -59,6 +59,9 @@ extern int sysctl_nf_log_all_netns; int nf_log_register(u_int8_t pf, struct nf_logger *logger); void nf_log_unregister(struct nf_logger *logger); +/* Check if any logger is registered for a given protocol family. */ +bool nf_log_is_registered(u_int8_t pf); + int nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger); void nf_log_unset(struct net *net, const struct nf_logger *logger); diff --git a/include/net/netfilter/nf_reject.h b/include/net/netfilter/nf_reject.h index 7c669792fb9c..f1db33bc6bf8 100644 --- a/include/net/netfilter/nf_reject.h +++ b/include/net/netfilter/nf_reject.h @@ -34,7 +34,6 @@ static inline bool nf_reject_verify_csum(struct sk_buff *skb, int dataoff, /* Protocols with partial checksums. */ case IPPROTO_UDPLITE: - case IPPROTO_DCCP: return false; } return true; diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 803d5f1601f9..891e43a01bdc 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -459,19 +459,13 @@ struct nft_set_ext; * control plane functions. */ struct nft_set_ops { - bool (*lookup)(const struct net *net, + const struct nft_set_ext * (*lookup)(const struct net *net, const struct nft_set *set, + const u32 *key); + const struct nft_set_ext * (*update)(struct nft_set *set, const u32 *key, - const struct nft_set_ext **ext); - bool (*update)(struct nft_set *set, - const u32 *key, - struct nft_elem_priv * - (*new)(struct nft_set *, - const struct nft_expr *, - struct nft_regs *), const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_set_ext **ext); + struct nft_regs *regs); bool (*delete)(const struct nft_set *set, const u32 *key); @@ -1199,12 +1193,17 @@ struct nft_stats { struct nft_hook { struct list_head list; - struct nf_hook_ops ops; + struct list_head ops_list; struct rcu_head rcu; char ifname[IFNAMSIZ]; u8 ifnamelen; }; +struct nf_hook_ops *nft_hook_find_ops(const struct nft_hook *hook, + const struct net_device *dev); +struct nf_hook_ops *nft_hook_find_ops_rcu(const struct nft_hook *hook, + const struct net_device *dev); + /** * struct nft_base_chain - nf_tables base chain * @@ -1934,11 +1933,6 @@ static inline u64 nft_net_tstamp(const struct net *net) #define __NFT_REDUCE_READONLY 1UL #define NFT_REDUCE_READONLY (void *)__NFT_REDUCE_READONLY -static inline bool nft_reduce_is_readonly(const struct nft_expr *expr) -{ - return expr->ops->reduce == NFT_REDUCE_READONLY; -} - void nft_reg_track_update(struct nft_regs_track *track, const struct nft_expr *expr, u8 dreg, u8 len); void nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg, u8 len); diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 03b6165756fc..6c2f483d9828 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -94,34 +94,41 @@ extern const struct nft_set_type nft_set_pipapo_type; extern const struct nft_set_type nft_set_pipapo_avx2_type; #ifdef CONFIG_MITIGATION_RETPOLINE -bool nft_rhash_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); -bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); -bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); -bool nft_hash_lookup_fast(const struct net *net, - const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); -bool nft_hash_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); -bool nft_set_do_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); +const struct nft_set_ext * +nft_rhash_lookup(const struct net *net, const struct nft_set *set, + const u32 *key); +const struct nft_set_ext * +nft_rbtree_lookup(const struct net *net, const struct nft_set *set, + const u32 *key); +const struct nft_set_ext * +nft_bitmap_lookup(const struct net *net, const struct nft_set *set, + const u32 *key); +const struct nft_set_ext * +nft_hash_lookup_fast(const struct net *net, const struct nft_set *set, + const u32 *key); +const struct nft_set_ext * +nft_hash_lookup(const struct net *net, const struct nft_set *set, + const u32 *key); +const struct nft_set_ext * +nft_set_do_lookup(const struct net *net, const struct nft_set *set, + const u32 *key); #else -static inline bool +static inline const struct nft_set_ext * nft_set_do_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) + const u32 *key) { - return set->ops->lookup(net, set, key, ext); + return set->ops->lookup(net, set, key); } #endif /* called from nft_pipapo_avx2.c */ -bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); +const struct nft_set_ext * +nft_pipapo_lookup(const struct net *net, const struct nft_set *set, + const u32 *key); /* called from nft_set_pipapo.c */ -bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); +const struct nft_set_ext * +nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, + const u32 *key); void nft_counter_init_seqcount(void); @@ -181,4 +188,7 @@ void nft_objref_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); void nft_objref_map_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); +struct nft_elem_priv *nft_dynset_new(struct nft_set *set, + const struct nft_expr *expr, + struct nft_regs *regs); #endif /* _NET_NF_TABLES_CORE_H */ diff --git a/include/net/netfilter/nft_fib.h b/include/net/netfilter/nft_fib.h index 6e202ed5e63f..7370fba844ef 100644 --- a/include/net/netfilter/nft_fib.h +++ b/include/net/netfilter/nft_fib.h @@ -2,6 +2,7 @@ #ifndef _NFT_FIB_H_ #define _NFT_FIB_H_ +#include <net/l3mdev.h> #include <net/netfilter/nf_tables.h> struct nft_fib { @@ -39,6 +40,14 @@ static inline bool nft_fib_can_skip(const struct nft_pktinfo *pkt) return nft_fib_is_loopback(pkt->skb, indev); } +static inline int nft_fib_l3mdev_master_ifindex_rcu(const struct nft_pktinfo *pkt, + const struct net_device *iif) +{ + const struct net_device *dev = iif ? iif : pkt->skb->dev; + + return l3mdev_master_ifindex_rcu(dev); +} + int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset); int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]); diff --git a/include/net/netlink.h b/include/net/netlink.h index 29e0db940382..1a8356ca4b78 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -68,6 +68,8 @@ * nlmsg_for_each_msg() loop over all messages * nlmsg_validate() validate netlink message incl. attrs * nlmsg_for_each_attr() loop over all attributes + * nlmsg_for_each_attr_type() loop over all attributes with the + * given type * * Misc: * nlmsg_report() report back to application? @@ -321,7 +323,13 @@ enum nla_policy_validation { * All other Unused - but note that it's a union * * Meaning of `validate' field, use via NLA_POLICY_VALIDATE_FN: + * NLA_U8, NLA_U16, + * NLA_U32, NLA_U64, + * NLA_S8, NLA_S16, + * NLA_S32, NLA_S64, + * NLA_MSECS, * NLA_BINARY Validation function called for the attribute. + * * All other Unused - but note that it's a union * * Example: @@ -612,6 +620,22 @@ static inline int nlmsg_len(const struct nlmsghdr *nlh) } /** + * nlmsg_payload - message payload if the data fits in the len + * @nlh: netlink message header + * @len: struct length + * + * Returns: The netlink message payload/data if the length is sufficient, + * otherwise NULL. + */ +static inline void *nlmsg_payload(const struct nlmsghdr *nlh, size_t len) +{ + if (nlh->nlmsg_len < nlmsg_msg_size(len)) + return NULL; + + return nlmsg_data(nlh); +} + +/** * nlmsg_attrdata - head of attributes data * @nlh: netlink message header * @hdrlen: length of family specific header @@ -945,6 +969,18 @@ static inline u32 nlmsg_seq(const struct nlmsghdr *nlh) nlmsg_attrlen(nlh, hdrlen), rem) /** + * nlmsg_for_each_attr_type - iterate over a stream of attributes + * @pos: loop counter, set to the current attribute + * @type: required attribute type for @pos + * @nlh: netlink message header + * @hdrlen: length of the family specific header + * @rem: initialized to len, holds bytes currently remaining in stream + */ +#define nlmsg_for_each_attr_type(pos, type, nlh, hdrlen, rem) \ + nlmsg_for_each_attr(pos, nlh, hdrlen, rem) \ + if (nla_type(pos) == type) + +/** * nlmsg_put - Add a new netlink message to an skb * @skb: socket buffer to store message in * @portid: netlink PORTID of requesting application diff --git a/include/net/netmem.h b/include/net/netmem.h index c61d5b21e7b4..f7dacc9e75fd 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -8,9 +8,54 @@ #ifndef _NET_NETMEM_H #define _NET_NETMEM_H +#include <linux/dma-mapping.h> #include <linux/mm.h> #include <net/net_debug.h> +/* These fields in struct page are used by the page_pool and net stack: + * + * struct { + * unsigned long pp_magic; + * struct page_pool *pp; + * unsigned long _pp_mapping_pad; + * unsigned long dma_addr; + * atomic_long_t pp_ref_count; + * }; + * + * We mirror the page_pool fields here so the page_pool can access these + * fields without worrying whether the underlying fields belong to a + * page or netmem_desc. + * + * CAUTION: Do not update the fields in netmem_desc without also + * updating the anonymous aliasing union in struct net_iov. + */ +struct netmem_desc { + unsigned long _flags; + unsigned long pp_magic; + struct page_pool *pp; + unsigned long _pp_mapping_pad; + unsigned long dma_addr; + atomic_long_t pp_ref_count; +}; + +#define NETMEM_DESC_ASSERT_OFFSET(pg, desc) \ + static_assert(offsetof(struct page, pg) == \ + offsetof(struct netmem_desc, desc)) +NETMEM_DESC_ASSERT_OFFSET(flags, _flags); +NETMEM_DESC_ASSERT_OFFSET(pp_magic, pp_magic); +NETMEM_DESC_ASSERT_OFFSET(pp, pp); +NETMEM_DESC_ASSERT_OFFSET(_pp_mapping_pad, _pp_mapping_pad); +NETMEM_DESC_ASSERT_OFFSET(dma_addr, dma_addr); +NETMEM_DESC_ASSERT_OFFSET(pp_ref_count, pp_ref_count); +#undef NETMEM_DESC_ASSERT_OFFSET + +/* + * Since struct netmem_desc uses the space in struct page, the size + * should be checked, until struct netmem_desc has its own instance from + * slab, to avoid conflicting with other members within struct page. + */ +static_assert(sizeof(struct netmem_desc) <= offsetof(struct page, _refcount)); + /* net_iov */ DECLARE_STATIC_KEY_FALSE(page_pool_mem_providers); @@ -20,13 +65,57 @@ DECLARE_STATIC_KEY_FALSE(page_pool_mem_providers); */ #define NET_IOV 0x01UL +enum net_iov_type { + NET_IOV_DMABUF, + NET_IOV_IOURING, + + /* Force size to unsigned long to make the NET_IOV_ASSERTS below pass. + */ + NET_IOV_MAX = ULONG_MAX +}; + +/* A memory descriptor representing abstract networking I/O vectors, + * generally for non-pages memory that doesn't have its corresponding + * struct page and needs to be explicitly allocated through slab. + * + * net_iovs are allocated and used by networking code, and the size of + * the chunk is PAGE_SIZE. + * + * This memory can be any form of non-struct paged memory. Examples + * include imported dmabuf memory and imported io_uring memory. See + * net_iov_type for all the supported types. + * + * @pp_magic: pp field, similar to the one in struct page/struct + * netmem_desc. + * @pp: the pp this net_iov belongs to, if any. + * @dma_addr: the dma addrs of the net_iov. Needed for the network + * card to send/receive this net_iov. + * @pp_ref_count: the pp ref count of this net_iov, exactly the same + * usage as struct page/struct netmem_desc. + * @owner: the net_iov_area this net_iov belongs to, if any. + * @type: the type of the memory. Different types of net_iovs are + * supported. + */ struct net_iov { - unsigned long __unused_padding; - unsigned long pp_magic; - struct page_pool *pp; + union { + struct netmem_desc desc; + + /* XXX: The following part should be removed once all + * the references to them are converted so as to be + * accessed via netmem_desc e.g. niov->desc.pp instead + * of niov->pp. + */ + struct { + unsigned long _flags; + unsigned long pp_magic; + struct page_pool *pp; + unsigned long _pp_mapping_pad; + unsigned long dma_addr; + atomic_long_t pp_ref_count; + }; + }; struct net_iov_area *owner; - unsigned long dma_addr; - atomic_long_t pp_ref_count; + enum net_iov_type type; }; struct net_iov_area { @@ -38,27 +127,22 @@ struct net_iov_area { unsigned long base_virtual; }; -/* These fields in struct page are used by the page_pool and net stack: +/* net_iov is union'ed with struct netmem_desc mirroring struct page, so + * the page_pool can access these fields without worrying whether the + * underlying fields are accessed via netmem_desc or directly via + * net_iov, until all the references to them are converted so as to be + * accessed via netmem_desc e.g. niov->desc.pp instead of niov->pp. * - * struct { - * unsigned long pp_magic; - * struct page_pool *pp; - * unsigned long _pp_mapping_pad; - * unsigned long dma_addr; - * atomic_long_t pp_ref_count; - * }; - * - * We mirror the page_pool fields here so the page_pool can access these fields - * without worrying whether the underlying fields belong to a page or net_iov. - * - * The non-net stack fields of struct page are private to the mm stack and must - * never be mirrored to net_iov. + * The non-net stack fields of struct page are private to the mm stack + * and must never be mirrored to net_iov. */ -#define NET_IOV_ASSERT_OFFSET(pg, iov) \ - static_assert(offsetof(struct page, pg) == \ +#define NET_IOV_ASSERT_OFFSET(desc, iov) \ + static_assert(offsetof(struct netmem_desc, desc) == \ offsetof(struct net_iov, iov)) +NET_IOV_ASSERT_OFFSET(_flags, _flags); NET_IOV_ASSERT_OFFSET(pp_magic, pp_magic); NET_IOV_ASSERT_OFFSET(pp, pp); +NET_IOV_ASSERT_OFFSET(_pp_mapping_pad, _pp_mapping_pad); NET_IOV_ASSERT_OFFSET(dma_addr, dma_addr); NET_IOV_ASSERT_OFFSET(pp_ref_count, pp_ref_count); #undef NET_IOV_ASSERT_OFFSET @@ -79,8 +163,7 @@ static inline unsigned int net_iov_idx(const struct net_iov *niov) * typedef netmem_ref - a nonexistent type marking a reference to generic * network memory. * - * A netmem_ref currently is always a reference to a struct page. This - * abstraction is introduced so support for new memory types can be added. + * A netmem_ref can be a struct page* or a struct net_iov* underneath. * * Use the supplied helpers to obtain the underlying memory pointer and fields. */ @@ -107,9 +190,6 @@ static inline struct page *__netmem_to_page(netmem_ref netmem) return (__force struct page *)netmem; } -/* This conversion fails (returns NULL) if the netmem_ref is not struct page - * backed. - */ static inline struct page *netmem_to_page(netmem_ref netmem) { if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) @@ -133,10 +213,9 @@ static inline netmem_ref net_iov_to_netmem(struct net_iov *niov) return (__force netmem_ref)((unsigned long)niov | NET_IOV); } -static inline netmem_ref page_to_netmem(struct page *page) -{ - return (__force netmem_ref)page; -} +#define page_to_netmem(p) (_Generic((p), \ + const struct page * : (__force const netmem_ref)(p), \ + struct page * : (__force netmem_ref)(p))) /** * virt_to_netmem - convert virtual memory pointer to a netmem reference @@ -168,11 +247,61 @@ static inline unsigned long netmem_pfn_trace(netmem_ref netmem) return page_to_pfn(netmem_to_page(netmem)); } +/** + * __netmem_to_nmdesc - unsafely get pointer to the &netmem_desc backing + * @netmem + * @netmem: netmem reference to convert + * + * Unsafe version that can be used only when @netmem is always backed by + * system memory, performs faster and generates smaller object code (no + * check for the LSB, no WARN). When @netmem points to IOV, provokes + * undefined behaviour. + * + * Return: pointer to the &netmem_desc (garbage if @netmem is not backed + * by system memory). + */ +static inline struct netmem_desc *__netmem_to_nmdesc(netmem_ref netmem) +{ + return (__force struct netmem_desc *)netmem; +} + +/* __netmem_clear_lsb - convert netmem_ref to struct net_iov * for access to + * common fields. + * @netmem: netmem reference to extract as net_iov. + * + * All the sub types of netmem_ref (page, net_iov) have the same pp, pp_magic, + * dma_addr, and pp_ref_count fields at the same offsets. Thus, we can access + * these fields without a type check to make sure that the underlying mem is + * net_iov or page. + * + * The resulting value of this function can only be used to access the fields + * that are NET_IOV_ASSERT_OFFSET'd. Accessing any other fields will result in + * undefined behavior. + * + * Return: the netmem_ref cast to net_iov* regardless of its underlying type. + */ static inline struct net_iov *__netmem_clear_lsb(netmem_ref netmem) { return (struct net_iov *)((__force unsigned long)netmem & ~NET_IOV); } +/* XXX: How to extract netmem_desc from page must be changed, once + * netmem_desc no longer overlays on page and will be allocated through + * slab. + */ +#define __pp_page_to_nmdesc(p) (_Generic((p), \ + const struct page * : (const struct netmem_desc *)(p), \ + struct page * : (struct netmem_desc *)(p))) + +/* CAUTION: Check if the page is a pp page before calling this helper or + * know it's a pp page. + */ +#define pp_page_to_nmdesc(p) \ +({ \ + DEBUG_NET_WARN_ON_ONCE(!page_pool_page_is_pp(p)); \ + __pp_page_to_nmdesc(p); \ +}) + /** * __netmem_get_pp - unsafely get pointer to the &page_pool backing @netmem * @netmem: netmem reference to get the pointer from @@ -186,7 +315,7 @@ static inline struct net_iov *__netmem_clear_lsb(netmem_ref netmem) */ static inline struct page_pool *__netmem_get_pp(netmem_ref netmem) { - return __netmem_to_page(netmem)->pp; + return __netmem_to_nmdesc(netmem)->pp; } static inline struct page_pool *netmem_get_pp(netmem_ref netmem) @@ -264,4 +393,26 @@ static inline unsigned long netmem_get_dma_addr(netmem_ref netmem) return __netmem_clear_lsb(netmem)->dma_addr; } +void get_netmem(netmem_ref netmem); +void put_netmem(netmem_ref netmem); + +#define netmem_dma_unmap_addr_set(NETMEM, PTR, ADDR_NAME, VAL) \ + do { \ + if (!netmem_is_net_iov(NETMEM)) \ + dma_unmap_addr_set(PTR, ADDR_NAME, VAL); \ + else \ + dma_unmap_addr_set(PTR, ADDR_NAME, 0); \ + } while (0) + +static inline void netmem_dma_unmap_page_attrs(struct device *dev, + dma_addr_t addr, size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + if (!addr) + return; + + dma_unmap_page_attrs(dev, addr, size, dir, attrs); +} + #endif /* _NET_NETMEM_H */ diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index bae914815aa3..ab74b5ed0b01 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -7,9 +7,6 @@ #include <linux/atomic.h> #include <linux/workqueue.h> #include <linux/netfilter/nf_conntrack_tcp.h> -#ifdef CONFIG_NF_CT_PROTO_DCCP -#include <linux/netfilter/nf_conntrack_dccp.h> -#endif #ifdef CONFIG_NF_CT_PROTO_SCTP #include <linux/netfilter/nf_conntrack_sctp.h> #endif @@ -50,13 +47,6 @@ struct nf_icmp_net { unsigned int timeout; }; -#ifdef CONFIG_NF_CT_PROTO_DCCP -struct nf_dccp_net { - u8 dccp_loose; - unsigned int dccp_timeout[CT_DCCP_MAX + 1]; -}; -#endif - #ifdef CONFIG_NF_CT_PROTO_SCTP struct nf_sctp_net { unsigned int timeouts[SCTP_CONNTRACK_MAX]; @@ -82,9 +72,6 @@ struct nf_ip_net { struct nf_udp_net udp; struct nf_icmp_net icmp; struct nf_icmp_net icmpv6; -#ifdef CONFIG_NF_CT_PROTO_DCCP - struct nf_dccp_net dccp; -#endif #ifdef CONFIG_NF_CT_PROTO_SCTP struct nf_sctp_net sctp; #endif diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 650b2dc9199f..6373e3f17da8 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -47,6 +47,11 @@ struct sysctl_fib_multipath_hash_seed { }; #endif +struct udp_tunnel_gro { + struct sock __rcu *sk; + struct hlist_head list; +}; + struct netns_ipv4 { /* Cacheline organization can be found documented in * Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst. @@ -85,6 +90,11 @@ struct netns_ipv4 { struct inet_timewait_death_row tcp_death_row; struct udp_table *udp_table; +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) + /* Not in a pernet subsys because need to be available at GRO stage */ + struct udp_tunnel_gro udp_tunnel_gro[2]; +#endif + #ifdef CONFIG_SYSCTL struct ctl_table_header *forw_hdr; struct ctl_table_header *frags_hdr; @@ -277,4 +287,5 @@ struct netns_ipv4 { struct hlist_head *inet_addr_lst; struct delayed_work addr_chk_work; }; + #endif diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 5f2cfd84570a..47dc70d8100a 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -72,6 +72,7 @@ struct netns_ipv6 { struct rt6_statistics *rt6_stats; struct timer_list ip6_fib_timer; struct hlist_head *fib_table_hash; + spinlock_t fib_table_hash_lock; struct fib6_table *fib6_main_tbl; struct list_head fib6_walkers; rwlock_t fib6_walker_lock; diff --git a/include/net/netns/mctp.h b/include/net/netns/mctp.h index 1db8f9aaddb4..89555f90b97b 100644 --- a/include/net/netns/mctp.h +++ b/include/net/netns/mctp.h @@ -6,19 +6,25 @@ #ifndef __NETNS_MCTP_H__ #define __NETNS_MCTP_H__ +#include <linux/hash.h> +#include <linux/hashtable.h> #include <linux/mutex.h> #include <linux/types.h> +#define MCTP_BINDS_BITS 7 + struct netns_mctp { /* Only updated under RTNL, entries freed via RCU */ struct list_head routes; - /* Bound sockets: list of sockets bound by type. - * This list is updated from non-atomic contexts (under bind_lock), - * and read (under rcu) in packet rx + /* Bound sockets: hash table of sockets, keyed by + * (type, src_eid, dest_eid). + * Specific src_eid/dest_eid entries also have an entry for + * MCTP_ADDR_ANY. This list is updated from non-atomic contexts + * (under bind_lock), and read (under rcu) in packet rx. */ struct mutex bind_lock; - struct hlist_head binds; + DECLARE_HASHTABLE(binds, MCTP_BINDS_BITS); /* tag allocations. This list is read and updated from atomic contexts, * but elements are free()ed after a RCU grace-period @@ -34,4 +40,10 @@ struct netns_mctp { struct list_head neighbours; }; +static inline u32 mctp_bind_hash(u8 type, u8 local_addr, u8 peer_addr) +{ + return hash_32(type | (u32)local_addr << 8 | (u32)peer_addr << 16, + MCTP_BINDS_BITS); +} + #endif /* __NETNS_MCTP_H__ */ diff --git a/include/net/nexthop.h b/include/net/nexthop.h index d9fb44e8b321..572e69cda476 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -152,6 +152,8 @@ struct nexthop { u8 protocol; /* app managing this nh */ u8 nh_flags; bool is_group; + bool dead; + spinlock_t lock; /* protect dead and f6i_list */ refcount_t refcnt; struct rcu_head rcu; diff --git a/include/net/p8022.h b/include/net/p8022.h deleted file mode 100644 index a29e224ac498..000000000000 --- a/include/net/p8022.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NET_P8022_H -#define _NET_P8022_H - -struct net_device; -struct packet_type; -struct sk_buff; - -struct datalink_proto * -register_8022_client(unsigned char type, - int (*func)(struct sk_buff *skb, - struct net_device *dev, - struct packet_type *pt, - struct net_device *orig_dev)); -void unregister_8022_client(struct datalink_proto *proto); -#endif diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index 582a3d00cbe2..db180626be06 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -153,6 +153,13 @@ static inline netmem_ref page_pool_dev_alloc_netmem(struct page_pool *pool, return page_pool_alloc_netmem(pool, offset, size, gfp); } +static inline netmem_ref page_pool_dev_alloc_netmems(struct page_pool *pool) +{ + gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; + + return page_pool_alloc_netmems(pool, gfp); +} + static inline struct page *page_pool_alloc(struct page_pool *pool, unsigned int *offset, unsigned int *size, gfp_t gfp) @@ -395,6 +402,12 @@ static inline void page_pool_recycle_direct(struct page_pool *pool, page_pool_put_full_page(pool, page, true); } +static inline void page_pool_recycle_direct_netmem(struct page_pool *pool, + netmem_ref netmem) +{ + page_pool_put_full_netmem(pool, netmem, true); +} + #define PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA \ (sizeof(dma_addr_t) > sizeof(unsigned long)) @@ -431,12 +444,7 @@ static inline dma_addr_t page_pool_get_dma_addr_netmem(netmem_ref netmem) */ static inline dma_addr_t page_pool_get_dma_addr(const struct page *page) { - dma_addr_t ret = page->dma_addr; - - if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) - ret <<= PAGE_SHIFT; - - return ret; + return page_pool_get_dma_addr_netmem(page_to_netmem(page)); } static inline void __page_pool_dma_sync_for_cpu(const struct page_pool *pool, @@ -492,4 +500,9 @@ static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid) page_pool_update_nid(pool, new_nid); } +static inline bool page_pool_is_unreadable(struct page_pool *pool) +{ + return !!pool->mp_ops; +} + #endif /* _NET_PAGE_POOL_HELPERS_H */ diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h index b3e665897767..ada4f968960a 100644 --- a/include/net/page_pool/memory_provider.h +++ b/include/net/page_pool/memory_provider.h @@ -6,6 +6,7 @@ #include <net/page_pool/types.h> struct netdev_rx_queue; +struct netlink_ext_ack; struct sk_buff; struct memory_provider_ops { @@ -24,8 +25,13 @@ void net_mp_niov_clear_page_pool(struct net_iov *niov); int net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, struct pp_memory_provider_params *p); +int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, + const struct pp_memory_provider_params *p, + struct netlink_ext_ack *extack); void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, struct pp_memory_provider_params *old_p); +void __net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, + const struct pp_memory_provider_params *old_p); /** * net_mp_netmem_place_in_cache() - give a netmem to a page pool diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index 36eb57d73abc..431b593de709 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -6,6 +6,7 @@ #include <linux/dma-direction.h> #include <linux/ptr_ring.h> #include <linux/types.h> +#include <linux/xarray.h> #include <net/netmem.h> #define PP_FLAG_DMA_MAP BIT(0) /* Should page_pool do the DMA @@ -33,6 +34,9 @@ #define PP_FLAG_ALL (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV | \ PP_FLAG_SYSTEM_POOL | PP_FLAG_ALLOW_UNREADABLE_NETMEM) +/* Index limit to stay within PP_DMA_INDEX_BITS for DMA indices */ +#define PP_DMA_INDEX_LIMIT XA_LIMIT(1, BIT(PP_DMA_INDEX_BITS) - 1) + /* * Fast allocation side cache array/stack * @@ -221,6 +225,8 @@ struct page_pool { void *mp_priv; const struct memory_provider_ops *mp_ops; + struct xarray dma_mapped; + #ifdef CONFIG_PAGE_POOL_STATS /* recycle stats are per-cpu to avoid locking */ struct page_pool_recycle_stats __percpu *recycle_stats; diff --git a/include/net/pfcp.h b/include/net/pfcp.h index af14f970b80e..639553797d3e 100644 --- a/include/net/pfcp.h +++ b/include/net/pfcp.h @@ -45,7 +45,7 @@ struct pfcphdr_session { reserved:4; #elif defined(__BIG_ENDIAN_BITFIELD) u8 reserved:4, - message_priprity:4; + message_priority:4; #else #error "Please fix <asm/byteorder>" #endif diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index d7b7b6cd4aa1..8a75c73fc555 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -114,7 +114,6 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct netlink_ext_ack *extack); void qdisc_put_rtab(struct qdisc_rate_table *tab); void qdisc_put_stab(struct qdisc_size_table *tab); -void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc); bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq, spinlock_t *root_lock, bool validate); @@ -290,4 +289,28 @@ static inline bool tc_qdisc_stats_dump(struct Qdisc *sch, return true; } +static inline void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc) +{ + if (!(qdisc->flags & TCQ_F_WARN_NONWC)) { + pr_warn("%s: %s qdisc %X: is non-work-conserving?\n", + txt, qdisc->ops->id, qdisc->handle >> 16); + qdisc->flags |= TCQ_F_WARN_NONWC; + } +} + +static inline unsigned int qdisc_peek_len(struct Qdisc *sch) +{ + struct sk_buff *skb; + unsigned int len; + + skb = sch->ops->peek(sch); + if (unlikely(skb == NULL)) { + qdisc_warn_nonwc("qdisc_peek_len", sch); + return 0; + } + len = qdisc_pkt_len(skb); + + return len; +} + #endif diff --git a/include/net/request_sock.h b/include/net/request_sock.h index b07b1cd14e9f..6a5ec1418e85 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -30,8 +30,6 @@ struct request_sock_ops { unsigned int obj_size; struct kmem_cache *slab; char *slab_name; - int (*rtx_syn_ack)(const struct sock *sk, - struct request_sock *req); void (*send_ack)(const struct sock *sk, struct sk_buff *skb, struct request_sock *req); void (*send_reset)(const struct sock *sk, @@ -41,8 +39,6 @@ struct request_sock_ops { void (*syn_ack_timeout)(const struct request_sock *req); }; -int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req); - struct saved_syn { u32 mac_hdrlen; u32 network_hdrlen; diff --git a/include/net/route.h b/include/net/route.h index c605fd5ec0c0..7ea840daa775 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -153,7 +153,7 @@ static inline void inet_sk_init_flowi4(const struct inet_sock *inet, ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), sk->sk_protocol, inet_sk_flowi_flags(sk), daddr, inet->inet_saddr, inet->inet_dport, - inet->inet_sport, sk->sk_uid); + inet->inet_sport, sk_uid(sk)); security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); } @@ -326,9 +326,12 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, if (inet_test_bit(TRANSPARENT, sk)) flow_flags |= FLOWI_FLAG_ANYSRC; + if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !sport) + flow_flags |= FLOWI_FLAG_ANY_SPORT; + flowi4_init_output(fl4, oif, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), protocol, flow_flags, dst, - src, dport, sport, sk->sk_uid); + src, dport, sport, sk_uid(sk)); } static inline struct rtable *ip_route_connect(struct flowi4 *fl4, __be32 dst, @@ -387,7 +390,7 @@ static inline int ip4_dst_hoplimit(const struct dst_entry *dst) const struct net *net; rcu_read_lock(); - net = dev_net_rcu(dst->dev); + net = dev_net_rcu(dst_dev(dst)); hoplimit = READ_ONCE(net->ipv4.sysctl_ip_default_ttl); rcu_read_unlock(); } diff --git a/include/net/rps.h b/include/net/rps.h index e358e9711f27..d8ab3a08bcc4 100644 --- a/include/net/rps.h +++ b/include/net/rps.h @@ -57,9 +57,10 @@ struct rps_dev_flow_table { * meaning we use 32-6=26 bits for the hash. */ struct rps_sock_flow_table { - u32 mask; + struct rcu_head rcu; + u32 mask; - u32 ents[] ____cacheline_aligned_in_smp; + u32 ents[] ____cacheline_aligned_in_smp; }; #define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num])) @@ -122,6 +123,30 @@ static inline void sock_rps_record_flow(const struct sock *sk) #endif } +static inline void sock_rps_delete_flow(const struct sock *sk) +{ +#ifdef CONFIG_RPS + struct rps_sock_flow_table *table; + u32 hash, index; + + if (!static_branch_unlikely(&rfs_needed)) + return; + + hash = READ_ONCE(sk->sk_rxhash); + if (!hash) + return; + + rcu_read_lock(); + table = rcu_dereference(net_hotdata.rps_sock_flow_table); + if (table) { + index = hash & table->mask; + if (READ_ONCE(table->ents[index]) != RPS_NO_CPU) + WRITE_ONCE(table->ents[index], RPS_NO_CPU); + } + rcu_read_unlock(); +#endif +} + static inline u32 rps_input_queue_tail_incr(struct softnet_data *sd) { #ifdef CONFIG_RPS diff --git a/include/net/rstreason.h b/include/net/rstreason.h index 69cb2e52b7da..979ac87b5d99 100644 --- a/include/net/rstreason.h +++ b/include/net/rstreason.h @@ -36,7 +36,7 @@ /** * enum sk_rst_reason - the reasons of socket reset * - * The reasons of sk reset, which are used in DCCP/TCP/MPTCP protocols. + * The reasons of sk reset, which are used in TCP/MPTCP protocols. * * There are three parts in order: * 1) skb drop reasons: relying on drop reasons for such as passive reset diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index d48c657191cd..638948be4c50 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -803,6 +803,14 @@ static inline bool qdisc_tx_changing(const struct net_device *dev) return false; } +/* "noqueue" qdisc identified by not having any enqueue, see noqueue_init() */ +static inline bool qdisc_txq_has_no_queue(const struct netdev_queue *txq) +{ + struct Qdisc *qdisc = rcu_access_pointer(txq->qdisc); + + return qdisc->enqueue == NULL; +} + /* Is the device using the noop qdisc on all queues? */ static inline bool qdisc_tx_is_noop(const struct net_device *dev) { @@ -965,14 +973,6 @@ static inline void qdisc_qstats_qlen_backlog(struct Qdisc *sch, __u32 *qlen, *backlog = qstats.backlog; } -static inline void qdisc_tree_flush_backlog(struct Qdisc *sch) -{ - __u32 qlen, backlog; - - qdisc_qstats_qlen_backlog(sch, &qlen, &backlog); - qdisc_tree_reduce_backlog(sch, qlen, backlog); -} - static inline void qdisc_purge_queue(struct Qdisc *sch) { __u32 qlen, backlog; @@ -1031,6 +1031,21 @@ static inline struct sk_buff *__qdisc_dequeue_head(struct qdisc_skb_head *qh) return skb; } +static inline struct sk_buff *qdisc_dequeue_internal(struct Qdisc *sch, bool direct) +{ + struct sk_buff *skb; + + skb = __skb_dequeue(&sch->gso_skb); + if (skb) { + sch->q.qlen--; + return skb; + } + if (direct) + return __qdisc_dequeue_head(&sch->q); + else + return sch->dequeue(sch); +} + static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch) { struct sk_buff *skb = __qdisc_dequeue_head(&sch->q); diff --git a/include/net/scm.h b/include/net/scm.h index 22bb49589fde..c52519669349 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -69,7 +69,7 @@ static __inline__ void unix_get_peersec_dgram(struct socket *sock, struct scm_co static __inline__ void scm_set_cred(struct scm_cookie *scm, struct pid *pid, kuid_t uid, kgid_t gid) { - scm->pid = get_pid(pid); + scm->pid = get_pid(pid); scm->creds.pid = pid_vnr(pid); scm->creds.uid = uid; scm->creds.gid = gid; @@ -78,7 +78,7 @@ static __inline__ void scm_set_cred(struct scm_cookie *scm, static __inline__ void scm_destroy_cred(struct scm_cookie *scm) { put_pid(scm->pid); - scm->pid = NULL; + scm->pid = NULL; } static __inline__ void scm_destroy(struct scm_cookie *scm) @@ -102,123 +102,10 @@ static __inline__ int scm_send(struct socket *sock, struct msghdr *msg, return __scm_send(sock, msg, scm); } -#ifdef CONFIG_SECURITY_NETWORK -static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm) -{ - struct lsm_context ctx; - int err; - - if (test_bit(SOCK_PASSSEC, &sock->flags)) { - err = security_secid_to_secctx(scm->secid, &ctx); - - if (err >= 0) { - put_cmsg(msg, SOL_SOCKET, SCM_SECURITY, ctx.len, - ctx.context); - security_release_secctx(&ctx); - } - } -} - -static inline bool scm_has_secdata(struct socket *sock) -{ - return test_bit(SOCK_PASSSEC, &sock->flags); -} -#else -static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm) -{ } - -static inline bool scm_has_secdata(struct socket *sock) -{ - return false; -} -#endif /* CONFIG_SECURITY_NETWORK */ - -static __inline__ void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm) -{ - struct file *pidfd_file = NULL; - int len, pidfd; - - /* put_cmsg() doesn't return an error if CMSG is truncated, - * that's why we need to opencode these checks here. - */ - if (msg->msg_flags & MSG_CMSG_COMPAT) - len = sizeof(struct compat_cmsghdr) + sizeof(int); - else - len = sizeof(struct cmsghdr) + sizeof(int); - - if (msg->msg_controllen < len) { - msg->msg_flags |= MSG_CTRUNC; - return; - } - - if (!scm->pid) - return; - - pidfd = pidfd_prepare(scm->pid, 0, &pidfd_file); - - if (put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) { - if (pidfd_file) { - put_unused_fd(pidfd); - fput(pidfd_file); - } - - return; - } - - if (pidfd_file) - fd_install(pidfd, pidfd_file); -} - -static inline bool __scm_recv_common(struct socket *sock, struct msghdr *msg, - struct scm_cookie *scm, int flags) -{ - if (!msg->msg_control) { - if (test_bit(SOCK_PASSCRED, &sock->flags) || - test_bit(SOCK_PASSPIDFD, &sock->flags) || - scm->fp || scm_has_secdata(sock)) - msg->msg_flags |= MSG_CTRUNC; - scm_destroy(scm); - return false; - } - - if (test_bit(SOCK_PASSCRED, &sock->flags)) { - struct user_namespace *current_ns = current_user_ns(); - struct ucred ucreds = { - .pid = scm->creds.pid, - .uid = from_kuid_munged(current_ns, scm->creds.uid), - .gid = from_kgid_munged(current_ns, scm->creds.gid), - }; - put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(ucreds), &ucreds); - } - - scm_passec(sock, msg, scm); - - if (scm->fp) - scm_detach_fds(msg, scm); - - return true; -} - -static inline void scm_recv(struct socket *sock, struct msghdr *msg, - struct scm_cookie *scm, int flags) -{ - if (!__scm_recv_common(sock, msg, scm, flags)) - return; - - scm_destroy_cred(scm); -} - -static inline void scm_recv_unix(struct socket *sock, struct msghdr *msg, - struct scm_cookie *scm, int flags) -{ - if (!__scm_recv_common(sock, msg, scm, flags)) - return; - - if (test_bit(SOCK_PASSPIDFD, &sock->flags)) - scm_pidfd_recv(msg, scm); - - scm_destroy_cred(scm); -} +void scm_recv(struct socket *sock, struct msghdr *msg, + struct scm_cookie *scm, int flags); +void scm_recv_unix(struct socket *sock, struct msghdr *msg, + struct scm_cookie *scm, int flags); static inline int scm_recv_one_fd(struct file *f, int __user *ufd, unsigned int flags) diff --git a/include/net/sctp/checksum.h b/include/net/sctp/checksum.h index 291465c25810..654d37ec0402 100644 --- a/include/net/sctp/checksum.h +++ b/include/net/sctp/checksum.h @@ -15,8 +15,6 @@ * Dinakaran Joseph * Jon Grimm <jgrimm@us.ibm.com> * Sridhar Samudrala <sri@us.ibm.com> - * - * Rewritten to use libcrc32c by: * Vlad Yasevich <vladislav.yasevich@hp.com> */ @@ -25,39 +23,18 @@ #include <linux/types.h> #include <linux/sctp.h> -#include <linux/crc32c.h> -#include <linux/crc32.h> - -static inline __wsum sctp_csum_update(const void *buff, int len, __wsum sum) -{ - return (__force __wsum)crc32c((__force __u32)sum, buff, len); -} - -static inline __wsum sctp_csum_combine(__wsum csum, __wsum csum2, - int offset, int len) -{ - return (__force __wsum)crc32c_combine((__force __u32)csum, - (__force __u32)csum2, len); -} - -static const struct skb_checksum_ops sctp_csum_ops = { - .update = sctp_csum_update, - .combine = sctp_csum_combine, -}; static inline __le32 sctp_compute_cksum(const struct sk_buff *skb, unsigned int offset) { struct sctphdr *sh = (struct sctphdr *)(skb->data + offset); __le32 old = sh->checksum; - __wsum new; + u32 new; sh->checksum = 0; - new = ~__skb_checksum(skb, offset, skb->len - offset, ~(__wsum)0, - &sctp_csum_ops); + new = ~skb_crc32c(skb, offset, skb->len - offset, ~0); sh->checksum = old; - - return cpu_to_le32((__force __u32)new); + return cpu_to_le32(new); } #endif /* __sctp_checksum_h__ */ diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 84e6b9fd5610..e96d1bd087f6 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -364,8 +364,6 @@ sctp_assoc_to_state(const struct sctp_association *asoc) /* Look up the association by its id. */ struct sctp_association *sctp_id2assoc(struct sock *sk, sctp_assoc_t id); -int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp); - /* A macro to walk a list of skbs. */ #define sctp_skb_for_each(pos, head, tmp) \ skb_queue_walk_safe(head, pos, tmp) @@ -636,7 +634,7 @@ static inline void sctp_transport_pl_reset(struct sctp_transport *t) } } else { if (t->pl.state != SCTP_PL_DISABLED) { - if (del_timer(&t->probe_timer)) + if (timer_delete(&t->probe_timer)) sctp_transport_put(t); t->pl.state = SCTP_PL_DISABLED; } diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 64c42bd56bb2..3bfd261a53cc 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -161,7 +161,6 @@ const struct sctp_sm_table_entry *sctp_sm_lookup_event( enum sctp_event_type event_type, enum sctp_state state, union sctp_subtype event_subtype); -int sctp_chunk_iif(const struct sctp_chunk *); struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *, struct sctp_chunk *, gfp_t gfp); diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 31248cfdfb23..8a540ad9b509 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -51,9 +51,9 @@ * We should wean ourselves off this. */ union sctp_addr { + struct sockaddr_inet sa; /* Large enough for both address families */ struct sockaddr_in v4; struct sockaddr_in6 v6; - struct sockaddr sa; }; /* Forward declarations for data structures. */ @@ -775,6 +775,7 @@ struct sctp_transport { /* Reference counting. */ refcount_t refcnt; + __u32 dead:1, /* RTO-Pending : A flag used to track if one of the DATA * chunks sent to this address is currently being * used to compute a RTT. If this flag is 0, @@ -784,7 +785,7 @@ struct sctp_transport { * calculation completes (i.e. the DATA chunk * is SACK'd) clear this flag. */ - __u32 rto_pending:1, + rto_pending:1, /* * hb_sent : a flag that signals that we have a pending @@ -2151,8 +2152,6 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *, const union sctp_addr *address, const gfp_t gfp, const int peer_state); -void sctp_assoc_del_peer(struct sctp_association *asoc, - const union sctp_addr *addr); void sctp_assoc_rm_peer(struct sctp_association *asoc, struct sctp_transport *peer); void sctp_assoc_control_transport(struct sctp_association *asoc, diff --git a/include/net/secure_seq.h b/include/net/secure_seq.h index 21e7fa2a1813..cddebafb9f77 100644 --- a/include/net/secure_seq.h +++ b/include/net/secure_seq.h @@ -16,9 +16,5 @@ u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr, __be16 sport, __be16 dport); u32 secure_tcpv6_ts_off(const struct net *net, const __be32 *saddr, const __be32 *daddr); -u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, - __be16 sport, __be16 dport); -u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, - __be16 sport, __be16 dport); #endif /* _NET_SECURE_SEQ */ diff --git a/include/net/sock.h b/include/net/sock.h index 8daf1b3b12c6..c8a4b283df6f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -337,8 +337,16 @@ struct sk_filter; * @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME * @sk_txtime_report_errors: set report errors mode for SO_TXTIME * @sk_txtime_unused: unused txtime flags + * @sk_scm_recv_flags: all flags used by scm_recv() + * @sk_scm_credentials: flagged by SO_PASSCRED to recv SCM_CREDENTIALS + * @sk_scm_security: flagged by SO_PASSSEC to recv SCM_SECURITY + * @sk_scm_pidfd: flagged by SO_PASSPIDFD to recv SCM_PIDFD + * @sk_scm_rights: flagged by SO_PASSRIGHTS to recv SCM_RIGHTS + * @sk_scm_unused: unused flags for scm_recv() * @ns_tracker: tracker for netns reference * @sk_user_frags: xarray of pages the user is holding a reference on. + * @sk_owner: reference to the real owner of the socket that calls + * sock_lock_init_class_and_name(). */ struct sock { /* @@ -521,7 +529,17 @@ struct sock { #endif int sk_disconnects; - u8 sk_txrehash; + union { + u8 sk_txrehash; + u8 sk_scm_recv_flags; + struct { + u8 sk_scm_credentials : 1, + sk_scm_security : 1, + sk_scm_pidfd : 1, + sk_scm_rights : 1, + sk_scm_unused : 4; + }; + }; u8 sk_clockid; u8 sk_txtime_deadline_mode : 1, sk_txtime_report_errors : 1, @@ -547,6 +565,10 @@ struct sock { struct rcu_head sk_rcu; netns_tracker ns_tracker; struct xarray sk_user_frags; + +#if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES) + struct module *sk_owner; +#endif }; struct sock_bh_locked { @@ -1531,7 +1553,7 @@ __sk_rmem_schedule(struct sock *sk, int size, bool pfmemalloc) } static inline bool -sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size) +sk_rmem_schedule(struct sock *sk, const struct sk_buff *skb, int size) { return __sk_rmem_schedule(sk, size, skb_pfmemalloc(skb)); } @@ -1583,6 +1605,35 @@ static inline void sk_mem_uncharge(struct sock *sk, int size) sk_mem_reclaim(sk); } +#if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES) +static inline void sk_owner_set(struct sock *sk, struct module *owner) +{ + __module_get(owner); + sk->sk_owner = owner; +} + +static inline void sk_owner_clear(struct sock *sk) +{ + sk->sk_owner = NULL; +} + +static inline void sk_owner_put(struct sock *sk) +{ + module_put(sk->sk_owner); +} +#else +static inline void sk_owner_set(struct sock *sk, struct module *owner) +{ +} + +static inline void sk_owner_clear(struct sock *sk) +{ +} + +static inline void sk_owner_put(struct sock *sk) +{ +} +#endif /* * Macro so as to not evaluate some arguments when * lockdep is not enabled. @@ -1592,13 +1643,14 @@ static inline void sk_mem_uncharge(struct sock *sk, int size) */ #define sock_lock_init_class_and_name(sk, sname, skey, name, key) \ do { \ + sk_owner_set(sk, THIS_MODULE); \ sk->sk_lock.owned = 0; \ init_waitqueue_head(&sk->sk_lock.wq); \ spin_lock_init(&(sk)->sk_lock.slock); \ debug_check_no_locks_freed((void *)&(sk)->sk_lock, \ - sizeof((sk)->sk_lock)); \ + sizeof((sk)->sk_lock)); \ lockdep_set_class_and_name(&(sk)->sk_lock.slock, \ - (skey), (sname)); \ + (skey), (sname)); \ lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0); \ } while (0) @@ -1745,7 +1797,6 @@ void sk_free(struct sock *sk); void sk_net_refcnt_upgrade(struct sock *sk); void sk_destruct(struct sock *sk); struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority); -void sk_free_unlock_clone(struct sock *sk); struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority); @@ -1816,6 +1867,7 @@ struct sockcm_cookie { u32 tsflags; u32 ts_opt_id; u32 priority; + u32 dmabuf_id; }; static inline void sockcm_init(struct sockcm_cookie *sockc, @@ -2024,6 +2076,7 @@ static inline void sock_orphan(struct sock *sk) sock_set_flag(sk, SOCK_DEAD); sk_set_socket(sk, NULL); sk->sk_wq = NULL; + /* Note: sk_uid is unchanged. */ write_unlock_bh(&sk->sk_callback_lock); } @@ -2034,18 +2087,23 @@ static inline void sock_graft(struct sock *sk, struct socket *parent) rcu_assign_pointer(sk->sk_wq, &parent->wq); parent->sk = sk; sk_set_socket(sk, parent); - sk->sk_uid = SOCK_INODE(parent)->i_uid; + WRITE_ONCE(sk->sk_uid, SOCK_INODE(parent)->i_uid); security_sock_graft(sk, parent); write_unlock_bh(&sk->sk_callback_lock); } -kuid_t sock_i_uid(struct sock *sk); +static inline kuid_t sk_uid(const struct sock *sk) +{ + /* Paired with WRITE_ONCE() in sockfs_setattr() */ + return READ_ONCE(sk->sk_uid); +} + unsigned long __sock_i_ino(struct sock *sk); unsigned long sock_i_ino(struct sock *sk); static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk) { - return sk ? sk->sk_uid : make_kuid(net->user_ns, 0); + return sk ? sk_uid(sk) : make_kuid(net->user_ns, 0); } static inline u32 net_tx_rndhash(void) @@ -2538,12 +2596,12 @@ static inline gfp_t gfp_memcg_charge(void) static inline long sock_rcvtimeo(const struct sock *sk, bool noblock) { - return noblock ? 0 : sk->sk_rcvtimeo; + return noblock ? 0 : READ_ONCE(sk->sk_rcvtimeo); } static inline long sock_sndtimeo(const struct sock *sk, bool noblock) { - return noblock ? 0 : sk->sk_sndtimeo; + return noblock ? 0 : READ_ONCE(sk->sk_sndtimeo); } static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len) @@ -2569,8 +2627,8 @@ struct sock_skb_cb { * using skb->cb[] would keep using it directly and utilize its * alignment guarantee. */ -#define SOCK_SKB_CB_OFFSET ((sizeof_field(struct sk_buff, cb) - \ - sizeof(struct sock_skb_cb))) +#define SOCK_SKB_CB_OFFSET (sizeof_field(struct sk_buff, cb) - \ + sizeof(struct sock_skb_cb)) #define SOCK_SKB_CB(__skb) ((struct sock_skb_cb *)((__skb)->cb + \ SOCK_SKB_CB_OFFSET)) @@ -2625,6 +2683,10 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); +bool skb_has_tx_timestamp(struct sk_buff *skb, const struct sock *sk); +int skb_get_tx_timestamp(struct sk_buff *skb, struct sock *sk, + struct timespec64 *ts); + static inline void sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { @@ -2700,8 +2762,6 @@ static inline void _sock_tx_timestamp(struct sock *sk, *tskey = atomic_inc_return(&sk->sk_tskey) - 1; } } - if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS))) - *tx_flags |= SKBTX_WIFI_STATUS; } static inline void sock_tx_timestamp(struct sock *sk, @@ -2739,9 +2799,14 @@ static inline bool sk_is_udp(const struct sock *sk) sk->sk_protocol == IPPROTO_UDP; } +static inline bool sk_is_unix(const struct sock *sk) +{ + return sk->sk_family == AF_UNIX; +} + static inline bool sk_is_stream_unix(const struct sock *sk) { - return sk->sk_family == AF_UNIX && sk->sk_type == SOCK_STREAM; + return sk_is_unix(sk) && sk->sk_type == SOCK_STREAM; } static inline bool sk_is_vsock(const struct sock *sk) @@ -2749,6 +2814,13 @@ static inline bool sk_is_vsock(const struct sock *sk) return sk->sk_family == AF_VSOCK; } +static inline bool sk_may_scm_recv(const struct sock *sk) +{ + return (IS_ENABLED(CONFIG_UNIX) && sk->sk_family == AF_UNIX) || + sk->sk_family == AF_NETLINK || + (IS_ENABLED(CONFIG_BT) && sk->sk_family == AF_BLUETOOTH); +} + /** * sk_eat_skb - Release a skb if it is no longer needed * @sk: socket to eat this skb from @@ -2788,6 +2860,12 @@ sk_is_refcounted(struct sock *sk) return !sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE); } +static inline bool +sk_requests_wifi_status(struct sock *sk) +{ + return sk && sk_fullsock(sk) && sock_flag(sk, SOCK_WIFI_STATUS); +} + /* Checks if this SKB belongs to an HW offloaded socket * and whether any SW fallbacks are required based on dev. * Check decrypted mark in case skb_orphan() cleared socket. @@ -2914,7 +2992,6 @@ void sock_set_timestamp(struct sock *sk, int optname, bool valbool); int sock_set_timestamping(struct sock *sk, int optname, struct so_timestamping timestamping); -void sock_enable_timestamps(struct sock *sk); #if defined(CONFIG_CGROUP_BPF) void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op); #else @@ -2942,8 +3019,11 @@ int sock_ioctl_inout(struct sock *sk, unsigned int cmd, int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); static inline bool sk_is_readable(struct sock *sk) { - if (sk->sk_prot->sock_is_readable) - return sk->sk_prot->sock_is_readable(sk); + const struct proto *prot = READ_ONCE(sk->sk_prot); + + if (prot->sock_is_readable) + return prot->sock_is_readable(sk); + return false; } #endif /* _SOCK_H */ diff --git a/include/net/strparser.h b/include/net/strparser.h index 0a83010b3a64..0ed73e364faa 100644 --- a/include/net/strparser.h +++ b/include/net/strparser.h @@ -114,8 +114,6 @@ static inline void strp_pause(struct strparser *strp) /* May be called without holding lock for attached socket */ void strp_unpause(struct strparser *strp); -/* Must be called with process lock held (lock_sock) */ -void __strp_unpause(struct strparser *strp); static inline void save_strp_stats(struct strparser *strp, struct strp_aggr_stats *agg_stats) diff --git a/include/net/tc_act/tc_connmark.h b/include/net/tc_act/tc_connmark.h index e8dd77a96748..a5ce83f3eea4 100644 --- a/include/net/tc_act/tc_connmark.h +++ b/include/net/tc_act/tc_connmark.h @@ -7,6 +7,7 @@ struct tcf_connmark_parms { struct net *net; u16 zone; + int action; struct rcu_head rcu; }; diff --git a/include/net/tc_act/tc_csum.h b/include/net/tc_act/tc_csum.h index 68269e4581b7..8d0c7a9f9345 100644 --- a/include/net/tc_act/tc_csum.h +++ b/include/net/tc_act/tc_csum.h @@ -8,6 +8,7 @@ struct tcf_csum_params { u32 update_flags; + int action; struct rcu_head rcu; }; @@ -18,15 +19,6 @@ struct tcf_csum { }; #define to_tcf_csum(a) ((struct tcf_csum *)a) -static inline bool is_tcf_csum(const struct tc_action *a) -{ -#ifdef CONFIG_NET_CLS_ACT - if (a->ops && a->ops->id == TCA_ID_CSUM) - return true; -#endif - return false; -} - static inline u32 tcf_csum_update_flags(const struct tc_action *a) { u32 update_flags; diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h index 77f87c622a2e..8b90c86c0b0d 100644 --- a/include/net/tc_act/tc_ct.h +++ b/include/net/tc_act/tc_ct.h @@ -13,7 +13,7 @@ struct tcf_ct_params { struct nf_conntrack_helper *helper; struct nf_conn *tmpl; u16 zone; - + int action; u32 mark; u32 mark_mask; @@ -92,13 +92,4 @@ static inline void tcf_ct_flow_table_restore_skb(struct sk_buff *skb, unsigned long cookie) { } #endif -static inline bool is_tcf_ct(const struct tc_action *a) -{ -#if defined(CONFIG_NET_CLS_ACT) && IS_ENABLED(CONFIG_NF_CONNTRACK) - if (a->ops && a->ops->id == TCA_ID_CT) - return true; -#endif - return false; -} - #endif /* __NET_TC_CT_H */ diff --git a/include/net/tc_act/tc_ctinfo.h b/include/net/tc_act/tc_ctinfo.h index f071c1d70a25..7fe01ab236da 100644 --- a/include/net/tc_act/tc_ctinfo.h +++ b/include/net/tc_act/tc_ctinfo.h @@ -7,6 +7,7 @@ struct tcf_ctinfo_params { struct rcu_head rcu; struct net *net; + int action; u32 dscpmask; u32 dscpstatemask; u32 cpmarkmask; @@ -18,9 +19,9 @@ struct tcf_ctinfo_params { struct tcf_ctinfo { struct tc_action common; struct tcf_ctinfo_params __rcu *params; - u64 stats_dscp_set; - u64 stats_dscp_error; - u64 stats_cpmark_set; + atomic64_t stats_dscp_set; + atomic64_t stats_dscp_error; + atomic64_t stats_cpmark_set; }; enum { diff --git a/include/net/tc_act/tc_gate.h b/include/net/tc_act/tc_gate.h index c8fa11ebb397..c1a67149c6b6 100644 --- a/include/net/tc_act/tc_gate.h +++ b/include/net/tc_act/tc_gate.h @@ -51,15 +51,6 @@ struct tcf_gate { #define to_gate(a) ((struct tcf_gate *)a) -static inline bool is_tcf_gate(const struct tc_action *a) -{ -#ifdef CONFIG_NET_CLS_ACT - if (a->ops && a->ops->id == TCA_ID_GATE) - return true; -#endif - return false; -} - static inline s32 tcf_gate_prio(const struct tc_action *a) { s32 tcfg_prio; diff --git a/include/net/tc_act/tc_mpls.h b/include/net/tc_act/tc_mpls.h index 721de4f5733a..dd067bd4018d 100644 --- a/include/net/tc_act/tc_mpls.h +++ b/include/net/tc_act/tc_mpls.h @@ -10,6 +10,7 @@ struct tcf_mpls_params { int tcfm_action; u32 tcfm_label; + int action; /* tcf_action */ u8 tcfm_tc; u8 tcfm_ttl; u8 tcfm_bos; @@ -27,15 +28,6 @@ struct tcf_mpls { }; #define to_mpls(a) ((struct tcf_mpls *)a) -static inline bool is_tcf_mpls(const struct tc_action *a) -{ -#ifdef CONFIG_NET_CLS_ACT - if (a->ops && a->ops->id == TCA_ID_MPLS) - return true; -#endif - return false; -} - static inline u32 tcf_mpls_action(const struct tc_action *a) { u32 tcfm_action; diff --git a/include/net/tc_act/tc_nat.h b/include/net/tc_act/tc_nat.h index c869274ac529..ae35f4009445 100644 --- a/include/net/tc_act/tc_nat.h +++ b/include/net/tc_act/tc_nat.h @@ -6,6 +6,7 @@ #include <net/act_api.h> struct tcf_nat_parms { + int action; __be32 old_addr; __be32 new_addr; __be32 mask; diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h index 83fe39931781..f58ee15cd858 100644 --- a/include/net/tc_act/tc_pedit.h +++ b/include/net/tc_act/tc_pedit.h @@ -14,6 +14,7 @@ struct tcf_pedit_key_ex { struct tcf_pedit_parms { struct tc_pedit_key *tcfp_keys; struct tcf_pedit_key_ex *tcfp_keys_ex; + int action; u32 tcfp_off_max_hint; unsigned char tcfp_nkeys; unsigned char tcfp_flags; diff --git a/include/net/tc_act/tc_police.h b/include/net/tc_act/tc_police.h index 283bde711a42..a89fc8e68b1e 100644 --- a/include/net/tc_act/tc_police.h +++ b/include/net/tc_act/tc_police.h @@ -5,10 +5,11 @@ #include <net/act_api.h> struct tcf_police_params { + int action; int tcfp_result; u32 tcfp_ewma_rate; - s64 tcfp_burst; u32 tcfp_mtu; + s64 tcfp_burst; s64 tcfp_mtu_ptoks; s64 tcfp_pkt_burst; struct psched_ratecfg rate; @@ -44,15 +45,6 @@ struct tc_police_compat { struct tc_ratespec peakrate; }; -static inline bool is_tcf_police(const struct tc_action *act) -{ -#ifdef CONFIG_NET_CLS_ACT - if (act->ops && act->ops->id == TCA_ID_POLICE) - return true; -#endif - return false; -} - static inline u64 tcf_police_rate_bytes_ps(const struct tc_action *act) { struct tcf_police *police = to_police(act); diff --git a/include/net/tc_act/tc_sample.h b/include/net/tc_act/tc_sample.h index b5d76305e854..abd163ca1864 100644 --- a/include/net/tc_act/tc_sample.h +++ b/include/net/tc_act/tc_sample.h @@ -17,15 +17,6 @@ struct tcf_sample { }; #define to_sample(a) ((struct tcf_sample *)a) -static inline bool is_tcf_sample(const struct tc_action *a) -{ -#ifdef CONFIG_NET_CLS_ACT - return a->ops && a->ops->id == TCA_ID_SAMPLE; -#else - return false; -#endif -} - static inline __u32 tcf_sample_rate(const struct tc_action *a) { return to_sample(a)->rate; diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h index 9649600fb3dc..31b2cd0bebb5 100644 --- a/include/net/tc_act/tc_skbedit.h +++ b/include/net/tc_act/tc_skbedit.h @@ -12,6 +12,7 @@ #include <linux/tc_act/tc_skbedit.h> struct tcf_skbedit_params { + int action; u32 flags; u32 priority; u32 mark; diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h index 904eddfc1826..3f5e9242b5e8 100644 --- a/include/net/tc_act/tc_vlan.h +++ b/include/net/tc_act/tc_vlan.h @@ -26,15 +26,6 @@ struct tcf_vlan { }; #define to_vlan(a) ((struct tcf_vlan *)a) -static inline bool is_tcf_vlan(const struct tc_action *a) -{ -#ifdef CONFIG_NET_CLS_ACT - if (a->ops && a->ops->id == TCA_ID_VLAN) - return true; -#endif - return false; -} - static inline u32 tcf_vlan_action(const struct tc_action *a) { u32 tcfv_action; diff --git a/include/net/tcp.h b/include/net/tcp.h index 4450c384ef17..b3815d104340 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -267,7 +267,6 @@ extern long sysctl_tcp_mem[3]; #define TCP_RACK_STATIC_REO_WND 0x2 /* Use static RACK reo wnd */ #define TCP_RACK_NO_DUPTHRESH 0x4 /* Do not use DUPACK threshold in RACK */ -extern atomic_long_t tcp_memory_allocated; DECLARE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc); extern struct percpu_counter tcp_sockets_allocated; @@ -321,7 +320,7 @@ extern struct proto tcp_prot; #define TCP_DEC_STATS(net, field) SNMP_DEC_STATS((net)->mib.tcp_statistics, field) #define TCP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val) -void tcp_tasklet_init(void); +void tcp_tsq_work_init(void); int tcp_v4_err(struct sk_buff *skb, u32); @@ -427,7 +426,8 @@ enum tcp_tw_status { enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, const struct tcphdr *th, - u32 *tw_isn); + u32 *tw_isn, + enum skb_drop_reason *drop_reason); struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, bool fastopen, bool *lost_race, enum skb_drop_reason *drop_reason); @@ -1559,7 +1559,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason); -int tcp_filter(struct sock *sk, struct sk_buff *skb); +int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason); void tcp_set_state(struct sock *sk, int state); void tcp_done(struct sock *sk); int tcp_abort(struct sock *sk, int err); @@ -1810,14 +1810,8 @@ static inline void tcp_mib_init(struct net *net) } /* from STCP */ -static inline void tcp_clear_retrans_hints_partial(struct tcp_sock *tp) -{ - tp->lost_skb_hint = NULL; -} - static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp) { - tcp_clear_retrans_hints_partial(tp); tp->retransmit_skb_hint = NULL; } diff --git a/include/net/tcx.h b/include/net/tcx.h index 5ce0ce9e0c02..23a61af13547 100644 --- a/include/net/tcx.h +++ b/include/net/tcx.h @@ -20,7 +20,6 @@ struct tcx_entry { struct tcx_link { struct bpf_link link; struct net_device *dev; - u32 location; }; static inline void tcx_set_ingress(struct sk_buff *skb, bool ingress) diff --git a/include/net/udp.h b/include/net/udp.h index 6e89520e100d..f8ae2c4ade14 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -205,7 +205,6 @@ static inline void udp_hash4_dec(struct udp_hslot *hslot2) extern struct proto udp_prot; -extern atomic_long_t udp_memory_allocated; DECLARE_PER_CPU(int, udp_memory_per_cpu_fw_alloc); /* sysctl variables for udp */ @@ -290,6 +289,7 @@ static inline void udp_lib_init_sock(struct sock *sk) struct udp_sock *up = udp_sk(sk); skb_queue_head_init(&up->reader_queue); + INIT_HLIST_NODE(&up->tunnel_list); up->forward_threshold = sk->sk_rcvbuf >> 2; set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); } diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index a93dc51f6323..9acef2fbd2fd 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -130,35 +130,20 @@ void udp_tunnel_drop_rx_port(struct net_device *dev, struct socket *sock, void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type); void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type); -static inline void udp_tunnel_get_rx_info(struct net_device *dev) -{ - ASSERT_RTNL(); - if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) - return; - call_netdevice_notifiers(NETDEV_UDP_TUNNEL_PUSH_INFO, dev); -} - -static inline void udp_tunnel_drop_rx_info(struct net_device *dev) -{ - ASSERT_RTNL(); - if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) - return; - call_netdevice_notifiers(NETDEV_UDP_TUNNEL_DROP_INFO, dev); -} - /* Transmit the skb using UDP encapsulation. */ void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, - bool xnet, bool nocheck); + bool xnet, bool nocheck, u16 ipcb_flags); -int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, - struct net_device *dev, - const struct in6_addr *saddr, - const struct in6_addr *daddr, - __u8 prio, __u8 ttl, __be32 label, - __be16 src_port, __be16 dst_port, bool nocheck); +void udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, + struct net_device *dev, + const struct in6_addr *saddr, + const struct in6_addr *daddr, + __u8 prio, __u8 ttl, __be32 label, + __be16 src_port, __be16 dst_port, bool nocheck, + u16 ip6cb_flags); void udp_tunnel_sock_release(struct socket *sock); @@ -191,6 +176,21 @@ static inline int udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum) } #endif +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) +void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add); +void udp_tunnel_update_gro_rcv(struct sock *sk, bool add); +#else +static inline void udp_tunnel_update_gro_lookup(struct net *net, + struct sock *sk, bool add) {} +static inline void udp_tunnel_update_gro_rcv(struct sock *sk, bool add) {} +#endif + +static inline void udp_tunnel_cleanup_gro(struct sock *sk) +{ + udp_tunnel_update_gro_rcv(sk, false); + udp_tunnel_update_gro_lookup(sock_net(sk), sk, false); +} + static inline void udp_tunnel_encap_enable(struct sock *sk) { if (udp_test_and_set_bit(ENCAP_ENABLED, sk)) @@ -206,19 +206,17 @@ static inline void udp_tunnel_encap_enable(struct sock *sk) #define UDP_TUNNEL_NIC_MAX_TABLES 4 enum udp_tunnel_nic_info_flags { - /* Device callbacks may sleep */ - UDP_TUNNEL_NIC_INFO_MAY_SLEEP = BIT(0), /* Device only supports offloads when it's open, all ports * will be removed before close and re-added after open. */ - UDP_TUNNEL_NIC_INFO_OPEN_ONLY = BIT(1), + UDP_TUNNEL_NIC_INFO_OPEN_ONLY = BIT(0), /* Device supports only IPv4 tunnels */ - UDP_TUNNEL_NIC_INFO_IPV4_ONLY = BIT(2), + UDP_TUNNEL_NIC_INFO_IPV4_ONLY = BIT(1), /* Device has hard-coded the IANA VXLAN port (4789) as VXLAN. * This port must not be counted towards n_entries of any table. * Driver will not receive any callback associated with port 4789. */ - UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN = BIT(3), + UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN = BIT(2), }; struct udp_tunnel_nic; @@ -309,6 +307,9 @@ struct udp_tunnel_nic_ops { size_t (*dump_size)(struct net_device *dev, unsigned int table); int (*dump_write)(struct net_device *dev, unsigned int table, struct sk_buff *skb); + void (*assert_locked)(struct net_device *dev); + void (*lock)(struct net_device *dev); + void (*unlock)(struct net_device *dev); }; #ifdef CONFIG_INET @@ -337,8 +338,28 @@ static inline void udp_tunnel_nic_set_port_priv(struct net_device *dev, unsigned int table, unsigned int idx, u8 priv) { - if (udp_tunnel_nic_ops) + if (udp_tunnel_nic_ops) { + udp_tunnel_nic_ops->assert_locked(dev); udp_tunnel_nic_ops->set_port_priv(dev, table, idx, priv); + } +} + +static inline void udp_tunnel_nic_assert_locked(struct net_device *dev) +{ + if (udp_tunnel_nic_ops) + udp_tunnel_nic_ops->assert_locked(dev); +} + +static inline void udp_tunnel_nic_lock(struct net_device *dev) +{ + if (udp_tunnel_nic_ops) + udp_tunnel_nic_ops->lock(dev); +} + +static inline void udp_tunnel_nic_unlock(struct net_device *dev) +{ + if (udp_tunnel_nic_ops) + udp_tunnel_nic_ops->unlock(dev); } static inline void @@ -380,17 +401,50 @@ static inline void udp_tunnel_nic_reset_ntf(struct net_device *dev) static inline size_t udp_tunnel_nic_dump_size(struct net_device *dev, unsigned int table) { + size_t ret; + if (!udp_tunnel_nic_ops) return 0; - return udp_tunnel_nic_ops->dump_size(dev, table); + + udp_tunnel_nic_ops->lock(dev); + ret = udp_tunnel_nic_ops->dump_size(dev, table); + udp_tunnel_nic_ops->unlock(dev); + + return ret; } static inline int udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table, struct sk_buff *skb) { + int ret; + if (!udp_tunnel_nic_ops) return 0; - return udp_tunnel_nic_ops->dump_write(dev, table, skb); + + udp_tunnel_nic_ops->lock(dev); + ret = udp_tunnel_nic_ops->dump_write(dev, table, skb); + udp_tunnel_nic_ops->unlock(dev); + + return ret; } + +static inline void udp_tunnel_get_rx_info(struct net_device *dev) +{ + ASSERT_RTNL(); + if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) + return; + udp_tunnel_nic_assert_locked(dev); + call_netdevice_notifiers(NETDEV_UDP_TUNNEL_PUSH_INFO, dev); +} + +static inline void udp_tunnel_drop_rx_info(struct net_device *dev) +{ + ASSERT_RTNL(); + if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) + return; + udp_tunnel_nic_assert_locked(dev); + call_netdevice_notifiers(NETDEV_UDP_TUNNEL_DROP_INFO, dev); +} + #endif diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 2dd23ee2bacd..0ee50785f4f1 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -296,7 +296,7 @@ struct vxlan_dev { struct vxlan_rdst default_dst; /* default destination */ struct timer_list age_timer; - spinlock_t hash_lock[FDB_HASH_SIZE]; + spinlock_t hash_lock; unsigned int addrcnt; struct gro_cells gro_cells; @@ -304,9 +304,10 @@ struct vxlan_dev { struct vxlan_vni_group __rcu *vnigrp; - struct hlist_head fdb_head[FDB_HASH_SIZE]; + struct rhashtable fdb_hash_tbl; struct rhashtable mdb_tbl; + struct hlist_head fdb_list; struct hlist_head mdb_list; unsigned int mdb_seq; }; @@ -331,6 +332,7 @@ struct vxlan_dev { #define VXLAN_F_VNIFILTER 0x20000 #define VXLAN_F_MDB 0x40000 #define VXLAN_F_LOCALBYPASS 0x80000 +#define VXLAN_F_MC_ROUTE 0x100000 /* Flags that are used in the receive path. These flags must match in * order for a socket to be shareable @@ -352,7 +354,9 @@ struct vxlan_dev { VXLAN_F_UDP_ZERO_CSUM6_RX | \ VXLAN_F_COLLECT_METADATA | \ VXLAN_F_VNIFILTER | \ - VXLAN_F_LOCALBYPASS) + VXLAN_F_LOCALBYPASS | \ + VXLAN_F_MC_ROUTE | \ + 0) struct net_device *vxlan_dev_create(struct net *net, const char *name, u8 name_assign_type, struct vxlan_config *conf); diff --git a/include/net/x25.h b/include/net/x25.h index 5e833cfc864e..414f3fd99345 100644 --- a/include/net/x25.h +++ b/include/net/x25.h @@ -203,7 +203,6 @@ void x25_send_frame(struct sk_buff *, struct x25_neigh *); int x25_lapb_receive_frame(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); void x25_establish_link(struct x25_neigh *); -void x25_terminate_link(struct x25_neigh *); /* x25_facilities.c */ int x25_parse_facilities(struct sk_buff *, struct x25_facilities *, diff --git a/include/net/xdp.h b/include/net/xdp.h index 48efacbaa35d..b40f1f96cb11 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -616,8 +616,12 @@ struct xdp_metadata_ops { u32 bpf_xdp_metadata_kfunc_id(int id); bool bpf_dev_bound_kfunc_id(u32 btf_id); void xdp_set_features_flag(struct net_device *dev, xdp_features_t val); +void xdp_set_features_flag_locked(struct net_device *dev, xdp_features_t val); void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg); +void xdp_features_set_redirect_target_locked(struct net_device *dev, + bool support_sg); void xdp_features_clear_redirect_target(struct net_device *dev); +void xdp_features_clear_redirect_target_locked(struct net_device *dev); #else static inline u32 bpf_xdp_metadata_kfunc_id(int id) { return 0; } static inline bool bpf_dev_bound_kfunc_id(u32 btf_id) { return false; } diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index a58ae7589d12..ce587a225661 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -71,9 +71,6 @@ struct xdp_sock { */ u32 tx_budget_spent; - /* Protects generic receive. */ - spinlock_t rx_lock; - /* Statistics */ u64 rx_dropped; u64 rx_queue_full; @@ -87,6 +84,7 @@ struct xdp_sock { struct list_head map_list; /* Protects map_list */ spinlock_t map_list_lock; + u32 max_tx_budget; /* Protects multiple processes in the control path */ struct mutex mutex; struct xsk_queue *fq_tmp; /* Only as tmp storage before bind */ diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 39365fd2ea17..f3014e4f54fc 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -147,8 +147,19 @@ enum { }; struct xfrm_dev_offload { + /* The device for this offload. + * Device drivers should not use this directly, as that will prevent + * them from working with bonding device. Instead, the device passed + * to the add/delete callbacks should be used. + */ struct net_device *dev; netdevice_tracker dev_tracker; + /* This is a private pointer used by the bonding driver (and eventually + * should be moved there). Device drivers should not use it. + * Protected by xfrm_state.lock AND bond.ipsec_lock in most cases, + * except in the .xdo_dev_state_del() flow, where only xfrm_state.lock + * is held. + */ struct net_device *real_dev; unsigned long offload_handle; u8 dir : 2; @@ -236,7 +247,6 @@ struct xfrm_state { /* Data for encapsulator */ struct xfrm_encap_tmpl *encap; - struct sock __rcu *encap_sk; /* NAT keepalive */ u32 nat_keepalive_interval; /* seconds */ @@ -431,7 +441,6 @@ int xfrm_input_register_afinfo(const struct xfrm_input_afinfo *afinfo); int xfrm_input_unregister_afinfo(const struct xfrm_input_afinfo *afinfo); void xfrm_flush_gc(void); -void xfrm_state_delete_tunnel(struct xfrm_state *x); struct xfrm_type { struct module *owner; @@ -464,7 +473,7 @@ struct xfrm_type_offload { int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned short family); void xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family); -void xfrm_set_type_offload(struct xfrm_state *x); +void xfrm_set_type_offload(struct xfrm_state *x, bool try_load); static inline void xfrm_unset_type_offload(struct xfrm_state *x) { if (!x->type_offload) @@ -906,7 +915,7 @@ static inline void xfrm_pols_put(struct xfrm_policy **pols, int npols) xfrm_pol_put(pols[i]); } -void __xfrm_state_destroy(struct xfrm_state *, bool); +void __xfrm_state_destroy(struct xfrm_state *); static inline void __xfrm_state_put(struct xfrm_state *x) { @@ -916,13 +925,7 @@ static inline void __xfrm_state_put(struct xfrm_state *x) static inline void xfrm_state_put(struct xfrm_state *x) { if (refcount_dec_and_test(&x->refcnt)) - __xfrm_state_destroy(x, false); -} - -static inline void xfrm_state_put_sync(struct xfrm_state *x) -{ - if (refcount_dec_and_test(&x->refcnt)) - __xfrm_state_destroy(x, true); + __xfrm_state_destroy(x); } static inline void xfrm_state_hold(struct xfrm_state *x) @@ -1760,7 +1763,7 @@ struct xfrmk_spdinfo { struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num); int xfrm_state_delete(struct xfrm_state *x); -int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync); +int xfrm_state_flush(struct net *net, u8 proto, bool task_valid); int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid); int xfrm_dev_policy_flush(struct net *net, struct net_device *dev, bool task_valid); @@ -1893,12 +1896,16 @@ struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *n u32 if_id); struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, struct xfrm_migrate *m, - struct xfrm_encap_tmpl *encap); + struct xfrm_encap_tmpl *encap, + struct net *net, + struct xfrm_user_offload *xuo, + struct netlink_ext_ack *extack); int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_bundles, struct xfrm_kmaddress *k, struct net *net, struct xfrm_encap_tmpl *encap, u32 if_id, - struct netlink_ext_ack *extack); + struct netlink_ext_ack *extack, + struct xfrm_user_offload *xuo); #endif int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport); diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 1dcd4d71468a..cac56e6b0869 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -53,6 +53,8 @@ struct xsk_buff_pool { refcount_t users; struct xdp_umem *umem; struct work_struct work; + /* Protects generic receive in shared and non-shared umem mode. */ + spinlock_t rx_lock; struct list_head free_list; struct list_head xskb_list; u32 heads_cnt; @@ -238,8 +240,8 @@ static inline u64 xp_get_handle(struct xdp_buff_xsk *xskb, return orig_addr; offset = xskb->xdp.data - xskb->xdp.data_hard_start; - orig_addr -= offset; offset += pool->headroom; + orig_addr -= offset; return orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); } |