diff options
Diffstat (limited to 'include/net')
241 files changed, 17327 insertions, 4297 deletions
diff --git a/include/net/9p/client.h b/include/net/9p/client.h index 78ebcf782ce5..838a94218b59 100644 --- a/include/net/9p/client.h +++ b/include/net/9p/client.h @@ -16,6 +16,12 @@ /* Number of requests per row */ #define P9_ROW_MAXTAG 255 +/* DEFAULT MSIZE = 32 pages worth of payload + P9_HDRSZ + + * room for write (16 extra) or read (11 extra) operands. + */ + +#define DEFAULT_MSIZE ((128 * 1024) + P9_IOHDRSZ) + /** enum p9_proto_versions - 9P protocol versions * @p9_proto_legacy: 9P Legacy mode, pre-9P2000.u * @p9_proto_2000u: 9P2000.u extension @@ -127,6 +133,96 @@ struct p9_client { }; /** + * struct p9_fd_opts - holds client options during parsing + * @msize: maximum data size negotiated by protocol + * @prot-Oversion: 9P protocol version to use + * @trans_mod: module API instantiated with this client + * + * These parsed options get transferred into client in + * apply_client_options() + */ +struct p9_client_opts { + unsigned int msize; + unsigned char proto_version; + struct p9_trans_module *trans_mod; +}; + +/** + * struct p9_fd_opts - per-transport options for fd transport + * @rfd: file descriptor for reading (trans=fd) + * @wfd: file descriptor for writing (trans=fd) + * @port: port to connect to (trans=tcp) + * @privport: port is privileged + */ +struct p9_fd_opts { + int rfd; + int wfd; + u16 port; + bool privport; +}; + +/** + * struct p9_rdma_opts - Collection of mount options for rdma transport + * @port: port of connection + * @privport: Whether a privileged port may be used + * @sq_depth: The requested depth of the SQ. This really doesn't need + * to be any deeper than the number of threads used in the client + * @rq_depth: The depth of the RQ. Should be greater than or equal to SQ depth + * @timeout: Time to wait in msecs for CM events + */ +struct p9_rdma_opts { + short port; + bool privport; + int sq_depth; + int rq_depth; + long timeout; +}; + +/** + * struct p9_session_opts - holds parsed options for v9fs_session_info + * @flags: session options of type &p9_session_flags + * @nodev: set to 1 to disable device mapping + * @debug: debug level + * @afid: authentication handle + * @cache: cache mode of type &p9_cache_bits + * @cachetag: the tag of the cache associated with this session + * @uname: string user name to mount hierarchy as + * @aname: mount specifier for remote hierarchy + * @dfltuid: default numeric userid to mount hierarchy as + * @dfltgid: default numeric groupid to mount hierarchy as + * @uid: if %V9FS_ACCESS_SINGLE, the numeric uid which mounted the hierarchy + * @session_lock_timeout: retry interval for blocking locks + * + * This strucure holds options which are parsed and will be transferred + * to the v9fs_session_info structure when mounted, and therefore largely + * duplicates struct v9fs_session_info. + */ +struct p9_session_opts { + unsigned int flags; + unsigned char nodev; + unsigned short debug; + unsigned int afid; + unsigned int cache; +#ifdef CONFIG_9P_FSCACHE + char *cachetag; +#endif + char *uname; + char *aname; + kuid_t dfltuid; + kgid_t dfltgid; + kuid_t uid; + long session_lock_timeout; +}; + +/* Used by mount API to store parsed mount options */ +struct v9fs_context { + struct p9_client_opts client_opts; + struct p9_fd_opts fd_opts; + struct p9_rdma_opts rdma_opts; + struct p9_session_opts session_opts; +}; + +/** * struct p9_fid - file system entity handle * @clnt: back pointer to instantiating &p9_client * @fid: numeric identifier for this handle @@ -183,7 +279,7 @@ int p9_client_rename(struct p9_fid *fid, struct p9_fid *newdirfid, const char *name); int p9_client_renameat(struct p9_fid *olddirfid, const char *old_name, struct p9_fid *newdirfid, const char *new_name); -struct p9_client *p9_client_create(const char *dev_name, char *options); +struct p9_client *p9_client_create(struct fs_context *fc); void p9_client_destroy(struct p9_client *clnt); void p9_client_disconnect(struct p9_client *clnt); void p9_client_begin_disconnect(struct p9_client *clnt); @@ -207,6 +303,8 @@ int p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err int p9_client_read_once(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err); int p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err); +struct netfs_io_subrequest; +void p9_client_write_subreq(struct netfs_io_subrequest *subreq); int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset); int p9dirent_read(struct p9_client *clnt, char *buf, int len, struct p9_dirent *dirent); diff --git a/include/net/9p/transport.h b/include/net/9p/transport.h index 766ec07c9599..a912bbaa862f 100644 --- a/include/net/9p/transport.h +++ b/include/net/9p/transport.h @@ -14,6 +14,13 @@ #define P9_DEF_MIN_RESVPORT (665U) #define P9_DEF_MAX_RESVPORT (1023U) +#define P9_FD_PORT 564 + +#define P9_RDMA_PORT 5640 +#define P9_RDMA_SQ_DEPTH 32 +#define P9_RDMA_RQ_DEPTH 32 +#define P9_RDMA_TIMEOUT 30000 /* 30 seconds */ + /** * struct p9_trans_module - transport module interface * @list: used to maintain a list of currently available transports @@ -24,6 +31,9 @@ * we're less flexible when choosing the response message * size in this case * @def: set if this transport should be considered the default + * @supports_vmalloc: set if this transport can work with vmalloc'd buffers + * (non-physically contiguous memory). Transports requiring + * DMA should leave this as false. * @create: member function to create a new connection on this transport * @close: member function to discard a connection on this transport * @request: member function to issue a request to the transport @@ -43,10 +53,11 @@ struct p9_trans_module { char *name; /* name of transport */ int maxsize; /* max message size of transport */ bool pooled_rbuffers; - int def; /* this transport should be default */ + bool def; /* this transport should be default */ + bool supports_vmalloc; /* can work with vmalloc'd buffers */ struct module *owner; int (*create)(struct p9_client *client, - const char *devname, char *args); + struct fs_context *fc); void (*close)(struct p9_client *client); int (*request)(struct p9_client *client, struct p9_req_t *req); int (*cancel)(struct p9_client *client, struct p9_req_t *req); diff --git a/include/net/Space.h b/include/net/Space.h index c29f3d51c078..ef42629f4258 100644 --- a/include/net/Space.h +++ b/include/net/Space.h @@ -10,4 +10,3 @@ struct net_device *smc_init(int unit); struct net_device *cs89x0_probe(int unit); struct net_device *tc515_probe(int unit); struct net_device *lance_probe(int unit); -struct net_device *cops_probe(int unit); diff --git a/include/net/act_api.h b/include/net/act_api.h index 4ae0580b63ca..91a24b5e0b93 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -33,7 +33,10 @@ struct tc_action { struct tcf_t tcfa_tm; struct gnet_stats_basic_sync tcfa_bstats; struct gnet_stats_basic_sync tcfa_bstats_hw; - struct gnet_stats_queue tcfa_qstats; + + atomic_t tcfa_drops; + atomic_t tcfa_overlimits; + struct net_rate_estimator __rcu *tcfa_rate_est; spinlock_t tcfa_lock; struct gnet_stats_basic_sync __percpu *cpu_bstats; @@ -53,7 +56,6 @@ struct tc_action { #define tcf_action common.tcfa_action #define tcf_tm common.tcfa_tm #define tcf_bstats common.tcfa_bstats -#define tcf_qstats common.tcfa_qstats #define tcf_rate_est common.tcfa_rate_est #define tcf_lock common.tcfa_lock @@ -76,19 +78,24 @@ static inline void tcf_lastuse_update(struct tcf_t *tm) { unsigned long now = jiffies; - if (tm->lastuse != now) - tm->lastuse = now; - if (unlikely(!tm->firstuse)) - tm->firstuse = now; + if (READ_ONCE(tm->lastuse) != now) + WRITE_ONCE(tm->lastuse, now); + if (unlikely(!READ_ONCE(tm->firstuse))) + WRITE_ONCE(tm->firstuse, now); } static inline void tcf_tm_dump(struct tcf_t *dtm, const struct tcf_t *stm) { - dtm->install = jiffies_to_clock_t(jiffies - stm->install); - dtm->lastuse = jiffies_to_clock_t(jiffies - stm->lastuse); - dtm->firstuse = stm->firstuse ? - jiffies_to_clock_t(jiffies - stm->firstuse) : 0; - dtm->expires = jiffies_to_clock_t(stm->expires); + unsigned long firstuse, now = jiffies; + + dtm->install = jiffies_to_clock_t(now - READ_ONCE(stm->install)); + dtm->lastuse = jiffies_to_clock_t(now - READ_ONCE(stm->lastuse)); + + firstuse = READ_ONCE(stm->firstuse); + dtm->firstuse = firstuse ? + jiffies_to_clock_t(now - firstuse) : 0; + + dtm->expires = jiffies_to_clock_t(READ_ONCE(stm->expires)); } static inline enum flow_action_hw_stats tc_act_hw_stats(u8 hw_stats) @@ -137,6 +144,7 @@ struct tc_action_ops { #ifdef CONFIG_NET_CLS_ACT +#define ACT_P_BOUND 0 #define ACT_P_CREATED 1 #define ACT_P_DELETED 1 @@ -169,14 +177,12 @@ static inline void tc_action_net_exit(struct list_head *net_list, { struct net *net; - rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { struct tc_action_net *tn = net_generic(net, id); tcf_idrinfo_destroy(tn->ops, tn->idrinfo); kfree(tn->idrinfo); } - rtnl_unlock(); } int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, @@ -191,7 +197,7 @@ int tcf_idr_create_from_flags(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int bind, u32 flags); -void tcf_idr_insert_many(struct tc_action *actions[]); +void tcf_idr_insert_many(struct tc_action *actions[], int init_res[]); void tcf_idr_cleanup(struct tc_action_net *tn, u32 index); int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index, struct tc_action **a, int bind); @@ -200,6 +206,8 @@ int tcf_idr_release(struct tc_action *a, bool bind); int tcf_register_action(struct tc_action_ops *a, struct pernet_operations *ops); int tcf_unregister_action(struct tc_action_ops *a, struct pernet_operations *ops); +#define NET_ACT_ALIAS_PREFIX "net-act-" +#define MODULE_ALIAS_NET_ACT(kind) MODULE_ALIAS(NET_ACT_ALIAS_PREFIX kind) int tcf_action_destroy(struct tc_action *actions[], int bind); int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions, int nr_actions, struct tcf_result *res); @@ -207,8 +215,7 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, struct tc_action *actions[], int init_res[], size_t *attr_size, u32 flags, u32 fl_flags, struct netlink_ext_ack *extack); -struct tc_action_ops *tc_action_load_ops(struct nlattr *nla, bool police, - bool rtnl_held, +struct tc_action_ops *tc_action_load_ops(struct nlattr *nla, u32 flags, struct netlink_ext_ack *extack); struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, @@ -217,7 +224,6 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind, int ref, bool terse); int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int); -int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int); static inline void tcf_action_update_bstats(struct tc_action *a, struct sk_buff *skb) @@ -237,9 +243,7 @@ static inline void tcf_action_inc_drop_qstats(struct tc_action *a) qstats_drop_inc(this_cpu_ptr(a->cpu_qstats)); return; } - spin_lock(&a->tcfa_lock); - qstats_drop_inc(&a->tcfa_qstats); - spin_unlock(&a->tcfa_lock); + atomic_inc(&a->tcfa_drops); } static inline void tcf_action_inc_overlimit_qstats(struct tc_action *a) @@ -248,9 +252,7 @@ static inline void tcf_action_inc_overlimit_qstats(struct tc_action *a) qstats_overlimit_inc(this_cpu_ptr(a->cpu_qstats)); return; } - spin_lock(&a->tcfa_lock); - qstats_overlimit_inc(&a->tcfa_qstats); - spin_unlock(&a->tcfa_lock); + atomic_inc(&a->tcfa_overlimits); } void tcf_action_update_stats(struct tc_action *a, u64 bytes, u64 packets, diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 82da55101b5a..78e8b877fb25 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -8,8 +8,9 @@ #define MIN_VALID_LIFETIME (2*3600) /* 2 hours */ -#define TEMP_VALID_LIFETIME (7*86400) -#define TEMP_PREFERRED_LIFETIME (86400) +#define TEMP_VALID_LIFETIME (7*86400) /* 1 week */ +#define TEMP_PREFERRED_LIFETIME (86400) /* 24 hours */ +#define REGEN_MIN_ADVANCE (2) /* 2 seconds */ #define REGEN_MAX_RETRY (3) #define MAX_DESYNC_FACTOR (600) @@ -31,17 +32,26 @@ struct prefix_info { __u8 length; __u8 prefix_len; + union __packed { + __u8 flags; + struct __packed { #if defined(__BIG_ENDIAN_BITFIELD) - __u8 onlink : 1, - autoconf : 1, - reserved : 6; + __u8 onlink : 1, + autoconf : 1, + routeraddr : 1, + preferpd : 1, + reserved : 4; #elif defined(__LITTLE_ENDIAN_BITFIELD) - __u8 reserved : 6, + __u8 reserved : 4, + preferpd : 1, + routeraddr : 1, autoconf : 1, onlink : 1; #else #error "Please fix <asm/byteorder.h>" #endif + }; + }; __be32 valid; __be32 prefered; __be32 reserved2; @@ -49,6 +59,9 @@ struct prefix_info { struct in6_addr prefix; }; +/* rfc4861 4.6.2: IPv6 PIO is 32 bytes in size */ +static_assert(sizeof(struct prefix_info) == 32); + #include <linux/ipv6.h> #include <linux/netdevice.h> #include <net/if_inet6.h> @@ -75,6 +88,23 @@ struct ifa6_config { u16 scope; }; +enum addr_type_t { + UNICAST_ADDR, + MULTICAST_ADDR, + ANYCAST_ADDR, +}; + +struct inet6_fill_args { + u32 portid; + u32 seq; + int event; + unsigned int flags; + int netnsid; + int ifindex; + enum addr_type_t type; + bool force_rt_scope_universe; +}; + int addrconf_init(void); void addrconf_cleanup(void); @@ -174,10 +204,12 @@ static inline int addrconf_ifid_eui48(u8 *eui, struct net_device *dev) return 0; } +#define INFINITY_LIFE_TIME 0xFFFFFFFF + static inline unsigned long addrconf_timeout_fixup(u32 timeout, unsigned int unit) { - if (timeout == 0xffffffff) + if (timeout == INFINITY_LIFE_TIME) return ~0UL; /* @@ -315,10 +347,20 @@ static inline struct inet6_dev *__in6_dev_get(const struct net_device *dev) return rcu_dereference_rtnl(dev->ip6_ptr); } +static inline struct inet6_dev *in6_dev_rcu(const struct net_device *dev) +{ + return rcu_dereference(dev->ip6_ptr); +} + +static inline struct inet6_dev *__in6_dev_get_rtnl_net(const struct net_device *dev) +{ + return rtnl_net_dereference(dev_net(dev), dev->ip6_ptr); +} + /** * __in6_dev_stats_get - get inet6_dev pointer for stats * @dev: network device - * @skb: skb for original incoming interface if neeeded + * @skb: skb for original incoming interface if needed * * Caller must hold rcu_read_lock or RTNL, because this function * does not take a reference on the inet6_dev. @@ -408,7 +450,7 @@ static inline bool ip6_ignore_linkdown(const struct net_device *dev) if (unlikely(!idev)) return true; - return !!idev->cnf.ignore_routes_with_linkdown; + return !!READ_ONCE(idev->cnf.ignore_routes_with_linkdown); } void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp); @@ -429,6 +471,10 @@ static inline void in6_ifa_hold(struct inet6_ifaddr *ifp) refcount_inc(&ifp->refcnt); } +static inline bool in6_ifa_hold_safe(struct inet6_ifaddr *ifp) +{ + return refcount_inc_not_zero(&ifp->refcnt); +} /* * compute link-local solicited-node multicast address @@ -506,4 +552,11 @@ int if6_proc_init(void); void if6_proc_exit(void); #endif +int inet6_fill_ifmcaddr(struct sk_buff *skb, + const struct ifmcaddr6 *ifmca, + struct inet6_fill_args *args); + +int inet6_fill_ifacaddr(struct sk_buff *skb, + const struct ifacaddr6 *ifaca, + struct inet6_fill_args *args); #endif diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 5531dd08061e..0fb4c41c9bbf 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -15,6 +15,8 @@ struct key; struct sock; struct socket; struct rxrpc_call; +struct rxrpc_peer; +struct krb5_buffer; enum rxrpc_abort_reason; enum rxrpc_interruptibility { @@ -23,31 +25,42 @@ enum rxrpc_interruptibility { RXRPC_UNINTERRUPTIBLE, /* Call should not be interruptible at all */ }; +enum rxrpc_oob_type { + RXRPC_OOB_CHALLENGE, /* Security challenge for a connection */ +}; + /* * Debug ID counter for tracing. */ extern atomic_t rxrpc_debug_id; +/* + * Operations table for rxrpc to call out to a kernel application (e.g. kAFS). + */ +struct rxrpc_kernel_ops { + void (*notify_new_call)(struct sock *sk, struct rxrpc_call *call, + unsigned long user_call_ID); + void (*discard_new_call)(struct rxrpc_call *call, unsigned long user_call_ID); + void (*user_attach_call)(struct rxrpc_call *call, unsigned long user_call_ID); + void (*notify_oob)(struct sock *sk, struct sk_buff *oob); +}; + typedef void (*rxrpc_notify_rx_t)(struct sock *, struct rxrpc_call *, unsigned long); typedef void (*rxrpc_notify_end_tx_t)(struct sock *, struct rxrpc_call *, unsigned long); -typedef void (*rxrpc_notify_new_call_t)(struct sock *, struct rxrpc_call *, - unsigned long); -typedef void (*rxrpc_discard_new_call_t)(struct rxrpc_call *, unsigned long); -typedef void (*rxrpc_user_attach_call_t)(struct rxrpc_call *, unsigned long); -void rxrpc_kernel_new_call_notification(struct socket *, - rxrpc_notify_new_call_t, - rxrpc_discard_new_call_t); +void rxrpc_kernel_set_notifications(struct socket *sock, + const struct rxrpc_kernel_ops *app_ops); struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, - struct sockaddr_rxrpc *srx, + struct rxrpc_peer *peer, struct key *key, unsigned long user_call_ID, s64 tx_total_len, u32 hard_timeout, gfp_t gfp, rxrpc_notify_rx_t notify_rx, + u16 service_id, bool upgrade, enum rxrpc_interruptibility interruptibility, unsigned int debug_id); @@ -60,19 +73,43 @@ bool rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *, u32, int, enum rxrpc_abort_reason); void rxrpc_kernel_shutdown_call(struct socket *sock, struct rxrpc_call *call); void rxrpc_kernel_put_call(struct socket *sock, struct rxrpc_call *call); -void rxrpc_kernel_get_peer(struct socket *, struct rxrpc_call *, - struct sockaddr_rxrpc *); -bool rxrpc_kernel_get_srtt(struct socket *, struct rxrpc_call *, u32 *); -int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t, - rxrpc_user_attach_call_t, unsigned long, gfp_t, - unsigned int); +struct rxrpc_peer *rxrpc_kernel_lookup_peer(struct socket *sock, + struct sockaddr_rxrpc *srx, gfp_t gfp); +void rxrpc_kernel_put_peer(struct rxrpc_peer *peer); +struct rxrpc_peer *rxrpc_kernel_get_peer(struct rxrpc_peer *peer); +struct rxrpc_peer *rxrpc_kernel_get_call_peer(struct socket *sock, struct rxrpc_call *call); +const struct sockaddr_rxrpc *rxrpc_kernel_remote_srx(const struct rxrpc_peer *peer); +const struct sockaddr *rxrpc_kernel_remote_addr(const struct rxrpc_peer *peer); +unsigned long rxrpc_kernel_set_peer_data(struct rxrpc_peer *peer, unsigned long app_data); +unsigned long rxrpc_kernel_get_peer_data(const struct rxrpc_peer *peer); +unsigned int rxrpc_kernel_get_srtt(const struct rxrpc_peer *); +int rxrpc_kernel_charge_accept(struct socket *sock, rxrpc_notify_rx_t notify_rx, + unsigned long user_call_ID, gfp_t gfp, + unsigned int debug_id); void rxrpc_kernel_set_tx_length(struct socket *, struct rxrpc_call *, s64); bool rxrpc_kernel_check_life(const struct socket *, const struct rxrpc_call *); -u32 rxrpc_kernel_get_epoch(struct socket *, struct rxrpc_call *); -void rxrpc_kernel_set_max_life(struct socket *, struct rxrpc_call *, - unsigned long); int rxrpc_sock_set_min_security_level(struct sock *sk, unsigned int val); int rxrpc_sock_set_security_keyring(struct sock *, struct key *); +int rxrpc_sock_set_manage_response(struct sock *sk, bool set); + +enum rxrpc_oob_type rxrpc_kernel_query_oob(struct sk_buff *oob, + struct rxrpc_peer **_peer, + unsigned long *_peer_appdata); +struct sk_buff *rxrpc_kernel_dequeue_oob(struct socket *sock, + enum rxrpc_oob_type *_type); +void rxrpc_kernel_free_oob(struct sk_buff *oob); +void rxrpc_kernel_query_challenge(struct sk_buff *challenge, + struct rxrpc_peer **_peer, + unsigned long *_peer_appdata, + u16 *_service_id, u8 *_security_index); +int rxrpc_kernel_reject_challenge(struct sk_buff *challenge, u32 abort_code, + int error, enum rxrpc_abort_reason why); +int rxkad_kernel_respond_to_challenge(struct sk_buff *challenge); +u32 rxgk_kernel_query_challenge(struct sk_buff *challenge); +int rxgk_kernel_respond_to_challenge(struct sk_buff *challenge, + struct krb5_buffer *appdata); +u8 rxrpc_kernel_query_call_security(struct rxrpc_call *call, + u16 *_service_id, u32 *_enctype); #endif /* _NET_RXRPC_H */ diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 824c258143a3..34f53dde65ce 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -2,26 +2,24 @@ #ifndef __LINUX_NET_AFUNIX_H #define __LINUX_NET_AFUNIX_H -#include <linux/socket.h> -#include <linux/un.h> +#include <linux/atomic.h> #include <linux/mutex.h> +#include <linux/net.h> +#include <linux/path.h> #include <linux/refcount.h> +#include <linux/spinlock.h> +#include <linux/wait.h> #include <net/sock.h> +#include <uapi/linux/un.h> -void unix_inflight(struct user_struct *user, struct file *fp); -void unix_notinflight(struct user_struct *user, struct file *fp); -void unix_destruct_scm(struct sk_buff *skb); -void io_uring_destruct_scm(struct sk_buff *skb); -void unix_gc(void); -void wait_for_unix_gc(void); -struct sock *unix_get_socket(struct file *filp); -struct sock *unix_peer_get(struct sock *sk); - -#define UNIX_HASH_MOD (256 - 1) -#define UNIX_HASH_SIZE (256 * 2) -#define UNIX_HASH_BITS 8 - -extern unsigned int unix_tot_inflight; +#if IS_ENABLED(CONFIG_UNIX) +struct unix_sock *unix_get_socket(struct file *filp); +#else +static inline struct unix_sock *unix_get_socket(struct file *filp) +{ + return NULL; +} +#endif struct unix_address { refcount_t refcnt; @@ -29,29 +27,11 @@ struct unix_address { struct sockaddr_un name[]; }; -struct unix_skb_parms { - struct pid *pid; /* Skb credentials */ - kuid_t uid; - kgid_t gid; - struct scm_fp_list *fp; /* Passed files */ -#ifdef CONFIG_SECURITY_NETWORK - u32 secid; /* Security ID */ -#endif - u32 consumed; -} __randomize_layout; - struct scm_stat { atomic_t nr_fds; + unsigned long nr_unix_fds; }; -#define UNIXCB(skb) (*(struct unix_skb_parms *)&((skb)->cb)) - -#define unix_state_lock(s) spin_lock(&unix_sk(s)->lock) -#define unix_state_unlock(s) spin_unlock(&unix_sk(s)->lock) -#define unix_state_lock_nested(s) \ - spin_lock_nested(&unix_sk(s)->lock, \ - SINGLE_DEPTH_NESTING) - /* The AF_UNIX socket */ struct unix_sock { /* WARNING: sk has to be the first member */ @@ -60,48 +40,24 @@ struct unix_sock { struct path path; struct mutex iolock, bindlock; struct sock *peer; - struct list_head link; - atomic_long_t inflight; + struct sock *listener; + struct unix_vertex *vertex; spinlock_t lock; - unsigned long gc_flags; -#define UNIX_GC_CANDIDATE 0 -#define UNIX_GC_MAYBE_CYCLE 1 struct socket_wq peer_wq; +#define peer_wait peer_wq.wait wait_queue_entry_t peer_wake; struct scm_stat scm_stat; + int inq_len; + bool recvmsg_inq; #if IS_ENABLED(CONFIG_AF_UNIX_OOB) struct sk_buff *oob_skb; #endif }; #define unix_sk(ptr) container_of_const(ptr, struct unix_sock, sk) +#define unix_peer(sk) (unix_sk(sk)->peer) -#define peer_wait peer_wq.wait - -long unix_inq_len(struct sock *sk); -long unix_outq_len(struct sock *sk); - -int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, - int flags); -int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, - int flags); -#ifdef CONFIG_SYSCTL -int unix_sysctl_register(struct net *net); -void unix_sysctl_unregister(struct net *net); -#else -static inline int unix_sysctl_register(struct net *net) { return 0; } -static inline void unix_sysctl_unregister(struct net *net) {} -#endif - -#ifdef CONFIG_BPF_SYSCALL -extern struct proto unix_dgram_proto; -extern struct proto unix_stream_proto; +#define unix_state_lock(s) spin_lock(&unix_sk(s)->lock) +#define unix_state_unlock(s) spin_unlock(&unix_sk(s)->lock) -int unix_dgram_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); -int unix_stream_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); -void __init unix_bpf_build_proto(void); -#else -static inline void __init unix_bpf_build_proto(void) -{} -#endif #endif diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index b01cf9ac2437..d40e978126e3 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -137,7 +137,6 @@ struct vsock_transport { u64 (*stream_rcvhiwat)(struct vsock_sock *); bool (*stream_is_active)(struct vsock_sock *); bool (*stream_allow)(u32 cid, u32 port); - int (*set_rcvlowat)(struct vsock_sock *vsk, int val); /* SEQ_PACKET. */ ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg, @@ -168,6 +167,10 @@ struct vsock_transport { struct vsock_transport_send_notify_data *); /* sk_lock held by the caller */ void (*notify_buffer_size)(struct vsock_sock *, u64 *); + int (*notify_set_rcvlowat)(struct vsock_sock *vsk, int val); + + /* SIOCOUTQ ioctl */ + ssize_t (*unsent_bytes)(struct vsock_sock *vsk); /* Shutdown. */ int (*shutdown)(struct vsock_sock *, int); @@ -177,6 +180,9 @@ struct vsock_transport { /* Read a single skb */ int (*read_skb)(struct vsock_sock *, skb_read_actor_t); + + /* Zero-copy. */ + bool (*msgzerocopy_allow)(void); }; /**** CORE ****/ @@ -215,6 +221,7 @@ void vsock_for_each_connected_socket(struct vsock_transport *transport, void (*fn)(struct sock *sk)); int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk); bool vsock_find_cid(unsigned int cid); +void vsock_linger(struct sock *sk); /**** TAP ****/ @@ -227,13 +234,17 @@ struct vsock_tap { int vsock_add_tap(struct vsock_tap *vt); int vsock_remove_tap(struct vsock_tap *vt); void vsock_deliver_tap(struct sk_buff *build_skb(void *opaque), void *opaque); +int __vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags); int vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); +int __vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, + size_t len, int flags); int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); -#ifdef CONFIG_BPF_SYSCALL extern struct proto vsock_proto; +#ifdef CONFIG_BPF_SYSCALL int vsock_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); void __init vsock_bpf_build_proto(void); #else @@ -241,4 +252,8 @@ static inline void __init vsock_bpf_build_proto(void) {} #endif +static inline bool vsock_msgzerocopy_allow(const struct vsock_transport *t) +{ + return t->msgzerocopy_allow && t->msgzerocopy_allow(); +} #endif /* __AF_VSOCK_H__ */ diff --git a/include/net/aligned_data.h b/include/net/aligned_data.h new file mode 100644 index 000000000000..e1a1c8aedc79 --- /dev/null +++ b/include/net/aligned_data.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _NET_ALIGNED_DATA_H +#define _NET_ALIGNED_DATA_H + +#include <linux/atomic.h> +#include <linux/types.h> + +/* Structure holding cacheline aligned fields on SMP builds. + * Each field or group should have an ____cacheline_aligned_in_smp + * attribute to ensure no accidental false sharing can happen. + */ +struct net_aligned_data { + atomic64_t net_cookie ____cacheline_aligned_in_smp; +#if defined(CONFIG_INET) + atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp; + atomic_long_t udp_memory_allocated ____cacheline_aligned_in_smp; +#endif +}; + +extern struct net_aligned_data net_aligned_data; + +#endif /* _NET_ALIGNED_DATA_H */ diff --git a/include/net/ax25.h b/include/net/ax25.h index 0d939e5aee4e..a7bba42dde15 100644 --- a/include/net/ax25.h +++ b/include/net/ax25.h @@ -139,7 +139,9 @@ enum { AX25_VALUES_N2, /* Default N2 value */ AX25_VALUES_PACLEN, /* AX.25 MTU */ AX25_VALUES_PROTOCOL, /* Std AX.25, DAMA Slave, DAMA Master */ +#ifdef CONFIG_AX25_DAMA_SLAVE AX25_VALUES_DS_TIMEOUT, /* DAMA Slave timeout */ +#endif AX25_MAX_VALUES /* THIS MUST REMAIN THE LAST ENTRY OF THIS LIST */ }; @@ -216,7 +218,7 @@ typedef struct { struct ctl_table; typedef struct ax25_dev { - struct ax25_dev *next; + struct list_head list; struct net_device *dev; netdevice_tracker dev_tracker; @@ -229,6 +231,7 @@ typedef struct ax25_dev { #endif refcount_t refcount; bool device_up; + struct rcu_head rcu; } ax25_dev; typedef struct ax25_cb { @@ -288,9 +291,8 @@ static inline void ax25_dev_hold(ax25_dev *ax25_dev) static inline void ax25_dev_put(ax25_dev *ax25_dev) { - if (refcount_dec_and_test(&ax25_dev->refcount)) { - kfree(ax25_dev); - } + if (refcount_dec_and_test(&ax25_dev->refcount)) + kfree_rcu(ax25_dev, rcu); } static inline __be16 ax25_type_trans(struct sk_buff *skb, struct net_device *dev) { @@ -330,13 +332,12 @@ int ax25_addr_size(const ax25_digi *); void ax25_digi_invert(const ax25_digi *, ax25_digi *); /* ax25_dev.c */ -extern ax25_dev *ax25_dev_list; extern spinlock_t ax25_dev_lock; #if IS_ENABLED(CONFIG_AX25) -static inline ax25_dev *ax25_dev_ax25dev(struct net_device *dev) +static inline ax25_dev *ax25_dev_ax25dev(const struct net_device *dev) { - return dev->ax25_ptr; + return rcu_dereference_rtnl(dev->ax25_ptr); } #endif @@ -417,7 +418,6 @@ void ax25_rt_device_down(struct net_device *); int ax25_rt_ioctl(unsigned int, void __user *); extern const struct seq_operations ax25_rt_seqops; ax25_route *ax25_get_route(ax25_address *addr, struct net_device *dev); -int ax25_rt_autobind(ax25_cb *, ax25_address *); struct sk_buff *ax25_rt_build_path(struct sk_buff *, ax25_address *, ax25_address *, ax25_digi *); void ax25_rt_free(void); diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index aa90adc3b2a4..d46ed9011ee5 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -29,6 +29,7 @@ #include <linux/poll.h> #include <net/sock.h> #include <linux/seq_file.h> +#include <linux/ethtool.h> #define BT_SUBSYS_VERSION 2 #define BT_SUBSYS_REVISION 22 @@ -123,6 +124,7 @@ struct bt_voice { #define BT_VOICE_TRANSPARENT 0x0003 #define BT_VOICE_CVSD_16BIT 0x0060 +#define BT_VOICE_TRANSPARENT_16BIT 0x0063 #define BT_SNDMTU 12 #define BT_RCVMTU 13 @@ -155,6 +157,7 @@ struct bt_voice { #define BT_PKT_STATUS 16 #define BT_SCM_PKT_STATUS 0x03 +#define BT_SCM_ERROR 0x04 #define BT_ISO_QOS 17 @@ -164,6 +167,8 @@ struct bt_voice { #define BT_ISO_QOS_BIG_UNSET 0xff #define BT_ISO_QOS_BIS_UNSET 0xff +#define BT_ISO_SYNC_TIMEOUT 0x07d0 /* 20 secs */ + struct bt_iso_io_qos { __u32 interval; __u16 latency; @@ -239,6 +244,12 @@ struct bt_codecs { #define BT_ISO_BASE 20 +/* Socket option value 21 reserved */ + +#define BT_PKT_SEQNUM 22 + +#define BT_SCM_PKT_SEQNUM 0x05 + __printf(1, 2) void bt_info(const char *fmt, ...); __printf(1, 2) @@ -261,7 +272,8 @@ void bt_err_ratelimited(const char *fmt, ...); #define BT_ERR(fmt, ...) bt_err(fmt "\n", ##__VA_ARGS__) #if IS_ENABLED(CONFIG_BT_FEATURE_DEBUG) -#define BT_DBG(fmt, ...) bt_dbg(fmt "\n", ##__VA_ARGS__) +#define BT_DBG(fmt, ...) \ + bt_dbg("%s:%d: " fmt "\n", __func__, __LINE__, ##__VA_ARGS__) #else #define BT_DBG(fmt, ...) pr_debug(fmt "\n", ##__VA_ARGS__) #endif @@ -283,7 +295,7 @@ void bt_err_ratelimited(const char *fmt, ...); bt_err_ratelimited("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__) /* Connection and socket states */ -enum { +enum bt_sock_state { BT_CONNECTED = 1, /* Equal to TCP_ESTABLISHED to make net code happy */ BT_OPEN, BT_BOUND, @@ -386,7 +398,8 @@ struct bt_sock { enum { BT_SK_DEFER_SETUP, BT_SK_SUSPEND, - BT_SK_PKT_STATUS + BT_SK_PKT_STATUS, + BT_SK_PKT_SEQNUM, }; struct bt_sock_list { @@ -401,6 +414,7 @@ int bt_sock_register(int proto, const struct net_proto_family *ops); void bt_sock_unregister(int proto); void bt_sock_link(struct bt_sock_list *l, struct sock *s); void bt_sock_unlink(struct bt_sock_list *l, struct sock *s); +bool bt_sock_linked(struct bt_sock_list *l, struct sock *s); struct sock *bt_sock_alloc(struct net *net, struct socket *sock, struct proto *prot, int proto, gfp_t prio, int kern); int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, @@ -439,6 +453,13 @@ typedef void (*hci_req_complete_t)(struct hci_dev *hdev, u8 status, u16 opcode); typedef void (*hci_req_complete_skb_t)(struct hci_dev *hdev, u8 status, u16 opcode, struct sk_buff *skb); +void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status, + hci_req_complete_t *req_complete, + hci_req_complete_skb_t *req_complete_skb); + +int hci_ethtool_ts_info(unsigned int index, int sk_proto, + struct kernel_ethtool_ts_info *ts_info); + #define HCI_REQ_START BIT(0) #define HCI_REQ_SKB BIT(1) @@ -462,6 +483,7 @@ struct bt_skb_cb { u8 pkt_type; u8 force_active; u16 expect; + u16 pkt_seqnum; u8 incoming:1; u8 pkt_status:2; union { @@ -475,6 +497,7 @@ struct bt_skb_cb { #define hci_skb_pkt_type(skb) bt_cb((skb))->pkt_type #define hci_skb_pkt_status(skb) bt_cb((skb))->pkt_status +#define hci_skb_pkt_seqnum(skb) bt_cb((skb))->pkt_seqnum #define hci_skb_expect(skb) bt_cb((skb))->expect #define hci_skb_opcode(skb) bt_cb((skb))->hci.opcode #define hci_skb_event(skb) bt_cb((skb))->hci.req_event @@ -541,7 +564,7 @@ static inline struct sk_buff *bt_skb_sendmsg(struct sock *sk, return ERR_PTR(-EFAULT); } - skb->priority = sk->sk_priority; + skb->priority = READ_ONCE(sk->sk_priority); return skb; } @@ -625,7 +648,7 @@ static inline void sco_exit(void) #if IS_ENABLED(CONFIG_BT_LE) int iso_init(void); int iso_exit(void); -bool iso_enabled(void); +bool iso_inited(void); #else static inline int iso_init(void) { @@ -637,7 +660,7 @@ static inline int iso_exit(void) return 0; } -static inline bool iso_enabled(void) +static inline bool iso_inited(void) { return false; } diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 87d92accc26e..a27cd3626b87 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1,6 +1,7 @@ /* BlueZ - Bluetooth protocol stack for Linux Copyright (C) 2000-2001 Qualcomm Incorporated + Copyright 2023-2024 NXP Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com> @@ -28,13 +29,11 @@ #define HCI_MAX_ACL_SIZE 1024 #define HCI_MAX_SCO_SIZE 255 #define HCI_MAX_ISO_SIZE 251 +#define HCI_MAX_ISO_BIS 31 #define HCI_MAX_EVENT_SIZE 260 #define HCI_MAX_FRAME_SIZE (HCI_MAX_ACL_SIZE + 4) #define HCI_LINK_KEY_SIZE 16 -#define HCI_AMP_LINK_KEY_SIZE (2 * HCI_LINK_KEY_SIZE) - -#define HCI_MAX_AMP_ASSOC_SIZE 672 #define HCI_MAX_CPB_DATA_SIZE 252 @@ -69,26 +68,7 @@ #define HCI_I2C 8 #define HCI_SMD 9 #define HCI_VIRTIO 10 - -/* HCI controller types */ -#define HCI_PRIMARY 0x00 -#define HCI_AMP 0x01 - -/* First BR/EDR Controller shall have ID = 0 */ -#define AMP_ID_BREDR 0x00 - -/* AMP controller types */ -#define AMP_TYPE_BREDR 0x00 -#define AMP_TYPE_80211 0x01 - -/* AMP controller status */ -#define AMP_STATUS_POWERED_DOWN 0x00 -#define AMP_STATUS_BLUETOOTH_ONLY 0x01 -#define AMP_STATUS_NO_CAPACITY 0x02 -#define AMP_STATUS_LOW_CAPACITY 0x03 -#define AMP_STATUS_MEDIUM_CAPACITY 0x04 -#define AMP_STATUS_HIGH_CAPACITY 0x05 -#define AMP_STATUS_FULL_CAPACITY 0x06 +#define HCI_IPC 11 /* HCI device quirks */ enum { @@ -175,6 +155,15 @@ enum { */ HCI_QUIRK_USE_BDADDR_PROPERTY, + /* When this quirk is set, the Bluetooth Device Address provided by + * the 'local-bd-address' fwnode property is incorrectly specified in + * big-endian order. + * + * This quirk can be set before hci_register_dev is called or + * during the hdev->setup vendor callback. + */ + HCI_QUIRK_BDADDR_PROPERTY_BROKEN, + /* When this quirk is set, the duplicate filtering during * scanning is based on Bluetooth devices addresses. To allow * RSSI based updates, restart scanning if needed. @@ -219,14 +208,24 @@ enum { */ HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, - /* When this quirk is set, the controller has validated that - * LE states reported through the HCI_LE_READ_SUPPORTED_STATES are - * valid. This mechanism is necessary as many controllers have - * been seen has having trouble initiating a connectable - * advertisement despite the state combination being reported as - * supported. + /* When this quirk is set consider Sync Flow Control as supported by + * the driver. + * + * This quirk must be set before hci_register_dev is called. + */ + HCI_QUIRK_SYNC_FLOWCTL_SUPPORTED, + + /* When this quirk is set, the LE states reported through the + * HCI_LE_READ_SUPPORTED_STATES are invalid/broken. + * + * This mechanism is necessary as many controllers have been seen has + * having trouble initiating a connectable advertisement despite the + * state combination being reported as supported. + * + * This quirk can be set before hci_register_dev is called or + * during the hdev->setup vendor callback. */ - HCI_QUIRK_VALID_LE_STATES, + HCI_QUIRK_BROKEN_LE_STATES, /* When this quirk is set, then erroneous data reporting * is ignored. This is mainly due to the fact that the HCI @@ -310,6 +309,20 @@ enum { */ HCI_QUIRK_BROKEN_SET_RPA_TIMEOUT, + /* + * When this quirk is set, the HCI_OP_LE_EXT_CREATE_CONN command is + * disabled. This is required for the Actions Semiconductor ATS2851 + * based controllers, which erroneously claims to support it. + */ + HCI_QUIRK_BROKEN_EXT_CREATE_CONN, + + /* + * When this quirk is set, the command WRITE_AUTH_PAYLOAD_TIMEOUT is + * skipped. This is required for the Actions Semiconductor ATS2851 + * based controllers, due to a race condition in pairing process. + */ + HCI_QUIRK_BROKEN_WRITE_AUTH_PAYLOAD_TIMEOUT, + /* When this quirk is set, MSFT extension monitor tracking by * address filter is supported. Since tracking quantity of each * pattern is limited, this feature supports tracking multiple @@ -329,6 +342,43 @@ enum { * during the hdev->setup vendor callback. */ HCI_QUIRK_BROKEN_LE_CODED, + + /* + * When this quirk is set, the HCI_OP_READ_ENC_KEY_SIZE command is + * skipped during an HCI_EV_ENCRYPT_CHANGE event. This is required + * for Actions Semiconductor ATS2851 based controllers, which erroneously + * claim to support it. + */ + HCI_QUIRK_BROKEN_READ_ENC_KEY_SIZE, + + /* + * When this quirk is set, the reserved bits of Primary/Secondary_PHY + * inside the LE Extended Advertising Report events are discarded. + * This is required for some Apple/Broadcom controllers which + * abuse these reserved bits for unrelated flags. + * + * This quirk can be set before hci_register_dev is called or + * during the hdev->setup vendor callback. + */ + HCI_QUIRK_FIXUP_LE_EXT_ADV_REPORT_PHY, + + /* When this quirk is set, the HCI_OP_READ_VOICE_SETTING command is + * skipped. This is required for a subset of the CSR controller clones + * which erroneously claim to support it. + * + * This quirk must be set before hci_register_dev is called. + */ + HCI_QUIRK_BROKEN_READ_VOICE_SETTING, + + /* When this quirk is set, the HCI_OP_READ_PAGE_SCAN_TYPE command is + * skipped. This is required for a subset of the CSR controller clones + * which erroneously claim to support it. + * + * This quirk must be set before hci_register_dev is called. + */ + HCI_QUIRK_BROKEN_READ_PAGE_SCAN_TYPE, + + __HCI_NUM_QUIRKS, }; /* HCI device flags */ @@ -371,6 +421,7 @@ enum { HCI_SETUP, HCI_CONFIG, HCI_DEBUGFS_CREATED, + HCI_POWERING_DOWN, HCI_AUTO_OFF, HCI_RFKILLED, HCI_MGMT, @@ -383,6 +434,7 @@ enum { HCI_USER_CHANNEL, HCI_EXT_CONFIGURED, HCI_LE_ADV, + HCI_LE_ADV_0, HCI_LE_PER_ADV, HCI_LE_SCAN, HCI_SSP_ENABLED, @@ -392,7 +444,6 @@ enum { HCI_LIMITED_PRIVACY, HCI_RPA_EXPIRED, HCI_RPA_RESOLVING, - HCI_HS_ENABLED, HCI_LE_ENABLED, HCI_ADVERTISING, HCI_ADVERTISING_CONNECTABLE, @@ -407,13 +458,13 @@ enum { HCI_WIDEBAND_SPEECH_ENABLED, HCI_EVENT_FILTER_CONFIGURED, HCI_PA_SYNC, + HCI_SCO_FLOWCTL, HCI_DUT_MODE, HCI_VENDOR_DIAG, HCI_FORCE_BREDR_SMP, HCI_FORCE_STATIC_ADDR, HCI_LL_RPA_RESOLUTION, - HCI_ENABLE_LL_PRIVACY, HCI_CMD_PENDING, HCI_FORCE_NO_MITM, HCI_QUALITY_REPORT, @@ -436,9 +487,9 @@ enum { #define HCI_NCMD_TIMEOUT msecs_to_jiffies(4000) /* 4 seconds */ #define HCI_ACL_TX_TIMEOUT msecs_to_jiffies(45000) /* 45 seconds */ #define HCI_AUTO_OFF_TIMEOUT msecs_to_jiffies(2000) /* 2 seconds */ -#define HCI_POWER_OFF_TIMEOUT msecs_to_jiffies(5000) /* 5 seconds */ +#define HCI_ACL_CONN_TIMEOUT msecs_to_jiffies(20000) /* 20 seconds */ #define HCI_LE_CONN_TIMEOUT msecs_to_jiffies(20000) /* 20 seconds */ -#define HCI_LE_AUTOCONN_TIMEOUT msecs_to_jiffies(4000) /* 4 seconds */ +#define HCI_ISO_TX_TIMEOUT usecs_to_jiffies(0x7fffff) /* 8388607 usecs */ /* HCI data types */ #define HCI_COMMAND_PKT 0x01 @@ -447,6 +498,7 @@ enum { #define HCI_EVENT_PKT 0x04 #define HCI_ISODATA_PKT 0x05 #define HCI_DIAG_PKT 0xf0 +#define HCI_DRV_PKT 0xf1 #define HCI_VENDOR_PKT 0xff /* HCI packet types */ @@ -510,8 +562,9 @@ enum { #define ESCO_LINK 0x02 /* Low Energy links do not have defined link type. Use invented one */ #define LE_LINK 0x80 -#define AMP_LINK 0x81 -#define ISO_LINK 0x82 +#define CIS_LINK 0x82 +#define BIS_LINK 0x83 +#define PA_LINK 0x84 #define INVALID_LINK 0xff /* LMP features */ @@ -594,10 +647,13 @@ enum { #define HCI_LE_EXT_ADV 0x10 #define HCI_LE_PERIODIC_ADV 0x20 #define HCI_LE_CHAN_SEL_ALG2 0x40 +#define HCI_LE_PAST_SENDER 0x01 +#define HCI_LE_PAST_RECEIVER 0x02 #define HCI_LE_CIS_CENTRAL 0x10 #define HCI_LE_CIS_PERIPHERAL 0x20 #define HCI_LE_ISO_BROADCASTER 0x40 #define HCI_LE_ISO_SYNC_RECEIVER 0x80 +#define HCI_LE_LL_EXT_FEATURE 0x80 /* Connection modes */ #define HCI_CM_ACTIVE 0x0000 @@ -652,6 +708,7 @@ enum { #define HCI_ERROR_PIN_OR_KEY_MISSING 0x06 #define HCI_ERROR_MEMORY_EXCEEDED 0x07 #define HCI_ERROR_CONNECTION_TIMEOUT 0x08 +#define HCI_ERROR_COMMAND_DISALLOWED 0x0c #define HCI_ERROR_REJ_LIMITED_RESOURCES 0x0d #define HCI_ERROR_REJ_BAD_ADDR 0x0f #define HCI_ERROR_INVALID_PARAMETERS 0x12 @@ -660,6 +717,7 @@ enum { #define HCI_ERROR_REMOTE_POWER_OFF 0x15 #define HCI_ERROR_LOCAL_HOST_TERM 0x16 #define HCI_ERROR_PAIRING_NOT_ALLOWED 0x18 +#define HCI_ERROR_UNSUPPORTED_REMOTE_FEATURE 0x1a #define HCI_ERROR_INVALID_LL_PARAMS 0x1e #define HCI_ERROR_UNSPECIFIED 0x1f #define HCI_ERROR_ADVERTISING_TIMEOUT 0x3c @@ -673,6 +731,9 @@ enum { #define HCI_TX_POWER_INVALID 127 #define HCI_RSSI_INVALID 127 +#define HCI_SYNC_HANDLE_INVALID 0xffff +#define HCI_SID_INVALID 0xff + #define HCI_ROLE_MASTER 0x00 #define HCI_ROLE_SLAVE 0x01 @@ -828,6 +889,11 @@ struct hci_cp_remote_name_req_cancel { bdaddr_t bdaddr; } __packed; +struct hci_rp_remote_name_req_cancel { + __u8 status; + bdaddr_t bdaddr; +} __packed; + #define HCI_OP_READ_REMOTE_FEATURES 0x041b struct hci_cp_read_remote_features { __le16 handle; @@ -922,56 +988,6 @@ struct hci_cp_io_capability_neg_reply { __u8 reason; } __packed; -#define HCI_OP_CREATE_PHY_LINK 0x0435 -struct hci_cp_create_phy_link { - __u8 phy_handle; - __u8 key_len; - __u8 key_type; - __u8 key[HCI_AMP_LINK_KEY_SIZE]; -} __packed; - -#define HCI_OP_ACCEPT_PHY_LINK 0x0436 -struct hci_cp_accept_phy_link { - __u8 phy_handle; - __u8 key_len; - __u8 key_type; - __u8 key[HCI_AMP_LINK_KEY_SIZE]; -} __packed; - -#define HCI_OP_DISCONN_PHY_LINK 0x0437 -struct hci_cp_disconn_phy_link { - __u8 phy_handle; - __u8 reason; -} __packed; - -struct ext_flow_spec { - __u8 id; - __u8 stype; - __le16 msdu; - __le32 sdu_itime; - __le32 acc_lat; - __le32 flush_to; -} __packed; - -#define HCI_OP_CREATE_LOGICAL_LINK 0x0438 -#define HCI_OP_ACCEPT_LOGICAL_LINK 0x0439 -struct hci_cp_create_accept_logical_link { - __u8 phy_handle; - struct ext_flow_spec tx_flow_spec; - struct ext_flow_spec rx_flow_spec; -} __packed; - -#define HCI_OP_DISCONN_LOGICAL_LINK 0x043a -struct hci_cp_disconn_logical_link { - __le16 log_handle; -} __packed; - -#define HCI_OP_LOGICAL_LINK_CANCEL 0x043b -struct hci_cp_logical_link_cancel { - __u8 phy_handle; - __u8 flow_spec_id; -} __packed; - #define HCI_OP_ENHANCED_SETUP_SYNC_CONN 0x043d struct hci_coding_format { __u8 id; @@ -1551,6 +1567,11 @@ struct hci_rp_read_tx_power { __s8 tx_power; } __packed; +#define HCI_OP_WRITE_SYNC_FLOWCTL 0x0c2f +struct hci_cp_write_sync_flowctl { + __u8 enable; +} __packed; + #define HCI_OP_READ_PAGE_SCAN_TYPE 0x0c46 struct hci_rp_read_page_scan_type { __u8 status; @@ -1593,46 +1614,6 @@ struct hci_rp_read_enc_key_size { __u8 key_size; } __packed; -#define HCI_OP_READ_LOCAL_AMP_INFO 0x1409 -struct hci_rp_read_local_amp_info { - __u8 status; - __u8 amp_status; - __le32 total_bw; - __le32 max_bw; - __le32 min_latency; - __le32 max_pdu; - __u8 amp_type; - __le16 pal_cap; - __le16 max_assoc_size; - __le32 max_flush_to; - __le32 be_flush_to; -} __packed; - -#define HCI_OP_READ_LOCAL_AMP_ASSOC 0x140a -struct hci_cp_read_local_amp_assoc { - __u8 phy_handle; - __le16 len_so_far; - __le16 max_len; -} __packed; -struct hci_rp_read_local_amp_assoc { - __u8 status; - __u8 phy_handle; - __le16 rem_len; - __u8 frag[]; -} __packed; - -#define HCI_OP_WRITE_REMOTE_AMP_ASSOC 0x140b -struct hci_cp_write_remote_amp_assoc { - __u8 phy_handle; - __le16 len_so_far; - __le16 rem_len; - __u8 frag[]; -} __packed; -struct hci_rp_write_remote_amp_assoc { - __u8 status; - __u8 phy_handle; -} __packed; - #define HCI_OP_GET_MWS_TRANSPORT_CONFIG 0x140c #define HCI_OP_ENABLE_DUT_MODE 0x1803 @@ -1644,6 +1625,15 @@ struct hci_cp_le_set_event_mask { __u8 mask[8]; } __packed; +/* BLUETOOTH CORE SPECIFICATION Version 5.4 | Vol 4, Part E + * 7.8.2 LE Read Buffer Size command + * MAX_LE_MTU is 0xffff. + * 0 is also valid. It means that no dedicated LE Buffer exists. + * It should use the HCI_Read_Buffer_Size command and mtu is shared + * between BR/EDR and LE. + */ +#define HCI_MIN_LE_MTU 0x001b + #define HCI_OP_LE_READ_BUFFER_SIZE 0x2002 struct hci_rp_le_read_buffer_size { __u8 status; @@ -1951,6 +1941,8 @@ struct hci_cp_le_pa_create_sync { __u8 sync_cte_type; } __packed; +#define HCI_OP_LE_PA_CREATE_SYNC_CANCEL 0x2045 + #define HCI_OP_LE_PA_TERM_SYNC 0x2046 struct hci_cp_le_pa_term_sync { __le16 handle; @@ -2004,7 +1996,7 @@ struct hci_cp_le_set_ext_adv_data { __u8 operation; __u8 frag_pref; __u8 length; - __u8 data[]; + __u8 data[] __counted_by(length); } __packed; #define HCI_OP_LE_SET_EXT_SCAN_RSP_DATA 0x2038 @@ -2013,7 +2005,7 @@ struct hci_cp_le_set_ext_scan_rsp_data { __u8 operation; __u8 frag_pref; __u8 length; - __u8 data[]; + __u8 data[] __counted_by(length); } __packed; #define HCI_OP_LE_SET_EXT_ADV_ENABLE 0x2039 @@ -2032,13 +2024,14 @@ struct hci_cp_le_set_per_adv_params { } __packed; #define HCI_MAX_PER_AD_LENGTH 252 +#define HCI_MAX_PER_AD_TOT_LEN 1650 #define HCI_OP_LE_SET_PER_ADV_DATA 0x203f struct hci_cp_le_set_per_adv_data { __u8 handle; __u8 operation; __u8 length; - __u8 data[]; + __u8 data[] __counted_by(length); } __packed; #define HCI_OP_LE_SET_PER_ADV_ENABLE 0x2040 @@ -2078,6 +2071,44 @@ struct hci_cp_le_set_privacy_mode { __u8 mode; } __packed; +#define HCI_OP_LE_PAST 0x205a +struct hci_cp_le_past { + __le16 handle; + __le16 service_data; + __le16 sync_handle; +} __packed; + +struct hci_rp_le_past { + __u8 status; + __le16 handle; +} __packed; + +#define HCI_OP_LE_PAST_SET_INFO 0x205b +struct hci_cp_le_past_set_info { + __le16 handle; + __le16 service_data; + __u8 adv_handle; +} __packed; + +struct hci_rp_le_past_set_info { + __u8 status; + __le16 handle; +} __packed; + +#define HCI_OP_LE_PAST_PARAMS 0x205c +struct hci_cp_le_past_params { + __le16 handle; + __u8 mode; + __le16 skip; + __le16 sync_timeout; + __u8 cte_type; +} __packed; + +struct hci_rp_le_past_params { + __u8 status; + __le16 handle; +} __packed; + #define HCI_OP_LE_READ_BUFFER_SIZE_V2 0x2060 struct hci_rp_le_read_buffer_size_v2 { __u8 status; @@ -2121,7 +2152,7 @@ struct hci_cp_le_set_cig_params { __le16 c_latency; __le16 p_latency; __u8 num_cis; - struct hci_cis_params cis[]; + struct hci_cis_params cis[] __counted_by(num_cis); } __packed; struct hci_rp_le_set_cig_params { @@ -2139,7 +2170,7 @@ struct hci_cis { struct hci_cp_le_create_cis { __u8 num_cis; - struct hci_cis cis[]; + struct hci_cis cis[] __counted_by(num_cis); } __packed; #define HCI_OP_LE_REMOVE_CIG 0x2065 @@ -2193,7 +2224,7 @@ struct hci_cp_le_big_create_sync { __u8 mse; __le16 timeout; __u8 num_bis; - __u8 bis[]; + __u8 bis[] __counted_by(num_bis); } __packed; #define HCI_OP_LE_BIG_TERM_SYNC 0x206c @@ -2225,6 +2256,19 @@ struct hci_cp_le_set_host_feature { __u8 bit_value; } __packed; +#define HCI_OP_LE_READ_ALL_LOCAL_FEATURES 0x2087 +struct hci_rp_le_read_all_local_features { + __u8 status; + __u8 page; + __u8 features[248]; +} __packed; + +#define HCI_OP_LE_READ_ALL_REMOTE_FEATURES 0x2088 +struct hci_cp_le_read_all_remote_features { + __le16 handle; + __u8 pages; +} __packed; + /* ---- HCI Events ---- */ struct hci_ev_status { __u8 status; @@ -2647,6 +2691,7 @@ struct hci_ev_le_conn_complete { #define LE_EXT_ADV_DIRECT_IND 0x0004 #define LE_EXT_ADV_SCAN_RSP 0x0008 #define LE_EXT_ADV_LEGACY_PDU 0x0010 +#define LE_EXT_ADV_DATA_STATUS_MASK 0x0060 #define LE_EXT_ADV_EVT_TYPE_MASK 0x007f #define ADDR_LE_DEV_PUBLIC 0x00 @@ -2792,6 +2837,15 @@ struct hci_ev_le_per_adv_report { __u8 data[]; } __packed; +#define HCI_EV_LE_PA_SYNC_LOST 0x10 +struct hci_ev_le_pa_sync_lost { + __le16 handle; +} __packed; + +#define LE_PA_DATA_COMPLETE 0x00 +#define LE_PA_DATA_MORE_TO_COME 0x01 +#define LE_PA_DATA_TRUNCATED 0x02 + #define HCI_EV_LE_EXT_ADV_SET_TERM 0x12 struct hci_evt_le_ext_adv_set_term { __u8 status; @@ -2800,6 +2854,20 @@ struct hci_evt_le_ext_adv_set_term { __u8 num_evts; } __packed; +#define HCI_EV_LE_PAST_RECEIVED 0x18 +struct hci_ev_le_past_received { + __u8 status; + __le16 handle; + __le16 service_data; + __le16 sync_handle; + __u8 sid; + __u8 bdaddr_type; + bdaddr_t bdaddr; + __u8 phy; + __le16 interval; + __u8 clock_accuracy; +} __packed; + #define HCI_EVT_LE_CIS_ESTABLISHED 0x19 struct hci_evt_le_cis_established { __u8 status; @@ -2845,8 +2913,8 @@ struct hci_evt_le_create_big_complete { __le16 bis_handle[]; } __packed; -#define HCI_EVT_LE_BIG_SYNC_ESTABILISHED 0x1d -struct hci_evt_le_big_sync_estabilished { +#define HCI_EVT_LE_BIG_SYNC_ESTABLISHED 0x1d +struct hci_evt_le_big_sync_established { __u8 status; __u8 handle; __u8 latency[3]; @@ -2860,6 +2928,12 @@ struct hci_evt_le_big_sync_estabilished { __le16 bis[]; } __packed; +#define HCI_EVT_LE_BIG_SYNC_LOST 0x1e +struct hci_evt_le_big_sync_lost { + __u8 handle; + __u8 reason; +} __packed; + #define HCI_EVT_LE_BIG_INFO_ADV_REPORT 0x22 struct hci_evt_le_big_info_adv_report { __le16 sync_handle; @@ -2877,6 +2951,15 @@ struct hci_evt_le_big_info_adv_report { __u8 encryption; } __packed; +#define HCI_EVT_LE_ALL_REMOTE_FEATURES_COMPLETE 0x2b +struct hci_evt_le_read_all_remote_features_complete { + __u8 status; + __le16 handle; + __u8 max_pages; + __u8 valid_pages; + __u8 features[248]; +} __packed; + #define HCI_EV_VENDOR 0xff /* Internal events generated by Bluetooth stack */ @@ -2966,6 +3049,11 @@ static inline struct hci_sco_hdr *hci_sco_hdr(const struct sk_buff *skb) return (struct hci_sco_hdr *) skb->data; } +static inline struct hci_iso_hdr *hci_iso_hdr(const struct sk_buff *skb) +{ + return (struct hci_iso_hdr *)skb->data; +} + /* Command opcode pack/unpack */ #define hci_opcode_pack(ogf, ocf) ((__u16) ((ocf & 0x03ff)|(ogf << 10))) #define hci_opcode_ogf(op) (op >> 10) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index e6359f7346f1..4263e71a23ef 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1,7 +1,7 @@ /* BlueZ - Bluetooth protocol stack for Linux Copyright (c) 2000-2001, 2010, Code Aurora Forum. All rights reserved. - Copyright 2023 NXP + Copyright 2023-2024 NXP Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com> @@ -29,8 +29,11 @@ #include <linux/idr.h> #include <linux/leds.h> #include <linux/rculist.h> +#include <linux/spinlock.h> +#include <linux/srcu.h> #include <net/bluetooth/hci.h> +#include <net/bluetooth/hci_drv.h> #include <net/bluetooth/hci_sync.h> #include <net/bluetooth/hci_sock.h> #include <net/bluetooth/coredump.h> @@ -91,9 +94,8 @@ struct discovery_state { s8 rssi; u16 uuid_count; u8 (*uuids)[16]; - unsigned long scan_start; - unsigned long scan_duration; unsigned long name_resolve_timeout; + spinlock_t lock; }; #define SUSPEND_NOTIFIER_TIMEOUT msecs_to_jiffies(2000) /* 2 seconds */ @@ -126,9 +128,10 @@ enum suspended_state { struct hci_conn_hash { struct list_head list; unsigned int acl_num; - unsigned int amp_num; unsigned int sco_num; - unsigned int iso_num; + unsigned int cis_num; + unsigned int bis_num; + unsigned int pa_num; unsigned int le_num; unsigned int le_num_peripheral; }; @@ -160,8 +163,10 @@ struct bdaddr_list_with_irk { /* Bitmask of connection flags */ enum hci_conn_flags { - HCI_CONN_FLAG_REMOTE_WAKEUP = 1, - HCI_CONN_FLAG_DEVICE_PRIVACY = 2, + HCI_CONN_FLAG_REMOTE_WAKEUP = BIT(0), + HCI_CONN_FLAG_DEVICE_PRIVACY = BIT(1), + HCI_CONN_FLAG_ADDRESS_RESOLUTION = BIT(2), + HCI_CONN_FLAG_PAST = BIT(3), }; typedef u8 hci_conn_flags_t; @@ -240,8 +245,11 @@ struct adv_info { bool enabled; bool pending; bool periodic; + bool periodic_enabled; __u8 mesh; __u8 instance; + __u8 handle; + __u8 sid; __u32 flags; __u16 timeout; __u16 remaining_time; @@ -262,6 +270,12 @@ struct adv_info { struct delayed_work rpa_expired_cb; }; +struct tx_queue { + struct sk_buff_head queue; + unsigned int extra; + unsigned int tracked; +}; + #define HCI_MAX_ADV_INSTANCES 5 #define HCI_DEFAULT_ADV_DURATION 2 @@ -336,25 +350,19 @@ struct adv_monitor { /* Default authenticated payload timeout 30s */ #define DEFAULT_AUTH_PAYLOAD_TIMEOUT 0x0bb8 -struct amp_assoc { - __u16 len; - __u16 offset; - __u16 rem_len; - __u16 len_so_far; - __u8 data[HCI_MAX_AMP_ASSOC_SIZE]; -}; - #define HCI_MAX_PAGES 3 struct hci_dev { struct list_head list; + struct srcu_struct srcu; struct mutex lock; - char name[8]; + struct ida unset_handle_ida; + + const char *name; unsigned long flags; __u16 id; __u8 bus; - __u8 dev_type; bdaddr_t bdaddr; bdaddr_t setup_addr; bdaddr_t public_addr; @@ -370,7 +378,7 @@ struct hci_dev { __u8 minor_class; __u8 max_page; __u8 features[HCI_MAX_PAGES][8]; - __u8 le_features[8]; + __u8 le_features[248]; __u8 le_accept_list_size; __u8 le_resolv_list_size; __u8 le_num_of_adv_sets; @@ -460,24 +468,9 @@ struct hci_dev { __u16 sniff_min_interval; __u16 sniff_max_interval; - __u8 amp_status; - __u32 amp_total_bw; - __u32 amp_max_bw; - __u32 amp_min_latency; - __u32 amp_max_pdu; - __u8 amp_type; - __u16 amp_pal_cap; - __u16 amp_assoc_size; - __u32 amp_max_flush_to; - __u32 amp_be_flush_to; - - struct amp_assoc loc_assoc; - - __u8 flow_ctl_mode; - unsigned int auto_accept_delay; - unsigned long quirks; + DECLARE_BITMAP(quirk_flags, __HCI_NUM_QUIRKS); atomic_t cmd_cnt; unsigned int acl_cnt; @@ -494,14 +487,9 @@ struct hci_dev { unsigned int le_pkts; unsigned int iso_pkts; - __u16 block_len; - __u16 block_mtu; - __u16 num_blocks; - __u16 block_cnt; - unsigned long acl_last_tx; - unsigned long sco_last_tx; unsigned long le_last_tx; + unsigned long iso_last_tx; __u8 le_tx_def_phys; __u8 le_rx_def_phys; @@ -532,7 +520,6 @@ struct hci_dev { struct work_struct tx_work; struct delayed_work le_scan_disable; - struct delayed_work le_scan_restart; struct sk_buff_head rx_q; struct sk_buff_head raw_q; @@ -546,13 +533,13 @@ struct hci_dev { __u32 req_status; __u32 req_result; struct sk_buff *req_skb; + struct sk_buff *req_rsp; void *smp_data; void *smp_bredr_data; struct discovery_state discovery; - int discovery_old_state; bool discovery_paused; int advertising_old_state; bool advertising_paused; @@ -569,6 +556,7 @@ struct hci_dev { struct hci_conn_hash conn_hash; struct list_head mesh_pending; + struct mutex mgmt_pending_lock; struct list_head mgmt_pending; struct list_head reject_list; struct list_head accept_list; @@ -637,6 +625,8 @@ struct hci_dev { struct list_head monitored_devices; bool advmon_pend_notify; + struct hci_drv *hci_drv; + #if IS_ENABLED(CONFIG_BT_LEDS) struct led_trigger *power_led; #endif @@ -663,7 +653,6 @@ struct hci_dev { int (*post_init)(struct hci_dev *hdev); int (*set_diag)(struct hci_dev *hdev, bool enable); int (*set_bdaddr)(struct hci_dev *hdev, const bdaddr_t *bdaddr); - void (*cmd_timeout)(struct hci_dev *hdev); void (*reset)(struct hci_dev *hdev); bool (*wakeup)(struct hci_dev *hdev); int (*set_quality_report)(struct hci_dev *hdev, bool enable); @@ -671,8 +660,13 @@ struct hci_dev { int (*get_codec_config_data)(struct hci_dev *hdev, __u8 type, struct bt_codec *codec, __u8 *vnd_len, __u8 **vnd_data); + u8 (*classify_pkt_type)(struct hci_dev *hdev, struct sk_buff *skb); }; +#define hci_set_quirk(hdev, nr) set_bit((nr), (hdev)->quirk_flags) +#define hci_clear_quirk(hdev, nr) clear_bit((nr), (hdev)->quirk_flags) +#define hci_test_quirk(hdev, nr) test_bit((nr), (hdev)->quirk_flags) + #define HCI_PHY_HANDLE(handle) (handle & 0xff) enum conn_reasons { @@ -698,7 +692,9 @@ struct hci_conn { __u8 adv_instance; __u16 handle; __u16 sync_handle; + __u8 sid; __u16 state; + __u16 mtu; __u8 mode; __u8 type; __u8 role; @@ -706,6 +702,7 @@ struct hci_conn { __u8 attempt; __u8 dev_class[3]; __u8 features[HCI_MAX_PAGES][8]; + __u8 le_features[248]; __u16 pkt_type; __u16 link_policy; __u8 key_type; @@ -728,14 +725,20 @@ struct hci_conn { __u16 le_supv_timeout; __u8 le_adv_data[HCI_MAX_EXT_AD_LENGTH]; __u8 le_adv_data_len; - __u8 le_per_adv_data[HCI_MAX_PER_AD_LENGTH]; - __u8 le_per_adv_data_len; + __u8 le_per_adv_data[HCI_MAX_PER_AD_TOT_LEN]; + __u16 le_per_adv_data_len; + __u16 le_per_adv_data_offset; + __u8 le_adv_phy; + __u8 le_adv_sec_phy; __u8 le_tx_phy; __u8 le_rx_phy; __s8 rssi; __s8 tx_power; __s8 max_tx_power; struct bt_iso_qos iso_qos; + __u8 num_bis; + __u8 bis[HCI_MAX_ISO_BIS]; + unsigned long flags; enum conn_reasons conn_reason; @@ -748,13 +751,14 @@ struct hci_conn { __u8 remote_cap; __u8 remote_auth; - __u8 remote_id; unsigned int sent; struct sk_buff_head data_q; struct list_head chan_list; + struct tx_queue tx_q; + struct delayed_work disc_work; struct delayed_work auto_accept_work; struct delayed_work idle_work; @@ -767,7 +771,6 @@ struct hci_conn { void *l2cap_data; void *sco_data; void *iso_data; - struct amp_mgr *amp_mgr; struct list_head link_list; struct hci_conn *parent; @@ -794,7 +797,6 @@ struct hci_chan { struct sk_buff_head data_q; unsigned int sent; __u8 state; - bool amp; }; struct hci_conn_params { @@ -838,29 +840,30 @@ extern struct mutex hci_cb_list_lock; #define hci_dev_test_and_clear_flag(hdev, nr) test_and_clear_bit((nr), (hdev)->dev_flags) #define hci_dev_test_and_change_flag(hdev, nr) test_and_change_bit((nr), (hdev)->dev_flags) -#define hci_dev_clear_volatile_flags(hdev) \ - do { \ - hci_dev_clear_flag(hdev, HCI_LE_SCAN); \ - hci_dev_clear_flag(hdev, HCI_LE_ADV); \ - hci_dev_clear_flag(hdev, HCI_LL_RPA_RESOLUTION);\ - hci_dev_clear_flag(hdev, HCI_PERIODIC_INQ); \ - hci_dev_clear_flag(hdev, HCI_QUALITY_REPORT); \ +#define hci_dev_clear_volatile_flags(hdev) \ + do { \ + hci_dev_clear_flag((hdev), HCI_LE_SCAN); \ + hci_dev_clear_flag((hdev), HCI_LE_ADV); \ + hci_dev_clear_flag((hdev), HCI_LL_RPA_RESOLUTION); \ + hci_dev_clear_flag((hdev), HCI_PERIODIC_INQ); \ + hci_dev_clear_flag((hdev), HCI_QUALITY_REPORT); \ } while (0) #define hci_dev_le_state_simultaneous(hdev) \ - (test_bit(HCI_QUIRK_VALID_LE_STATES, &hdev->quirks) && \ - (hdev->le_states[4] & 0x08) && /* Central */ \ - (hdev->le_states[4] & 0x40) && /* Peripheral */ \ - (hdev->le_states[3] & 0x10)) /* Simultaneous */ + (!hci_test_quirk((hdev), HCI_QUIRK_BROKEN_LE_STATES) && \ + ((hdev)->le_states[4] & 0x08) && /* Central */ \ + ((hdev)->le_states[4] & 0x40) && /* Peripheral */ \ + ((hdev)->le_states[3] & 0x10)) /* Simultaneous */ /* ----- HCI interface to upper protocols ----- */ int l2cap_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr); int l2cap_disconn_ind(struct hci_conn *hcon); -void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags); +int l2cap_recv_acldata(struct hci_dev *hdev, u16 handle, struct sk_buff *skb, + u16 flags); #if IS_ENABLED(CONFIG_BT_BREDR) int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags); -void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb); +int sco_recv_scodata(struct hci_dev *hdev, u16 handle, struct sk_buff *skb); #else static inline int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) @@ -868,23 +871,30 @@ static inline int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, return 0; } -static inline void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb) +static inline int sco_recv_scodata(struct hci_dev *hdev, u16 handle, + struct sk_buff *skb) { + kfree_skb(skb); + return -ENOENT; } #endif #if IS_ENABLED(CONFIG_BT_LE) int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags); -void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags); +int iso_recv(struct hci_dev *hdev, u16 handle, struct sk_buff *skb, + u16 flags); #else static inline int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) { return 0; } -static inline void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, - u16 flags) + +static inline int iso_recv(struct hci_dev *hdev, u16 handle, + struct sk_buff *skb, u16 flags) { + kfree_skb(skb); + return -ENOENT; } #endif @@ -894,6 +904,7 @@ static inline void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, static inline void discovery_init(struct hci_dev *hdev) { + spin_lock_init(&hdev->discovery.lock); hdev->discovery.state = DISCOVERY_STOPPED; INIT_LIST_HEAD(&hdev->discovery.all); INIT_LIST_HEAD(&hdev->discovery.unknown); @@ -908,10 +919,11 @@ static inline void hci_discovery_filter_clear(struct hci_dev *hdev) hdev->discovery.report_invalid_rssi = true; hdev->discovery.rssi = HCI_RSSI_INVALID; hdev->discovery.uuid_count = 0; + + spin_lock(&hdev->discovery.lock); kfree(hdev->discovery.uuids); hdev->discovery.uuids = NULL; - hdev->discovery.scan_start = 0; - hdev->discovery.scan_duration = 0; + spin_unlock(&hdev->discovery.lock); } bool hci_discovery_active(struct hci_dev *hdev); @@ -950,7 +962,6 @@ void hci_inquiry_cache_flush(struct hci_dev *hdev); /* ----- HCI Connections ----- */ enum { HCI_CONN_AUTH_PEND, - HCI_CONN_REAUTH_PEND, HCI_CONN_ENCRYPT_PEND, HCI_CONN_RSWITCH_PEND, HCI_CONN_MODE_CHANGE_PEND, @@ -976,8 +987,10 @@ enum { HCI_CONN_PER_ADV, HCI_CONN_BIG_CREATED, HCI_CONN_CREATE_CIS, + HCI_CONN_CREATE_BIG_SYNC, HCI_CONN_BIG_SYNC, HCI_CONN_BIG_SYNC_FAILED, + HCI_CONN_CREATE_PA_SYNC, HCI_CONN_PA_SYNC, HCI_CONN_PA_SYNC_FAILED, }; @@ -1004,9 +1017,6 @@ static inline void hci_conn_hash_add(struct hci_dev *hdev, struct hci_conn *c) case ACL_LINK: h->acl_num++; break; - case AMP_LINK: - h->amp_num++; - break; case LE_LINK: h->le_num++; if (c->role == HCI_ROLE_SLAVE) @@ -1016,8 +1026,14 @@ static inline void hci_conn_hash_add(struct hci_dev *hdev, struct hci_conn *c) case ESCO_LINK: h->sco_num++; break; - case ISO_LINK: - h->iso_num++; + case CIS_LINK: + h->cis_num++; + break; + case BIS_LINK: + h->bis_num++; + break; + case PA_LINK: + h->pa_num++; break; } } @@ -1033,9 +1049,6 @@ static inline void hci_conn_hash_del(struct hci_dev *hdev, struct hci_conn *c) case ACL_LINK: h->acl_num--; break; - case AMP_LINK: - h->amp_num--; - break; case LE_LINK: h->le_num--; if (c->role == HCI_ROLE_SLAVE) @@ -1045,8 +1058,14 @@ static inline void hci_conn_hash_del(struct hci_dev *hdev, struct hci_conn *c) case ESCO_LINK: h->sco_num--; break; - case ISO_LINK: - h->iso_num--; + case CIS_LINK: + h->cis_num--; + break; + case BIS_LINK: + h->bis_num--; + break; + case PA_LINK: + h->pa_num--; break; } } @@ -1057,15 +1076,17 @@ static inline unsigned int hci_conn_num(struct hci_dev *hdev, __u8 type) switch (type) { case ACL_LINK: return h->acl_num; - case AMP_LINK: - return h->amp_num; case LE_LINK: return h->le_num; case SCO_LINK: case ESCO_LINK: return h->sco_num; - case ISO_LINK: - return h->iso_num; + case CIS_LINK: + return h->cis_num; + case BIS_LINK: + return h->bis_num; + case PA_LINK: + return h->pa_num; default: return 0; } @@ -1075,7 +1096,33 @@ static inline unsigned int hci_conn_count(struct hci_dev *hdev) { struct hci_conn_hash *c = &hdev->conn_hash; - return c->acl_num + c->amp_num + c->sco_num + c->le_num + c->iso_num; + return c->acl_num + c->sco_num + c->le_num + c->cis_num + c->bis_num + + c->pa_num; +} + +static inline unsigned int hci_iso_count(struct hci_dev *hdev) +{ + struct hci_conn_hash *c = &hdev->conn_hash; + + return c->cis_num + c->bis_num; +} + +static inline bool hci_conn_valid(struct hci_dev *hdev, struct hci_conn *conn) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c == conn) { + rcu_read_unlock(); + return true; + } + } + rcu_read_unlock(); + + return false; } static inline __u8 hci_conn_lookup_type(struct hci_dev *hdev, __u16 handle) @@ -1107,7 +1154,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_bis(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (bacmp(&c->dst, ba) || c->type != ISO_LINK) + if (bacmp(&c->dst, ba) || c->type != BIS_LINK) continue; if (c->iso_qos.bcast.bis == bis) { @@ -1121,6 +1168,30 @@ static inline struct hci_conn *hci_conn_hash_lookup_bis(struct hci_dev *hdev, } static inline struct hci_conn * +hci_conn_hash_lookup_create_pa_sync(struct hci_dev *hdev) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type != PA_LINK) + continue; + + if (!test_bit(HCI_CONN_CREATE_PA_SYNC, &c->flags)) + continue; + + rcu_read_unlock(); + return c; + } + + rcu_read_unlock(); + + return NULL; +} + +static inline struct hci_conn * hci_conn_hash_lookup_per_adv_bis(struct hci_dev *hdev, bdaddr_t *ba, __u8 big, __u8 bis) @@ -1131,8 +1202,8 @@ hci_conn_hash_lookup_per_adv_bis(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (bacmp(&c->dst, ba) || c->type != ISO_LINK || - !test_bit(HCI_CONN_PER_ADV, &c->flags)) + if (bacmp(&c->dst, ba) || c->type != BIS_LINK || + !test_bit(HCI_CONN_PER_ADV, &c->flags)) continue; if (c->iso_qos.bcast.big == big && @@ -1185,6 +1256,27 @@ static inline struct hci_conn *hci_conn_hash_lookup_ba(struct hci_dev *hdev, return NULL; } +static inline struct hci_conn *hci_conn_hash_lookup_role(struct hci_dev *hdev, + __u8 type, __u8 role, + bdaddr_t *ba) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type == type && c->role == role && !bacmp(&c->dst, ba)) { + rcu_read_unlock(); + return c; + } + } + + rcu_read_unlock(); + + return NULL; +} + static inline struct hci_conn *hci_conn_hash_lookup_le(struct hci_dev *hdev, bdaddr_t *ba, __u8 ba_type) @@ -1221,15 +1313,15 @@ static inline struct hci_conn *hci_conn_hash_lookup_cis(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK || !bacmp(&c->dst, BDADDR_ANY)) + if (c->type != CIS_LINK) continue; /* Match CIG ID if set */ - if (cig != BT_ISO_QOS_CIG_UNSET && cig != c->iso_qos.ucast.cig) + if (cig != c->iso_qos.ucast.cig) continue; /* Match CIS ID if set */ - if (id != BT_ISO_QOS_CIS_UNSET && id != c->iso_qos.ucast.cis) + if (id != c->iso_qos.ucast.cis) continue; /* Match destination address if set */ @@ -1253,7 +1345,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_cig(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK || !bacmp(&c->dst, BDADDR_ANY)) + if (c->type != CIS_LINK) continue; if (handle == c->iso_qos.ucast.cig) { @@ -1276,7 +1368,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_big(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (bacmp(&c->dst, BDADDR_ANY) || c->type != ISO_LINK) + if (c->type != BIS_LINK) continue; if (handle == c->iso_qos.bcast.big) { @@ -1290,8 +1382,9 @@ static inline struct hci_conn *hci_conn_hash_lookup_big(struct hci_dev *hdev, return NULL; } -static inline struct hci_conn *hci_conn_hash_lookup_big_any_dst(struct hci_dev *hdev, - __u8 handle) +static inline struct hci_conn * +hci_conn_hash_lookup_big_sync_pend(struct hci_dev *hdev, + __u8 handle, __u8 num_bis) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; @@ -1299,10 +1392,10 @@ static inline struct hci_conn *hci_conn_hash_lookup_big_any_dst(struct hci_dev * rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK) + if (c->type != PA_LINK) continue; - if (handle != BT_ISO_QOS_BIG_UNSET && handle == c->iso_qos.bcast.big) { + if (handle == c->iso_qos.bcast.big && num_bis == c->num_bis) { rcu_read_unlock(); return c; } @@ -1314,7 +1407,8 @@ static inline struct hci_conn *hci_conn_hash_lookup_big_any_dst(struct hci_dev * } static inline struct hci_conn * -hci_conn_hash_lookup_pa_sync(struct hci_dev *hdev, __u8 big) +hci_conn_hash_lookup_big_state(struct hci_dev *hdev, __u8 handle, __u16 state, + __u8 role) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; @@ -1322,22 +1416,22 @@ hci_conn_hash_lookup_pa_sync(struct hci_dev *hdev, __u8 big) rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK || - !test_bit(HCI_CONN_PA_SYNC, &c->flags)) + if (c->type != BIS_LINK || c->state != state || c->role != role) continue; - if (c->iso_qos.bcast.big == big) { + if (handle == c->iso_qos.bcast.big) { rcu_read_unlock(); return c; } } + rcu_read_unlock(); return NULL; } -static inline struct hci_conn *hci_conn_hash_lookup_state(struct hci_dev *hdev, - __u8 type, __u16 state) +static inline struct hci_conn * +hci_conn_hash_lookup_pa_sync_big_handle(struct hci_dev *hdev, __u8 big) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; @@ -1345,12 +1439,44 @@ static inline struct hci_conn *hci_conn_hash_lookup_state(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type == type && c->state == state) { + if (c->type != BIS_LINK || + !test_bit(HCI_CONN_PA_SYNC, &c->flags)) + continue; + + if (c->iso_qos.bcast.big == big) { rcu_read_unlock(); return c; } } + rcu_read_unlock(); + + return NULL; +} + +static inline struct hci_conn * +hci_conn_hash_lookup_pa_sync_handle(struct hci_dev *hdev, __u16 sync_handle) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type != PA_LINK) + continue; + /* Ignore the listen hcon, we are looking + * for the child hcon that was created as + * a result of the PA sync established event. + */ + if (c->state == BT_LISTEN) + continue; + + if (c->sync_handle == sync_handle) { + rcu_read_unlock(); + return c; + } + } rcu_read_unlock(); return NULL; @@ -1377,6 +1503,26 @@ static inline void hci_conn_hash_list_state(struct hci_dev *hdev, rcu_read_unlock(); } +static inline void hci_conn_hash_list_flag(struct hci_dev *hdev, + hci_conn_func_t func, __u8 type, + __u8 flag, void *data) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + if (!func) + return; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type == type && test_bit(flag, &c->flags)) + func(c, data); + } + + rcu_read_unlock(); +} + static inline struct hci_conn *hci_lookup_le_connect(struct hci_dev *hdev) { struct hci_conn_hash *h = &hdev->conn_hash; @@ -1426,10 +1572,11 @@ int hci_le_create_cis_pending(struct hci_dev *hdev); int hci_conn_check_create_cis(struct hci_conn *conn); struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, - u8 role); + u8 dst_type, u8 role, u16 handle); +struct hci_conn *hci_conn_add_unset(struct hci_dev *hdev, int type, + bdaddr_t *dst, u8 dst_type, u8 role); void hci_conn_del(struct hci_conn *conn); void hci_conn_hash_flush(struct hci_dev *hdev); -void hci_conn_check_pending(struct hci_dev *hdev); struct hci_chan *hci_chan_create(struct hci_conn *conn); void hci_chan_del(struct hci_chan *chan); @@ -1442,27 +1589,33 @@ struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst, enum conn_reasons conn_reason); struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, u8 dst_type, bool dst_resolved, u8 sec_level, - u16 conn_timeout, u8 role); + u16 conn_timeout, u8 role, u8 phy, u8 sec_phy); +void hci_connect_le_scan_cleanup(struct hci_conn *conn, u8 status); struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst, u8 sec_level, u8 auth_type, - enum conn_reasons conn_reason); + enum conn_reasons conn_reason, u16 timeout); struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst, - __u16 setting, struct bt_codec *codec); + __u16 setting, struct bt_codec *codec, + u16 timeout); struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, - __u8 dst_type, struct bt_iso_qos *qos); -struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, + __u8 dst_type, struct bt_iso_qos *qos, + u16 timeout); +struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, struct bt_iso_qos *qos, - __u8 base_len, __u8 *base); + __u8 base_len, __u8 *base, u16 timeout); +int hci_past_bis(struct hci_conn *conn, bdaddr_t *dst, __u8 dst_type); struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst, - __u8 dst_type, struct bt_iso_qos *qos); -struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, struct bt_iso_qos *qos, - __u8 data_len, __u8 *data); -int hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, - __u8 sid, struct bt_iso_qos *qos); -int hci_le_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon, - struct bt_iso_qos *qos, - __u16 sync_handle, __u8 num_bis, __u8 bis[]); + u16 timeout); +struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, + __u8 dst_type, __u8 sid, + struct bt_iso_qos *qos, + __u8 data_len, __u8 *data, u16 timeout); +struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, + __u8 dst_type, __u8 sid, struct bt_iso_qos *qos); +int hci_conn_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon, + struct bt_iso_qos *qos, __u16 sync_handle, + __u8 num_bis, __u8 bis[]); int hci_conn_check_link_mode(struct hci_conn *conn); int hci_conn_check_secure(struct hci_conn *conn, __u8 sec_level); int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type, @@ -1474,6 +1627,18 @@ void hci_conn_enter_active_mode(struct hci_conn *conn, __u8 force_active); void hci_conn_failed(struct hci_conn *conn, u8 status); u8 hci_conn_set_handle(struct hci_conn *conn, u16 handle); +void hci_conn_tx_queue(struct hci_conn *conn, struct sk_buff *skb); +void hci_conn_tx_dequeue(struct hci_conn *conn); +void hci_setup_tx_timestamp(struct sk_buff *skb, size_t key_offset, + const struct sockcm_cookie *sockc); + +static inline void hci_sockcm_init(struct sockcm_cookie *sockc, struct sock *sk) +{ + *sockc = (struct sockcm_cookie) { + .tsflags = READ_ONCE(sk->sk_tsflags), + }; +} + /* * hci_conn_get() and hci_conn_put() are used to control the life-time of an * "hci_conn" object. They do not guarantee that the hci_conn object is running, @@ -1536,10 +1701,6 @@ static inline void hci_conn_drop(struct hci_conn *conn) } break; - case AMP_LINK: - timeo = conn->disc_timeout; - break; - default: timeo = 0; break; @@ -1666,8 +1827,6 @@ int hci_bdaddr_list_add_with_flags(struct list_head *list, bdaddr_t *bdaddr, int hci_bdaddr_list_del(struct list_head *list, bdaddr_t *bdaddr, u8 type); int hci_bdaddr_list_del_with_irk(struct list_head *list, bdaddr_t *bdaddr, u8 type); -int hci_bdaddr_list_del_with_flags(struct list_head *list, bdaddr_t *bdaddr, - u8 type); void hci_bdaddr_list_clear(struct list_head *list); struct hci_conn_params *hci_conn_params_lookup(struct hci_dev *hdev, @@ -1688,6 +1847,7 @@ struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list, void hci_uuids_clear(struct hci_dev *hdev); void hci_link_keys_clear(struct hci_dev *hdev); +u8 *hci_conn_key_enc_size(struct hci_conn *conn); struct link_key *hci_find_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr); struct link_key *hci_add_link_key(struct hci_dev *hdev, struct hci_conn *conn, bdaddr_t *bdaddr, u8 *val, u8 type, @@ -1724,6 +1884,7 @@ int hci_remove_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr, void hci_adv_instances_clear(struct hci_dev *hdev); struct adv_info *hci_find_adv_instance(struct hci_dev *hdev, u8 instance); +struct adv_info *hci_find_adv_sid(struct hci_dev *hdev, u8 sid); struct adv_info *hci_get_next_instance(struct hci_dev *hdev, u8 instance); struct adv_info *hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, u16 adv_data_len, u8 *adv_data, @@ -1731,7 +1892,7 @@ struct adv_info *hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u16 timeout, u16 duration, s8 tx_power, u32 min_interval, u32 max_interval, u8 mesh_handle); -struct adv_info *hci_add_per_instance(struct hci_dev *hdev, u8 instance, +struct adv_info *hci_add_per_instance(struct hci_dev *hdev, u8 instance, u8 sid, u32 flags, u8 data_len, u8 *data, u32 min_interval, u32 max_interval); int hci_set_adv_instance_data(struct hci_dev *hdev, u8 instance, @@ -1766,6 +1927,7 @@ void hci_conn_del_sysfs(struct hci_conn *conn); #define lmp_hold_capable(dev) ((dev)->features[0][0] & LMP_HOLD) #define lmp_sniff_capable(dev) ((dev)->features[0][0] & LMP_SNIFF) #define lmp_park_capable(dev) ((dev)->features[0][1] & LMP_PARK) +#define lmp_sco_capable(dev) ((dev)->features[0][1] & LMP_SCO) #define lmp_inq_rssi_capable(dev) ((dev)->features[0][3] & LMP_RSSI_INQ) #define lmp_esco_capable(dev) ((dev)->features[0][3] & LMP_ESCO) #define lmp_bredr_capable(dev) (!((dev)->features[0][4] & LMP_NO_BREDR)) @@ -1808,6 +1970,8 @@ void hci_conn_del_sysfs(struct hci_conn *conn); !hci_dev_test_flag(dev, HCI_RPA_EXPIRED)) #define adv_rpa_valid(adv) (bacmp(&adv->random_addr, BDADDR_ANY) && \ !adv->rpa_expired) +#define le_enabled(dev) (lmp_le_capable(dev) && \ + hci_dev_test_flag(dev, HCI_LE_ENABLED)) #define scan_1m(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_1M) || \ ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_1M)) @@ -1818,36 +1982,41 @@ void hci_conn_del_sysfs(struct hci_conn *conn); ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_2M)) #define le_coded_capable(dev) (((dev)->le_features[1] & HCI_LE_PHY_CODED) && \ - !test_bit(HCI_QUIRK_BROKEN_LE_CODED, \ - &(dev)->quirks)) + !hci_test_quirk((dev), \ + HCI_QUIRK_BROKEN_LE_CODED)) #define scan_coded(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_CODED) || \ ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_CODED)) #define ll_privacy_capable(dev) ((dev)->le_features[0] & HCI_LE_LL_PRIVACY) +#define ll_privacy_enabled(dev) (le_enabled(dev) && ll_privacy_capable(dev)) -/* Use LL Privacy based address resolution if supported */ -#define use_ll_privacy(dev) (ll_privacy_capable(dev) && \ - hci_dev_test_flag(dev, HCI_ENABLE_LL_PRIVACY)) +#define privacy_mode_capable(dev) (ll_privacy_capable(dev) && \ + ((dev)->commands[39] & 0x04)) -#define privacy_mode_capable(dev) (use_ll_privacy(dev) && \ - (hdev->commands[39] & 0x04)) +#define read_key_size_capable(dev) \ + ((dev)->commands[20] & 0x10 && \ + !hci_test_quirk((dev), HCI_QUIRK_BROKEN_READ_ENC_KEY_SIZE)) + +#define read_voice_setting_capable(dev) \ + ((dev)->commands[9] & 0x04 && \ + !hci_test_quirk((dev), HCI_QUIRK_BROKEN_READ_VOICE_SETTING)) /* Use enhanced synchronous connection if command is supported and its quirk * has not been set. */ #define enhanced_sync_conn_capable(dev) \ (((dev)->commands[29] & 0x08) && \ - !test_bit(HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN, &(dev)->quirks)) + !hci_test_quirk((dev), HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN)) /* Use ext scanning if set ext scan param and ext scan enable is supported */ #define use_ext_scan(dev) (((dev)->commands[37] & 0x20) && \ ((dev)->commands[37] & 0x40) && \ - !test_bit(HCI_QUIRK_BROKEN_EXT_SCAN, &(dev)->quirks)) + !hci_test_quirk((dev), HCI_QUIRK_BROKEN_EXT_SCAN)) /* Use ext create connection if command is supported */ -#define use_ext_conn(dev) ((dev)->commands[37] & 0x80) - +#define use_ext_conn(dev) (((dev)->commands[37] & 0x80) && \ + !hci_test_quirk((dev), HCI_QUIRK_BROKEN_EXT_CREATE_CONN)) /* Extended advertising support */ #define ext_adv_capable(dev) (((dev)->le_features[1] & HCI_LE_EXT_ADV)) @@ -1860,25 +2029,50 @@ void hci_conn_del_sysfs(struct hci_conn *conn); * C24: Mandatory if the LE Controller supports Connection State and either * LE Feature (LL Privacy) or LE Feature (Extended Advertising) is supported */ -#define use_enhanced_conn_complete(dev) (ll_privacy_capable(dev) || \ - ext_adv_capable(dev)) +#define use_enhanced_conn_complete(dev) ((ll_privacy_capable(dev) || \ + ext_adv_capable(dev)) && \ + !hci_test_quirk((dev), \ + HCI_QUIRK_BROKEN_EXT_CREATE_CONN)) /* Periodic advertising support */ #define per_adv_capable(dev) (((dev)->le_features[1] & HCI_LE_PERIODIC_ADV)) /* CIS Master/Slave and BIS support */ #define iso_capable(dev) (cis_capable(dev) || bis_capable(dev)) +#define iso_enabled(dev) (le_enabled(dev) && iso_capable(dev)) #define cis_capable(dev) \ (cis_central_capable(dev) || cis_peripheral_capable(dev)) +#define cis_enabled(dev) (le_enabled(dev) && cis_capable(dev)) #define cis_central_capable(dev) \ ((dev)->le_features[3] & HCI_LE_CIS_CENTRAL) +#define cis_central_enabled(dev) \ + (le_enabled(dev) && cis_central_capable(dev)) #define cis_peripheral_capable(dev) \ ((dev)->le_features[3] & HCI_LE_CIS_PERIPHERAL) +#define cis_peripheral_enabled(dev) \ + (le_enabled(dev) && cis_peripheral_capable(dev)) #define bis_capable(dev) ((dev)->le_features[3] & HCI_LE_ISO_BROADCASTER) -#define sync_recv_capable(dev) ((dev)->le_features[3] & HCI_LE_ISO_SYNC_RECEIVER) +#define bis_enabled(dev) (le_enabled(dev) && bis_capable(dev)) +#define sync_recv_capable(dev) \ + ((dev)->le_features[3] & HCI_LE_ISO_SYNC_RECEIVER) +#define sync_recv_enabled(dev) (le_enabled(dev) && sync_recv_capable(dev)) +#define past_sender_capable(dev) \ + ((dev)->le_features[3] & HCI_LE_PAST_SENDER) +#define past_receiver_capable(dev) \ + ((dev)->le_features[3] & HCI_LE_PAST_RECEIVER) +#define past_capable(dev) \ + (past_sender_capable(dev) || past_receiver_capable(dev)) +#define past_sender_enabled(dev) \ + (le_enabled(dev) && past_sender_capable(dev)) +#define past_receiver_enabled(dev) \ + (le_enabled(dev) && past_receiver_capable(dev)) +#define past_enabled(dev) \ + (past_sender_enabled(dev) || past_receiver_enabled(dev)) +#define ll_ext_feature_capable(dev) \ + ((dev)->le_features[7] & HCI_LE_LL_EXT_FEATURE) #define mws_transport_config_capable(dev) (((dev)->commands[30] & 0x08) && \ - (!test_bit(HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG, &(dev)->quirks))) + (!hci_test_quirk((dev), HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG))) /* ----- HCI protocols ----- */ #define HCI_PROTO_DEFER 0x01 @@ -1894,7 +2088,9 @@ static inline int hci_proto_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, case ESCO_LINK: return sco_connect_ind(hdev, bdaddr, flags); - case ISO_LINK: + case CIS_LINK: + case BIS_LINK: + case PA_LINK: return iso_connect_ind(hdev, bdaddr, flags); default: @@ -2078,18 +2274,46 @@ static inline int hci_check_conn_params(u16 min, u16 max, u16 latency, { u16 max_latency; - if (min > max || min < 6 || max > 3200) + if (min > max) { + BT_WARN("min %d > max %d", min, max); + return -EINVAL; + } + + if (min < 6) { + BT_WARN("min %d < 6", min); + return -EINVAL; + } + + if (max > 3200) { + BT_WARN("max %d > 3200", max); + return -EINVAL; + } + + if (to_multiplier < 10) { + BT_WARN("to_multiplier %d < 10", to_multiplier); return -EINVAL; + } - if (to_multiplier < 10 || to_multiplier > 3200) + if (to_multiplier > 3200) { + BT_WARN("to_multiplier %d > 3200", to_multiplier); return -EINVAL; + } - if (max >= to_multiplier * 8) + if (max >= to_multiplier * 8) { + BT_WARN("max %d >= to_multiplier %d * 8", max, to_multiplier); return -EINVAL; + } max_latency = (to_multiplier * 4 / max) - 1; - if (latency > 499 || latency > max_latency) + if (latency > 499) { + BT_WARN("latency %d > 499", latency); return -EINVAL; + } + + if (latency > max_latency) { + BT_WARN("latency %d > max_latency %d", latency, max_latency); + return -EINVAL; + } return 0; } @@ -2157,8 +2381,22 @@ void hci_mgmt_chan_unregister(struct hci_mgmt_chan *c); /* These LE scan and inquiry parameters were chosen according to LE General * Discovery Procedure specification. */ -#define DISCOV_LE_SCAN_WIN 0x12 -#define DISCOV_LE_SCAN_INT 0x12 +#define DISCOV_LE_SCAN_WIN 0x0012 /* 11.25 msec */ +#define DISCOV_LE_SCAN_INT 0x0012 /* 11.25 msec */ +#define DISCOV_LE_SCAN_INT_FAST 0x0060 /* 60 msec */ +#define DISCOV_LE_SCAN_WIN_FAST 0x0030 /* 30 msec */ +#define DISCOV_LE_SCAN_INT_CONN 0x0060 /* 60 msec */ +#define DISCOV_LE_SCAN_WIN_CONN 0x0060 /* 60 msec */ +#define DISCOV_LE_SCAN_INT_SLOW1 0x0800 /* 1.28 sec */ +#define DISCOV_LE_SCAN_WIN_SLOW1 0x0012 /* 11.25 msec */ +#define DISCOV_LE_SCAN_INT_SLOW2 0x1000 /* 2.56 sec */ +#define DISCOV_LE_SCAN_WIN_SLOW2 0x0024 /* 22.5 msec */ +#define DISCOV_CODED_SCAN_INT_FAST 0x0120 /* 180 msec */ +#define DISCOV_CODED_SCAN_WIN_FAST 0x0090 /* 90 msec */ +#define DISCOV_CODED_SCAN_INT_SLOW1 0x1800 /* 3.84 sec */ +#define DISCOV_CODED_SCAN_WIN_SLOW1 0x0036 /* 33.75 msec */ +#define DISCOV_CODED_SCAN_INT_SLOW2 0x3000 /* 7.68 sec */ +#define DISCOV_CODED_SCAN_WIN_SLOW2 0x006c /* 67.5 msec */ #define DISCOV_LE_TIMEOUT 10240 /* msec */ #define DISCOV_INTERLEAVED_TIMEOUT 5120 /* msec */ #define DISCOV_INTERLEAVED_INQUIRY_LEN 0x04 @@ -2190,8 +2428,8 @@ void mgmt_device_disconnected(struct hci_dev *hdev, bdaddr_t *bdaddr, bool mgmt_connected); void mgmt_disconnect_failed(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 status); -void mgmt_connect_failed(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, - u8 addr_type, u8 status); +void mgmt_connect_failed(struct hci_dev *hdev, struct hci_conn *conn, + u8 status); void mgmt_pin_code_request(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 secure); void mgmt_pin_code_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 status); @@ -2218,8 +2456,6 @@ void mgmt_auth_enable_complete(struct hci_dev *hdev, u8 status); void mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class, u8 status); void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status); -void mgmt_start_discovery_complete(struct hci_dev *hdev, u8 status); -void mgmt_stop_discovery_complete(struct hci_dev *hdev, u8 status); void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 *dev_class, s8 rssi, u32 flags, u8 *eir, u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len, @@ -2245,7 +2481,6 @@ void mgmt_advertising_added(struct sock *sk, struct hci_dev *hdev, u8 instance); void mgmt_advertising_removed(struct sock *sk, struct hci_dev *hdev, u8 instance); -void mgmt_adv_monitor_removed(struct hci_dev *hdev, u16 handle); int mgmt_phy_configuration_changed(struct hci_dev *hdev, struct sock *skip); void mgmt_adv_monitor_device_lost(struct hci_dev *hdev, u16 handle, bdaddr_t *bdaddr, u8 addr_type); diff --git a/include/net/bluetooth/hci_drv.h b/include/net/bluetooth/hci_drv.h new file mode 100644 index 000000000000..3fd6fdbdb02e --- /dev/null +++ b/include/net/bluetooth/hci_drv.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2025 Google Corporation + */ + +#ifndef __HCI_DRV_H +#define __HCI_DRV_H + +#include <linux/types.h> + +#include <net/bluetooth/bluetooth.h> +#include <net/bluetooth/hci.h> + +struct hci_drv_cmd_hdr { + __le16 opcode; + __le16 len; +} __packed; + +struct hci_drv_ev_hdr { + __le16 opcode; + __le16 len; +} __packed; + +#define HCI_DRV_EV_CMD_STATUS 0x0000 +struct hci_drv_ev_cmd_status { + __le16 opcode; + __u8 status; +} __packed; + +#define HCI_DRV_EV_CMD_COMPLETE 0x0001 +struct hci_drv_ev_cmd_complete { + __le16 opcode; + __u8 status; + __u8 data[]; +} __packed; + +#define HCI_DRV_STATUS_SUCCESS 0x00 +#define HCI_DRV_STATUS_UNSPECIFIED_ERROR 0x01 +#define HCI_DRV_STATUS_UNKNOWN_COMMAND 0x02 +#define HCI_DRV_STATUS_INVALID_PARAMETERS 0x03 + +#define HCI_DRV_MAX_DRIVER_NAME_LENGTH 32 + +/* Common commands that make sense on all drivers start from 0x0000 */ +#define HCI_DRV_OP_READ_INFO 0x0000 +#define HCI_DRV_READ_INFO_SIZE 0 +struct hci_drv_rp_read_info { + __u8 driver_name[HCI_DRV_MAX_DRIVER_NAME_LENGTH]; + __le16 num_supported_commands; + __le16 supported_commands[] __counted_by_le(num_supported_commands); +} __packed; + +/* Driver specific OGF (Opcode Group Field) + * Commands in this group may have different meanings across different drivers. + */ +#define HCI_DRV_OGF_DRIVER_SPECIFIC 0x01 + +int hci_drv_cmd_status(struct hci_dev *hdev, u16 cmd, u8 status); +int hci_drv_cmd_complete(struct hci_dev *hdev, u16 cmd, u8 status, void *rp, + size_t rp_len); +int hci_drv_process_cmd(struct hci_dev *hdev, struct sk_buff *cmd_skb); + +struct hci_drv_handler { + int (*func)(struct hci_dev *hdev, void *data, u16 data_len); + size_t data_len; +}; + +struct hci_drv { + size_t common_handler_count; + const struct hci_drv_handler *common_handlers; + + size_t specific_handler_count; + const struct hci_drv_handler *specific_handlers; +}; + +#endif /* __HCI_DRV_H */ diff --git a/include/net/bluetooth/hci_mon.h b/include/net/bluetooth/hci_mon.h index 2d5fcda1bcd0..bbd752494ef9 100644 --- a/include/net/bluetooth/hci_mon.h +++ b/include/net/bluetooth/hci_mon.h @@ -51,12 +51,14 @@ struct hci_mon_hdr { #define HCI_MON_CTRL_EVENT 17 #define HCI_MON_ISO_TX_PKT 18 #define HCI_MON_ISO_RX_PKT 19 +#define HCI_MON_DRV_TX_PKT 20 +#define HCI_MON_DRV_RX_PKT 21 struct hci_mon_new_index { __u8 type; __u8 bus; bdaddr_t bdaddr; - char name[8]; + char name[8] __nonstring; } __packed; #define HCI_MON_NEW_INDEX_SIZE 16 diff --git a/include/net/bluetooth/hci_sock.h b/include/net/bluetooth/hci_sock.h index 9949870f7d78..13e8cd4414a1 100644 --- a/include/net/bluetooth/hci_sock.h +++ b/include/net/bluetooth/hci_sock.h @@ -144,7 +144,7 @@ struct hci_dev_req { struct hci_dev_list_req { __u16 dev_num; - struct hci_dev_req dev_req[]; /* hci_dev_req structures */ + struct hci_dev_req dev_req[] __counted_by(dev_num); }; struct hci_conn_list_req { diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h index 57eeb07aeb25..56076bbc981d 100644 --- a/include/net/bluetooth/hci_sync.h +++ b/include/net/bluetooth/hci_sync.h @@ -8,6 +8,23 @@ #define UINT_PTR(_handle) ((void *)((uintptr_t)_handle)) #define PTR_UINT(_ptr) ((uintptr_t)((void *)_ptr)) +#define HCI_REQ_DONE 0 +#define HCI_REQ_PEND 1 +#define HCI_REQ_CANCELED 2 + +#define hci_req_sync_lock(hdev) mutex_lock(&hdev->req_lock) +#define hci_req_sync_unlock(hdev) mutex_unlock(&hdev->req_lock) + +struct hci_request { + struct hci_dev *hdev; + struct sk_buff_head cmd_q; + + /* If something goes wrong when building the HCI request, the error + * value is stored in this field. + */ + int err; +}; + typedef int (*hci_cmd_sync_work_func_t)(struct hci_dev *hdev, void *data); typedef void (*hci_cmd_sync_work_destroy_t)(struct hci_dev *hdev, void *data, int err); @@ -20,6 +37,10 @@ struct hci_cmd_sync_work_entry { }; struct adv_info; + +struct sk_buff *hci_cmd_sync_alloc(struct hci_dev *hdev, u16 opcode, u32 plen, + const void *param, struct sock *sk); + /* Function with sync suffix shall not be called with hdev->lock held as they * wait the command to complete and in the meantime an event could be received * which could attempt to acquire hdev->lock causing a deadlock. @@ -38,23 +59,41 @@ int __hci_cmd_sync_status(struct hci_dev *hdev, u16 opcode, u32 plen, int __hci_cmd_sync_status_sk(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, u8 event, u32 timeout, struct sock *sk); +int hci_cmd_sync_status(struct hci_dev *hdev, u16 opcode, u32 plen, + const void *param, u32 timeout); void hci_cmd_sync_init(struct hci_dev *hdev); void hci_cmd_sync_clear(struct hci_dev *hdev); void hci_cmd_sync_cancel(struct hci_dev *hdev, int err); -void __hci_cmd_sync_cancel(struct hci_dev *hdev, int err); +void hci_cmd_sync_cancel_sync(struct hci_dev *hdev, int err); int hci_cmd_sync_submit(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy); int hci_cmd_sync_queue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy); +int hci_cmd_sync_queue_once(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, + void *data, hci_cmd_sync_work_destroy_t destroy); +int hci_cmd_sync_run(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, + void *data, hci_cmd_sync_work_destroy_t destroy); +int hci_cmd_sync_run_once(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, + void *data, hci_cmd_sync_work_destroy_t destroy); +struct hci_cmd_sync_work_entry * +hci_cmd_sync_lookup_entry(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, + void *data, hci_cmd_sync_work_destroy_t destroy); +void hci_cmd_sync_cancel_entry(struct hci_dev *hdev, + struct hci_cmd_sync_work_entry *entry); +bool hci_cmd_sync_dequeue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, + void *data, hci_cmd_sync_work_destroy_t destroy); +bool hci_cmd_sync_dequeue_once(struct hci_dev *hdev, + hci_cmd_sync_work_func_t func, void *data, + hci_cmd_sync_work_destroy_t destroy); int hci_update_eir_sync(struct hci_dev *hdev); int hci_update_class_sync(struct hci_dev *hdev); int hci_update_eir_sync(struct hci_dev *hdev); int hci_update_class_sync(struct hci_dev *hdev); -int hci_update_name_sync(struct hci_dev *hdev); +int hci_update_name_sync(struct hci_dev *hdev, const u8 *name); int hci_write_ssp_mode_sync(struct hci_dev *hdev, u8 mode); int hci_get_random_address(struct hci_dev *hdev, bool require_privacy, @@ -76,10 +115,12 @@ int hci_enable_ext_advertising_sync(struct hci_dev *hdev, u8 instance); int hci_enable_advertising_sync(struct hci_dev *hdev); int hci_enable_advertising(struct hci_dev *hdev); -int hci_start_per_adv_sync(struct hci_dev *hdev, u8 instance, u8 data_len, - u8 *data, u32 flags, u16 min_interval, +int hci_start_per_adv_sync(struct hci_dev *hdev, u8 instance, u8 sid, + u8 data_len, u8 *data, u32 flags, u16 min_interval, u16 max_interval, u16 sync_interval); +int hci_disable_per_advertising_sync(struct hci_dev *hdev, u8 instance); + int hci_remove_advertising_sync(struct hci_dev *hdev, struct sock *sk, u8 instance, bool force); int hci_disable_advertising_sync(struct hci_dev *hdev); @@ -99,7 +140,6 @@ int hci_update_scan(struct hci_dev *hdev); int hci_write_le_host_supported_sync(struct hci_dev *hdev, u8 le, u8 simul); int hci_remove_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance, struct sock *sk); -int hci_remove_ext_adv_instance(struct hci_dev *hdev, u8 instance); struct sk_buff *hci_read_local_oob_data_sync(struct hci_dev *hdev, bool ext, struct sock *sk); @@ -115,6 +155,8 @@ int hci_update_discoverable(struct hci_dev *hdev); int hci_update_connectable_sync(struct hci_dev *hdev); +int hci_inquiry_sync(struct hci_dev *hdev, u8 length, u8 num_rsp); + int hci_start_discovery_sync(struct hci_dev *hdev); int hci_stop_discovery_sync(struct hci_dev *hdev); @@ -122,11 +164,10 @@ int hci_suspend_sync(struct hci_dev *hdev); int hci_resume_sync(struct hci_dev *hdev); struct hci_conn; +struct hci_conn_params; int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason); -int hci_le_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn); - int hci_le_create_cis_sync(struct hci_dev *hdev); int hci_le_remove_cig_sync(struct hci_dev *hdev, u8 handle); @@ -136,3 +177,17 @@ int hci_le_terminate_big_sync(struct hci_dev *hdev, u8 handle, u8 reason); int hci_le_big_terminate_sync(struct hci_dev *hdev, u8 handle); int hci_le_pa_terminate_sync(struct hci_dev *hdev, u16 handle); + +int hci_connect_acl_sync(struct hci_dev *hdev, struct hci_conn *conn); + +int hci_connect_le_sync(struct hci_dev *hdev, struct hci_conn *conn); + +int hci_cancel_connect_sync(struct hci_dev *hdev, struct hci_conn *conn); +int hci_le_conn_update_sync(struct hci_dev *hdev, struct hci_conn *conn, + struct hci_conn_params *params); + +int hci_connect_pa_sync(struct hci_dev *hdev, struct hci_conn *conn); +int hci_connect_big_sync(struct hci_dev *hdev, struct hci_conn *conn); +int hci_past_sync(struct hci_conn *conn, struct hci_conn *le); + +int hci_le_read_remote_features(struct hci_conn *conn); diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index cf393e72d6ed..00e182a22720 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -27,7 +27,7 @@ #ifndef __L2CAP_H #define __L2CAP_H -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <linux/atomic.h> /* L2CAP defaults */ @@ -59,8 +59,6 @@ #define L2CAP_WAIT_ACK_POLL_PERIOD msecs_to_jiffies(200) #define L2CAP_WAIT_ACK_TIMEOUT msecs_to_jiffies(10000) -#define L2CAP_A2MP_DEFAULT_MTU 670 - /* L2CAP socket address */ struct sockaddr_l2 { sa_family_t l2_family; @@ -109,12 +107,6 @@ struct l2cap_conninfo { #define L2CAP_ECHO_RSP 0x09 #define L2CAP_INFO_REQ 0x0a #define L2CAP_INFO_RSP 0x0b -#define L2CAP_CREATE_CHAN_REQ 0x0c -#define L2CAP_CREATE_CHAN_RSP 0x0d -#define L2CAP_MOVE_CHAN_REQ 0x0e -#define L2CAP_MOVE_CHAN_RSP 0x0f -#define L2CAP_MOVE_CHAN_CFM 0x10 -#define L2CAP_MOVE_CHAN_CFM_RSP 0x11 #define L2CAP_CONN_PARAM_UPDATE_REQ 0x12 #define L2CAP_CONN_PARAM_UPDATE_RSP 0x13 #define L2CAP_LE_CONN_REQ 0x14 @@ -144,7 +136,6 @@ struct l2cap_conninfo { /* L2CAP fixed channels */ #define L2CAP_FC_SIG_BREDR 0x02 #define L2CAP_FC_CONNLESS 0x04 -#define L2CAP_FC_A2MP 0x08 #define L2CAP_FC_ATT 0x10 #define L2CAP_FC_SIG_LE 0x20 #define L2CAP_FC_SMP_LE 0x40 @@ -267,7 +258,6 @@ struct l2cap_conn_rsp { /* channel identifier */ #define L2CAP_CID_SIGNALING 0x0001 #define L2CAP_CID_CONN_LESS 0x0002 -#define L2CAP_CID_A2MP 0x0003 #define L2CAP_CID_ATT 0x0004 #define L2CAP_CID_LE_SIGNALING 0x0005 #define L2CAP_CID_SMP 0x0006 @@ -282,7 +272,6 @@ struct l2cap_conn_rsp { #define L2CAP_CR_BAD_PSM 0x0002 #define L2CAP_CR_SEC_BLOCK 0x0003 #define L2CAP_CR_NO_MEM 0x0004 -#define L2CAP_CR_BAD_AMP 0x0005 #define L2CAP_CR_INVALID_SCID 0x0006 #define L2CAP_CR_SCID_IN_USE 0x0007 @@ -404,29 +393,6 @@ struct l2cap_info_rsp { __u8 data[]; } __packed; -struct l2cap_create_chan_req { - __le16 psm; - __le16 scid; - __u8 amp_id; -} __packed; - -struct l2cap_create_chan_rsp { - __le16 dcid; - __le16 scid; - __le16 result; - __le16 status; -} __packed; - -struct l2cap_move_chan_req { - __le16 icid; - __u8 dest_amp_id; -} __packed; - -struct l2cap_move_chan_rsp { - __le16 icid; - __le16 result; -} __packed; - #define L2CAP_MR_SUCCESS 0x0000 #define L2CAP_MR_PEND 0x0001 #define L2CAP_MR_BAD_ID 0x0002 @@ -497,18 +463,24 @@ struct l2cap_le_credits { #define L2CAP_ECRED_MAX_CID 5 struct l2cap_ecred_conn_req { - __le16 psm; - __le16 mtu; - __le16 mps; - __le16 credits; + /* New members must be added within the struct_group() macro below. */ + __struct_group(l2cap_ecred_conn_req_hdr, hdr, __packed, + __le16 psm; + __le16 mtu; + __le16 mps; + __le16 credits; + ); __le16 scid[]; } __packed; struct l2cap_ecred_conn_rsp { - __le16 mtu; - __le16 mps; - __le16 credits; - __le16 result; + /* New members must be added within the struct_group() macro below. */ + struct_group_tagged(l2cap_ecred_conn_rsp_hdr, hdr, + __le16 mtu; + __le16 mps; + __le16 credits; + __le16 result; + ); __le16 dcid[]; }; @@ -539,8 +511,6 @@ struct l2cap_seq_list { struct l2cap_chan { struct l2cap_conn *conn; - struct hci_conn *hs_hcon; - struct hci_chan *hs_hchan; struct kref kref; atomic_t nesting; @@ -584,6 +554,9 @@ struct l2cap_chan { __u16 tx_credits; __u16 rx_credits; + /* estimated available receive buffer space or -1 if unknown */ + ssize_t rx_avail; + __u8 tx_state; __u8 rx_state; @@ -591,12 +564,6 @@ struct l2cap_chan { unsigned long conn_state; unsigned long flags; - __u8 remote_amp_id; - __u8 local_amp_id; - __u8 move_id; - __u8 move_state; - __u8 move_role; - __u16 next_tx_seq; __u16 expected_ack_seq; __u16 expected_tx_seq; @@ -701,7 +668,7 @@ struct l2cap_conn { struct l2cap_chan *smp; struct list_head chan_l; - struct mutex chan_lock; + struct mutex lock; struct kref ref; struct list_head users; }; @@ -724,10 +691,15 @@ struct l2cap_user { /* ----- L2CAP socket info ----- */ #define l2cap_pi(sk) ((struct l2cap_pinfo *) sk) +struct l2cap_rx_busy { + struct list_head list; + struct sk_buff *skb; +}; + struct l2cap_pinfo { struct bt_sock bt; struct l2cap_chan *chan; - struct sk_buff *rx_busy_skb; + struct list_head rx_busy; }; enum { @@ -981,10 +953,12 @@ int l2cap_add_scid(struct l2cap_chan *chan, __u16 scid); struct l2cap_chan *l2cap_chan_create(void); void l2cap_chan_close(struct l2cap_chan *chan, int reason); int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, - bdaddr_t *dst, u8 dst_type); + bdaddr_t *dst, u8 dst_type, u16 timeout); int l2cap_chan_reconfigure(struct l2cap_chan *chan, __u16 mtu); -int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len); +int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len, + const struct sockcm_cookie *sockc); void l2cap_chan_busy(struct l2cap_chan *chan, int busy); +void l2cap_chan_rx_avail(struct l2cap_chan *chan, ssize_t rx_avail); int l2cap_chan_check_security(struct l2cap_chan *chan, bool initiator); void l2cap_chan_set_defaults(struct l2cap_chan *chan); int l2cap_ertm_init(struct l2cap_chan *chan); @@ -995,12 +969,9 @@ void l2cap_chan_list(struct l2cap_conn *conn, l2cap_chan_func_t func, void *data); void l2cap_chan_del(struct l2cap_chan *chan, int err); void l2cap_send_conn_req(struct l2cap_chan *chan); -void l2cap_move_start(struct l2cap_chan *chan); -void l2cap_logical_cfm(struct l2cap_chan *chan, struct hci_chan *hchan, - u8 status); -void __l2cap_physical_cfm(struct l2cap_chan *chan, int result); struct l2cap_conn *l2cap_conn_get(struct l2cap_conn *conn); +struct l2cap_conn *l2cap_conn_hold_unless_zero(struct l2cap_conn *conn); void l2cap_conn_put(struct l2cap_conn *conn); int l2cap_register_user(struct l2cap_conn *conn, struct l2cap_user *user); diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index d382679efd2b..8234915854b6 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -53,10 +53,15 @@ struct mgmt_hdr { } __packed; struct mgmt_tlv { - __le16 type; - __u8 length; + /* New members MUST be added within the __struct_group() macro below. */ + __struct_group(mgmt_tlv_hdr, __hdr, __packed, + __le16 type; + __u8 length; + ); __u8 value[]; } __packed; +static_assert(offsetof(struct mgmt_tlv, value) == sizeof(struct mgmt_tlv_hdr), + "struct member likely outside of __struct_group()"); struct mgmt_addr_info { bdaddr_t bdaddr; @@ -113,6 +118,9 @@ struct mgmt_rp_read_index_list { #define MGMT_SETTING_CIS_PERIPHERAL BIT(19) #define MGMT_SETTING_ISO_BROADCASTER BIT(20) #define MGMT_SETTING_ISO_SYNC_RECEIVER BIT(21) +#define MGMT_SETTING_LL_PRIVACY BIT(22) +#define MGMT_SETTING_PAST_SENDER BIT(23) +#define MGMT_SETTING_PAST_RECEIVER BIT(24) #define MGMT_OP_READ_INFO 0x0004 #define MGMT_READ_INFO_SIZE 0 @@ -774,7 +782,7 @@ struct mgmt_adv_pattern { __u8 ad_type; __u8 offset; __u8 length; - __u8 value[31]; + __u8 value[HCI_MAX_AD_LENGTH]; } __packed; #define MGMT_OP_ADD_ADV_PATTERNS_MONITOR 0x0052 @@ -847,7 +855,7 @@ struct mgmt_cp_set_mesh { __le16 window; __le16 period; __u8 num_ad_types; - __u8 ad_types[]; + __u8 ad_types[] __counted_by(num_ad_types); } __packed; #define MGMT_SET_MESH_RECEIVER_SIZE 6 @@ -878,6 +886,16 @@ struct mgmt_cp_mesh_send_cancel { } __packed; #define MGMT_MESH_SEND_CANCEL_SIZE 1 +#define MGMT_OP_HCI_CMD_SYNC 0x005B +struct mgmt_cp_hci_cmd_sync { + __le16 opcode; + __u8 event; + __u8 timeout; + __le16 params_len; + __u8 params[]; +} __packed; +#define MGMT_HCI_CMD_SYNC_SIZE 6 + #define MGMT_EV_CMD_COMPLETE 0x0001 struct mgmt_ev_cmd_complete { __le16 opcode; diff --git a/include/net/bluetooth/rfcomm.h b/include/net/bluetooth/rfcomm.h index 99d26879b02a..c05882476900 100644 --- a/include/net/bluetooth/rfcomm.h +++ b/include/net/bluetooth/rfcomm.h @@ -355,7 +355,7 @@ struct rfcomm_dev_info { struct rfcomm_dev_list_req { u16 dev_num; - struct rfcomm_dev_info dev_info[]; + struct rfcomm_dev_info dev_info[] __counted_by(dev_num); }; int rfcomm_dev_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h index c5e57c6bd873..c92d4a976246 100644 --- a/include/net/bond_3ad.h +++ b/include/net/bond_3ad.h @@ -26,6 +26,7 @@ enum { BOND_AD_STABLE = 0, BOND_AD_BANDWIDTH = 1, BOND_AD_COUNT = 2, + BOND_AD_PRIO = 3, }; /* rx machine states(43.4.11 in the 802.3ad standard) */ @@ -54,6 +55,8 @@ typedef enum { AD_MUX_DETACHED, /* mux machine */ AD_MUX_WAITING, /* mux machine */ AD_MUX_ATTACHED, /* mux machine */ + AD_MUX_COLLECTING, /* mux machine */ + AD_MUX_DISTRIBUTING, /* mux machine */ AD_MUX_COLLECTING_DISTRIBUTING /* mux machine */ } mux_states_t; @@ -229,7 +232,10 @@ typedef struct port { mux_states_t sm_mux_state; /* state machine mux state */ u16 sm_mux_timer_counter; /* state machine mux timer counter */ tx_states_t sm_tx_state; /* state machine tx state */ - u16 sm_tx_timer_counter; /* state machine tx timer counter(allways on - enter to transmit state 3 time per second) */ + u16 sm_tx_timer_counter; /* state machine tx timer counter + * (always on - enter to transmit + * state 3 time per second) + */ u16 sm_churn_actor_timer_counter; u16 sm_churn_partner_timer_counter; u32 churn_actor_count; @@ -269,6 +275,7 @@ struct ad_slave_info { struct port port; /* 802.3ad port structure */ struct bond_3ad_stats stats; u16 id; + u16 port_priority; }; static inline const char *bond_3ad_churn_desc(churn_state_t state) @@ -302,6 +309,7 @@ int bond_3ad_lacpdu_recv(const struct sk_buff *skb, struct bonding *bond, struct slave *slave); int bond_3ad_set_carrier(struct bonding *bond); void bond_3ad_update_lacp_rate(struct bonding *bond); +void bond_3ad_update_lacp_active(struct bonding *bond); void bond_3ad_update_ad_actor_settings(struct bonding *bond); int bond_3ad_stats_fill(struct sk_buff *skb, struct bond_3ad_stats *stats); size_t bond_3ad_stats_size(void); diff --git a/include/net/bond_alb.h b/include/net/bond_alb.h index 9dc082b2d543..e5945427f38d 100644 --- a/include/net/bond_alb.h +++ b/include/net/bond_alb.h @@ -53,7 +53,7 @@ struct slave; struct tlb_client_info { - struct slave *tx_slave; /* A pointer to slave used for transmiting + struct slave *tx_slave; /* A pointer to slave used for transmitting * packets to a Client that the Hash function * gave this entry index. */ diff --git a/include/net/bond_options.h b/include/net/bond_options.h index 69292ecc0325..e6eedf23aea1 100644 --- a/include/net/bond_options.h +++ b/include/net/bond_options.h @@ -76,6 +76,9 @@ enum { BOND_OPT_MISSED_MAX, BOND_OPT_NS_TARGETS, BOND_OPT_PRIO, + BOND_OPT_COUPLED_CONTROL, + BOND_OPT_BROADCAST_NEIGH, + BOND_OPT_ACTOR_PORT_PRIO, BOND_OPT_LAST }; @@ -160,5 +163,7 @@ void bond_option_arp_ip_targets_clear(struct bonding *bond); #if IS_ENABLED(CONFIG_IPV6) void bond_option_ns_ip6_targets_clear(struct bonding *bond); #endif +void bond_slave_ns_maddrs_add(struct bonding *bond, struct slave *slave); +void bond_slave_ns_maddrs_del(struct bonding *bond, struct slave *slave); #endif /* _NET_BOND_OPTIONS_H */ diff --git a/include/net/bonding.h b/include/net/bonding.h index 5b8b1b644a2d..49edc7da0586 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -115,6 +115,8 @@ static inline int is_netpoll_tx_blocked(struct net_device *dev) #define is_netpoll_tx_blocked(dev) (0) #endif +DECLARE_STATIC_KEY_FALSE(bond_bcast_neigh_enabled); + struct bond_params { int mode; int xmit_policy; @@ -124,7 +126,6 @@ struct bond_params { int arp_interval; int arp_validate; int arp_all_targets; - int use_carrier; int fail_over_mac; int updelay; int downdelay; @@ -148,6 +149,8 @@ struct bond_params { #if IS_ENABLED(CONFIG_IPV6) struct in6_addr ns_targets[BOND_MAX_NS_TARGETS]; #endif + int coupled_control; + int broadcast_neighbor; /* 2 bytes of padding : see ether_addr_equal_64bits() */ u8 ad_actor_system[ETH_ALEN + 2]; @@ -167,6 +170,7 @@ struct slave { u8 backup:1, /* indicates backup slave. Value corresponds with BOND_STATE_ACTIVE and BOND_STATE_BACKUP */ inactive:1, /* indicates inactive slave */ + rx_disabled:1, /* indicates whether slave's Rx is disabled */ should_notify:1, /* indicates whether the state changed */ should_notify_link:1; /* indicates whether the link changed */ u8 duplex; @@ -258,7 +262,7 @@ struct bonding { #ifdef CONFIG_XFRM_OFFLOAD struct list_head ipsec_list; /* protecting ipsec_list */ - spinlock_t ipsec_lock; + struct mutex ipsec_lock; #endif /* CONFIG_XFRM_OFFLOAD */ struct bpf_prog *xdp_prog; }; @@ -568,6 +572,14 @@ static inline void bond_set_slave_inactive_flags(struct slave *slave, bond_set_slave_state(slave, BOND_STATE_BACKUP, notify); if (!slave->bond->params.all_slaves_active) slave->inactive = 1; + if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) + slave->rx_disabled = 1; +} + +static inline void bond_set_slave_tx_disabled_flags(struct slave *slave, + bool notify) +{ + bond_set_slave_state(slave, BOND_STATE_BACKUP, notify); } static inline void bond_set_slave_active_flags(struct slave *slave, @@ -575,6 +587,14 @@ static inline void bond_set_slave_active_flags(struct slave *slave, { bond_set_slave_state(slave, BOND_STATE_ACTIVE, notify); slave->inactive = 0; + if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) + slave->rx_disabled = 0; +} + +static inline void bond_set_slave_rx_enabled_flags(struct slave *slave, + bool notify) +{ + slave->rx_disabled = 0; } static inline bool bond_is_slave_inactive(struct slave *slave) @@ -582,6 +602,11 @@ static inline bool bond_is_slave_inactive(struct slave *slave) return slave->inactive; } +static inline bool bond_is_slave_rx_disabled(struct slave *slave) +{ + return slave->rx_disabled; +} + static inline void bond_propose_link_state(struct slave *slave, int state) { slave->link_new_state = state; @@ -672,6 +697,7 @@ void bond_debug_register(struct bonding *bond); void bond_debug_unregister(struct bonding *bond); void bond_debug_reregister(struct bonding *bond); const char *bond_mode_name(int mode); +bool bond_xdp_check(struct bonding *bond, int mode); void bond_setup(struct net_device *bond_dev); unsigned int bond_get_num_tx_queues(void); int bond_netlink_init(void); @@ -684,6 +710,7 @@ struct bond_vlan_tag *bond_verify_device_path(struct net_device *start_dev, int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave); void bond_slave_arr_work_rearm(struct bonding *bond, unsigned long delay); void bond_work_init_all(struct bonding *bond); +void bond_work_cancel_all(struct bonding *bond); #ifdef CONFIG_PROC_FS void bond_create_proc_entry(struct bonding *bond); diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 4dabeb6c76d3..6e172d0f6ef5 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -24,6 +24,11 @@ */ #define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1)) +static inline bool napi_id_valid(unsigned int napi_id) +{ + return napi_id >= MIN_NAPI_ID; +} + #define BUSY_POLL_BUDGET 8 #ifdef CONFIG_NET_RX_BUSY_POLL @@ -48,6 +53,13 @@ void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, bool prefer_busy_poll, u16 budget); +void napi_busy_loop_rcu(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg, bool prefer_busy_poll, u16 budget); + +void napi_suspend_irqs(unsigned int napi_id); +void napi_resume_irqs(unsigned int napi_id); + #else /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long net_busy_loop_on(void) { @@ -64,7 +76,7 @@ static inline bool sk_can_busy_loop(struct sock *sk) static inline unsigned long busy_loop_current_time(void) { #ifdef CONFIG_NET_RX_BUSY_POLL - return (unsigned long)(local_clock() >> 10); + return (unsigned long)(ktime_get_ns() >> 10); #else return 0; #endif @@ -107,7 +119,7 @@ static inline void sk_busy_loop(struct sock *sk, int nonblock) #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int napi_id = READ_ONCE(sk->sk_napi_id); - if (napi_id >= MIN_NAPI_ID) + if (napi_id_valid(napi_id)) napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk, READ_ONCE(sk->sk_prefer_busy_poll), READ_ONCE(sk->sk_busy_poll_budget) ?: BUSY_POLL_BUDGET); @@ -115,19 +127,25 @@ static inline void sk_busy_loop(struct sock *sk, int nonblock) } /* used in the NIC receive handler to mark the skb */ -static inline void skb_mark_napi_id(struct sk_buff *skb, - struct napi_struct *napi) +static inline void __skb_mark_napi_id(struct sk_buff *skb, + const struct gro_node *gro) { #ifdef CONFIG_NET_RX_BUSY_POLL /* If the skb was already marked with a valid NAPI ID, avoid overwriting * it. */ - if (skb->napi_id < MIN_NAPI_ID) - skb->napi_id = napi->napi_id; + if (!napi_id_valid(skb->napi_id)) + skb->napi_id = gro->cached_napi_id; #endif } -/* used in the protocol hanlder to propagate the napi_id to the socket */ +static inline void skb_mark_napi_id(struct sk_buff *skb, + const struct napi_struct *napi) +{ + __skb_mark_napi_id(skb, &napi->gro); +} + +/* used in the protocol handler to propagate the napi_id to the socket */ static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_NET_RX_BUSY_POLL @@ -167,12 +185,4 @@ static inline void sk_mark_napi_id_once(struct sock *sk, #endif } -static inline void sk_mark_napi_id_once_xdp(struct sock *sk, - const struct xdp_buff *xdp) -{ -#ifdef CONFIG_NET_RX_BUSY_POLL - __sk_mark_napi_id_once(sk, xdp->rxq->napi_id); -#endif -} - #endif /* _LINUX_NET_BUSY_POLL_H */ diff --git a/include/net/caif/caif_layer.h b/include/net/caif/caif_layer.h index 51f7bb42a936..053e7c6a6a66 100644 --- a/include/net/caif/caif_layer.h +++ b/include/net/caif/caif_layer.h @@ -11,9 +11,7 @@ struct cflayer; struct cfpkt; -struct cfpktq; struct caif_payload_info; -struct caif_packet_funcs; #define CAIF_LAYER_NAME_SZ 16 @@ -22,7 +20,7 @@ struct caif_packet_funcs; * @assert: expression to evaluate. * * This function will print a error message and a do WARN_ON if the - * assertion failes. Normally this will do a stack up at the current location. + * assertion fails. Normally this will do a stack up at the current location. */ #define caif_assert(assert) \ do { \ @@ -118,7 +116,7 @@ enum caif_direction { * @dn: Pointer down to the layer below. * @node: List node used when layer participate in a list. * @receive: Packet receive function. - * @transmit: Packet transmit funciton. + * @transmit: Packet transmit function. * @ctrlcmd: Used for control signalling upwards in the stack. * @modemcmd: Used for control signaling downwards in the stack. * @id: The identity of this layer diff --git a/include/net/caif/cfpkt.h b/include/net/caif/cfpkt.h index 44d914a50369..acf664227d96 100644 --- a/include/net/caif/cfpkt.h +++ b/include/net/caif/cfpkt.h @@ -18,7 +18,7 @@ struct cfpkt *cfpkt_create(u16 len); /* * Destroy a CAIF Packet. - * pkt Packet to be destoyed. + * pkt Packet to be destroyed. */ void cfpkt_destroy(struct cfpkt *pkt); diff --git a/include/net/caif/cfsrvl.h b/include/net/caif/cfsrvl.h index 5ee7b322e18b..a000dc45f966 100644 --- a/include/net/caif/cfsrvl.h +++ b/include/net/caif/cfsrvl.h @@ -40,7 +40,6 @@ void cfsrvl_init(struct cfsrvl *service, struct dev_info *dev_info, bool supports_flowctrl); bool cfsrvl_ready(struct cfsrvl *service, int *err); -u8 cfsrvl_getphyid(struct cflayer *layer); static inline void cfsrvl_get(struct cflayer *layr) { diff --git a/include/net/calipso.h b/include/net/calipso.h index f8667a3fda9e..76b9e08c10c2 100644 --- a/include/net/calipso.h +++ b/include/net/calipso.h @@ -25,7 +25,7 @@ #include <net/netlabel.h> #include <net/request_sock.h> #include <linux/refcount.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> /* known doi values */ #define CALIPSO_DOI_UNKNOWN 0x00000000 diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 3a4b684f89bf..899f267b7cf9 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -7,7 +7,7 @@ * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2021, 2023 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation */ #include <linux/ethtool.h> @@ -52,7 +52,7 @@ * such wiphy can have zero, one, or many virtual interfaces associated with * it, which need to be identified as such by pointing the network interface's * @ieee80211_ptr pointer to a &struct wireless_dev which further describes - * the wireless part of the interface, normally this struct is embedded in the + * the wireless part of the interface. Normally this struct is embedded in the * network interface's private data area. Drivers can optionally allow creating * or destroying virtual interfaces on the fly, but without at least one or the * ability to create some the wireless device isn't useful. @@ -76,6 +76,8 @@ struct wiphy; * @IEEE80211_CHAN_DISABLED: This channel is disabled. * @IEEE80211_CHAN_NO_IR: do not initiate radiation, this includes * sending probe requests or beaconing. + * @IEEE80211_CHAN_PSD: Power spectral density (in dBm) is set for this + * channel. * @IEEE80211_CHAN_RADAR: Radar detection is required on this channel. * @IEEE80211_CHAN_NO_HT40PLUS: extension channel above this channel * is not permitted. @@ -99,45 +101,60 @@ struct wiphy; * @IEEE80211_CHAN_NO_10MHZ: 10 MHz bandwidth is not permitted * on this channel. * @IEEE80211_CHAN_NO_HE: HE operation is not permitted on this channel. - * @IEEE80211_CHAN_1MHZ: 1 MHz bandwidth is permitted - * on this channel. - * @IEEE80211_CHAN_2MHZ: 2 MHz bandwidth is permitted - * on this channel. - * @IEEE80211_CHAN_4MHZ: 4 MHz bandwidth is permitted - * on this channel. - * @IEEE80211_CHAN_8MHZ: 8 MHz bandwidth is permitted - * on this channel. - * @IEEE80211_CHAN_16MHZ: 16 MHz bandwidth is permitted - * on this channel. * @IEEE80211_CHAN_NO_320MHZ: If the driver supports 320 MHz on the band, * this flag indicates that a 320 MHz channel cannot use this * channel as the control or any of the secondary channels. * This may be due to the driver or due to regulatory bandwidth * restrictions. * @IEEE80211_CHAN_NO_EHT: EHT operation is not permitted on this channel. + * @IEEE80211_CHAN_DFS_CONCURRENT: See %NL80211_RRF_DFS_CONCURRENT + * @IEEE80211_CHAN_NO_6GHZ_VLP_CLIENT: Client connection with VLP AP + * not permitted using this channel + * @IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT: Client connection with AFC AP + * not permitted using this channel + * @IEEE80211_CHAN_CAN_MONITOR: This channel can be used for monitor + * mode even in the presence of other (regulatory) restrictions, + * even if it is otherwise disabled. + * @IEEE80211_CHAN_ALLOW_6GHZ_VLP_AP: Allow using this channel for AP operation + * with very low power (VLP), even if otherwise set to NO_IR. + * @IEEE80211_CHAN_ALLOW_20MHZ_ACTIVITY: Allow activity on a 20 MHz channel, + * even if otherwise set to NO_IR. + * @IEEE80211_CHAN_S1G_NO_PRIMARY: Prevents the channel for use as an S1G + * primary channel. Does not prevent the wider operating channel + * described by the chandef from being used. In order for a 2MHz primary + * to be used, both 1MHz subchannels shall not contain this flag. + * @IEEE80211_CHAN_NO_4MHZ: 4 MHz bandwidth is not permitted on this channel. + * @IEEE80211_CHAN_NO_8MHZ: 8 MHz bandwidth is not permitted on this channel. + * @IEEE80211_CHAN_NO_16MHZ: 16 MHz bandwidth is not permitted on this channel. */ enum ieee80211_channel_flags { - IEEE80211_CHAN_DISABLED = 1<<0, - IEEE80211_CHAN_NO_IR = 1<<1, - /* hole at 1<<2 */ - IEEE80211_CHAN_RADAR = 1<<3, - IEEE80211_CHAN_NO_HT40PLUS = 1<<4, - IEEE80211_CHAN_NO_HT40MINUS = 1<<5, - IEEE80211_CHAN_NO_OFDM = 1<<6, - IEEE80211_CHAN_NO_80MHZ = 1<<7, - IEEE80211_CHAN_NO_160MHZ = 1<<8, - IEEE80211_CHAN_INDOOR_ONLY = 1<<9, - IEEE80211_CHAN_IR_CONCURRENT = 1<<10, - IEEE80211_CHAN_NO_20MHZ = 1<<11, - IEEE80211_CHAN_NO_10MHZ = 1<<12, - IEEE80211_CHAN_NO_HE = 1<<13, - IEEE80211_CHAN_1MHZ = 1<<14, - IEEE80211_CHAN_2MHZ = 1<<15, - IEEE80211_CHAN_4MHZ = 1<<16, - IEEE80211_CHAN_8MHZ = 1<<17, - IEEE80211_CHAN_16MHZ = 1<<18, - IEEE80211_CHAN_NO_320MHZ = 1<<19, - IEEE80211_CHAN_NO_EHT = 1<<20, + IEEE80211_CHAN_DISABLED = BIT(0), + IEEE80211_CHAN_NO_IR = BIT(1), + IEEE80211_CHAN_PSD = BIT(2), + IEEE80211_CHAN_RADAR = BIT(3), + IEEE80211_CHAN_NO_HT40PLUS = BIT(4), + IEEE80211_CHAN_NO_HT40MINUS = BIT(5), + IEEE80211_CHAN_NO_OFDM = BIT(6), + IEEE80211_CHAN_NO_80MHZ = BIT(7), + IEEE80211_CHAN_NO_160MHZ = BIT(8), + IEEE80211_CHAN_INDOOR_ONLY = BIT(9), + IEEE80211_CHAN_IR_CONCURRENT = BIT(10), + IEEE80211_CHAN_NO_20MHZ = BIT(11), + IEEE80211_CHAN_NO_10MHZ = BIT(12), + IEEE80211_CHAN_NO_HE = BIT(13), + /* can use free bits here */ + IEEE80211_CHAN_NO_320MHZ = BIT(19), + IEEE80211_CHAN_NO_EHT = BIT(20), + IEEE80211_CHAN_DFS_CONCURRENT = BIT(21), + IEEE80211_CHAN_NO_6GHZ_VLP_CLIENT = BIT(22), + IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT = BIT(23), + IEEE80211_CHAN_CAN_MONITOR = BIT(24), + IEEE80211_CHAN_ALLOW_6GHZ_VLP_AP = BIT(25), + IEEE80211_CHAN_ALLOW_20MHZ_ACTIVITY = BIT(26), + IEEE80211_CHAN_S1G_NO_PRIMARY = BIT(27), + IEEE80211_CHAN_NO_4MHZ = BIT(28), + IEEE80211_CHAN_NO_8MHZ = BIT(29), + IEEE80211_CHAN_NO_16MHZ = BIT(30), }; #define IEEE80211_CHAN_NO_HT40 \ @@ -171,6 +188,7 @@ enum ieee80211_channel_flags { * on this channel. * @dfs_state_entered: timestamp (jiffies) when the dfs state was entered. * @dfs_cac_ms: DFS CAC time in milliseconds, this is valid for DFS channels. + * @psd: power spectral density (in dBm) */ struct ieee80211_channel { enum nl80211_band band; @@ -187,6 +205,7 @@ struct ieee80211_channel { enum nl80211_dfs_state dfs_state; unsigned long dfs_state_entered; unsigned int dfs_cac_ms; + s8 psd; }; /** @@ -213,13 +232,13 @@ struct ieee80211_channel { * @IEEE80211_RATE_SUPPORTS_10MHZ: Rate can be used in 10 MHz mode */ enum ieee80211_rate_flags { - IEEE80211_RATE_SHORT_PREAMBLE = 1<<0, - IEEE80211_RATE_MANDATORY_A = 1<<1, - IEEE80211_RATE_MANDATORY_B = 1<<2, - IEEE80211_RATE_MANDATORY_G = 1<<3, - IEEE80211_RATE_ERP_G = 1<<4, - IEEE80211_RATE_SUPPORTS_5MHZ = 1<<5, - IEEE80211_RATE_SUPPORTS_10MHZ = 1<<6, + IEEE80211_RATE_SHORT_PREAMBLE = BIT(0), + IEEE80211_RATE_MANDATORY_A = BIT(1), + IEEE80211_RATE_MANDATORY_B = BIT(2), + IEEE80211_RATE_MANDATORY_G = BIT(3), + IEEE80211_RATE_ERP_G = BIT(4), + IEEE80211_RATE_SUPPORTS_5MHZ = BIT(5), + IEEE80211_RATE_SUPPORTS_10MHZ = BIT(6), }; /** @@ -410,6 +429,19 @@ struct ieee80211_sta_eht_cap { u8 eht_ppe_thres[IEEE80211_EHT_PPE_THRES_MAX_LEN]; }; +/* sparse defines __CHECKER__; see Documentation/dev-tools/sparse.rst */ +#ifdef __CHECKER__ +/* + * This is used to mark the sband->iftype_data pointer which is supposed + * to be an array with special access semantics (per iftype), but a lot + * of code got it wrong in the past, so with this marking sparse will be + * noisy when the pointer is used directly. + */ +# define __iftd __attribute__((noderef, address_space(__iftype_data))) +#else +# define __iftd +#endif /* __CHECKER__ */ + /** * struct ieee80211_sband_iftype_data - sband data per interface type * @@ -525,7 +557,7 @@ struct ieee80211_sta_s1g_cap { * @vht_cap: VHT capabilities in this band * @s1g_cap: S1G capabilities in this band * @edmg_cap: EDMG capabilities in this band - * @s1g_cap: S1G capabilities in this band (S1B band only, of course) + * @s1g_cap: S1G capabilities in this band (S1G band only, of course) * @n_iftype_data: number of iftype data entries * @iftype_data: interface type data entries. Note that the bits in * @types_mask inside this structure cannot overlap (i.e. only @@ -543,10 +575,48 @@ struct ieee80211_supported_band { struct ieee80211_sta_s1g_cap s1g_cap; struct ieee80211_edmg edmg_cap; u16 n_iftype_data; - const struct ieee80211_sband_iftype_data *iftype_data; + const struct ieee80211_sband_iftype_data __iftd *iftype_data; }; /** + * _ieee80211_set_sband_iftype_data - set sband iftype data array + * @sband: the sband to initialize + * @iftd: the iftype data array pointer + * @n_iftd: the length of the iftype data array + * + * Set the sband iftype data array; use this where the length cannot + * be derived from the ARRAY_SIZE() of the argument, but prefer + * ieee80211_set_sband_iftype_data() where it can be used. + */ +static inline void +_ieee80211_set_sband_iftype_data(struct ieee80211_supported_band *sband, + const struct ieee80211_sband_iftype_data *iftd, + u16 n_iftd) +{ + sband->iftype_data = (const void __iftd __force *)iftd; + sband->n_iftype_data = n_iftd; +} + +/** + * ieee80211_set_sband_iftype_data - set sband iftype data array + * @sband: the sband to initialize + * @iftd: the iftype data array + */ +#define ieee80211_set_sband_iftype_data(sband, iftd) \ + _ieee80211_set_sband_iftype_data(sband, iftd, ARRAY_SIZE(iftd)) + +/** + * for_each_sband_iftype_data - iterate sband iftype data entries + * @sband: the sband whose iftype_data array to iterate + * @i: iterator counter + * @iftd: iftype data pointer to set + */ +#define for_each_sband_iftype_data(sband, i, iftd) \ + for (i = 0, iftd = (const void __force *)&(sband)->iftype_data[i]; \ + i < (sband)->n_iftype_data; \ + i++, iftd = (const void __force *)&(sband)->iftype_data[i]) + +/** * ieee80211_get_sband_iftype_data - return sband data for a given iftype * @sband: the sband to search for the STA on * @iftype: enum nl80211_iftype @@ -557,18 +627,16 @@ static inline const struct ieee80211_sband_iftype_data * ieee80211_get_sband_iftype_data(const struct ieee80211_supported_band *sband, u8 iftype) { + const struct ieee80211_sband_iftype_data *data; int i; - if (WARN_ON(iftype >= NL80211_IFTYPE_MAX)) + if (WARN_ON(iftype >= NUM_NL80211_IFTYPES)) return NULL; if (iftype == NL80211_IFTYPE_AP_VLAN) iftype = NL80211_IFTYPE_AP; - for (i = 0; i < sband->n_iftype_data; i++) { - const struct ieee80211_sband_iftype_data *data = - &sband->iftype_data[i]; - + for_each_sband_iftype_data(sband, i, data) { if (data->types_mask & BIT(iftype)) return data; } @@ -617,7 +685,7 @@ ieee80211_get_he_6ghz_capa(const struct ieee80211_supported_band *sband, } /** - * ieee80211_get_eht_iftype_cap - return ETH capabilities for an sband's iftype + * ieee80211_get_eht_iftype_cap - return EHT capabilities for an sband's iftype * @sband: the sband to search for the iftype on * @iftype: enum nl80211_iftype * @@ -718,8 +786,7 @@ struct vif_params { * @key: key material * @key_len: length of key material * @cipher: cipher suite selector - * @seq: sequence counter (IV/PN) for TKIP and CCMP keys, only used - * with the get_key() callback, must be in little endian, + * @seq: sequence counter (IV/PN), must be in little endian, * length given by @seq_len. * @seq_len: length of @seq. * @vlan_id: vlan_id for VLAN group key (if nonzero) @@ -747,6 +814,12 @@ struct key_params { * chan will define the primary channel and all other * parameters are ignored. * @freq1_offset: offset from @center_freq1, in KHz + * @punctured: mask of the punctured 20 MHz subchannels, with + * bits turned on being disabled (punctured); numbered + * from lower to higher frequency (like in the spec) + * @s1g_primary_2mhz: Indicates if the control channel pointed to + * by 'chan' exists as a 1MHz primary subchannel within an + * S1G 2MHz primary channel. */ struct cfg80211_chan_def { struct ieee80211_channel *chan; @@ -755,6 +828,8 @@ struct cfg80211_chan_def { u32 center_freq2; struct ieee80211_edmg edmg; u16 freq1_offset; + u16 punctured; + bool s1g_primary_2mhz; }; /* @@ -766,9 +841,12 @@ struct cfg80211_bitrate_mask { u8 ht_mcs[IEEE80211_HT_MCS_MASK_LEN]; u16 vht_mcs[NL80211_VHT_NSS_MAX]; u16 he_mcs[NL80211_HE_NSS_MAX]; + u16 eht_mcs[NL80211_EHT_NSS_MAX]; enum nl80211_txrate_gi gi; enum nl80211_he_gi he_gi; + enum nl80211_eht_gi eht_gi; enum nl80211_he_ltf he_ltf; + enum nl80211_eht_ltf eht_ltf; } control[NUM_NL80211_BANDS]; }; @@ -895,7 +973,9 @@ cfg80211_chandef_identical(const struct cfg80211_chan_def *chandef1, chandef1->width == chandef2->width && chandef1->center_freq1 == chandef2->center_freq1 && chandef1->freq1_offset == chandef2->freq1_offset && - chandef1->center_freq2 == chandef2->center_freq2); + chandef1->center_freq2 == chandef2->center_freq2 && + chandef1->punctured == chandef2->punctured && + chandef1->s1g_primary_2mhz == chandef2->s1g_primary_2mhz); } /** @@ -912,6 +992,18 @@ cfg80211_chandef_is_edmg(const struct cfg80211_chan_def *chandef) } /** + * cfg80211_chandef_is_s1g - check if chandef represents an S1G channel + * @chandef: the channel definition + * + * Return: %true if S1G. + */ +static inline bool +cfg80211_chandef_is_s1g(const struct cfg80211_chan_def *chandef) +{ + return chandef->chan->band == NL80211_BAND_S1GHZ; +} + +/** * cfg80211_chandef_compatible - check if two channel definitions are compatible * @chandef1: first channel definition * @chandef2: second channel definition @@ -923,6 +1015,27 @@ const struct cfg80211_chan_def * cfg80211_chandef_compatible(const struct cfg80211_chan_def *chandef1, const struct cfg80211_chan_def *chandef2); + +/** + * nl80211_chan_width_to_mhz - get the channel width in MHz + * @chan_width: the channel width from &enum nl80211_chan_width + * + * Return: channel width in MHz if the chan_width from &enum nl80211_chan_width + * is valid. -1 otherwise. + */ +int nl80211_chan_width_to_mhz(enum nl80211_chan_width chan_width); + +/** + * cfg80211_chandef_get_width - return chandef width in MHz + * @c: chandef to return bandwidth for + * Return: channel width in MHz for the given chandef; note that it returns + * 80 for 80+80 configurations + */ +static inline int cfg80211_chandef_get_width(const struct cfg80211_chan_def *c) +{ + return nl80211_chan_width_to_mhz(c->width); +} + /** * cfg80211_chandef_valid - check if a channel definition is valid * @chandef: the channel definition to check @@ -954,50 +1067,51 @@ int cfg80211_chandef_dfs_required(struct wiphy *wiphy, enum nl80211_iftype iftype); /** - * nl80211_send_chandef - sends the channel definition. - * @msg: the msg to send channel definition + * cfg80211_chandef_dfs_usable - checks if chandef is DFS usable and we + * can/need start CAC on such channel + * @wiphy: the wiphy to validate against * @chandef: the channel definition to check * - * Returns: 0 if sent the channel definition to msg, < 0 on error - **/ -int nl80211_send_chandef(struct sk_buff *msg, const struct cfg80211_chan_def *chandef); + * Return: true if all channels available and at least + * one channel requires CAC (NL80211_DFS_USABLE) + */ +bool cfg80211_chandef_dfs_usable(struct wiphy *wiphy, + const struct cfg80211_chan_def *chandef); /** - * ieee80211_chanwidth_rate_flags - return rate flags for channel width - * @width: the channel width of the channel - * - * In some channel types, not all rates may be used - for example CCK - * rates may not be used in 5/10 MHz channels. + * cfg80211_chandef_dfs_cac_time - get the DFS CAC time (in ms) for given + * channel definition + * @wiphy: the wiphy to validate against + * @chandef: the channel definition to check * - * Returns: rate flags which apply for this channel width + * Returns: DFS CAC time (in ms) which applies for this channel definition */ -static inline enum ieee80211_rate_flags -ieee80211_chanwidth_rate_flags(enum nl80211_chan_width width) -{ - switch (width) { - case NL80211_CHAN_WIDTH_5: - return IEEE80211_RATE_SUPPORTS_5MHZ; - case NL80211_CHAN_WIDTH_10: - return IEEE80211_RATE_SUPPORTS_10MHZ; - default: - break; - } - return 0; -} +unsigned int +cfg80211_chandef_dfs_cac_time(struct wiphy *wiphy, + const struct cfg80211_chan_def *chandef); /** - * ieee80211_chandef_rate_flags - returns rate flags for a channel - * @chandef: channel definition for the channel - * - * See ieee80211_chanwidth_rate_flags(). + * cfg80211_chandef_primary - calculate primary 40/80/160 MHz freq + * @chandef: chandef to calculate for + * @primary_chan_width: primary channel width to calculate center for + * @punctured: punctured sub-channel bitmap, will be recalculated + * according to the new bandwidth, can be %NULL * - * Returns: rate flags which apply for this channel + * Returns: the primary 40/80/160 MHz channel center frequency, or -1 + * for errors, updating the punctured bitmap */ -static inline enum ieee80211_rate_flags -ieee80211_chandef_rate_flags(struct cfg80211_chan_def *chandef) -{ - return ieee80211_chanwidth_rate_flags(chandef->width); -} +int cfg80211_chandef_primary(const struct cfg80211_chan_def *chandef, + enum nl80211_chan_width primary_chan_width, + u16 *punctured); + +/** + * nl80211_send_chandef - sends the channel definition. + * @msg: the msg to send channel definition + * @chandef: the channel definition to check + * + * Returns: 0 if sent the channel definition to msg, < 0 on error + **/ +int nl80211_send_chandef(struct sk_buff *msg, const struct cfg80211_chan_def *chandef); /** * ieee80211_chandef_max_power - maximum transmission power for the chandef @@ -1032,6 +1146,8 @@ ieee80211_chandef_max_power(struct cfg80211_chan_def *chandef) * @band_mask: which bands to check on * @prohibited_flags: which channels to not consider usable, * %IEEE80211_CHAN_DISABLED is always taken into account + * + * Return: %true if usable channels found, %false otherwise */ bool cfg80211_any_usable_channels(struct wiphy *wiphy, unsigned long band_mask, @@ -1164,11 +1280,13 @@ struct cfg80211_crypto_settings { * struct cfg80211_mbssid_config - AP settings for multi bssid * * @tx_wdev: pointer to the transmitted interface in the MBSSID set + * @tx_link_id: link ID of the transmitted profile in an MLD. * @index: index of this AP in the multi bssid group. * @ema: set to true if the beacons should be sent out in EMA mode. */ struct cfg80211_mbssid_config { struct wireless_dev *tx_wdev; + u8 tx_link_id; u8 index; bool ema; }; @@ -1289,6 +1407,7 @@ struct cfg80211_acl_data { * struct cfg80211_fils_discovery - FILS discovery parameters from * IEEE Std 802.11ai-2016, Annex C.3 MIB detail. * + * @update: Set to true if the feature configuration should be updated. * @min_interval: Minimum packet interval in TUs (0 - 10000) * @max_interval: Maximum packet interval in TUs (0 - 10000) * @tmpl_len: Template length @@ -1296,6 +1415,7 @@ struct cfg80211_acl_data { * frame headers. */ struct cfg80211_fils_discovery { + bool update; u32 min_interval; u32 max_interval; size_t tmpl_len; @@ -1306,6 +1426,7 @@ struct cfg80211_fils_discovery { * struct cfg80211_unsol_bcast_probe_resp - Unsolicited broadcast probe * response parameters in 6GHz. * + * @update: Set to true if the feature configuration should be updated. * @interval: Packet interval in TUs. Maximum allowed is 20 TU, as mentioned * in IEEE P802.11ax/D6.0 26.17.2.3.2 - AP behavior for fast passive * scanning @@ -1313,12 +1434,30 @@ struct cfg80211_fils_discovery { * @tmpl: Template data for probe response */ struct cfg80211_unsol_bcast_probe_resp { + bool update; u32 interval; size_t tmpl_len; const u8 *tmpl; }; /** + * struct cfg80211_s1g_short_beacon - S1G short beacon data. + * + * @update: Set to true if the feature configuration should be updated. + * @short_head: Short beacon head. + * @short_tail: Short beacon tail. + * @short_head_len: Short beacon head len. + * @short_tail_len: Short beacon tail len. + */ +struct cfg80211_s1g_short_beacon { + bool update; + const u8 *short_head; + const u8 *short_tail; + size_t short_head_len; + size_t short_tail_len; +}; + +/** * struct cfg80211_ap_settings - AP configuration * * Used to configure an AP interface. @@ -1334,7 +1473,6 @@ struct cfg80211_unsol_bcast_probe_resp { * @crypto: crypto settings * @privacy: the BSS uses privacy * @auth_type: Authentication type (algorithm) - * @smps_mode: SMPS mode * @inactivity_timeout: time in seconds to determine station's inactivity. * @p2p_ctwindow: P2P CT Window * @p2p_opp_ps: P2P opportunistic PS @@ -1359,9 +1497,8 @@ struct cfg80211_unsol_bcast_probe_resp { * @fils_discovery: FILS discovery transmission parameters * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters * @mbssid_config: AP settings for multiple bssid - * @punct_bitmap: Preamble puncturing bitmap. Each bit represents - * a 20 MHz channel, lowest bit corresponding to the lowest channel. - * Bit set to 1 indicates that the channel is punctured. + * @s1g_long_beacon_period: S1G long beacon period + * @s1g_short_beacon: S1G short beacon data */ struct cfg80211_ap_settings { struct cfg80211_chan_def chandef; @@ -1375,7 +1512,6 @@ struct cfg80211_ap_settings { struct cfg80211_crypto_settings crypto; bool privacy; enum nl80211_auth_type auth_type; - enum nl80211_smps_mode smps_mode; int inactivity_timeout; u8 p2p_ctwindow; bool p2p_opp_ps; @@ -1396,7 +1532,26 @@ struct cfg80211_ap_settings { struct cfg80211_fils_discovery fils_discovery; struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp; struct cfg80211_mbssid_config mbssid_config; - u16 punct_bitmap; + u8 s1g_long_beacon_period; + struct cfg80211_s1g_short_beacon s1g_short_beacon; +}; + + +/** + * struct cfg80211_ap_update - AP configuration update + * + * Subset of &struct cfg80211_ap_settings, for updating a running AP. + * + * @beacon: beacon data + * @fils_discovery: FILS discovery transmission parameters + * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters + * @s1g_short_beacon: S1G short beacon data + */ +struct cfg80211_ap_update { + struct cfg80211_beacon_data beacon; + struct cfg80211_fils_discovery fils_discovery; + struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp; + struct cfg80211_s1g_short_beacon s1g_short_beacon; }; /** @@ -1411,12 +1566,12 @@ struct cfg80211_ap_settings { * @n_counter_offsets_beacon: number of csa counters the beacon (tail) * @n_counter_offsets_presp: number of csa counters in the probe response * @beacon_after: beacon data to be used on the new channel + * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters * @radar_required: whether radar detection is required on the new channel * @block_tx: whether transmissions should be blocked while changing * @count: number of beacons until switch - * @punct_bitmap: Preamble puncturing bitmap. Each bit represents - * a 20 MHz channel, lowest bit corresponding to the lowest channel. - * Bit set to 1 indicates that the channel is punctured. + * @link_id: defines the link on which channel switch is expected during + * MLO. 0 in case of non-MLO. */ struct cfg80211_csa_settings { struct cfg80211_chan_def chandef; @@ -1426,10 +1581,11 @@ struct cfg80211_csa_settings { unsigned int n_counter_offsets_beacon; unsigned int n_counter_offsets_presp; struct cfg80211_beacon_data beacon_after; + struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp; bool radar_required; bool block_tx; u8 count; - u16 punct_bitmap; + u8 link_id; }; /** @@ -1441,16 +1597,21 @@ struct cfg80211_csa_settings { * @counter_offset_beacon: offsets of the counters within the beacon (tail) * @counter_offset_presp: offsets of the counters within the probe response * @beacon_next: beacon data to be used after the color change + * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters * @count: number of beacons until the color change * @color: the color used after the change + * @link_id: defines the link on which color change is expected during MLO. + * 0 in case of non-MLO. */ struct cfg80211_color_change_settings { struct cfg80211_beacon_data beacon_color_change; u16 counter_offset_beacon; u16 counter_offset_presp; struct cfg80211_beacon_data beacon_next; + struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp; u8 count; u8 color; + u8 link_id; }; /** @@ -1458,6 +1619,7 @@ struct cfg80211_color_change_settings { * * Used to pass interface combination parameters * + * @radio_idx: wiphy radio index or -1 for global * @num_different_channels: the number of different channels we want * to use for verification * @radar_detect: a bitmap where each bit corresponds to a channel @@ -1471,6 +1633,7 @@ struct cfg80211_color_change_settings { * the verification */ struct iface_combination_params { + int radio_idx; int num_different_channels; u8 radar_detect; int iftype_num[NUM_NL80211_IFTYPES]; @@ -1534,6 +1697,7 @@ struct sta_txpwr { * @he_6ghz_capa: HE 6 GHz Band capabilities of station * @eht_capa: EHT capabilities of station * @eht_capa_len: the length of the EHT capabilities + * @s1g_capa: S1G capabilities of station */ struct link_station_parameters { const u8 *mld_mac; @@ -1552,6 +1716,7 @@ struct link_station_parameters { const struct ieee80211_he_6ghz_capa *he_6ghz_capa; const struct ieee80211_eht_cap_elem *eht_capa; u8 eht_capa_len; + const struct ieee80211_s1g_cap *s1g_capa; }; /** @@ -1568,6 +1733,21 @@ struct link_station_del_parameters { }; /** + * struct cfg80211_ttlm_params: TID to link mapping parameters + * + * Used for setting a TID to link mapping. + * + * @dlink: Downlink TID to link mapping, as defined in section 9.4.2.314 + * (TID-To-Link Mapping element) in Draft P802.11be_D4.0. + * @ulink: Uplink TID to link mapping, as defined in section 9.4.2.314 + * (TID-To-Link Mapping element) in Draft P802.11be_D4.0. + */ +struct cfg80211_ttlm_params { + u16 dlink[8]; + u16 ulink[8]; +}; + +/** * struct station_parameters - station parameters * * Used to change and create a new station. @@ -1601,6 +1781,9 @@ struct link_station_del_parameters { * @supported_oper_classes_len: number of supported operating classes * @support_p2p_ps: information if station supports P2P PS mechanism * @airtime_weight: airtime scheduler weight for this station + * @eml_cap_present: Specifies if EML capabilities field (@eml_cap) is + * present/updated + * @eml_cap: EML capabilities of this station * @link_sta_params: link related params. */ struct station_parameters { @@ -1625,6 +1808,8 @@ struct station_parameters { u8 supported_oper_classes_len; int support_p2p_ps; u16 airtime_weight; + bool eml_cap_present; + u16 eml_cap; struct link_station_parameters link_sta_params; }; @@ -1637,11 +1822,15 @@ struct station_parameters { * @subtype: Management frame subtype to use for indicating removal * (10 = Disassociation, 12 = Deauthentication) * @reason_code: Reason code for the Disassociation/Deauthentication frame + * @link_id: Link ID indicating a link that stations to be flushed must be + * using; valid only for MLO, but can also be -1 for MLO to really + * remove all stations. */ struct station_del_parameters { const u8 *mac; u8 subtype; u16 reason_code; + int link_id; }; /** @@ -1682,9 +1871,11 @@ enum cfg80211_station_type { * * Utility function for the @change_station driver method. Call this function * with the appropriate station type looking up the station (and checking that - * it exists). It will verify whether the station change is acceptable, and if - * not will return an error code. Note that it may modify the parameters for - * backward compatibility reasons, so don't use them before calling this. + * it exists). It will verify whether the station change is acceptable. + * + * Return: 0 if the change is acceptable, otherwise an error code. Note that + * it may modify the parameters for backward compatibility reasons, so don't + * use them before calling this. */ int cfg80211_check_station_change(struct wiphy *wiphy, struct station_parameters *params, @@ -1799,9 +1990,9 @@ struct rate_info { * @BSS_PARAM_FLAGS_SHORT_SLOT_TIME: whether short slot time is enabled */ enum bss_param_flags { - BSS_PARAM_FLAGS_CTS_PROT = 1<<0, - BSS_PARAM_FLAGS_SHORT_PREAMBLE = 1<<1, - BSS_PARAM_FLAGS_SHORT_SLOT_TIME = 1<<2, + BSS_PARAM_FLAGS_CTS_PROT = BIT(0), + BSS_PARAM_FLAGS_SHORT_PREAMBLE = BIT(1), + BSS_PARAM_FLAGS_SHORT_SLOT_TIME = BIT(2), }; /** @@ -1873,6 +2064,99 @@ struct cfg80211_tid_stats { #define IEEE80211_MAX_CHAINS 4 /** + * struct link_station_info - link station information + * + * Link station information filled by driver for get_station() and + * dump_station(). + * @filled: bit flag of flags using the bits of &enum nl80211_sta_info to + * indicate the relevant values in this struct for them + * @connected_time: time(in secs) since a link of station is last connected + * @inactive_time: time since last activity for link station(tx/rx) + * in milliseconds + * @assoc_at: bootime (ns) of the last association of link of station + * @rx_bytes: bytes (size of MPDUs) received from this link of station + * @tx_bytes: bytes (size of MPDUs) transmitted to this link of station + * @signal: The signal strength, type depends on the wiphy's signal_type. + * For CFG80211_SIGNAL_TYPE_MBM, value is expressed in _dBm_. + * @signal_avg: Average signal strength, type depends on the wiphy's + * signal_type. For CFG80211_SIGNAL_TYPE_MBM, value is expressed in _dBm_ + * @chains: bitmask for filled values in @chain_signal, @chain_signal_avg + * @chain_signal: per-chain signal strength of last received packet in dBm + * @chain_signal_avg: per-chain signal strength average in dBm + * @txrate: current unicast bitrate from this link of station + * @rxrate: current unicast bitrate to this link of station + * @rx_packets: packets (MSDUs & MMPDUs) received from this link of station + * @tx_packets: packets (MSDUs & MMPDUs) transmitted to this link of station + * @tx_retries: cumulative retry counts (MPDUs) for this link of station + * @tx_failed: number of failed transmissions (MPDUs) (retries exceeded, no ACK) + * @rx_dropped_misc: Dropped for un-specified reason. + * @bss_param: current BSS parameters + * @beacon_loss_count: Number of times beacon loss event has triggered. + * @expected_throughput: expected throughput in kbps (including 802.11 headers) + * towards this station. + * @rx_beacon: number of beacons received from this peer + * @rx_beacon_signal_avg: signal strength average (in dBm) for beacons received + * from this peer + * @rx_duration: aggregate PPDU duration(usecs) for all the frames from a peer + * @tx_duration: aggregate PPDU duration(usecs) for all the frames to a peer + * @airtime_weight: current airtime scheduling weight + * @pertid: per-TID statistics, see &struct cfg80211_tid_stats, using the last + * (IEEE80211_NUM_TIDS) index for MSDUs not encapsulated in QoS-MPDUs. + * Note that this doesn't use the @filled bit, but is used if non-NULL. + * @ack_signal: signal strength (in dBm) of the last ACK frame. + * @avg_ack_signal: average rssi value of ack packet for the no of msdu's has + * been sent. + * @rx_mpdu_count: number of MPDUs received from this station + * @fcs_err_count: number of packets (MPDUs) received from this station with + * an FCS error. This counter should be incremented only when TA of the + * received packet with an FCS error matches the peer MAC address. + * @addr: For MLO STA connection, filled with address of the link of station. + */ +struct link_station_info { + u64 filled; + u32 connected_time; + u32 inactive_time; + u64 assoc_at; + u64 rx_bytes; + u64 tx_bytes; + s8 signal; + s8 signal_avg; + + u8 chains; + s8 chain_signal[IEEE80211_MAX_CHAINS]; + s8 chain_signal_avg[IEEE80211_MAX_CHAINS]; + + struct rate_info txrate; + struct rate_info rxrate; + u32 rx_packets; + u32 tx_packets; + u32 tx_retries; + u32 tx_failed; + u32 rx_dropped_misc; + struct sta_bss_parameters bss_param; + + u32 beacon_loss_count; + + u32 expected_throughput; + + u64 tx_duration; + u64 rx_duration; + u64 rx_beacon; + u8 rx_beacon_signal_avg; + + u16 airtime_weight; + + s8 ack_signal; + s8 avg_ack_signal; + struct cfg80211_tid_stats *pertid; + + u32 rx_mpdu_count; + u32 fcs_err_count; + + u8 addr[ETH_ALEN] __aligned(2); +}; + +/** * struct station_info - station information * * Station information filled by driver for get_station() and dump_station. @@ -1884,9 +2168,6 @@ struct cfg80211_tid_stats { * @assoc_at: bootime (ns) of the last association * @rx_bytes: bytes (size of MPDUs) received from this station * @tx_bytes: bytes (size of MPDUs) transmitted to this station - * @llid: mesh local link id - * @plid: mesh peer link id - * @plink_state: mesh peer link state * @signal: The signal strength, type depends on the wiphy's signal_type. * For CFG80211_SIGNAL_TYPE_MBM, value is expressed in _dBm_. * @signal_avg: Average signal strength, type depends on the wiphy's signal_type. @@ -1906,14 +2187,20 @@ struct cfg80211_tid_stats { * This number should increase every time the list of stations * changes, i.e. when a station is added or removed, so that * userspace can tell whether it got a consistent snapshot. + * @beacon_loss_count: Number of times beacon loss event has triggered. * @assoc_req_ies: IEs from (Re)Association Request. * This is used only when in AP mode with drivers that do not use * user space MLME/SME implementation. The information is provided for * the cfg80211_new_sta() calls to notify user space of the IEs. * @assoc_req_ies_len: Length of assoc_req_ies buffer in octets. * @sta_flags: station flags mask & values - * @beacon_loss_count: Number of times beacon loss event has triggered. * @t_offset: Time offset of the station relative to this host. + * @llid: mesh local link id + * @plid: mesh peer link id + * @plink_state: mesh peer link state + * @connected_to_gate: true if mesh STA has a path to mesh gate + * @connected_to_as: true if mesh STA has a path to authentication server + * @airtime_link_metric: mesh airtime link metric. * @local_pm: local mesh STA power save mode * @peer_pm: peer mesh STA power save mode * @nonpeer_pm: non-peer mesh STA power save mode @@ -1922,7 +2209,6 @@ struct cfg80211_tid_stats { * @rx_beacon: number of beacons received from this peer * @rx_beacon_signal_avg: signal strength average (in dBm) for beacons received * from this peer - * @connected_to_gate: true if mesh STA has a path to mesh gate * @rx_duration: aggregate PPDU duration(usecs) for all the frames from a peer * @tx_duration: aggregate PPDU duration(usecs) for all the frames to a peer * @airtime_weight: current airtime scheduling weight @@ -1936,8 +2222,6 @@ struct cfg80211_tid_stats { * @fcs_err_count: number of packets (MPDUs) received from this station with * an FCS error. This counter should be incremented only when TA of the * received packet with an FCS error matches the peer MAC address. - * @airtime_link_metric: mesh airtime link metric. - * @connected_to_as: true if mesh STA has a path to authentication server * @mlo_params_valid: Indicates @assoc_link_id and @mld_addr fields are filled * by driver. Drivers use this only in cfg80211_new_sta() calls when AP * MLD's MLME/SME is offload to driver. Drivers won't fill this @@ -1956,6 +2240,11 @@ struct cfg80211_tid_stats { * dump_station() callbacks. User space needs this information to determine * the accepted and rejected affiliated links of the connected station. * @assoc_resp_ies_len: Length of @assoc_resp_ies buffer in octets. + * @valid_links: bitmap of valid links, or 0 for non-MLO. Drivers fill this + * information in cfg80211_new_sta(), cfg80211_del_sta_sinfo(), + * get_station() and dump_station() callbacks. + * @links: reference to Link sta entries for MLO STA, all link specific + * information is accessed through links[link_id]. */ struct station_info { u64 filled; @@ -1964,9 +2253,6 @@ struct station_info { u64 assoc_at; u64 rx_bytes; u64 tx_bytes; - u16 llid; - u16 plid; - u8 plink_state; s8 signal; s8 signal_avg; @@ -1986,41 +2272,46 @@ struct station_info { int generation; + u32 beacon_loss_count; + const u8 *assoc_req_ies; size_t assoc_req_ies_len; - u32 beacon_loss_count; s64 t_offset; + u16 llid; + u16 plid; + u8 plink_state; + u8 connected_to_gate; + u8 connected_to_as; + u32 airtime_link_metric; enum nl80211_mesh_power_mode local_pm; enum nl80211_mesh_power_mode peer_pm; enum nl80211_mesh_power_mode nonpeer_pm; u32 expected_throughput; - u64 tx_duration; - u64 rx_duration; - u64 rx_beacon; - u8 rx_beacon_signal_avg; - u8 connected_to_gate; + u16 airtime_weight; - struct cfg80211_tid_stats *pertid; s8 ack_signal; s8 avg_ack_signal; + struct cfg80211_tid_stats *pertid; - u16 airtime_weight; + u64 tx_duration; + u64 rx_duration; + u64 rx_beacon; + u8 rx_beacon_signal_avg; u32 rx_mpdu_count; u32 fcs_err_count; - u32 airtime_link_metric; - - u8 connected_to_as; - bool mlo_params_valid; u8 assoc_link_id; u8 mld_addr[ETH_ALEN] __aligned(2); const u8 *assoc_resp_ies; size_t assoc_resp_ies_len; + + u16 valid_links; + struct link_station_info *links[IEEE80211_MLD_MAX_NUM_LINKS]; }; /** @@ -2042,7 +2333,7 @@ struct cfg80211_sar_sub_specs { struct cfg80211_sar_specs { enum nl80211_sar_type type; u32 num_sub_specs; - struct cfg80211_sar_sub_specs sub_specs[]; + struct cfg80211_sar_sub_specs sub_specs[] __counted_by(num_sub_specs); }; @@ -2078,7 +2369,7 @@ struct cfg80211_sar_capa { * @mac_addr: the mac address of the station of interest * @sinfo: pointer to the structure to fill with the information * - * Returns 0 on success and sinfo is filled with the available information + * Return: 0 on success and sinfo is filled with the available information * otherwise returns a negative error code and the content of sinfo has to be * considered undefined. */ @@ -2104,17 +2395,19 @@ static inline int cfg80211_get_station(struct net_device *dev, * @MONITOR_FLAG_PLCPFAIL: pass frames with bad PLCP * @MONITOR_FLAG_CONTROL: pass control frames * @MONITOR_FLAG_OTHER_BSS: disable BSSID filtering - * @MONITOR_FLAG_COOK_FRAMES: report frames after processing + * @MONITOR_FLAG_COOK_FRAMES: deprecated, will unconditionally be refused * @MONITOR_FLAG_ACTIVE: active monitor, ACKs frames on its MAC address + * @MONITOR_FLAG_SKIP_TX: do not pass locally transmitted frames */ enum monitor_flags { - MONITOR_FLAG_CHANGED = 1<<__NL80211_MNTR_FLAG_INVALID, - MONITOR_FLAG_FCSFAIL = 1<<NL80211_MNTR_FLAG_FCSFAIL, - MONITOR_FLAG_PLCPFAIL = 1<<NL80211_MNTR_FLAG_PLCPFAIL, - MONITOR_FLAG_CONTROL = 1<<NL80211_MNTR_FLAG_CONTROL, - MONITOR_FLAG_OTHER_BSS = 1<<NL80211_MNTR_FLAG_OTHER_BSS, - MONITOR_FLAG_COOK_FRAMES = 1<<NL80211_MNTR_FLAG_COOK_FRAMES, - MONITOR_FLAG_ACTIVE = 1<<NL80211_MNTR_FLAG_ACTIVE, + MONITOR_FLAG_CHANGED = BIT(__NL80211_MNTR_FLAG_INVALID), + MONITOR_FLAG_FCSFAIL = BIT(NL80211_MNTR_FLAG_FCSFAIL), + MONITOR_FLAG_PLCPFAIL = BIT(NL80211_MNTR_FLAG_PLCPFAIL), + MONITOR_FLAG_CONTROL = BIT(NL80211_MNTR_FLAG_CONTROL), + MONITOR_FLAG_OTHER_BSS = BIT(NL80211_MNTR_FLAG_OTHER_BSS), + MONITOR_FLAG_COOK_FRAMES = BIT(NL80211_MNTR_FLAG_COOK_FRAMES), + MONITOR_FLAG_ACTIVE = BIT(NL80211_MNTR_FLAG_ACTIVE), + MONITOR_FLAG_SKIP_TX = BIT(NL80211_MNTR_FLAG_SKIP_TX), }; /** @@ -2181,6 +2474,29 @@ struct mpath_info { }; /** + * enum wiphy_bss_param_flags - bit positions for supported bss parameters. + * + * @WIPHY_BSS_PARAM_CTS_PROT: support changing CTS protection. + * @WIPHY_BSS_PARAM_SHORT_PREAMBLE: support changing short preamble usage. + * @WIPHY_BSS_PARAM_SHORT_SLOT_TIME: support changing short slot time usage. + * @WIPHY_BSS_PARAM_BASIC_RATES: support reconfiguring basic rates. + * @WIPHY_BSS_PARAM_AP_ISOLATE: support changing AP isolation. + * @WIPHY_BSS_PARAM_HT_OPMODE: support changing HT operating mode. + * @WIPHY_BSS_PARAM_P2P_CTWINDOW: support reconfiguring ctwindow. + * @WIPHY_BSS_PARAM_P2P_OPPPS: support changing P2P opportunistic power-save. + */ +enum wiphy_bss_param_flags { + WIPHY_BSS_PARAM_CTS_PROT = BIT(0), + WIPHY_BSS_PARAM_SHORT_PREAMBLE = BIT(1), + WIPHY_BSS_PARAM_SHORT_SLOT_TIME = BIT(2), + WIPHY_BSS_PARAM_BASIC_RATES = BIT(3), + WIPHY_BSS_PARAM_AP_ISOLATE = BIT(4), + WIPHY_BSS_PARAM_HT_OPMODE = BIT(5), + WIPHY_BSS_PARAM_P2P_CTWINDOW = BIT(6), + WIPHY_BSS_PARAM_P2P_OPPPS = BIT(7), +}; + +/** * struct bss_parameters - BSS parameters * * Used to change BSS parameters (mainly for AP mode). @@ -2346,7 +2662,7 @@ struct mesh_config { * @user_mpm: userspace handles all MPM functions * @dtim_period: DTIM period to use * @beacon_interval: beacon interval to use - * @mcast_rate: multicat rate for Mesh Node [6Mbps is the default for 802.11a] + * @mcast_rate: multicast rate for Mesh Node [6Mbps is the default for 802.11a] * @basic_rates: basic rates to use when creating the mesh * @beacon_rate: bitrate to be used for beacons * @userspace_handles_dfs: whether user space controls DFS operation, i.e. @@ -2463,7 +2779,7 @@ struct cfg80211_scan_info { * @short_ssid: short ssid to scan for * @bssid: bssid to scan for * @channel_idx: idx of the channel in the channel array in the scan request - * which the above info relvant to + * which the above info is relevant to * @unsolicited_probe: the AP transmits unsolicited probe response every 20 TU * @short_ssid_valid: @short_ssid is valid and can be used * @psc_no_listen: when set, and the channel is a PSC channel, no need to wait @@ -2487,7 +2803,6 @@ struct cfg80211_scan_6ghz_params { * @n_ssids: number of SSIDs * @channels: channels to scan on. * @n_channels: total number of channels to scan - * @scan_width: channel width for scanning * @ie: optional information element(s) to add into Probe Request or %NULL * @ie_len: length of ie in octets * @duration: how long to listen on each channel, in TUs. If @@ -2500,24 +2815,26 @@ struct cfg80211_scan_6ghz_params { * @wiphy: the wiphy this was for * @scan_start: time (in jiffies) when the scan started * @wdev: the wireless device to scan for - * @info: (internal) information about completed scan - * @notified: (internal) scan request was notified as done or aborted * @no_cck: used to send probe requests at non CCK rate in 2GHz band * @mac_addr: MAC address used with randomisation * @mac_addr_mask: MAC address mask used with randomisation, bits that * are 0 in the mask should be randomised, bits that are 1 should * be taken from the @mac_addr * @scan_6ghz: relevant for split scan request only, - * true if this is the second scan request + * true if this is a 6 GHz scan request + * @first_part: %true if this is the first part of a split scan request or a + * scan that was not split. May be %true for a @scan_6ghz scan if no other + * channels were requested * @n_6ghz_params: number of 6 GHz params * @scan_6ghz_params: 6 GHz params * @bssid: BSSID to scan for (most commonly, the wildcard BSSID) + * @tsf_report_link_id: for MLO, indicates the link ID of the BSS that should be + * used for TSF reporting. Can be set to -1 to indicate no preference. */ struct cfg80211_scan_request { struct cfg80211_ssid *ssids; int n_ssids; u32 n_channels; - enum nl80211_bss_scan_width scan_width; const u8 *ie; size_t ie_len; u16 duration; @@ -2531,19 +2848,17 @@ struct cfg80211_scan_request { u8 mac_addr[ETH_ALEN] __aligned(2); u8 mac_addr_mask[ETH_ALEN] __aligned(2); u8 bssid[ETH_ALEN] __aligned(2); - - /* internal */ struct wiphy *wiphy; unsigned long scan_start; - struct cfg80211_scan_info info; - bool notified; bool no_cck; bool scan_6ghz; + bool first_part; u32 n_6ghz_params; struct cfg80211_scan_6ghz_params *scan_6ghz_params; + s8 tsf_report_link_id; /* keep last */ - struct ieee80211_channel *channels[] __counted_by(n_channels); + struct ieee80211_channel *channels[]; }; static inline void get_random_mask_addr(u8 *buf, const u8 *addr, const u8 *mask) @@ -2565,19 +2880,11 @@ static inline void get_random_mask_addr(u8 *buf, const u8 *addr, const u8 *mask) * @bssid: BSSID to be matched; may be all-zero BSSID in case of SSID match * or no match (RSSI only) * @rssi_thold: don't report scan results below this threshold (in s32 dBm) - * @per_band_rssi_thold: Minimum rssi threshold for each band to be applied - * for filtering out scan results received. Drivers advertize this support - * of band specific rssi based filtering through the feature capability - * %NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD. These band - * specific rssi thresholds take precedence over rssi_thold, if specified. - * If not specified for any band, it will be assigned with rssi_thold of - * corresponding matchset. */ struct cfg80211_match_set { struct cfg80211_ssid ssid; u8 bssid[ETH_ALEN]; s32 rssi_thold; - s32 per_band_rssi_thold[NUM_NL80211_BANDS]; }; /** @@ -2612,14 +2919,13 @@ struct cfg80211_bss_select_adjust { * @ssids: SSIDs to scan for (passed in the probe_reqs in active scans) * @n_ssids: number of SSIDs * @n_channels: total number of channels to scan - * @scan_width: channel width for scanning * @ie: optional information element(s) to add into Probe Request or %NULL * @ie_len: length of ie in octets * @flags: control flags from &enum nl80211_scan_flags * @match_sets: sets of parameters to be matched for a scan result * entry to be considered valid and to be passed to the host * (others are filtered out). - * If ommited, all results are passed. + * If omitted, all results are passed. * @n_match_sets: number of match sets * @report_results: indicates that results were reported for this request * @wiphy: the wiphy this was for @@ -2653,14 +2959,13 @@ struct cfg80211_bss_select_adjust { * to the specified band while deciding whether a better BSS is reported * using @relative_rssi. If delta is a negative number, the BSSs that * belong to the specified band will be penalized by delta dB in relative - * comparisions. + * comparisons. */ struct cfg80211_sched_scan_request { u64 reqid; struct cfg80211_ssid *ssids; int n_ssids; u32 n_channels; - enum nl80211_bss_scan_width scan_width; const u8 *ie; size_t ie_len; u32 flags; @@ -2689,7 +2994,7 @@ struct cfg80211_sched_scan_request { struct list_head list; /* keep last */ - struct ieee80211_channel *channels[]; + struct ieee80211_channel *channels[] __counted_by(n_channels); }; /** @@ -2708,7 +3013,6 @@ enum cfg80211_signal_type { /** * struct cfg80211_inform_bss - BSS inform data * @chan: channel the frame was received on - * @scan_width: scan width that was used * @signal: signal strength value, according to the wiphy's * signal type * @boottime_ns: timestamp (CLOCK_BOOTTIME) when the information was @@ -2724,11 +3028,17 @@ enum cfg80211_signal_type { * the BSS that requested the scan in which the beacon/probe was received. * @chains: bitmask for filled values in @chain_signal. * @chain_signal: per-chain signal strength of last received BSS in dBm. + * @restrict_use: restrict usage, if not set, assume @use_for is + * %NL80211_BSS_USE_FOR_NORMAL. + * @use_for: bitmap of possible usage for this BSS, see + * &enum nl80211_bss_use_for + * @cannot_use_reasons: the reasons (bitmap) for not being able to connect, + * if @restrict_use is set and @use_for is zero (empty); may be 0 for + * unspecified reasons; see &enum nl80211_bss_cannot_use_reasons * @drv_data: Data to be passed through to @inform_bss */ struct cfg80211_inform_bss { struct ieee80211_channel *chan; - enum nl80211_bss_scan_width scan_width; s32 signal; u64 boottime_ns; u64 parent_tsf; @@ -2736,6 +3046,9 @@ struct cfg80211_inform_bss { u8 chains; s8 chain_signal[IEEE80211_MAX_CHAINS]; + u8 restrict_use:1, use_for:7; + u8 cannot_use_reasons; + void *drv_data; }; @@ -2762,7 +3075,6 @@ struct cfg80211_bss_ies { * for use in scan results and similar. * * @channel: channel this BSS is on - * @scan_width: width of the control channel * @bssid: BSSID of the BSS * @beacon_interval: the beacon interval as from the frame * @capability: the capability field in host byte order @@ -2775,6 +3087,8 @@ struct cfg80211_bss_ies { * own the beacon_ies, but they're just pointers to the ones from the * @hidden_beacon_bss struct) * @proberesp_ies: the information elements from the last Probe Response frame + * @proberesp_ecsa_stuck: ECSA element is stuck in the Probe Response frame, + * cannot rely on it having valid data * @hidden_beacon_bss: in case this BSS struct represents a probe response from * a BSS that hides the SSID in its beacon, this points to the BSS struct * that holds the beacon data. @beacon_ies is still valid, of course, and @@ -2784,15 +3098,20 @@ struct cfg80211_bss_ies { * @nontrans_list: list of non-transmitted BSS, if this is a transmitted one * (multi-BSSID support) * @signal: signal strength value (type depends on the wiphy's signal_type) + * @ts_boottime: timestamp of the last BSS update in nanoseconds since boot * @chains: bitmask for filled values in @chain_signal. * @chain_signal: per-chain signal strength of last received BSS in dBm. * @bssid_index: index in the multiple BSS set * @max_bssid_indicator: max number of members in the BSS set + * @use_for: bitmap of possible usage for this BSS, see + * &enum nl80211_bss_use_for + * @cannot_use_reasons: the reasons (bitmap) for not being able to connect, + * if @restrict_use is set and @use_for is zero (empty); may be 0 for + * unspecified reasons; see &enum nl80211_bss_cannot_use_reasons * @priv: private area for driver use, has at least wiphy->bss_priv_size bytes */ struct cfg80211_bss { struct ieee80211_channel *channel; - enum nl80211_bss_scan_width scan_width; const struct cfg80211_bss_ies __rcu *ies; const struct cfg80211_bss_ies __rcu *beacon_ies; @@ -2804,6 +3123,8 @@ struct cfg80211_bss { s32 signal; + u64 ts_boottime; + u16 beacon_interval; u16 capability; @@ -2811,9 +3132,14 @@ struct cfg80211_bss { u8 chains; s8 chain_signal[IEEE80211_MAX_CHAINS]; + u8 proberesp_ecsa_stuck:1; + u8 bssid_index; u8 max_bssid_indicator; + u8 use_for; + u8 cannot_use_reasons; + u8 priv[] __aligned(sizeof(void *)); }; @@ -2851,6 +3177,10 @@ static inline const u8 *ieee80211_bss_get_ie(struct cfg80211_bss *bss, u8 id) * * @bss: The BSS to authenticate with, the callee must obtain a reference * to it if it needs to keep it. + * @supported_selectors: List of selectors that should be assumed to be + * supported by the station. + * SAE_H2E must be assumed supported if set to %NULL. + * @supported_selectors_len: Length of supported_selectors in octets. * @auth_type: Authentication type (algorithm) * @ie: Extra IEs to add to Authentication frame or %NULL * @ie_len: Length of ie buffer in octets @@ -2873,6 +3203,8 @@ struct cfg80211_auth_request { struct cfg80211_bss *bss; const u8 *ie; size_t ie_len; + const u8 *supported_selectors; + u8 supported_selectors_len; enum nl80211_auth_type auth_type; const u8 *key; u8 key_len; @@ -2891,12 +3223,28 @@ struct cfg80211_auth_request { * @elems_len: length of the elements * @disabled: If set this link should be included during association etc. but it * should not be used until enabled by the AP MLD. + * @error: per-link error code, must be <= 0. If there is an error, then the + * operation as a whole must fail. */ struct cfg80211_assoc_link { struct cfg80211_bss *bss; const u8 *elems; size_t elems_len; bool disabled; + int error; +}; + +/** + * struct cfg80211_ml_reconf_req - MLO link reconfiguration request + * @add_links: data for links to add, see &struct cfg80211_assoc_link + * @rem_links: bitmap of links to remove + * @ext_mld_capa_ops: extended MLD capabilities and operations set by + * userspace for the ML reconfiguration action frame + */ +struct cfg80211_ml_reconf_req { + struct cfg80211_assoc_link add_links[IEEE80211_MLD_MAX_NUM_LINKS]; + u16 rem_links; + u16 ext_mld_capa_ops; }; /** @@ -2914,6 +3262,7 @@ struct cfg80211_assoc_link { * @CONNECT_REQ_MLO_SUPPORT: Userspace indicates support for handling MLD links. * Drivers shall disable MLO features for the current association if this * flag is not set. + * @ASSOC_REQ_SPP_AMSDU: SPP A-MSDUs will be used on this connection (if any) */ enum cfg80211_assoc_req_flags { ASSOC_REQ_DISABLE_HT = BIT(0), @@ -2923,6 +3272,7 @@ enum cfg80211_assoc_req_flags { ASSOC_REQ_DISABLE_HE = BIT(4), ASSOC_REQ_DISABLE_EHT = BIT(5), CONNECT_REQ_MLO_SUPPORT = BIT(6), + ASSOC_REQ_SPP_AMSDU = BIT(7), }; /** @@ -2947,6 +3297,10 @@ enum cfg80211_assoc_req_flags { * included in the Current AP address field of the Reassociation Request * frame. * @flags: See &enum cfg80211_assoc_req_flags + * @supported_selectors: supported BSS selectors in IEEE 802.11 format + * (or %NULL for no change). + * If %NULL, then support for SAE_H2E should be assumed. + * @supported_selectors_len: number of supported BSS selectors * @ht_capa: HT Capabilities over-rides. Values set in ht_capa_mask * will be used in ht_capa. Un-supported values will be ignored. * @ht_capa_mask: The bits of ht_capa which are to be used. @@ -2965,6 +3319,8 @@ enum cfg80211_assoc_req_flags { * the link on which the association request should be sent * @ap_mld_addr: AP MLD address in case of MLO association request, * valid iff @link_id >= 0 + * @ext_mld_capa_ops: extended MLD capabilities and operations set by + * userspace for the association */ struct cfg80211_assoc_request { struct cfg80211_bss *bss; @@ -2973,6 +3329,8 @@ struct cfg80211_assoc_request { struct cfg80211_crypto_settings crypto; bool use_mfp; u32 flags; + const u8 *supported_selectors; + u8 supported_selectors_len; struct ieee80211_ht_cap ht_capa; struct ieee80211_ht_cap ht_capa_mask; struct ieee80211_vht_cap vht_capa, vht_capa_mask; @@ -2983,6 +3341,7 @@ struct cfg80211_assoc_request { struct cfg80211_assoc_link links[IEEE80211_MLD_MAX_NUM_LINKS]; const u8 *ap_mld_addr; s8 link_id; + u16 ext_mld_capa_ops; }; /** @@ -3088,8 +3447,8 @@ struct cfg80211_ibss_params { * * @behaviour: requested BSS selection behaviour. * @param: parameters for requestion behaviour. - * @band_pref: preferred band for %NL80211_BSS_SELECT_ATTR_BAND_PREF. - * @adjust: parameters for %NL80211_BSS_SELECT_ATTR_RSSI_ADJUST. + * @param.band_pref: preferred band for %NL80211_BSS_SELECT_ATTR_BAND_PREF. + * @param.adjust: parameters for %NL80211_BSS_SELECT_ATTR_RSSI_ADJUST. */ struct cfg80211_bss_selection { enum nl80211_bss_select_attr behaviour; @@ -3227,15 +3586,15 @@ enum cfg80211_connect_params_changed { * @WIPHY_PARAM_TXQ_QUANTUM: TXQ scheduler quantum */ enum wiphy_params_flags { - WIPHY_PARAM_RETRY_SHORT = 1 << 0, - WIPHY_PARAM_RETRY_LONG = 1 << 1, - WIPHY_PARAM_FRAG_THRESHOLD = 1 << 2, - WIPHY_PARAM_RTS_THRESHOLD = 1 << 3, - WIPHY_PARAM_COVERAGE_CLASS = 1 << 4, - WIPHY_PARAM_DYN_ACK = 1 << 5, - WIPHY_PARAM_TXQ_LIMIT = 1 << 6, - WIPHY_PARAM_TXQ_MEMORY_LIMIT = 1 << 7, - WIPHY_PARAM_TXQ_QUANTUM = 1 << 8, + WIPHY_PARAM_RETRY_SHORT = BIT(0), + WIPHY_PARAM_RETRY_LONG = BIT(1), + WIPHY_PARAM_FRAG_THRESHOLD = BIT(2), + WIPHY_PARAM_RTS_THRESHOLD = BIT(3), + WIPHY_PARAM_COVERAGE_CLASS = BIT(4), + WIPHY_PARAM_DYN_ACK = BIT(5), + WIPHY_PARAM_TXQ_LIMIT = BIT(6), + WIPHY_PARAM_TXQ_MEMORY_LIMIT = BIT(7), + WIPHY_PARAM_TXQ_QUANTUM = BIT(8), }; #define IEEE80211_DEFAULT_AIRTIME_WEIGHT 256 @@ -3394,8 +3753,8 @@ struct cfg80211_coalesce_rules { * @n_rules: number of rules */ struct cfg80211_coalesce { - struct cfg80211_coalesce_rules *rules; int n_rules; + struct cfg80211_coalesce_rules rules[] __counted_by(n_rules); }; /** @@ -3410,7 +3769,7 @@ struct cfg80211_coalesce { struct cfg80211_wowlan_nd_match { struct cfg80211_ssid ssid; int n_channels; - u32 channels[]; + u32 channels[] __counted_by(n_channels); }; /** @@ -3424,7 +3783,7 @@ struct cfg80211_wowlan_nd_match { */ struct cfg80211_wowlan_nd_info { int n_matches; - struct cfg80211_wowlan_nd_match *matches[]; + struct cfg80211_wowlan_nd_match *matches[] __counted_by(n_matches); }; /** @@ -3447,12 +3806,15 @@ struct cfg80211_wowlan_nd_info { * @tcp_connlost: TCP connection lost or failed to establish * @tcp_nomoretokens: TCP data ran out of tokens * @net_detect: if not %NULL, woke up because of net detect + * @unprot_deauth_disassoc: woke up due to unprotected deauth or + * disassoc frame (in MFP). */ struct cfg80211_wowlan_wakeup { bool disconnect, magic_pkt, gtk_rekey_failure, eap_identity_req, four_way_handshake, rfkill_release, packet_80211, - tcp_match, tcp_connlost, tcp_nomoretokens; + tcp_match, tcp_connlost, tcp_nomoretokens, + unprot_deauth_disassoc; s32 pattern_idx; u32 packet_present_len, packet_len; const void *packet; @@ -3495,7 +3857,7 @@ struct cfg80211_update_ft_ies_params { * This structure provides information needed to transmit a mgmt frame * * @chan: channel to use - * @offchan: indicates wether off channel operation is required + * @offchan: indicates whether off channel operation is required * @wait: duration for ROC * @buf: buffer to transmit * @len: buffer length @@ -3565,6 +3927,38 @@ struct cfg80211_qos_map { }; /** + * struct cfg80211_nan_band_config - NAN band specific configuration + * + * @chan: Pointer to the IEEE 802.11 channel structure. The channel to be used + * for NAN operations on this band. For 2.4 GHz band, this is always + * channel 6. For 5 GHz band, the channel is either 44 or 149, according + * to the regulatory constraints. If chan pointer is NULL the entire band + * configuration entry is considered invalid and should not be used. + * @rssi_close: RSSI close threshold used for NAN state transition algorithm + * as described in chapters 3.3.6 and 3.3.7 "NAN Device Role and State + * Transition" of Wi-Fi Aware Specification v4.0. If not + * specified (set to 0), default device value is used. The value should + * be greater than -60 dBm. + * @rssi_middle: RSSI middle threshold used for NAN state transition algorithm. + * as described in chapters 3.3.6 and 3.3.7 "NAN Device Role and State + * Transition" of Wi-Fi Aware Specification v4.0. If not + * specified (set to 0), default device value is used. The value should be + * greater than -75 dBm and less than rssi_close. + * @awake_dw_interval: Committed DW interval. Valid values range: 0-5. 0 + * indicates no wakeup for DW and can't be used on 2.4GHz band, otherwise + * 2^(n-1). + * @disable_scan: If true, the device will not scan this band for cluster + * merge. Disabling scan on 2.4 GHz band is not allowed. + */ +struct cfg80211_nan_band_config { + struct ieee80211_channel *chan; + s8 rssi_close; + s8 rssi_middle; + u8 awake_dw_interval; + bool disable_scan; +}; + +/** * struct cfg80211_nan_conf - NAN configuration * * This struct defines NAN configuration parameters @@ -3573,10 +3967,34 @@ struct cfg80211_qos_map { * @bands: operating bands, a bitmap of &enum nl80211_band values. * For instance, for NL80211_BAND_2GHZ, bit 0 would be set * (i.e. BIT(NL80211_BAND_2GHZ)). + * @cluster_id: cluster ID used for NAN synchronization. This is a MAC address + * that can take a value from 50-6F-9A-01-00-00 to 50-6F-9A-01-FF-FF. + * If NULL, the device will pick a random Cluster ID. + * @scan_period: period (in seconds) between NAN scans. + * @scan_dwell_time: dwell time (in milliseconds) for NAN scans. + * @discovery_beacon_interval: interval (in TUs) for discovery beacons. + * @enable_dw_notification: flag to enable/disable discovery window + * notifications. + * @band_cfgs: array of band specific configurations, indexed by + * &enum nl80211_band values. + * @extra_nan_attrs: pointer to additional NAN attributes. + * @extra_nan_attrs_len: length of the additional NAN attributes. + * @vendor_elems: pointer to vendor-specific elements. + * @vendor_elems_len: length of the vendor-specific elements. */ struct cfg80211_nan_conf { u8 master_pref; u8 bands; + const u8 *cluster_id; + u16 scan_period; + u16 scan_dwell_time; + u8 discovery_beacon_interval; + bool enable_dw_notification; + struct cfg80211_nan_band_config band_cfgs[NUM_NL80211_BANDS]; + const u8 *extra_nan_attrs; + u16 extra_nan_attrs_len; + const u8 *vendor_elems; + u16 vendor_elems_len; }; /** @@ -3585,10 +4003,17 @@ struct cfg80211_nan_conf { * * @CFG80211_NAN_CONF_CHANGED_PREF: master preference * @CFG80211_NAN_CONF_CHANGED_BANDS: operating bands + * @CFG80211_NAN_CONF_CHANGED_CONFIG: changed additional configuration. + * When this flag is set, it indicates that some additional attribute(s) + * (other then master_pref and bands) have been changed. In this case, + * all the unchanged attributes will be properly configured to their + * previous values. The driver doesn't need to store any + * previous configuration besides master_pref and bands. */ enum cfg80211_nan_conf_changes { CFG80211_NAN_CONF_CHANGED_PREF = BIT(0), CFG80211_NAN_CONF_CHANGED_BANDS = BIT(1), + CFG80211_NAN_CONF_CHANGED_CONFIG = BIT(2), }; /** @@ -3613,7 +4038,7 @@ struct cfg80211_nan_func_filter { * @publish_bcast: if true, the solicited publish should be broadcasted * @subscribe_active: if true, the subscribe is active * @followup_id: the instance ID for follow up - * @followup_reqid: the requestor instance ID for follow up + * @followup_reqid: the requester instance ID for follow up * @followup_dest: MAC address of the recipient of the follow up * @ttl: time to live counter in DW. * @serv_spec_info: Service Specific Info @@ -4401,6 +4826,19 @@ struct mgmt_frame_regs { * @del_link_station: Remove a link of a station. * * @set_hw_timestamp: Enable/disable HW timestamping of TM/FTM frames. + * @set_ttlm: set the TID to link mapping. + * @set_epcs: Enable/Disable EPCS for station mode. + * @get_radio_mask: get bitmask of radios in use. + * (invoked with the wiphy mutex held) + * @assoc_ml_reconf: Request a non-AP MLO connection to perform ML + * reconfiguration, i.e., add and/or remove links to/from the + * association using ML reconfiguration action frames. Successfully added + * links will be added to the set of valid links. Successfully removed + * links will be removed from the set of valid links. The driver must + * indicate removed links by calling cfg80211_links_removed() and added + * links by calling cfg80211_mlo_reconf_add_done(). When calling + * cfg80211_mlo_reconf_add_done() the bss pointer must be given for each + * link for which MLO reconfiguration 'add' operation was requested. */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -4450,7 +4888,7 @@ struct cfg80211_ops { int (*start_ap)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_ap_settings *settings); int (*change_beacon)(struct wiphy *wiphy, struct net_device *dev, - struct cfg80211_beacon_data *info); + struct cfg80211_ap_update *info); int (*stop_ap)(struct wiphy *wiphy, struct net_device *dev, unsigned int link_id); @@ -4513,6 +4951,7 @@ struct cfg80211_ops { struct ieee80211_channel *chan); int (*set_monitor_channel)(struct wiphy *wiphy, + struct net_device *dev, struct cfg80211_chan_def *chandef); int (*scan)(struct wiphy *wiphy, @@ -4544,12 +4983,14 @@ struct cfg80211_ops { int (*set_mcast_rate)(struct wiphy *wiphy, struct net_device *dev, int rate[NUM_NL80211_BANDS]); - int (*set_wiphy_params)(struct wiphy *wiphy, u32 changed); + int (*set_wiphy_params)(struct wiphy *wiphy, int radio_idx, + u32 changed); int (*set_tx_power)(struct wiphy *wiphy, struct wireless_dev *wdev, + int radio_idx, enum nl80211_tx_power_setting type, int mbm); int (*get_tx_power)(struct wiphy *wiphy, struct wireless_dev *wdev, - int *dbm); + int radio_idx, unsigned int link_id, int *dbm); void (*rfkill_poll)(struct wiphy *wiphy); @@ -4611,8 +5052,10 @@ struct cfg80211_ops { struct wireless_dev *wdev, struct mgmt_frame_regs *upd); - int (*set_antenna)(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant); - int (*get_antenna)(struct wiphy *wiphy, u32 *tx_ant, u32 *rx_ant); + int (*set_antenna)(struct wiphy *wiphy, int radio_idx, + u32 tx_ant, u32 rx_ant); + int (*get_antenna)(struct wiphy *wiphy, int radio_idx, + u32 *tx_ant, u32 *rx_ant); int (*sched_scan_start)(struct wiphy *wiphy, struct net_device *dev, @@ -4654,9 +5097,9 @@ struct cfg80211_ops { int (*start_radar_detection)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_chan_def *chandef, - u32 cac_time_ms); + u32 cac_time_ms, int link_id); void (*end_cac)(struct wiphy *wiphy, - struct net_device *dev); + struct net_device *dev, unsigned int link_id); int (*update_ft_ies)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_update_ft_ies_params *ftie); int (*crit_proto_start)(struct wiphy *wiphy, @@ -4760,6 +5203,13 @@ struct cfg80211_ops { struct link_station_del_parameters *params); int (*set_hw_timestamp)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_set_hw_timestamp *hwts); + int (*set_ttlm)(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_ttlm_params *params); + u32 (*get_radio_mask)(struct wiphy *wiphy, struct net_device *dev); + int (*assoc_ml_reconf)(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_ml_reconf_req *req); + int (*set_epcs)(struct wiphy *wiphy, struct net_device *dev, + bool val); }; /* @@ -4771,7 +5221,7 @@ struct cfg80211_ops { * enum wiphy_flags - wiphy capability flags * * @WIPHY_FLAG_SPLIT_SCAN_6GHZ: if set to true, the scan request will be split - * into two, first for legacy bands and second for UHB. + * into two, first for legacy bands and second for 6 GHz. * @WIPHY_FLAG_NETNS_OK: if not set, do not allow changing the netns of this * wiphy at all * @WIPHY_FLAG_PS_ON_BY_DEFAULT: if set to true, powersave will be enabled @@ -4816,6 +5266,11 @@ struct cfg80211_ops { * @WIPHY_FLAG_SUPPORTS_EXT_KCK_32: The device supports 32-byte KCK keys. * @WIPHY_FLAG_NOTIFY_REGDOM_BY_DRIVER: The device could handle reg notify for * NL80211_REGDOM_SET_BY_DRIVER. + * @WIPHY_FLAG_CHANNEL_CHANGE_ON_BEACON: reg_call_notifier() is called if driver + * set this flag to update channels on beacon hints. + * @WIPHY_FLAG_SUPPORTS_NSTR_NONPRIMARY: support connection to non-primary link + * of an NSTR mobile AP MLD. + * @WIPHY_FLAG_DISABLE_WEXT: disable wireless extensions for this device */ enum wiphy_flags { WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK = BIT(0), @@ -4827,9 +5282,10 @@ enum wiphy_flags { WIPHY_FLAG_4ADDR_STATION = BIT(6), WIPHY_FLAG_CONTROL_PORT_PROTOCOL = BIT(7), WIPHY_FLAG_IBSS_RSN = BIT(8), + WIPHY_FLAG_DISABLE_WEXT = BIT(9), WIPHY_FLAG_MESH_AUTH = BIT(10), WIPHY_FLAG_SUPPORTS_EXT_KCK_32 = BIT(11), - /* use hole at 12 */ + WIPHY_FLAG_SUPPORTS_NSTR_NONPRIMARY = BIT(12), WIPHY_FLAG_SUPPORTS_FW_ROAM = BIT(13), WIPHY_FLAG_AP_UAPSD = BIT(14), WIPHY_FLAG_SUPPORTS_TDLS = BIT(15), @@ -4842,6 +5298,7 @@ enum wiphy_flags { WIPHY_FLAG_SUPPORTS_5_10_MHZ = BIT(22), WIPHY_FLAG_HAS_CHANNEL_SWITCH = BIT(23), WIPHY_FLAG_NOTIFY_REGDOM_BY_DRIVER = BIT(24), + WIPHY_FLAG_CHANNEL_CHANGE_ON_BEACON = BIT(25), }; /** @@ -4858,7 +5315,9 @@ struct ieee80211_iface_limit { * struct ieee80211_iface_combination - possible interface combination * * With this structure the driver can describe which interface - * combinations it supports concurrently. + * combinations it supports concurrently. When set in a struct wiphy_radio, + * the combinations refer to combinations of interfaces currently active on + * that radio. * * Examples: * @@ -5156,6 +5615,8 @@ struct wiphy_iftype_ext_capab { * cfg80211_get_iftype_ext_capa - lookup interface type extended capability * @wiphy: the wiphy to look up from * @type: the interface type to look up + * + * Return: The extended capability for the given interface @type, may be %NULL */ const struct wiphy_iftype_ext_capab * cfg80211_get_iftype_ext_capa(struct wiphy *wiphy, enum nl80211_iftype type); @@ -5216,6 +5677,94 @@ struct wiphy_iftype_akm_suites { int n_akm_suites; }; +/** + * struct wiphy_radio_cfg - physical radio config of a wiphy + * This structure describes the configurations of a physical radio in a + * wiphy. It is used to denote per-radio attributes belonging to a wiphy. + * + * @rts_threshold: RTS threshold (dot11RTSThreshold); + * -1 (default) = RTS/CTS disabled + * @radio_debugfsdir: Pointer to debugfs directory containing the radio- + * specific parameters. + * NULL (default) = Debugfs directory not created + */ +struct wiphy_radio_cfg { + u32 rts_threshold; + struct dentry *radio_debugfsdir; +}; + +/** + * struct wiphy_radio_freq_range - wiphy frequency range + * @start_freq: start range edge frequency (kHz) + * @end_freq: end range edge frequency (kHz) + */ +struct wiphy_radio_freq_range { + u32 start_freq; + u32 end_freq; +}; + + +/** + * struct wiphy_radio - physical radio of a wiphy + * This structure describes a physical radio belonging to a wiphy. + * It is used to describe concurrent-channel capabilities. Only one channel + * can be active on the radio described by struct wiphy_radio. + * + * @freq_range: frequency range that the radio can operate on. + * @n_freq_range: number of elements in @freq_range + * + * @iface_combinations: Valid interface combinations array, should not + * list single interface types. + * @n_iface_combinations: number of entries in @iface_combinations array. + * + * @antenna_mask: bitmask of antennas connected to this radio. + */ +struct wiphy_radio { + const struct wiphy_radio_freq_range *freq_range; + int n_freq_range; + + const struct ieee80211_iface_combination *iface_combinations; + int n_iface_combinations; + + u32 antenna_mask; +}; + +/** + * enum wiphy_nan_flags - NAN capabilities + * + * @WIPHY_NAN_FLAGS_CONFIGURABLE_SYNC: Device supports NAN configurable + * synchronization. + * @WIPHY_NAN_FLAGS_USERSPACE_DE: Device doesn't support DE offload. + */ +enum wiphy_nan_flags { + WIPHY_NAN_FLAGS_CONFIGURABLE_SYNC = BIT(0), + WIPHY_NAN_FLAGS_USERSPACE_DE = BIT(1), +}; + +/** + * struct wiphy_nan_capa - NAN capabilities + * + * This structure describes the NAN capabilities of a wiphy. + * + * @flags: NAN capabilities flags, see &enum wiphy_nan_flags + * @op_mode: NAN operation mode, as defined in Wi-Fi Aware (TM) specification + * Table 81. + * @n_antennas: number of antennas supported by the device for Tx/Rx. Lower + * nibble indicates the number of TX antennas and upper nibble indicates the + * number of RX antennas. Value 0 indicates the information is not + * available. + * @max_channel_switch_time: maximum channel switch time in milliseconds. + * @dev_capabilities: NAN device capabilities as defined in Wi-Fi Aware (TM) + * specification Table 79 (Capabilities field). + */ +struct wiphy_nan_capa { + u32 flags; + u8 op_mode; + u8 n_antennas; + u16 max_channel_switch_time; + u8 dev_capabilities; +}; + #define CFG80211_HW_TIMESTAMP_ALL_PEERS 0xffff /** @@ -5376,6 +5925,11 @@ struct wiphy_iftype_akm_suites { * and probe responses. This value should be set if the driver * wishes to limit the number of csa counters. Default (0) means * infinite. + * @bss_param_support: bitmask indicating which bss_parameters as defined in + * &struct bss_parameters the driver can actually handle in the + * .change_bss() callback. The bit positions are defined in &enum + * wiphy_bss_param_flags. + * * @bss_select_support: bitmask indicating the BSS selection criteria supported * by the driver in the .connect() callback. The bit position maps to the * attribute indices defined in &enum nl80211_bss_select_attr. @@ -5384,6 +5938,7 @@ struct wiphy_iftype_akm_suites { * bitmap of &enum nl80211_band values. For instance, for * NL80211_BAND_2GHZ, bit 0 would be set * (i.e. BIT(NL80211_BAND_2GHZ)). + * @nan_capa: NAN capabilities * * @txq_limit: configuration of internal TX queue frame limit * @txq_memory_limit: configuration internal TX queue memory limit @@ -5434,6 +5989,13 @@ struct wiphy_iftype_akm_suites { * A value of %CFG80211_HW_TIMESTAMP_ALL_PEERS indicates the driver * supports enabling HW timestamping for all peers (i.e. no need to * specify a mac address). + * + * @radio_cfg: configuration of radios belonging to a muli-radio wiphy. This + * struct contains a list of all radio specific attributes and should be + * used only for multi-radio wiphy. + * + * @radio: radios belonging to this wiphy + * @n_radio: number of radios */ struct wiphy { struct mutex mtx; @@ -5521,6 +6083,8 @@ struct wiphy { void (*reg_notifier)(struct wiphy *wiphy, struct regulatory_request *request); + struct wiphy_radio_cfg *radio_cfg; + /* fields below are read-only, assigned by cfg80211 */ const struct ieee80211_regdomain __rcu *regd; @@ -5552,9 +6116,11 @@ struct wiphy { u8 max_num_csa_counters; + u32 bss_param_support; u32 bss_select_support; u8 nan_supported_bands; + struct wiphy_nan_capa nan_capa; u32 txq_limit; u32 txq_memory_limit; @@ -5584,6 +6150,9 @@ struct wiphy { u16 hw_timestamp_max_peers; + int n_radio; + const struct wiphy_radio *radio; + char priv[] __aligned(NETDEV_ALIGN); }; @@ -5717,7 +6286,7 @@ int wiphy_register(struct wiphy *wiphy); * @wiphy: the wiphy to check the locking on * @p: The pointer to read, prior to dereferencing * - * Return the value of the specified RCU-protected pointer, but omit the + * Return: the value of the specified RCU-protected pointer, but omit the * READ_ONCE(), because caller holds the wiphy mutex used for updates. */ #define wiphy_dereference(wiphy, p) \ @@ -5726,6 +6295,10 @@ int wiphy_register(struct wiphy *wiphy); /** * get_wiphy_regdom - get custom regdomain for the given wiphy * @wiphy: the wiphy to get the regdomain from + * + * Context: Requires any of RTNL, wiphy mutex or RCU protection. + * + * Return: pointer to the regulatory domain associated with the wiphy */ const struct ieee80211_regdomain *get_wiphy_regdom(struct wiphy *wiphy); @@ -5787,6 +6360,10 @@ static inline void wiphy_unlock(struct wiphy *wiphy) mutex_unlock(&wiphy->mtx); } +DEFINE_GUARD(wiphy, struct wiphy *, + mutex_lock(&_T->mtx), + mutex_unlock(&_T->mtx)) + struct wiphy_work; typedef void (*wiphy_work_func_t)(struct wiphy *, struct wiphy_work *); @@ -5826,6 +6403,16 @@ void wiphy_work_queue(struct wiphy *wiphy, struct wiphy_work *work); */ void wiphy_work_cancel(struct wiphy *wiphy, struct wiphy_work *work); +/** + * wiphy_work_flush - flush previously queued work + * @wiphy: the wiphy, for debug purposes + * @work: the work to flush, this can be %NULL to flush all work + * + * Flush the work (i.e. run it if pending). This must be called + * under the wiphy mutex acquired by wiphy_lock(). + */ +void wiphy_work_flush(struct wiphy *wiphy, struct wiphy_work *work); + struct wiphy_delayed_work { struct wiphy_work work; struct wiphy *wiphy; @@ -5853,6 +6440,11 @@ static inline void wiphy_delayed_work_init(struct wiphy_delayed_work *dwork, * after wiphy_lock() was called. Therefore, wiphy_cancel_work() can * use just cancel_work() instead of cancel_work_sync(), it requires * being in a section protected by wiphy_lock(). + * + * Note that these are scheduled with a timer where the accuracy + * becomes less the longer in the future the scheduled timer is. Use + * wiphy_hrtimer_work_queue() if the timer must be not be late by more + * than approximately 10 percent. */ void wiphy_delayed_work_queue(struct wiphy *wiphy, struct wiphy_delayed_work *dwork, @@ -5870,6 +6462,149 @@ void wiphy_delayed_work_cancel(struct wiphy *wiphy, struct wiphy_delayed_work *dwork); /** + * wiphy_delayed_work_flush - flush previously queued delayed work + * @wiphy: the wiphy, for debug purposes + * @dwork: the delayed work to flush + * + * Flush the work (i.e. run it if pending). This must be called + * under the wiphy mutex acquired by wiphy_lock(). + */ +void wiphy_delayed_work_flush(struct wiphy *wiphy, + struct wiphy_delayed_work *dwork); + +/** + * wiphy_delayed_work_pending - Find out whether a wiphy delayable + * work item is currently pending. + * + * @wiphy: the wiphy, for debug purposes + * @dwork: the delayed work in question + * + * Return: true if timer is pending, false otherwise + * + * How wiphy_delayed_work_queue() works is by setting a timer which + * when it expires calls wiphy_work_queue() to queue the wiphy work. + * Because wiphy_delayed_work_queue() uses mod_timer(), if it is + * called twice and the second call happens before the first call + * deadline, the work will rescheduled for the second deadline and + * won't run before that. + * + * wiphy_delayed_work_pending() can be used to detect if calling + * wiphy_work_delayed_work_queue() would start a new work schedule + * or delayed a previous one. As seen below it cannot be used to + * detect precisely if the work has finished to execute nor if it + * is currently executing. + * + * CPU0 CPU1 + * wiphy_delayed_work_queue(wk) + * mod_timer(wk->timer) + * wiphy_delayed_work_pending(wk) -> true + * + * [...] + * expire_timers(wk->timer) + * detach_timer(wk->timer) + * wiphy_delayed_work_pending(wk) -> false + * wk->timer->function() | + * wiphy_work_queue(wk) | delayed work pending + * list_add_tail() | returns false but + * queue_work(cfg80211_wiphy_work) | wk->func() has not + * | been run yet + * [...] | + * cfg80211_wiphy_work() | + * wk->func() V + * + */ +bool wiphy_delayed_work_pending(struct wiphy *wiphy, + struct wiphy_delayed_work *dwork); + +struct wiphy_hrtimer_work { + struct wiphy_work work; + struct wiphy *wiphy; + struct hrtimer timer; +}; + +enum hrtimer_restart wiphy_hrtimer_work_timer(struct hrtimer *t); + +static inline void wiphy_hrtimer_work_init(struct wiphy_hrtimer_work *hrwork, + wiphy_work_func_t func) +{ + hrtimer_setup(&hrwork->timer, wiphy_hrtimer_work_timer, + CLOCK_BOOTTIME, HRTIMER_MODE_REL); + wiphy_work_init(&hrwork->work, func); +} + +/** + * wiphy_hrtimer_work_queue - queue hrtimer work for the wiphy + * @wiphy: the wiphy to queue for + * @hrwork: the high resolution timer worker + * @delay: the delay given as a ktime_t + * + * Please refer to wiphy_delayed_work_queue(). The difference is that + * the hrtimer work uses a high resolution timer for scheduling. This + * may be needed if timeouts might be scheduled further in the future + * and the accuracy of the normal timer is not sufficient. + * + * Expect a delay of a few milliseconds as the timer is scheduled + * with some slack and some more time may pass between queueing the + * work and its start. + */ +void wiphy_hrtimer_work_queue(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrwork, + ktime_t delay); + +/** + * wiphy_hrtimer_work_cancel - cancel previously queued hrtimer work + * @wiphy: the wiphy, for debug purposes + * @hrtimer: the hrtimer work to cancel + * + * Cancel the work *without* waiting for it, this assumes being + * called under the wiphy mutex acquired by wiphy_lock(). + */ +void wiphy_hrtimer_work_cancel(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrtimer); + +/** + * wiphy_hrtimer_work_flush - flush previously queued hrtimer work + * @wiphy: the wiphy, for debug purposes + * @hrwork: the hrtimer work to flush + * + * Flush the work (i.e. run it if pending). This must be called + * under the wiphy mutex acquired by wiphy_lock(). + */ +void wiphy_hrtimer_work_flush(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrwork); + +/** + * wiphy_hrtimer_work_pending - Find out whether a wiphy hrtimer + * work item is currently pending. + * + * @wiphy: the wiphy, for debug purposes + * @hrwork: the hrtimer work in question + * + * Return: true if timer is pending, false otherwise + * + * Please refer to the wiphy_delayed_work_pending() documentation as + * this is the equivalent function for hrtimer based delayed work + * items. + */ +bool wiphy_hrtimer_work_pending(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrwork); + +/** + * enum ieee80211_ap_reg_power - regulatory power for an Access Point + * + * @IEEE80211_REG_UNSET_AP: Access Point has no regulatory power mode + * @IEEE80211_REG_LPI_AP: Indoor Access Point + * @IEEE80211_REG_SP_AP: Standard power Access Point + * @IEEE80211_REG_VLP_AP: Very low power Access Point + */ +enum ieee80211_ap_reg_power { + IEEE80211_REG_UNSET_AP, + IEEE80211_REG_LPI_AP, + IEEE80211_REG_SP_AP, + IEEE80211_REG_VLP_AP, +}; + +/** * struct wireless_dev - wireless device state * * For netdevs, this structure must be allocated by the driver @@ -5897,7 +6632,6 @@ void wiphy_delayed_work_cancel(struct wiphy *wiphy, * wireless device if it has no netdev * @u: union containing data specific to @iftype * @connected: indicates if connected or not (STA mode) - * @bssid: (private) Used by the internal configuration code * @wext: (private) Used by the internal wireless extensions compat code * @wext.ibss: (private) IBSS data part of wext handling * @wext.connect: (private) connection handling data @@ -5917,16 +6651,9 @@ void wiphy_delayed_work_cancel(struct wiphy *wiphy, * @mgmt_registrations: list of registrations for management frames * @mgmt_registrations_need_update: mgmt registrations were updated, * need to propagate the update to the driver - * @mtx: mutex used to lock data in this struct, may be used by drivers - * and some API functions require it held - * @beacon_interval: beacon interval used on this device for transmitting - * beacons, 0 when not valid * @address: The address for this device, valid only if @netdev is %NULL * @is_running: true if this is a non-netdev device that has been started, e.g. * the P2P Device. - * @cac_started: true if DFS channel availability check has been started - * @cac_start_time: timestamp (jiffies) when the dfs state was entered. - * @cac_time_ms: CAC time in ms * @ps: powersave mode is enabled * @ps_timeout: dynamic powersave timeout * @ap_unexpected_nlportid: (private) netlink port ID of application @@ -5941,6 +6668,7 @@ void wiphy_delayed_work_cancel(struct wiphy *wiphy, * @event_lock: (private) lock for event list * @owner_nlportid: (private) owner socket port ID * @nl_owner_dead: (private) owner socket went away + * @cqm_rssi_work: (private) CQM RSSI reporting work * @cqm_config: (private) nl80211 RSSI monitor state * @pmsr_list: (private) peer measurement requests * @pmsr_lock: (private) peer measurements requests/results lock @@ -5949,7 +6677,13 @@ void wiphy_delayed_work_cancel(struct wiphy *wiphy, * unprotected beacon report * @links: array of %IEEE80211_MLD_MAX_NUM_LINKS elements containing @addr * @ap and @client for each link + * @links.cac_started: true if DFS channel availability check has been + * started + * @links.cac_start_time: timestamp (jiffies) when the dfs state was + * entered. + * @links.cac_time_ms: CAC time in ms * @valid_links: bitmap describing what elements of @links are valid + * @radio_mask: Bitmask of radios that this interface is allowed to operate on. */ struct wireless_dev { struct wiphy *wiphy; @@ -5964,8 +6698,6 @@ struct wireless_dev { struct list_head mgmt_registrations; u8 mgmt_registrations_need_update:1; - struct mutex mtx; - bool use_4addr, is_running, registered, registering; u8 address[ETH_ALEN] __aligned(sizeof(u16)); @@ -5992,11 +6724,6 @@ struct wireless_dev { u32 owner_nlportid; bool nl_owner_dead; - /* FIXME: need to rework radar detection for MLO */ - bool cac_started; - unsigned long cac_start_time; - unsigned int cac_time_ms; - #ifdef CONFIG_CFG80211_WEXT /* wext data */ struct { @@ -6013,7 +6740,8 @@ struct wireless_dev { } wext; #endif - struct cfg80211_cqm_config *cqm_config; + struct wiphy_work cqm_rssi_work; + struct cfg80211_cqm_config __rcu *cqm_config; struct list_head pmsr_list; spinlock_t pmsr_lock; @@ -6031,7 +6759,7 @@ struct wireless_dev { int beacon_interval; struct cfg80211_chan_def preset_chandef; struct cfg80211_chan_def chandef; - u8 id[IEEE80211_MAX_SSID_LEN]; + u8 id[IEEE80211_MAX_MESH_ID_LEN]; u8 id_len, id_up_len; } mesh; struct { @@ -6049,6 +6777,9 @@ struct wireless_dev { struct { struct cfg80211_chan_def chandef; } ocb; + struct { + u8 cluster_id[ETH_ALEN] __aligned(2); + } nan; } u; struct { @@ -6062,8 +6793,14 @@ struct wireless_dev { struct cfg80211_internal_bss *current_bss; } client; }; + + bool cac_started; + unsigned long cac_start_time; + unsigned int cac_time_ms; } links[IEEE80211_MLD_MAX_NUM_LINKS]; u16 valid_links; + + u32 radio_mask; }; static inline const u8 *wdev_address(struct wireless_dev *wdev) @@ -6151,16 +6888,6 @@ ieee80211_channel_to_khz(const struct ieee80211_channel *chan) } /** - * ieee80211_s1g_channel_width - get allowed channel width from @chan - * - * Only allowed for band NL80211_BAND_S1GHZ - * @chan: channel - * Return: The allowed channel width for this center_freq - */ -enum nl80211_chan_width -ieee80211_s1g_channel_width(const struct ieee80211_channel *chan); - -/** * ieee80211_channel_to_freq_khz - convert channel number to frequency * @chan: channel number * @band: band, necessary due to channel number overlap @@ -6227,6 +6954,8 @@ ieee80211_get_channel(struct wiphy *wiphy, int freq) * * The Preferred Scanning Channels (PSC) are defined in * Draft IEEE P802.11ax/D5.0, 26.17.2.3.3 + * + * Return: %true if channel is a PSC, %false otherwise */ static inline bool cfg80211_channel_is_psc(struct ieee80211_channel *chan) { @@ -6237,6 +6966,41 @@ static inline bool cfg80211_channel_is_psc(struct ieee80211_channel *chan) } /** + * ieee80211_radio_freq_range_valid - Check if the radio supports the + * specified frequency range + * + * @radio: wiphy radio + * @freq: the frequency (in KHz) to be queried + * @width: the bandwidth (in KHz) to be queried + * + * Return: whether or not the given frequency range is valid for the given radio + */ +bool ieee80211_radio_freq_range_valid(const struct wiphy_radio *radio, + u32 freq, u32 width); + +/** + * cfg80211_radio_chandef_valid - Check if the radio supports the chandef + * + * @radio: wiphy radio + * @chandef: chandef for current channel + * + * Return: whether or not the given chandef is valid for the given radio + */ +bool cfg80211_radio_chandef_valid(const struct wiphy_radio *radio, + const struct cfg80211_chan_def *chandef); + +/** + * cfg80211_wdev_channel_allowed - Check if the wdev may use the channel + * + * @wdev: the wireless device + * @chan: channel to check + * + * Return: whether or not the wdev may use the channel + */ +bool cfg80211_wdev_channel_allowed(struct wireless_dev *wdev, + struct ieee80211_channel *chan); + +/** * ieee80211_get_response_rate - get basic rate for a given rate * * @sband: the band to look for rates in @@ -6255,13 +7019,11 @@ ieee80211_get_response_rate(struct ieee80211_supported_band *sband, /** * ieee80211_mandatory_rates - get mandatory rates for a given band * @sband: the band to look for rates in - * @scan_width: width of the control channel * - * This function returns a bitmap of the mandatory rates for the given - * band, bits are set according to the rate position in the bitrates array. + * Return: a bitmap of the mandatory rates for the given band, bits + * are set according to the rate position in the bitrates array. */ -u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband, - enum nl80211_bss_scan_width scan_width); +u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband); /* * Radiotap parsing functions -- for controlled injection support @@ -6473,6 +7235,8 @@ bool ieee80211_get_8023_tunnel_proto(const void *hdr, __be16 *proto); * header to the ethernet header (if present). * * @skb: The 802.3 frame with embedded mesh header + * + * Return: 0 on success. Non-zero on error. */ int ieee80211_strip_8023_mesh_hdr(struct sk_buff *skb); @@ -6602,7 +7366,7 @@ static inline const u8 *cfg80211_find_ie(u8 eid, const u8 *ies, int len) * @ies: data consisting of IEs * @len: length of data * - * Return: %NULL if the etended element could not be found or if + * Return: %NULL if the extended element could not be found or if * the element is invalid (claims to be longer than the given * data) or if the byte array doesn't match; otherwise return the * requested element struct. @@ -6681,13 +7445,45 @@ cfg80211_find_vendor_ie(unsigned int oui, int oui_type, } /** + * enum cfg80211_rnr_iter_ret - reduced neighbor report iteration state + * @RNR_ITER_CONTINUE: continue iterating with the next entry + * @RNR_ITER_BREAK: break iteration and return success + * @RNR_ITER_ERROR: break iteration and return error + */ +enum cfg80211_rnr_iter_ret { + RNR_ITER_CONTINUE, + RNR_ITER_BREAK, + RNR_ITER_ERROR, +}; + +/** + * cfg80211_iter_rnr - iterate reduced neighbor report entries + * @elems: the frame elements to iterate RNR elements and then + * their entries in + * @elems_len: length of the elements + * @iter: iteration function, see also &enum cfg80211_rnr_iter_ret + * for the return value + * @iter_data: additional data passed to the iteration function + * Return: %true on success (after successfully iterating all entries + * or if the iteration function returned %RNR_ITER_BREAK), + * %false on error (iteration function returned %RNR_ITER_ERROR + * or elements were malformed.) + */ +bool cfg80211_iter_rnr(const u8 *elems, size_t elems_len, + enum cfg80211_rnr_iter_ret + (*iter)(void *data, u8 type, + const struct ieee80211_neighbor_ap_info *info, + const u8 *tbtt_info, u8 tbtt_info_len), + void *iter_data); + +/** * cfg80211_defragment_element - Defrag the given element data into a buffer * * @elem: the element to defragment * @ies: elements where @elem is contained * @ieslen: length of @ies - * @data: buffer to store element data - * @data_len: length of @data + * @data: buffer to store element data, or %NULL to just determine size + * @data_len: length of @data, or 0 * @frag_id: the element ID of fragments * * Return: length of @data, or -EINVAL on error @@ -6749,7 +7545,7 @@ int regulatory_hint(struct wiphy *wiphy, const char *alpha2); /** * regulatory_set_wiphy_regd - set regdom info for self managed drivers * @wiphy: the wireless device we want to process the regulatory domain on - * @rd: the regulatory domain informatoin to use for this wiphy + * @rd: the regulatory domain information to use for this wiphy * * Set the regulatory domain information for self-managed wiphys, only they * may use this function. See %REGULATORY_WIPHY_SELF_MANAGED for more @@ -6819,6 +7615,8 @@ const struct ieee80211_reg_rule *freq_reg_info(struct wiphy *wiphy, * * You can use this to map the regulatory request initiator enum to a * proper string representation. + * + * Return: pointer to string representation of the initiator */ const char *reg_initiator_name(enum nl80211_reg_initiator initiator); @@ -6827,6 +7625,8 @@ const char *reg_initiator_name(enum nl80211_reg_initiator initiator); * @wiphy: wiphy for which pre-CAC capability is checked. * * Pre-CAC is allowed only in some regdomains (notable ETSI). + * + * Return: %true if allowed, %false otherwise */ bool regulatory_pre_cac_allowed(struct wiphy *wiphy); @@ -6840,7 +7640,7 @@ bool regulatory_pre_cac_allowed(struct wiphy *wiphy); * Regulatory self-managed driver can use it to proactively * * @alpha2: the ISO/IEC 3166 alpha2 wmm rule to be queried. - * @freq: the freqency(in MHz) to be queried. + * @freq: the frequency (in MHz) to be queried. * @rule: pointer to store the wmm rule from the regulatory db. * * Self-managed wireless drivers can use this function to query @@ -6923,22 +7723,6 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, gfp_t gfp); static inline struct cfg80211_bss * __must_check -cfg80211_inform_bss_width_frame(struct wiphy *wiphy, - struct ieee80211_channel *rx_channel, - enum nl80211_bss_scan_width scan_width, - struct ieee80211_mgmt *mgmt, size_t len, - s32 signal, gfp_t gfp) -{ - struct cfg80211_inform_bss data = { - .chan = rx_channel, - .scan_width = scan_width, - .signal = signal, - }; - - return cfg80211_inform_bss_frame_data(wiphy, &data, mgmt, len, gfp); -} - -static inline struct cfg80211_bss * __must_check cfg80211_inform_bss_frame(struct wiphy *wiphy, struct ieee80211_channel *rx_channel, struct ieee80211_mgmt *mgmt, size_t len, @@ -6946,7 +7730,6 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy, { struct cfg80211_inform_bss data = { .chan = rx_channel, - .scan_width = NL80211_BSS_CHAN_WIDTH_20, .signal = signal, }; @@ -6978,6 +7761,8 @@ static inline void cfg80211_gen_new_bssid(const u8 *bssid, u8 max_bssid, * cfg80211_is_element_inherited - returns if element ID should be inherited * @element: element to check * @non_inherit_element: non inheritance element + * + * Return: %true if should be inherited, %false otherwise */ bool cfg80211_is_element_inherited(const struct element *element, const struct element *non_inherit_element); @@ -6990,6 +7775,8 @@ bool cfg80211_is_element_inherited(const struct element *element, * @sub_elem: current MBSSID subelement (profile) * @merged_ie: location of the merged profile * @max_copy_len: max merged profile length + * + * Return: the number of bytes merged */ size_t cfg80211_merge_profile(const u8 *ie, size_t ielen, const struct element *mbssid_elem, @@ -7002,11 +7789,13 @@ size_t cfg80211_merge_profile(const u8 *ie, size_t ielen, * from a beacon or probe response * @CFG80211_BSS_FTYPE_BEACON: data comes from a beacon * @CFG80211_BSS_FTYPE_PRESP: data comes from a probe response + * @CFG80211_BSS_FTYPE_S1G_BEACON: data comes from an S1G beacon */ enum cfg80211_bss_frame_type { CFG80211_BSS_FTYPE_UNKNOWN, CFG80211_BSS_FTYPE_BEACON, CFG80211_BSS_FTYPE_PRESP, + CFG80211_BSS_FTYPE_S1G_BEACON, }; /** @@ -7015,12 +7804,29 @@ enum cfg80211_bss_frame_type { * @ielen: length of IEs * @band: enum nl80211_band of the channel * - * Returns the channel number, or -1 if none could be determined. + * Return: the channel number, or -1 if none could be determined. */ int cfg80211_get_ies_channel_number(const u8 *ie, size_t ielen, enum nl80211_band band); /** + * cfg80211_ssid_eq - compare two SSIDs + * @a: first SSID + * @b: second SSID + * + * Return: %true if SSIDs are equal, %false otherwise. + */ +static inline bool +cfg80211_ssid_eq(struct cfg80211_ssid *a, struct cfg80211_ssid *b) +{ + if (WARN_ON(!a || !b)) + return false; + if (a->ssid_len != b->ssid_len) + return false; + return memcmp(a->ssid, b->ssid, a->ssid_len) ? false : true; +} + +/** * cfg80211_inform_bss_data - inform cfg80211 of a new BSS * * @wiphy: the wiphy reporting the BSS @@ -7049,26 +7855,6 @@ cfg80211_inform_bss_data(struct wiphy *wiphy, gfp_t gfp); static inline struct cfg80211_bss * __must_check -cfg80211_inform_bss_width(struct wiphy *wiphy, - struct ieee80211_channel *rx_channel, - enum nl80211_bss_scan_width scan_width, - enum cfg80211_bss_frame_type ftype, - const u8 *bssid, u64 tsf, u16 capability, - u16 beacon_interval, const u8 *ie, size_t ielen, - s32 signal, gfp_t gfp) -{ - struct cfg80211_inform_bss data = { - .chan = rx_channel, - .scan_width = scan_width, - .signal = signal, - }; - - return cfg80211_inform_bss_data(wiphy, &data, ftype, bssid, tsf, - capability, beacon_interval, ie, ielen, - gfp); -} - -static inline struct cfg80211_bss * __must_check cfg80211_inform_bss(struct wiphy *wiphy, struct ieee80211_channel *rx_channel, enum cfg80211_bss_frame_type ftype, @@ -7078,7 +7864,6 @@ cfg80211_inform_bss(struct wiphy *wiphy, { struct cfg80211_inform_bss data = { .chan = rx_channel, - .scan_width = NL80211_BSS_CHAN_WIDTH_20, .signal = signal, }; @@ -7088,6 +7873,27 @@ cfg80211_inform_bss(struct wiphy *wiphy, } /** + * __cfg80211_get_bss - get a BSS reference + * @wiphy: the wiphy this BSS struct belongs to + * @channel: the channel to search on (or %NULL) + * @bssid: the desired BSSID (or %NULL) + * @ssid: the desired SSID (or %NULL) + * @ssid_len: length of the SSID (or 0) + * @bss_type: type of BSS, see &enum ieee80211_bss_type + * @privacy: privacy filter, see &enum ieee80211_privacy + * @use_for: indicates which use is intended + * + * Return: Reference-counted BSS on success. %NULL on error. + */ +struct cfg80211_bss *__cfg80211_get_bss(struct wiphy *wiphy, + struct ieee80211_channel *channel, + const u8 *bssid, + const u8 *ssid, size_t ssid_len, + enum ieee80211_bss_type bss_type, + enum ieee80211_privacy privacy, + u32 use_for); + +/** * cfg80211_get_bss - get a BSS reference * @wiphy: the wiphy this BSS struct belongs to * @channel: the channel to search on (or %NULL) @@ -7096,13 +7902,22 @@ cfg80211_inform_bss(struct wiphy *wiphy, * @ssid_len: length of the SSID (or 0) * @bss_type: type of BSS, see &enum ieee80211_bss_type * @privacy: privacy filter, see &enum ieee80211_privacy + * + * This version implies regular usage, %NL80211_BSS_USE_FOR_NORMAL. + * + * Return: Reference-counted BSS on success. %NULL on error. */ -struct cfg80211_bss *cfg80211_get_bss(struct wiphy *wiphy, - struct ieee80211_channel *channel, - const u8 *bssid, - const u8 *ssid, size_t ssid_len, - enum ieee80211_bss_type bss_type, - enum ieee80211_privacy privacy); +static inline struct cfg80211_bss * +cfg80211_get_bss(struct wiphy *wiphy, struct ieee80211_channel *channel, + const u8 *bssid, const u8 *ssid, size_t ssid_len, + enum ieee80211_bss_type bss_type, + enum ieee80211_privacy privacy) +{ + return __cfg80211_get_bss(wiphy, channel, bssid, ssid, ssid_len, + bss_type, privacy, + NL80211_BSS_USE_FOR_NORMAL); +} + static inline struct cfg80211_bss * cfg80211_get_ibss(struct wiphy *wiphy, struct ieee80211_channel *channel, @@ -7163,19 +7978,6 @@ void cfg80211_bss_iter(struct wiphy *wiphy, void *data), void *iter_data); -static inline enum nl80211_bss_scan_width -cfg80211_chandef_to_scan_width(const struct cfg80211_chan_def *chandef) -{ - switch (chandef->width) { - case NL80211_CHAN_WIDTH_5: - return NL80211_BSS_CHAN_WIDTH_5; - case NL80211_CHAN_WIDTH_10: - return NL80211_BSS_CHAN_WIDTH_10; - default: - return NL80211_BSS_CHAN_WIDTH_20; - } -} - /** * cfg80211_rx_mlme_mgmt - notification of processed MLME management frame * @dev: network device @@ -7208,9 +8010,7 @@ void cfg80211_rx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len); void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr); /** - * struct cfg80211_rx_assoc_resp - association response data - * @bss: the BSS that association was requested with, ownership of the pointer - * moves to cfg80211 in the call to cfg80211_rx_assoc_resp() + * struct cfg80211_rx_assoc_resp_data - association response data * @buf: (Re)Association Response frame (header + body) * @len: length of the frame data * @uapsd_queues: bitmap of queues configured for uapsd. Same format @@ -7220,10 +8020,12 @@ void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr); * @ap_mld_addr: AP MLD address (in case of MLO) * @links: per-link information indexed by link ID, use links[0] for * non-MLO connections + * @links.bss: the BSS that association was requested with, ownership of the + * pointer moves to cfg80211 in the call to cfg80211_rx_assoc_resp() * @links.status: Set this (along with a BSS pointer) for links that * were rejected by the AP. */ -struct cfg80211_rx_assoc_resp { +struct cfg80211_rx_assoc_resp_data { const u8 *buf; size_t len; const u8 *req_ies; @@ -7231,7 +8033,7 @@ struct cfg80211_rx_assoc_resp { int uapsd_queues; const u8 *ap_mld_addr; struct { - const u8 *addr; + u8 addr[ETH_ALEN] __aligned(2); struct cfg80211_bss *bss; u16 status; } links[IEEE80211_MLD_MAX_NUM_LINKS]; @@ -7240,7 +8042,7 @@ struct cfg80211_rx_assoc_resp { /** * cfg80211_rx_assoc_resp - notification of processed association response * @dev: network device - * @data: association response data, &struct cfg80211_rx_assoc_resp + * @data: association response data, &struct cfg80211_rx_assoc_resp_data * * After being asked to associate via cfg80211_ops::assoc() the driver must * call either this function or cfg80211_auth_timeout(). @@ -7248,7 +8050,7 @@ struct cfg80211_rx_assoc_resp { * This function may sleep. The caller must hold the corresponding wdev's mutex. */ void cfg80211_rx_assoc_resp(struct net_device *dev, - struct cfg80211_rx_assoc_resp *data); + const struct cfg80211_rx_assoc_resp_data *data); /** * struct cfg80211_assoc_failure - association failure data @@ -7367,7 +8169,7 @@ void cfg80211_notify_new_peer_candidate(struct net_device *dev, * RFkill integration in cfg80211 is almost invisible to drivers, * as cfg80211 automatically registers an rfkill instance for each * wireless device it knows about. Soft kill is also translated - * into disconnecting and turning all interfaces off, drivers are + * into disconnecting and turning all interfaces off. Drivers are * expected to turn off the device when all interfaces are down. * * However, devices may have a hard RFkill line, in which case they @@ -7415,7 +8217,7 @@ static inline void wiphy_rfkill_stop_polling(struct wiphy *wiphy) * the configuration mechanism. * * A driver supporting vendor commands must register them as an array - * in struct wiphy, with handlers for each one, each command has an + * in struct wiphy, with handlers for each one. Each command has an * OUI and sub command ID to identify it. * * Note that this feature should not be (ab)used to implement protocol @@ -7488,8 +8290,9 @@ int cfg80211_vendor_cmd_reply(struct sk_buff *skb); * cfg80211_vendor_cmd_get_sender - get the current sender netlink ID * @wiphy: the wiphy * - * Return the current netlink port ID in a vendor command handler. - * Valid to call only there. + * Return: the current netlink port ID in a vendor command handler. + * + * Context: May only be called from a vendor command handler */ unsigned int cfg80211_vendor_cmd_get_sender(struct wiphy *wiphy); @@ -7579,7 +8382,7 @@ static inline void cfg80211_vendor_event(struct sk_buff *skb, gfp_t gfp) * interact with driver-specific tools to aid, for instance, * factory programming. * - * This chapter describes how drivers interact with it, for more + * This chapter describes how drivers interact with it. For more * information see the nl80211 book's chapter on it. */ @@ -7967,7 +8770,8 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, * cfg80211_port_authorized - notify cfg80211 of successful security association * * @dev: network device - * @bssid: the BSSID of the AP + * @peer_addr: BSSID of the AP/P2P GO in case of STA/GC or STA/GC MAC address + * in case of AP/P2P GO * @td_bitmap: transition disable policy * @td_bitmap_len: Length of transition disable policy * @gfp: allocation flags @@ -7978,8 +8782,11 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, * should be preceded with a call to cfg80211_connect_result(), * cfg80211_connect_done(), cfg80211_connect_bss() or cfg80211_roamed() to * indicate the 802.11 association. + * This function can also be called by AP/P2P GO driver that supports + * authentication offload. In this case the peer_mac passed is that of + * associated STA/GC. */ -void cfg80211_port_authorized(struct net_device *dev, const u8 *bssid, +void cfg80211_port_authorized(struct net_device *dev, const u8 *peer_addr, const u8* td_bitmap, u8 td_bitmap_len, gfp_t gfp); /** @@ -8038,10 +8845,23 @@ void cfg80211_tx_mgmt_expired(struct wireless_dev *wdev, u64 cookie, * * @sinfo: the station information * @gfp: allocation flags + * + * Return: 0 on success. Non-zero on error. */ int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp); /** + * cfg80211_link_sinfo_alloc_tid_stats - allocate per-tid statistics. + * + * @link_sinfo: the link station information + * @gfp: allocation flags + * + * Return: 0 on success. Non-zero on error. + */ +int cfg80211_link_sinfo_alloc_tid_stats(struct link_station_info *link_sinfo, + gfp_t gfp); + +/** * cfg80211_sinfo_release_content - release contents of station info * @sinfo: the station information * @@ -8052,6 +8872,13 @@ int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp); static inline void cfg80211_sinfo_release_content(struct station_info *sinfo) { kfree(sinfo->pertid); + + for (int link_id = 0; link_id < ARRAY_SIZE(sinfo->links); link_id++) { + if (sinfo->links[link_id]) { + kfree(sinfo->links[link_id]->pertid); + kfree(sinfo->links[link_id]); + } + } } /** @@ -8412,6 +9239,7 @@ void cfg80211_sta_opmode_change_notify(struct net_device *dev, const u8 *mac, * @chandef: chandef for the current channel * @event: type of event * @gfp: context flags + * @link_id: valid link_id for MLO operation or 0 otherwise. * * This function is called when a Channel availability check (CAC) is finished * or aborted. This must be called to notify the completion of a CAC process, @@ -8419,7 +9247,8 @@ void cfg80211_sta_opmode_change_notify(struct net_device *dev, const u8 *mac, */ void cfg80211_cac_event(struct net_device *netdev, const struct cfg80211_chan_def *chandef, - enum nl80211_radar_event event, gfp_t gfp); + enum nl80211_radar_event event, gfp_t gfp, + unsigned int link_id); /** * cfg80211_background_cac_abort - Channel Availability Check offchan abort event @@ -8454,6 +9283,7 @@ void cfg80211_pmksa_candidate_notify(struct net_device *dev, int index, /** * cfg80211_rx_spurious_frame - inform userspace about a spurious frame * @dev: The device the frame matched to + * @link_id: the link the frame was received on, -1 if not applicable or unknown * @addr: the transmitter address * @gfp: context flags * @@ -8463,13 +9293,14 @@ void cfg80211_pmksa_candidate_notify(struct net_device *dev, int index, * Return: %true if the frame was passed to userspace (or this failed * for a reason other than not having a subscription.) */ -bool cfg80211_rx_spurious_frame(struct net_device *dev, - const u8 *addr, gfp_t gfp); +bool cfg80211_rx_spurious_frame(struct net_device *dev, const u8 *addr, + int link_id, gfp_t gfp); /** * cfg80211_rx_unexpected_4addr_frame - inform about unexpected WDS frame * @dev: The device the frame matched to * @addr: the transmitter address + * @link_id: the link the frame was received on, -1 if not applicable or unknown * @gfp: context flags * * This function is used in AP mode (only!) to inform userspace that @@ -8479,8 +9310,8 @@ bool cfg80211_rx_spurious_frame(struct net_device *dev, * Return: %true if the frame was passed to userspace (or this failed * for a reason other than not having a subscription.) */ -bool cfg80211_rx_unexpected_4addr_frame(struct net_device *dev, - const u8 *addr, gfp_t gfp); +bool cfg80211_rx_unexpected_4addr_frame(struct net_device *dev, const u8 *addr, + int link_id, gfp_t gfp); /** * cfg80211_probe_status - notify userspace about probe status @@ -8532,6 +9363,34 @@ static inline void cfg80211_report_obss_beacon(struct wiphy *wiphy, } /** + * struct cfg80211_beaconing_check_config - beacon check configuration + * @iftype: the interface type to check for + * @relax: allow IR-relaxation conditions to apply (e.g. another + * interface connected already on the same channel) + * NOTE: If this is set, wiphy mutex must be held. + * @reg_power: &enum ieee80211_ap_reg_power value indicating the + * advertised/used 6 GHz regulatory power setting + */ +struct cfg80211_beaconing_check_config { + enum nl80211_iftype iftype; + enum ieee80211_ap_reg_power reg_power; + bool relax; +}; + +/** + * cfg80211_reg_check_beaconing - check if beaconing is allowed + * @wiphy: the wiphy + * @chandef: the channel definition + * @cfg: additional parameters for the checking + * + * Return: %true if there is no secondary channel or the secondary channel(s) + * can be used for beaconing (i.e. is not a radar channel etc.) + */ +bool cfg80211_reg_check_beaconing(struct wiphy *wiphy, + struct cfg80211_chan_def *chandef, + struct cfg80211_beaconing_check_config *cfg); + +/** * cfg80211_reg_can_beacon - check if beaconing is allowed * @wiphy: the wiphy * @chandef: the channel definition @@ -8540,9 +9399,17 @@ static inline void cfg80211_report_obss_beacon(struct wiphy *wiphy, * Return: %true if there is no secondary channel or the secondary channel(s) * can be used for beaconing (i.e. is not a radar channel etc.) */ -bool cfg80211_reg_can_beacon(struct wiphy *wiphy, - struct cfg80211_chan_def *chandef, - enum nl80211_iftype iftype); +static inline bool +cfg80211_reg_can_beacon(struct wiphy *wiphy, + struct cfg80211_chan_def *chandef, + enum nl80211_iftype iftype) +{ + struct cfg80211_beaconing_check_config config = { + .iftype = iftype, + }; + + return cfg80211_reg_check_beaconing(wiphy, chandef, &config); +} /** * cfg80211_reg_can_beacon_relax - check if beaconing is allowed with relaxation @@ -8555,34 +9422,41 @@ bool cfg80211_reg_can_beacon(struct wiphy *wiphy, * also checks if IR-relaxation conditions apply, to allow beaconing under * more permissive conditions. * - * Requires the wiphy mutex to be held. + * Context: Requires the wiphy mutex to be held. */ -bool cfg80211_reg_can_beacon_relax(struct wiphy *wiphy, - struct cfg80211_chan_def *chandef, - enum nl80211_iftype iftype); +static inline bool +cfg80211_reg_can_beacon_relax(struct wiphy *wiphy, + struct cfg80211_chan_def *chandef, + enum nl80211_iftype iftype) +{ + struct cfg80211_beaconing_check_config config = { + .iftype = iftype, + .relax = true, + }; -/* + return cfg80211_reg_check_beaconing(wiphy, chandef, &config); +} + +/** * cfg80211_ch_switch_notify - update wdev channel and notify userspace * @dev: the device which switched channels * @chandef: the new channel definition * @link_id: the link ID for MLO, must be 0 for non-MLO - * @punct_bitmap: the new puncturing bitmap * - * Caller must acquire wdev_lock, therefore must only be called from sleepable + * Caller must hold wiphy mutex, therefore must only be called from sleepable * driver context! */ void cfg80211_ch_switch_notify(struct net_device *dev, struct cfg80211_chan_def *chandef, - unsigned int link_id, u16 punct_bitmap); + unsigned int link_id); -/* +/** * cfg80211_ch_switch_started_notify - notify channel switch start * @dev: the device on which the channel switch started * @chandef: the future channel definition * @link_id: the link ID for MLO, must be 0 for non-MLO * @count: the number of TBTTs until the channel switch happens * @quiet: whether or not immediate quiet was requested by the AP - * @punct_bitmap: the future puncturing bitmap * * Inform the userspace about the channel switch that has just * started, so that it can take appropriate actions (eg. starting @@ -8591,7 +9465,7 @@ void cfg80211_ch_switch_notify(struct net_device *dev, void cfg80211_ch_switch_started_notify(struct net_device *dev, struct cfg80211_chan_def *chandef, unsigned int link_id, u8 count, - bool quiet, u16 punct_bitmap); + bool quiet); /** * ieee80211_operating_class_to_band - convert operating class to band @@ -8599,18 +9473,31 @@ void cfg80211_ch_switch_started_notify(struct net_device *dev, * @operating_class: the operating class to convert * @band: band pointer to fill * - * Returns %true if the conversion was successful, %false otherwise. + * Return: %true if the conversion was successful, %false otherwise. */ bool ieee80211_operating_class_to_band(u8 operating_class, enum nl80211_band *band); /** + * ieee80211_operating_class_to_chandef - convert operating class to chandef + * + * @operating_class: the operating class to convert + * @chan: the ieee80211_channel to convert + * @chandef: a pointer to the resulting chandef + * + * Return: %true if the conversion was successful, %false otherwise. + */ +bool ieee80211_operating_class_to_chandef(u8 operating_class, + struct ieee80211_channel *chan, + struct cfg80211_chan_def *chandef); + +/** * ieee80211_chandef_to_operating_class - convert chandef to operation class * * @chandef: the chandef to convert * @op_class: a pointer to the resulting operating class * - * Returns %true if the conversion was successful, %false otherwise. + * Return: %true if the conversion was successful, %false otherwise. */ bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef, u8 *op_class); @@ -8620,7 +9507,7 @@ bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef, * * @chandef: the chandef to convert * - * Returns the center frequency of chandef (1st segment) in KHz. + * Return: the center frequency of chandef (1st segment) in KHz. */ static inline u32 ieee80211_chandef_to_khz(const struct cfg80211_chan_def *chandef) @@ -8628,7 +9515,7 @@ ieee80211_chandef_to_khz(const struct cfg80211_chan_def *chandef) return MHZ_TO_KHZ(chandef->center_freq1) + chandef->freq1_offset; } -/* +/** * cfg80211_tdls_oper_request - request userspace to perform TDLS operation * @dev: the device on which the operation is requested * @peer: the MAC address of the peer device @@ -8647,11 +9534,11 @@ void cfg80211_tdls_oper_request(struct net_device *dev, const u8 *peer, enum nl80211_tdls_operation oper, u16 reason_code, gfp_t gfp); -/* +/** * cfg80211_calculate_bitrate - calculate actual bitrate (in 100Kbps units) * @rate: given rate_info to calculate bitrate from * - * return 0 if MCS index >= 32 + * Return: calculated bitrate */ u32 cfg80211_calculate_bitrate(struct rate_info *rate); @@ -8665,7 +9552,7 @@ u32 cfg80211_calculate_bitrate(struct rate_info *rate); * when the driver wishes to unregister the wdev, e.g. when the hardware device * is unbound from the driver. * - * Requires the RTNL and wiphy mutex to be held. + * Context: Requires the RTNL and wiphy mutex to be held. */ void cfg80211_unregister_wdev(struct wireless_dev *wdev); @@ -8678,7 +9565,9 @@ void cfg80211_unregister_wdev(struct wireless_dev *wdev); * held. Otherwise, both register_netdevice() and register_netdev() are usable * instead as well. * - * Requires the RTNL and wiphy mutex to be held. + * Context: Requires the RTNL and wiphy mutex to be held. + * + * Return: 0 on success. Non-zero on error. */ int cfg80211_register_netdevice(struct net_device *dev); @@ -8691,7 +9580,7 @@ int cfg80211_register_netdevice(struct net_device *dev); * is held. Otherwise, both unregister_netdevice() and unregister_netdev() are * usable instead as well. * - * Requires the RTNL and wiphy mutex to be held. + * Context: Requires the RTNL and wiphy mutex to be held. */ static inline void cfg80211_unregister_netdevice(struct net_device *dev) { @@ -8767,9 +9656,9 @@ int cfg80211_get_p2p_attr(const u8 *ies, unsigned int len, * correctly, if not the result of using this function will not * be ordered correctly either, i.e. it does no reordering. * - * The function returns the offset where the next part of the - * buffer starts, which may be @ielen if the entire (remainder) - * of the buffer should be used. + * Return: The offset where the next part of the buffer starts, which + * may be @ielen if the entire (remainder) of the buffer should be + * used. */ size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen, const u8 *ids, int n_ids, @@ -8797,9 +9686,9 @@ size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen, * correctly, if not the result of using this function will not * be ordered correctly either, i.e. it does no reordering. * - * The function returns the offset where the next part of the - * buffer starts, which may be @ielen if the entire (remainder) - * of the buffer should be used. + * Return: The offset where the next part of the buffer starts, which + * may be @ielen if the entire (remainder) of the buffer should be + * used. */ static inline size_t ieee80211_ie_split(const u8 *ies, size_t ielen, const u8 *ids, int n_ids, size_t offset) @@ -8808,6 +9697,18 @@ static inline size_t ieee80211_ie_split(const u8 *ies, size_t ielen, } /** + * ieee80211_fragment_element - fragment the last element in skb + * @skb: The skbuf that the element was added to + * @len_pos: Pointer to length of the element to fragment + * @frag_id: The element ID to use for fragments + * + * This function fragments all data after @len_pos, adding fragmentation + * elements with the given ID as appropriate. The SKB will grow in size + * accordingly. + */ +void ieee80211_fragment_element(struct sk_buff *skb, u8 *len_pos, u8 frag_id); + +/** * cfg80211_report_wowlan_wakeup - report wakeup from WoWLAN * @wdev: the wireless device reporting the wakeup * @wakeup: the wakeup report @@ -8851,6 +9752,8 @@ unsigned int ieee80211_get_num_supported_channels(struct wiphy *wiphy); * This function can be called by the driver to check whether a * combination of interfaces and their types are allowed according to * the interface combinations. + * + * Return: 0 if combinations are allowed. Non-zero on error. */ int cfg80211_check_combinations(struct wiphy *wiphy, struct iface_combination_params *params); @@ -8866,14 +9769,27 @@ int cfg80211_check_combinations(struct wiphy *wiphy, * This function can be called by the driver to check what possible * combinations it fits in at a given moment, e.g. for channel switching * purposes. + * + * Return: 0 on success. Non-zero on error. */ int cfg80211_iter_combinations(struct wiphy *wiphy, struct iface_combination_params *params, void (*iter)(const struct ieee80211_iface_combination *c, void *data), void *data); +/** + * cfg80211_get_radio_idx_by_chan - get the radio index by the channel + * + * @wiphy: the wiphy + * @chan: channel for which the supported radio index is required + * + * Return: radio index on success or -EINVAL otherwise + */ +int cfg80211_get_radio_idx_by_chan(struct wiphy *wiphy, + const struct ieee80211_channel *chan); -/* + +/** * cfg80211_stop_iface - trigger interface disconnection * * @wiphy: the wiphy @@ -8928,6 +9844,8 @@ static inline void wiphy_ext_feature_set(struct wiphy *wiphy, * * The extended features are flagged in multiple bytes (see * &struct wiphy.@ext_features) + * + * Return: %true if extended feature flag is set, %false otherwise */ static inline bool wiphy_ext_feature_isset(struct wiphy *wiphy, @@ -9049,6 +9967,8 @@ void cfg80211_pmsr_complete(struct wireless_dev *wdev, * Check whether the interface is allowed to operate; additionally, this API * can be used to check iftype against the software interfaces when * check_swif is '1'. + * + * Return: %true if allowed, %false otherwise */ bool cfg80211_iftype_allowed(struct wiphy *wiphy, enum nl80211_iftype iftype, bool is_4addr, u8 check_swif); @@ -9056,9 +9976,9 @@ bool cfg80211_iftype_allowed(struct wiphy *wiphy, enum nl80211_iftype iftype, /** * cfg80211_assoc_comeback - notification of association that was - * temporarly rejected with a comeback + * temporarily rejected with a comeback * @netdev: network device - * @ap_addr: AP (MLD) address that rejected the assocation + * @ap_addr: AP (MLD) address that rejected the association * @timeout: timeout interval value TUs. * * this function may sleep. the caller must hold the corresponding wdev's mutex. @@ -9141,73 +10061,109 @@ void cfg80211_bss_flush(struct wiphy *wiphy); * @cmd: the actual event we want to notify * @count: the number of TBTTs until the color change happens * @color_bitmap: representations of the colors that the local BSS is aware of + * @link_id: valid link_id in case of MLO or 0 for non-MLO. + * + * Return: 0 on success. Non-zero on error. */ int cfg80211_bss_color_notify(struct net_device *dev, enum nl80211_commands cmd, u8 count, - u64 color_bitmap); + u64 color_bitmap, u8 link_id); /** * cfg80211_obss_color_collision_notify - notify about bss color collision * @dev: network device * @color_bitmap: representations of the colors that the local BSS is aware of + * @link_id: valid link_id in case of MLO or 0 for non-MLO. + * + * Return: 0 on success. Non-zero on error. */ static inline int cfg80211_obss_color_collision_notify(struct net_device *dev, - u64 color_bitmap) + u64 color_bitmap, + u8 link_id) { return cfg80211_bss_color_notify(dev, NL80211_CMD_OBSS_COLOR_COLLISION, - 0, color_bitmap); + 0, color_bitmap, link_id); } /** * cfg80211_color_change_started_notify - notify color change start * @dev: the device on which the color is switched * @count: the number of TBTTs until the color change happens + * @link_id: valid link_id in case of MLO or 0 for non-MLO. * * Inform the userspace about the color change that has started. + * + * Return: 0 on success. Non-zero on error. */ static inline int cfg80211_color_change_started_notify(struct net_device *dev, - u8 count) + u8 count, u8 link_id) { return cfg80211_bss_color_notify(dev, NL80211_CMD_COLOR_CHANGE_STARTED, - count, 0); + count, 0, link_id); } /** * cfg80211_color_change_aborted_notify - notify color change abort * @dev: the device on which the color is switched + * @link_id: valid link_id in case of MLO or 0 for non-MLO. * * Inform the userspace about the color change that has aborted. + * + * Return: 0 on success. Non-zero on error. */ -static inline int cfg80211_color_change_aborted_notify(struct net_device *dev) +static inline int cfg80211_color_change_aborted_notify(struct net_device *dev, + u8 link_id) { return cfg80211_bss_color_notify(dev, NL80211_CMD_COLOR_CHANGE_ABORTED, - 0, 0); + 0, 0, link_id); } /** * cfg80211_color_change_notify - notify color change completion * @dev: the device on which the color was switched + * @link_id: valid link_id in case of MLO or 0 for non-MLO. * * Inform the userspace about the color change that has completed. + * + * Return: 0 on success. Non-zero on error. */ -static inline int cfg80211_color_change_notify(struct net_device *dev) +static inline int cfg80211_color_change_notify(struct net_device *dev, + u8 link_id) { return cfg80211_bss_color_notify(dev, NL80211_CMD_COLOR_CHANGE_COMPLETED, - 0, 0); + 0, 0, link_id); } /** - * cfg80211_valid_disable_subchannel_bitmap - validate puncturing bitmap - * @bitmap: bitmap to be validated - * @chandef: channel definition + * cfg80211_6ghz_power_type - determine AP regulatory power type + * @control: control flags + * @client_flags: &enum ieee80211_channel_flags for station mode to enable + * SP to LPI fallback, zero otherwise. * - * Validate the puncturing bitmap. - * - * Return: %true if the bitmap is valid. %false otherwise. + * Return: regulatory power type from &enum ieee80211_ap_reg_power */ -bool cfg80211_valid_disable_subchannel_bitmap(u16 *bitmap, - const struct cfg80211_chan_def *chandef); +static inline enum ieee80211_ap_reg_power +cfg80211_6ghz_power_type(u8 control, u32 client_flags) +{ + switch (u8_get_bits(control, IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO)) { + case IEEE80211_6GHZ_CTRL_REG_LPI_AP: + case IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP: + case IEEE80211_6GHZ_CTRL_REG_AP_ROLE_NOT_RELEVANT: + return IEEE80211_REG_LPI_AP; + case IEEE80211_6GHZ_CTRL_REG_SP_AP: + case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP_OLD: + return IEEE80211_REG_SP_AP; + case IEEE80211_6GHZ_CTRL_REG_VLP_AP: + return IEEE80211_REG_VLP_AP; + case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP: + if (client_flags & IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT) + return IEEE80211_REG_LPI_AP; + return IEEE80211_REG_SP_AP; + default: + return IEEE80211_REG_UNSET_AP; + } +} /** * cfg80211_links_removed - Notify about removed STA MLD setup links. @@ -9222,4 +10178,202 @@ bool cfg80211_valid_disable_subchannel_bitmap(u16 *bitmap, */ void cfg80211_links_removed(struct net_device *dev, u16 link_mask); +/** + * struct cfg80211_mlo_reconf_done_data - MLO reconfiguration data + * @buf: MLO Reconfiguration Response frame (header + body) + * @len: length of the frame data + * @driver_initiated: Indicates whether the add links request is initiated by + * driver. This is set to true when the link reconfiguration request + * initiated by driver due to AP link recommendation requests + * (Ex: BTM (BSS Transition Management) request) handling offloaded to + * driver. + * @added_links: BIT mask of links successfully added to the association + * @links: per-link information indexed by link ID + * @links.bss: the BSS that MLO reconfiguration was requested for, ownership of + * the pointer moves to cfg80211 in the call to + * cfg80211_mlo_reconf_add_done(). + * + * The BSS pointer must be set for each link for which 'add' operation was + * requested in the assoc_ml_reconf callback. + */ +struct cfg80211_mlo_reconf_done_data { + const u8 *buf; + size_t len; + bool driver_initiated; + u16 added_links; + struct { + struct cfg80211_bss *bss; + u8 *addr; + } links[IEEE80211_MLD_MAX_NUM_LINKS]; +}; + +/** + * cfg80211_mlo_reconf_add_done - Notify about MLO reconfiguration result + * @dev: network device. + * @data: MLO reconfiguration done data, &struct cfg80211_mlo_reconf_done_data + * + * Inform cfg80211 and the userspace that processing of ML reconfiguration + * request to add links to the association is done. + */ +void cfg80211_mlo_reconf_add_done(struct net_device *dev, + struct cfg80211_mlo_reconf_done_data *data); + +/** + * cfg80211_schedule_channels_check - schedule regulatory check if needed + * @wdev: the wireless device to check + * + * In case the device supports NO_IR or DFS relaxations, schedule regulatory + * channels check, as previous concurrent operation conditions may not + * hold anymore. + */ +void cfg80211_schedule_channels_check(struct wireless_dev *wdev); + +/** + * cfg80211_epcs_changed - Notify about a change in EPCS state + * @netdev: the wireless device whose EPCS state changed + * @enabled: set to true if EPCS was enabled, otherwise set to false. + */ +void cfg80211_epcs_changed(struct net_device *netdev, bool enabled); + +/** + * cfg80211_next_nan_dw_notif - Notify about the next NAN Discovery Window (DW) + * @wdev: Pointer to the wireless device structure + * @chan: DW channel (6, 44 or 149) + * @gfp: Memory allocation flags + */ +void cfg80211_next_nan_dw_notif(struct wireless_dev *wdev, + struct ieee80211_channel *chan, gfp_t gfp); + +/** + * cfg80211_nan_cluster_joined - Notify about NAN cluster join + * @wdev: Pointer to the wireless device structure + * @cluster_id: Cluster ID of the NAN cluster that was joined or started + * @new_cluster: Indicates if this is a new cluster or an existing one + * @gfp: Memory allocation flags + * + * This function is used to notify user space when a NAN cluster has been + * joined, providing the cluster ID and a flag whether it is a new cluster. + */ +void cfg80211_nan_cluster_joined(struct wireless_dev *wdev, + const u8 *cluster_id, bool new_cluster, + gfp_t gfp); + +#ifdef CONFIG_CFG80211_DEBUGFS +/** + * wiphy_locked_debugfs_read - do a locked read in debugfs + * @wiphy: the wiphy to use + * @file: the file being read + * @buf: the buffer to fill and then read from + * @bufsize: size of the buffer + * @userbuf: the user buffer to copy to + * @count: read count + * @ppos: read position + * @handler: the read handler to call (under wiphy lock) + * @data: additional data to pass to the read handler + * + * Return: the number of characters read, or a negative errno + */ +ssize_t wiphy_locked_debugfs_read(struct wiphy *wiphy, struct file *file, + char *buf, size_t bufsize, + char __user *userbuf, size_t count, + loff_t *ppos, + ssize_t (*handler)(struct wiphy *wiphy, + struct file *file, + char *buf, + size_t bufsize, + void *data), + void *data); + +/** + * wiphy_locked_debugfs_write - do a locked write in debugfs + * @wiphy: the wiphy to use + * @file: the file being written to + * @buf: the buffer to copy the user data to + * @bufsize: size of the buffer + * @userbuf: the user buffer to copy from + * @count: read count + * @handler: the write handler to call (under wiphy lock) + * @data: additional data to pass to the write handler + * + * Return: the number of characters written, or a negative errno + */ +ssize_t wiphy_locked_debugfs_write(struct wiphy *wiphy, struct file *file, + char *buf, size_t bufsize, + const char __user *userbuf, size_t count, + ssize_t (*handler)(struct wiphy *wiphy, + struct file *file, + char *buf, + size_t count, + void *data), + void *data); +#endif + +/** + * cfg80211_s1g_get_start_freq_khz - get S1G chandef start frequency + * @chandef: the chandef to use + * + * Return: the chandefs starting frequency in KHz + */ +static inline u32 +cfg80211_s1g_get_start_freq_khz(const struct cfg80211_chan_def *chandef) +{ + u32 bw_mhz = cfg80211_chandef_get_width(chandef); + u32 center_khz = + MHZ_TO_KHZ(chandef->center_freq1) + chandef->freq1_offset; + return center_khz - bw_mhz * 500 + 500; +} + +/** + * cfg80211_s1g_get_end_freq_khz - get S1G chandef end frequency + * @chandef: the chandef to use + * + * Return: the chandefs ending frequency in KHz + */ +static inline u32 +cfg80211_s1g_get_end_freq_khz(const struct cfg80211_chan_def *chandef) +{ + u32 bw_mhz = cfg80211_chandef_get_width(chandef); + u32 center_khz = + MHZ_TO_KHZ(chandef->center_freq1) + chandef->freq1_offset; + return center_khz + bw_mhz * 500 - 500; +} + +/** + * cfg80211_s1g_get_primary_sibling - retrieve the sibling 1MHz subchannel + * for an S1G chandef using a 2MHz primary channel. + * @wiphy: wiphy the channel belongs to + * @chandef: the chandef to use + * + * When chandef::s1g_primary_2mhz is set to true, we are operating on a 2MHz + * primary channel. The 1MHz subchannel designated by the primary channel + * location exists within chandef::chan, whilst the 'sibling' is denoted as + * being the other 1MHz subchannel that make up the 2MHz primary channel. + * + * Returns: the sibling 1MHz &struct ieee80211_channel, or %NULL on failure. + */ +static inline struct ieee80211_channel * +cfg80211_s1g_get_primary_sibling(struct wiphy *wiphy, + const struct cfg80211_chan_def *chandef) +{ + int width_mhz = cfg80211_chandef_get_width(chandef); + u32 pri_1mhz_khz, sibling_1mhz_khz, op_low_1mhz_khz, pri_index; + + if (!chandef->s1g_primary_2mhz || width_mhz < 2) + return NULL; + + pri_1mhz_khz = ieee80211_channel_to_khz(chandef->chan); + op_low_1mhz_khz = cfg80211_s1g_get_start_freq_khz(chandef); + + /* + * Compute the index of the primary 1 MHz subchannel within the + * operating channel, relative to the lowest 1 MHz center frequency. + * Flip the least significant bit to select the even/odd sibling, + * then translate that index back into a channel frequency. + */ + pri_index = (pri_1mhz_khz - op_low_1mhz_khz) / 1000; + sibling_1mhz_khz = op_low_1mhz_khz + ((pri_index ^ 1) * 1000); + + return ieee80211_get_channel_khz(wiphy, sibling_1mhz_khz); +} + #endif /* __NET_CFG80211_H */ diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index f79ce133e51a..76d2cd2e2b30 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -20,6 +20,7 @@ struct wpan_phy; struct wpan_phy_cca; struct cfg802154_scan_request; struct cfg802154_beacon_request; +struct ieee802154_addr; #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL struct ieee802154_llsec_device_key; @@ -77,6 +78,12 @@ struct cfg802154_ops { struct cfg802154_beacon_request *request); int (*stop_beacons)(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev); + int (*associate)(struct wpan_phy *wpan_phy, + struct wpan_dev *wpan_dev, + struct ieee802154_addr *coord); + int (*disassociate)(struct wpan_phy *wpan_phy, + struct wpan_dev *wpan_dev, + struct ieee802154_addr *target); #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL void (*get_llsec_table)(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, @@ -304,6 +311,22 @@ struct ieee802154_coord_desc { }; /** + * struct ieee802154_pan_device - PAN device information + * @pan_id: the PAN ID of this device + * @mode: the preferred mode to reach the device + * @short_addr: the short address of this device + * @extended_addr: the extended address of this device + * @node: the list node + */ +struct ieee802154_pan_device { + __le16 pan_id; + u8 mode; + __le16 short_addr; + __le64 extended_addr; + struct list_head node; +}; + +/** * struct cfg802154_scan_request - Scan request * * @type: type of scan to be performed @@ -378,6 +401,7 @@ struct ieee802154_llsec_key { struct ieee802154_llsec_key_entry { struct list_head list; + struct rcu_head rcu; struct ieee802154_llsec_key_id id; struct ieee802154_llsec_key *key; @@ -478,6 +502,13 @@ struct wpan_dev { /* fallback for acknowledgment bit setting */ bool ackreq; + + /* Associations */ + struct mutex association_lock; + struct ieee802154_pan_device *parent; + struct list_head children; + unsigned int max_associations; + unsigned int nchildren; }; #define to_phy(_dev) container_of(_dev, struct wpan_phy, dev) @@ -529,4 +560,46 @@ static inline const char *wpan_phy_name(struct wpan_phy *phy) void ieee802154_configure_durations(struct wpan_phy *phy, unsigned int page, unsigned int channel); +/** + * cfg802154_device_is_associated - Checks whether we are associated to any device + * @wpan_dev: the wpan device + * @return: true if we are associated + */ +bool cfg802154_device_is_associated(struct wpan_dev *wpan_dev); + +/** + * cfg802154_device_is_parent - Checks if a device is our coordinator + * @wpan_dev: the wpan device + * @target: the expected parent + * @return: true if @target is our coordinator + */ +bool cfg802154_device_is_parent(struct wpan_dev *wpan_dev, + struct ieee802154_addr *target); + +/** + * cfg802154_device_is_child - Checks whether a device is associated to us + * @wpan_dev: the wpan device + * @target: the expected child + * @return: the PAN device + */ +struct ieee802154_pan_device * +cfg802154_device_is_child(struct wpan_dev *wpan_dev, + struct ieee802154_addr *target); + +/** + * cfg802154_set_max_associations - Limit the number of future associations + * @wpan_dev: the wpan device + * @max: the maximum number of devices we accept to associate + * @return: the old maximum value + */ +unsigned int cfg802154_set_max_associations(struct wpan_dev *wpan_dev, + unsigned int max); + +/** + * cfg802154_get_free_short_addr - Get a free address among the known devices + * @wpan_dev: the wpan device + * @return: a random short address expectedly unused on our PAN + */ +__le16 cfg802154_get_free_short_addr(struct wpan_dev *wpan_dev); + #endif /* __NET_CFG802154_H */ diff --git a/include/net/checksum.h b/include/net/checksum.h index 1338cb92c8e7..3cbab35de5ab 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -99,12 +99,6 @@ csum_block_add(__wsum csum, __wsum csum2, int offset) } static __always_inline __wsum -csum_block_add_ext(__wsum csum, __wsum csum2, int offset, int len) -{ - return csum_block_add(csum, csum2, offset); -} - -static __always_inline __wsum csum_block_sub(__wsum csum, __wsum csum2, int offset) { return csum_block_add(csum, ~csum2, offset); @@ -115,12 +109,6 @@ static __always_inline __wsum csum_unfold(__sum16 n) return (__force __wsum)n; } -static __always_inline -__wsum csum_partial_ext(const void *buff, int len, __wsum sum) -{ - return csum_partial(buff, len, sum); -} - #define CSUM_MANGLED_0 ((__force __sum16)0xffff) static __always_inline void csum_replace_by_diff(__sum16 *sum, __wsum diff) @@ -151,6 +139,12 @@ static inline void csum_replace(__wsum *csum, __wsum old, __wsum new) *csum = csum_add(csum_sub(*csum, old), new); } +static inline unsigned short csum_from32to16(unsigned int sum) +{ + sum += (sum >> 16) | (sum << 16); + return (unsigned short)(sum >> 16); +} + struct sk_buff; void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, __be32 from, __be32 to, bool pseudohdr); @@ -158,7 +152,7 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, const __be32 *from, const __be32 *to, bool pseudohdr); void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb, - __wsum diff, bool pseudohdr); + __wsum diff, bool pseudohdr, bool ipv6); static __always_inline void inet_proto_csum_replace2(__sum16 *sum, struct sk_buff *skb, diff --git a/include/net/cipso_ipv4.h b/include/net/cipso_ipv4.h index 53dd7d988a2d..d6780d7903f4 100644 --- a/include/net/cipso_ipv4.h +++ b/include/net/cipso_ipv4.h @@ -28,7 +28,7 @@ #include <net/request_sock.h> #include <linux/atomic.h> #include <linux/refcount.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> /* known doi values */ #define CIPSO_V4_DOI_UNKNOWN 0x00000000 @@ -183,7 +183,8 @@ int cipso_v4_getattr(const unsigned char *cipso, struct netlbl_lsm_secattr *secattr); int cipso_v4_sock_setattr(struct sock *sk, const struct cipso_v4_doi *doi_def, - const struct netlbl_lsm_secattr *secattr); + const struct netlbl_lsm_secattr *secattr, + bool sk_locked); void cipso_v4_sock_delattr(struct sock *sk); int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr); int cipso_v4_req_setattr(struct request_sock *req, @@ -214,7 +215,8 @@ static inline int cipso_v4_getattr(const unsigned char *cipso, static inline int cipso_v4_sock_setattr(struct sock *sk, const struct cipso_v4_doi *doi_def, - const struct netlbl_lsm_secattr *secattr) + const struct netlbl_lsm_secattr *secattr, + bool sk_locked) { return -ENOSYS; } diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h index 7e78e7d6f015..668aeee9b3f6 100644 --- a/include/net/cls_cgroup.h +++ b/include/net/cls_cgroup.h @@ -63,7 +63,7 @@ static inline u32 task_get_classid(const struct sk_buff *skb) * calls by looking at the number of nested bh disable calls because * softirqs always disables bh. */ - if (in_serving_softirq()) { + if (softirq_count()) { struct sock *sk = skb_to_full_sk(skb); /* If there is an sock_cgroup_classid we'll use that. */ diff --git a/include/net/devlink.h b/include/net/devlink.h index 29fd1b4ee654..cb839e0435a1 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -35,7 +35,7 @@ struct devlink_port_phys_attrs { /** * struct devlink_port_pci_pf_attrs - devlink port's PCI PF attributes * @controller: Associated controller number - * @pf: Associated PCI PF number for this port. + * @pf: associated PCI function number for the devlink port instance * @external: when set, indicates if a port is for an external controller */ struct devlink_port_pci_pf_attrs { @@ -47,8 +47,9 @@ struct devlink_port_pci_pf_attrs { /** * struct devlink_port_pci_vf_attrs - devlink port's PCI VF attributes * @controller: Associated controller number - * @pf: Associated PCI PF number for this port. - * @vf: Associated PCI VF for of the PCI PF for this port. + * @pf: associated PCI function number for the devlink port instance + * @vf: associated PCI VF number of a PF for the devlink port instance; + * VF number starts from 0 for the first PCI virtual function * @external: when set, indicates if a port is for an external controller */ struct devlink_port_pci_vf_attrs { @@ -61,8 +62,8 @@ struct devlink_port_pci_vf_attrs { /** * struct devlink_port_pci_sf_attrs - devlink port's PCI SF attributes * @controller: Associated controller number - * @sf: Associated PCI SF for of the PCI PF for this port. - * @pf: Associated PCI PF number for this port. + * @sf: associated SF number of a PF for the devlink port instance + * @pf: associated PCI function number for the devlink port instance * @external: when set, indicates if a port is for an external controller */ struct devlink_port_pci_sf_attrs { @@ -77,6 +78,9 @@ struct devlink_port_pci_sf_attrs { * @flavour: flavour of the port * @split: indicates if this is split port * @splittable: indicates if the port can be split. + * @no_phys_port_name: skip automatic phys_port_name generation; for + * compatibility only, newly added driver/port instance + * should never set this. * @lanes: maximum number of lanes the port supports. 0 value is not passed to netlink. * @switch_id: if the port is part of switch, this is buffer with ID, otherwise this is NULL * @phys: physical port attributes @@ -86,7 +90,8 @@ struct devlink_port_pci_sf_attrs { */ struct devlink_port_attrs { u8 split:1, - splittable:1; + splittable:1, + no_phys_port_name:1; u32 lanes; enum devlink_port_flavour flavour; struct netdev_phys_item_id switch_id; @@ -117,6 +122,8 @@ struct devlink_rate { u32 tx_priority; u32 tx_weight; + + u32 tc_bw[DEVLINK_RATE_TCS_MAX]; }; struct devlink_port { @@ -150,6 +157,7 @@ struct devlink_port { struct devlink_rate *devlink_rate; struct devlink_linecard *linecard; + u32 rel_index; }; struct devlink_port_new_attrs { @@ -351,7 +359,7 @@ struct devlink_dpipe_table { bool resource_valid; u64 resource_id; u64 resource_units; - struct devlink_dpipe_table_ops *table_ops; + const struct devlink_dpipe_table_ops *table_ops; struct rcu_head rcu; }; @@ -418,17 +426,19 @@ typedef u64 devlink_resource_occ_get_t(void *priv); #define __DEVLINK_PARAM_MAX_STRING_VALUE 32 enum devlink_param_type { - DEVLINK_PARAM_TYPE_U8, - DEVLINK_PARAM_TYPE_U16, - DEVLINK_PARAM_TYPE_U32, - DEVLINK_PARAM_TYPE_STRING, - DEVLINK_PARAM_TYPE_BOOL, + DEVLINK_PARAM_TYPE_U8 = DEVLINK_VAR_ATTR_TYPE_U8, + DEVLINK_PARAM_TYPE_U16 = DEVLINK_VAR_ATTR_TYPE_U16, + DEVLINK_PARAM_TYPE_U32 = DEVLINK_VAR_ATTR_TYPE_U32, + DEVLINK_PARAM_TYPE_U64 = DEVLINK_VAR_ATTR_TYPE_U64, + DEVLINK_PARAM_TYPE_STRING = DEVLINK_VAR_ATTR_TYPE_STRING, + DEVLINK_PARAM_TYPE_BOOL = DEVLINK_VAR_ATTR_TYPE_FLAG, }; union devlink_param_value { u8 vu8; u16 vu16; u32 vu32; + u64 vu64; char vstr[__DEVLINK_PARAM_MAX_STRING_VALUE]; bool vbool; }; @@ -469,6 +479,10 @@ struct devlink_flash_notify { * @set: set parameter value, used for runtime and permanent * configuration modes * @validate: validate input value is applicable (within value range, etc.) + * @get_default: get parameter default value, used for runtime and permanent + * configuration modes + * @reset_default: reset parameter to default value, used for runtime and permanent + * configuration modes * * This struct should be used by the driver to fill the data for * a parameter it registers. @@ -480,12 +494,20 @@ struct devlink_param { enum devlink_param_type type; unsigned long supported_cmodes; int (*get)(struct devlink *devlink, u32 id, - struct devlink_param_gset_ctx *ctx); + struct devlink_param_gset_ctx *ctx, + struct netlink_ext_ack *extack); int (*set)(struct devlink *devlink, u32 id, - struct devlink_param_gset_ctx *ctx); + struct devlink_param_gset_ctx *ctx, + struct netlink_ext_ack *extack); int (*validate)(struct devlink *devlink, u32 id, union devlink_param_value val, struct netlink_ext_ack *extack); + int (*get_default)(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx, + struct netlink_ext_ack *extack); + int (*reset_default)(struct devlink *devlink, u32 id, + enum devlink_param_cmode cmode, + struct netlink_ext_ack *extack); }; struct devlink_param_item { @@ -497,6 +519,7 @@ struct devlink_param_item { * until reload. */ bool driverinit_value_new_valid; + union devlink_param_value driverinit_default; }; enum devlink_param_generic_id { @@ -517,6 +540,11 @@ enum devlink_param_generic_id { DEVLINK_PARAM_GENERIC_ID_ENABLE_IWARP, DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE, DEVLINK_PARAM_GENERIC_ID_EVENT_EQ_SIZE, + DEVLINK_PARAM_GENERIC_ID_ENABLE_PHC, + DEVLINK_PARAM_GENERIC_ID_CLOCK_ID, + DEVLINK_PARAM_GENERIC_ID_TOTAL_VFS, + DEVLINK_PARAM_GENERIC_ID_NUM_DOORBELLS, + DEVLINK_PARAM_GENERIC_ID_MAX_MAC_PER_VF, /* add new param generic ids above here*/ __DEVLINK_PARAM_GENERIC_ID_MAX, @@ -575,6 +603,21 @@ enum devlink_param_generic_id { #define DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_NAME "event_eq_size" #define DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_TYPE DEVLINK_PARAM_TYPE_U32 +#define DEVLINK_PARAM_GENERIC_ENABLE_PHC_NAME "enable_phc" +#define DEVLINK_PARAM_GENERIC_ENABLE_PHC_TYPE DEVLINK_PARAM_TYPE_BOOL + +#define DEVLINK_PARAM_GENERIC_CLOCK_ID_NAME "clock_id" +#define DEVLINK_PARAM_GENERIC_CLOCK_ID_TYPE DEVLINK_PARAM_TYPE_U64 + +#define DEVLINK_PARAM_GENERIC_TOTAL_VFS_NAME "total_vfs" +#define DEVLINK_PARAM_GENERIC_TOTAL_VFS_TYPE DEVLINK_PARAM_TYPE_U32 + +#define DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_NAME "num_doorbells" +#define DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_TYPE DEVLINK_PARAM_TYPE_U32 + +#define DEVLINK_PARAM_GENERIC_MAX_MAC_PER_VF_NAME "max_mac_per_vf" +#define DEVLINK_PARAM_GENERIC_MAX_MAC_PER_VF_TYPE DEVLINK_PARAM_TYPE_U32 + #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \ { \ .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ @@ -598,12 +641,45 @@ enum devlink_param_generic_id { .validate = _validate, \ } -/* Part number, identifier of board design */ +#define DEVLINK_PARAM_GENERIC_WITH_DEFAULTS(_id, _cmodes, _get, _set, \ + _validate, _get_default, \ + _reset_default) \ +{ \ + .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ + .name = DEVLINK_PARAM_GENERIC_##_id##_NAME, \ + .type = DEVLINK_PARAM_GENERIC_##_id##_TYPE, \ + .generic = true, \ + .supported_cmodes = _cmodes, \ + .get = _get, \ + .set = _set, \ + .validate = _validate, \ + .get_default = _get_default, \ + .reset_default = _reset_default, \ +} + +#define DEVLINK_PARAM_DRIVER_WITH_DEFAULTS(_id, _name, _type, _cmodes, \ + _get, _set, _validate, \ + _get_default, _reset_default) \ +{ \ + .id = _id, \ + .name = _name, \ + .type = _type, \ + .supported_cmodes = _cmodes, \ + .get = _get, \ + .set = _set, \ + .validate = _validate, \ + .get_default = _get_default, \ + .reset_default = _reset_default, \ +} + +/* Identifier of board design */ #define DEVLINK_INFO_VERSION_GENERIC_BOARD_ID "board.id" /* Revision of board design */ #define DEVLINK_INFO_VERSION_GENERIC_BOARD_REV "board.rev" /* Maker of the board */ #define DEVLINK_INFO_VERSION_GENERIC_BOARD_MANUFACTURE "board.manufacture" +/* Part number of the board and its components */ +#define DEVLINK_INFO_VERSION_GENERIC_BOARD_PART_NUMBER "board.part_number" /* Part number, identifier of asic design */ #define DEVLINK_INFO_VERSION_GENERIC_ASIC_ID "asic.id" @@ -725,6 +801,10 @@ enum devlink_health_reporter_state { * if priv_ctx is NULL, run a full dump * @diagnose: callback to diagnose the current status * @test: callback to trigger a test event + * @default_graceful_period: default min time (in msec) + * between recovery attempts + * @default_burst_period: default time (in msec) for + * error recoveries before starting the grace period */ struct devlink_health_reporter_ops { @@ -739,6 +819,8 @@ struct devlink_health_reporter_ops { struct netlink_ext_ack *extack); int (*test)(struct devlink_health_reporter *reporter, struct netlink_ext_ack *extack); + u64 default_graceful_period; + u64 default_burst_period; }; /** @@ -1257,6 +1339,18 @@ enum devlink_trap_group_generic_id { .min_burst = _min_burst, \ } +#define devlink_fmsg_put(fmsg, name, value) ( \ + _Generic((value), \ + bool : devlink_fmsg_bool_pair_put, \ + u8 : devlink_fmsg_u8_pair_put, \ + u16 : devlink_fmsg_u32_pair_put, \ + u32 : devlink_fmsg_u32_pair_put, \ + u64 : devlink_fmsg_u64_pair_put, \ + int : devlink_fmsg_u32_pair_put, \ + char * : devlink_fmsg_string_pair_put, \ + const char * : devlink_fmsg_string_pair_put) \ + (fmsg, name, (value))) + enum { /* device supports reload operations */ DEVLINK_F_RELOAD = 1UL << 0, @@ -1465,6 +1559,9 @@ struct devlink_ops { u32 tx_priority, struct netlink_ext_ack *extack); int (*rate_leaf_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv, u32 tx_weight, struct netlink_ext_ack *extack); + int (*rate_leaf_tc_bw_set)(struct devlink_rate *devlink_rate, + void *priv, u32 *tc_bw, + struct netlink_ext_ack *extack); int (*rate_node_tx_share_set)(struct devlink_rate *devlink_rate, void *priv, u64 tx_share, struct netlink_ext_ack *extack); int (*rate_node_tx_max_set)(struct devlink_rate *devlink_rate, void *priv, @@ -1473,6 +1570,9 @@ struct devlink_ops { u32 tx_priority, struct netlink_ext_ack *extack); int (*rate_node_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv, u32 tx_weight, struct netlink_ext_ack *extack); + int (*rate_node_tc_bw_set)(struct devlink_rate *devlink_rate, + void *priv, u32 *tc_bw, + struct netlink_ext_ack *extack); int (*rate_node_new)(struct devlink_rate *rate_node, void **priv, struct netlink_ext_ack *extack); int (*rate_node_del)(struct devlink_rate *rate_node, void *priv, @@ -1518,6 +1618,7 @@ int devl_trylock(struct devlink *devlink); void devl_unlock(struct devlink *devlink); void devl_assert_locked(struct devlink *devlink); bool devl_lock_is_held(struct devlink *devlink); +DEFINE_GUARD(devl, struct devlink *, devl_lock(_T), devl_unlock(_T)); struct ib_device; @@ -1601,6 +1702,14 @@ void devlink_free(struct devlink *devlink); * capability. Should be used by device drivers to * enable/disable ipsec_packet capability of a * function managed by the devlink port. + * @port_fn_max_io_eqs_get: Callback used to get port function's maximum number + * of event queues. Should be used by device drivers to + * report the maximum event queues of a function + * managed by the devlink port. + * @port_fn_max_io_eqs_set: Callback used to set port function's maximum number + * of event queues. Should be used by device drivers to + * configure maximum number of event queues + * of a function managed by the devlink port. * * Note: Driver should return -EOPNOTSUPP if it doesn't support * port function (@port_fn_*) handling for a particular port. @@ -1650,6 +1759,12 @@ struct devlink_port_ops { int (*port_fn_ipsec_packet_set)(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack); + int (*port_fn_max_io_eqs_get)(struct devlink_port *devlink_port, + u32 *max_eqs, + struct netlink_ext_ack *extack); + int (*port_fn_max_io_eqs_set)(struct devlink_port *devlink_port, + u32 max_eqs, + struct netlink_ext_ack *extack); }; void devlink_port_init(struct devlink *devlink, @@ -1689,7 +1804,7 @@ void devlink_port_type_ib_set(struct devlink_port *devlink_port, struct ib_device *ibdev); void devlink_port_type_clear(struct devlink_port *devlink_port); void devlink_port_attrs_set(struct devlink_port *devlink_port, - struct devlink_port_attrs *devlink_port_attrs); + const struct devlink_port_attrs *attrs); void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, bool external); void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller, @@ -1697,6 +1812,8 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 contro void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, u32 sf, bool external); +int devl_port_fn_devlink_set(struct devlink_port *devlink_port, + struct devlink *fn_devlink); struct devlink_rate * devl_rate_node_create(struct devlink *devlink, void *priv, char *node_name, struct devlink_rate *parent); @@ -1717,8 +1834,8 @@ void devlink_linecard_provision_clear(struct devlink_linecard *linecard); void devlink_linecard_provision_fail(struct devlink_linecard *linecard); void devlink_linecard_activate(struct devlink_linecard *linecard); void devlink_linecard_deactivate(struct devlink_linecard *linecard); -void devlink_linecard_nested_dl_set(struct devlink_linecard *linecard, - struct devlink *nested_devlink); +int devlink_linecard_nested_dl_set(struct devlink_linecard *linecard, + struct devlink *nested_devlink); int devl_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, u16 egress_pools_count, u16 ingress_tc_count, @@ -1731,7 +1848,7 @@ void devl_sb_unregister(struct devlink *devlink, unsigned int sb_index); void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index); int devl_dpipe_table_register(struct devlink *devlink, const char *table_name, - struct devlink_dpipe_table_ops *table_ops, + const struct devlink_dpipe_table_ops *table_ops, void *priv, bool counter_control_extern); void devl_dpipe_table_unregister(struct devlink *devlink, const char *table_name); @@ -1759,12 +1876,6 @@ int devl_resource_register(struct devlink *devlink, u64 resource_id, u64 parent_resource_id, const struct devlink_resource_size_params *size_params); -int devlink_resource_register(struct devlink *devlink, - const char *resource_name, - u64 resource_size, - u64 resource_id, - u64 parent_resource_id, - const struct devlink_resource_size_params *size_params); void devl_resources_unregister(struct devlink *devlink); void devlink_resources_unregister(struct devlink *devlink); int devl_resource_size_get(struct devlink *devlink, @@ -1777,15 +1888,8 @@ void devl_resource_occ_get_register(struct devlink *devlink, u64 resource_id, devlink_resource_occ_get_t *occ_get, void *occ_get_priv); -void devlink_resource_occ_get_register(struct devlink *devlink, - u64 resource_id, - devlink_resource_occ_get_t *occ_get, - void *occ_get_priv); void devl_resource_occ_get_unregister(struct devlink *devlink, u64 resource_id); - -void devlink_resource_occ_get_unregister(struct devlink *devlink, - u64 resource_id); int devl_params_register(struct devlink *devlink, const struct devlink_param *params, size_t params_count); @@ -1851,56 +1955,56 @@ int devlink_info_version_running_put_ext(struct devlink_info_req *req, const char *version_value, enum devlink_info_version_type version_type); -int devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg); -int devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg); - -int devlink_fmsg_pair_nest_start(struct devlink_fmsg *fmsg, const char *name); -int devlink_fmsg_pair_nest_end(struct devlink_fmsg *fmsg); - -int devlink_fmsg_arr_pair_nest_start(struct devlink_fmsg *fmsg, - const char *name); -int devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg); -int devlink_fmsg_binary_pair_nest_start(struct devlink_fmsg *fmsg, - const char *name); -int devlink_fmsg_binary_pair_nest_end(struct devlink_fmsg *fmsg); - -int devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value); -int devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value); -int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value, - u16 value_len); - -int devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name, - bool value); -int devlink_fmsg_u8_pair_put(struct devlink_fmsg *fmsg, const char *name, - u8 value); -int devlink_fmsg_u32_pair_put(struct devlink_fmsg *fmsg, const char *name, - u32 value); -int devlink_fmsg_u64_pair_put(struct devlink_fmsg *fmsg, const char *name, - u64 value); -int devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name, - const char *value); -int devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name, - const void *value, u32 value_len); +void devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg); +void devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg); + +void devlink_fmsg_pair_nest_start(struct devlink_fmsg *fmsg, const char *name); +void devlink_fmsg_pair_nest_end(struct devlink_fmsg *fmsg); + +void devlink_fmsg_arr_pair_nest_start(struct devlink_fmsg *fmsg, + const char *name); +void devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg); +void devlink_fmsg_binary_pair_nest_start(struct devlink_fmsg *fmsg, + const char *name); +void devlink_fmsg_binary_pair_nest_end(struct devlink_fmsg *fmsg); + +void devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value); +void devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value); +void devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value, + u16 value_len); + +void devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name, + bool value); +void devlink_fmsg_u8_pair_put(struct devlink_fmsg *fmsg, const char *name, + u8 value); +void devlink_fmsg_u32_pair_put(struct devlink_fmsg *fmsg, const char *name, + u32 value); +void devlink_fmsg_u64_pair_put(struct devlink_fmsg *fmsg, const char *name, + u64 value); +void devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name, + const char *value); +void devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name, + const void *value, u32 value_len); struct devlink_health_reporter * devl_port_health_reporter_create(struct devlink_port *port, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv); + void *priv); struct devlink_health_reporter * devlink_port_health_reporter_create(struct devlink_port *port, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv); + void *priv); struct devlink_health_reporter * devl_health_reporter_create(struct devlink *devlink, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv); + void *priv); struct devlink_health_reporter * devlink_health_reporter_create(struct devlink *devlink, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv); + void *priv); void devl_health_reporter_destroy(struct devlink_health_reporter *reporter); @@ -1918,6 +2022,8 @@ devlink_health_reporter_state_update(struct devlink_health_reporter *reporter, void devlink_health_reporter_recovery_done(struct devlink_health_reporter *reporter); +int devl_nested_devlink_set(struct devlink *devlink, + struct devlink *nested_devlink); bool devlink_is_reload_failed(const struct devlink *devlink); void devlink_remote_reload_actions_performed(struct devlink *devlink, enum devlink_reload_limit limit, @@ -1985,6 +2091,7 @@ int devlink_compat_switch_id_get(struct net_device *dev, int devlink_nl_port_handle_fill(struct sk_buff *msg, struct devlink_port *devlink_port); size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port); +void devlink_fmsg_dump_skb(struct devlink_fmsg *fmsg, const struct sk_buff *skb); #else diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index a587e83fc169..58d91ccc56e0 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -6,9 +6,13 @@ #define DEFINE_DROP_REASON(FN, FNe) \ FN(NOT_SPECIFIED) \ FN(NO_SOCKET) \ + FN(SOCKET_CLOSE) \ + FN(SOCKET_FILTER) \ + FN(SOCKET_RCVBUFF) \ + FN(UNIX_DISCONNECT) \ + FN(UNIX_SKIP_OOB) \ FN(PKT_TOO_SMALL) \ FN(TCP_CSUM) \ - FN(SOCKET_FILTER) \ FN(UDP_CSUM) \ FN(NETFILTER_DROP) \ FN(OTHERHOST) \ @@ -18,20 +22,31 @@ FN(UNICAST_IN_L2_MULTICAST) \ FN(XFRM_POLICY) \ FN(IP_NOPROTO) \ - FN(SOCKET_RCVBUFF) \ FN(PROTO_MEM) \ + FN(TCP_AUTH_HDR) \ FN(TCP_MD5NOTFOUND) \ FN(TCP_MD5UNEXPECTED) \ FN(TCP_MD5FAILURE) \ + FN(TCP_AONOTFOUND) \ + FN(TCP_AOUNEXPECTED) \ + FN(TCP_AOKEYNOTFOUND) \ + FN(TCP_AOFAILURE) \ FN(SOCKET_BACKLOG) \ FN(TCP_FLAGS) \ + FN(TCP_ABORT_ON_DATA) \ FN(TCP_ZEROWINDOW) \ FN(TCP_OLD_DATA) \ FN(TCP_OVERWINDOW) \ FN(TCP_OFOMERGE) \ FN(TCP_RFC7323_PAWS) \ + FN(TCP_RFC7323_PAWS_ACK) \ + FN(TCP_RFC7323_TW_PAWS) \ + FN(TCP_RFC7323_TSECR) \ + FN(TCP_LISTEN_OVERFLOW) \ FN(TCP_OLD_SEQUENCE) \ FN(TCP_INVALID_SEQUENCE) \ + FN(TCP_INVALID_END_SEQUENCE) \ + FN(TCP_INVALID_ACK_SEQUENCE) \ FN(TCP_RESET) \ FN(TCP_INVALID_SYN) \ FN(TCP_CLOSE) \ @@ -48,8 +63,16 @@ FN(NEIGH_FAILED) \ FN(NEIGH_QUEUEFULL) \ FN(NEIGH_DEAD) \ + FN(NEIGH_HH_FILLFAIL) \ FN(TC_EGRESS) \ + FN(SECURITY_HOOK) \ FN(QDISC_DROP) \ + FN(QDISC_OVERLIMIT) \ + FN(QDISC_CONGESTED) \ + FN(CAKE_FLOOD) \ + FN(FQ_BAND_LIMIT) \ + FN(FQ_HORIZON_LIMIT) \ + FN(FQ_FLOW_LIMIT) \ FN(CPU_BACKLOG) \ FN(XDP) \ FN(TC_INGRESS) \ @@ -68,6 +91,10 @@ FN(INVALID_PROTO) \ FN(IP_INADDRERRORS) \ FN(IP_INNOROUTES) \ + FN(IP_LOCAL_SOURCE) \ + FN(IP_INVALID_SOURCE) \ + FN(IP_LOCALNET) \ + FN(IP_INVALID_DEST) \ FN(PKT_TOO_BIG) \ FN(DUP_FRAG) \ FN(FRAG_REASM_TIMEOUT) \ @@ -80,6 +107,28 @@ FN(IPV6_NDISC_BAD_OPTIONS) \ FN(IPV6_NDISC_NS_OTHERHOST) \ FN(QUEUE_PURGE) \ + FN(TC_COOKIE_ERROR) \ + FN(PACKET_SOCK_ERROR) \ + FN(TC_CHAIN_NOTFOUND) \ + FN(TC_RECLASSIFY_LOOP) \ + FN(VXLAN_INVALID_HDR) \ + FN(VXLAN_VNI_NOT_FOUND) \ + FN(MAC_INVALID_SOURCE) \ + FN(VXLAN_ENTRY_EXISTS) \ + FN(NO_TX_TARGET) \ + FN(IP_TUNNEL_ECN) \ + FN(TUNNEL_TXINFO) \ + FN(LOCAL_MAC) \ + FN(ARP_PVLAN_DISABLE) \ + FN(MAC_IEEE_MAC_CONTROL) \ + FN(BRIDGE_INGRESS_STP_STATE) \ + FN(CAN_RX_INVALID_FRAME) \ + FN(CANFD_RX_INVALID_FRAME) \ + FN(CANXL_RX_INVALID_FRAME) \ + FN(PFMEMALLOC) \ + FN(DUALPI2_STEP_DROP) \ + FN(PSP_INPUT) \ + FN(PSP_OUTPUT) \ FNe(MAX) /** @@ -96,14 +145,35 @@ enum skb_drop_reason { SKB_CONSUMED, /** @SKB_DROP_REASON_NOT_SPECIFIED: drop reason is not specified */ SKB_DROP_REASON_NOT_SPECIFIED, - /** @SKB_DROP_REASON_NO_SOCKET: socket not found */ + /** + * @SKB_DROP_REASON_NO_SOCKET: no valid socket that can be used. + * Reason could be one of three cases: + * 1) no established/listening socket found during lookup process + * 2) no valid request socket during 3WHS process + * 3) no valid child socket during 3WHS process + */ SKB_DROP_REASON_NO_SOCKET, + /** @SKB_DROP_REASON_SOCKET_CLOSE: socket is close()d */ + SKB_DROP_REASON_SOCKET_CLOSE, + /** @SKB_DROP_REASON_SOCKET_FILTER: dropped by socket filter */ + SKB_DROP_REASON_SOCKET_FILTER, + /** @SKB_DROP_REASON_SOCKET_RCVBUFF: socket receive buff is full */ + SKB_DROP_REASON_SOCKET_RCVBUFF, + /** + * @SKB_DROP_REASON_UNIX_DISCONNECT: recv queue is purged when SOCK_DGRAM + * or SOCK_SEQPACKET socket re-connect()s to another socket or notices + * during send() that the peer has been close()d. + */ + SKB_DROP_REASON_UNIX_DISCONNECT, + /** + * @SKB_DROP_REASON_UNIX_SKIP_OOB: Out-Of-Band data is skipped by + * recv() without MSG_OOB so dropped. + */ + SKB_DROP_REASON_UNIX_SKIP_OOB, /** @SKB_DROP_REASON_PKT_TOO_SMALL: packet size is too small */ SKB_DROP_REASON_PKT_TOO_SMALL, /** @SKB_DROP_REASON_TCP_CSUM: TCP checksum error */ SKB_DROP_REASON_TCP_CSUM, - /** @SKB_DROP_REASON_SOCKET_FILTER: dropped by socket filter */ - SKB_DROP_REASON_SOCKET_FILTER, /** @SKB_DROP_REASON_UDP_CSUM: UDP checksum error */ SKB_DROP_REASON_UDP_CSUM, /** @SKB_DROP_REASON_NETFILTER_DROP: dropped by netfilter */ @@ -134,14 +204,17 @@ enum skb_drop_reason { SKB_DROP_REASON_XFRM_POLICY, /** @SKB_DROP_REASON_IP_NOPROTO: no support for IP protocol */ SKB_DROP_REASON_IP_NOPROTO, - /** @SKB_DROP_REASON_SOCKET_RCVBUFF: socket receive buff is full */ - SKB_DROP_REASON_SOCKET_RCVBUFF, /** - * @SKB_DROP_REASON_PROTO_MEM: proto memory limition, such as udp packet - * drop out of udp_memory_allocated. + * @SKB_DROP_REASON_PROTO_MEM: proto memory limitation, such as + * udp packet drop out of udp_memory_allocated. */ SKB_DROP_REASON_PROTO_MEM, /** + * @SKB_DROP_REASON_TCP_AUTH_HDR: TCP-MD5 or TCP-AO hashes are met + * twice or set incorrectly. + */ + SKB_DROP_REASON_TCP_AUTH_HDR, + /** * @SKB_DROP_REASON_TCP_MD5NOTFOUND: no MD5 hash and one expected, * corresponding to LINUX_MIB_TCPMD5NOTFOUND */ @@ -157,6 +230,26 @@ enum skb_drop_reason { */ SKB_DROP_REASON_TCP_MD5FAILURE, /** + * @SKB_DROP_REASON_TCP_AONOTFOUND: no TCP-AO hash and one was expected, + * corresponding to LINUX_MIB_TCPAOREQUIRED + */ + SKB_DROP_REASON_TCP_AONOTFOUND, + /** + * @SKB_DROP_REASON_TCP_AOUNEXPECTED: TCP-AO hash is present and it + * was not expected, corresponding to LINUX_MIB_TCPAOKEYNOTFOUND + */ + SKB_DROP_REASON_TCP_AOUNEXPECTED, + /** + * @SKB_DROP_REASON_TCP_AOKEYNOTFOUND: TCP-AO key is unknown, + * corresponding to LINUX_MIB_TCPAOKEYNOTFOUND + */ + SKB_DROP_REASON_TCP_AOKEYNOTFOUND, + /** + * @SKB_DROP_REASON_TCP_AOFAILURE: TCP-AO hash is wrong, + * corresponding to LINUX_MIB_TCPAOBAD + */ + SKB_DROP_REASON_TCP_AOFAILURE, + /** * @SKB_DROP_REASON_SOCKET_BACKLOG: failed to add skb to socket backlog ( * see LINUX_MIB_TCPBACKLOGDROP) */ @@ -164,12 +257,17 @@ enum skb_drop_reason { /** @SKB_DROP_REASON_TCP_FLAGS: TCP flags invalid */ SKB_DROP_REASON_TCP_FLAGS, /** + * @SKB_DROP_REASON_TCP_ABORT_ON_DATA: abort on data, corresponding to + * LINUX_MIB_TCPABORTONDATA + */ + SKB_DROP_REASON_TCP_ABORT_ON_DATA, + /** * @SKB_DROP_REASON_TCP_ZEROWINDOW: TCP receive window size is zero, * see LINUX_MIB_TCPZEROWINDOWDROP */ SKB_DROP_REASON_TCP_ZEROWINDOW, /** - * @SKB_DROP_REASON_TCP_OLD_DATA: the TCP data reveived is already + * @SKB_DROP_REASON_TCP_OLD_DATA: the TCP data received is already * received before (spurious retrans may happened), see * LINUX_MIB_DELAYEDACKLOST */ @@ -187,13 +285,43 @@ enum skb_drop_reason { SKB_DROP_REASON_TCP_OFOMERGE, /** * @SKB_DROP_REASON_TCP_RFC7323_PAWS: PAWS check, corresponding to - * LINUX_MIB_PAWSESTABREJECTED + * LINUX_MIB_PAWSESTABREJECTED, LINUX_MIB_PAWSACTIVEREJECTED */ SKB_DROP_REASON_TCP_RFC7323_PAWS, + /** + * @SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK: PAWS check, old ACK packet. + * Corresponds to LINUX_MIB_PAWS_OLD_ACK. + */ + SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK, + /** + * @SKB_DROP_REASON_TCP_RFC7323_TW_PAWS: PAWS check, socket is in + * TIME_WAIT state. + * Corresponds to LINUX_MIB_PAWS_TW_REJECTED. + */ + SKB_DROP_REASON_TCP_RFC7323_TW_PAWS, + /** + * @SKB_DROP_REASON_TCP_RFC7323_TSECR: PAWS check, invalid TSEcr. + * Corresponds to LINUX_MIB_TSECRREJECTED. + */ + SKB_DROP_REASON_TCP_RFC7323_TSECR, + /** @SKB_DROP_REASON_TCP_LISTEN_OVERFLOW: listener queue full. */ + SKB_DROP_REASON_TCP_LISTEN_OVERFLOW, /** @SKB_DROP_REASON_TCP_OLD_SEQUENCE: Old SEQ field (duplicate packet) */ SKB_DROP_REASON_TCP_OLD_SEQUENCE, - /** @SKB_DROP_REASON_TCP_INVALID_SEQUENCE: Not acceptable SEQ field */ + /** @SKB_DROP_REASON_TCP_INVALID_SEQUENCE: Not acceptable SEQ field. */ SKB_DROP_REASON_TCP_INVALID_SEQUENCE, + /** + * @SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE: + * Not acceptable END_SEQ field. + * Corresponds to LINUX_MIB_BEYOND_WINDOW. + */ + SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE, + /** + * @SKB_DROP_REASON_TCP_INVALID_ACK_SEQUENCE: Not acceptable ACK SEQ + * field because ack sequence is not in the window between snd_una + * and snd_nxt + */ + SKB_DROP_REASON_TCP_INVALID_ACK_SEQUENCE, /** @SKB_DROP_REASON_TCP_RESET: Invalid RST packet */ SKB_DROP_REASON_TCP_RESET, /** @@ -235,14 +363,48 @@ enum skb_drop_reason { SKB_DROP_REASON_NEIGH_QUEUEFULL, /** @SKB_DROP_REASON_NEIGH_DEAD: neigh entry is dead */ SKB_DROP_REASON_NEIGH_DEAD, + /** @SKB_DROP_REASON_NEIGH_HH_FILLFAIL: failed to fill the device hard header */ + SKB_DROP_REASON_NEIGH_HH_FILLFAIL, /** @SKB_DROP_REASON_TC_EGRESS: dropped in TC egress HOOK */ SKB_DROP_REASON_TC_EGRESS, + /** @SKB_DROP_REASON_SECURITY_HOOK: dropped due to security HOOK */ + SKB_DROP_REASON_SECURITY_HOOK, /** * @SKB_DROP_REASON_QDISC_DROP: dropped by qdisc when packet outputting ( * failed to enqueue to current qdisc) */ SKB_DROP_REASON_QDISC_DROP, /** + * @SKB_DROP_REASON_QDISC_OVERLIMIT: dropped by qdisc when a qdisc + * instance exceeds its total buffer size limit. + */ + SKB_DROP_REASON_QDISC_OVERLIMIT, + /** + * @SKB_DROP_REASON_QDISC_CONGESTED: dropped by a qdisc AQM algorithm + * due to congestion. + */ + SKB_DROP_REASON_QDISC_CONGESTED, + /** + * @SKB_DROP_REASON_CAKE_FLOOD: dropped by the flood protection part of + * CAKE qdisc AQM algorithm (BLUE). + */ + SKB_DROP_REASON_CAKE_FLOOD, + /** + * @SKB_DROP_REASON_FQ_BAND_LIMIT: dropped by fq qdisc when per band + * limit is reached. + */ + SKB_DROP_REASON_FQ_BAND_LIMIT, + /** + * @SKB_DROP_REASON_FQ_HORIZON_LIMIT: dropped by fq qdisc when packet + * timestamp is too far in the future. + */ + SKB_DROP_REASON_FQ_HORIZON_LIMIT, + /** + * @SKB_DROP_REASON_FQ_FLOW_LIMIT: dropped by fq qdisc when a flow + * exceeds its limits. + */ + SKB_DROP_REASON_FQ_FLOW_LIMIT, + /** * @SKB_DROP_REASON_CPU_BACKLOG: failed to enqueue the skb to the per CPU * backlog queue. This can be caused by backlog queue full (see * netdev_max_backlog in net.rst) or RPS flow limit @@ -309,6 +471,21 @@ enum skb_drop_reason { * IPSTATS_MIB_INADDRERRORS */ SKB_DROP_REASON_IP_INNOROUTES, + /** @SKB_DROP_REASON_IP_LOCAL_SOURCE: the source ip is local */ + SKB_DROP_REASON_IP_LOCAL_SOURCE, + /** + * @SKB_DROP_REASON_IP_INVALID_SOURCE: the source ip is invalid: + * 1) source ip is multicast or limited broadcast + * 2) source ip is zero and not IGMP + */ + SKB_DROP_REASON_IP_INVALID_SOURCE, + /** @SKB_DROP_REASON_IP_LOCALNET: source or dest ip is local net */ + SKB_DROP_REASON_IP_LOCALNET, + /** + * @SKB_DROP_REASON_IP_INVALID_DEST: the dest ip is invalid: + * 1) dest ip is 0 + */ + SKB_DROP_REASON_IP_INVALID_DEST, /** * @SKB_DROP_REASON_PKT_TOO_BIG: packet size is too big (maybe exceed the * MTU) @@ -346,6 +523,100 @@ enum skb_drop_reason { /** @SKB_DROP_REASON_QUEUE_PURGE: bulk free. */ SKB_DROP_REASON_QUEUE_PURGE, /** + * @SKB_DROP_REASON_TC_COOKIE_ERROR: An error occurred whilst + * processing a tc ext cookie. + */ + SKB_DROP_REASON_TC_COOKIE_ERROR, + /** + * @SKB_DROP_REASON_PACKET_SOCK_ERROR: generic packet socket errors + * after its filter matches an incoming packet. + */ + SKB_DROP_REASON_PACKET_SOCK_ERROR, + /** @SKB_DROP_REASON_TC_CHAIN_NOTFOUND: tc chain lookup failed. */ + SKB_DROP_REASON_TC_CHAIN_NOTFOUND, + /** + * @SKB_DROP_REASON_TC_RECLASSIFY_LOOP: tc exceeded max reclassify loop + * iterations. + */ + SKB_DROP_REASON_TC_RECLASSIFY_LOOP, + /** + * @SKB_DROP_REASON_VXLAN_INVALID_HDR: VXLAN header is invalid. E.g.: + * 1) reserved fields are not zero + * 2) "I" flag is not set + */ + SKB_DROP_REASON_VXLAN_INVALID_HDR, + /** @SKB_DROP_REASON_VXLAN_VNI_NOT_FOUND: no VXLAN device found for VNI */ + SKB_DROP_REASON_VXLAN_VNI_NOT_FOUND, + /** @SKB_DROP_REASON_MAC_INVALID_SOURCE: source mac is invalid */ + SKB_DROP_REASON_MAC_INVALID_SOURCE, + /** + * @SKB_DROP_REASON_VXLAN_ENTRY_EXISTS: trying to migrate a static + * entry or an entry pointing to a nexthop. + */ + SKB_DROP_REASON_VXLAN_ENTRY_EXISTS, + /** @SKB_DROP_REASON_NO_TX_TARGET: no target found for xmit */ + SKB_DROP_REASON_NO_TX_TARGET, + /** + * @SKB_DROP_REASON_IP_TUNNEL_ECN: skb is dropped according to + * RFC 6040 4.2, see __INET_ECN_decapsulate() for detail. + */ + SKB_DROP_REASON_IP_TUNNEL_ECN, + /** + * @SKB_DROP_REASON_TUNNEL_TXINFO: packet without necessary metadata + * reached a device which is in "external" mode. + */ + SKB_DROP_REASON_TUNNEL_TXINFO, + /** + * @SKB_DROP_REASON_LOCAL_MAC: the source MAC address is equal to + * the MAC address of the local netdev. + */ + SKB_DROP_REASON_LOCAL_MAC, + /** + * @SKB_DROP_REASON_ARP_PVLAN_DISABLE: packet which is not IP is + * forwarded to the in_dev, and the proxy_arp_pvlan is not + * enabled. + */ + SKB_DROP_REASON_ARP_PVLAN_DISABLE, + /** + * @SKB_DROP_REASON_MAC_IEEE_MAC_CONTROL: the destination MAC address + * is an IEEE MAC Control address. + */ + SKB_DROP_REASON_MAC_IEEE_MAC_CONTROL, + /** + * @SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE: the STP state of the + * ingress bridge port does not allow frames to be forwarded. + */ + SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE, + /** + * @SKB_DROP_REASON_CAN_RX_INVALID_FRAME: received + * non conform CAN frame (or device is unable to receive CAN frames) + */ + SKB_DROP_REASON_CAN_RX_INVALID_FRAME, + /** + * @SKB_DROP_REASON_CANFD_RX_INVALID_FRAME: received + * non conform CAN-FD frame (or device is unable to receive CAN frames) + */ + SKB_DROP_REASON_CANFD_RX_INVALID_FRAME, + /** + * @SKB_DROP_REASON_CANXL_RX_INVALID_FRAME: received + * non conform CAN-XL frame (or device is unable to receive CAN frames) + */ + SKB_DROP_REASON_CANXL_RX_INVALID_FRAME, + /** + * @SKB_DROP_REASON_PFMEMALLOC: packet allocated from memory reserve + * reached a path or socket not eligible for use of memory reserves + */ + SKB_DROP_REASON_PFMEMALLOC, + /** + * @SKB_DROP_REASON_DUALPI2_STEP_DROP: dropped by the step drop + * threshold of DualPI2 qdisc. + */ + SKB_DROP_REASON_DUALPI2_STEP_DROP, + /** @SKB_DROP_REASON_PSP_INPUT: PSP input checks failed */ + SKB_DROP_REASON_PSP_INPUT, + /** @SKB_DROP_REASON_PSP_OUTPUT: PSP output checks failed */ + SKB_DROP_REASON_PSP_OUTPUT, + /** * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which * shouldn't be used as a real 'reason' - only for tracing code gen */ diff --git a/include/net/dropreason.h b/include/net/dropreason.h index 56cb7be92244..7d3b1a2a6fec 100644 --- a/include/net/dropreason.h +++ b/include/net/dropreason.h @@ -18,12 +18,6 @@ enum skb_drop_reason_subsys { SKB_DROP_REASON_SUBSYS_MAC80211_UNUSABLE, /** - * @SKB_DROP_REASON_SUBSYS_MAC80211_MONITOR: mac80211 drop reasons - * for frames still going to monitor, see net/mac80211/drop.h - */ - SKB_DROP_REASON_SUBSYS_MAC80211_MONITOR, - - /** * @SKB_DROP_REASON_SUBSYS_OPENVSWITCH: openvswitch drop reasons, * see net/openvswitch/drop.h */ diff --git a/include/net/dsa.h b/include/net/dsa.h index 0b9c6aa27047..cced1a866757 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -24,9 +24,6 @@ struct dsa_8021q_context; struct tc_action; -struct phy_device; -struct fixed_phy_status; -struct phylink_link_state; #define DSA_TAG_PROTO_NONE_VALUE 0 #define DSA_TAG_PROTO_BRCM_VALUE 1 @@ -56,11 +53,16 @@ struct phylink_link_state; #define DSA_TAG_PROTO_RTL8_4T_VALUE 25 #define DSA_TAG_PROTO_RZN1_A5PSW_VALUE 26 #define DSA_TAG_PROTO_LAN937X_VALUE 27 +#define DSA_TAG_PROTO_VSC73XX_8021Q_VALUE 28 +#define DSA_TAG_PROTO_BRCM_LEGACY_FCS_VALUE 29 +#define DSA_TAG_PROTO_YT921X_VALUE 30 +#define DSA_TAG_PROTO_MXL_GSW1XX_VALUE 31 enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE, DSA_TAG_PROTO_BRCM = DSA_TAG_PROTO_BRCM_VALUE, DSA_TAG_PROTO_BRCM_LEGACY = DSA_TAG_PROTO_BRCM_LEGACY_VALUE, + DSA_TAG_PROTO_BRCM_LEGACY_FCS = DSA_TAG_PROTO_BRCM_LEGACY_FCS_VALUE, DSA_TAG_PROTO_BRCM_PREPEND = DSA_TAG_PROTO_BRCM_PREPEND_VALUE, DSA_TAG_PROTO_DSA = DSA_TAG_PROTO_DSA_VALUE, DSA_TAG_PROTO_EDSA = DSA_TAG_PROTO_EDSA_VALUE, @@ -86,6 +88,9 @@ enum dsa_tag_protocol { DSA_TAG_PROTO_RTL8_4T = DSA_TAG_PROTO_RTL8_4T_VALUE, DSA_TAG_PROTO_RZN1_A5PSW = DSA_TAG_PROTO_RZN1_A5PSW_VALUE, DSA_TAG_PROTO_LAN937X = DSA_TAG_PROTO_LAN937X_VALUE, + DSA_TAG_PROTO_VSC73XX_8021Q = DSA_TAG_PROTO_VSC73XX_8021Q_VALUE, + DSA_TAG_PROTO_YT921X = DSA_TAG_PROTO_YT921X_VALUE, + DSA_TAG_PROTO_MXL_GSW1XX = DSA_TAG_PROTO_MXL_GSW1XX_VALUE, }; struct dsa_switch; @@ -102,11 +107,11 @@ struct dsa_device_ops { const char *name; enum dsa_tag_protocol proto; /* Some tagging protocols either mangle or shift the destination MAC - * address, in which case the DSA master would drop packets on ingress + * address, in which case the DSA conduit would drop packets on ingress * if what it understands out of the destination MAC address is not in * its RX filter. */ - bool promisc_on_master; + bool promisc_on_conduit; }; struct dsa_lag { @@ -236,12 +241,12 @@ struct dsa_bridge { }; struct dsa_port { - /* A CPU port is physically connected to a master device. - * A user port exposed to userspace has a slave device. + /* A CPU port is physically connected to a conduit device. A user port + * exposes a network device to user-space, called 'user' here. */ union { - struct net_device *master; - struct net_device *slave; + struct net_device *conduit; + struct net_device *user; }; /* Copy of the tagging protocol operations, for quicker access @@ -249,7 +254,7 @@ struct dsa_port { */ const struct dsa_device_ops *tag_ops; - /* Copies for faster access in master receive hot path */ + /* Copies for faster access in conduit receive hot path */ struct dsa_switch_tree *dst; struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev); @@ -281,9 +286,9 @@ struct dsa_port { u8 lag_tx_enabled:1; - /* Master state bits, valid only on CPU ports */ - u8 master_admin_up:1; - u8 master_oper_up:1; + /* conduit state bits, valid only on CPU ports */ + u8 conduit_admin_up:1; + u8 conduit_oper_up:1; /* Valid only on user ports */ u8 cpu_port_in_lag:1; @@ -303,7 +308,7 @@ struct dsa_port { struct list_head list; /* - * Original copy of the master netdev ethtool_ops + * Original copy of the conduit netdev ethtool_ops */ const struct ethtool_ops *orig_ethtool_ops; @@ -327,6 +332,12 @@ struct dsa_port { }; }; +static inline struct dsa_port * +dsa_phylink_to_port(struct phylink_config *config) +{ + return container_of(config, struct dsa_port, pl_config); +} + /* TODO: ideally DSA ports would have a single dp->link_dp member, * and no dst->rtable nor this struct dsa_link would be needed, * but this would require some more complex tree walking, @@ -398,14 +409,18 @@ struct dsa_switch { */ u32 configure_vlan_while_not_filtering:1; - /* If the switch driver always programs the CPU port as egress tagged - * despite the VLAN configuration indicating otherwise, then setting - * @untag_bridge_pvid will force the DSA receive path to pop the - * bridge's default_pvid VLAN tagged frames to offer a consistent - * behavior between a vlan_filtering=0 and vlan_filtering=1 bridge - * device. + /* Pop the default_pvid of VLAN-unaware bridge ports from tagged frames. + * DEPRECATED: Do NOT set this field in new drivers. Instead look at + * the dsa_software_vlan_untag() comments. */ u32 untag_bridge_pvid:1; + /* Pop the default_pvid of VLAN-aware bridge ports from tagged frames. + * Useful if the switch cannot preserve the VLAN tag as seen on the + * wire for user port ingress, and chooses to send all frames as + * VLAN-tagged to the CPU, including those which were originally + * untagged. + */ + u32 untag_vlan_aware_bridge_pvid:1; /* Let DSA manage the FDB entries towards the * CPU, based on the software bridge database. @@ -430,6 +445,11 @@ struct dsa_switch { */ u32 fdb_isolation:1; + /* Drivers that have global DSCP mapping settings must set this to + * true to automatically apply the settings to all ports. + */ + u32 dscp_prio_mapping_is_global:1; + /* Listener for switch fabric events */ struct notifier_block nb; @@ -452,10 +472,15 @@ struct dsa_switch { const struct dsa_switch_ops *ops; /* - * Slave mii_bus and devices for the individual ports. + * Allow a DSA switch driver to override the phylink MAC ops + */ + const struct phylink_mac_ops *phylink_mac_ops; + + /* + * User mii_bus and devices for the individual ports. */ u32 phys_mii_mask; - struct mii_bus *slave_mii_bus; + struct mii_bus *user_mii_bus; /* Ageing Time limits in msecs */ unsigned int ageing_time_min; @@ -520,10 +545,10 @@ static inline bool dsa_port_is_unused(struct dsa_port *dp) return dp->type == DSA_PORT_TYPE_UNUSED; } -static inline bool dsa_port_master_is_operational(struct dsa_port *dp) +static inline bool dsa_port_conduit_is_operational(struct dsa_port *dp) { - return dsa_port_is_cpu(dp) && dp->master_admin_up && - dp->master_oper_up; + return dsa_port_is_cpu(dp) && dp->conduit_admin_up && + dp->conduit_oper_up; } static inline bool dsa_is_unused_port(struct dsa_switch *ds, int p) @@ -578,6 +603,10 @@ static inline bool dsa_is_user_port(struct dsa_switch *ds, int p) dsa_switch_for_each_port((_dp), (_ds)) \ if (dsa_port_is_user((_dp))) +#define dsa_switch_for_each_user_port_continue_reverse(_dp, _ds) \ + dsa_switch_for_each_port_continue_reverse((_dp), (_ds)) \ + if (dsa_port_is_user((_dp))) + #define dsa_switch_for_each_cpu_port(_dp, _ds) \ dsa_switch_for_each_port((_dp), (_ds)) \ if (dsa_port_is_cpu((_dp))) @@ -713,12 +742,12 @@ static inline bool dsa_port_offloads_lag(struct dsa_port *dp, return dsa_port_lag_dev_get(dp) == lag->dev; } -static inline struct net_device *dsa_port_to_master(const struct dsa_port *dp) +static inline struct net_device *dsa_port_to_conduit(const struct dsa_port *dp) { if (dp->cpu_port_in_lag) return dsa_port_lag_dev_get(dp->cpu_dp); - return dp->cpu_dp->master; + return dp->cpu_dp->conduit; } static inline @@ -732,7 +761,7 @@ struct net_device *dsa_port_to_bridge_port(const struct dsa_port *dp) else if (dp->hsr_dev) return dp->hsr_dev; - return dp->slave; + return dp->user; } static inline struct net_device * @@ -834,9 +863,9 @@ struct dsa_switch_ops { int (*connect_tag_protocol)(struct dsa_switch *ds, enum dsa_tag_protocol proto); - int (*port_change_master)(struct dsa_switch *ds, int port, - struct net_device *master, - struct netlink_ext_ack *extack); + int (*port_change_conduit)(struct dsa_switch *ds, int port, + struct net_device *conduit, + struct netlink_ext_ack *extack); /* Optional switch-wide initialization and destruction methods */ int (*setup)(struct dsa_switch *ds); @@ -858,39 +887,10 @@ struct dsa_switch_ops { int regnum, u16 val); /* - * Link state adjustment (called from libphy) - */ - void (*adjust_link)(struct dsa_switch *ds, int port, - struct phy_device *phydev); - void (*fixed_link_update)(struct dsa_switch *ds, int port, - struct fixed_phy_status *st); - - /* * PHYLINK integration */ void (*phylink_get_caps)(struct dsa_switch *ds, int port, struct phylink_config *config); - struct phylink_pcs *(*phylink_mac_select_pcs)(struct dsa_switch *ds, - int port, - phy_interface_t iface); - int (*phylink_mac_prepare)(struct dsa_switch *ds, int port, - unsigned int mode, - phy_interface_t interface); - void (*phylink_mac_config)(struct dsa_switch *ds, int port, - unsigned int mode, - const struct phylink_link_state *state); - int (*phylink_mac_finish)(struct dsa_switch *ds, int port, - unsigned int mode, - phy_interface_t interface); - void (*phylink_mac_link_down)(struct dsa_switch *ds, int port, - unsigned int mode, - phy_interface_t interface); - void (*phylink_mac_link_up)(struct dsa_switch *ds, int port, - unsigned int mode, - phy_interface_t interface, - struct phy_device *phydev, - int speed, int duplex, - bool tx_pause, bool rx_pause); void (*phylink_fixed_state)(struct dsa_switch *ds, int port, struct phylink_link_state *state); /* @@ -912,6 +912,8 @@ struct dsa_switch_ops { void (*get_rmon_stats)(struct dsa_switch *ds, int port, struct ethtool_rmon_stats *rmon_stats, const struct ethtool_rmon_hist_range **ranges); + void (*get_ts_stats)(struct dsa_switch *ds, int port, + struct ethtool_ts_stats *ts_stats); void (*get_stats64)(struct dsa_switch *ds, int port, struct rtnl_link_stats64 *s); void (*get_pause_stats)(struct dsa_switch *ds, int port, @@ -931,7 +933,7 @@ struct dsa_switch_ops { * ethtool timestamp info */ int (*get_ts_info)(struct dsa_switch *ds, int port, - struct ethtool_ts_info *ts); + struct kernel_ethtool_ts_info *ts); /* * ethtool MAC merge layer @@ -955,6 +957,10 @@ struct dsa_switch_ops { u8 prio); int (*port_del_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp, u8 prio); + int (*port_set_apptrust)(struct dsa_switch *ds, int port, + const u8 *sel, int nsel); + int (*port_get_apptrust)(struct dsa_switch *ds, int port, u8 *sel, + int *nsel); /* * Suspend and resume @@ -969,6 +975,16 @@ struct dsa_switch_ops { struct phy_device *phy); void (*port_disable)(struct dsa_switch *ds, int port); + + /* + * Notification for MAC address changes on user ports. Drivers can + * currently only veto operations. They should not use the method to + * program the hardware, since the operation is not rolled back in case + * of other errors. + */ + int (*port_set_mac_address)(struct dsa_switch *ds, int port, + const unsigned char *addr); + /* * Compatibility between device trees defining multiple CPU ports and * drivers which are not OK to use by default the numerically smallest @@ -980,10 +996,9 @@ struct dsa_switch_ops { /* * Port's MAC EEE settings */ + bool (*support_eee)(struct dsa_switch *ds, int port); int (*set_mac_eee)(struct dsa_switch *ds, int port, - struct ethtool_eee *e); - int (*get_mac_eee)(struct dsa_switch *ds, int port, - struct ethtool_eee *e); + struct ethtool_keee *e); /* EEPROM access */ int (*get_eeprom_len)(struct dsa_switch *ds); @@ -1122,9 +1137,10 @@ struct dsa_switch_ops { * PTP functionality */ int (*port_hwtstamp_get)(struct dsa_switch *ds, int port, - struct ifreq *ifr); + struct kernel_hwtstamp_config *config); int (*port_hwtstamp_set)(struct dsa_switch *ds, int port, - struct ifreq *ifr); + struct kernel_hwtstamp_config *config, + struct netlink_ext_ack *extack); void (*port_txtstamp)(struct dsa_switch *ds, int port, struct sk_buff *skb); bool (*port_rxtstamp)(struct dsa_switch *ds, int port, @@ -1198,7 +1214,8 @@ struct dsa_switch_ops { * HSR integration */ int (*port_hsr_join)(struct dsa_switch *ds, int port, - struct net_device *hsr); + struct net_device *hsr, + struct netlink_ext_ack *extack); int (*port_hsr_leave)(struct dsa_switch *ds, int port, struct net_device *hsr); @@ -1222,11 +1239,11 @@ struct dsa_switch_ops { int (*tag_8021q_vlan_del)(struct dsa_switch *ds, int port, u16 vid); /* - * DSA master tracking operations + * DSA conduit tracking operations */ - void (*master_state_change)(struct dsa_switch *ds, - const struct net_device *master, - bool operational); + void (*conduit_state_change)(struct dsa_switch *ds, + const struct net_device *conduit, + bool operational); }; #define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes) \ @@ -1234,9 +1251,11 @@ struct dsa_switch_ops { dsa_devlink_param_get, dsa_devlink_param_set, NULL) int dsa_devlink_param_get(struct devlink *dl, u32 id, - struct devlink_param_gset_ctx *ctx); + struct devlink_param_gset_ctx *ctx, + struct netlink_ext_ack *extack); int dsa_devlink_param_set(struct devlink *dl, u32 id, - struct devlink_param_gset_ctx *ctx); + struct devlink_param_gset_ctx *ctx, + struct netlink_ext_ack *extack); int dsa_devlink_params_register(struct dsa_switch *ds, const struct devlink_param *params, size_t params_count); @@ -1296,11 +1315,6 @@ static inline int dsa_devlink_port_to_port(struct devlink_port *port) return port->index; } -struct dsa_switch_driver { - struct list_head list; - const struct dsa_switch_ops *ops; -}; - bool dsa_fdb_present_in_other_db(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid, struct dsa_db db); @@ -1308,6 +1322,15 @@ bool dsa_mdb_present_in_other_db(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb, struct dsa_db db); +int dsa_port_simple_hsr_validate(struct dsa_switch *ds, int port, + struct net_device *hsr, + struct netlink_ext_ack *extack); +int dsa_port_simple_hsr_join(struct dsa_switch *ds, int port, + struct net_device *hsr, + struct netlink_ext_ack *extack); +int dsa_port_simple_hsr_leave(struct dsa_switch *ds, int port, + struct net_device *hsr); + /* Keep inline for faster access in hot path */ static inline bool netdev_uses_dsa(const struct net_device *dev) { @@ -1363,9 +1386,9 @@ static inline int dsa_switch_resume(struct dsa_switch *ds) #endif /* CONFIG_PM_SLEEP */ #if IS_ENABLED(CONFIG_NET_DSA) -bool dsa_slave_dev_check(const struct net_device *dev); +bool dsa_user_dev_check(const struct net_device *dev); #else -static inline bool dsa_slave_dev_check(const struct net_device *dev) +static inline bool dsa_user_dev_check(const struct net_device *dev) { return false; } @@ -1373,5 +1396,6 @@ static inline bool dsa_slave_dev_check(const struct net_device *dev) netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev); void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up); +bool dsa_supports_eee(struct dsa_switch *ds, int port); #endif diff --git a/include/net/dsa_stubs.h b/include/net/dsa_stubs.h index 361811750a54..6f384897f287 100644 --- a/include/net/dsa_stubs.h +++ b/include/net/dsa_stubs.h @@ -13,14 +13,14 @@ extern const struct dsa_stubs *dsa_stubs; struct dsa_stubs { - int (*master_hwtstamp_validate)(struct net_device *dev, - const struct kernel_hwtstamp_config *config, - struct netlink_ext_ack *extack); + int (*conduit_hwtstamp_validate)(struct net_device *dev, + const struct kernel_hwtstamp_config *config, + struct netlink_ext_ack *extack); }; -static inline int dsa_master_hwtstamp_validate(struct net_device *dev, - const struct kernel_hwtstamp_config *config, - struct netlink_ext_ack *extack) +static inline int dsa_conduit_hwtstamp_validate(struct net_device *dev, + const struct kernel_hwtstamp_config *config, + struct netlink_ext_ack *extack) { if (!netdev_uses_dsa(dev)) return 0; @@ -29,18 +29,18 @@ static inline int dsa_master_hwtstamp_validate(struct net_device *dev, * netdev_uses_dsa() returns true, the dsa_core module is still * registered, and so, dsa_unregister_stubs() couldn't have run. * For netdev_uses_dsa() to start returning false, it would imply that - * dsa_master_teardown() has executed, which requires rtnl_lock(). + * dsa_conduit_teardown() has executed, which requires rtnl_lock(). */ ASSERT_RTNL(); - return dsa_stubs->master_hwtstamp_validate(dev, config, extack); + return dsa_stubs->conduit_hwtstamp_validate(dev, config, extack); } #else -static inline int dsa_master_hwtstamp_validate(struct net_device *dev, - const struct kernel_hwtstamp_config *config, - struct netlink_ext_ack *extack) +static inline int dsa_conduit_hwtstamp_validate(struct net_device *dev, + const struct kernel_hwtstamp_config *config, + struct netlink_ext_ack *extack) { return 0; } diff --git a/include/net/dscp.h b/include/net/dscp.h new file mode 100644 index 000000000000..ba40540868c9 --- /dev/null +++ b/include/net/dscp.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2024 Pengutronix, Oleksij Rempel <kernel@pengutronix.de> */ + +#ifndef __DSCP_H__ +#define __DSCP_H__ + +/* + * DSCP Pools and Codepoint Space Division: + * + * The Differentiated Services (Diffserv) architecture defines a method for + * classifying and managing network traffic using the DS field in IPv4 and IPv6 + * packet headers. This field can carry one of 64 distinct DSCP (Differentiated + * Services Code Point) values, which are divided into three pools based on + * their Least Significant Bits (LSB) patterns and intended usage. Each pool has + * a specific registration procedure for assigning DSCP values: + * + * Pool 1 (Standards Action Pool): + * - Codepoint Space: xxxxx0 + * This pool includes DSCP values ending in '0' (binary), allocated via + * Standards Action. It is intended for globally recognized traffic classes, + * ensuring interoperability across the internet. This pool encompasses + * well-known DSCP values such as CS0-CS7, AFxx, EF, and VOICE-ADMIT. + * + * Pool 2 (Experimental/Local Use Pool): + * - Codepoint Space: xxxx11 + * Reserved for DSCP values ending in '11' (binary), this pool is designated + * for Experimental or Local Use. It allows for private or temporary traffic + * marking schemes not intended for standardized global use, facilitating + * testing and network-specific configurations without impacting + * interoperability. + * + * Pool 3 (Preferential Standardization Pool): + * - Codepoint Space: xxxx01 + * Initially reserved for experimental or local use, this pool now serves as + * a secondary standardization resource should Pool 1 become exhausted. DSCP + * values ending in '01' (binary) are assigned via Standards Action, with a + * focus on adopting new, standardized traffic classes as the need arises. + * + * For pool updates see: + * https://www.iana.org/assignments/dscp-registry/dscp-registry.xhtml + */ + +/* Pool 1: Standardized DSCP values as per [RFC8126] */ +#define DSCP_CS0 0 /* 000000, [RFC2474] */ +/* CS0 is some times called default (DF) */ +#define DSCP_DF 0 /* 000000, [RFC2474] */ +#define DSCP_CS1 8 /* 001000, [RFC2474] */ +#define DSCP_CS2 16 /* 010000, [RFC2474] */ +#define DSCP_CS3 24 /* 011000, [RFC2474] */ +#define DSCP_CS4 32 /* 100000, [RFC2474] */ +#define DSCP_CS5 40 /* 101000, [RFC2474] */ +#define DSCP_CS6 48 /* 110000, [RFC2474] */ +#define DSCP_CS7 56 /* 111000, [RFC2474] */ +#define DSCP_AF11 10 /* 001010, [RFC2597] */ +#define DSCP_AF12 12 /* 001100, [RFC2597] */ +#define DSCP_AF13 14 /* 001110, [RFC2597] */ +#define DSCP_AF21 18 /* 010010, [RFC2597] */ +#define DSCP_AF22 20 /* 010100, [RFC2597] */ +#define DSCP_AF23 22 /* 010110, [RFC2597] */ +#define DSCP_AF31 26 /* 011010, [RFC2597] */ +#define DSCP_AF32 28 /* 011100, [RFC2597] */ +#define DSCP_AF33 30 /* 011110, [RFC2597] */ +#define DSCP_AF41 34 /* 100010, [RFC2597] */ +#define DSCP_AF42 36 /* 100100, [RFC2597] */ +#define DSCP_AF43 38 /* 100110, [RFC2597] */ +#define DSCP_EF 46 /* 101110, [RFC3246] */ +#define DSCP_VOICE_ADMIT 44 /* 101100, [RFC5865] */ + +/* Pool 3: Standardized assignments, previously available for experimental/local + * use + */ +#define DSCP_LE 1 /* 000001, [RFC8622] */ + +#define DSCP_MAX 64 + +#endif /* __DSCP_H__ */ diff --git a/include/net/dst.h b/include/net/dst.h index 78884429deed..f8aa1239b4db 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -24,7 +24,10 @@ struct sk_buff; struct dst_entry { - struct net_device *dev; + union { + struct net_device *dev; + struct net_device __rcu *dev_rcu; + }; struct dst_ops *ops; unsigned long _metrics; unsigned long expires; @@ -222,13 +225,6 @@ static inline unsigned long dst_metric_rtt(const struct dst_entry *dst, int metr return msecs_to_jiffies(dst_metric(dst, metric)); } -static inline u32 -dst_allfrag(const struct dst_entry *dst) -{ - int ret = dst_feature(dst, RTAX_FEATURE_ALLFRAG); - return ret; -} - static inline int dst_metric_locked(const struct dst_entry *dst, int metric) { @@ -247,9 +243,9 @@ static inline void dst_hold(struct dst_entry *dst) static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) { - if (unlikely(time != dst->lastuse)) { + if (unlikely(time != READ_ONCE(dst->lastuse))) { dst->__use++; - dst->lastuse = time; + WRITE_ONCE(dst->lastuse, time); } } @@ -314,7 +310,7 @@ static inline bool dst_hold_safe(struct dst_entry *dst) * @skb: buffer * * If dst is not yet refcounted and not destroyed, grab a ref on it. - * Returns true if dst is refcounted. + * Returns: true if dst is refcounted. */ static inline bool skb_dst_force(struct sk_buff *skb) { @@ -348,7 +344,7 @@ static inline void __skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev, skb->dev = dev; /* - * Clear hash so that we can recalulate the hash for the + * Clear hash so that we can recalculate the hash for the * encapsulated packet, unless we have already determine the hash * over the L4 4-tuple. */ @@ -392,12 +388,11 @@ static inline int dst_discard(struct sk_buff *skb) { return dst_discard_out(&init_net, skb->sk, skb); } -void *dst_alloc(struct dst_ops *ops, struct net_device *dev, int initial_ref, +void *dst_alloc(struct dst_ops *ops, struct net_device *dev, int initial_obsolete, unsigned short flags); void dst_init(struct dst_entry *dst, struct dst_ops *ops, - struct net_device *dev, int initial_ref, int initial_obsolete, + struct net_device *dev, int initial_obsolete, unsigned short flags); -struct dst_entry *dst_destroy(struct dst_entry *dst); void dst_dev_put(struct dst_entry *dst); static inline void dst_confirm(struct dst_entry *dst) @@ -439,13 +434,24 @@ static inline void dst_link_failure(struct sk_buff *skb) static inline void dst_set_expires(struct dst_entry *dst, int timeout) { - unsigned long expires = jiffies + timeout; + unsigned long old, expires = jiffies + timeout; if (expires == 0) expires = 1; - if (dst->expires == 0 || time_before(expires, dst->expires)) - dst->expires = expires; + old = READ_ONCE(dst->expires); + + if (!old || time_before(expires, old)) + WRITE_ONCE(dst->expires, expires); +} + +static inline unsigned int dst_dev_overhead(struct dst_entry *dst, + struct sk_buff *skb) +{ + if (likely(dst)) + return LL_RESERVED_SPACE(dst->dev); + + return skb->mac_len; } INDIRECT_CALLABLE_DECLARE(int ip6_output(struct net *, struct sock *, @@ -455,7 +461,7 @@ INDIRECT_CALLABLE_DECLARE(int ip_output(struct net *, struct sock *, /* Output packet to network from transport. */ static inline int dst_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - return INDIRECT_CALL_INET(skb_dst(skb)->output, + return INDIRECT_CALL_INET(READ_ONCE(skb_dst(skb)->output), ip6_output, ip_output, net, sk, skb); } @@ -465,7 +471,7 @@ INDIRECT_CALLABLE_DECLARE(int ip_local_deliver(struct sk_buff *)); /* Input packet from network to transport. */ static inline int dst_input(struct sk_buff *skb) { - return INDIRECT_CALL_INET(skb_dst(skb)->input, + return INDIRECT_CALL_INET(READ_ONCE(skb_dst(skb)->input), ip6_input, ip_local_deliver, skb); } @@ -475,7 +481,7 @@ INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, u32)); static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie) { - if (dst->obsolete) + if (READ_ONCE(dst->obsolete)) dst = INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, dst, cookie); return dst; @@ -560,6 +566,41 @@ static inline void skb_dst_update_pmtu_no_confirm(struct sk_buff *skb, u32 mtu) dst->ops->update_pmtu(dst, NULL, skb, mtu, false); } +static inline struct net_device *dst_dev(const struct dst_entry *dst) +{ + return READ_ONCE(dst->dev); +} + +static inline struct net_device *dst_dev_rcu(const struct dst_entry *dst) +{ + return rcu_dereference(dst->dev_rcu); +} + +static inline struct net *dst_dev_net_rcu(const struct dst_entry *dst) +{ + return dev_net_rcu(dst_dev_rcu(dst)); +} + +static inline struct net_device *skb_dst_dev(const struct sk_buff *skb) +{ + return dst_dev(skb_dst(skb)); +} + +static inline struct net_device *skb_dst_dev_rcu(const struct sk_buff *skb) +{ + return dst_dev_rcu(skb_dst(skb)); +} + +static inline struct net *skb_dst_dev_net(const struct sk_buff *skb) +{ + return dev_net(skb_dst_dev(skb)); +} + +static inline struct net *skb_dst_dev_net_rcu(const struct sk_buff *skb) +{ + return dev_net_rcu(skb_dst_dev_rcu(skb)); +} + struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie); void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh); diff --git a/include/net/dst_cache.h b/include/net/dst_cache.h index df6622a5fe98..1961699598e2 100644 --- a/include/net/dst_cache.h +++ b/include/net/dst_cache.h @@ -76,7 +76,7 @@ struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache, */ static inline void dst_cache_reset(struct dst_cache *dst_cache) { - dst_cache->reset_ts = jiffies; + WRITE_ONCE(dst_cache->reset_ts, jiffies); } /** @@ -102,7 +102,7 @@ int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp); * @dst_cache: the cache * * No synchronization is enforced: it must be called only when the cache - * is unsed. + * is unused. */ void dst_cache_destroy(struct dst_cache *dst_cache); diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 1b7fae4c6b24..1fc2fb03ce3f 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -3,6 +3,7 @@ #define __NET_DST_METADATA_H 1 #include <linux/skbuff.h> +#include <net/ip.h> #include <net/ip_tunnels.h> #include <net/macsec.h> #include <net/dst.h> @@ -198,7 +199,7 @@ static inline struct metadata_dst *__ip_tun_set_dst(__be32 saddr, __be32 daddr, __u8 tos, __u8 ttl, __be16 tp_dst, - __be16 flags, + const unsigned long *flags, __be64 tunnel_id, int md_size) { @@ -215,14 +216,20 @@ static inline struct metadata_dst *__ip_tun_set_dst(__be32 saddr, } static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb, - __be16 flags, + const unsigned long *flags, __be64 tunnel_id, int md_size) { const struct iphdr *iph = ip_hdr(skb); + struct metadata_dst *tun_dst; + + tun_dst = __ip_tun_set_dst(iph->saddr, iph->daddr, iph->tos, iph->ttl, + 0, flags, tunnel_id, md_size); - return __ip_tun_set_dst(iph->saddr, iph->daddr, iph->tos, iph->ttl, - 0, flags, tunnel_id, md_size); + if (tun_dst && (iph->frag_off & htons(IP_DF))) + __set_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, + tun_dst->u.tun_info.key.tun_flags); + return tun_dst; } static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *saddr, @@ -230,7 +237,7 @@ static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *sad __u8 tos, __u8 ttl, __be16 tp_dst, __be32 label, - __be16 flags, + const unsigned long *flags, __be64 tunnel_id, int md_size) { @@ -243,7 +250,7 @@ static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *sad info = &tun_dst->u.tun_info; info->mode = IP_TUNNEL_INFO_IPV6; - info->key.tun_flags = flags; + ip_tunnel_flags_copy(info->key.tun_flags, flags); info->key.tun_id = tunnel_id; info->key.tp_src = 0; info->key.tp_dst = tp_dst; @@ -259,7 +266,7 @@ static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *sad } static inline struct metadata_dst *ipv6_tun_rx_dst(struct sk_buff *skb, - __be16 flags, + const unsigned long *flags, __be64 tunnel_id, int md_size) { diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h index 6d1c8541183d..3a9001a042a5 100644 --- a/include/net/dst_ops.h +++ b/include/net/dst_ops.h @@ -24,7 +24,7 @@ struct dst_ops { void (*destroy)(struct dst_entry *); void (*ifdown)(struct dst_entry *, struct net_device *dev); - struct dst_entry * (*negative_advice)(struct dst_entry *); + void (*negative_advice)(struct sock *sk, struct dst_entry *); void (*link_failure)(struct sk_buff *); void (*update_pmtu)(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, diff --git a/include/net/eee.h b/include/net/eee.h new file mode 100644 index 000000000000..cfab1b8bc46a --- /dev/null +++ b/include/net/eee.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _EEE_H +#define _EEE_H + +#include <linux/types.h> + +struct eee_config { + u32 tx_lpi_timer; + bool tx_lpi_enabled; + bool eee_enabled; +}; + +static inline bool eeecfg_mac_can_tx_lpi(const struct eee_config *eeecfg) +{ + /* eee_enabled is the master on/off */ + return eeecfg->eee_enabled && eeecfg->tx_lpi_enabled; +} + +static inline void eeecfg_to_eee(struct ethtool_keee *eee, + const struct eee_config *eeecfg) +{ + eee->tx_lpi_timer = eeecfg->tx_lpi_timer; + eee->tx_lpi_enabled = eeecfg->tx_lpi_enabled; + eee->eee_enabled = eeecfg->eee_enabled; +} + +static inline void eee_to_eeecfg(struct eee_config *eeecfg, + const struct ethtool_keee *eee) +{ + eeecfg->tx_lpi_timer = eee->tx_lpi_timer; + eeecfg->tx_lpi_enabled = eee->tx_lpi_enabled; + eeecfg->eee_enabled = eee->eee_enabled; +} + +#endif diff --git a/include/net/erspan.h b/include/net/erspan.h index 6cb4cbd6a48f..c6209e7b6c96 100644 --- a/include/net/erspan.h +++ b/include/net/erspan.h @@ -89,7 +89,7 @@ enum erspan_encap_type { ERSPAN_ENCAP_NOVLAN = 0x0, /* originally without VLAN tag */ ERSPAN_ENCAP_ISL = 0x1, /* originally ISL encapsulated */ ERSPAN_ENCAP_8021Q = 0x2, /* originally 802.1Q encapsulated */ - ERSPAN_ENCAP_INFRAME = 0x3, /* VLAN tag perserved in frame */ + ERSPAN_ENCAP_INFRAME = 0x3, /* VLAN tag preserved in frame */ }; #define ERSPAN_V1_MDSIZE 4 @@ -192,7 +192,7 @@ static inline void erspan_build_header(struct sk_buff *skb, enc_type = ERSPAN_ENCAP_NOVLAN; /* If mirrored packet has vlan tag, extract tci and - * perserve vlan header in the mirrored frame. + * preserve vlan header in the mirrored frame. */ if (eth->h_proto == htons(ETH_P_8021Q)) { qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN); diff --git a/include/net/espintcp.h b/include/net/espintcp.h index 0335bbd76552..c70efd704b6d 100644 --- a/include/net/espintcp.h +++ b/include/net/espintcp.h @@ -32,7 +32,7 @@ struct espintcp_ctx { static inline struct espintcp_ctx *espintcp_getctx(const struct sock *sk) { - struct inet_connection_sock *icsk = inet_csk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); /* RCU is only needed for diag */ return (__force void *)icsk->icsk_ulp_data; diff --git a/include/net/fib_notifier.h b/include/net/fib_notifier.h index 6d59221ff05a..48aad6128fea 100644 --- a/include/net/fib_notifier.h +++ b/include/net/fib_notifier.h @@ -28,7 +28,7 @@ enum fib_event_type { struct fib_notifier_ops { int family; struct list_head list; - unsigned int (*fib_seq_read)(struct net *net); + unsigned int (*fib_seq_read)(const struct net *net); int (*fib_dump)(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); struct module *owner; diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 82da359bca03..6e68e359ad18 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -43,6 +43,10 @@ struct fib_rule { struct fib_kuid_range uid_range; struct fib_rule_port_range sport_range; struct fib_rule_port_range dport_range; + u16 sport_mask; + u16 dport_mask; + u8 iif_is_l3_master; + u8 oif_is_l3_master; struct rcu_head rcu; }; @@ -146,6 +150,17 @@ static inline bool fib_rule_port_inrange(const struct fib_rule_port_range *a, ntohs(port) <= a->end; } +static inline bool fib_rule_port_match(const struct fib_rule_port_range *range, + u16 port_mask, __be16 port) +{ + if ((range->start ^ ntohs(port)) & port_mask) + return false; + if (!port_mask && fib_rule_port_range_set(range) && + !fib_rule_port_inrange(range, port)) + return false; + return true; +} + static inline bool fib_rule_port_range_valid(const struct fib_rule_port_range *a) { return a->start != 0 && a->end != 0 && a->end < 0xffff && @@ -159,6 +174,12 @@ static inline bool fib_rule_port_range_compare(struct fib_rule_port_range *a, a->end == b->end; } +static inline bool +fib_rule_port_is_range(const struct fib_rule_port_range *range) +{ + return range->start != range->end; +} + static inline bool fib_rule_requires_fldissect(struct fib_rule *rule) { return rule->iifindex != LOOPBACK_IFINDEX && (rule->ip_proto || @@ -172,17 +193,16 @@ void fib_rules_unregister(struct fib_rules_ops *); int fib_rules_lookup(struct fib_rules_ops *, struct flowi *, int flags, struct fib_lookup_arg *); -int fib_default_rule_add(struct fib_rules_ops *, u32 pref, u32 table, - u32 flags); +int fib_default_rule_add(struct fib_rules_ops *, u32 pref, u32 table); bool fib_rule_matchall(const struct fib_rule *rule); int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, struct netlink_ext_ack *extack); -unsigned int fib_rules_seq_read(struct net *net, int family); +unsigned int fib_rules_seq_read(const struct net *net, int family); -int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack); -int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack); +int fib_newrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, bool rtnl_held); +int fib_delrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, bool rtnl_held); INDIRECT_CALLABLE_DECLARE(int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)); diff --git a/include/net/flow.h b/include/net/flow.h index 7f0adda3bf2f..ae9481c40063 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -12,6 +12,7 @@ #include <linux/atomic.h> #include <linux/container_of.h> #include <linux/uidgid.h> +#include <net/inet_dscp.h> struct flow_keys; @@ -32,16 +33,18 @@ struct flowi_common { int flowic_iif; int flowic_l3mdev; __u32 flowic_mark; - __u8 flowic_tos; + dscp_t flowic_dscp; __u8 flowic_scope; __u8 flowic_proto; __u8 flowic_flags; #define FLOWI_FLAG_ANYSRC 0x01 #define FLOWI_FLAG_KNOWN_NH 0x02 +#define FLOWI_FLAG_L3MDEV_OIF 0x04 +#define FLOWI_FLAG_ANY_SPORT 0x08 __u32 flowic_secid; kuid_t flowic_uid; - struct flowi_tunnel flowic_tun_key; __u32 flowic_multipath_hash; + struct flowi_tunnel flowic_tun_key; }; union flowi_uli { @@ -68,7 +71,7 @@ struct flowi4 { #define flowi4_iif __fl_common.flowic_iif #define flowi4_l3mdev __fl_common.flowic_l3mdev #define flowi4_mark __fl_common.flowic_mark -#define flowi4_tos __fl_common.flowic_tos +#define flowi4_dscp __fl_common.flowic_dscp #define flowi4_scope __fl_common.flowic_scope #define flowi4_proto __fl_common.flowic_proto #define flowi4_flags __fl_common.flowic_flags @@ -101,7 +104,7 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, fl4->flowi4_iif = LOOPBACK_IFINDEX; fl4->flowi4_l3mdev = 0; fl4->flowi4_mark = mark; - fl4->flowi4_tos = tos; + fl4->flowi4_dscp = inet_dsfield_to_dscp(tos); fl4->flowi4_scope = scope; fl4->flowi4_proto = proto; fl4->flowi4_flags = flags; @@ -139,7 +142,7 @@ struct flowi6 { #define flowi6_uid __fl_common.flowic_uid struct in6_addr daddr; struct in6_addr saddr; - /* Note: flowi6_tos is encoded in flowlabel, too. */ + /* Note: flowi6_dscp is encoded in flowlabel, too. */ __be32 flowlabel; union flowi_uli uli; #define fl6_sport uli.ports.sport @@ -161,7 +164,7 @@ struct flowi { #define flowi_iif u.__fl_common.flowic_iif #define flowi_l3mdev u.__fl_common.flowic_l3mdev #define flowi_mark u.__fl_common.flowic_mark -#define flowi_tos u.__fl_common.flowic_tos +#define flowi_dscp u.__fl_common.flowic_dscp #define flowi_scope u.__fl_common.flowic_scope #define flowi_proto u.__fl_common.flowic_proto #define flowi_flags u.__fl_common.flowic_flags diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 1a7131d6cb0e..ced79dc8e856 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -7,6 +7,7 @@ #include <linux/siphash.h> #include <linux/string.h> #include <uapi/linux/if_ether.h> +#include <uapi/linux/pkt_cls.h> struct bpf_prog; struct net; @@ -16,7 +17,8 @@ struct sk_buff; * struct flow_dissector_key_control: * @thoff: Transport header offset * @addr_type: Type of key. One of FLOW_DISSECTOR_KEY_* - * @flags: Key flags. Any of FLOW_DIS_(IS_FRAGMENT|FIRST_FRAGENCAPSULATION) + * @flags: Key flags. + * Any of FLOW_DIS_(IS_FRAGMENT|FIRST_FRAG|ENCAPSULATION|F_*) */ struct flow_dissector_key_control { u16 thoff; @@ -24,9 +26,20 @@ struct flow_dissector_key_control { u32 flags; }; -#define FLOW_DIS_IS_FRAGMENT BIT(0) -#define FLOW_DIS_FIRST_FRAG BIT(1) -#define FLOW_DIS_ENCAPSULATION BIT(2) +/* The control flags are kept in sync with TCA_FLOWER_KEY_FLAGS_*, as those + * flags are exposed to userspace in some error paths, ie. unsupported flags. + */ +enum flow_dissector_ctrl_flags { + FLOW_DIS_IS_FRAGMENT = TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT, + FLOW_DIS_FIRST_FRAG = TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST, + FLOW_DIS_F_TUNNEL_CSUM = TCA_FLOWER_KEY_FLAGS_TUNNEL_CSUM, + FLOW_DIS_F_TUNNEL_DONT_FRAGMENT = TCA_FLOWER_KEY_FLAGS_TUNNEL_DONT_FRAGMENT, + FLOW_DIS_F_TUNNEL_OAM = TCA_FLOWER_KEY_FLAGS_TUNNEL_OAM, + FLOW_DIS_F_TUNNEL_CRIT_OPT = TCA_FLOWER_KEY_FLAGS_TUNNEL_CRIT_OPT, + + /* These flags are internal to the kernel */ + FLOW_DIS_ENCAPSULATION = (TCA_FLOWER_KEY_FLAGS_MAX << 1), +}; enum flow_dissect_ret { FLOW_DISSECT_RET_OUT_GOOD, @@ -97,7 +110,7 @@ struct flow_dissector_key_enc_opts { * here but seems difficult to #include */ u8 len; - __be16 dst_opt_type; + u32 dst_opt_type; }; struct flow_dissector_key_keyid { @@ -433,6 +446,8 @@ static inline bool flow_keys_have_l4(const struct flow_keys *keys) } u32 flow_hash_from_keys(struct flow_keys *keys); +u32 flow_hash_from_keys_seed(struct flow_keys *keys, + const siphash_key_t *keyval); void skb_flow_get_icmp_tci(const struct sk_buff *skb, struct flow_dissector_key_icmp *key_icmp, const void *data, int thoff, int hlen); diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 9efa9a59e81f..596ab9791e4d 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -333,7 +333,7 @@ struct flow_action_entry { struct flow_action { unsigned int num_entries; - struct flow_action_entry entries[]; + struct flow_action_entry entries[] __counted_by(num_entries); }; static inline bool flow_action_has_entries(const struct flow_action *action) @@ -345,7 +345,7 @@ static inline bool flow_action_has_entries(const struct flow_action *action) * flow_offload_has_one_action() - check if exactly one action is present * @action: tc filter flow offload action * - * Returns true if exactly one action is present. + * Return: true if exactly one action is present. */ static inline bool flow_offload_has_one_action(const struct flow_action *action) { @@ -449,6 +449,96 @@ static inline bool flow_rule_match_key(const struct flow_rule *rule, return dissector_uses_key(rule->match.dissector, key); } +/** + * flow_rule_is_supp_control_flags() - check for supported control flags + * @supp_flags: control flags supported by driver + * @ctrl_flags: control flags present in rule + * @extack: The netlink extended ACK for reporting errors. + * + * Return: true if only supported control flags are set, false otherwise. + */ +static inline bool flow_rule_is_supp_control_flags(const u32 supp_flags, + const u32 ctrl_flags, + struct netlink_ext_ack *extack) +{ + if (likely((ctrl_flags & ~supp_flags) == 0)) + return true; + + NL_SET_ERR_MSG_FMT_MOD(extack, + "Unsupported match on control.flags %#x", + ctrl_flags); + + return false; +} + +/** + * flow_rule_is_supp_enc_control_flags() - check for supported control flags + * @supp_enc_flags: encapsulation control flags supported by driver + * @enc_ctrl_flags: encapsulation control flags present in rule + * @extack: The netlink extended ACK for reporting errors. + * + * Return: true if only supported control flags are set, false otherwise. + */ +static inline bool flow_rule_is_supp_enc_control_flags(const u32 supp_enc_flags, + const u32 enc_ctrl_flags, + struct netlink_ext_ack *extack) +{ + if (likely((enc_ctrl_flags & ~supp_enc_flags) == 0)) + return true; + + NL_SET_ERR_MSG_FMT_MOD(extack, + "Unsupported match on enc_control.flags %#x", + enc_ctrl_flags); + + return false; +} + +/** + * flow_rule_has_control_flags() - check for presence of any control flags + * @ctrl_flags: control flags present in rule + * @extack: The netlink extended ACK for reporting errors. + * + * Return: true if control flags are set, false otherwise. + */ +static inline bool flow_rule_has_control_flags(const u32 ctrl_flags, + struct netlink_ext_ack *extack) +{ + return !flow_rule_is_supp_control_flags(0, ctrl_flags, extack); +} + +/** + * flow_rule_has_enc_control_flags() - check for presence of any control flags + * @enc_ctrl_flags: encapsulation control flags present in rule + * @extack: The netlink extended ACK for reporting errors. + * + * Return: true if control flags are set, false otherwise. + */ +static inline bool flow_rule_has_enc_control_flags(const u32 enc_ctrl_flags, + struct netlink_ext_ack *extack) +{ + return !flow_rule_is_supp_enc_control_flags(0, enc_ctrl_flags, extack); +} + +/** + * flow_rule_match_has_control_flags() - match and check for any control flags + * @rule: The flow_rule under evaluation. + * @extack: The netlink extended ACK for reporting errors. + * + * Return: true if control flags are set, false otherwise. + */ +static inline bool flow_rule_match_has_control_flags(struct flow_rule *rule, + struct netlink_ext_ack *extack) +{ + struct flow_match_control match; + + if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_CONTROL)) + return false; + + flow_rule_match_control(rule, &match); + + return flow_rule_has_control_flags(match.mask->flags, extack); +} + struct flow_stats { u64 pkts; u64 bytes; @@ -595,6 +685,7 @@ struct flow_cls_common_offload { u32 chain_index; __be16 protocol; u32 prio; + bool skip_sw; struct netlink_ext_ack *extack; }; diff --git a/include/net/genetlink.h b/include/net/genetlink.h index e18a4c0d69ee..7b84f2cef8b1 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -2,16 +2,29 @@ #ifndef __NET_GENERIC_NETLINK_H #define __NET_GENERIC_NETLINK_H -#include <linux/genetlink.h> +#include <linux/net.h> #include <net/netlink.h> #include <net/net_namespace.h> +#include <uapi/linux/genetlink.h> #define GENLMSG_DEFAULT_SIZE (NLMSG_DEFAULT_SIZE - GENL_HDRLEN) +/* Non-parallel generic netlink requests are serialized by a global lock. */ +void genl_lock(void); +void genl_unlock(void); + +#define MODULE_ALIAS_GENL_FAMILY(family) \ + MODULE_ALIAS_NET_PF_PROTO_NAME(PF_NETLINK, NETLINK_GENERIC, "-family-" family) + +/* Binding to multicast group requires %CAP_NET_ADMIN */ +#define GENL_MCAST_CAP_NET_ADMIN BIT(0) +/* Binding to multicast group requires %CAP_SYS_ADMIN */ +#define GENL_MCAST_CAP_SYS_ADMIN BIT(1) + /** * struct genl_multicast_group - generic netlink multicast group * @name: name of the multicast group, names are per-family - * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM) + * @flags: GENL_MCAST_* flags */ struct genl_multicast_group { char name[GENL_NAMSIZ]; @@ -36,6 +49,8 @@ struct genl_info; * do additional, common, filtering and return an error * @post_doit: called after an operation's doit callback, it may * undo operations done by pre_doit, for example release locks + * @bind: called when family multicast group is added to a netlink socket + * @unbind: called when family multicast group is removed from a netlink socket * @module: pointer to the owning module (set to THIS_MODULE) * @mcgrps: multicast groups used by this family * @n_mcgrps: number of multicast groups @@ -47,8 +62,11 @@ struct genl_info; * @small_ops: the small-struct operations supported by this family * @n_small_ops: number of small-struct operations supported by this family * @split_ops: the split do/dump form of operation definition - * @n_split_ops: number of entries in @split_ops, not that with split do/dump + * @n_split_ops: number of entries in @split_ops, note that with split do/dump * ops the number of entries is not the same as number of commands + * @sock_priv_size: the size of per-socket private memory + * @sock_priv_init: the per-socket private memory initializer + * @sock_priv_destroy: the per-socket private memory destructor * * Attribute policies (the combination of @policy and @maxattr fields) * can be attached at the family level or at the operation level. @@ -76,17 +94,25 @@ struct genl_family { void (*post_doit)(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); + int (*bind)(int mcgrp); + void (*unbind)(int mcgrp); const struct genl_ops * ops; const struct genl_small_ops *small_ops; const struct genl_split_ops *split_ops; const struct genl_multicast_group *mcgrps; struct module *module; + size_t sock_priv_size; + void (*sock_priv_init)(void *priv); + void (*sock_priv_destroy)(void *priv); + /* private: internal use only */ /* protocol family identifier */ int id; /* starting number of multicast group IDs in this family */ unsigned int mcgrp_offset; + /* list of per-socket privs */ + struct xarray *sock_privs; }; /** @@ -98,7 +124,8 @@ struct genl_family { * @genlhdr: generic netlink message header * @attrs: netlink attributes * @_net: network namespace - * @user_ptr: user pointers + * @ctx: storage space for the use by the family + * @user_ptr: user pointers (deprecated, use ctx instead) * @extack: extended ACK report struct */ struct genl_info { @@ -109,7 +136,10 @@ struct genl_info { struct genlmsghdr * genlhdr; struct nlattr ** attrs; possible_net_t _net; - void * user_ptr[2]; + union { + u8 ctx[NETLINK_CTX_SIZE]; + void * user_ptr[2]; + }; struct netlink_ext_ack *extack; }; @@ -135,7 +165,7 @@ static inline void *genl_info_userhdr(const struct genl_info *info) /* Report that a root attribute is missing */ #define GENL_REQ_ATTR_CHECK(info, attr) ({ \ - struct genl_info *__info = (info); \ + const struct genl_info *__info = (info); \ \ NL_REQ_ATTR_CHECK(__info->extack, NULL, __info->attrs, (attr)); \ }) @@ -296,6 +326,8 @@ static inline bool genl_info_is_ntf(const struct genl_info *info) return !info->nlhdr; } +void *__genl_sk_priv_get(struct genl_family *family, struct sock *sk); +void *genl_sk_priv_get(struct genl_family *family, struct sock *sk); int genl_register_family(struct genl_family *family); int genl_unregister_family(const struct genl_family *family); void genl_notify(const struct genl_family *family, struct sk_buff *skb, @@ -322,7 +354,7 @@ __genlmsg_iput(struct sk_buff *skb, const struct genl_info *info, int flags) * such requests) or a struct initialized by genl_info_init_ntf() * when constructing notifications. * - * Returns pointer to new genetlink header. + * Returns: pointer to new genetlink header. */ static inline void * genlmsg_iput(struct sk_buff *skb, const struct genl_info *info) @@ -334,7 +366,7 @@ genlmsg_iput(struct sk_buff *skb, const struct genl_info *info) * genlmsg_nlhdr - Obtain netlink header from user specified header * @user_hdr: user header as returned from genlmsg_put() * - * Returns pointer to netlink header. + * Returns: pointer to netlink header. */ static inline struct nlmsghdr *genlmsg_nlhdr(void *user_hdr) { @@ -403,7 +435,7 @@ static inline void genl_dump_check_consistent(struct netlink_callback *cb, * @flags: netlink message flags * @cmd: generic netlink command * - * Returns pointer to user specific header + * Returns: pointer to user specific header */ static inline void *genlmsg_put_reply(struct sk_buff *skb, struct genl_info *info, @@ -436,6 +468,35 @@ static inline void genlmsg_cancel(struct sk_buff *skb, void *hdr) } /** + * genlmsg_multicast_netns_filtered - multicast a netlink message + * to a specific netns with filter + * function + * @family: the generic netlink family + * @net: the net namespace + * @skb: netlink message as socket buffer + * @portid: own netlink portid to avoid sending to yourself + * @group: offset of multicast group in groups array + * @flags: allocation flags + * @filter: filter function + * @filter_data: filter function private data + * + * Return: 0 on success, negative error code for failure. + */ +static inline int +genlmsg_multicast_netns_filtered(const struct genl_family *family, + struct net *net, struct sk_buff *skb, + u32 portid, unsigned int group, gfp_t flags, + netlink_filter_fn filter, + void *filter_data) +{ + if (WARN_ON_ONCE(group >= family->n_mcgrps)) + return -EINVAL; + group = family->mcgrp_offset + group; + return nlmsg_multicast_filtered(net->genl_sock, skb, portid, group, + flags, filter, filter_data); +} + +/** * genlmsg_multicast_netns - multicast a netlink message to a specific netns * @family: the generic netlink family * @net: the net namespace @@ -448,10 +509,8 @@ static inline int genlmsg_multicast_netns(const struct genl_family *family, struct net *net, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags) { - if (WARN_ON_ONCE(group >= family->n_mcgrps)) - return -EINVAL; - group = family->mcgrp_offset + group; - return nlmsg_multicast(net->genl_sock, skb, portid, group, flags); + return genlmsg_multicast_netns_filtered(family, net, skb, portid, + group, flags, NULL, NULL); } /** @@ -476,13 +535,12 @@ static inline int genlmsg_multicast(const struct genl_family *family, * @skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: offset of multicast group in groups array - * @flags: allocation flags * * This function must hold the RTNL or rcu_read_lock(). */ int genlmsg_multicast_allns(const struct genl_family *family, struct sk_buff *skb, u32 portid, - unsigned int group, gfp_t flags); + unsigned int group); /** * genlmsg_unicast - unicast a netlink message diff --git a/include/net/gre.h b/include/net/gre.h index 4e209708b754..ccd293203284 100644 --- a/include/net/gre.h +++ b/include/net/gre.h @@ -49,67 +49,61 @@ static inline bool netif_is_ip6gretap(const struct net_device *dev) !strcmp(dev->rtnl_link_ops->kind, "ip6gretap"); } -static inline int gre_calc_hlen(__be16 o_flags) +static inline int gre_calc_hlen(const unsigned long *o_flags) { int addend = 4; - if (o_flags & TUNNEL_CSUM) + if (test_bit(IP_TUNNEL_CSUM_BIT, o_flags)) addend += 4; - if (o_flags & TUNNEL_KEY) + if (test_bit(IP_TUNNEL_KEY_BIT, o_flags)) addend += 4; - if (o_flags & TUNNEL_SEQ) + if (test_bit(IP_TUNNEL_SEQ_BIT, o_flags)) addend += 4; return addend; } -static inline __be16 gre_flags_to_tnl_flags(__be16 flags) +static inline void gre_flags_to_tnl_flags(unsigned long *dst, __be16 flags) { - __be16 tflags = 0; - - if (flags & GRE_CSUM) - tflags |= TUNNEL_CSUM; - if (flags & GRE_ROUTING) - tflags |= TUNNEL_ROUTING; - if (flags & GRE_KEY) - tflags |= TUNNEL_KEY; - if (flags & GRE_SEQ) - tflags |= TUNNEL_SEQ; - if (flags & GRE_STRICT) - tflags |= TUNNEL_STRICT; - if (flags & GRE_REC) - tflags |= TUNNEL_REC; - if (flags & GRE_VERSION) - tflags |= TUNNEL_VERSION; - - return tflags; + IP_TUNNEL_DECLARE_FLAGS(res) = { }; + + __assign_bit(IP_TUNNEL_CSUM_BIT, res, flags & GRE_CSUM); + __assign_bit(IP_TUNNEL_ROUTING_BIT, res, flags & GRE_ROUTING); + __assign_bit(IP_TUNNEL_KEY_BIT, res, flags & GRE_KEY); + __assign_bit(IP_TUNNEL_SEQ_BIT, res, flags & GRE_SEQ); + __assign_bit(IP_TUNNEL_STRICT_BIT, res, flags & GRE_STRICT); + __assign_bit(IP_TUNNEL_REC_BIT, res, flags & GRE_REC); + __assign_bit(IP_TUNNEL_VERSION_BIT, res, flags & GRE_VERSION); + + ip_tunnel_flags_copy(dst, res); } -static inline __be16 gre_tnl_flags_to_gre_flags(__be16 tflags) +static inline __be16 gre_tnl_flags_to_gre_flags(const unsigned long *tflags) { __be16 flags = 0; - if (tflags & TUNNEL_CSUM) + if (test_bit(IP_TUNNEL_CSUM_BIT, tflags)) flags |= GRE_CSUM; - if (tflags & TUNNEL_ROUTING) + if (test_bit(IP_TUNNEL_ROUTING_BIT, tflags)) flags |= GRE_ROUTING; - if (tflags & TUNNEL_KEY) + if (test_bit(IP_TUNNEL_KEY_BIT, tflags)) flags |= GRE_KEY; - if (tflags & TUNNEL_SEQ) + if (test_bit(IP_TUNNEL_SEQ_BIT, tflags)) flags |= GRE_SEQ; - if (tflags & TUNNEL_STRICT) + if (test_bit(IP_TUNNEL_STRICT_BIT, tflags)) flags |= GRE_STRICT; - if (tflags & TUNNEL_REC) + if (test_bit(IP_TUNNEL_REC_BIT, tflags)) flags |= GRE_REC; - if (tflags & TUNNEL_VERSION) + if (test_bit(IP_TUNNEL_VERSION_BIT, tflags)) flags |= GRE_VERSION; return flags; } static inline void gre_build_header(struct sk_buff *skb, int hdr_len, - __be16 flags, __be16 proto, + const unsigned long *flags, __be16 proto, __be32 key, __be32 seq) { + IP_TUNNEL_DECLARE_FLAGS(cond) = { }; struct gre_base_hdr *greh; skb_push(skb, hdr_len); @@ -120,18 +114,22 @@ static inline void gre_build_header(struct sk_buff *skb, int hdr_len, greh->flags = gre_tnl_flags_to_gre_flags(flags); greh->protocol = proto; - if (flags & (TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_SEQ)) { + __set_bit(IP_TUNNEL_KEY_BIT, cond); + __set_bit(IP_TUNNEL_CSUM_BIT, cond); + __set_bit(IP_TUNNEL_SEQ_BIT, cond); + + if (ip_tunnel_flags_intersect(flags, cond)) { __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4); - if (flags & TUNNEL_SEQ) { + if (test_bit(IP_TUNNEL_SEQ_BIT, flags)) { *ptr = seq; ptr--; } - if (flags & TUNNEL_KEY) { + if (test_bit(IP_TUNNEL_KEY_BIT, flags)) { *ptr = key; ptr--; } - if (flags & TUNNEL_CSUM && + if (test_bit(IP_TUNNEL_CSUM_BIT, flags) && !(skb_shinfo(skb)->gso_type & (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) { *ptr = 0; diff --git a/include/net/gro.h b/include/net/gro.h index 88644b3ca660..b65f631c521d 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _NET_IPV6_GRO_H -#define _NET_IPV6_GRO_H +#ifndef _NET_GRO_H +#define _NET_GRO_H #include <linux/indirect_call_wrapper.h> #include <linux/ip.h> @@ -9,6 +9,10 @@ #include <net/ip6_checksum.h> #include <linux/skbuff.h> #include <net/udp.h> +#include <net/hotdata.h> + +/* This should be increased if a protocol with a bigger head is added. */ +#define GRO_MAX_HEAD (MAX_HEADER + 128) struct napi_gro_cb { union { @@ -35,15 +39,14 @@ struct napi_gro_cb { /* This is non-zero if the packet cannot be merged with the new skb. */ u16 flush; - /* Save the IP ID here and check when we get to the transport layer */ - u16 flush_id; - /* Number of segments aggregated. */ u16 count; - /* Used in ipv6_gro_receive() and foo-over-udp */ + /* Used in ipv6_gro_receive() and foo-over-udp and esp-in-udp */ u16 proto; + u16 pad; + /* Used in napi_gro_cb::free */ #define NAPI_GRO_FREE 1 #define NAPI_GRO_FREE_STOLEN_HEAD 2 @@ -68,14 +71,11 @@ struct napi_gro_cb { /* Free the skb? */ u8 free:2; - /* Used in foo-over-udp, set in udp[46]_gro_receive */ - u8 is_ipv6:1; - /* Used in GRE, set in fou/gue_gro_receive */ u8 is_fou:1; - /* Used to determine if flush_id can be ignored */ - u8 is_atomic:1; + /* Used to determine if ipid_offset can be ignored */ + u8 ip_fixedid:2; /* Number of gro_receive callbacks this packet already went through */ u8 recursion_counter:4; @@ -86,6 +86,15 @@ struct napi_gro_cb { /* used to support CHECKSUM_COMPLETE for tunneling protocols */ __wsum csum; + + /* L3 offsets */ + union { + struct { + u16 network_offset; + u16 inner_network_offset; + }; + u16 network_offsets[2]; + }; }; #define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb) @@ -139,21 +148,16 @@ static inline void skb_gro_pull(struct sk_buff *skb, unsigned int len) NAPI_GRO_CB(skb)->data_offset += len; } -static inline void *skb_gro_header_fast(struct sk_buff *skb, +static inline void *skb_gro_header_fast(const struct sk_buff *skb, unsigned int offset) { return NAPI_GRO_CB(skb)->frag0 + offset; } -static inline int skb_gro_header_hard(struct sk_buff *skb, unsigned int hlen) -{ - return NAPI_GRO_CB(skb)->frag0_len < hlen; -} - -static inline void skb_gro_frag0_invalidate(struct sk_buff *skb) +static inline bool skb_gro_may_pull(const struct sk_buff *skb, + unsigned int hlen) { - NAPI_GRO_CB(skb)->frag0 = NULL; - NAPI_GRO_CB(skb)->frag0_len = 0; + return likely(hlen <= NAPI_GRO_CB(skb)->frag0_len); } static inline void *skb_gro_header_slow(struct sk_buff *skb, unsigned int hlen, @@ -162,28 +166,35 @@ static inline void *skb_gro_header_slow(struct sk_buff *skb, unsigned int hlen, if (!pskb_may_pull(skb, hlen)) return NULL; - skb_gro_frag0_invalidate(skb); return skb->data + offset; } -static inline void *skb_gro_header(struct sk_buff *skb, - unsigned int hlen, unsigned int offset) +static inline void *skb_gro_header(struct sk_buff *skb, unsigned int hlen, + unsigned int offset) { void *ptr; ptr = skb_gro_header_fast(skb, offset); - if (skb_gro_header_hard(skb, hlen)) + if (!skb_gro_may_pull(skb, hlen)) ptr = skb_gro_header_slow(skb, hlen, offset); return ptr; } -static inline void *skb_gro_network_header(struct sk_buff *skb) +static inline int skb_gro_receive_network_offset(const struct sk_buff *skb) { - return (NAPI_GRO_CB(skb)->frag0 ?: skb->data) + - skb_network_offset(skb); + return NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark]; } -static inline __wsum inet_gro_compute_pseudo(struct sk_buff *skb, int proto) +static inline void *skb_gro_network_header(const struct sk_buff *skb) +{ + if (skb_gro_may_pull(skb, skb_gro_offset(skb))) + return skb_gro_header_fast(skb, skb_gro_receive_network_offset(skb)); + + return skb->data + skb_gro_receive_network_offset(skb); +} + +static inline __wsum inet_gro_compute_pseudo(const struct sk_buff *skb, + int proto) { const struct iphdr *iph = skb_gro_network_header(skb); @@ -421,7 +432,8 @@ static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb) return uh; } -static inline __wsum ip6_gro_compute_pseudo(struct sk_buff *skb, int proto) +static inline __wsum ip6_gro_compute_pseudo(const struct sk_buff *skb, + int proto) { const struct ipv6hdr *iph = skb_gro_network_header(skb); @@ -429,29 +441,112 @@ static inline __wsum ip6_gro_compute_pseudo(struct sk_buff *skb, int proto) skb_gro_len(skb), proto, 0)); } +static inline int inet_gro_flush(const struct iphdr *iph, const struct iphdr *iph2, + struct sk_buff *p, bool inner) +{ + const u32 id = ntohl(*(__be32 *)&iph->id); + const u32 id2 = ntohl(*(__be32 *)&iph2->id); + const u16 ipid_offset = (id >> 16) - (id2 >> 16); + const u16 count = NAPI_GRO_CB(p)->count; + + /* All fields must match except length and checksum. */ + if ((iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | ((id ^ id2) & IP_DF)) + return true; + + /* When we receive our second frame we can make a decision on if we + * continue this flow as an atomic flow with a fixed ID or if we use + * an incrementing ID. + */ + if (count == 1 && !ipid_offset) + NAPI_GRO_CB(p)->ip_fixedid |= 1 << inner; + + return ipid_offset ^ (count * !(NAPI_GRO_CB(p)->ip_fixedid & (1 << inner))); +} + +static inline int ipv6_gro_flush(const struct ipv6hdr *iph, const struct ipv6hdr *iph2) +{ + /* <Version:4><Traffic_Class:8><Flow_Label:20> */ + __be32 first_word = *(__be32 *)iph ^ *(__be32 *)iph2; + + /* Flush if Traffic Class fields are different. */ + return !!((first_word & htonl(0x0FF00000)) | + (__force __be32)(iph->hop_limit ^ iph2->hop_limit)); +} + +static inline int __gro_receive_network_flush(const void *th, const void *th2, + struct sk_buff *p, const u16 diff, + bool inner) +{ + const void *nh = th - diff; + const void *nh2 = th2 - diff; + + if (((struct iphdr *)nh)->version == 6) + return ipv6_gro_flush(nh, nh2); + else + return inet_gro_flush(nh, nh2, p, inner); +} + +static inline int gro_receive_network_flush(const void *th, const void *th2, + struct sk_buff *p) +{ + int off = skb_transport_offset(p); + int flush; + + flush = __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->network_offset, false); + if (NAPI_GRO_CB(p)->encap_mark) + flush |= __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->inner_network_offset, true); + + return flush; +} + int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb); +int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb); +void __gro_flush(struct gro_node *gro, bool flush_old); + +static inline void gro_flush(struct gro_node *gro, bool flush_old) +{ + if (!gro->bitmask) + return; + + __gro_flush(gro, flush_old); +} + +static inline void napi_gro_flush(struct napi_struct *napi, bool flush_old) +{ + gro_flush(&napi->gro, flush_old); +} /* Pass the currently batched GRO_NORMAL SKBs up to the stack. */ -static inline void gro_normal_list(struct napi_struct *napi) +static inline void gro_normal_list(struct gro_node *gro) { - if (!napi->rx_count) + if (!gro->rx_count) return; - netif_receive_skb_list_internal(&napi->rx_list); - INIT_LIST_HEAD(&napi->rx_list); - napi->rx_count = 0; + netif_receive_skb_list_internal(&gro->rx_list); + INIT_LIST_HEAD(&gro->rx_list); + gro->rx_count = 0; +} + +static inline void gro_flush_normal(struct gro_node *gro, bool flush_old) +{ + gro_flush(gro, flush_old); + gro_normal_list(gro); } /* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, * pass the whole batch up to the stack. */ -static inline void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int segs) +static inline void gro_normal_one(struct gro_node *gro, struct sk_buff *skb, + int segs) { - list_add_tail(&skb->list, &napi->rx_list); - napi->rx_count += segs; - if (napi->rx_count >= READ_ONCE(gro_normal_batch)) - gro_normal_list(napi); + list_add_tail(&skb->list, &gro->rx_list); + gro->rx_count += segs; + if (gro->rx_count >= READ_ONCE(net_hotdata.gro_normal_batch)) + gro_normal_list(gro); } +void gro_init(struct gro_node *gro); +void gro_cleanup(struct gro_node *gro); + /* This function is the alternative of 'inet_iif' and 'inet_sdif' * functions in case we can not rely on fields of IPCB. * @@ -495,6 +590,34 @@ static inline void inet6_get_iif_sdif(const struct sk_buff *skb, int *iif, int * #endif } -extern struct list_head offload_base; +struct packet_offload *gro_find_receive_by_type(__be16 type); +struct packet_offload *gro_find_complete_by_type(__be16 type); + +static inline struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb) +{ + unsigned int thlen, hlen, off; + struct tcphdr *th; + + off = skb_gro_offset(skb); + hlen = off + sizeof(*th); + th = skb_gro_header(skb, hlen, off); + if (unlikely(!th)) + return NULL; + + thlen = th->doff * 4; + if (unlikely(thlen < sizeof(*th))) + return NULL; + + hlen = off + thlen; + if (!skb_gro_may_pull(skb, hlen)) { + th = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!th)) + return NULL; + } + + skb_gro_pull(skb, thlen); + + return th; +} -#endif /* _NET_IPV6_GRO_H */ +#endif /* _NET_GRO_H */ diff --git a/include/net/gtp.h b/include/net/gtp.h index 2a503f035d18..c0253c8702d0 100644 --- a/include/net/gtp.h +++ b/include/net/gtp.h @@ -78,4 +78,9 @@ static inline bool netif_is_gtp(const struct net_device *dev) #define GTP1_F_EXTHDR 0x04 #define GTP1_F_MASK 0x07 +struct gtp_ext_hdr { + __u8 len; + __u8 data[]; +}; + #endif diff --git a/include/net/hotdata.h b/include/net/hotdata.h new file mode 100644 index 000000000000..4acec191c54a --- /dev/null +++ b/include/net/hotdata.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _NET_HOTDATA_H +#define _NET_HOTDATA_H + +#include <linux/llist.h> +#include <linux/types.h> +#include <linux/netdevice.h> +#include <net/protocol.h> + +struct skb_defer_node { + struct llist_head defer_list; + atomic_long_t defer_count; +} ____cacheline_aligned_in_smp; + +/* Read mostly data used in network fast paths. */ +struct net_hotdata { +#if IS_ENABLED(CONFIG_INET) + struct packet_offload ip_packet_offload; + struct net_offload tcpv4_offload; + struct net_protocol tcp_protocol; + struct net_offload udpv4_offload; + struct net_protocol udp_protocol; + struct packet_offload ipv6_packet_offload; + struct net_offload tcpv6_offload; +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_protocol tcpv6_protocol; + struct inet6_protocol udpv6_protocol; +#endif + struct net_offload udpv6_offload; +#endif + struct list_head offload_base; + struct kmem_cache *skbuff_cache; + struct kmem_cache *skbuff_fclone_cache; + struct kmem_cache *skb_small_head_cache; +#ifdef CONFIG_RPS + struct rps_sock_flow_table __rcu *rps_sock_flow_table; + u32 rps_cpu_mask; +#endif + struct skb_defer_node __percpu *skb_defer_nodes; + int gro_normal_batch; + int netdev_budget; + int netdev_budget_usecs; + int tstamp_prequeue; + int max_backlog; + int dev_tx_weight; + int dev_rx_weight; + int sysctl_max_skb_frags; + int sysctl_skb_defer_max; + int sysctl_mem_pcpu_rsv; +}; + +#define inet_ehash_secret net_hotdata.tcp_protocol.secret +#define udp_ehash_secret net_hotdata.udp_protocol.secret +#define inet6_ehash_secret net_hotdata.tcpv6_protocol.secret +#define tcp_ipv6_hash_secret net_hotdata.tcpv6_offload.secret +#define udp6_ehash_secret net_hotdata.udpv6_protocol.secret +#define udp_ipv6_hash_secret net_hotdata.udpv6_offload.secret + +extern struct net_hotdata net_hotdata; + +#endif /* _NET_HOTDATA_H */ diff --git a/include/net/hwbm.h b/include/net/hwbm.h index aa495decec35..bdbe91c609ff 100644 --- a/include/net/hwbm.h +++ b/include/net/hwbm.h @@ -11,9 +11,9 @@ struct hwbm_pool { int frag_size; /* Number of buffers currently used by this pool */ int buf_num; - /* constructor called during alocation */ + /* constructor called during allocation */ int (*construct)(struct hwbm_pool *bm_pool, void *buf); - /* protect acces to the buffer counter*/ + /* protect access to the buffer counter*/ struct mutex buf_lock; /* private data */ void *priv; diff --git a/include/net/icmp.h b/include/net/icmp.h index caddf4a59ad1..935ee13d9ae9 100644 --- a/include/net/icmp.h +++ b/include/net/icmp.h @@ -37,10 +37,10 @@ struct sk_buff; struct net; void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, - const struct ip_options *opt); + const struct inet_skb_parm *parm); static inline void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) { - __icmp_send(skb_in, type, code, info, &IPCB(skb_in)->opt); + __icmp_send(skb_in, type, code, info, IPCB(skb_in)); } #if IS_ENABLED(CONFIG_NF_NAT) @@ -48,8 +48,10 @@ void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info); #else static inline void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) { - struct ip_options opts = { 0 }; - __icmp_send(skb_in, type, code, info, &opts); + struct inet_skb_parm parm; + + memset(&parm, 0, sizeof(parm)); + __icmp_send(skb_in, type, code, info, &parm); } #endif diff --git a/include/net/ieee80211_radiotap.h b/include/net/ieee80211_radiotap.h index 2338f8d2a8b3..c60867e7e43c 100644 --- a/include/net/ieee80211_radiotap.h +++ b/include/net/ieee80211_radiotap.h @@ -1,6 +1,6 @@ /* * Copyright (c) 2017 Intel Deutschland GmbH - * Copyright (c) 2018-2019, 2021-2022 Intel Corporation + * Copyright (c) 2018-2019, 2021-2022, 2025 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -18,31 +18,33 @@ #define __RADIOTAP_H #include <linux/kernel.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> /** * struct ieee80211_radiotap_header - base radiotap header */ struct ieee80211_radiotap_header { - /** - * @it_version: radiotap version, always 0 - */ - uint8_t it_version; - - /** - * @it_pad: padding (or alignment) - */ - uint8_t it_pad; - - /** - * @it_len: overall radiotap header length - */ - __le16 it_len; - - /** - * @it_present: (first) present word - */ - __le32 it_present; + __struct_group(ieee80211_radiotap_header_fixed, hdr, __packed, + /** + * @it_version: radiotap version, always 0 + */ + uint8_t it_version; + + /** + * @it_pad: padding (or alignment) + */ + uint8_t it_pad; + + /** + * @it_len: overall radiotap header length + */ + __le16 it_len; + + /** + * @it_present: (first) present word + */ + __le32 it_present; + ); /** * @it_optional: all remaining presence bitmaps @@ -50,6 +52,9 @@ struct ieee80211_radiotap_header { __le32 it_optional[]; } __packed; +static_assert(offsetof(struct ieee80211_radiotap_header, it_optional) == sizeof(struct ieee80211_radiotap_header_fixed), + "struct member likely outside of __struct_group()"); + /* version is always 0 */ #define PKTHDR_RADIOTAP_VERSION 0 @@ -197,6 +202,24 @@ enum ieee80211_radiotap_vht_coding { IEEE80211_RADIOTAP_CODING_LDPC_USER3 = 0x08, }; +enum ieee80211_radiotap_vht_bandwidth { + /* Note: more values are defined but can't really be used */ + IEEE80211_RADIOTAP_VHT_BW_20 = 0, + IEEE80211_RADIOTAP_VHT_BW_40 = 1, + IEEE80211_RADIOTAP_VHT_BW_80 = 4, + IEEE80211_RADIOTAP_VHT_BW_160 = 11, +}; + +struct ieee80211_radiotap_vht { + __le16 known; + u8 flags; + u8 bandwidth; + u8 mcs_nss[4]; + u8 coding; + u8 group_id; + __le16 partial_aid; +} __packed; + /* for IEEE80211_RADIOTAP_TIMESTAMP */ enum ieee80211_radiotap_timestamp_unit_spos { IEEE80211_RADIOTAP_TIMESTAMP_UNIT_MASK = 0x000F, @@ -539,6 +562,12 @@ enum ieee80211_radiotap_eht_usig_common { IEEE80211_RADIOTAP_EHT_USIG_COMMON_VALIDATE_BITS_OK = 0x00000080, IEEE80211_RADIOTAP_EHT_USIG_COMMON_PHY_VER = 0x00007000, IEEE80211_RADIOTAP_EHT_USIG_COMMON_BW = 0x00038000, + IEEE80211_RADIOTAP_EHT_USIG_COMMON_BW_20MHZ = 0, + IEEE80211_RADIOTAP_EHT_USIG_COMMON_BW_40MHZ = 1, + IEEE80211_RADIOTAP_EHT_USIG_COMMON_BW_80MHZ = 2, + IEEE80211_RADIOTAP_EHT_USIG_COMMON_BW_160MHZ = 3, + IEEE80211_RADIOTAP_EHT_USIG_COMMON_BW_320MHZ_1 = 4, + IEEE80211_RADIOTAP_EHT_USIG_COMMON_BW_320MHZ_2 = 5, IEEE80211_RADIOTAP_EHT_USIG_COMMON_UL_DL = 0x00040000, IEEE80211_RADIOTAP_EHT_USIG_COMMON_BSS_COLOR = 0x01f80000, IEEE80211_RADIOTAP_EHT_USIG_COMMON_TXOP = 0xfe000000, @@ -576,6 +605,7 @@ enum ieee80211_radiotap_eht_usig_tb { /** * ieee80211_get_radiotap_len - get radiotap header length * @data: pointer to the header + * Return: the radiotap header length */ static inline u16 ieee80211_get_radiotap_len(const char *data) { diff --git a/include/net/ieee802154_netdev.h b/include/net/ieee802154_netdev.h index 063313df447d..4de858f9929e 100644 --- a/include/net/ieee802154_netdev.h +++ b/include/net/ieee802154_netdev.h @@ -125,6 +125,35 @@ struct ieee802154_hdr_fc { #endif }; +struct ieee802154_assoc_req_pl { +#if defined(__LITTLE_ENDIAN_BITFIELD) + u8 reserved1:1, + device_type:1, + power_source:1, + rx_on_when_idle:1, + assoc_type:1, + reserved2:1, + security_cap:1, + alloc_addr:1; +#elif defined(__BIG_ENDIAN_BITFIELD) + u8 alloc_addr:1, + security_cap:1, + reserved2:1, + assoc_type:1, + rx_on_when_idle:1, + power_source:1, + device_type:1, + reserved1:1; +#else +#error "Please fix <asm/byteorder.h>" +#endif +} __packed; + +struct ieee802154_assoc_resp_pl { + __le16 short_addr; + u8 status; +} __packed; + enum ieee802154_frame_version { IEEE802154_2003_STD, IEEE802154_2006_STD, @@ -140,6 +169,19 @@ enum ieee802154_addressing_mode { IEEE802154_EXTENDED_ADDRESSING, }; +enum ieee802154_association_status { + IEEE802154_ASSOCIATION_SUCCESSFUL = 0x00, + IEEE802154_PAN_AT_CAPACITY = 0x01, + IEEE802154_PAN_ACCESS_DENIED = 0x02, + IEEE802154_HOPPING_SEQUENCE_OFFSET_DUP = 0x03, + IEEE802154_FAST_ASSOCIATION_SUCCESSFUL = 0x80, +}; + +enum ieee802154_disassociation_reason { + IEEE802154_COORD_WISHES_DEVICE_TO_LEAVE = 0x1, + IEEE802154_DEVICE_WISHES_TO_LEAVE = 0x2, +}; + struct ieee802154_hdr { struct ieee802154_hdr_fc fc; u8 seq; @@ -163,6 +205,24 @@ struct ieee802154_beacon_req_frame { struct ieee802154_mac_cmd_pl mac_pl; }; +struct ieee802154_association_req_frame { + struct ieee802154_hdr mhr; + struct ieee802154_mac_cmd_pl mac_pl; + struct ieee802154_assoc_req_pl assoc_req_pl; +}; + +struct ieee802154_association_resp_frame { + struct ieee802154_hdr mhr; + struct ieee802154_mac_cmd_pl mac_pl; + struct ieee802154_assoc_resp_pl assoc_resp_pl; +}; + +struct ieee802154_disassociation_notif_frame { + struct ieee802154_hdr mhr; + struct ieee802154_mac_cmd_pl mac_pl; + u8 disassoc_pl; +}; + /* pushes hdr onto the skb. fields of hdr->fc that can be calculated from * the contents of hdr will be, and the actual value of those bits in * hdr->fc will be ignored. this includes the INTRA_PAN bit and the frame diff --git a/include/net/ieee8021q.h b/include/net/ieee8021q.h new file mode 100644 index 000000000000..8bfe903dd3d0 --- /dev/null +++ b/include/net/ieee8021q.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2024 Pengutronix, Oleksij Rempel <kernel@pengutronix.de> */ + +#ifndef _NET_IEEE8021Q_H +#define _NET_IEEE8021Q_H + +#include <linux/errno.h> + +/** + * enum ieee8021q_traffic_type - 802.1Q traffic type priority values (802.1Q-2022) + * + * @IEEE8021Q_TT_BK: Background + * @IEEE8021Q_TT_BE: Best Effort (default). According to 802.1Q-2022, BE is 0 + * but has higher priority than BK which is 1. + * @IEEE8021Q_TT_EE: Excellent Effort + * @IEEE8021Q_TT_CA: Critical Applications + * @IEEE8021Q_TT_VI: Video, < 100 ms latency and jitter + * @IEEE8021Q_TT_VO: Voice, < 10 ms latency and jitter + * @IEEE8021Q_TT_IC: Internetwork Control + * @IEEE8021Q_TT_NC: Network Control + */ +enum ieee8021q_traffic_type { + IEEE8021Q_TT_BK = 0, + IEEE8021Q_TT_BE = 1, + IEEE8021Q_TT_EE = 2, + IEEE8021Q_TT_CA = 3, + IEEE8021Q_TT_VI = 4, + IEEE8021Q_TT_VO = 5, + IEEE8021Q_TT_IC = 6, + IEEE8021Q_TT_NC = 7, + + /* private: */ + IEEE8021Q_TT_MAX, +}; + +#define SIMPLE_IETF_DSCP_TO_IEEE8021Q_TT(dscp) ((dscp >> 3) & 0x7) + +#if IS_ENABLED(CONFIG_NET_IEEE8021Q_HELPERS) + +int ietf_dscp_to_ieee8021q_tt(u8 dscp); +int ieee8021q_tt_to_tc(enum ieee8021q_traffic_type tt, unsigned int num_queues); + +#else + +static inline int ietf_dscp_to_ieee8021q_tt(u8 dscp) +{ + return -EOPNOTSUPP; +} + +static inline int ieee8021q_tt_to_tc(enum ieee8021q_traffic_type tt, + unsigned int num_queues) +{ + return -EOPNOTSUPP; +} + +#endif +#endif /* _NET_IEEE8021Q_H */ diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index c8490729b4ae..238ad3349456 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -22,10 +22,6 @@ #define IF_RS_SENT 0x10 #define IF_READY 0x80000000 -/* prefix flags */ -#define IF_PREFIX_ONLINK 0x01 -#define IF_PREFIX_AUTOCONF 0x02 - enum { INET6_IFADDR_STATE_PREDAD, INET6_IFADDR_STATE_DAD, @@ -89,7 +85,7 @@ struct ip6_sf_socklist { unsigned int sl_max; unsigned int sl_count; struct rcu_head rcu; - struct in6_addr sl_addr[]; + struct in6_addr sl_addr[] __counted_by(sl_max); }; #define IP6_SFBLOCK 10 /* allocate this many at once */ @@ -148,7 +144,7 @@ struct ipv6_ac_socklist { struct ifacaddr6 { struct in6_addr aca_addr; struct fib6_info *aca_rt; - struct ifacaddr6 *aca_next; + struct ifacaddr6 __rcu *aca_next; struct hlist_node aca_addr_lst; int aca_users; refcount_t aca_refcnt; @@ -200,7 +196,7 @@ struct inet6_dev { spinlock_t mc_report_lock; /* mld query report lock */ struct mutex mc_lock; /* mld global lock */ - struct ifacaddr6 *ac_list; + struct ifacaddr6 __rcu *ac_list; rwlock_t lock; refcount_t refcnt; __u32 if_flags; diff --git a/include/net/inet6_connection_sock.h b/include/net/inet6_connection_sock.h index 025bd8d3c769..745891d2e113 100644 --- a/include/net/inet6_connection_sock.h +++ b/include/net/inet6_connection_sock.h @@ -21,8 +21,6 @@ struct sockaddr; struct dst_entry *inet6_csk_route_req(const struct sock *sk, struct flowi6 *fl6, const struct request_sock *req, u8 proto); -void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr); - int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl); struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu); diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index 533a7337865a..282e29237d93 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -40,8 +40,7 @@ static inline unsigned int __inet6_ehashfn(const u32 lhash, * * The sockhash lock must be held as a reader here. */ -struct sock *__inet6_lookup_established(struct net *net, - struct inet_hashinfo *hashinfo, +struct sock *__inet6_lookup_established(const struct net *net, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, @@ -56,7 +55,7 @@ inet6_ehashfn_t inet6_ehashfn; INDIRECT_CALLABLE_DECLARE(inet6_ehashfn_t udp6_ehashfn); -struct sock *inet6_lookup_reuseport(struct net *net, struct sock *sk, +struct sock *inet6_lookup_reuseport(const struct net *net, struct sock *sk, struct sk_buff *skb, int doff, const struct in6_addr *saddr, __be16 sport, @@ -64,8 +63,7 @@ struct sock *inet6_lookup_reuseport(struct net *net, struct sock *sk, unsigned short hnum, inet6_ehashfn_t *ehashfn); -struct sock *inet6_lookup_listener(struct net *net, - struct inet_hashinfo *hashinfo, +struct sock *inet6_lookup_listener(const struct net *net, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, @@ -73,7 +71,7 @@ struct sock *inet6_lookup_listener(struct net *net, const unsigned short hnum, const int dif, const int sdif); -struct sock *inet6_lookup_run_sk_lookup(struct net *net, +struct sock *inet6_lookup_run_sk_lookup(const struct net *net, int protocol, struct sk_buff *skb, int doff, const struct in6_addr *saddr, @@ -82,8 +80,7 @@ struct sock *inet6_lookup_run_sk_lookup(struct net *net, const u16 hnum, const int dif, inet6_ehashfn_t *ehashfn); -static inline struct sock *__inet6_lookup(struct net *net, - struct inet_hashinfo *hashinfo, +static inline struct sock *__inet6_lookup(const struct net *net, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, @@ -92,14 +89,14 @@ static inline struct sock *__inet6_lookup(struct net *net, const int dif, const int sdif, bool *refcounted) { - struct sock *sk = __inet6_lookup_established(net, hashinfo, saddr, - sport, daddr, hnum, + struct sock *sk = __inet6_lookup_established(net, saddr, sport, + daddr, hnum, dif, sdif); *refcounted = true; if (sk) return sk; *refcounted = false; - return inet6_lookup_listener(net, hashinfo, skb, doff, saddr, sport, + return inet6_lookup_listener(net, skb, doff, saddr, sport, daddr, hnum, dif, sdif); } @@ -143,14 +140,13 @@ struct sock *inet6_steal_sock(struct net *net, struct sk_buff *skb, int doff, return reuse_sk; } -static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, - struct sk_buff *skb, int doff, +static inline struct sock *__inet6_lookup_skb(struct sk_buff *skb, int doff, const __be16 sport, const __be16 dport, int iif, int sdif, bool *refcounted) { - struct net *net = dev_net(skb_dst(skb)->dev); + struct net *net = skb_dst_dev_net_rcu(skb); const struct ipv6hdr *ip6h = ipv6_hdr(skb); struct sock *sk; @@ -161,21 +157,17 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, if (sk) return sk; - return __inet6_lookup(net, hashinfo, skb, - doff, &ip6h->saddr, sport, + return __inet6_lookup(net, skb, doff, &ip6h->saddr, sport, &ip6h->daddr, ntohs(dport), iif, sdif, refcounted); } -struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, - struct sk_buff *skb, int doff, +struct sock *inet6_lookup(const struct net *net, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const __be16 dport, const int dif); -int inet6_hash(struct sock *sk); - -static inline bool inet6_match(struct net *net, const struct sock *sk, +static inline bool inet6_match(const struct net *net, const struct sock *sk, const struct in6_addr *saddr, const struct in6_addr *daddr, const __portpair ports, diff --git a/include/net/inet_common.h b/include/net/inet_common.h index f50a644d87a9..5dd2bf24449e 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -19,18 +19,17 @@ struct msghdr; struct net; struct page; struct sock; -struct sockaddr; struct socket; int inet_release(struct socket *sock); -int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, +int inet_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags); -int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, +int __inet_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags, int is_sendmsg); -int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, +int inet_dgram_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags); -int inet_accept(struct socket *sock, struct socket *newsock, int flags, - bool kern); +int inet_accept(struct socket *sock, struct socket *newsock, + struct proto_accept_arg *arg); void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *newsk); int inet_send_prepare(struct sock *sk); @@ -42,8 +41,8 @@ int inet_shutdown(struct socket *sock, int how); int inet_listen(struct socket *sock, int backlog); int __inet_listen_sk(struct sock *sk, int backlog); void inet_sock_destruct(struct sock *sk); -int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); -int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len); +int inet_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len); +int inet_bind_sk(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len); /* Don't allocate port at this moment, defer to connect. */ #define BIND_FORCE_ADDRESS_NO_PORT (1 << 0) /* Grab and release socket lock. */ @@ -52,7 +51,7 @@ int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len); #define BIND_FROM_BPF (1 << 2) /* Skip CAP_NET_BIND_SERVICE check. */ #define BIND_NO_CAP_NET_BIND_SERVICE (1 << 3) -int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, +int __inet_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len, u32 flags); int inet_getname(struct socket *sock, struct sockaddr *uaddr, int peer); diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 5d2fcc137b88..ecb362025c4e 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -44,13 +44,10 @@ struct inet_connection_sock_af_ops { struct request_sock *req_unhash, bool *own_req); u16 net_header_len; - u16 net_frag_header_len; - u16 sockaddr_len; int (*setsockopt)(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int (*getsockopt)(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); - void (*addr2sockaddr)(struct sock *sk, struct sockaddr *); void (*mtu_reduced)(struct sock *sk); }; @@ -59,15 +56,15 @@ struct inet_connection_sock_af_ops { * @icsk_accept_queue: FIFO of established children * @icsk_bind_hash: Bind node * @icsk_bind2_hash: Bind node in the bhash2 table - * @icsk_timeout: Timeout - * @icsk_retransmit_timer: Resend (no ack) + * @icsk_delack_timer: Delayed ACK timer + * @icsk_keepalive_timer: Keepalive timer + * @mptcp_tout_timer: mptcp timer * @icsk_rto: Retransmit timeout * @icsk_pmtu_cookie Last pmtu seen by socket * @icsk_ca_ops Pluggable congestion control hook * @icsk_af_ops Operations which are AF_INET{4,6} specific * @icsk_ulp_ops Pluggable ULP control hook * @icsk_ulp_data ULP private data - * @icsk_clean_acked Clean acked data hook * @icsk_ca_state: Congestion control state * @icsk_retransmits: Number of unrecovered [RTO] timeouts * @icsk_pending: Scheduled timer event @@ -86,18 +83,20 @@ struct inet_connection_sock { struct request_sock_queue icsk_accept_queue; struct inet_bind_bucket *icsk_bind_hash; struct inet_bind2_bucket *icsk_bind2_hash; - unsigned long icsk_timeout; - struct timer_list icsk_retransmit_timer; - struct timer_list icsk_delack_timer; + struct timer_list icsk_delack_timer; + union { + struct timer_list icsk_keepalive_timer; + struct timer_list mptcp_tout_timer; + }; __u32 icsk_rto; __u32 icsk_rto_min; + u32 icsk_rto_max; __u32 icsk_delack_max; __u32 icsk_pmtu_cookie; const struct tcp_congestion_ops *icsk_ca_ops; const struct inet_connection_sock_af_ops *icsk_af_ops; const struct tcp_ulp_ops *icsk_ulp_ops; void __rcu *icsk_ulp_data; - void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq); unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu); __u8 icsk_ca_state:5, icsk_ca_initialized:1, @@ -114,8 +113,11 @@ struct inet_connection_sock { __u8 quick; /* Scheduled number of quick acks */ __u8 pingpong; /* The session is interactive */ __u8 retry; /* Number of attempts */ - __u32 ato; /* Predicted tick of soft clock */ - unsigned long timeout; /* Currently scheduled timeout */ + #define ATO_BITS 8 + __u32 ato:ATO_BITS, /* Predicted tick of soft clock */ + lrcv_flowlabel:20, /* last received ipv6 flowlabel */ + dst_quick_ack:1, /* cache dst RTAX_QUICKACK */ + unused:3; __u32 lrcvtime; /* timestamp of last received data packet */ __u16 last_seg_size; /* Size of last incoming segment */ __u16 rcv_mss; /* MSS used for delayed ACK decisions */ @@ -145,10 +147,7 @@ struct inet_connection_sock { #define ICSK_TIME_LOSS_PROBE 5 /* Tail loss probe timer */ #define ICSK_TIME_REO_TIMEOUT 6 /* Reordering timer */ -static inline struct inet_connection_sock *inet_csk(const struct sock *sk) -{ - return (struct inet_connection_sock *)sk; -} +#define inet_csk(ptr) container_of_const(ptr, struct inet_connection_sock, icsk_inet.sk) static inline void *inet_csk_ca(const struct sock *sk) { @@ -173,6 +172,7 @@ void inet_csk_init_xmit_timers(struct sock *sk, void (*delack_handler)(struct timer_list *), void (*keepalive_handler)(struct timer_list *)); void inet_csk_clear_xmit_timers(struct sock *sk); +void inet_csk_clear_xmit_timers_sync(struct sock *sk); static inline void inet_csk_schedule_ack(struct sock *sk) { @@ -189,20 +189,28 @@ static inline void inet_csk_delack_init(struct sock *sk) memset(&inet_csk(sk)->icsk_ack, 0, sizeof(inet_csk(sk)->icsk_ack)); } -void inet_csk_delete_keepalive_timer(struct sock *sk); -void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long timeout); +static inline unsigned long tcp_timeout_expires(const struct sock *sk) +{ + return READ_ONCE(sk->tcp_retransmit_timer.expires); +} + +static inline unsigned long +icsk_delack_timeout(const struct inet_connection_sock *icsk) +{ + return READ_ONCE(icsk->icsk_delack_timer.expires); +} static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what) { struct inet_connection_sock *icsk = inet_csk(sk); if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0) { - icsk->icsk_pending = 0; + smp_store_release(&icsk->icsk_pending, 0); #ifdef INET_CSK_CLEAR_TIMERS - sk_stop_timer(sk, &icsk->icsk_retransmit_timer); + sk_stop_timer(sk, &sk->tcp_retransmit_timer); #endif } else if (what == ICSK_TIME_DACK) { - icsk->icsk_ack.pending = 0; + smp_store_release(&icsk->icsk_ack.pending, 0); icsk->icsk_ack.retry = 0; #ifdef INET_CSK_CLEAR_TIMERS sk_stop_timer(sk, &icsk->icsk_delack_timer); @@ -227,15 +235,15 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, when = max_when; } + when += jiffies; if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 || what == ICSK_TIME_LOSS_PROBE || what == ICSK_TIME_REO_TIMEOUT) { - icsk->icsk_pending = what; - icsk->icsk_timeout = jiffies + when; - sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); + smp_store_release(&icsk->icsk_pending, what); + sk_reset_timer(sk, &sk->tcp_retransmit_timer, when); } else if (what == ICSK_TIME_DACK) { - icsk->icsk_ack.pending |= ICSK_ACK_TIMER; - icsk->icsk_ack.timeout = jiffies + when; - sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); + smp_store_release(&icsk->icsk_ack.pending, + icsk->icsk_ack.pending | ICSK_ACK_TIMER); + sk_reset_timer(sk, &icsk->icsk_delack_timer, when); } else { pr_debug("inet_csk BUG: unknown timer value\n"); } @@ -250,7 +258,7 @@ inet_csk_rto_backoff(const struct inet_connection_sock *icsk, return (unsigned long)min_t(u64, when, max_when); } -struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern); +struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg); int inet_csk_get_port(struct sock *sk, unsigned short snum); @@ -263,8 +271,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, struct sock *inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req, struct sock *child); -void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, - unsigned long timeout); +bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req); struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child, struct request_sock *req, bool own_req); @@ -281,28 +288,14 @@ static inline int inet_csk_reqsk_queue_len(const struct sock *sk) static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk) { - return inet_csk_reqsk_queue_len(sk) >= sk->sk_max_ack_backlog; + return inet_csk_reqsk_queue_len(sk) > READ_ONCE(sk->sk_max_ack_backlog); } bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req); void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req); -static inline unsigned long -reqsk_timeout(struct request_sock *req, unsigned long max_timeout) -{ - u64 timeout = (u64)req->timeout << req->num_timeout; - - return (unsigned long)min_t(u64, timeout, max_timeout); -} - -static inline void inet_csk_prepare_for_destroy_sock(struct sock *sk) -{ - /* The below has to be done to allow calling inet_csk_destroy_sock */ - sock_set_flag(sk, SOCK_DEAD); - this_cpu_inc(*sk->sk_prot->orphan_count); -} - void inet_csk_destroy_sock(struct sock *sk); +void inet_csk_prepare_for_destroy_sock(struct sock *sk); void inet_csk_prepare_forced_close(struct sock *sk); /* @@ -317,19 +310,17 @@ static inline __poll_t inet_csk_listen_poll(const struct sock *sk) int inet_csk_listen_start(struct sock *sk); void inet_csk_listen_stop(struct sock *sk); -void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr); - /* update the fast reuse flag when adding a socket */ -void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, - struct sock *sk); +void inet_csk_update_fastreuse(const struct sock *sk, + struct inet_bind_bucket *tb, + struct inet_bind2_bucket *tb2); struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu); -#define TCP_PINGPONG_THRESH 1 - static inline void inet_csk_enter_pingpong_mode(struct sock *sk) { - inet_csk(sk)->icsk_ack.pingpong = TCP_PINGPONG_THRESH; + inet_csk(sk)->icsk_ack.pingpong = + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pingpong_thresh); } static inline void inet_csk_exit_pingpong_mode(struct sock *sk) @@ -339,7 +330,16 @@ static inline void inet_csk_exit_pingpong_mode(struct sock *sk) static inline bool inet_csk_in_pingpong_mode(struct sock *sk) { - return inet_csk(sk)->icsk_ack.pingpong >= TCP_PINGPONG_THRESH; + return inet_csk(sk)->icsk_ack.pingpong >= + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pingpong_thresh); +} + +static inline void inet_csk_inc_pingpong_cnt(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_ack.pingpong < U8_MAX) + icsk->icsk_ack.pingpong++; } static inline bool inet_csk_has_ulp(const struct sock *sk) @@ -347,4 +347,12 @@ static inline bool inet_csk_has_ulp(const struct sock *sk) return inet_test_bit(IS_ICSK, sk) && !!inet_csk(sk)->icsk_ulp_ops; } +static inline void inet_init_csk_locks(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + spin_lock_init(&icsk->icsk_accept_queue.rskq_lock); + spin_lock_init(&icsk->icsk_accept_queue.fastopenq.lock); +} + #endif /* _INET_CONNECTION_SOCK_H */ diff --git a/include/net/inet_dscp.h b/include/net/inet_dscp.h index 72f250dffada..1aa9f04ed1ab 100644 --- a/include/net/inet_dscp.h +++ b/include/net/inet_dscp.h @@ -39,6 +39,12 @@ typedef u8 __bitwise dscp_t; #define INET_DSCP_MASK 0xfc +/* A few places in the IPv4 code need to ignore the three high order bits of + * DSCP because of backward compatibility (as these bits used to represent the + * IPv4 Precedence in RFC 791's TOS field and were ignored). + */ +#define INET_DSCP_LEGACY_TOS_MASK ((__force dscp_t)0x1c) + static inline dscp_t inet_dsfield_to_dscp(__u8 dsfield) { return (__force dscp_t)(dsfield & INET_DSCP_MASK); diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 153960663ce4..0eccd9c3a883 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -76,7 +76,7 @@ struct frag_v6_compare_key { * @stamp: timestamp of the last received fragment * @len: total length of the original datagram * @meat: length of received fragments so far - * @mono_delivery_time: stamp has a mono delivery time (EDT) + * @tstamp_type: stamp has a mono delivery time (EDT) * @flags: fragment queue flags * @max_size: maximum received fragment size * @fqdir: pointer to struct fqdir @@ -97,7 +97,7 @@ struct inet_frag_queue { ktime_t stamp; int len; int meat; - u8 mono_delivery_time; + u8 tstamp_type; __u8 flags; u16 max_size; struct fqdir *fqdir; @@ -137,7 +137,7 @@ static inline void fqdir_pre_exit(struct fqdir *fqdir) } void fqdir_exit(struct fqdir *fqdir); -void inet_frag_kill(struct inet_frag_queue *q); +void inet_frag_kill(struct inet_frag_queue *q, int *refs); void inet_frag_destroy(struct inet_frag_queue *q); struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key); @@ -145,9 +145,9 @@ struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key); unsigned int inet_frag_rbtree_purge(struct rb_root *root, enum skb_drop_reason reason); -static inline void inet_frag_put(struct inet_frag_queue *q) +static inline void inet_frag_putn(struct inet_frag_queue *q, int refs) { - if (refcount_dec_and_test(&q->refcnt)) + if (refs && refcount_sub_and_test(refs, &q->refcnt)) inet_frag_destroy(q); } diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 3ecfeadbfa06..ac05a52d9e13 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -88,7 +88,8 @@ struct inet_bind_bucket { unsigned short fast_sk_family; bool fast_ipv6_only; struct hlist_node node; - struct hlist_head owners; + struct hlist_head bhash2; + struct rcu_head rcu; }; struct inet_bind2_bucket { @@ -96,22 +97,19 @@ struct inet_bind2_bucket { int l3mdev; unsigned short port; #if IS_ENABLED(CONFIG_IPV6) - unsigned short family; -#endif - union { -#if IS_ENABLED(CONFIG_IPV6) - struct in6_addr v6_rcv_saddr; + unsigned short addr_type; + struct in6_addr v6_rcv_saddr; +#define rcv_saddr v6_rcv_saddr.s6_addr32[3] +#else + __be32 rcv_saddr; #endif - __be32 rcv_saddr; - }; /* Node in the bhash2 inet_bind_hashbucket chain */ struct hlist_node node; + struct hlist_node bhash_node; /* List of sockets hashed to this bucket */ struct hlist_head owners; - /* bhash has twsk in owners, but bhash2 has twsk in - * deathrow not to add a member in struct sock_common. - */ - struct hlist_head deathrow; + signed char fastreuse; + signed char fastreuseport; }; static inline struct net *ib_net(const struct inet_bind_bucket *ib) @@ -179,14 +177,9 @@ struct inet_hashinfo { bool pernet; } ____cacheline_aligned_in_smp; -static inline struct inet_hashinfo *tcp_or_dccp_get_hashinfo(const struct sock *sk) +static inline struct inet_hashinfo *tcp_get_hashinfo(const struct sock *sk) { -#if IS_ENABLED(CONFIG_IP_DCCP) - return sk->sk_prot->h.hashinfo ? : - sock_net(sk)->ipv4.tcp_death_row.hashinfo; -#else return sock_net(sk)->ipv4.tcp_death_row.hashinfo; -#endif } static inline struct inet_listen_hashbucket * @@ -211,12 +204,6 @@ static inline spinlock_t *inet_ehash_lockp( int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo); -static inline void inet_hashinfo2_free_mod(struct inet_hashinfo *h) -{ - kfree(h->lhash2); - h->lhash2 = NULL; -} - static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo) { kvfree(hashinfo->ehash_locks); @@ -231,8 +218,7 @@ struct inet_bind_bucket * inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, const unsigned short snum, int l3mdev); -void inet_bind_bucket_destroy(struct kmem_cache *cachep, - struct inet_bind_bucket *tb); +void inet_bind_bucket_destroy(struct inet_bind_bucket *tb); bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net, unsigned short port, @@ -241,7 +227,7 @@ bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, struct inet_bind2_bucket * inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, - unsigned short port, int l3mdev, + struct inet_bind_bucket *tb, const struct sock *sk); void inet_bind2_bucket_destroy(struct kmem_cache *cachep, @@ -305,12 +291,10 @@ int inet_hashinfo2_init_mod(struct inet_hashinfo *h); bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk); bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk); -int __inet_hash(struct sock *sk, struct sock *osk); int inet_hash(struct sock *sk); void inet_unhash(struct sock *sk); -struct sock *__inet_lookup_listener(struct net *net, - struct inet_hashinfo *hashinfo, +struct sock *__inet_lookup_listener(const struct net *net, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, @@ -318,12 +302,12 @@ struct sock *__inet_lookup_listener(struct net *net, const int dif, const int sdif); static inline struct sock *inet_lookup_listener(struct net *net, - struct inet_hashinfo *hashinfo, - struct sk_buff *skb, int doff, - __be32 saddr, __be16 sport, - __be32 daddr, __be16 dport, int dif, int sdif) + struct sk_buff *skb, int doff, + __be32 saddr, __be16 sport, + __be32 daddr, __be16 dport, + int dif, int sdif) { - return __inet_lookup_listener(net, hashinfo, skb, doff, saddr, sport, + return __inet_lookup_listener(net, skb, doff, saddr, sport, daddr, ntohs(dport), dif, sdif); } @@ -356,7 +340,7 @@ static inline struct sock *inet_lookup_listener(struct net *net, ((__force __u64)(__be32)(__saddr))) #endif /* __BIG_ENDIAN */ -static inline bool inet_match(struct net *net, const struct sock *sk, +static inline bool inet_match(const struct net *net, const struct sock *sk, const __addrpair cookie, const __portpair ports, int dif, int sdif) { @@ -373,8 +357,7 @@ static inline bool inet_match(struct net *net, const struct sock *sk, /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need * not check it for lookups anymore, thanks Alexey. -DaveM */ -struct sock *__inet_lookup_established(struct net *net, - struct inet_hashinfo *hashinfo, +struct sock *__inet_lookup_established(const struct net *net, const __be32 saddr, const __be16 sport, const __be32 daddr, const u16 hnum, const int dif, const int sdif); @@ -387,31 +370,29 @@ inet_ehashfn_t inet_ehashfn; INDIRECT_CALLABLE_DECLARE(inet_ehashfn_t udp_ehashfn); -struct sock *inet_lookup_reuseport(struct net *net, struct sock *sk, +struct sock *inet_lookup_reuseport(const struct net *net, struct sock *sk, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, unsigned short hnum, inet_ehashfn_t *ehashfn); -struct sock *inet_lookup_run_sk_lookup(struct net *net, +struct sock *inet_lookup_run_sk_lookup(const struct net *net, int protocol, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, u16 hnum, const int dif, inet_ehashfn_t *ehashfn); -static inline struct sock * - inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, - const __be32 saddr, const __be16 sport, - const __be32 daddr, const __be16 dport, - const int dif) +static inline struct sock *inet_lookup_established(struct net *net, + const __be32 saddr, const __be16 sport, + const __be32 daddr, const __be16 dport, + const int dif) { - return __inet_lookup_established(net, hashinfo, saddr, sport, daddr, + return __inet_lookup_established(net, saddr, sport, daddr, ntohs(dport), dif, 0); } static inline struct sock *__inet_lookup(struct net *net, - struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, @@ -421,18 +402,17 @@ static inline struct sock *__inet_lookup(struct net *net, u16 hnum = ntohs(dport); struct sock *sk; - sk = __inet_lookup_established(net, hashinfo, saddr, sport, + sk = __inet_lookup_established(net, saddr, sport, daddr, hnum, dif, sdif); *refcounted = true; if (sk) return sk; *refcounted = false; - return __inet_lookup_listener(net, hashinfo, skb, doff, saddr, + return __inet_lookup_listener(net, skb, doff, saddr, sport, daddr, hnum, dif, sdif); } static inline struct sock *inet_lookup(struct net *net, - struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, @@ -441,7 +421,7 @@ static inline struct sock *inet_lookup(struct net *net, struct sock *sk; bool refcounted; - sk = __inet_lookup(net, hashinfo, skb, doff, saddr, sport, daddr, + sk = __inet_lookup(net, skb, doff, saddr, sport, daddr, dport, dif, 0, &refcounted); if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt)) @@ -489,15 +469,14 @@ struct sock *inet_steal_sock(struct net *net, struct sk_buff *skb, int doff, return reuse_sk; } -static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, - struct sk_buff *skb, +static inline struct sock *__inet_lookup_skb(struct sk_buff *skb, int doff, const __be16 sport, const __be16 dport, const int sdif, bool *refcounted) { - struct net *net = dev_net(skb_dst(skb)->dev); + struct net *net = skb_dst_dev_net_rcu(skb); const struct iphdr *iph = ip_hdr(skb); struct sock *sk; @@ -508,8 +487,7 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, if (sk) return sk; - return __inet_lookup(net, hashinfo, skb, - doff, iph->saddr, sport, + return __inet_lookup(net, skb, doff, iph->saddr, sport, iph->daddr, dport, inet_iif(skb), sdif, refcounted); } @@ -532,9 +510,12 @@ static inline void sk_rcv_saddr_set(struct sock *sk, __be32 addr) int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk, u64 port_offset, + u32 hash_port0, int (*check_established)(struct inet_timewait_death_row *, struct sock *, __u16, - struct inet_timewait_sock **)); + struct inet_timewait_sock **, + bool rcu_lookup, + u32 hash)); int inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk); diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 2de0e4d4a027..ac1c75975908 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -19,6 +19,7 @@ #include <linux/netdevice.h> #include <net/flow.h> +#include <net/inet_dscp.h> #include <net/sock.h> #include <net/request_sock.h> #include <net/netns/hash.h> @@ -150,7 +151,8 @@ static inline bool inet_bound_dev_eq(bool l3mdev_accept, int bound_dev_if, return bound_dev_if == dif || bound_dev_if == sdif; } -static inline bool inet_sk_bound_dev_eq(struct net *net, int bound_dev_if, +static inline bool inet_sk_bound_dev_eq(const struct net *net, + int bound_dev_if, int dif, int sdif) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) @@ -171,8 +173,9 @@ struct inet_cork { u8 tx_flags; __u8 ttl; __s16 tos; - char priority; + u32 priority; __u16 gso_size; + u32 ts_opt_id; u64 transmit_time; u32 mark; }; @@ -211,6 +214,7 @@ struct inet_sock { struct sock sk; #if IS_ENABLED(CONFIG_IPV6) struct ipv6_pinfo *pinet6; + struct ipv6_fl_socklist __rcu *ipv6_fl_list; #endif /* Socket demultiplex comparisons on incoming packets. */ #define inet_daddr sk.__sk_common.skc_daddr @@ -234,17 +238,14 @@ struct inet_sock { int uc_index; int mc_index; __be32 mc_addr; - struct { - __u16 lo; - __u16 hi; - } local_port_range; + u32 local_port_range; /* high << 16 | low */ struct ip_mc_socklist __rcu *mc_list; struct inet_cork_full cork; }; -#define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */ -#define IPCORK_ALLFRAG 2 /* always fragment (for ipv6 for now) */ +#define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */ +#define IPCORK_TS_OPT_ID 2 /* ts_opt_id field is valid, overriding sk_tskey */ enum { INET_FLAGS_PKTINFO = 0, @@ -268,6 +269,17 @@ enum { INET_FLAGS_NODEFRAG = 17, INET_FLAGS_BIND_ADDRESS_NO_PORT = 18, INET_FLAGS_DEFER_CONNECT = 19, + INET_FLAGS_MC6_LOOP = 20, + INET_FLAGS_RECVERR6_RFC4884 = 21, + INET_FLAGS_MC6_ALL = 22, + INET_FLAGS_AUTOFLOWLABEL_SET = 23, + INET_FLAGS_AUTOFLOWLABEL = 24, + INET_FLAGS_DONTFRAG = 25, + INET_FLAGS_RECVERR6 = 26, + INET_FLAGS_REPFLOW = 27, + INET_FLAGS_RTALERT_ISOLATE = 28, + INET_FLAGS_SNDFLOW = 29, + INET_FLAGS_RTALERT = 30, }; /* cmsg flags for inet */ @@ -292,6 +304,11 @@ static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet) return READ_ONCE(inet->inet_flags) & IP_CMSG_ALL; } +static inline dscp_t inet_sk_dscp(const struct inet_sock *inet) +{ + return inet_dsfield_to_dscp(READ_ONCE(inet->tos)); +} + #define inet_test_bit(nr, sk) \ test_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags) #define inet_set_bit(nr, sk) \ @@ -301,11 +318,6 @@ static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet) #define inet_assign_bit(nr, sk, val) \ assign_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags, val) -static inline bool sk_is_inet(struct sock *sk) -{ - return sk->sk_family == AF_INET || sk->sk_family == AF_INET6; -} - /** * sk_to_full_sk - Access to a full socket * @sk: pointer to a socket @@ -316,8 +328,10 @@ static inline bool sk_is_inet(struct sock *sk) static inline struct sock *sk_to_full_sk(struct sock *sk) { #ifdef CONFIG_INET - if (sk && sk->sk_state == TCP_NEW_SYN_RECV) + if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV) sk = inet_reqsk(sk)->rsk_listener; + if (sk && READ_ONCE(sk->sk_state) == TCP_TIME_WAIT) + sk = NULL; #endif return sk; } @@ -326,8 +340,10 @@ static inline struct sock *sk_to_full_sk(struct sock *sk) static inline const struct sock *sk_const_to_full_sk(const struct sock *sk) { #ifdef CONFIG_INET - if (sk && sk->sk_state == TCP_NEW_SYN_RECV) + if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV) sk = ((const struct request_sock *)sk)->rsk_listener; + if (sk && READ_ONCE(sk->sk_state) == TCP_TIME_WAIT) + sk = NULL; #endif return sk; } @@ -339,14 +355,6 @@ static inline struct sock *skb_to_full_sk(const struct sk_buff *skb) #define inet_sk(ptr) container_of_const(ptr, struct inet_sock, sk) -static inline void __inet_sk_copy_descendant(struct sock *sk_to, - const struct sock *sk_from, - const int ancestor_size) -{ - memcpy(inet_sk(sk_to) + 1, inet_sk(sk_from) + 1, - sk_from->sk_prot->obj_size - ancestor_size); -} - int inet_sk_rebuild_header(struct sock *sk); /** diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 4a8e578405cb..63a644ff30de 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -45,6 +45,8 @@ struct inet_timewait_sock { #define tw_node __tw_common.skc_nulls_node #define tw_bind_node __tw_common.skc_bind_node #define tw_refcnt __tw_common.skc_refcnt +#define tw_tx_queue_mapping __tw_common.skc_tx_queue_mapping +#define tw_rx_queue_mapping __tw_common.skc_rx_queue_mapping #define tw_hash __tw_common.skc_hash #define tw_prot __tw_common.skc_prot #define tw_net __tw_common.skc_net @@ -58,7 +60,7 @@ struct inet_timewait_sock { #define tw_dr __tw_common.skc_tw_dr __u32 tw_mark; - volatile unsigned char tw_substate; + unsigned char tw_substate; unsigned char tw_rcv_wscale; /* Socket demultiplex comparisons on incoming packets. */ @@ -67,20 +69,30 @@ struct inet_timewait_sock { /* And these are ours. */ unsigned int tw_transparent : 1, tw_flowlabel : 20, - tw_pad : 3, /* 3 bits hole */ + tw_usec_ts : 1, + tw_connect_bind : 1, + tw_pad : 1, /* 1 bit hole */ tw_tos : 8; u32 tw_txhash; u32 tw_priority; + /** + * @tw_reuse_stamp: Time of entry into %TCP_TIME_WAIT state in msec. + */ + u32 tw_entry_stamp; struct timer_list tw_timer; struct inet_bind_bucket *tw_tb; struct inet_bind2_bucket *tw_tb2; - struct hlist_node tw_bind2_node; +#if IS_ENABLED(CONFIG_INET_PSP) + struct psp_assoc __rcu *psp_assoc; +#endif +#ifdef CONFIG_SOCK_VALIDATE_XMIT + struct sk_buff* (*tw_validate_xmit_skb)(struct sock *sk, + struct net_device *dev, + struct sk_buff *skb); +#endif }; #define tw_tclass tw_tos -#define twsk_for_each_bound_bhash2(__tw, list) \ - hlist_for_each_entry(__tw, list, tw_bind2_node) - static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk) { return (struct inet_timewait_sock *)sk; @@ -96,17 +108,14 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, struct inet_timewait_death_row *dr, const int state); -void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, - struct inet_hashinfo *hashinfo); +void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw, + struct sock *sk, + struct inet_hashinfo *hashinfo, + int timeo); void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm); -static inline void inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo) -{ - __inet_twsk_schedule(tw, timeo, false); -} - static inline void inet_twsk_reschedule(struct inet_timewait_sock *tw, int timeo) { __inet_twsk_schedule(tw, timeo, true); @@ -114,7 +123,7 @@ static inline void inet_twsk_reschedule(struct inet_timewait_sock *tw, int timeo void inet_twsk_deschedule_put(struct inet_timewait_sock *tw); -void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family); +void inet_twsk_purge(struct inet_hashinfo *hashinfo); static inline struct net *twsk_net(const struct inet_timewait_sock *twsk) diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index 74ff688568a0..f475757daafb 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -96,30 +96,28 @@ static inline struct in6_addr *inetpeer_get_addr_v6(struct inetpeer_addr *iaddr) /* can be called with or without local BH being disabled */ struct inet_peer *inet_getpeer(struct inet_peer_base *base, - const struct inetpeer_addr *daddr, - int create); + const struct inetpeer_addr *daddr); static inline struct inet_peer *inet_getpeer_v4(struct inet_peer_base *base, __be32 v4daddr, - int vif, int create) + int vif) { struct inetpeer_addr daddr; daddr.a4.addr = v4daddr; daddr.a4.vif = vif; daddr.family = AF_INET; - return inet_getpeer(base, &daddr, create); + return inet_getpeer(base, &daddr); } static inline struct inet_peer *inet_getpeer_v6(struct inet_peer_base *base, - const struct in6_addr *v6daddr, - int create) + const struct in6_addr *v6daddr) { struct inetpeer_addr daddr; daddr.a6 = *v6daddr; daddr.family = AF_INET6; - return inet_getpeer(base, &daddr, create); + return inet_getpeer(base, &daddr); } static inline int inetpeer_addr_cmp(const struct inetpeer_addr *a, diff --git a/include/net/ioam6.h b/include/net/ioam6.h index 781d2d8b2f29..2cbbee6e806a 100644 --- a/include/net/ioam6.h +++ b/include/net/ioam6.h @@ -12,6 +12,7 @@ #include <linux/net.h> #include <linux/ipv6.h> #include <linux/ioam6.h> +#include <linux/ioam6_genl.h> #include <linux/rhashtable-types.h> struct ioam6_namespace { @@ -65,4 +66,7 @@ void ioam6_exit(void); int ioam6_iptunnel_init(void); void ioam6_iptunnel_exit(void); +void ioam6_event(enum ioam6_event_type type, struct net *net, gfp_t gfp, + void *opt, unsigned int opt_len); + #endif /* _NET_IOAM6_H */ diff --git a/include/net/ip.h b/include/net/ip.h index 19adacd5ece0..69d5cef46004 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -33,6 +33,7 @@ #include <net/flow_dissector.h> #include <net/netns/hash.h> #include <net/lwtunnel.h> +#include <net/inet_dscp.h> #define IPV4_MAX_PMTU 65535U /* RFC 2675, Section 5.1 */ #define IPV4_MIN_MTU 68 /* RFC 791 */ @@ -57,6 +58,8 @@ struct inet_skb_parm { #define IPSKB_FRAG_PMTU BIT(6) #define IPSKB_L3SLAVE BIT(7) #define IPSKB_NOPOLICY BIT(8) +#define IPSKB_MULTIPATH BIT(9) +#define IPSKB_MCROUTE BIT(10) u16 frag_max_size; }; @@ -79,7 +82,6 @@ struct ipcm_cookie { __u8 protocol; __u8 ttl; __s16 tos; - char priority; __u16 gso_size; }; @@ -91,10 +93,12 @@ static inline void ipcm_init(struct ipcm_cookie *ipcm) static inline void ipcm_init_sk(struct ipcm_cookie *ipcm, const struct inet_sock *inet) { - ipcm_init(ipcm); + *ipcm = (struct ipcm_cookie) { + .tos = READ_ONCE(inet->tos), + }; + + sockcm_init(&ipcm->sockc, &inet->sk); - ipcm->sockc.mark = READ_ONCE(inet->sk.sk_mark); - ipcm->sockc.tsflags = inet->sk.sk_tsflags; ipcm->oif = READ_ONCE(inet->sk.sk_bound_dev_if); ipcm->addr = inet->inet_saddr; ipcm->protocol = inet->inet_num; @@ -164,6 +168,7 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt, int ip_local_deliver(struct sk_buff *skb); void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int proto); int ip_mr_input(struct sk_buff *skb); +int ip_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb); int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb); int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb); int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, @@ -255,14 +260,9 @@ static inline u8 ip_sendmsg_scope(const struct inet_sock *inet, return RT_SCOPE_UNIVERSE; } -static inline __u8 get_rttos(struct ipcm_cookie* ipc, struct inet_sock *inet) -{ - return (ipc->tos != -1) ? RT_TOS(ipc->tos) : RT_TOS(inet->tos); -} - /* datagram.c */ -int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); -int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); +int __ip4_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len); +int ip4_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len); void ip4_datagram_release_cb(struct sock *sk); @@ -284,7 +284,8 @@ static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg) return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0; } -void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, +void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk, + struct sk_buff *skb, const struct ip_options *sopt, __be32 daddr, __be32 saddr, const struct ip_reply_arg *arg, @@ -325,11 +326,12 @@ static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_o } #endif -#define snmp_get_cpu_field64_batch(buff64, stats_list, mib_statistic, offset) \ +#define snmp_get_cpu_field64_batch_cnt(buff64, stats_list, cnt, \ + mib_statistic, offset) \ { \ int i, c; \ for_each_possible_cpu(c) { \ - for (i = 0; stats_list[i].name; i++) \ + for (i = 0; i < cnt; i++) \ buff64[i] += snmp_get_cpu_field64( \ mib_statistic, \ c, stats_list[i].entry, \ @@ -337,22 +339,28 @@ static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_o } \ } -#define snmp_get_cpu_field_batch(buff, stats_list, mib_statistic) \ +#define snmp_get_cpu_field_batch_cnt(buff, stats_list, cnt, mib_statistic) \ { \ int i, c; \ for_each_possible_cpu(c) { \ - for (i = 0; stats_list[i].name; i++) \ + for (i = 0; i < cnt; i++) \ buff[i] += snmp_get_cpu_field( \ mib_statistic, \ c, stats_list[i].entry); \ } \ } -void inet_get_local_port_range(const struct net *net, int *low, int *high); -void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high); +static inline void inet_get_local_port_range(const struct net *net, int *low, int *high) +{ + u32 range = READ_ONCE(net->ipv4.ip_local_ports.range); + + *low = range & 0xffff; + *high = range >> 16; +} +bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high); #ifdef CONFIG_SYSCTL -static inline bool inet_is_local_reserved_port(struct net *net, unsigned short port) +static inline bool inet_is_local_reserved_port(const struct net *net, unsigned short port) { if (!net->ipv4.sysctl_local_reserved_ports) return false; @@ -414,9 +422,14 @@ int ip_decrease_ttl(struct iphdr *iph) return --iph->ttl; } +static inline dscp_t ip4h_dscp(const struct iphdr *ip4h) +{ + return inet_dsfield_to_dscp(ip4h->tos); +} + static inline int ip_mtu_locked(const struct dst_entry *dst) { - const struct rtable *rt = (const struct rtable *)dst; + const struct rtable *rt = dst_rtable(dst); return rt->rt_mtu_locked || dst_metric_locked(dst, RTAX_MTU); } @@ -433,33 +446,41 @@ int ip_dont_fragment(const struct sock *sk, const struct dst_entry *dst) static inline bool ip_sk_accept_pmtu(const struct sock *sk) { - return inet_sk(sk)->pmtudisc != IP_PMTUDISC_INTERFACE && - inet_sk(sk)->pmtudisc != IP_PMTUDISC_OMIT; + u8 pmtudisc = READ_ONCE(inet_sk(sk)->pmtudisc); + + return pmtudisc != IP_PMTUDISC_INTERFACE && + pmtudisc != IP_PMTUDISC_OMIT; } static inline bool ip_sk_use_pmtu(const struct sock *sk) { - return inet_sk(sk)->pmtudisc < IP_PMTUDISC_PROBE; + return READ_ONCE(inet_sk(sk)->pmtudisc) < IP_PMTUDISC_PROBE; } static inline bool ip_sk_ignore_df(const struct sock *sk) { - return inet_sk(sk)->pmtudisc < IP_PMTUDISC_DO || - inet_sk(sk)->pmtudisc == IP_PMTUDISC_OMIT; + u8 pmtudisc = READ_ONCE(inet_sk(sk)->pmtudisc); + + return pmtudisc < IP_PMTUDISC_DO || pmtudisc == IP_PMTUDISC_OMIT; } static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, bool forwarding) { - const struct rtable *rt = container_of(dst, struct rtable, dst); - struct net *net = dev_net(dst->dev); - unsigned int mtu; + const struct rtable *rt = dst_rtable(dst); + const struct net_device *dev; + unsigned int mtu, res; + struct net *net; + + rcu_read_lock(); + dev = dst_dev_rcu(dst); + net = dev_net_rcu(dev); if (READ_ONCE(net->ipv4.sysctl_ip_fwd_use_pmtu) || ip_mtu_locked(dst) || !forwarding) { mtu = rt->rt_pmtu; - if (mtu && time_before(jiffies, rt->dst.expires)) + if (mtu && time_before(jiffies, READ_ONCE(rt->dst.expires))) goto out; } @@ -468,7 +489,7 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, if (mtu) goto out; - mtu = READ_ONCE(dst->dev->mtu); + mtu = READ_ONCE(dev->mtu); if (unlikely(ip_mtu_locked(dst))) { if (rt->rt_uses_gateway && mtu > 576) @@ -478,26 +499,30 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, out: mtu = min_t(unsigned int, mtu, IP_MAX_MTU); - return mtu - lwtunnel_headroom(dst->lwtstate, mtu); + res = mtu - lwtunnel_headroom(dst->lwtstate, mtu); + + rcu_read_unlock(); + + return res; } static inline unsigned int ip_skb_dst_mtu(struct sock *sk, const struct sk_buff *skb) { + const struct dst_entry *dst = skb_dst(skb); unsigned int mtu; if (!sk || !sk_fullsock(sk) || ip_sk_use_pmtu(sk)) { bool forwarding = IPCB(skb)->flags & IPSKB_FORWARDED; - return ip_dst_mtu_maybe_forward(skb_dst(skb), forwarding); + return ip_dst_mtu_maybe_forward(dst, forwarding); } - mtu = min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU); - return mtu - lwtunnel_headroom(skb_dst(skb)->lwtstate, mtu); + mtu = min(READ_ONCE(dst_dev(dst)->mtu), IP_MAX_MTU); + return mtu - lwtunnel_headroom(dst->lwtstate, mtu); } -struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx, - int fc_mx_len, +struct dst_metrics *ip_fib_metrics_init(struct nlattr *fc_mx, int fc_mx_len, struct netlink_ext_ack *extack); static inline void ip_fib_metrics_put(struct dst_metrics *fib_metrics) { @@ -667,11 +692,24 @@ static __inline__ void inet_reset_saddr(struct sock *sk) #endif +#if IS_MODULE(CONFIG_IPV6) +#define EXPORT_IPV6_MOD(X) EXPORT_SYMBOL(X) +#define EXPORT_IPV6_MOD_GPL(X) EXPORT_SYMBOL_GPL(X) +#else +#define EXPORT_IPV6_MOD(X) +#define EXPORT_IPV6_MOD_GPL(X) +#endif + static inline unsigned int ipv4_addr_hash(__be32 ip) { return (__force unsigned int) ip; } +static inline u32 __ipv4_addr_hash(const __be32 ip, const u32 initval) +{ + return jhash_1word((__force u32)ip, initval); +} + static inline u32 ipv4_portaddr_hash(const struct net *net, __be32 saddr, unsigned int port) @@ -757,7 +795,7 @@ int ip_options_rcv_srr(struct sk_buff *skb, struct net_device *dev); * Functions provided by ip_sockglue.c */ -void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb); +void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb, bool drop_dst); void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, int tlen, int offset); int ip_cmsg_send(struct sock *sk, struct msghdr *msg, @@ -785,9 +823,8 @@ static inline void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) ip_cmsg_recv_offset(msg, skb->sk, skb, 0, 0); } -bool icmp_global_allow(void); -extern int sysctl_icmp_msgs_per_sec; -extern int sysctl_icmp_msgs_burst; +bool icmp_global_allow(struct net *net); +void icmp_global_consume(struct net *net); #ifdef CONFIG_PROC_FS int ip_misc_proc_init(void); diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index c9ff23cf313e..88b0dd4d8e09 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -30,12 +30,6 @@ #define RT6_DEBUG 2 -#if RT6_DEBUG >= 3 -#define RT6_TRACE(x...) pr_debug(x) -#else -#define RT6_TRACE(x...) do { ; } while (0) -#endif - struct rt6_info; struct fib6_info; @@ -204,6 +198,7 @@ struct fib6_info { fib6_destroying:1, unused:4; + struct list_head purge_link; struct rcu_head rcu; struct nexthop *nh; struct fib6_nh fib6_nh[]; @@ -240,9 +235,11 @@ struct fib6_result { for (rt = (w)->leaf; rt; \ rt = rcu_dereference_protected(rt->fib6_next, 1)) -static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) +#define dst_rt6_info(_ptr) container_of_const(_ptr, struct rt6_info, dst) + +static inline struct inet6_dev *ip6_dst_idev(const struct dst_entry *dst) { - return ((struct rt6_info *)dst)->rt6i_idev; + return dst_rt6_info(dst)->rt6i_idev; } static inline bool fib6_requires_src(const struct fib6_info *rt) @@ -250,6 +247,25 @@ static inline bool fib6_requires_src(const struct fib6_info *rt) return rt->fib6_src.plen > 0; } +/* The callers should hold f6i->fib6_table->tb6_lock if a route has ever + * been added to a table before. + */ +static inline void fib6_clean_expires(struct fib6_info *f6i) +{ + f6i->fib6_flags &= ~RTF_EXPIRES; + f6i->expires = 0; +} + +/* The callers should hold f6i->fib6_table->tb6_lock if a route has ever + * been added to a table before. + */ +static inline void fib6_set_expires(struct fib6_info *f6i, + unsigned long expires) +{ + f6i->expires = expires; + f6i->fib6_flags |= RTF_EXPIRES; +} + static inline bool fib6_check_expired(const struct fib6_info *f6i) { if (f6i->fib6_flags & RTF_EXPIRES) @@ -257,11 +273,6 @@ static inline bool fib6_check_expired(const struct fib6_info *f6i) return false; } -static inline bool fib6_has_expires(const struct fib6_info *f6i) -{ - return f6i->fib6_flags & RTF_EXPIRES; -} - /* Function to safely get fn->fn_sernum for passed in rt * and store result in passed in cookie. * Return true if we can get cookie safely @@ -328,8 +339,10 @@ static inline bool fib6_info_hold_safe(struct fib6_info *f6i) static inline void fib6_info_release(struct fib6_info *f6i) { - if (f6i && refcount_dec_and_test(&f6i->fib6_ref)) - call_rcu(&f6i->rcu, fib6_info_destroy_rcu); + if (f6i && refcount_dec_and_test(&f6i->fib6_ref)) { + DEBUG_NET_WARN_ON_ONCE(!hlist_unhashed(&f6i->gc_link)); + call_rcu_hurry(&f6i->rcu, fib6_info_destroy_rcu); + } } enum fib6_walk_state { @@ -382,7 +395,7 @@ struct fib6_table { struct fib6_node tb6_root; struct inet_peer_base tb6_peers; unsigned int flags; - unsigned int fib_seq; + unsigned int fib_seq; /* writes protected by rtnl_mutex */ struct hlist_head tb6_gc_hlist; /* GC candidates */ #define RT6_TABLE_HAS_DFLT_ROUTER BIT(0) }; @@ -500,46 +513,36 @@ void fib6_gc_cleanup(void); int fib6_init(void); -/* fib6_info must be locked by the caller, and fib6_info->fib6_table can be - * NULL. +/* Add the route to the gc list if it is not already there + * + * The callers should hold f6i->fib6_table->tb6_lock. */ -static inline void fib6_set_expires_locked(struct fib6_info *f6i, - unsigned long expires) +static inline void fib6_add_gc_list(struct fib6_info *f6i) { - struct fib6_table *tb6; + /* If fib6_node is null, the f6i is not in (or removed from) the + * table. + * + * There is a gap between finding the f6i from the table and + * calling this function without the protection of the tb6_lock. + * This check makes sure the f6i is not added to the gc list when + * it is not on the table. + */ + if (!rcu_dereference_protected(f6i->fib6_node, + lockdep_is_held(&f6i->fib6_table->tb6_lock))) + return; - tb6 = f6i->fib6_table; - f6i->expires = expires; - if (tb6 && !fib6_has_expires(f6i)) - hlist_add_head(&f6i->gc_link, &tb6->tb6_gc_hlist); - f6i->fib6_flags |= RTF_EXPIRES; + if (hlist_unhashed(&f6i->gc_link)) + hlist_add_head(&f6i->gc_link, &f6i->fib6_table->tb6_gc_hlist); } -/* fib6_info must be locked by the caller, and fib6_info->fib6_table can be - * NULL. If fib6_table is NULL, the fib6_info will no be inserted into the - * list of GC candidates until it is inserted into a table. +/* Remove the route from the gc list if it is on the list. + * + * The callers should hold f6i->fib6_table->tb6_lock. */ -static inline void fib6_set_expires(struct fib6_info *f6i, - unsigned long expires) +static inline void fib6_remove_gc_list(struct fib6_info *f6i) { - spin_lock_bh(&f6i->fib6_table->tb6_lock); - fib6_set_expires_locked(f6i, expires); - spin_unlock_bh(&f6i->fib6_table->tb6_lock); -} - -static inline void fib6_clean_expires_locked(struct fib6_info *f6i) -{ - if (fib6_has_expires(f6i)) + if (!hlist_unhashed(&f6i->gc_link)) hlist_del_init(&f6i->gc_link); - f6i->fib6_flags &= ~RTF_EXPIRES; - f6i->expires = 0; -} - -static inline void fib6_clean_expires(struct fib6_info *f6i) -{ - spin_lock_bh(&f6i->fib6_table->tb6_lock); - fib6_clean_expires_locked(f6i); - spin_unlock_bh(&f6i->fib6_table->tb6_lock); } struct ipv6_route_iter { @@ -561,7 +564,7 @@ int call_fib6_notifiers(struct net *net, enum fib_event_type event_type, int __net_init fib6_notifier_init(struct net *net); void __net_exit fib6_notifier_exit(struct net *net); -unsigned int fib6_tables_seq_read(struct net *net); +unsigned int fib6_tables_seq_read(const struct net *net); int fib6_tables_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); @@ -630,7 +633,7 @@ void fib6_rules_cleanup(void); bool fib6_rule_default(const struct fib_rule *rule); int fib6_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); -unsigned int fib6_rules_seq_read(struct net *net); +unsigned int fib6_rules_seq_read(const struct net *net); static inline bool fib6_rules_early_flow_dissect(struct net *net, struct sk_buff *skb, @@ -642,7 +645,10 @@ static inline bool fib6_rules_early_flow_dissect(struct net *net, if (!net->ipv6.fib6_rules_require_fldissect) return false; - skb_flow_dissect_flow_keys(skb, flkeys, flag); + memset(flkeys, 0, sizeof(*flkeys)); + __skb_flow_dissect(net, skb, &flow_keys_dissector, + flkeys, NULL, 0, 0, 0, flag); + fl6->fl6_sport = flkeys->ports.src; fl6->fl6_dport = flkeys->ports.dst; fl6->flowi6_proto = flkeys->basic.ip_proto; @@ -671,7 +677,7 @@ static inline int fib6_rules_dump(struct net *net, struct notifier_block *nb, { return 0; } -static inline unsigned int fib6_rules_seq_read(struct net *net) +static inline unsigned int fib6_rules_seq_read(const struct net *net) { return 0; } diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index b32539bb0fb0..7c5512baa4b2 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -53,13 +53,12 @@ struct route_info { */ static inline int rt6_srcprefs2flags(unsigned int srcprefs) { - /* No need to bitmask because srcprefs have only 3 bits. */ - return srcprefs << 3; + return (srcprefs & IPV6_PREFER_SRC_MASK) << 3; } static inline unsigned int rt6_flags2srcprefs(int flags) { - return (flags >> 3) & 7; + return (flags >> 3) & IPV6_PREFER_SRC_MASK; } static inline bool rt6_need_strict(const struct in6_addr *daddr) @@ -128,18 +127,26 @@ void rt6_age_exceptions(struct fib6_info *f6i, struct fib6_gc_args *gc_args, static inline int ip6_route_get_saddr(struct net *net, struct fib6_info *f6i, const struct in6_addr *daddr, - unsigned int prefs, + unsigned int prefs, int l3mdev_index, struct in6_addr *saddr) { + struct net_device *l3mdev; + struct net_device *dev; + bool same_vrf; int err = 0; - if (f6i && f6i->fib6_prefsrc.plen) { + rcu_read_lock(); + + l3mdev = dev_get_by_index_rcu(net, l3mdev_index); + if (!f6i || !f6i->fib6_prefsrc.plen || l3mdev) + dev = f6i ? fib6_info_nh_dev(f6i) : NULL; + same_vrf = !l3mdev || l3mdev_master_dev_rcu(dev) == l3mdev; + if (f6i && f6i->fib6_prefsrc.plen && same_vrf) *saddr = f6i->fib6_prefsrc.addr; - } else { - struct net_device *dev = f6i ? fib6_info_nh_dev(f6i) : NULL; + else + err = ipv6_dev_get_saddr(net, same_vrf ? dev : l3mdev, daddr, prefs, saddr); - err = ipv6_dev_get_saddr(net, dev, daddr, prefs, saddr); - } + rcu_read_unlock(); return err; } @@ -171,7 +178,8 @@ struct fib6_info *rt6_get_dflt_router(struct net *net, struct fib6_info *rt6_add_dflt_router(struct net *net, const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref, - u32 defrtr_usr_metric); + u32 defrtr_usr_metric, + int lifetime); void rt6_purge_dflt_routers(struct net *net); @@ -210,28 +218,27 @@ void rt6_uncached_list_del(struct rt6_info *rt); static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb) { const struct dst_entry *dst = skb_dst(skb); - const struct rt6_info *rt6 = NULL; if (dst) - rt6 = container_of(dst, struct rt6_info, dst); + return dst_rt6_info(dst); - return rt6; + return NULL; } /* * Store a destination cache entry in a socket */ static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, - const struct in6_addr *daddr, - const struct in6_addr *saddr) + bool daddr_set, + bool saddr_set) { struct ipv6_pinfo *np = inet6_sk(sk); - np->dst_cookie = rt6_get_cookie((struct rt6_info *)dst); + np->dst_cookie = rt6_get_cookie(dst_rt6_info(dst)); sk_setup_caps(sk, dst); - np->daddr_cache = daddr; + np->daddr_cache = daddr_set; #ifdef CONFIG_IPV6_SUBTREES - np->saddr_cache = saddr; + np->saddr_cache = saddr_set; #endif } @@ -240,7 +247,7 @@ void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, static inline bool ipv6_unicast_destination(const struct sk_buff *skb) { - struct rt6_info *rt = (struct rt6_info *) skb_dst(skb); + const struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); return rt->rt6i_flags & RTF_LOCAL; } @@ -248,7 +255,7 @@ static inline bool ipv6_unicast_destination(const struct sk_buff *skb) static inline bool ipv6_anycast_destination(const struct dst_entry *dst, const struct in6_addr *daddr) { - struct rt6_info *rt = (struct rt6_info *)dst; + const struct rt6_info *rt = dst_rt6_info(dst); return rt->rt6i_flags & RTF_ANYCAST || (rt->rt6i_dst.plen < 127 && @@ -266,8 +273,8 @@ static inline unsigned int ip6_skb_dst_mtu(const struct sk_buff *skb) const struct dst_entry *dst = skb_dst(skb); unsigned int mtu; - if (np && np->pmtudisc >= IPV6_PMTUDISC_PROBE) { - mtu = READ_ONCE(dst->dev->mtu); + if (np && READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE) { + mtu = READ_ONCE(dst_dev(dst)->mtu); mtu -= lwtunnel_headroom(dst->lwtstate, mtu); } else { mtu = dst_mtu(dst); @@ -277,14 +284,18 @@ static inline unsigned int ip6_skb_dst_mtu(const struct sk_buff *skb) static inline bool ip6_sk_accept_pmtu(const struct sock *sk) { - return inet6_sk(sk)->pmtudisc != IPV6_PMTUDISC_INTERFACE && - inet6_sk(sk)->pmtudisc != IPV6_PMTUDISC_OMIT; + u8 pmtudisc = READ_ONCE(inet6_sk(sk)->pmtudisc); + + return pmtudisc != IPV6_PMTUDISC_INTERFACE && + pmtudisc != IPV6_PMTUDISC_OMIT; } static inline bool ip6_sk_ignore_df(const struct sock *sk) { - return inet6_sk(sk)->pmtudisc < IPV6_PMTUDISC_DO || - inet6_sk(sk)->pmtudisc == IPV6_PMTUDISC_OMIT; + u8 pmtudisc = READ_ONCE(inet6_sk(sk)->pmtudisc); + + return pmtudisc < IPV6_PMTUDISC_DO || + pmtudisc == IPV6_PMTUDISC_OMIT; } static inline const struct in6_addr *rt6_nexthop(const struct rt6_info *rt, @@ -326,9 +337,9 @@ static inline unsigned int ip6_dst_mtu_maybe_forward(const struct dst_entry *dst mtu = IPV6_MIN_MTU; rcu_read_lock(); - idev = __in6_dev_get(dst->dev); + idev = __in6_dev_get(dst_dev_rcu(dst)); if (idev) - mtu = idev->cnf.mtu6; + mtu = READ_ONCE(idev->cnf.mtu6); rcu_read_unlock(); out: diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 74b369bddf49..120db2865811 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -30,8 +30,8 @@ struct __ip6_tnl_parm { struct in6_addr laddr; /* local tunnel end-point address */ struct in6_addr raddr; /* remote tunnel end-point address */ - __be16 i_flags; - __be16 o_flags; + IP_TUNNEL_DECLARE_FLAGS(i_flags); + IP_TUNNEL_DECLARE_FLAGS(o_flags); __be32 i_key; __be32 o_key; @@ -152,13 +152,14 @@ int ip6_tnl_get_iflink(const struct net_device *dev); int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu); static inline void ip6tunnel_xmit(struct sock *sk, struct sk_buff *skb, - struct net_device *dev) + struct net_device *dev, u16 ip6cb_flags) { int pkt_len, err; memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); + IP6CB(skb)->flags = ip6cb_flags; pkt_len = skb->len - skb_inner_network_offset(skb); - err = ip6_local_out(dev_net(skb_dst(skb)->dev), sk, skb); + err = ip6_local_out(skb_dst_dev_net(skb), sk, skb); if (dev) { if (unlikely(net_xmit_eval(err))) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index a378eff827c7..b4495c38e0a0 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -22,6 +22,8 @@ #include <linux/percpu.h> #include <linux/notifier.h> #include <linux/refcount.h> +#include <linux/ip.h> +#include <linux/in_route.h> struct fib_config { u8 fc_dst_len; @@ -154,11 +156,14 @@ struct fib_info { int fib_nhs; bool fib_nh_is_v6; bool nh_updated; + bool pfsrc_removed; struct nexthop *nh; struct rcu_head rcu; - struct fib_nh fib_nh[]; + struct fib_nh fib_nh[] __counted_by(fib_nhs); }; +int __net_init fib4_semantics_init(struct net *net); +void __net_exit fib4_semantics_exit(struct net *net); #ifdef CONFIG_IP_MULTIPLE_TABLES struct fib_rule; @@ -172,6 +177,7 @@ struct fib_result { unsigned char type; unsigned char scope; u32 tclassid; + dscp_t dscp; struct fib_nh_common *nhc; struct fib_info *fi; struct fib_table *table; @@ -263,6 +269,7 @@ struct fib_dump_filter { bool filter_set; bool dump_routes; bool dump_exceptions; + bool rtnl_held; unsigned char protocol; unsigned char rt_type; unsigned int flags; @@ -342,7 +349,7 @@ static inline int fib4_rules_dump(struct net *net, struct notifier_block *nb, return 0; } -static inline unsigned int fib4_rules_seq_read(struct net *net) +static inline unsigned int fib4_rules_seq_read(const struct net *net) { return 0; } @@ -406,7 +413,7 @@ static inline bool fib4_has_custom_rules(const struct net *net) bool fib4_rule_default(const struct fib_rule *rule); int fib4_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); -unsigned int fib4_rules_seq_read(struct net *net); +unsigned int fib4_rules_seq_read(const struct net *net); static inline bool fib4_rules_early_flow_dissect(struct net *net, struct sk_buff *skb, @@ -418,7 +425,10 @@ static inline bool fib4_rules_early_flow_dissect(struct net *net, if (!net->ipv4.fib_rules_require_fldissect) return false; - skb_flow_dissect_flow_keys(skb, flkeys, flag); + memset(flkeys, 0, sizeof(*flkeys)); + __skb_flow_dissect(net, skb, &flow_keys_dissector, + flkeys, NULL, 0, 0, 0, flag); + fl4->fl4_sport = flkeys->ports.src; fl4->fl4_dport = flkeys->ports.dst; fl4->flowi4_proto = flkeys->basic.ip_proto; @@ -428,6 +438,11 @@ static inline bool fib4_rules_early_flow_dissect(struct net *net, #endif /* CONFIG_IP_MULTIPLE_TABLES */ +static inline bool fib_dscp_masked_match(dscp_t dscp, const struct flowi4 *fl4) +{ + return dscp == (fl4->flowi4_dscp & INET_DSCP_LEGACY_TOS_MASK); +} + /* Exported by fib_frontend.c */ extern const struct nla_policy rtm_ipv4_policy[]; void ip_fib_init(void); @@ -436,8 +451,21 @@ int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla, __be32 fib_compute_spec_dst(struct sk_buff *skb); bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev); int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, - u8 tos, int oif, struct net_device *dev, + dscp_t dscp, int oif, struct net_device *dev, struct in_device *idev, u32 *itag); + +static inline enum skb_drop_reason +fib_validate_source_reason(struct sk_buff *skb, __be32 src, __be32 dst, + dscp_t dscp, int oif, struct net_device *dev, + struct in_device *idev, u32 *itag) +{ + int err = fib_validate_source(skb, src, dst, dscp, oif, dev, idev, + itag); + if (err < 0) + return -err; + return SKB_NOT_DROPPED_YET; +} + #ifdef CONFIG_IP_ROUTE_CLASSID static inline int fib_num_tclassid_users(struct net *net) { @@ -515,10 +543,39 @@ void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig); #ifdef CONFIG_IP_ROUTE_MULTIPATH int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, const struct sk_buff *skb, struct flow_keys *flkeys); + +static void +fib_multipath_hash_construct_key(siphash_key_t *key, u32 mp_seed) +{ + u64 mp_seed_64 = mp_seed; + + key->key[0] = (mp_seed_64 << 32) | mp_seed_64; + key->key[1] = key->key[0]; +} + +static inline u32 fib_multipath_hash_from_keys(const struct net *net, + struct flow_keys *keys) +{ + siphash_aligned_key_t hash_key; + u32 mp_seed; + + mp_seed = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed).mp_seed; + fib_multipath_hash_construct_key(&hash_key, mp_seed); + + return flow_hash_from_keys_seed(keys, &hash_key); +} +#else +static inline u32 fib_multipath_hash_from_keys(const struct net *net, + struct flow_keys *keys) +{ + return flow_hash_from_keys(keys); +} #endif + int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope, struct netlink_ext_ack *extack); -void fib_select_multipath(struct fib_result *res, int hash); +void fib_select_multipath(struct fib_result *res, int hash, + const struct flowi4 *fl4); void fib_select_path(struct net *net, struct fib_result *res, struct flowi4 *fl4, const struct sk_buff *skb); diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index e8750b4ef7e1..ecae35512b9b 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -11,7 +11,9 @@ #include <linux/bitops.h> #include <net/dsfield.h> +#include <net/flow.h> #include <net/gro_cells.h> +#include <net/inet_dscp.h> #include <net/inet_ecn.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> @@ -36,6 +38,24 @@ (sizeof_field(struct ip_tunnel_key, u) - \ sizeof_field(struct ip_tunnel_key, u.ipv4)) +#define __ipt_flag_op(op, ...) \ + op(__VA_ARGS__, __IP_TUNNEL_FLAG_NUM) + +#define IP_TUNNEL_DECLARE_FLAGS(...) \ + __ipt_flag_op(DECLARE_BITMAP, __VA_ARGS__) + +#define ip_tunnel_flags_zero(...) __ipt_flag_op(bitmap_zero, __VA_ARGS__) +#define ip_tunnel_flags_copy(...) __ipt_flag_op(bitmap_copy, __VA_ARGS__) +#define ip_tunnel_flags_and(...) __ipt_flag_op(bitmap_and, __VA_ARGS__) +#define ip_tunnel_flags_or(...) __ipt_flag_op(bitmap_or, __VA_ARGS__) + +#define ip_tunnel_flags_empty(...) \ + __ipt_flag_op(bitmap_empty, __VA_ARGS__) +#define ip_tunnel_flags_intersect(...) \ + __ipt_flag_op(bitmap_intersects, __VA_ARGS__) +#define ip_tunnel_flags_subset(...) \ + __ipt_flag_op(bitmap_subset, __VA_ARGS__) + struct ip_tunnel_key { __be64 tun_id; union { @@ -48,11 +68,11 @@ struct ip_tunnel_key { struct in6_addr dst; } ipv6; } u; - __be16 tun_flags; - u8 tos; /* TOS for IPv4, TC for IPv6 */ - u8 ttl; /* TTL for IPv4, HL for IPv6 */ + IP_TUNNEL_DECLARE_FLAGS(tun_flags); __be32 label; /* Flow Label for IPv6 */ u32 nhid; + u8 tos; /* TOS for IPv4, TC for IPv6 */ + u8 ttl; /* TTL for IPv4, HL for IPv6 */ __be16 tp_src; __be16 tp_dst; __u8 flow_flags; @@ -77,8 +97,8 @@ struct ip_tunnel_encap { #define ip_tunnel_info_opts(info) \ _Generic(info, \ - const struct ip_tunnel_info * : ((const void *)((info) + 1)),\ - struct ip_tunnel_info * : ((void *)((info) + 1))\ + const struct ip_tunnel_info * : ((const void *)(info)->options),\ + struct ip_tunnel_info * : ((void *)(info)->options)\ ) struct ip_tunnel_info { @@ -89,6 +109,7 @@ struct ip_tunnel_info { #endif u8 options_len; u8 mode; + u8 options[] __aligned_largest __counted_by(options_len); }; /* 6rd prefix/relay information */ @@ -110,6 +131,17 @@ struct ip_tunnel_prl_entry { struct metadata_dst; +/* Kernel-side variant of ip_tunnel_parm */ +struct ip_tunnel_parm_kern { + char name[IFNAMSIZ]; + IP_TUNNEL_DECLARE_FLAGS(i_flags); + IP_TUNNEL_DECLARE_FLAGS(o_flags); + __be32 i_key; + __be32 o_key; + int link; + struct iphdr iph; +}; + struct ip_tunnel { struct ip_tunnel __rcu *next; struct hlist_node hash_node; @@ -136,7 +168,7 @@ struct ip_tunnel { struct dst_cache dst_cache; - struct ip_tunnel_parm parms; + struct ip_tunnel_parm_kern parms; int mlink; int encap_hlen; /* Encap header length (FOU,GUE) */ @@ -157,7 +189,7 @@ struct ip_tunnel { }; struct tnl_ptk_info { - __be16 flags; + IP_TUNNEL_DECLARE_FLAGS(flags); __be16 proto; __be32 key; __be32 seq; @@ -179,11 +211,80 @@ struct ip_tunnel_net { int type; }; +static inline void ip_tunnel_set_options_present(unsigned long *flags) +{ + IP_TUNNEL_DECLARE_FLAGS(present) = { }; + + __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, present); + __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, present); + __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, present); + __set_bit(IP_TUNNEL_GTP_OPT_BIT, present); + __set_bit(IP_TUNNEL_PFCP_OPT_BIT, present); + + ip_tunnel_flags_or(flags, flags, present); +} + +static inline void ip_tunnel_clear_options_present(unsigned long *flags) +{ + IP_TUNNEL_DECLARE_FLAGS(present) = { }; + + __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, present); + __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, present); + __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, present); + __set_bit(IP_TUNNEL_GTP_OPT_BIT, present); + __set_bit(IP_TUNNEL_PFCP_OPT_BIT, present); + + __ipt_flag_op(bitmap_andnot, flags, flags, present); +} + +static inline bool ip_tunnel_is_options_present(const unsigned long *flags) +{ + IP_TUNNEL_DECLARE_FLAGS(present) = { }; + + __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, present); + __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, present); + __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, present); + __set_bit(IP_TUNNEL_GTP_OPT_BIT, present); + __set_bit(IP_TUNNEL_PFCP_OPT_BIT, present); + + return ip_tunnel_flags_intersect(flags, present); +} + +static inline bool ip_tunnel_flags_is_be16_compat(const unsigned long *flags) +{ + IP_TUNNEL_DECLARE_FLAGS(supp) = { }; + + bitmap_set(supp, 0, BITS_PER_TYPE(__be16)); + __set_bit(IP_TUNNEL_VTI_BIT, supp); + + return ip_tunnel_flags_subset(flags, supp); +} + +static inline void ip_tunnel_flags_from_be16(unsigned long *dst, __be16 flags) +{ + ip_tunnel_flags_zero(dst); + + bitmap_write(dst, be16_to_cpu(flags), 0, BITS_PER_TYPE(__be16)); + __assign_bit(IP_TUNNEL_VTI_BIT, dst, flags & VTI_ISVTI); +} + +static inline __be16 ip_tunnel_flags_to_be16(const unsigned long *flags) +{ + __be16 ret; + + ret = cpu_to_be16(bitmap_read(flags, 0, BITS_PER_TYPE(__be16))); + if (test_bit(IP_TUNNEL_VTI_BIT, flags)) + ret |= VTI_ISVTI; + + return ret; +} + static inline void ip_tunnel_key_init(struct ip_tunnel_key *key, __be32 saddr, __be32 daddr, u8 tos, u8 ttl, __be32 label, __be16 tp_src, __be16 tp_dst, - __be64 tun_id, __be16 tun_flags) + __be64 tun_id, + const unsigned long *tun_flags) { key->tun_id = tun_id; key->u.ipv4.src = saddr; @@ -193,7 +294,7 @@ static inline void ip_tunnel_key_init(struct ip_tunnel_key *key, key->tos = tos; key->ttl = ttl; key->label = label; - key->tun_flags = tun_flags; + ip_tunnel_flags_copy(key->tun_flags, tun_flags); /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of * the upper tunnel are used. @@ -214,12 +315,8 @@ ip_tunnel_dst_cache_usable(const struct sk_buff *skb, { if (skb->mark) return false; - if (!info) - return true; - if (info->key.tun_flags & TUNNEL_NOCACHE) - return false; - return true; + return !info || !test_bit(IP_TUNNEL_NOCACHE_BIT, info->key.tun_flags); } static inline unsigned short ip_tunnel_info_af(const struct ip_tunnel_info @@ -260,14 +357,14 @@ static inline void ip_tunnel_init_flow(struct flowi4 *fl4, memset(fl4, 0, sizeof(*fl4)); if (oif) { - fl4->flowi4_l3mdev = l3mdev_master_upper_ifindex_by_index_rcu(net, oif); + fl4->flowi4_l3mdev = l3mdev_master_upper_ifindex_by_index(net, oif); /* Legacy VRF/l3mdev use case */ fl4->flowi4_oif = fl4->flowi4_l3mdev ? 0 : oif; } fl4->daddr = daddr; fl4->saddr = saddr; - fl4->flowi4_tos = tos; + fl4->flowi4_dscp = inet_dsfield_to_dscp(tos); fl4->flowi4_proto = proto; fl4->fl4_gre_key = key; fl4->flowi4_mark = mark; @@ -282,22 +379,26 @@ struct net *ip_tunnel_get_link_net(const struct net_device *dev); int ip_tunnel_get_iflink(const struct net_device *dev); int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname); - -void ip_tunnel_delete_nets(struct list_head *list_net, unsigned int id, - struct rtnl_link_ops *ops); +void ip_tunnel_delete_net(struct net *net, unsigned int id, + struct rtnl_link_ops *ops, + struct list_head *dev_to_kill); void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, const u8 protocol); void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const u8 proto, int tunnel_hlen); -int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); +int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, + int cmd); +bool ip_tunnel_parm_from_user(struct ip_tunnel_parm_kern *kp, + const void __user *data); +bool ip_tunnel_parm_to_user(void __user *data, struct ip_tunnel_parm_kern *kp); int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd); int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict); int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu); struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, - int link, __be16 flags, + int link, const unsigned long *flags, __be32 remote, __be32 local, __be32 key); @@ -306,16 +407,17 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, bool log_ecn_error); int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], - struct ip_tunnel_parm *p, __u32 fwmark); -int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], - struct ip_tunnel_parm *p, __u32 fwmark); + struct ip_tunnel_parm_kern *p, __u32 fwmark); +int ip_tunnel_newlink(struct net *net, struct net_device *dev, + struct nlattr *tb[], struct ip_tunnel_parm_kern *p, + __u32 fwmark); void ip_tunnel_setup(struct net_device *dev, unsigned int net_id); bool ip_tunnel_netlink_encap_parms(struct nlattr *data[], struct ip_tunnel_encap *encap); void ip_tunnel_netlink_parms(struct nlattr *data[], - struct ip_tunnel_parm *parms); + struct ip_tunnel_parm_kern *parms); extern const struct header_ops ip_tunnel_header_ops; __be16 ip_tunnel_parse_protocol(const struct sk_buff *skb); @@ -340,7 +442,8 @@ int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *op, int ip_tunnel_encap_setup(struct ip_tunnel *t, struct ip_tunnel_encap *ipencap); -static inline bool pskb_inet_may_pull(struct sk_buff *skb) +static inline enum skb_drop_reason +pskb_inet_may_pull_reason(struct sk_buff *skb) { int nhlen; @@ -357,7 +460,49 @@ static inline bool pskb_inet_may_pull(struct sk_buff *skb) nhlen = 0; } - return pskb_network_may_pull(skb, nhlen); + return pskb_network_may_pull_reason(skb, nhlen); +} + +static inline bool pskb_inet_may_pull(struct sk_buff *skb) +{ + return pskb_inet_may_pull_reason(skb) == SKB_NOT_DROPPED_YET; +} + +/* Variant of pskb_inet_may_pull(). + */ +static inline enum skb_drop_reason +skb_vlan_inet_prepare(struct sk_buff *skb, bool inner_proto_inherit) +{ + int nhlen = 0, maclen = inner_proto_inherit ? 0 : ETH_HLEN; + __be16 type = skb->protocol; + enum skb_drop_reason reason; + + /* Essentially this is skb_protocol(skb, true) + * And we get MAC len. + */ + if (eth_type_vlan(type)) + type = __vlan_get_protocol(skb, type, &maclen); + + switch (type) { +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + nhlen = sizeof(struct ipv6hdr); + break; +#endif + case htons(ETH_P_IP): + nhlen = sizeof(struct iphdr); + break; + } + /* For ETH_P_IPV6/ETH_P_IP we make sure to pull + * a base network header in skb->head. + */ + reason = pskb_may_pull_reason(skb, maclen + nhlen); + if (reason) + return reason; + + skb_set_network_header(skb, maclen); + + return SKB_NOT_DROPPED_YET; } static inline int ip_encap_hlen(struct ip_tunnel_encap *e) @@ -416,6 +561,17 @@ static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph, return 0; } +static inline __be32 ip_tunnel_get_flowlabel(const struct iphdr *iph, + const struct sk_buff *skb) +{ + __be16 payload_protocol = skb_protocol(skb, true); + + if (payload_protocol == htons(ETH_P_IPV6)) + return ip6_flowlabel((const struct ipv6hdr *)iph); + else + return 0; +} + static inline u8 ip_tunnel_get_ttl(const struct iphdr *iph, const struct sk_buff *skb) { @@ -429,7 +585,7 @@ static inline u8 ip_tunnel_get_ttl(const struct iphdr *iph, return 0; } -/* Propogate ECN bits out */ +/* Propagate ECN bits out */ static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph, const struct sk_buff *skb) { @@ -449,12 +605,27 @@ static inline int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, u8 proto, - u8 tos, u8 ttl, __be16 df, bool xnet); + u8 tos, u8 ttl, __be16 df, bool xnet, u16 ipcb_flags); struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, gfp_t flags); int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst, int headroom, bool reply); +static inline void ip_tunnel_adj_headroom(struct net_device *dev, + unsigned int headroom) +{ + /* we must cap headroom to some upperlimit, else pskb_expand_head + * will overflow header offsets in skb_headers_offset_update(). + */ + const unsigned int max_allowed = 512; + + if (headroom > max_allowed) + headroom = max_allowed; + + if (headroom > READ_ONCE(dev->needed_headroom)) + WRITE_ONCE(dev->needed_headroom, headroom); +} + int iptunnel_handle_offloads(struct sk_buff *skb, int gso_type_mask); static inline int iptunnel_pull_offloads(struct sk_buff *skb) @@ -483,32 +654,32 @@ static inline void iptunnel_xmit_stats(struct net_device *dev, int pkt_len) u64_stats_inc(&tstats->tx_packets); u64_stats_update_end(&tstats->syncp); put_cpu_ptr(tstats); + return; + } + + if (pkt_len < 0) { + DEV_STATS_INC(dev, tx_errors); + DEV_STATS_INC(dev, tx_aborted_errors); } else { - struct net_device_stats *err_stats = &dev->stats; - - if (pkt_len < 0) { - err_stats->tx_errors++; - err_stats->tx_aborted_errors++; - } else { - err_stats->tx_dropped++; - } + DEV_STATS_INC(dev, tx_dropped); } } static inline void ip_tunnel_info_opts_get(void *to, const struct ip_tunnel_info *info) { - memcpy(to, info + 1, info->options_len); + memcpy(to, ip_tunnel_info_opts(info), info->options_len); } static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, const void *from, int len, - __be16 flags) + const unsigned long *flags) { info->options_len = len; if (len > 0) { memcpy(ip_tunnel_info_opts(info), from, len); - info->key.tun_flags |= flags; + ip_tunnel_flags_or(info->key.tun_flags, info->key.tun_flags, + flags); } } @@ -552,7 +723,7 @@ static inline void ip_tunnel_info_opts_get(void *to, static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, const void *from, int len, - __be16 flags) + const unsigned long *flags) { info->options_len = 0; } diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index ff406ef4fd4a..29a36709e7f3 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1163,6 +1163,14 @@ static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) return housekeeping_cpumask(HK_TYPE_KTHREAD); } +static inline const struct cpumask *sysctl_est_preferred_cpulist(struct netns_ipvs *ipvs) +{ + if (ipvs->est_cpulist_valid) + return ipvs->sysctl_est_cpulist; + else + return NULL; +} + static inline int sysctl_est_nice(struct netns_ipvs *ipvs) { return ipvs->sysctl_est_nice; @@ -1270,6 +1278,11 @@ static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) return housekeeping_cpumask(HK_TYPE_KTHREAD); } +static inline const struct cpumask *sysctl_est_preferred_cpulist(struct netns_ipvs *ipvs) +{ + return NULL; +} + static inline int sysctl_est_nice(struct netns_ipvs *ipvs) { return IPVS_EST_NICE; diff --git a/include/net/ipcomp.h b/include/net/ipcomp.h index 8660a2a6d1fc..51401f01e2a5 100644 --- a/include/net/ipcomp.h +++ b/include/net/ipcomp.h @@ -3,20 +3,9 @@ #define _NET_IPCOMP_H #include <linux/skbuff.h> -#include <linux/types.h> - -#define IPCOMP_SCRATCH_SIZE 65400 - -struct crypto_comp; -struct ip_comp_hdr; - -struct ipcomp_data { - u16 threshold; - struct crypto_comp * __percpu *tfms; -}; struct ip_comp_hdr; -struct sk_buff; +struct netlink_ext_ack; struct xfrm_state; int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb); diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 0675be0f3fa0..74fbf1ad8065 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -246,17 +246,20 @@ extern int sysctl_mld_qrv; #define _DEVADD(net, statname, mod, idev, field, val) \ ({ \ struct inet6_dev *_idev = (idev); \ + unsigned long _field = (field); \ + unsigned long _val = (val); \ if (likely(_idev != NULL)) \ - mod##SNMP_ADD_STATS((_idev)->stats.statname, (field), (val)); \ - mod##SNMP_ADD_STATS((net)->mib.statname##_statistics, (field), (val));\ + mod##SNMP_ADD_STATS((_idev)->stats.statname, _field, _val); \ + mod##SNMP_ADD_STATS((net)->mib.statname##_statistics, _field, _val);\ }) #define _DEVUPD(net, statname, mod, idev, field, val) \ ({ \ struct inet6_dev *_idev = (idev); \ + unsigned long _val = (val); \ if (likely(_idev != NULL)) \ - mod##SNMP_UPD_PO_STATS((_idev)->stats.statname, field, (val)); \ - mod##SNMP_UPD_PO_STATS((net)->mib.statname##_statistics, field, (val));\ + mod##SNMP_UPD_PO_STATS((_idev)->stats.statname, field, _val); \ + mod##SNMP_UPD_PO_STATS((net)->mib.statname##_statistics, field, _val);\ }) /* MIBs */ @@ -363,23 +366,16 @@ struct ipcm6_cookie { struct ipv6_txoptions *opt; }; -static inline void ipcm6_init(struct ipcm6_cookie *ipc6) -{ - *ipc6 = (struct ipcm6_cookie) { - .hlimit = -1, - .tclass = -1, - .dontfrag = -1, - }; -} - static inline void ipcm6_init_sk(struct ipcm6_cookie *ipc6, - const struct ipv6_pinfo *np) + const struct sock *sk) { *ipc6 = (struct ipcm6_cookie) { .hlimit = -1, - .tclass = np->tclass, - .dontfrag = np->dontfrag, + .tclass = inet6_sk(sk)->tclass, + .dontfrag = inet6_test_bit(DONTFRAG, sk), }; + + sockcm_init(&ipc6->sockc, sk); } static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np) @@ -428,7 +424,7 @@ int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq, int flags); int ip6_flowlabel_init(void); void ip6_flowlabel_cleanup(void); -bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np); +bool ip6_autoflowlabel(struct net *net, const struct sock *sk); static inline void fl6_sock_release(struct ip6_flowlabel *fl) { @@ -471,7 +467,7 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk, /* This helper is specialized for BIG TCP needs. * It assumes the hop_jumbo_hdr will immediately follow the IPV6 header. * It assumes headers are already in skb->head. - * Returns 0, or IPPROTO_TCP if a BIG TCP packet is there. + * Returns: 0, or IPPROTO_TCP if a BIG TCP packet is there. */ static inline int ipv6_has_hopopt_jumbo(const struct sk_buff *skb) { @@ -534,13 +530,15 @@ static inline int ipv6_hopopt_jumbo_remove(struct sk_buff *skb) return 0; } -static inline bool ipv6_accept_ra(struct inet6_dev *idev) +static inline bool ipv6_accept_ra(const struct inet6_dev *idev) { + s32 accept_ra = READ_ONCE(idev->cnf.accept_ra); + /* If forwarding is enabled, RA are not accepted unless the special * hybrid mode (accept_ra=2) is enabled. */ - return idev->cnf.forwarding ? idev->cnf.accept_ra == 2 : - idev->cnf.accept_ra; + return READ_ONCE(idev->cnf.forwarding) ? accept_ra == 2 : + accept_ra; } #define IPV6_FRAG_HIGH_THRESH (4 * 1024*1024) /* 4194304 */ @@ -849,7 +847,7 @@ static inline int __ipv6_addr_diff32(const void *token1, const void *token2, int * we should *never* get to this point since that * would mean the addrs are equal * - * However, we do get to it 8) And exacly, when + * However, we do get to it 8) And exactly, when * addresses are equal 8) * * ip route add 1111::/128 via ... @@ -909,9 +907,9 @@ static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo *np, struct flowi6 *fl6, int hlimit; if (ipv6_addr_is_multicast(&fl6->daddr)) - hlimit = np->mcast_hops; + hlimit = READ_ONCE(np->mcast_hops); else - hlimit = np->hop_limit; + hlimit = READ_ONCE(np->hop_limit); if (hlimit < 0) hlimit = ip6_dst_hoplimit(dst); return hlimit; @@ -971,7 +969,7 @@ static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb, hash = skb_get_hash_flowi6(skb, fl6); /* Since this is being sent on the wire obfuscate hash a bit - * to minimize possbility that any useful information to an + * to minimize possibility that any useful information to an * attacker is leaked. Only lower 20 bits are relevant. */ hash = rol32(hash, 16); @@ -1128,12 +1126,6 @@ struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, st struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst, bool connected); -struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb, - struct net_device *dev, - struct net *net, struct socket *sock, - struct in6_addr *saddr, - const struct ip_tunnel_info *info, - u8 protocol, bool use_cache); struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *orig_dst); @@ -1196,10 +1188,10 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, int ipv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); -int __ip6_datagram_connect(struct sock *sk, struct sockaddr *addr, +int __ip6_datagram_connect(struct sock *sk, struct sockaddr_unsized *addr, int addr_len); -int ip6_datagram_connect(struct sock *sk, struct sockaddr *addr, int addr_len); -int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *addr, +int ip6_datagram_connect(struct sock *sk, struct sockaddr_unsized *addr, int addr_len); +int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr_unsized *addr, int addr_len); int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr); void ip6_datagram_release_cb(struct sock *sk); @@ -1216,8 +1208,8 @@ void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu); void inet6_cleanup_sock(struct sock *sk); void inet6_sock_destruct(struct sock *sk); int inet6_release(struct socket *sock); -int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); -int inet6_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len); +int inet6_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len); +int inet6_bind_sk(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len); int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int peer); int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); @@ -1298,15 +1290,16 @@ static inline int ip6_sock_set_v6only(struct sock *sk) static inline void ip6_sock_set_recverr(struct sock *sk) { - lock_sock(sk); - inet6_sk(sk)->recverr = true; - release_sock(sk); + inet6_set_bit(RECVERR6, sk); } -static inline int __ip6_sock_set_addr_preferences(struct sock *sk, int val) +#define IPV6_PREFER_SRC_MASK (IPV6_PREFER_SRC_TMP | IPV6_PREFER_SRC_PUBLIC | \ + IPV6_PREFER_SRC_COA) + +static inline int ip6_sock_set_addr_preferences(struct sock *sk, int val) { + unsigned int prefmask = ~IPV6_PREFER_SRC_MASK; unsigned int pref = 0; - unsigned int prefmask = ~0; /* check PUBLIC/TMP/PUBTMP_DEFAULT conflicts */ switch (val & (IPV6_PREFER_SRC_PUBLIC | @@ -1356,25 +1349,28 @@ static inline int __ip6_sock_set_addr_preferences(struct sock *sk, int val) return -EINVAL; } - inet6_sk(sk)->srcprefs = (inet6_sk(sk)->srcprefs & prefmask) | pref; + WRITE_ONCE(inet6_sk(sk)->srcprefs, + (READ_ONCE(inet6_sk(sk)->srcprefs) & prefmask) | pref); return 0; } -static inline int ip6_sock_set_addr_preferences(struct sock *sk, bool val) +static inline void ip6_sock_set_recvpktinfo(struct sock *sk) { - int ret; - lock_sock(sk); - ret = __ip6_sock_set_addr_preferences(sk, val); + inet6_sk(sk)->rxopt.bits.rxinfo = true; release_sock(sk); - return ret; } -static inline void ip6_sock_set_recvpktinfo(struct sock *sk) +#define IPV6_ADDR_WORDS 4 + +static inline void ipv6_addr_cpu_to_be32(__be32 *dst, const u32 *src) { - lock_sock(sk); - inet6_sk(sk)->rxopt.bits.rxinfo = true; - release_sock(sk); + cpu_to_be32_array(dst, src, IPV6_ADDR_WORDS); +} + +static inline void ipv6_addr_be32_to_cpu(u32 *dst, const __be32 *src) +{ + be32_to_cpu_array(dst, src, IPV6_ADDR_WORDS); } #endif /* _NET_IPV6_H */ diff --git a/include/net/ipv6_frag.h b/include/net/ipv6_frag.h index 7321ffe3a108..38ef66826939 100644 --- a/include/net/ipv6_frag.h +++ b/include/net/ipv6_frag.h @@ -66,6 +66,7 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq) { struct net_device *dev = NULL; struct sk_buff *head; + int refs = 1; rcu_read_lock(); /* Paired with the WRITE_ONCE() in fqdir_pre_exit(). */ @@ -77,7 +78,7 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq) goto out; fq->q.flags |= INET_FRAG_DROP; - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, &refs); dev = dev_get_by_index_rcu(net, fq->iif); if (!dev) @@ -109,7 +110,7 @@ out: spin_unlock(&fq->q.lock); out_rcu_unlock: rcu_read_unlock(); - inet_frag_put(&fq->q); + inet_frag_putn(&fq->q, refs); } /* Check if the upper layer header is truncated in the first fragment. */ diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index c48186bf4737..d3013e721b14 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -9,6 +9,7 @@ #include <net/flow.h> #include <net/neighbour.h> #include <net/sock.h> +#include <net/ipv6.h> /* structs from net/ip6_fib.h */ struct fib6_info; @@ -60,6 +61,9 @@ struct ipv6_stub { #if IS_ENABLED(CONFIG_XFRM) void (*xfrm6_local_rxpmtu)(struct sk_buff *skb, u32 mtu); int (*xfrm6_udp_encap_rcv)(struct sock *sk, struct sk_buff *skb); + struct sk_buff *(*xfrm6_gro_udp_encap_rcv)(struct sock *sk, + struct list_head *head, + struct sk_buff *skb); int (*xfrm6_rcv_encap)(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); #endif @@ -69,14 +73,16 @@ struct ipv6_stub { int (*output)(struct net *, struct sock *, struct sk_buff *)); struct net_device *(*ipv6_dev_find)(struct net *net, const struct in6_addr *addr, struct net_device *dev); + int (*ip6_xmit)(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, + __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority); }; extern const struct ipv6_stub *ipv6_stub __read_mostly; /* A stub used by bpf helpers. Similarly ugly as ipv6_stub */ struct ipv6_bpf_stub { - int (*inet6_bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len, + int (*inet6_bind)(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len, u32 flags); - struct sock *(*udp6_lib_lookup)(struct net *net, + struct sock *(*udp6_lib_lookup)(const struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, int dif, int sdif, struct udp_table *tbl, @@ -85,6 +91,11 @@ struct ipv6_bpf_stub { sockptr_t optval, unsigned int optlen); int (*ipv6_getsockopt)(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen); + int (*ipv6_dev_get_saddr)(struct net *net, + const struct net_device *dst_dev, + const struct in6_addr *daddr, + unsigned int prefs, + struct in6_addr *saddr); }; extern const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; diff --git a/include/net/iucv/iucv.h b/include/net/iucv/iucv.h index f9e88401d7da..9804fa5d9c67 100644 --- a/include/net/iucv/iucv.h +++ b/include/net/iucv/iucv.h @@ -15,7 +15,7 @@ * To explore any of the IUCV functions, one must first register their * program using iucv_register(). Once your program has successfully * completed a register, it can exploit the other functions. - * For furthur reference on all IUCV functionality, refer to the + * For further reference on all IUCV functionality, refer to the * CP Programming Services book, also available on the web thru * www.vm.ibm.com/pubs, manual # SC24-6084 * @@ -30,6 +30,7 @@ #include <linux/types.h> #include <linux/slab.h> +#include <asm/dma-types.h> #include <asm/debug.h> /* @@ -76,12 +77,17 @@ * and iucv_message_reply if IUCV_IPBUFLST or IUCV_IPANSLST are used. */ struct iucv_array { - u32 address; + dma32_t address; u32 length; } __attribute__ ((aligned (8))); -extern struct bus_type iucv_bus; -extern struct device *iucv_root; +extern const struct bus_type iucv_bus; + +struct device_driver; + +struct device *iucv_alloc_device(const struct attribute_group **attrs, + struct device_driver *driver, void *priv, + const char *fmt, ...) __printf(4, 5); /* * struct iucv_path @@ -196,7 +202,7 @@ struct iucv_handler { * * Registers a driver with IUCV. * - * Returns 0 on success, -ENOMEM if the memory allocation for the pathid + * Returns: 0 on success, -ENOMEM if the memory allocation for the pathid * table failed, or -EIO if IUCV_DECLARE_BUFFER failed on all cpus. */ int iucv_register(struct iucv_handler *handler, int smp); @@ -218,7 +224,7 @@ void iucv_unregister(struct iucv_handler *handle, int smp); * * Allocate a new path structure for use with iucv_connect. * - * Returns NULL if the memory allocation failed or a pointer to the + * Returns: NULL if the memory allocation failed or a pointer to the * path structure. */ static inline struct iucv_path *iucv_path_alloc(u16 msglim, u8 flags, gfp_t gfp) @@ -254,7 +260,7 @@ static inline void iucv_path_free(struct iucv_path *path) * This function is issued after the user received a connection pending * external interrupt and now wishes to complete the IUCV communication path. * - * Returns the result of the CP IUCV call. + * Returns: the result of the CP IUCV call. */ int iucv_path_accept(struct iucv_path *path, struct iucv_handler *handler, u8 *userdata, void *private); @@ -272,7 +278,7 @@ int iucv_path_accept(struct iucv_path *path, struct iucv_handler *handler, * successfully, you are not able to use the path until you receive an IUCV * Connection Complete external interrupt. * - * Returns the result of the CP IUCV call. + * Returns: the result of the CP IUCV call. */ int iucv_path_connect(struct iucv_path *path, struct iucv_handler *handler, u8 *userid, u8 *system, u8 *userdata, @@ -286,7 +292,7 @@ int iucv_path_connect(struct iucv_path *path, struct iucv_handler *handler, * This function temporarily suspends incoming messages on an IUCV path. * You can later reactivate the path by invoking the iucv_resume function. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_path_quiesce(struct iucv_path *path, u8 *userdata); @@ -298,7 +304,7 @@ int iucv_path_quiesce(struct iucv_path *path, u8 *userdata); * This function resumes incoming messages on an IUCV path that has * been stopped with iucv_path_quiesce. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_path_resume(struct iucv_path *path, u8 *userdata); @@ -309,7 +315,7 @@ int iucv_path_resume(struct iucv_path *path, u8 *userdata); * * This function terminates an IUCV path. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_path_sever(struct iucv_path *path, u8 *userdata); @@ -321,7 +327,7 @@ int iucv_path_sever(struct iucv_path *path, u8 *userdata); * * Cancels a message you have sent. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_purge(struct iucv_path *path, struct iucv_message *msg, u32 srccls); @@ -341,7 +347,7 @@ int iucv_message_purge(struct iucv_path *path, struct iucv_message *msg, * * Locking: local_bh_enable/local_bh_disable * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_receive(struct iucv_path *path, struct iucv_message *msg, u8 flags, void *buffer, size_t size, size_t *residual); @@ -361,7 +367,7 @@ int iucv_message_receive(struct iucv_path *path, struct iucv_message *msg, * * Locking: no locking. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int __iucv_message_receive(struct iucv_path *path, struct iucv_message *msg, u8 flags, void *buffer, size_t size, @@ -376,7 +382,7 @@ int __iucv_message_receive(struct iucv_path *path, struct iucv_message *msg, * are notified of a message and the time that you complete the message, * the message may be rejected. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_reject(struct iucv_path *path, struct iucv_message *msg); @@ -393,7 +399,7 @@ int iucv_message_reject(struct iucv_path *path, struct iucv_message *msg); * pathid, msgid, and trgcls. Prmmsg signifies the data is moved into * the parameter list. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_reply(struct iucv_path *path, struct iucv_message *msg, u8 flags, void *reply, size_t size); @@ -413,7 +419,7 @@ int iucv_message_reply(struct iucv_path *path, struct iucv_message *msg, * * Locking: local_bh_enable/local_bh_disable * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_send(struct iucv_path *path, struct iucv_message *msg, u8 flags, u32 srccls, void *buffer, size_t size); @@ -433,7 +439,7 @@ int iucv_message_send(struct iucv_path *path, struct iucv_message *msg, * * Locking: no locking. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int __iucv_message_send(struct iucv_path *path, struct iucv_message *msg, u8 flags, u32 srccls, void *buffer, size_t size); @@ -455,7 +461,7 @@ int __iucv_message_send(struct iucv_path *path, struct iucv_message *msg, * reply to the message and a buffer is provided into which IUCV moves * the reply to this message. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_send2way(struct iucv_path *path, struct iucv_message *msg, u8 flags, u32 srccls, void *buffer, size_t size, @@ -489,7 +495,7 @@ struct iucv_interface { int (*path_sever)(struct iucv_path *path, u8 userdata[16]); int (*iucv_register)(struct iucv_handler *handler, int smp); void (*iucv_unregister)(struct iucv_handler *handler, int smp); - struct bus_type *bus; + const struct bus_type *bus; struct device *root; }; diff --git a/include/net/iw_handler.h b/include/net/iw_handler.h index b2cf243ebe44..b80e474cb0aa 100644 --- a/include/net/iw_handler.h +++ b/include/net/iw_handler.h @@ -23,7 +23,7 @@ * to handle wireless statistics. * * The initial APIs served us well and has proven a reasonably good design. - * However, there is a few shortcommings : + * However, there are a few shortcomings : * o No events, everything is a request to the driver. * o Large ioctl function in driver with gigantic switch statement * (i.e. spaghetti code). @@ -38,13 +38,13 @@ * ------------------------------- * The new driver API is just a bunch of standard functions (handlers), * each handling a specific Wireless Extension. The driver just export - * the list of handler it supports, and those will be called apropriately. + * the list of handler it supports, and those will be called appropriately. * * I tried to keep the main advantage of the previous API (simplicity, * efficiency and light weight), and also I provide a good dose of backward * compatibility (most structures are the same, driver can use both API * simultaneously, ...). - * Hopefully, I've also addressed the shortcomming of the initial API. + * Hopefully, I've also addressed the shortcoming of the initial API. * * The advantage of the new API are : * o Handling of Extensions in driver broken in small contained functions @@ -84,7 +84,7 @@ /* ---------------------- THE IMPLEMENTATION ---------------------- */ /* - * Some of the choice I've made are pretty controversials. Defining an + * Some of the choice I've made are pretty controversial. Defining an * API is very much weighting compromises. This goes into some of the * details and the thinking behind the implementation. * @@ -140,7 +140,7 @@ * example to distinguish setting max rate and basic rate), I would * break the prototype. Using iwreq_data is more flexible. * 3) Also, the above form is not generic (see above). - * 4) I don't expect driver developper using the wrong field of the + * 4) I don't expect driver developer using the wrong field of the * union (Doh !), so static typechecking doesn't add much value. * 5) Lastly, you can skip the union by doing : * static int mydriver_ioctl_setrate(struct net_device *dev, @@ -279,8 +279,6 @@ #define IW_DESCR_FLAG_RESTRICT 0x0004 /* GET : request is ROOT only */ /* SET : Omit payload from generated iwevent */ #define IW_DESCR_FLAG_NOMAX 0x0008 /* GET : no limit on request size */ -/* Driver level flags */ -#define IW_DESCR_FLAG_WAIT 0x0100 /* Wait for driver event */ /****************************** TYPES ******************************/ @@ -373,11 +371,10 @@ struct iw_handler_def { */ struct iw_ioctl_description { __u8 header_type; /* NULL, iw_point or other */ - __u8 token_type; /* Future */ + __u8 flags; /* Special handling of the request */ __u16 token_size; /* Granularity of payload */ __u16 min_tokens; /* Min acceptable token number */ __u16 max_tokens; /* Max acceptable token number */ - __u32 flags; /* Special handling of the request */ }; /* Need to think of short header translation table. Later. */ @@ -404,26 +401,6 @@ struct iw_spy_data { u_char spy_thr_under[IW_MAX_SPY]; }; -/* --------------------- DEVICE WIRELESS DATA --------------------- */ -/* - * This is all the wireless data specific to a device instance that - * is managed by the core of Wireless Extensions or the 802.11 layer. - * We only keep pointer to those structures, so that a driver is free - * to share them between instances. - * This structure should be initialised before registering the device. - * Access to this data follow the same rules as any other struct net_device - * data (i.e. valid as long as struct net_device exist, same locking rules). - */ -/* Forward declaration */ -struct libipw_device; -/* The struct */ -struct iw_public_data { - /* Driver enhanced spy support */ - struct iw_spy_data * spy_data; - /* Legacy structure managed by the ipw2x00-specific IEEE 802.11 layer */ - struct libipw_device * libipw; -}; - /**************************** PROTOTYPES ****************************/ /* * Functions part of the Wireless Extensions (defined in net/wireless/wext-core.c). @@ -443,23 +420,7 @@ static inline void wireless_nlevent_flush(void) {} /* We may need a function to send a stream of events to user space. * More on that later... */ -/* Standard handler for SIOCSIWSPY */ -int iw_handler_set_spy(struct net_device *dev, struct iw_request_info *info, - union iwreq_data *wrqu, char *extra); -/* Standard handler for SIOCGIWSPY */ -int iw_handler_get_spy(struct net_device *dev, struct iw_request_info *info, - union iwreq_data *wrqu, char *extra); -/* Standard handler for SIOCSIWTHRSPY */ -int iw_handler_set_thrspy(struct net_device *dev, struct iw_request_info *info, - union iwreq_data *wrqu, char *extra); -/* Standard handler for SIOCGIWTHRSPY */ -int iw_handler_get_thrspy(struct net_device *dev, struct iw_request_info *info, - union iwreq_data *wrqu, char *extra); -/* Driver call to update spy records */ -void wireless_spy_update(struct net_device *dev, unsigned char *address, - struct iw_quality *wstats); - -/************************* INLINE FUNTIONS *************************/ +/************************* INLINE FUNCTIONS *************************/ /* * Function that are so simple that it's more efficient inlining them */ diff --git a/include/net/kcm.h b/include/net/kcm.h index 90279e5e09a5..d9c35e71ecea 100644 --- a/include/net/kcm.h +++ b/include/net/kcm.h @@ -70,7 +70,7 @@ struct kcm_sock { struct work_struct tx_work; struct list_head wait_psock_list; struct sk_buff *seq_skb; - u32 tx_stopped : 1; + struct mutex tx_mutex; /* Don't use bit fields here, these are set under different locks */ bool tx_wait; diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index 031c661aa14d..1eb8dad18f7e 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -59,6 +59,20 @@ int l3mdev_ifindex_lookup_by_table_id(enum l3mdev_type l3type, struct net *net, int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, struct fib_lookup_arg *arg); +static inline +bool l3mdev_fib_rule_iif_match(const struct flowi *fl, int iifindex) +{ + return !(fl->flowi_flags & FLOWI_FLAG_L3MDEV_OIF) && + fl->flowi_l3mdev == iifindex; +} + +static inline +bool l3mdev_fib_rule_oif_match(const struct flowi *fl, int oifindex) +{ + return fl->flowi_flags & FLOWI_FLAG_L3MDEV_OIF && + fl->flowi_l3mdev == oifindex; +} + void l3mdev_update_flow(struct net *net, struct flowi *fl); int l3mdev_master_ifindex_rcu(const struct net_device *dev); @@ -78,7 +92,7 @@ static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex) struct net_device *dev; int rc = 0; - if (likely(ifindex)) { + if (ifindex) { rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); @@ -198,10 +212,12 @@ struct sk_buff *l3mdev_l3_out(struct sock *sk, struct sk_buff *skb, u16 proto) if (netif_is_l3_slave(dev)) { struct net_device *master; + rcu_read_lock(); master = netdev_master_upper_dev_get_rcu(dev); if (master && master->l3mdev_ops->l3mdev_l3_out) skb = master->l3mdev_ops->l3mdev_l3_out(master, sk, skb, proto); + rcu_read_unlock(); } return skb; @@ -325,6 +341,19 @@ int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, { return 1; } + +static inline +bool l3mdev_fib_rule_iif_match(const struct flowi *fl, int iifindex) +{ + return false; +} + +static inline +bool l3mdev_fib_rule_oif_match(const struct flowi *fl, int oifindex) +{ + return false; +} + static inline void l3mdev_update_flow(struct net *net, struct flowi *fl) { diff --git a/include/net/lapb.h b/include/net/lapb.h index 124ee122f2c8..6c07420644e4 100644 --- a/include/net/lapb.h +++ b/include/net/lapb.h @@ -4,7 +4,7 @@ #include <linux/lapb.h> #include <linux/refcount.h> -#define LAPB_HEADER_LEN 20 /* LAPB over Ethernet + a bit more */ +#define LAPB_HEADER_LEN MAX_HEADER /* LAPB over Ethernet + a bit more */ #define LAPB_ACK_PENDING_CONDITION 0x01 #define LAPB_REJECT_CONDITION 0x02 diff --git a/include/net/lib80211.h b/include/net/lib80211.h deleted file mode 100644 index 8b47d3a51cf8..000000000000 --- a/include/net/lib80211.h +++ /dev/null @@ -1,122 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * lib80211.h -- common bits for IEEE802.11 wireless drivers - * - * Copyright (c) 2008, John W. Linville <linville@tuxdriver.com> - * - * Some bits copied from old ieee80211 component, w/ original copyright - * notices below: - * - * Original code based on Host AP (software wireless LAN access point) driver - * for Intersil Prism2/2.5/3. - * - * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen - * <j@w1.fi> - * Copyright (c) 2002-2003, Jouni Malinen <j@w1.fi> - * - * Adaption to a generic IEEE 802.11 stack by James Ketrenos - * <jketreno@linux.intel.com> - * - * Copyright (c) 2004, Intel Corporation - * - */ - -#ifndef LIB80211_H -#define LIB80211_H - -#include <linux/types.h> -#include <linux/list.h> -#include <linux/atomic.h> -#include <linux/if.h> -#include <linux/skbuff.h> -#include <linux/ieee80211.h> -#include <linux/timer.h> -#include <linux/seq_file.h> - -#define NUM_WEP_KEYS 4 - -enum { - IEEE80211_CRYPTO_TKIP_COUNTERMEASURES = (1 << 0), -}; - -struct module; - -struct lib80211_crypto_ops { - const char *name; - struct list_head list; - - /* init new crypto context (e.g., allocate private data space, - * select IV, etc.); returns NULL on failure or pointer to allocated - * private data on success */ - void *(*init) (int keyidx); - - /* deinitialize crypto context and free allocated private data */ - void (*deinit) (void *priv); - - /* encrypt/decrypt return < 0 on error or >= 0 on success. The return - * value from decrypt_mpdu is passed as the keyidx value for - * decrypt_msdu. skb must have enough head and tail room for the - * encryption; if not, error will be returned; these functions are - * called for all MPDUs (i.e., fragments). - */ - int (*encrypt_mpdu) (struct sk_buff * skb, int hdr_len, void *priv); - int (*decrypt_mpdu) (struct sk_buff * skb, int hdr_len, void *priv); - - /* These functions are called for full MSDUs, i.e. full frames. - * These can be NULL if full MSDU operations are not needed. */ - int (*encrypt_msdu) (struct sk_buff * skb, int hdr_len, void *priv); - int (*decrypt_msdu) (struct sk_buff * skb, int keyidx, int hdr_len, - void *priv); - - int (*set_key) (void *key, int len, u8 * seq, void *priv); - int (*get_key) (void *key, int len, u8 * seq, void *priv); - - /* procfs handler for printing out key information and possible - * statistics */ - void (*print_stats) (struct seq_file *m, void *priv); - - /* Crypto specific flag get/set for configuration settings */ - unsigned long (*get_flags) (void *priv); - unsigned long (*set_flags) (unsigned long flags, void *priv); - - /* maximum number of bytes added by encryption; encrypt buf is - * allocated with extra_prefix_len bytes, copy of in_buf, and - * extra_postfix_len; encrypt need not use all this space, but - * the result must start at the beginning of the buffer and correct - * length must be returned */ - int extra_mpdu_prefix_len, extra_mpdu_postfix_len; - int extra_msdu_prefix_len, extra_msdu_postfix_len; - - struct module *owner; -}; - -struct lib80211_crypt_data { - struct list_head list; /* delayed deletion list */ - struct lib80211_crypto_ops *ops; - void *priv; - atomic_t refcnt; -}; - -struct lib80211_crypt_info { - char *name; - /* Most clients will already have a lock, - so just point to that. */ - spinlock_t *lock; - - struct lib80211_crypt_data *crypt[NUM_WEP_KEYS]; - int tx_keyidx; /* default TX key index (crypt[tx_keyidx]) */ - struct list_head crypt_deinit_list; - struct timer_list crypt_deinit_timer; - int crypt_quiesced; -}; - -int lib80211_crypt_info_init(struct lib80211_crypt_info *info, char *name, - spinlock_t *lock); -void lib80211_crypt_info_free(struct lib80211_crypt_info *info); -int lib80211_register_crypto_ops(struct lib80211_crypto_ops *ops); -int lib80211_unregister_crypto_ops(struct lib80211_crypto_ops *ops); -struct lib80211_crypto_ops *lib80211_get_crypto_ops(const char *name); -void lib80211_crypt_delayed_deinit(struct lib80211_crypt_info *info, - struct lib80211_crypt_data **crypt); - -#endif /* LIB80211_H */ diff --git a/include/net/libeth/cache.h b/include/net/libeth/cache.h new file mode 100644 index 000000000000..bdb0c043ce61 --- /dev/null +++ b/include/net/libeth/cache.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (C) 2024 Intel Corporation */ + +#ifndef __LIBETH_CACHE_H +#define __LIBETH_CACHE_H + +#include <linux/cache.h> + +/** + * libeth_cacheline_group_assert - make sure cacheline group size is expected + * @type: type of the structure containing the group + * @grp: group name inside the struct + * @sz: expected group size + */ +#if defined(CONFIG_64BIT) && SMP_CACHE_BYTES == 64 +#define libeth_cacheline_group_assert(type, grp, sz) \ + static_assert(offsetof(type, __cacheline_group_end__##grp) - \ + offsetofend(type, __cacheline_group_begin__##grp) == \ + (sz)) +#define __libeth_cacheline_struct_assert(type, sz) \ + static_assert(sizeof(type) == (sz)) +#else /* !CONFIG_64BIT || SMP_CACHE_BYTES != 64 */ +#define libeth_cacheline_group_assert(type, grp, sz) \ + static_assert(offsetof(type, __cacheline_group_end__##grp) - \ + offsetofend(type, __cacheline_group_begin__##grp) <= \ + (sz)) +#define __libeth_cacheline_struct_assert(type, sz) \ + static_assert(sizeof(type) <= (sz)) +#endif /* !CONFIG_64BIT || SMP_CACHE_BYTES != 64 */ + +#define __libeth_cls1(sz1) SMP_CACHE_ALIGN(sz1) +#define __libeth_cls2(sz1, sz2) (SMP_CACHE_ALIGN(sz1) + SMP_CACHE_ALIGN(sz2)) +#define __libeth_cls3(sz1, sz2, sz3) \ + (SMP_CACHE_ALIGN(sz1) + SMP_CACHE_ALIGN(sz2) + SMP_CACHE_ALIGN(sz3)) +#define __libeth_cls(...) \ + CONCATENATE(__libeth_cls, COUNT_ARGS(__VA_ARGS__))(__VA_ARGS__) + +/** + * libeth_cacheline_struct_assert - make sure CL-based struct size is expected + * @type: type of the struct + * @...: from 1 to 3 CL group sizes (read-mostly, read-write, cold) + * + * When a struct contains several CL groups, it's difficult to predict its size + * on different architectures. The macro instead takes sizes of all of the + * groups the structure contains and generates the final struct size. + */ +#define libeth_cacheline_struct_assert(type, ...) \ + __libeth_cacheline_struct_assert(type, __libeth_cls(__VA_ARGS__)); \ + static_assert(__alignof(type) >= SMP_CACHE_BYTES) + +/** + * libeth_cacheline_set_assert - make sure CL-based struct layout is expected + * @type: type of the struct + * @ro: expected size of the read-mostly group + * @rw: expected size of the read-write group + * @c: expected size of the cold group + * + * Check that each group size is expected and then do final struct size check. + */ +#define libeth_cacheline_set_assert(type, ro, rw, c) \ + libeth_cacheline_group_assert(type, read_mostly, ro); \ + libeth_cacheline_group_assert(type, read_write, rw); \ + libeth_cacheline_group_assert(type, cold, c); \ + libeth_cacheline_struct_assert(type, ro, rw, c) + +#endif /* __LIBETH_CACHE_H */ diff --git a/include/net/libeth/rx.h b/include/net/libeth/rx.h new file mode 100644 index 000000000000..5d991404845e --- /dev/null +++ b/include/net/libeth/rx.h @@ -0,0 +1,314 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (C) 2024-2025 Intel Corporation */ + +#ifndef __LIBETH_RX_H +#define __LIBETH_RX_H + +#include <linux/if_vlan.h> + +#include <net/page_pool/helpers.h> +#include <net/xdp.h> + +/* Rx buffer management */ + +/* Space reserved in front of each frame */ +#define LIBETH_SKB_HEADROOM (NET_SKB_PAD + NET_IP_ALIGN) +#define LIBETH_XDP_HEADROOM (ALIGN(XDP_PACKET_HEADROOM, NET_SKB_PAD) + \ + NET_IP_ALIGN) +/* Maximum headroom for worst-case calculations */ +#define LIBETH_MAX_HEADROOM LIBETH_XDP_HEADROOM +/* Link layer / L2 overhead: Ethernet, 2 VLAN tags (C + S), FCS */ +#define LIBETH_RX_LL_LEN (ETH_HLEN + 2 * VLAN_HLEN + ETH_FCS_LEN) +/* Maximum supported L2-L4 header length */ +#define LIBETH_MAX_HEAD roundup_pow_of_two(max(MAX_HEADER, 256)) + +/* Always use order-0 pages */ +#define LIBETH_RX_PAGE_ORDER 0 +/* Pick a sane buffer stride and align to a cacheline boundary */ +#define LIBETH_RX_BUF_STRIDE SKB_DATA_ALIGN(128) +/* HW-writeable space in one buffer: truesize - headroom/tailroom, aligned */ +#define LIBETH_RX_PAGE_LEN(hr) \ + ALIGN_DOWN(SKB_MAX_ORDER(hr, LIBETH_RX_PAGE_ORDER), \ + LIBETH_RX_BUF_STRIDE) + +/** + * struct libeth_fqe - structure representing an Rx buffer (fill queue element) + * @netmem: network memory reference holding the buffer + * @offset: offset from the page start (to the headroom) + * @truesize: total space occupied by the buffer (w/ headroom and tailroom) + * + * Depending on the MTU, API switches between one-page-per-frame and shared + * page model (to conserve memory on bigger-page platforms). In case of the + * former, @offset is always 0 and @truesize is always ```PAGE_SIZE```. + */ +struct libeth_fqe { + netmem_ref netmem; + u32 offset; + u32 truesize; +} __aligned_largest; + +/** + * enum libeth_fqe_type - enum representing types of Rx buffers + * @LIBETH_FQE_MTU: buffer size is determined by MTU + * @LIBETH_FQE_SHORT: buffer size is smaller than MTU, for short frames + * @LIBETH_FQE_HDR: buffer size is ```LIBETH_MAX_HEAD```-sized, for headers + */ +enum libeth_fqe_type { + LIBETH_FQE_MTU = 0U, + LIBETH_FQE_SHORT, + LIBETH_FQE_HDR, +}; + +/** + * struct libeth_fq - structure representing a buffer (fill) queue + * @fp: hotpath part of the structure + * @pp: &page_pool for buffer management + * @fqes: array of Rx buffers + * @truesize: size to allocate per buffer, w/overhead + * @count: number of descriptors/buffers the queue has + * @type: type of the buffers this queue has + * @hsplit: flag whether header split is enabled + * @xdp: flag indicating whether XDP is enabled + * @buf_len: HW-writeable length per each buffer + * @nid: ID of the closest NUMA node with memory + */ +struct libeth_fq { + struct_group_tagged(libeth_fq_fp, fp, + struct page_pool *pp; + struct libeth_fqe *fqes; + + u32 truesize; + u32 count; + ); + + /* Cold fields */ + enum libeth_fqe_type type:2; + bool hsplit:1; + bool xdp:1; + + u32 buf_len; + int nid; +}; + +int libeth_rx_fq_create(struct libeth_fq *fq, struct napi_struct *napi); +void libeth_rx_fq_destroy(struct libeth_fq *fq); + +/** + * libeth_rx_alloc - allocate a new Rx buffer + * @fq: fill queue to allocate for + * @i: index of the buffer within the queue + * + * Return: DMA address to be passed to HW for Rx on successful allocation, + * ```DMA_MAPPING_ERROR``` otherwise. + */ +static inline dma_addr_t libeth_rx_alloc(const struct libeth_fq_fp *fq, u32 i) +{ + struct libeth_fqe *buf = &fq->fqes[i]; + + buf->truesize = fq->truesize; + buf->netmem = page_pool_dev_alloc_netmem(fq->pp, &buf->offset, + &buf->truesize); + if (unlikely(!buf->netmem)) + return DMA_MAPPING_ERROR; + + return page_pool_get_dma_addr_netmem(buf->netmem) + buf->offset + + fq->pp->p.offset; +} + +void libeth_rx_recycle_slow(netmem_ref netmem); + +/** + * libeth_rx_sync_for_cpu - synchronize or recycle buffer post DMA + * @fqe: buffer to process + * @len: frame length from the descriptor + * + * Process the buffer after it's written by HW. The regular path is to + * synchronize DMA for CPU, but in case of no data it will be immediately + * recycled back to its PP. + * + * Return: true when there's data to process, false otherwise. + */ +static inline bool libeth_rx_sync_for_cpu(const struct libeth_fqe *fqe, + u32 len) +{ + netmem_ref netmem = fqe->netmem; + + /* Very rare, but possible case. The most common reason: + * the last fragment contained FCS only, which was then + * stripped by the HW. + */ + if (unlikely(!len)) { + libeth_rx_recycle_slow(netmem); + return false; + } + + page_pool_dma_sync_netmem_for_cpu(netmem_get_pp(netmem), netmem, + fqe->offset, len); + + return true; +} + +/* Converting abstract packet type numbers into a software structure with + * the packet parameters to do O(1) lookup on Rx. + */ + +enum { + LIBETH_RX_PT_OUTER_L2 = 0U, + LIBETH_RX_PT_OUTER_IPV4, + LIBETH_RX_PT_OUTER_IPV6, +}; + +enum { + LIBETH_RX_PT_NOT_FRAG = 0U, + LIBETH_RX_PT_FRAG, +}; + +enum { + LIBETH_RX_PT_TUNNEL_IP_NONE = 0U, + LIBETH_RX_PT_TUNNEL_IP_IP, + LIBETH_RX_PT_TUNNEL_IP_GRENAT, + LIBETH_RX_PT_TUNNEL_IP_GRENAT_MAC, + LIBETH_RX_PT_TUNNEL_IP_GRENAT_MAC_VLAN, +}; + +enum { + LIBETH_RX_PT_TUNNEL_END_NONE = 0U, + LIBETH_RX_PT_TUNNEL_END_IPV4, + LIBETH_RX_PT_TUNNEL_END_IPV6, +}; + +enum { + LIBETH_RX_PT_INNER_NONE = 0U, + LIBETH_RX_PT_INNER_UDP, + LIBETH_RX_PT_INNER_TCP, + LIBETH_RX_PT_INNER_SCTP, + LIBETH_RX_PT_INNER_ICMP, + LIBETH_RX_PT_INNER_TIMESYNC, +}; + +#define LIBETH_RX_PT_PAYLOAD_NONE PKT_HASH_TYPE_NONE +#define LIBETH_RX_PT_PAYLOAD_L2 PKT_HASH_TYPE_L2 +#define LIBETH_RX_PT_PAYLOAD_L3 PKT_HASH_TYPE_L3 +#define LIBETH_RX_PT_PAYLOAD_L4 PKT_HASH_TYPE_L4 + +struct libeth_rx_pt { + u32 outer_ip:2; + u32 outer_frag:1; + u32 tunnel_type:3; + u32 tunnel_end_prot:2; + u32 tunnel_end_frag:1; + u32 inner_prot:3; + enum pkt_hash_types payload_layer:2; + + u32 pad:2; + enum xdp_rss_hash_type hash_type:16; +}; + +/** + * struct libeth_rx_csum - checksum offload bits decoded from the Rx descriptor + * @l3l4p: detectable L3 and L4 integrity check is processed by the hardware + * @ipe: IP checksum error + * @eipe: external (outermost) IP header (only for tunels) + * @eudpe: external (outermost) UDP checksum error (only for tunels) + * @ipv6exadd: IPv6 header with extension headers + * @l4e: L4 integrity error + * @pprs: set for packets that skip checksum calculation in the HW pre parser + * @nat: the packet is a UDP tunneled packet + * @raw_csum_valid: set if raw checksum is valid + * @pad: padding to naturally align raw_csum field + * @raw_csum: raw checksum + */ +struct libeth_rx_csum { + u32 l3l4p:1; + u32 ipe:1; + u32 eipe:1; + u32 eudpe:1; + u32 ipv6exadd:1; + u32 l4e:1; + u32 pprs:1; + u32 nat:1; + + u32 raw_csum_valid:1; + u32 pad:7; + u32 raw_csum:16; +}; + +/** + * struct libeth_rqe_info - receive queue element info + * @len: packet length + * @ptype: packet type based on types programmed into the device + * @eop: whether it's the last fragment of the packet + * @rxe: MAC errors: CRC, Alignment, Oversize, Undersizes, Length error + * @vlan: C-VLAN or S-VLAN tag depending on the VLAN offload configuration + */ +struct libeth_rqe_info { + u32 len; + + u32 ptype:14; + u32 eop:1; + u32 rxe:1; + + u32 vlan:16; +}; + +void libeth_rx_pt_gen_hash_type(struct libeth_rx_pt *pt); + +/** + * libeth_rx_pt_get_ip_ver - get IP version from a packet type structure + * @pt: packet type params + * + * Wrapper to compile out the IPv6 code from the drivers when not supported + * by the kernel. + * + * Return: @pt.outer_ip or stub for IPv6 when not compiled-in. + */ +static inline u32 libeth_rx_pt_get_ip_ver(struct libeth_rx_pt pt) +{ +#if !IS_ENABLED(CONFIG_IPV6) + switch (pt.outer_ip) { + case LIBETH_RX_PT_OUTER_IPV4: + return LIBETH_RX_PT_OUTER_IPV4; + default: + return LIBETH_RX_PT_OUTER_L2; + } +#else + return pt.outer_ip; +#endif +} + +/* libeth_has_*() can be used to quickly check whether the HW metadata is + * available to avoid further expensive processing such as descriptor reads. + * They already check for the corresponding netdev feature to be enabled, + * thus can be used as drop-in replacements. + */ + +static inline bool libeth_rx_pt_has_checksum(const struct net_device *dev, + struct libeth_rx_pt pt) +{ + /* Non-zero _INNER* is only possible when _OUTER_IPV* is set, + * it is enough to check only for the L4 type. + */ + return likely(pt.inner_prot > LIBETH_RX_PT_INNER_NONE && + (dev->features & NETIF_F_RXCSUM)); +} + +static inline bool libeth_rx_pt_has_hash(const struct net_device *dev, + struct libeth_rx_pt pt) +{ + return likely(pt.payload_layer > LIBETH_RX_PT_PAYLOAD_NONE && + (dev->features & NETIF_F_RXHASH)); +} + +/** + * libeth_rx_pt_set_hash - fill in skb hash value basing on the PT + * @skb: skb to fill the hash in + * @hash: 32-bit hash value from the descriptor + * @pt: packet type + */ +static inline void libeth_rx_pt_set_hash(struct sk_buff *skb, u32 hash, + struct libeth_rx_pt pt) +{ + skb_set_hash(skb, hash, pt.payload_layer); +} + +#endif /* __LIBETH_RX_H */ diff --git a/include/net/libeth/tx.h b/include/net/libeth/tx.h new file mode 100644 index 000000000000..c3db5c6f1641 --- /dev/null +++ b/include/net/libeth/tx.h @@ -0,0 +1,159 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (C) 2024-2025 Intel Corporation */ + +#ifndef __LIBETH_TX_H +#define __LIBETH_TX_H + +#include <linux/skbuff.h> + +#include <net/libeth/types.h> + +/* Tx buffer completion */ + +/** + * enum libeth_sqe_type - type of &libeth_sqe to act on Tx completion + * @LIBETH_SQE_EMPTY: unused/empty OR XDP_TX/XSk frame, no action required + * @LIBETH_SQE_CTX: context descriptor with empty SQE, no action required + * @LIBETH_SQE_SLAB: kmalloc-allocated buffer, unmap and kfree() + * @LIBETH_SQE_FRAG: mapped skb frag, only unmap DMA + * @LIBETH_SQE_SKB: &sk_buff, unmap and napi_consume_skb(), update stats + * @__LIBETH_SQE_XDP_START: separator between skb and XDP types + * @LIBETH_SQE_XDP_TX: &skb_shared_info, libeth_xdp_return_buff_bulk(), stats + * @LIBETH_SQE_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame_bulk(), stats + * @LIBETH_SQE_XDP_XMIT_FRAG: &xdp_frame frag, only unmap DMA + * @LIBETH_SQE_XSK_TX: &libeth_xdp_buff on XSk queue, xsk_buff_free(), stats + * @LIBETH_SQE_XSK_TX_FRAG: &libeth_xdp_buff frag on XSk queue, xsk_buff_free() + */ +enum libeth_sqe_type { + LIBETH_SQE_EMPTY = 0U, + LIBETH_SQE_CTX, + LIBETH_SQE_SLAB, + LIBETH_SQE_FRAG, + LIBETH_SQE_SKB, + + __LIBETH_SQE_XDP_START, + LIBETH_SQE_XDP_TX = __LIBETH_SQE_XDP_START, + LIBETH_SQE_XDP_XMIT, + LIBETH_SQE_XDP_XMIT_FRAG, + LIBETH_SQE_XSK_TX, + LIBETH_SQE_XSK_TX_FRAG, +}; + +/** + * struct libeth_sqe - represents a Send Queue Element / Tx buffer + * @type: type of the buffer, see the enum above + * @rs_idx: index of the last buffer from the batch this one was sent in + * @raw: slab buffer to free via kfree() + * @skb: &sk_buff to consume + * @sinfo: skb shared info of an XDP_TX frame + * @xdpf: XDP frame from ::ndo_xdp_xmit() + * @xsk: XSk Rx frame from XDP_TX action + * @dma: DMA address to unmap + * @len: length of the mapped region to unmap + * @nr_frags: number of frags in the frame this buffer belongs to + * @packets: number of physical packets sent for this frame + * @bytes: number of physical bytes sent for this frame + * @priv: driver-private scratchpad + */ +struct libeth_sqe { + enum libeth_sqe_type type:32; + u32 rs_idx; + + union { + void *raw; + struct sk_buff *skb; + struct skb_shared_info *sinfo; + struct xdp_frame *xdpf; + struct libeth_xdp_buff *xsk; + }; + + DEFINE_DMA_UNMAP_ADDR(dma); + DEFINE_DMA_UNMAP_LEN(len); + + u32 nr_frags; + u32 packets; + u32 bytes; + + unsigned long priv; +} __aligned_largest; + +/** + * LIBETH_SQE_CHECK_PRIV - check the driver's private SQE data + * @p: type or name of the object the driver wants to fit into &libeth_sqe + * + * Make sure the driver's private data fits into libeth_sqe::priv. To be used + * right after its declaration. + */ +#define LIBETH_SQE_CHECK_PRIV(p) \ + static_assert(sizeof(p) <= sizeof_field(struct libeth_sqe, priv)) + +/** + * struct libeth_cq_pp - completion queue poll params + * @dev: &device to perform DMA unmapping + * @bq: XDP frame bulk to combine return operations + * @ss: onstack NAPI stats to fill + * @xss: onstack XDPSQ NAPI stats to fill + * @xdp_tx: number of XDP-not-XSk frames processed + * @napi: whether it's called from the NAPI context + * + * libeth uses this structure to access objects needed for performing full + * Tx complete operation without passing lots of arguments and change the + * prototypes each time a new one is added. + */ +struct libeth_cq_pp { + struct device *dev; + struct xdp_frame_bulk *bq; + + union { + struct libeth_sq_napi_stats *ss; + struct libeth_xdpsq_napi_stats *xss; + }; + u32 xdp_tx; + + bool napi; +}; + +/** + * libeth_tx_complete - perform Tx completion for one SQE + * @sqe: SQE to complete + * @cp: poll params + * + * Do Tx complete for all the types of buffers, incl. freeing, unmapping, + * updating the stats etc. + */ +static inline void libeth_tx_complete(struct libeth_sqe *sqe, + const struct libeth_cq_pp *cp) +{ + switch (sqe->type) { + case LIBETH_SQE_EMPTY: + return; + case LIBETH_SQE_SKB: + case LIBETH_SQE_FRAG: + case LIBETH_SQE_SLAB: + dma_unmap_page(cp->dev, dma_unmap_addr(sqe, dma), + dma_unmap_len(sqe, len), DMA_TO_DEVICE); + break; + default: + break; + } + + switch (sqe->type) { + case LIBETH_SQE_SKB: + cp->ss->packets += sqe->packets; + cp->ss->bytes += sqe->bytes; + + napi_consume_skb(sqe->skb, cp->napi); + break; + case LIBETH_SQE_SLAB: + kfree(sqe->raw); + break; + default: + break; + } + + sqe->type = LIBETH_SQE_EMPTY; +} + +void libeth_tx_complete_any(struct libeth_sqe *sqe, struct libeth_cq_pp *cp); + +#endif /* __LIBETH_TX_H */ diff --git a/include/net/libeth/types.h b/include/net/libeth/types.h new file mode 100644 index 000000000000..cf1d78a9dc38 --- /dev/null +++ b/include/net/libeth/types.h @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (C) 2024-2025 Intel Corporation */ + +#ifndef __LIBETH_TYPES_H +#define __LIBETH_TYPES_H + +#include <linux/workqueue.h> + +/* Stats */ + +/** + * struct libeth_rq_napi_stats - "hot" counters to update in Rx polling loop + * @packets: received frames counter + * @bytes: sum of bytes of received frames above + * @fragments: sum of fragments of received S/G frames + * @hsplit: number of frames the device performed the header split for + * @raw: alias to access all the fields as an array + */ +struct libeth_rq_napi_stats { + union { + struct { + u32 packets; + u32 bytes; + u32 fragments; + u32 hsplit; + }; + DECLARE_FLEX_ARRAY(u32, raw); + }; +}; + +/** + * struct libeth_sq_napi_stats - "hot" counters to update in Tx completion loop + * @packets: completed frames counter + * @bytes: sum of bytes of completed frames above + * @raw: alias to access all the fields as an array + */ +struct libeth_sq_napi_stats { + union { + struct { + u32 packets; + u32 bytes; + }; + DECLARE_FLEX_ARRAY(u32, raw); + }; +}; + +/** + * struct libeth_xdpsq_napi_stats - "hot" counters to update in XDP Tx + * completion loop + * @packets: completed frames counter + * @bytes: sum of bytes of completed frames above + * @fragments: sum of fragments of completed S/G frames + * @raw: alias to access all the fields as an array + */ +struct libeth_xdpsq_napi_stats { + union { + struct { + u32 packets; + u32 bytes; + u32 fragments; + }; + DECLARE_FLEX_ARRAY(u32, raw); + }; +}; + +/* XDP */ + +/* + * The following structures should be embedded into driver's queue structure + * and passed to the libeth_xdp helpers, never used directly. + */ + +/* XDPSQ sharing */ + +/** + * struct libeth_xdpsq_lock - locking primitive for sharing XDPSQs + * @lock: spinlock for locking the queue + * @share: whether this particular queue is shared + */ +struct libeth_xdpsq_lock { + spinlock_t lock; + bool share; +}; + +/* XDPSQ clean-up timers */ + +/** + * struct libeth_xdpsq_timer - timer for cleaning up XDPSQs w/o interrupts + * @xdpsq: queue this timer belongs to + * @lock: lock for the queue + * @dwork: work performing cleanups + * + * XDPSQs not using interrupts but lazy cleaning, i.e. only when there's no + * space for sending the current queued frame/bulk, must fire up timers to + * make sure there are no stale buffers to free. + */ +struct libeth_xdpsq_timer { + void *xdpsq; + struct libeth_xdpsq_lock *lock; + + struct delayed_work dwork; +}; + +/* Rx polling path */ + +/** + * struct libeth_xdp_buff_stash - struct for stashing &xdp_buff onto a queue + * @data: pointer to the start of the frame, xdp_buff.data + * @headroom: frame headroom, xdp_buff.data - xdp_buff.data_hard_start + * @len: frame linear space length, xdp_buff.data_end - xdp_buff.data + * @frame_sz: truesize occupied by the frame, xdp_buff.frame_sz + * @flags: xdp_buff.flags + * + * &xdp_buff is 56 bytes long on x64, &libeth_xdp_buff is 64 bytes. This + * structure carries only necessary fields to save/restore a partially built + * frame on the queue structure to finish it during the next NAPI poll. + */ +struct libeth_xdp_buff_stash { + void *data; + u16 headroom; + u16 len; + + u32 frame_sz:24; + u32 flags:8; +} __aligned_largest; + +#endif /* __LIBETH_TYPES_H */ diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h new file mode 100644 index 000000000000..898723ab62e8 --- /dev/null +++ b/include/net/libeth/xdp.h @@ -0,0 +1,1870 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (C) 2025 Intel Corporation */ + +#ifndef __LIBETH_XDP_H +#define __LIBETH_XDP_H + +#include <linux/bpf_trace.h> +#include <linux/unroll.h> + +#include <net/libeth/rx.h> +#include <net/libeth/tx.h> +#include <net/xsk_buff_pool.h> + +/* + * Defined as bits to be able to use them as a mask on Rx. + * Also used as internal return values on Tx. + */ +enum { + LIBETH_XDP_PASS = 0U, + LIBETH_XDP_DROP = BIT(0), + LIBETH_XDP_ABORTED = BIT(1), + LIBETH_XDP_TX = BIT(2), + LIBETH_XDP_REDIRECT = BIT(3), +}; + +/* + * &xdp_buff_xsk is the largest structure &libeth_xdp_buff gets casted to, + * pick maximum pointer-compatible alignment. + */ +#define __LIBETH_XDP_BUFF_ALIGN \ + (IS_ALIGNED(sizeof(struct xdp_buff_xsk), 16) ? 16 : \ + IS_ALIGNED(sizeof(struct xdp_buff_xsk), 8) ? 8 : \ + sizeof(long)) + +/** + * struct libeth_xdp_buff - libeth extension over &xdp_buff + * @base: main &xdp_buff + * @data: shortcut for @base.data + * @desc: RQ descriptor containing metadata for this buffer + * @priv: driver-private scratchspace + * + * The main reason for this is to have a pointer to the descriptor to be able + * to quickly get frame metadata from xdpmo and driver buff-to-xdp callbacks + * (as well as bigger alignment). + * Pointer/layout-compatible with &xdp_buff and &xdp_buff_xsk. + */ +struct libeth_xdp_buff { + union { + struct xdp_buff base; + void *data; + }; + + const void *desc; + unsigned long priv[] + __aligned(__LIBETH_XDP_BUFF_ALIGN); +} __aligned(__LIBETH_XDP_BUFF_ALIGN); +static_assert(offsetof(struct libeth_xdp_buff, data) == + offsetof(struct xdp_buff_xsk, xdp.data)); +static_assert(offsetof(struct libeth_xdp_buff, desc) == + offsetof(struct xdp_buff_xsk, cb)); +static_assert(IS_ALIGNED(sizeof(struct xdp_buff_xsk), + __alignof(struct libeth_xdp_buff))); + +/** + * __LIBETH_XDP_ONSTACK_BUFF - declare a &libeth_xdp_buff on the stack + * @name: name of the variable to declare + * @...: sizeof() of the driver-private data + */ +#define __LIBETH_XDP_ONSTACK_BUFF(name, ...) \ + ___LIBETH_XDP_ONSTACK_BUFF(name, ##__VA_ARGS__) +/** + * LIBETH_XDP_ONSTACK_BUFF - declare a &libeth_xdp_buff on the stack + * @name: name of the variable to declare + * @...: type or variable name of the driver-private data + */ +#define LIBETH_XDP_ONSTACK_BUFF(name, ...) \ + __LIBETH_XDP_ONSTACK_BUFF(name, __libeth_xdp_priv_sz(__VA_ARGS__)) + +#define ___LIBETH_XDP_ONSTACK_BUFF(name, ...) \ + __DEFINE_FLEX(struct libeth_xdp_buff, name, priv, \ + LIBETH_XDP_PRIV_SZ(__VA_ARGS__ + 0), \ + __uninitialized); \ + LIBETH_XDP_ASSERT_PRIV_SZ(__VA_ARGS__ + 0) + +#define __libeth_xdp_priv_sz(...) \ + CONCATENATE(__libeth_xdp_psz, COUNT_ARGS(__VA_ARGS__))(__VA_ARGS__) + +#define __libeth_xdp_psz0(...) +#define __libeth_xdp_psz1(...) sizeof(__VA_ARGS__) + +#define LIBETH_XDP_PRIV_SZ(sz) \ + (ALIGN(sz, __alignof(struct libeth_xdp_buff)) / sizeof(long)) + +/* Performs XSK_CHECK_PRIV_TYPE() */ +#define LIBETH_XDP_ASSERT_PRIV_SZ(sz) \ + static_assert(offsetofend(struct xdp_buff_xsk, cb) >= \ + struct_size_t(struct libeth_xdp_buff, priv, \ + LIBETH_XDP_PRIV_SZ(sz))) + +/* XDPSQ sharing */ + +DECLARE_STATIC_KEY_FALSE(libeth_xdpsq_share); + +/** + * libeth_xdpsq_num - calculate optimal number of XDPSQs for this device + sys + * @rxq: current number of active Rx queues + * @txq: current number of active Tx queues + * @max: maximum number of Tx queues + * + * Each RQ must have its own XDPSQ for XSk pairs, each CPU must have own XDPSQ + * for lockless sending (``XDP_TX``, .ndo_xdp_xmit()). Cap the maximum of these + * two with the number of SQs the device can have (minus used ones). + * + * Return: number of XDP Tx queues the device needs to use. + */ +static inline u32 libeth_xdpsq_num(u32 rxq, u32 txq, u32 max) +{ + return min(max(nr_cpu_ids, rxq), max - txq); +} + +/** + * libeth_xdpsq_shared - whether XDPSQs can be shared between several CPUs + * @num: number of active XDPSQs + * + * Return: true if there's no 1:1 XDPSQ/CPU association, false otherwise. + */ +static inline bool libeth_xdpsq_shared(u32 num) +{ + return num < nr_cpu_ids; +} + +/** + * libeth_xdpsq_id - get XDPSQ index corresponding to this CPU + * @num: number of active XDPSQs + * + * Helper for libeth_xdp routines, do not use in drivers directly. + * + * Return: XDPSQ index needs to be used on this CPU. + */ +static inline u32 libeth_xdpsq_id(u32 num) +{ + u32 ret = raw_smp_processor_id(); + + if (static_branch_unlikely(&libeth_xdpsq_share) && + libeth_xdpsq_shared(num)) + ret %= num; + + return ret; +} + +void __libeth_xdpsq_get(struct libeth_xdpsq_lock *lock, + const struct net_device *dev); +void __libeth_xdpsq_put(struct libeth_xdpsq_lock *lock, + const struct net_device *dev); + +/** + * libeth_xdpsq_get - initialize &libeth_xdpsq_lock + * @lock: lock to initialize + * @dev: netdev which this lock belongs to + * @share: whether XDPSQs can be shared + * + * Tracks the current XDPSQ association and enables the static lock + * if needed. + */ +static inline void libeth_xdpsq_get(struct libeth_xdpsq_lock *lock, + const struct net_device *dev, + bool share) +{ + if (unlikely(share)) + __libeth_xdpsq_get(lock, dev); +} + +/** + * libeth_xdpsq_put - deinitialize &libeth_xdpsq_lock + * @lock: lock to deinitialize + * @dev: netdev which this lock belongs to + * + * Tracks the current XDPSQ association and disables the static lock + * if needed. + */ +static inline void libeth_xdpsq_put(struct libeth_xdpsq_lock *lock, + const struct net_device *dev) +{ + if (static_branch_unlikely(&libeth_xdpsq_share) && lock->share) + __libeth_xdpsq_put(lock, dev); +} + +void __libeth_xdpsq_lock(struct libeth_xdpsq_lock *lock); +void __libeth_xdpsq_unlock(struct libeth_xdpsq_lock *lock); + +/** + * libeth_xdpsq_lock - grab &libeth_xdpsq_lock if needed + * @lock: lock to take + * + * Touches the underlying spinlock only if the static key is enabled + * and the queue itself is marked as shareable. + */ +static inline void libeth_xdpsq_lock(struct libeth_xdpsq_lock *lock) +{ + if (static_branch_unlikely(&libeth_xdpsq_share) && lock->share) + __libeth_xdpsq_lock(lock); +} + +/** + * libeth_xdpsq_unlock - free &libeth_xdpsq_lock if needed + * @lock: lock to free + * + * Touches the underlying spinlock only if the static key is enabled + * and the queue itself is marked as shareable. + */ +static inline void libeth_xdpsq_unlock(struct libeth_xdpsq_lock *lock) +{ + if (static_branch_unlikely(&libeth_xdpsq_share) && lock->share) + __libeth_xdpsq_unlock(lock); +} + +/* XDPSQ clean-up timers */ + +void libeth_xdpsq_init_timer(struct libeth_xdpsq_timer *timer, void *xdpsq, + struct libeth_xdpsq_lock *lock, + void (*poll)(struct work_struct *work)); + +/** + * libeth_xdpsq_deinit_timer - deinitialize &libeth_xdpsq_timer + * @timer: timer to deinitialize + * + * Flush and disable the underlying workqueue. + */ +static inline void libeth_xdpsq_deinit_timer(struct libeth_xdpsq_timer *timer) +{ + cancel_delayed_work_sync(&timer->dwork); +} + +/** + * libeth_xdpsq_queue_timer - run &libeth_xdpsq_timer + * @timer: timer to queue + * + * Should be called after the queue was filled and the transmission was run + * to complete the pending buffers if no further sending will be done in a + * second (-> lazy cleaning won't happen). + * If the timer was already run, it will be requeued back to one second + * timeout again. + */ +static inline void libeth_xdpsq_queue_timer(struct libeth_xdpsq_timer *timer) +{ + mod_delayed_work_on(raw_smp_processor_id(), system_bh_highpri_wq, + &timer->dwork, HZ); +} + +/** + * libeth_xdpsq_run_timer - wrapper to run a queue clean-up on a timer event + * @work: workqueue belonging to the corresponding timer + * @poll: driver-specific completion queue poll function + * + * Run the polling function on the locked queue and requeue the timer if + * there's more work to do. + * Designed to be used via LIBETH_XDP_DEFINE_TIMER() below. + */ +static __always_inline void +libeth_xdpsq_run_timer(struct work_struct *work, + u32 (*poll)(void *xdpsq, u32 budget)) +{ + struct libeth_xdpsq_timer *timer = container_of(work, typeof(*timer), + dwork.work); + + libeth_xdpsq_lock(timer->lock); + + if (poll(timer->xdpsq, U32_MAX)) + libeth_xdpsq_queue_timer(timer); + + libeth_xdpsq_unlock(timer->lock); +} + +/* Common Tx bits */ + +/** + * enum - libeth_xdp internal Tx flags + * @LIBETH_XDP_TX_BULK: one bulk size at which it will be flushed to the queue + * @LIBETH_XDP_TX_BATCH: batch size for which the queue fill loop is unrolled + * @LIBETH_XDP_TX_DROP: indicates the send function must drop frames not sent + * @LIBETH_XDP_TX_NDO: whether the send function is called from .ndo_xdp_xmit() + * @LIBETH_XDP_TX_XSK: whether the function is called for ``XDP_TX`` for XSk + */ +enum { + LIBETH_XDP_TX_BULK = DEV_MAP_BULK_SIZE, + LIBETH_XDP_TX_BATCH = 8, + + LIBETH_XDP_TX_DROP = BIT(0), + LIBETH_XDP_TX_NDO = BIT(1), + LIBETH_XDP_TX_XSK = BIT(2), +}; + +/** + * enum - &libeth_xdp_tx_frame and &libeth_xdp_tx_desc flags + * @LIBETH_XDP_TX_LEN: only for ``XDP_TX``, [15:0] of ::len_fl is actual length + * @LIBETH_XDP_TX_CSUM: for XSk xmit, enable checksum offload + * @LIBETH_XDP_TX_XSKMD: for XSk xmit, mask of the metadata bits + * @LIBETH_XDP_TX_FIRST: indicates the frag is the first one of the frame + * @LIBETH_XDP_TX_LAST: whether the frag is the last one of the frame + * @LIBETH_XDP_TX_MULTI: whether the frame contains several frags + * @LIBETH_XDP_TX_FLAGS: only for ``XDP_TX``, [31:16] of ::len_fl is flags + */ +enum { + LIBETH_XDP_TX_LEN = GENMASK(15, 0), + + LIBETH_XDP_TX_CSUM = XDP_TXMD_FLAGS_CHECKSUM, + LIBETH_XDP_TX_XSKMD = LIBETH_XDP_TX_LEN, + + LIBETH_XDP_TX_FIRST = BIT(16), + LIBETH_XDP_TX_LAST = BIT(17), + LIBETH_XDP_TX_MULTI = BIT(18), + + LIBETH_XDP_TX_FLAGS = GENMASK(31, 16), +}; + +/** + * struct libeth_xdp_tx_frame - represents one XDP Tx element + * @data: frame start pointer for ``XDP_TX`` + * @len_fl: ``XDP_TX``, combined flags [31:16] and len [15:0] field for speed + * @soff: ``XDP_TX``, offset from @data to the start of &skb_shared_info + * @frag: one (non-head) frag for ``XDP_TX`` + * @xdpf: &xdp_frame for the head frag for .ndo_xdp_xmit() + * @dma: DMA address of the non-head frag for .ndo_xdp_xmit() + * @xsk: ``XDP_TX`` for XSk, XDP buffer for any frag + * @len: frag length for XSk ``XDP_TX`` and .ndo_xdp_xmit() + * @flags: Tx flags for the above + * @opts: combined @len + @flags for the above for speed + * @desc: XSk xmit descriptor for direct casting + */ +struct libeth_xdp_tx_frame { + union { + /* ``XDP_TX`` */ + struct { + void *data; + u32 len_fl; + u32 soff; + }; + + /* ``XDP_TX`` frag */ + skb_frag_t frag; + + /* .ndo_xdp_xmit(), XSk ``XDP_TX`` */ + struct { + union { + struct xdp_frame *xdpf; + dma_addr_t dma; + + struct libeth_xdp_buff *xsk; + }; + union { + struct { + u32 len; + u32 flags; + }; + aligned_u64 opts; + }; + }; + + /* XSk xmit */ + struct xdp_desc desc; + }; +} __aligned(sizeof(struct xdp_desc)); +static_assert(offsetof(struct libeth_xdp_tx_frame, frag.len) == + offsetof(struct libeth_xdp_tx_frame, len_fl)); +static_assert(sizeof(struct libeth_xdp_tx_frame) == sizeof(struct xdp_desc)); + +/** + * struct libeth_xdp_tx_bulk - XDP Tx frame bulk for bulk sending + * @prog: corresponding active XDP program, %NULL for .ndo_xdp_xmit() + * @dev: &net_device which the frames are transmitted on + * @xdpsq: shortcut to the corresponding driver-specific XDPSQ structure + * @act_mask: Rx only, mask of all the XDP prog verdicts for that NAPI session + * @count: current number of frames in @bulk + * @bulk: array of queued frames for bulk Tx + * + * All XDP Tx operations except XSk xmit queue each frame to the bulk first + * and flush it when @count reaches the array end. Bulk is always placed on + * the stack for performance. One bulk element contains all the data necessary + * for sending a frame and then freeing it on completion. + * For XSk xmit, Tx descriptor array from &xsk_buff_pool is casted directly + * to &libeth_xdp_tx_frame as they are compatible and the bulk structure is + * not used. + */ +struct libeth_xdp_tx_bulk { + const struct bpf_prog *prog; + struct net_device *dev; + void *xdpsq; + + u32 act_mask; + u32 count; + struct libeth_xdp_tx_frame bulk[LIBETH_XDP_TX_BULK]; +} __aligned(sizeof(struct libeth_xdp_tx_frame)); + +/** + * LIBETH_XDP_ONSTACK_BULK - declare &libeth_xdp_tx_bulk on the stack + * @bq: name of the variable to declare + * + * Helper to declare a bulk on the stack with a compiler hint that it should + * not be initialized automatically (with `CONFIG_INIT_STACK_ALL_*`) for + * performance reasons. + */ +#define LIBETH_XDP_ONSTACK_BULK(bq) \ + struct libeth_xdp_tx_bulk bq __uninitialized + +/** + * struct libeth_xdpsq - abstraction for an XDPSQ + * @pool: XSk buffer pool for XSk ``XDP_TX`` and xmit + * @sqes: array of Tx buffers from the actual queue struct + * @descs: opaque pointer to the HW descriptor array + * @ntu: pointer to the next free descriptor index + * @count: number of descriptors on that queue + * @pending: pointer to the number of sent-not-completed descs on that queue + * @xdp_tx: pointer to the above, but only for non-XSk-xmit frames + * @lock: corresponding XDPSQ lock + * + * Abstraction for driver-independent implementation of Tx. Placed on the stack + * and filled by the driver before the transmission, so that the generic + * functions can access and modify driver-specific resources. + */ +struct libeth_xdpsq { + struct xsk_buff_pool *pool; + struct libeth_sqe *sqes; + void *descs; + + u32 *ntu; + u32 count; + + u32 *pending; + u32 *xdp_tx; + struct libeth_xdpsq_lock *lock; +}; + +/** + * struct libeth_xdp_tx_desc - abstraction for an XDP Tx descriptor + * @addr: DMA address of the frame + * @len: length of the frame + * @flags: XDP Tx flags + * @opts: combined @len + @flags for speed + * + * Filled by the generic functions and then passed to driver-specific functions + * to fill a HW Tx descriptor, always placed on the [function] stack. + */ +struct libeth_xdp_tx_desc { + dma_addr_t addr; + union { + struct { + u32 len; + u32 flags; + }; + aligned_u64 opts; + }; +} __aligned_largest; + +/** + * libeth_xdp_ptr_to_priv - convert pointer to a libeth_xdp u64 priv + * @ptr: pointer to convert + * + * The main sending function passes private data as the largest scalar, u64. + * Use this helper when you want to pass a pointer there. + */ +#define libeth_xdp_ptr_to_priv(ptr) ({ \ + typecheck_pointer(ptr); \ + ((u64)(uintptr_t)(ptr)); \ +}) +/** + * libeth_xdp_priv_to_ptr - convert libeth_xdp u64 priv to a pointer + * @priv: private data to convert + * + * The main sending function passes private data as the largest scalar, u64. + * Use this helper when your callback takes this u64 and you want to convert + * it back to a pointer. + */ +#define libeth_xdp_priv_to_ptr(priv) ({ \ + static_assert(__same_type(priv, u64)); \ + ((const void *)(uintptr_t)(priv)); \ +}) + +/* + * On 64-bit systems, assigning one u64 is faster than two u32s. When ::len + * occupies lowest 32 bits (LE), whole ::opts can be assigned directly instead. + */ +#ifdef __LITTLE_ENDIAN +#define __LIBETH_WORD_ACCESS 1 +#endif +#ifdef __LIBETH_WORD_ACCESS +#define __libeth_xdp_tx_len(flen, ...) \ + .opts = ((flen) | FIELD_PREP(GENMASK_ULL(63, 32), (__VA_ARGS__ + 0))) +#else +#define __libeth_xdp_tx_len(flen, ...) \ + .len = (flen), .flags = (__VA_ARGS__ + 0) +#endif + +/** + * libeth_xdp_tx_xmit_bulk - main XDP Tx function + * @bulk: array of frames to send + * @xdpsq: pointer to the driver-specific XDPSQ struct + * @n: number of frames to send + * @unroll: whether to unroll the queue filling loop for speed + * @priv: driver-specific private data + * @prep: callback for cleaning the queue and filling abstract &libeth_xdpsq + * @fill: internal callback for filling &libeth_sqe and &libeth_xdp_tx_desc + * @xmit: callback for filling a HW descriptor with the frame info + * + * Internal abstraction for placing @n XDP Tx frames on the HW XDPSQ. Used for + * all types of frames: ``XDP_TX``, .ndo_xdp_xmit(), XSk ``XDP_TX``, and XSk + * xmit. + * @prep must lock the queue as this function releases it at the end. @unroll + * greatly increases the object code size, but also greatly increases XSk xmit + * performance; for other types of frames, it's not enabled. + * The compilers inline all those onstack abstractions to direct data accesses. + * + * Return: number of frames actually placed on the queue, <= @n. The function + * can't fail, but can send less frames if there's no enough free descriptors + * available. The actual free space is returned by @prep from the driver. + */ +static __always_inline __nocfi_generic u32 +libeth_xdp_tx_xmit_bulk(const struct libeth_xdp_tx_frame *bulk, void *xdpsq, + u32 n, bool unroll, u64 priv, + u32 (*prep)(void *xdpsq, struct libeth_xdpsq *sq), + struct libeth_xdp_tx_desc + (*fill)(struct libeth_xdp_tx_frame frm, u32 i, + const struct libeth_xdpsq *sq, u64 priv), + void (*xmit)(struct libeth_xdp_tx_desc desc, u32 i, + const struct libeth_xdpsq *sq, u64 priv)) +{ + struct libeth_xdpsq sq __uninitialized; + u32 this, batched, off = 0; + u32 ntu, i = 0; + + n = min(n, prep(xdpsq, &sq)); + if (unlikely(!n)) + goto unlock; + + ntu = *sq.ntu; + + this = sq.count - ntu; + if (likely(this > n)) + this = n; + +again: + if (!unroll) + goto linear; + + batched = ALIGN_DOWN(this, LIBETH_XDP_TX_BATCH); + + for ( ; i < off + batched; i += LIBETH_XDP_TX_BATCH) { + u32 base = ntu + i - off; + + unrolled_count(LIBETH_XDP_TX_BATCH) + for (u32 j = 0; j < LIBETH_XDP_TX_BATCH; j++) + xmit(fill(bulk[i + j], base + j, &sq, priv), + base + j, &sq, priv); + } + + if (batched < this) { +linear: + for ( ; i < off + this; i++) + xmit(fill(bulk[i], ntu + i - off, &sq, priv), + ntu + i - off, &sq, priv); + } + + ntu += this; + if (likely(ntu < sq.count)) + goto out; + + ntu = 0; + + if (i < n) { + this = n - i; + off = i; + + goto again; + } + +out: + *sq.ntu = ntu; + *sq.pending += n; + if (sq.xdp_tx) + *sq.xdp_tx += n; + +unlock: + libeth_xdpsq_unlock(sq.lock); + + return n; +} + +/* ``XDP_TX`` bulking */ + +void libeth_xdp_return_buff_slow(struct libeth_xdp_buff *xdp); + +/** + * libeth_xdp_tx_queue_head - internal helper for queueing one ``XDP_TX`` head + * @bq: XDP Tx bulk to queue the head frag to + * @xdp: XDP buffer with the head to queue + * + * Return: false if it's the only frag of the frame, true if it's an S/G frame. + */ +static inline bool libeth_xdp_tx_queue_head(struct libeth_xdp_tx_bulk *bq, + const struct libeth_xdp_buff *xdp) +{ + const struct xdp_buff *base = &xdp->base; + + bq->bulk[bq->count++] = (typeof(*bq->bulk)){ + .data = xdp->data, + .len_fl = (base->data_end - xdp->data) | LIBETH_XDP_TX_FIRST, + .soff = xdp_data_hard_end(base) - xdp->data, + }; + + if (!xdp_buff_has_frags(base)) + return false; + + bq->bulk[bq->count - 1].len_fl |= LIBETH_XDP_TX_MULTI; + + return true; +} + +/** + * libeth_xdp_tx_queue_frag - internal helper for queueing one ``XDP_TX`` frag + * @bq: XDP Tx bulk to queue the frag to + * @frag: frag to queue + */ +static inline void libeth_xdp_tx_queue_frag(struct libeth_xdp_tx_bulk *bq, + const skb_frag_t *frag) +{ + bq->bulk[bq->count++].frag = *frag; +} + +/** + * libeth_xdp_tx_queue_bulk - internal helper for queueing one ``XDP_TX`` frame + * @bq: XDP Tx bulk to queue the frame to + * @xdp: XDP buffer to queue + * @flush_bulk: driver callback to flush the bulk to the HW queue + * + * Return: true on success, false on flush error. + */ +static __always_inline bool +libeth_xdp_tx_queue_bulk(struct libeth_xdp_tx_bulk *bq, + struct libeth_xdp_buff *xdp, + bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq, + u32 flags)) +{ + const struct skb_shared_info *sinfo; + bool ret = true; + u32 nr_frags; + + if (unlikely(bq->count == LIBETH_XDP_TX_BULK) && + unlikely(!flush_bulk(bq, 0))) { + libeth_xdp_return_buff_slow(xdp); + return false; + } + + if (!libeth_xdp_tx_queue_head(bq, xdp)) + goto out; + + sinfo = xdp_get_shared_info_from_buff(&xdp->base); + nr_frags = sinfo->nr_frags; + + for (u32 i = 0; i < nr_frags; i++) { + if (unlikely(bq->count == LIBETH_XDP_TX_BULK) && + unlikely(!flush_bulk(bq, 0))) { + ret = false; + break; + } + + libeth_xdp_tx_queue_frag(bq, &sinfo->frags[i]); + } + +out: + bq->bulk[bq->count - 1].len_fl |= LIBETH_XDP_TX_LAST; + xdp->data = NULL; + + return ret; +} + +/** + * libeth_xdp_tx_fill_stats - fill &libeth_sqe with ``XDP_TX`` frame stats + * @sqe: SQ element to fill + * @desc: libeth_xdp Tx descriptor + * @sinfo: &skb_shared_info for this frame + * + * Internal helper for filling an SQE with the frame stats, do not use in + * drivers. Fills the number of frags and bytes for this frame. + */ +#define libeth_xdp_tx_fill_stats(sqe, desc, sinfo) \ + __libeth_xdp_tx_fill_stats(sqe, desc, sinfo, __UNIQUE_ID(sqe_), \ + __UNIQUE_ID(desc_), __UNIQUE_ID(sinfo_)) + +#define __libeth_xdp_tx_fill_stats(sqe, desc, sinfo, ue, ud, us) do { \ + const struct libeth_xdp_tx_desc *ud = (desc); \ + const struct skb_shared_info *us; \ + struct libeth_sqe *ue = (sqe); \ + \ + ue->nr_frags = 1; \ + ue->bytes = ud->len; \ + \ + if (ud->flags & LIBETH_XDP_TX_MULTI) { \ + us = (sinfo); \ + ue->nr_frags += us->nr_frags; \ + ue->bytes += us->xdp_frags_size; \ + } \ +} while (0) + +/** + * libeth_xdp_tx_fill_buf - internal helper to fill one ``XDP_TX`` &libeth_sqe + * @frm: XDP Tx frame from the bulk + * @i: index on the HW queue + * @sq: XDPSQ abstraction for the queue + * @priv: private data + * + * Return: XDP Tx descriptor with the synced DMA and other info to pass to + * the driver callback. + */ +static inline struct libeth_xdp_tx_desc +libeth_xdp_tx_fill_buf(struct libeth_xdp_tx_frame frm, u32 i, + const struct libeth_xdpsq *sq, u64 priv) +{ + struct libeth_xdp_tx_desc desc; + struct skb_shared_info *sinfo; + skb_frag_t *frag = &frm.frag; + struct libeth_sqe *sqe; + netmem_ref netmem; + + if (frm.len_fl & LIBETH_XDP_TX_FIRST) { + sinfo = frm.data + frm.soff; + skb_frag_fill_netmem_desc(frag, virt_to_netmem(frm.data), + offset_in_page(frm.data), + frm.len_fl); + } else { + sinfo = NULL; + } + + netmem = skb_frag_netmem(frag); + desc = (typeof(desc)){ + .addr = page_pool_get_dma_addr_netmem(netmem) + + skb_frag_off(frag), + .len = skb_frag_size(frag) & LIBETH_XDP_TX_LEN, + .flags = skb_frag_size(frag) & LIBETH_XDP_TX_FLAGS, + }; + + dma_sync_single_for_device(__netmem_get_pp(netmem)->p.dev, desc.addr, + desc.len, DMA_BIDIRECTIONAL); + + if (!sinfo) + return desc; + + sqe = &sq->sqes[i]; + sqe->type = LIBETH_SQE_XDP_TX; + sqe->sinfo = sinfo; + libeth_xdp_tx_fill_stats(sqe, &desc, sinfo); + + return desc; +} + +void libeth_xdp_tx_exception(struct libeth_xdp_tx_bulk *bq, u32 sent, + u32 flags); + +/** + * __libeth_xdp_tx_flush_bulk - internal helper to flush one XDP Tx bulk + * @bq: bulk to flush + * @flags: XDP TX flags (.ndo_xdp_xmit(), XSk etc.) + * @prep: driver-specific callback to prepare the queue for sending + * @fill: libeth_xdp callback to fill &libeth_sqe and &libeth_xdp_tx_desc + * @xmit: driver callback to fill a HW descriptor + * + * Internal abstraction to create bulk flush functions for drivers. Used for + * everything except XSk xmit. + * + * Return: true if anything was sent, false otherwise. + */ +static __always_inline bool +__libeth_xdp_tx_flush_bulk(struct libeth_xdp_tx_bulk *bq, u32 flags, + u32 (*prep)(void *xdpsq, struct libeth_xdpsq *sq), + struct libeth_xdp_tx_desc + (*fill)(struct libeth_xdp_tx_frame frm, u32 i, + const struct libeth_xdpsq *sq, u64 priv), + void (*xmit)(struct libeth_xdp_tx_desc desc, u32 i, + const struct libeth_xdpsq *sq, + u64 priv)) +{ + u32 sent, drops; + int err = 0; + + sent = libeth_xdp_tx_xmit_bulk(bq->bulk, bq->xdpsq, + min(bq->count, LIBETH_XDP_TX_BULK), + false, 0, prep, fill, xmit); + drops = bq->count - sent; + + if (unlikely(drops)) { + libeth_xdp_tx_exception(bq, sent, flags); + err = -ENXIO; + } else { + bq->count = 0; + } + + trace_xdp_bulk_tx(bq->dev, sent, drops, err); + + return likely(sent); +} + +/** + * libeth_xdp_tx_flush_bulk - wrapper to define flush of one ``XDP_TX`` bulk + * @bq: bulk to flush + * @flags: Tx flags, see above + * @prep: driver callback to prepare the queue + * @xmit: driver callback to fill a HW descriptor + * + * Use via LIBETH_XDP_DEFINE_FLUSH_TX() to define an ``XDP_TX`` driver + * callback. + */ +#define libeth_xdp_tx_flush_bulk(bq, flags, prep, xmit) \ + __libeth_xdp_tx_flush_bulk(bq, flags, prep, libeth_xdp_tx_fill_buf, \ + xmit) + +/* .ndo_xdp_xmit() implementation */ + +/** + * libeth_xdp_xmit_init_bulk - internal helper to initialize bulk for XDP xmit + * @bq: bulk to initialize + * @dev: target &net_device + * @xdpsqs: array of driver-specific XDPSQ structs + * @num: number of active XDPSQs (the above array length) + */ +#define libeth_xdp_xmit_init_bulk(bq, dev, xdpsqs, num) \ + __libeth_xdp_xmit_init_bulk(bq, dev, (xdpsqs)[libeth_xdpsq_id(num)]) + +static inline void __libeth_xdp_xmit_init_bulk(struct libeth_xdp_tx_bulk *bq, + struct net_device *dev, + void *xdpsq) +{ + bq->dev = dev; + bq->xdpsq = xdpsq; + bq->count = 0; +} + +/** + * libeth_xdp_xmit_frame_dma - internal helper to access DMA of an &xdp_frame + * @xf: pointer to the XDP frame + * + * There's no place in &libeth_xdp_tx_frame to store DMA address for an + * &xdp_frame head. The headroom is used then, the address is placed right + * after the frame struct, naturally aligned. + * + * Return: pointer to the DMA address to use. + */ +#define libeth_xdp_xmit_frame_dma(xf) \ + _Generic((xf), \ + const struct xdp_frame *: \ + (const dma_addr_t *)__libeth_xdp_xmit_frame_dma(xf), \ + struct xdp_frame *: \ + (dma_addr_t *)__libeth_xdp_xmit_frame_dma(xf) \ + ) + +static inline void *__libeth_xdp_xmit_frame_dma(const struct xdp_frame *xdpf) +{ + void *addr = (void *)(xdpf + 1); + + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && + __alignof(*xdpf) < sizeof(dma_addr_t)) + addr = PTR_ALIGN(addr, sizeof(dma_addr_t)); + + return addr; +} + +/** + * libeth_xdp_xmit_queue_head - internal helper for queueing one XDP xmit head + * @bq: XDP Tx bulk to queue the head frag to + * @xdpf: XDP frame with the head to queue + * @dev: device to perform DMA mapping + * + * Return: ``LIBETH_XDP_DROP`` on DMA mapping error, + * ``LIBETH_XDP_PASS`` if it's the only frag in the frame, + * ``LIBETH_XDP_TX`` if it's an S/G frame. + */ +static inline u32 libeth_xdp_xmit_queue_head(struct libeth_xdp_tx_bulk *bq, + struct xdp_frame *xdpf, + struct device *dev) +{ + dma_addr_t dma; + + dma = dma_map_single(dev, xdpf->data, xdpf->len, DMA_TO_DEVICE); + if (dma_mapping_error(dev, dma)) + return LIBETH_XDP_DROP; + + *libeth_xdp_xmit_frame_dma(xdpf) = dma; + + bq->bulk[bq->count++] = (typeof(*bq->bulk)){ + .xdpf = xdpf, + __libeth_xdp_tx_len(xdpf->len, LIBETH_XDP_TX_FIRST), + }; + + if (!xdp_frame_has_frags(xdpf)) + return LIBETH_XDP_PASS; + + bq->bulk[bq->count - 1].flags |= LIBETH_XDP_TX_MULTI; + + return LIBETH_XDP_TX; +} + +/** + * libeth_xdp_xmit_queue_frag - internal helper for queueing one XDP xmit frag + * @bq: XDP Tx bulk to queue the frag to + * @frag: frag to queue + * @dev: device to perform DMA mapping + * + * Return: true on success, false on DMA mapping error. + */ +static inline bool libeth_xdp_xmit_queue_frag(struct libeth_xdp_tx_bulk *bq, + const skb_frag_t *frag, + struct device *dev) +{ + dma_addr_t dma; + + dma = skb_frag_dma_map(dev, frag); + if (dma_mapping_error(dev, dma)) + return false; + + bq->bulk[bq->count++] = (typeof(*bq->bulk)){ + .dma = dma, + __libeth_xdp_tx_len(skb_frag_size(frag)), + }; + + return true; +} + +/** + * libeth_xdp_xmit_queue_bulk - internal helper for queueing one XDP xmit frame + * @bq: XDP Tx bulk to queue the frame to + * @xdpf: XDP frame to queue + * @flush_bulk: driver callback to flush the bulk to the HW queue + * + * Return: ``LIBETH_XDP_TX`` on success, + * ``LIBETH_XDP_DROP`` if the frame should be dropped by the stack, + * ``LIBETH_XDP_ABORTED`` if the frame will be dropped by libeth_xdp. + */ +static __always_inline u32 +libeth_xdp_xmit_queue_bulk(struct libeth_xdp_tx_bulk *bq, + struct xdp_frame *xdpf, + bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq, + u32 flags)) +{ + u32 head, nr_frags, i, ret = LIBETH_XDP_TX; + struct device *dev = bq->dev->dev.parent; + const struct skb_shared_info *sinfo; + + if (unlikely(bq->count == LIBETH_XDP_TX_BULK) && + unlikely(!flush_bulk(bq, LIBETH_XDP_TX_NDO))) + return LIBETH_XDP_DROP; + + head = libeth_xdp_xmit_queue_head(bq, xdpf, dev); + if (head == LIBETH_XDP_PASS) + goto out; + else if (head == LIBETH_XDP_DROP) + return LIBETH_XDP_DROP; + + sinfo = xdp_get_shared_info_from_frame(xdpf); + nr_frags = sinfo->nr_frags; + + for (i = 0; i < nr_frags; i++) { + if (unlikely(bq->count == LIBETH_XDP_TX_BULK) && + unlikely(!flush_bulk(bq, LIBETH_XDP_TX_NDO))) + break; + + if (!libeth_xdp_xmit_queue_frag(bq, &sinfo->frags[i], dev)) + break; + } + + if (unlikely(i < nr_frags)) + ret = LIBETH_XDP_ABORTED; + +out: + bq->bulk[bq->count - 1].flags |= LIBETH_XDP_TX_LAST; + + return ret; +} + +/** + * libeth_xdp_xmit_fill_buf - internal helper to fill one XDP xmit &libeth_sqe + * @frm: XDP Tx frame from the bulk + * @i: index on the HW queue + * @sq: XDPSQ abstraction for the queue + * @priv: private data + * + * Return: XDP Tx descriptor with the mapped DMA and other info to pass to + * the driver callback. + */ +static inline struct libeth_xdp_tx_desc +libeth_xdp_xmit_fill_buf(struct libeth_xdp_tx_frame frm, u32 i, + const struct libeth_xdpsq *sq, u64 priv) +{ + struct libeth_xdp_tx_desc desc; + struct libeth_sqe *sqe; + struct xdp_frame *xdpf; + + if (frm.flags & LIBETH_XDP_TX_FIRST) { + xdpf = frm.xdpf; + desc.addr = *libeth_xdp_xmit_frame_dma(xdpf); + } else { + xdpf = NULL; + desc.addr = frm.dma; + } + desc.opts = frm.opts; + + sqe = &sq->sqes[i]; + dma_unmap_addr_set(sqe, dma, desc.addr); + dma_unmap_len_set(sqe, len, desc.len); + + if (!xdpf) { + sqe->type = LIBETH_SQE_XDP_XMIT_FRAG; + return desc; + } + + sqe->type = LIBETH_SQE_XDP_XMIT; + sqe->xdpf = xdpf; + libeth_xdp_tx_fill_stats(sqe, &desc, + xdp_get_shared_info_from_frame(xdpf)); + + return desc; +} + +/** + * libeth_xdp_xmit_flush_bulk - wrapper to define flush of one XDP xmit bulk + * @bq: bulk to flush + * @flags: Tx flags, see __libeth_xdp_tx_flush_bulk() + * @prep: driver callback to prepare the queue + * @xmit: driver callback to fill a HW descriptor + * + * Use via LIBETH_XDP_DEFINE_FLUSH_XMIT() to define an XDP xmit driver + * callback. + */ +#define libeth_xdp_xmit_flush_bulk(bq, flags, prep, xmit) \ + __libeth_xdp_tx_flush_bulk(bq, (flags) | LIBETH_XDP_TX_NDO, prep, \ + libeth_xdp_xmit_fill_buf, xmit) + +u32 libeth_xdp_xmit_return_bulk(const struct libeth_xdp_tx_frame *bq, + u32 count, const struct net_device *dev); + +/** + * __libeth_xdp_xmit_do_bulk - internal function to implement .ndo_xdp_xmit() + * @bq: XDP Tx bulk to queue frames to + * @frames: XDP frames passed by the stack + * @n: number of frames + * @flags: flags passed by the stack + * @flush_bulk: driver callback to flush an XDP xmit bulk + * @finalize: driver callback to finalize sending XDP Tx frames on the queue + * + * Perform common checks, map the frags and queue them to the bulk, then flush + * the bulk to the XDPSQ. If requested by the stack, finalize the queue. + * + * Return: number of frames send or -errno on error. + */ +static __always_inline int +__libeth_xdp_xmit_do_bulk(struct libeth_xdp_tx_bulk *bq, + struct xdp_frame **frames, u32 n, u32 flags, + bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq, + u32 flags), + void (*finalize)(void *xdpsq, bool sent, bool flush)) +{ + u32 nxmit = 0; + + if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) + return -EINVAL; + + for (u32 i = 0; likely(i < n); i++) { + u32 ret; + + ret = libeth_xdp_xmit_queue_bulk(bq, frames[i], flush_bulk); + if (unlikely(ret != LIBETH_XDP_TX)) { + nxmit += ret == LIBETH_XDP_ABORTED; + break; + } + + nxmit++; + } + + if (bq->count) { + flush_bulk(bq, LIBETH_XDP_TX_NDO); + if (unlikely(bq->count)) + nxmit -= libeth_xdp_xmit_return_bulk(bq->bulk, + bq->count, + bq->dev); + } + + finalize(bq->xdpsq, nxmit, flags & XDP_XMIT_FLUSH); + + return nxmit; +} + +/** + * libeth_xdp_xmit_do_bulk - implement full .ndo_xdp_xmit() in driver + * @dev: target &net_device + * @n: number of frames to send + * @fr: XDP frames to send + * @f: flags passed by the stack + * @xqs: array of XDPSQs driver structs + * @nqs: number of active XDPSQs, the above array length + * @fl: driver callback to flush an XDP xmit bulk + * @fin: driver cabback to finalize the queue + * + * If the driver has active XDPSQs, perform common checks and send the frames. + * Finalize the queue, if requested. + * + * Return: number of frames sent or -errno on error. + */ +#define libeth_xdp_xmit_do_bulk(dev, n, fr, f, xqs, nqs, fl, fin) \ + _libeth_xdp_xmit_do_bulk(dev, n, fr, f, xqs, nqs, fl, fin, \ + __UNIQUE_ID(bq_), __UNIQUE_ID(ret_), \ + __UNIQUE_ID(nqs_)) + +#define _libeth_xdp_xmit_do_bulk(d, n, fr, f, xqs, nqs, fl, fin, ub, ur, un) \ +({ \ + u32 un = (nqs); \ + int ur; \ + \ + if (likely(un)) { \ + LIBETH_XDP_ONSTACK_BULK(ub); \ + \ + libeth_xdp_xmit_init_bulk(&ub, d, xqs, un); \ + ur = __libeth_xdp_xmit_do_bulk(&ub, fr, n, f, fl, fin); \ + } else { \ + ur = -ENXIO; \ + } \ + \ + ur; \ +}) + +/* Rx polling path */ + +/** + * libeth_xdp_tx_init_bulk - initialize an XDP Tx bulk for Rx NAPI poll + * @bq: bulk to initialize + * @prog: RCU pointer to the XDP program (can be %NULL) + * @dev: target &net_device + * @xdpsqs: array of driver XDPSQ structs + * @num: number of active XDPSQs, the above array length + * + * Should be called on an onstack XDP Tx bulk before the NAPI polling loop. + * Initializes all the needed fields to run libeth_xdp functions. If @num == 0, + * assumes XDP is not enabled. + * Do not use for XSk, it has its own optimized helper. + */ +#define libeth_xdp_tx_init_bulk(bq, prog, dev, xdpsqs, num) \ + __libeth_xdp_tx_init_bulk(bq, prog, dev, xdpsqs, num, false, \ + __UNIQUE_ID(bq_), __UNIQUE_ID(nqs_)) + +#define __libeth_xdp_tx_init_bulk(bq, pr, d, xdpsqs, num, xsk, ub, un) do { \ + typeof(bq) ub = (bq); \ + u32 un = (num); \ + \ + rcu_read_lock(); \ + \ + if (un || (xsk)) { \ + ub->prog = rcu_dereference(pr); \ + ub->dev = (d); \ + ub->xdpsq = (xdpsqs)[libeth_xdpsq_id(un)]; \ + } else { \ + ub->prog = NULL; \ + } \ + \ + ub->act_mask = 0; \ + ub->count = 0; \ +} while (0) + +void libeth_xdp_load_stash(struct libeth_xdp_buff *dst, + const struct libeth_xdp_buff_stash *src); +void libeth_xdp_save_stash(struct libeth_xdp_buff_stash *dst, + const struct libeth_xdp_buff *src); +void __libeth_xdp_return_stash(struct libeth_xdp_buff_stash *stash); + +/** + * libeth_xdp_init_buff - initialize a &libeth_xdp_buff for Rx NAPI poll + * @dst: onstack buffer to initialize + * @src: XDP buffer stash placed on the queue + * @rxq: registered &xdp_rxq_info corresponding to this queue + * + * Should be called before the main NAPI polling loop. Loads the content of + * the previously saved stash or initializes the buffer from scratch. + * Do not use for XSk. + */ +static inline void +libeth_xdp_init_buff(struct libeth_xdp_buff *dst, + const struct libeth_xdp_buff_stash *src, + struct xdp_rxq_info *rxq) +{ + if (likely(!src->data)) + dst->data = NULL; + else + libeth_xdp_load_stash(dst, src); + + dst->base.rxq = rxq; +} + +/** + * libeth_xdp_save_buff - save a partially built buffer on a queue + * @dst: XDP buffer stash placed on the queue + * @src: onstack buffer to save + * + * Should be called after the main NAPI polling loop. If the loop exited before + * the buffer was finished, saves its content on the queue, so that it can be + * completed during the next poll. Otherwise, clears the stash. + */ +static inline void libeth_xdp_save_buff(struct libeth_xdp_buff_stash *dst, + const struct libeth_xdp_buff *src) +{ + if (likely(!src->data)) + dst->data = NULL; + else + libeth_xdp_save_stash(dst, src); +} + +/** + * libeth_xdp_return_stash - free an XDP buffer stash from a queue + * @stash: stash to free + * + * If the queue is about to be destroyed, but it still has an incompleted + * buffer stash, this helper should be called to free it. + */ +static inline void libeth_xdp_return_stash(struct libeth_xdp_buff_stash *stash) +{ + if (stash->data) + __libeth_xdp_return_stash(stash); +} + +static inline void libeth_xdp_return_va(const void *data, bool napi) +{ + netmem_ref netmem = virt_to_netmem(data); + + page_pool_put_full_netmem(__netmem_get_pp(netmem), netmem, napi); +} + +static inline void libeth_xdp_return_frags(const struct skb_shared_info *sinfo, + bool napi) +{ + for (u32 i = 0; i < sinfo->nr_frags; i++) { + netmem_ref netmem = skb_frag_netmem(&sinfo->frags[i]); + + page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, napi); + } +} + +/** + * libeth_xdp_return_buff - free/recycle &libeth_xdp_buff + * @xdp: buffer to free + * + * Hotpath helper to free &libeth_xdp_buff. Comparing to xdp_return_buff(), + * it's faster as it gets inlined and always assumes order-0 pages and safe + * direct recycling. Zeroes @xdp->data to avoid UAFs. + */ +#define libeth_xdp_return_buff(xdp) __libeth_xdp_return_buff(xdp, true) + +static inline void __libeth_xdp_return_buff(struct libeth_xdp_buff *xdp, + bool napi) +{ + if (!xdp_buff_has_frags(&xdp->base)) + goto out; + + libeth_xdp_return_frags(xdp_get_shared_info_from_buff(&xdp->base), + napi); + +out: + libeth_xdp_return_va(xdp->data, napi); + xdp->data = NULL; +} + +bool libeth_xdp_buff_add_frag(struct libeth_xdp_buff *xdp, + const struct libeth_fqe *fqe, + u32 len); + +/** + * libeth_xdp_prepare_buff - fill &libeth_xdp_buff with head FQE data + * @xdp: XDP buffer to attach the head to + * @fqe: FQE containing the head buffer + * @len: buffer len passed from HW + * + * Internal, use libeth_xdp_process_buff() instead. Initializes XDP buffer + * head with the Rx buffer data: data pointer, length, headroom, and + * truesize/tailroom. Zeroes the flags. + */ +static inline void libeth_xdp_prepare_buff(struct libeth_xdp_buff *xdp, + const struct libeth_fqe *fqe, + u32 len) +{ + const struct page *page = __netmem_to_page(fqe->netmem); + + xdp_prepare_buff(&xdp->base, page_address(page) + fqe->offset, + pp_page_to_nmdesc(page)->pp->p.offset, len, true); + xdp_init_buff(&xdp->base, fqe->truesize, xdp->base.rxq); +} + +/** + * libeth_xdp_process_buff - attach Rx buffer to &libeth_xdp_buff + * @xdp: XDP buffer to attach the Rx buffer to + * @fqe: Rx buffer to process + * @len: received data length from the descriptor + * + * If the XDP buffer is empty, attaches the Rx buffer as head and initializes + * the required fields. Otherwise, attaches the buffer as a frag. + * Already performs DMA sync-for-CPU and frame start prefetch + * (for head buffers only). + * + * Return: true on success, false if the descriptor must be skipped (empty or + * no space for a new frag). + */ +static inline bool libeth_xdp_process_buff(struct libeth_xdp_buff *xdp, + const struct libeth_fqe *fqe, + u32 len) +{ + if (!libeth_rx_sync_for_cpu(fqe, len)) + return false; + + if (xdp->data) + return libeth_xdp_buff_add_frag(xdp, fqe, len); + + libeth_xdp_prepare_buff(xdp, fqe, len); + + prefetch(xdp->data); + + return true; +} + +/** + * libeth_xdp_buff_stats_frags - update onstack RQ stats with XDP frags info + * @ss: onstack stats to update + * @xdp: buffer to account + * + * Internal helper used by __libeth_xdp_run_pass(), do not call directly. + * Adds buffer's frags count and total len to the onstack stats. + */ +static inline void +libeth_xdp_buff_stats_frags(struct libeth_rq_napi_stats *ss, + const struct libeth_xdp_buff *xdp) +{ + const struct skb_shared_info *sinfo; + + sinfo = xdp_get_shared_info_from_buff(&xdp->base); + ss->bytes += sinfo->xdp_frags_size; + ss->fragments += sinfo->nr_frags + 1; +} + +u32 libeth_xdp_prog_exception(const struct libeth_xdp_tx_bulk *bq, + struct libeth_xdp_buff *xdp, + enum xdp_action act, int ret); + +/** + * __libeth_xdp_run_prog - run XDP program on an XDP buffer + * @xdp: XDP buffer to run the prog on + * @bq: buffer bulk for ``XDP_TX`` queueing + * + * Internal inline abstraction to run XDP program. Handles ``XDP_DROP`` + * and ``XDP_REDIRECT`` only, the rest is processed levels up. + * Reports an XDP prog exception on errors. + * + * Return: libeth_xdp prog verdict depending on the prog's verdict. + */ +static __always_inline u32 +__libeth_xdp_run_prog(struct libeth_xdp_buff *xdp, + const struct libeth_xdp_tx_bulk *bq) +{ + enum xdp_action act; + + act = bpf_prog_run_xdp(bq->prog, &xdp->base); + if (unlikely(act < XDP_DROP || act > XDP_REDIRECT)) + goto out; + + switch (act) { + case XDP_PASS: + return LIBETH_XDP_PASS; + case XDP_DROP: + libeth_xdp_return_buff(xdp); + + return LIBETH_XDP_DROP; + case XDP_TX: + return LIBETH_XDP_TX; + case XDP_REDIRECT: + if (unlikely(xdp_do_redirect(bq->dev, &xdp->base, bq->prog))) + break; + + xdp->data = NULL; + + return LIBETH_XDP_REDIRECT; + default: + break; + } + +out: + return libeth_xdp_prog_exception(bq, xdp, act, 0); +} + +/** + * __libeth_xdp_run_flush - run XDP program and handle ``XDP_TX`` verdict + * @xdp: XDP buffer to run the prog on + * @bq: buffer bulk for ``XDP_TX`` queueing + * @run: internal callback for running XDP program + * @queue: internal callback for queuing ``XDP_TX`` frame + * @flush_bulk: driver callback for flushing a bulk + * + * Internal inline abstraction to run XDP program and additionally handle + * ``XDP_TX`` verdict. Used by both XDP and XSk, hence @run and @queue. + * Do not use directly. + * + * Return: libeth_xdp prog verdict depending on the prog's verdict. + */ +static __always_inline u32 +__libeth_xdp_run_flush(struct libeth_xdp_buff *xdp, + struct libeth_xdp_tx_bulk *bq, + u32 (*run)(struct libeth_xdp_buff *xdp, + const struct libeth_xdp_tx_bulk *bq), + bool (*queue)(struct libeth_xdp_tx_bulk *bq, + struct libeth_xdp_buff *xdp, + bool (*flush_bulk) + (struct libeth_xdp_tx_bulk *bq, + u32 flags)), + bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq, + u32 flags)) +{ + u32 act; + + act = run(xdp, bq); + if (act == LIBETH_XDP_TX && unlikely(!queue(bq, xdp, flush_bulk))) + act = LIBETH_XDP_DROP; + + bq->act_mask |= act; + + return act; +} + +/** + * libeth_xdp_run_prog - run XDP program (non-XSk path) and handle all verdicts + * @xdp: XDP buffer to process + * @bq: XDP Tx bulk to queue ``XDP_TX`` buffers + * @fl: driver ``XDP_TX`` bulk flush callback + * + * Run the attached XDP program and handle all possible verdicts. XSk has its + * own version. + * Prefer using it via LIBETH_XDP_DEFINE_RUN{,_PASS,_PROG}(). + * + * Return: true if the buffer should be passed up the stack, false if the poll + * should go to the next buffer. + */ +#define libeth_xdp_run_prog(xdp, bq, fl) \ + (__libeth_xdp_run_flush(xdp, bq, __libeth_xdp_run_prog, \ + libeth_xdp_tx_queue_bulk, \ + fl) == LIBETH_XDP_PASS) + +/** + * __libeth_xdp_run_pass - helper to run XDP program and handle the result + * @xdp: XDP buffer to process + * @bq: XDP Tx bulk to queue ``XDP_TX`` frames + * @napi: NAPI to build an skb and pass it up the stack + * @rs: onstack libeth RQ stats + * @md: metadata that should be filled to the XDP buffer + * @prep: callback for filling the metadata + * @run: driver wrapper to run XDP program + * @populate: driver callback to populate an skb with the HW descriptor data + * + * Inline abstraction that does the following (non-XSk path): + * 1) adds frame size and frag number (if needed) to the onstack stats; + * 2) fills the descriptor metadata to the onstack &libeth_xdp_buff + * 3) runs XDP program if present; + * 4) handles all possible verdicts; + * 5) on ``XDP_PASS`, builds an skb from the buffer; + * 6) populates it with the descriptor metadata; + * 7) passes it up the stack. + * + * In most cases, number 2 means just writing the pointer to the HW descriptor + * to the XDP buffer. If so, please use LIBETH_XDP_DEFINE_RUN{,_PASS}() + * wrappers to build a driver function. + */ +static __always_inline void +__libeth_xdp_run_pass(struct libeth_xdp_buff *xdp, + struct libeth_xdp_tx_bulk *bq, struct napi_struct *napi, + struct libeth_rq_napi_stats *rs, const void *md, + void (*prep)(struct libeth_xdp_buff *xdp, + const void *md), + bool (*run)(struct libeth_xdp_buff *xdp, + struct libeth_xdp_tx_bulk *bq), + bool (*populate)(struct sk_buff *skb, + const struct libeth_xdp_buff *xdp, + struct libeth_rq_napi_stats *rs)) +{ + struct sk_buff *skb; + + rs->bytes += xdp->base.data_end - xdp->data; + rs->packets++; + + if (xdp_buff_has_frags(&xdp->base)) + libeth_xdp_buff_stats_frags(rs, xdp); + + if (prep && (!__builtin_constant_p(!!md) || md)) + prep(xdp, md); + + if (!bq || !run || !bq->prog) + goto build; + + if (!run(xdp, bq)) + return; + +build: + skb = xdp_build_skb_from_buff(&xdp->base); + if (unlikely(!skb)) { + libeth_xdp_return_buff_slow(xdp); + return; + } + + xdp->data = NULL; + + if (unlikely(!populate(skb, xdp, rs))) { + napi_consume_skb(skb, true); + return; + } + + napi_gro_receive(napi, skb); +} + +static inline void libeth_xdp_prep_desc(struct libeth_xdp_buff *xdp, + const void *desc) +{ + xdp->desc = desc; +} + +/** + * libeth_xdp_run_pass - helper to run XDP program and handle the result + * @xdp: XDP buffer to process + * @bq: XDP Tx bulk to queue ``XDP_TX`` frames + * @napi: NAPI to build an skb and pass it up the stack + * @ss: onstack libeth RQ stats + * @desc: pointer to the HW descriptor for that frame + * @run: driver wrapper to run XDP program + * @populate: driver callback to populate an skb with the HW descriptor data + * + * Wrapper around the underscored version when "fill the descriptor metadata" + * means just writing the pointer to the HW descriptor as @xdp->desc. + */ +#define libeth_xdp_run_pass(xdp, bq, napi, ss, desc, run, populate) \ + __libeth_xdp_run_pass(xdp, bq, napi, ss, desc, libeth_xdp_prep_desc, \ + run, populate) + +/** + * libeth_xdp_finalize_rx - finalize XDPSQ after a NAPI polling loop (non-XSk) + * @bq: ``XDP_TX`` frame bulk + * @flush: driver callback to flush the bulk + * @finalize: driver callback to start sending the frames and run the timer + * + * Flush the bulk if there are frames left to send, kick the queue and flush + * the XDP maps. + */ +#define libeth_xdp_finalize_rx(bq, flush, finalize) \ + __libeth_xdp_finalize_rx(bq, 0, flush, finalize) + +static __always_inline void +__libeth_xdp_finalize_rx(struct libeth_xdp_tx_bulk *bq, u32 flags, + bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq, + u32 flags), + void (*finalize)(void *xdpsq, bool sent, bool flush)) +{ + if (bq->act_mask & LIBETH_XDP_TX) { + if (bq->count) + flush_bulk(bq, flags | LIBETH_XDP_TX_DROP); + finalize(bq->xdpsq, true, true); + } + if (bq->act_mask & LIBETH_XDP_REDIRECT) + xdp_do_flush(); + + rcu_read_unlock(); +} + +/* + * Helpers to reduce boilerplate code in drivers. + * + * Typical driver Rx flow would be (excl. bulk and buff init, frag attach): + * + * LIBETH_XDP_DEFINE_START(); + * LIBETH_XDP_DEFINE_FLUSH_TX(static driver_xdp_flush_tx, driver_xdp_tx_prep, + * driver_xdp_xmit); + * LIBETH_XDP_DEFINE_RUN(static driver_xdp_run, driver_xdp_run_prog, + * driver_xdp_flush_tx, driver_populate_skb); + * LIBETH_XDP_DEFINE_FINALIZE(static driver_xdp_finalize_rx, + * driver_xdp_flush_tx, driver_xdp_finalize_sq); + * LIBETH_XDP_DEFINE_END(); + * + * This will build a set of 4 static functions. The compiler is free to decide + * whether to inline them. + * Then, in the NAPI polling function: + * + * while (packets < budget) { + * // ... + * driver_xdp_run(xdp, &bq, napi, &rs, desc); + * } + * driver_xdp_finalize_rx(&bq); + */ + +#define LIBETH_XDP_DEFINE_START() \ + __diag_push(); \ + __diag_ignore(GCC, 8, "-Wold-style-declaration", \ + "Allow specifying \'static\' after the return type") + +/** + * LIBETH_XDP_DEFINE_TIMER - define a driver XDPSQ cleanup timer callback + * @name: name of the function to define + * @poll: Tx polling/completion function + */ +#define LIBETH_XDP_DEFINE_TIMER(name, poll) \ +void name(struct work_struct *work) \ +{ \ + libeth_xdpsq_run_timer(work, poll); \ +} + +/** + * LIBETH_XDP_DEFINE_FLUSH_TX - define a driver ``XDP_TX`` bulk flush function + * @name: name of the function to define + * @prep: driver callback to clean an XDPSQ + * @xmit: driver callback to write a HW Tx descriptor + */ +#define LIBETH_XDP_DEFINE_FLUSH_TX(name, prep, xmit) \ + __LIBETH_XDP_DEFINE_FLUSH_TX(name, prep, xmit, xdp) + +#define __LIBETH_XDP_DEFINE_FLUSH_TX(name, prep, xmit, pfx) \ +bool name(struct libeth_xdp_tx_bulk *bq, u32 flags) \ +{ \ + return libeth_##pfx##_tx_flush_bulk(bq, flags, prep, xmit); \ +} + +/** + * LIBETH_XDP_DEFINE_FLUSH_XMIT - define a driver XDP xmit bulk flush function + * @name: name of the function to define + * @prep: driver callback to clean an XDPSQ + * @xmit: driver callback to write a HW Tx descriptor + */ +#define LIBETH_XDP_DEFINE_FLUSH_XMIT(name, prep, xmit) \ +bool name(struct libeth_xdp_tx_bulk *bq, u32 flags) \ +{ \ + return libeth_xdp_xmit_flush_bulk(bq, flags, prep, xmit); \ +} + +/** + * LIBETH_XDP_DEFINE_RUN_PROG - define a driver XDP program run function + * @name: name of the function to define + * @flush: driver callback to flush an ``XDP_TX`` bulk + */ +#define LIBETH_XDP_DEFINE_RUN_PROG(name, flush) \ + bool __LIBETH_XDP_DEFINE_RUN_PROG(name, flush, xdp) + +#define __LIBETH_XDP_DEFINE_RUN_PROG(name, flush, pfx) \ +name(struct libeth_xdp_buff *xdp, struct libeth_xdp_tx_bulk *bq) \ +{ \ + return libeth_##pfx##_run_prog(xdp, bq, flush); \ +} + +/** + * LIBETH_XDP_DEFINE_RUN_PASS - define a driver buffer process + pass function + * @name: name of the function to define + * @run: driver callback to run XDP program (above) + * @populate: driver callback to fill an skb with HW descriptor info + */ +#define LIBETH_XDP_DEFINE_RUN_PASS(name, run, populate) \ + void __LIBETH_XDP_DEFINE_RUN_PASS(name, run, populate, xdp) + +#define __LIBETH_XDP_DEFINE_RUN_PASS(name, run, populate, pfx) \ +name(struct libeth_xdp_buff *xdp, struct libeth_xdp_tx_bulk *bq, \ + struct napi_struct *napi, struct libeth_rq_napi_stats *ss, \ + const void *desc) \ +{ \ + return libeth_##pfx##_run_pass(xdp, bq, napi, ss, desc, run, \ + populate); \ +} + +/** + * LIBETH_XDP_DEFINE_RUN - define a driver buffer process, run + pass function + * @name: name of the function to define + * @run: name of the XDP prog run function to define + * @flush: driver callback to flush an ``XDP_TX`` bulk + * @populate: driver callback to fill an skb with HW descriptor info + */ +#define LIBETH_XDP_DEFINE_RUN(name, run, flush, populate) \ + __LIBETH_XDP_DEFINE_RUN(name, run, flush, populate, XDP) + +#define __LIBETH_XDP_DEFINE_RUN(name, run, flush, populate, pfx) \ + LIBETH_##pfx##_DEFINE_RUN_PROG(static run, flush); \ + LIBETH_##pfx##_DEFINE_RUN_PASS(name, run, populate) + +/** + * LIBETH_XDP_DEFINE_FINALIZE - define a driver Rx NAPI poll finalize function + * @name: name of the function to define + * @flush: driver callback to flush an ``XDP_TX`` bulk + * @finalize: driver callback to finalize an XDPSQ and run the timer + */ +#define LIBETH_XDP_DEFINE_FINALIZE(name, flush, finalize) \ + __LIBETH_XDP_DEFINE_FINALIZE(name, flush, finalize, xdp) + +#define __LIBETH_XDP_DEFINE_FINALIZE(name, flush, finalize, pfx) \ +void name(struct libeth_xdp_tx_bulk *bq) \ +{ \ + libeth_##pfx##_finalize_rx(bq, flush, finalize); \ +} + +#define LIBETH_XDP_DEFINE_END() __diag_pop() + +/* XMO */ + +/** + * libeth_xdp_buff_to_rq - get RQ pointer from an XDP buffer pointer + * @xdp: &libeth_xdp_buff corresponding to the queue + * @type: typeof() of the driver Rx queue structure + * @member: name of &xdp_rxq_info inside @type + * + * Often times, pointer to the RQ is needed when reading/filling metadata from + * HW descriptors. The helper can be used to quickly jump from an XDP buffer + * to the queue corresponding to its &xdp_rxq_info without introducing + * additional fields (&libeth_xdp_buff is precisely 1 cacheline long on x64). + */ +#define libeth_xdp_buff_to_rq(xdp, type, member) \ + container_of_const((xdp)->base.rxq, type, member) + +/** + * libeth_xdpmo_rx_hash - convert &libeth_rx_pt to an XDP RSS hash metadata + * @hash: pointer to the variable to write the hash to + * @rss_type: pointer to the variable to write the hash type to + * @val: hash value from the HW descriptor + * @pt: libeth parsed packet type + * + * Handle zeroed/non-available hash and convert libeth parsed packet type to + * the corresponding XDP RSS hash type. To be called at the end of + * xdp_metadata_ops idpf_xdpmo::xmo_rx_hash() implementation. + * Note that if the driver doesn't use a constant packet type lookup table but + * generates it at runtime, it must call libeth_rx_pt_gen_hash_type(pt) to + * generate XDP RSS hash type for each packet type. + * + * Return: 0 on success, -ENODATA when the hash is not available. + */ +static inline int libeth_xdpmo_rx_hash(u32 *hash, + enum xdp_rss_hash_type *rss_type, + u32 val, struct libeth_rx_pt pt) +{ + if (unlikely(!val)) + return -ENODATA; + + *hash = val; + *rss_type = pt.hash_type; + + return 0; +} + +/* Tx buffer completion */ + +void libeth_xdp_return_buff_bulk(const struct skb_shared_info *sinfo, + struct xdp_frame_bulk *bq, bool frags); +void libeth_xsk_buff_free_slow(struct libeth_xdp_buff *xdp); + +/** + * __libeth_xdp_complete_tx - complete sent XDPSQE + * @sqe: SQ element / Tx buffer to complete + * @cp: Tx polling/completion params + * @bulk: internal callback to bulk-free ``XDP_TX`` buffers + * @xsk: internal callback to free XSk ``XDP_TX`` buffers + * + * Use the non-underscored version in drivers instead. This one is shared + * internally with libeth_tx_complete_any(). + * Complete an XDPSQE of any type of XDP frame. This includes DMA unmapping + * when needed, buffer freeing, stats update, and SQE invalidation. + */ +static __always_inline void +__libeth_xdp_complete_tx(struct libeth_sqe *sqe, struct libeth_cq_pp *cp, + typeof(libeth_xdp_return_buff_bulk) bulk, + typeof(libeth_xsk_buff_free_slow) xsk) +{ + enum libeth_sqe_type type = sqe->type; + + switch (type) { + case LIBETH_SQE_EMPTY: + return; + case LIBETH_SQE_XDP_XMIT: + case LIBETH_SQE_XDP_XMIT_FRAG: + dma_unmap_page(cp->dev, dma_unmap_addr(sqe, dma), + dma_unmap_len(sqe, len), DMA_TO_DEVICE); + break; + default: + break; + } + + switch (type) { + case LIBETH_SQE_XDP_TX: + bulk(sqe->sinfo, cp->bq, sqe->nr_frags != 1); + break; + case LIBETH_SQE_XDP_XMIT: + xdp_return_frame_bulk(sqe->xdpf, cp->bq); + break; + case LIBETH_SQE_XSK_TX: + case LIBETH_SQE_XSK_TX_FRAG: + xsk(sqe->xsk); + break; + default: + break; + } + + switch (type) { + case LIBETH_SQE_XDP_TX: + case LIBETH_SQE_XDP_XMIT: + case LIBETH_SQE_XSK_TX: + cp->xdp_tx -= sqe->nr_frags; + + cp->xss->packets++; + cp->xss->bytes += sqe->bytes; + break; + default: + break; + } + + sqe->type = LIBETH_SQE_EMPTY; +} + +static inline void libeth_xdp_complete_tx(struct libeth_sqe *sqe, + struct libeth_cq_pp *cp) +{ + __libeth_xdp_complete_tx(sqe, cp, libeth_xdp_return_buff_bulk, + libeth_xsk_buff_free_slow); +} + +/* Misc */ + +u32 libeth_xdp_queue_threshold(u32 count); + +void __libeth_xdp_set_features(struct net_device *dev, + const struct xdp_metadata_ops *xmo, + u32 zc_segs, + const struct xsk_tx_metadata_ops *tmo); +void libeth_xdp_set_redirect(struct net_device *dev, bool enable); + +/** + * libeth_xdp_set_features - set XDP features for netdev + * @dev: &net_device to configure + * @...: optional params, see __libeth_xdp_set_features() + * + * Set all the features libeth_xdp supports, including .ndo_xdp_xmit(). That + * said, it should be used only when XDPSQs are always available regardless + * of whether an XDP prog is attached to @dev. + */ +#define libeth_xdp_set_features(dev, ...) \ + CONCATENATE(__libeth_xdp_feat, \ + COUNT_ARGS(__VA_ARGS__))(dev, ##__VA_ARGS__) + +#define __libeth_xdp_feat0(dev) \ + __libeth_xdp_set_features(dev, NULL, 0, NULL) +#define __libeth_xdp_feat1(dev, xmo) \ + __libeth_xdp_set_features(dev, xmo, 0, NULL) +#define __libeth_xdp_feat2(dev, xmo, zc_segs) \ + __libeth_xdp_set_features(dev, xmo, zc_segs, NULL) +#define __libeth_xdp_feat3(dev, xmo, zc_segs, tmo) \ + __libeth_xdp_set_features(dev, xmo, zc_segs, tmo) + +/** + * libeth_xdp_set_features_noredir - enable all libeth_xdp features w/o redir + * @dev: target &net_device + * @...: optional params, see __libeth_xdp_set_features() + * + * Enable everything except the .ndo_xdp_xmit() feature, use when XDPSQs are + * not available right after netdev registration. + */ +#define libeth_xdp_set_features_noredir(dev, ...) \ + __libeth_xdp_set_features_noredir(dev, __UNIQUE_ID(dev_), \ + ##__VA_ARGS__) + +#define __libeth_xdp_set_features_noredir(dev, ud, ...) do { \ + struct net_device *ud = (dev); \ + \ + libeth_xdp_set_features(ud, ##__VA_ARGS__); \ + libeth_xdp_set_redirect(ud, false); \ +} while (0) + +#define libeth_xsktmo ((const void *)GOLDEN_RATIO_PRIME) + +#endif /* __LIBETH_XDP_H */ diff --git a/include/net/libeth/xsk.h b/include/net/libeth/xsk.h new file mode 100644 index 000000000000..481a7b28e6f2 --- /dev/null +++ b/include/net/libeth/xsk.h @@ -0,0 +1,685 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (C) 2025 Intel Corporation */ + +#ifndef __LIBETH_XSK_H +#define __LIBETH_XSK_H + +#include <net/libeth/xdp.h> +#include <net/xdp_sock_drv.h> + +/* ``XDP_TXMD_FLAGS_VALID`` is defined only under ``CONFIG_XDP_SOCKETS`` */ +#ifdef XDP_TXMD_FLAGS_VALID +static_assert(XDP_TXMD_FLAGS_VALID <= LIBETH_XDP_TX_XSKMD); +#endif + +/* ``XDP_TX`` bulking */ + +/** + * libeth_xsk_tx_queue_head - internal helper for queueing XSk ``XDP_TX`` head + * @bq: XDP Tx bulk to queue the head frag to + * @xdp: XSk buffer with the head to queue + * + * Return: false if it's the only frag of the frame, true if it's an S/G frame. + */ +static inline bool libeth_xsk_tx_queue_head(struct libeth_xdp_tx_bulk *bq, + struct libeth_xdp_buff *xdp) +{ + bq->bulk[bq->count++] = (typeof(*bq->bulk)){ + .xsk = xdp, + __libeth_xdp_tx_len(xdp->base.data_end - xdp->data, + LIBETH_XDP_TX_FIRST), + }; + + if (likely(!xdp_buff_has_frags(&xdp->base))) + return false; + + bq->bulk[bq->count - 1].flags |= LIBETH_XDP_TX_MULTI; + + return true; +} + +/** + * libeth_xsk_tx_queue_frag - internal helper for queueing XSk ``XDP_TX`` frag + * @bq: XDP Tx bulk to queue the frag to + * @frag: XSk frag to queue + */ +static inline void libeth_xsk_tx_queue_frag(struct libeth_xdp_tx_bulk *bq, + struct libeth_xdp_buff *frag) +{ + bq->bulk[bq->count++] = (typeof(*bq->bulk)){ + .xsk = frag, + __libeth_xdp_tx_len(frag->base.data_end - frag->data), + }; +} + +/** + * libeth_xsk_tx_queue_bulk - internal helper for queueing XSk ``XDP_TX`` frame + * @bq: XDP Tx bulk to queue the frame to + * @xdp: XSk buffer to queue + * @flush_bulk: driver callback to flush the bulk to the HW queue + * + * Return: true on success, false on flush error. + */ +static __always_inline bool +libeth_xsk_tx_queue_bulk(struct libeth_xdp_tx_bulk *bq, + struct libeth_xdp_buff *xdp, + bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq, + u32 flags)) +{ + bool ret = true; + + if (unlikely(bq->count == LIBETH_XDP_TX_BULK) && + unlikely(!flush_bulk(bq, LIBETH_XDP_TX_XSK))) { + libeth_xsk_buff_free_slow(xdp); + return false; + } + + if (!libeth_xsk_tx_queue_head(bq, xdp)) + goto out; + + for (const struct libeth_xdp_buff *head = xdp; ; ) { + xdp = container_of(xsk_buff_get_frag(&head->base), + typeof(*xdp), base); + if (!xdp) + break; + + if (unlikely(bq->count == LIBETH_XDP_TX_BULK) && + unlikely(!flush_bulk(bq, LIBETH_XDP_TX_XSK))) { + ret = false; + break; + } + + libeth_xsk_tx_queue_frag(bq, xdp); + } + +out: + bq->bulk[bq->count - 1].flags |= LIBETH_XDP_TX_LAST; + + return ret; +} + +/** + * libeth_xsk_tx_fill_buf - internal helper to fill XSk ``XDP_TX`` &libeth_sqe + * @frm: XDP Tx frame from the bulk + * @i: index on the HW queue + * @sq: XDPSQ abstraction for the queue + * @priv: private data + * + * Return: XDP Tx descriptor with the synced DMA and other info to pass to + * the driver callback. + */ +static inline struct libeth_xdp_tx_desc +libeth_xsk_tx_fill_buf(struct libeth_xdp_tx_frame frm, u32 i, + const struct libeth_xdpsq *sq, u64 priv) +{ + struct libeth_xdp_buff *xdp = frm.xsk; + struct libeth_xdp_tx_desc desc = { + .addr = xsk_buff_xdp_get_dma(&xdp->base), + .opts = frm.opts, + }; + struct libeth_sqe *sqe; + + xsk_buff_raw_dma_sync_for_device(sq->pool, desc.addr, desc.len); + + sqe = &sq->sqes[i]; + sqe->xsk = xdp; + + if (!(desc.flags & LIBETH_XDP_TX_FIRST)) { + sqe->type = LIBETH_SQE_XSK_TX_FRAG; + return desc; + } + + sqe->type = LIBETH_SQE_XSK_TX; + libeth_xdp_tx_fill_stats(sqe, &desc, + xdp_get_shared_info_from_buff(&xdp->base)); + + return desc; +} + +/** + * libeth_xsk_tx_flush_bulk - wrapper to define flush of XSk ``XDP_TX`` bulk + * @bq: bulk to flush + * @flags: Tx flags, see __libeth_xdp_tx_flush_bulk() + * @prep: driver callback to prepare the queue + * @xmit: driver callback to fill a HW descriptor + * + * Use via LIBETH_XSK_DEFINE_FLUSH_TX() to define an XSk ``XDP_TX`` driver + * callback. + */ +#define libeth_xsk_tx_flush_bulk(bq, flags, prep, xmit) \ + __libeth_xdp_tx_flush_bulk(bq, (flags) | LIBETH_XDP_TX_XSK, prep, \ + libeth_xsk_tx_fill_buf, xmit) + +/* XSk TMO */ + +/** + * libeth_xsktmo_req_csum - XSk Tx metadata op to request checksum offload + * @csum_start: unused + * @csum_offset: unused + * @priv: &libeth_xdp_tx_desc from the filling helper + * + * Generic implementation of ::tmo_request_checksum. Works only when HW doesn't + * require filling checksum offsets and other parameters beside the checksum + * request bit. + * Consider using within @libeth_xsktmo unless the driver requires HW-specific + * callbacks. + */ +static inline void libeth_xsktmo_req_csum(u16 csum_start, u16 csum_offset, + void *priv) +{ + ((struct libeth_xdp_tx_desc *)priv)->flags |= LIBETH_XDP_TX_CSUM; +} + +/* Only to inline the callbacks below, use @libeth_xsktmo in drivers instead */ +static const struct xsk_tx_metadata_ops __libeth_xsktmo = { + .tmo_request_checksum = libeth_xsktmo_req_csum, +}; + +/** + * __libeth_xsk_xmit_fill_buf_md - internal helper to prepare XSk xmit w/meta + * @xdesc: &xdp_desc from the XSk buffer pool + * @sq: XDPSQ abstraction for the queue + * @priv: XSk Tx metadata ops + * + * Same as __libeth_xsk_xmit_fill_buf(), but requests metadata pointer and + * fills additional fields in &libeth_xdp_tx_desc to ask for metadata offload. + * + * Return: XDP Tx descriptor with the DMA, metadata request bits, and other + * info to pass to the driver callback. + */ +static __always_inline struct libeth_xdp_tx_desc +__libeth_xsk_xmit_fill_buf_md(const struct xdp_desc *xdesc, + const struct libeth_xdpsq *sq, + u64 priv) +{ + const struct xsk_tx_metadata_ops *tmo = libeth_xdp_priv_to_ptr(priv); + struct libeth_xdp_tx_desc desc; + struct xdp_desc_ctx ctx; + + ctx = xsk_buff_raw_get_ctx(sq->pool, xdesc->addr); + desc = (typeof(desc)){ + .addr = ctx.dma, + __libeth_xdp_tx_len(xdesc->len), + }; + + BUILD_BUG_ON(!__builtin_constant_p(tmo == libeth_xsktmo)); + tmo = tmo == libeth_xsktmo ? &__libeth_xsktmo : tmo; + + xsk_tx_metadata_request(ctx.meta, tmo, &desc); + + return desc; +} + +/* XSk xmit implementation */ + +/** + * __libeth_xsk_xmit_fill_buf - internal helper to prepare XSk xmit w/o meta + * @xdesc: &xdp_desc from the XSk buffer pool + * @sq: XDPSQ abstraction for the queue + * + * Return: XDP Tx descriptor with the DMA and other info to pass to + * the driver callback. + */ +static inline struct libeth_xdp_tx_desc +__libeth_xsk_xmit_fill_buf(const struct xdp_desc *xdesc, + const struct libeth_xdpsq *sq) +{ + return (struct libeth_xdp_tx_desc){ + .addr = xsk_buff_raw_get_dma(sq->pool, xdesc->addr), + __libeth_xdp_tx_len(xdesc->len), + }; +} + +/** + * libeth_xsk_xmit_fill_buf - internal helper to prepare an XSk xmit + * @frm: &xdp_desc from the XSk buffer pool + * @i: index on the HW queue + * @sq: XDPSQ abstraction for the queue + * @priv: XSk Tx metadata ops + * + * Depending on the metadata ops presence (determined at compile time), calls + * the quickest helper to build a libeth XDP Tx descriptor. + * + * Return: XDP Tx descriptor with the synced DMA, metadata request bits, + * and other info to pass to the driver callback. + */ +static __always_inline struct libeth_xdp_tx_desc +libeth_xsk_xmit_fill_buf(struct libeth_xdp_tx_frame frm, u32 i, + const struct libeth_xdpsq *sq, u64 priv) +{ + struct libeth_xdp_tx_desc desc; + + if (priv) + desc = __libeth_xsk_xmit_fill_buf_md(&frm.desc, sq, priv); + else + desc = __libeth_xsk_xmit_fill_buf(&frm.desc, sq); + + desc.flags |= xsk_is_eop_desc(&frm.desc) ? LIBETH_XDP_TX_LAST : 0; + + xsk_buff_raw_dma_sync_for_device(sq->pool, desc.addr, desc.len); + + return desc; +} + +/** + * libeth_xsk_xmit_do_bulk - send XSk xmit frames + * @pool: XSk buffer pool containing the frames to send + * @xdpsq: opaque pointer to driver's XDPSQ struct + * @budget: maximum number of frames can be sent + * @tmo: optional XSk Tx metadata ops + * @prep: driver callback to build a &libeth_xdpsq + * @xmit: driver callback to put frames to a HW queue + * @finalize: driver callback to start a transmission + * + * Implements generic XSk xmit. Always turns on XSk Tx wakeup as it's assumed + * lazy cleaning is used and interrupts are disabled for the queue. + * HW descriptor filling is unrolled by ``LIBETH_XDP_TX_BATCH`` to optimize + * writes. + * Note that unlike other XDP Tx ops, the queue must be locked and cleaned + * prior to calling this function to already know available @budget. + * @prepare must only build a &libeth_xdpsq and return ``U32_MAX``. + * + * Return: false if @budget was exhausted, true otherwise. + */ +static __always_inline bool +libeth_xsk_xmit_do_bulk(struct xsk_buff_pool *pool, void *xdpsq, u32 budget, + const struct xsk_tx_metadata_ops *tmo, + u32 (*prep)(void *xdpsq, struct libeth_xdpsq *sq), + void (*xmit)(struct libeth_xdp_tx_desc desc, u32 i, + const struct libeth_xdpsq *sq, u64 priv), + void (*finalize)(void *xdpsq, bool sent, bool flush)) +{ + const struct libeth_xdp_tx_frame *bulk; + bool wake; + u32 n; + + wake = xsk_uses_need_wakeup(pool); + if (wake) + xsk_clear_tx_need_wakeup(pool); + + n = xsk_tx_peek_release_desc_batch(pool, budget); + bulk = container_of(&pool->tx_descs[0], typeof(*bulk), desc); + + libeth_xdp_tx_xmit_bulk(bulk, xdpsq, n, true, + libeth_xdp_ptr_to_priv(tmo), prep, + libeth_xsk_xmit_fill_buf, xmit); + finalize(xdpsq, n, true); + + if (wake) + xsk_set_tx_need_wakeup(pool); + + return n < budget; +} + +/* Rx polling path */ + +/** + * libeth_xsk_tx_init_bulk - initialize XDP Tx bulk for an XSk Rx NAPI poll + * @bq: bulk to initialize + * @prog: RCU pointer to the XDP program (never %NULL) + * @dev: target &net_device + * @xdpsqs: array of driver XDPSQ structs + * @num: number of active XDPSQs, the above array length + * + * Should be called on an onstack XDP Tx bulk before the XSk NAPI polling loop. + * Initializes all the needed fields to run libeth_xdp functions. + * Never checks if @prog is %NULL or @num == 0 as XDP must always be enabled + * when hitting this path. + */ +#define libeth_xsk_tx_init_bulk(bq, prog, dev, xdpsqs, num) \ + __libeth_xdp_tx_init_bulk(bq, prog, dev, xdpsqs, num, true, \ + __UNIQUE_ID(bq_), __UNIQUE_ID(nqs_)) + +struct libeth_xdp_buff *libeth_xsk_buff_add_frag(struct libeth_xdp_buff *head, + struct libeth_xdp_buff *xdp); + +/** + * libeth_xsk_process_buff - attach XSk Rx buffer to &libeth_xdp_buff + * @head: head XSk buffer to attach the XSk buffer to (or %NULL) + * @xdp: XSk buffer to process + * @len: received data length from the descriptor + * + * If @head == %NULL, treats the XSk buffer as head and initializes + * the required fields. Otherwise, attaches the buffer as a frag. + * Already performs DMA sync-for-CPU and frame start prefetch + * (for head buffers only). + * + * Return: head XSk buffer on success or if the descriptor must be skipped + * (empty), %NULL if there is no space for a new frag. + */ +static inline struct libeth_xdp_buff * +libeth_xsk_process_buff(struct libeth_xdp_buff *head, + struct libeth_xdp_buff *xdp, u32 len) +{ + if (unlikely(!len)) { + libeth_xsk_buff_free_slow(xdp); + return head; + } + + xsk_buff_set_size(&xdp->base, len); + xsk_buff_dma_sync_for_cpu(&xdp->base); + + if (head) + return libeth_xsk_buff_add_frag(head, xdp); + + prefetch(xdp->data); + + return xdp; +} + +void libeth_xsk_buff_stats_frags(struct libeth_rq_napi_stats *rs, + const struct libeth_xdp_buff *xdp); + +u32 __libeth_xsk_run_prog_slow(struct libeth_xdp_buff *xdp, + const struct libeth_xdp_tx_bulk *bq, + enum xdp_action act, int ret); + +/** + * __libeth_xsk_run_prog - run XDP program on XSk buffer + * @xdp: XSk buffer to run the prog on + * @bq: buffer bulk for ``XDP_TX`` queueing + * + * Internal inline abstraction to run XDP program on XSk Rx path. Handles + * only the most common ``XDP_REDIRECT`` inline, the rest is processed + * externally. + * Reports an XDP prog exception on errors. + * + * Return: libeth_xdp prog verdict depending on the prog's verdict. + */ +static __always_inline u32 +__libeth_xsk_run_prog(struct libeth_xdp_buff *xdp, + const struct libeth_xdp_tx_bulk *bq) +{ + enum xdp_action act; + int ret = 0; + + act = bpf_prog_run_xdp(bq->prog, &xdp->base); + if (unlikely(act != XDP_REDIRECT)) +rest: + return __libeth_xsk_run_prog_slow(xdp, bq, act, ret); + + ret = xdp_do_redirect(bq->dev, &xdp->base, bq->prog); + if (unlikely(ret)) + goto rest; + + return LIBETH_XDP_REDIRECT; +} + +/** + * libeth_xsk_run_prog - run XDP program on XSk path and handle all verdicts + * @xdp: XSk buffer to process + * @bq: XDP Tx bulk to queue ``XDP_TX`` buffers + * @fl: driver ``XDP_TX`` bulk flush callback + * + * Run the attached XDP program and handle all possible verdicts. + * Prefer using it via LIBETH_XSK_DEFINE_RUN{,_PASS,_PROG}(). + * + * Return: libeth_xdp prog verdict depending on the prog's verdict. + */ +#define libeth_xsk_run_prog(xdp, bq, fl) \ + __libeth_xdp_run_flush(xdp, bq, __libeth_xsk_run_prog, \ + libeth_xsk_tx_queue_bulk, fl) + +/** + * __libeth_xsk_run_pass - helper to run XDP program and handle the result + * @xdp: XSk buffer to process + * @bq: XDP Tx bulk to queue ``XDP_TX`` frames + * @napi: NAPI to build an skb and pass it up the stack + * @rs: onstack libeth RQ stats + * @md: metadata that should be filled to the XSk buffer + * @prep: callback for filling the metadata + * @run: driver wrapper to run XDP program + * @populate: driver callback to populate an skb with the HW descriptor data + * + * Inline abstraction, XSk's counterpart of __libeth_xdp_run_pass(), see its + * doc for details. + * + * Return: false if the polling loop must be exited due to lack of free + * buffers, true otherwise. + */ +static __always_inline bool +__libeth_xsk_run_pass(struct libeth_xdp_buff *xdp, + struct libeth_xdp_tx_bulk *bq, struct napi_struct *napi, + struct libeth_rq_napi_stats *rs, const void *md, + void (*prep)(struct libeth_xdp_buff *xdp, + const void *md), + u32 (*run)(struct libeth_xdp_buff *xdp, + struct libeth_xdp_tx_bulk *bq), + bool (*populate)(struct sk_buff *skb, + const struct libeth_xdp_buff *xdp, + struct libeth_rq_napi_stats *rs)) +{ + struct sk_buff *skb; + u32 act; + + rs->bytes += xdp->base.data_end - xdp->data; + rs->packets++; + + if (unlikely(xdp_buff_has_frags(&xdp->base))) + libeth_xsk_buff_stats_frags(rs, xdp); + + if (prep && (!__builtin_constant_p(!!md) || md)) + prep(xdp, md); + + act = run(xdp, bq); + if (likely(act == LIBETH_XDP_REDIRECT)) + return true; + + if (act != LIBETH_XDP_PASS) + return act != LIBETH_XDP_ABORTED; + + skb = xdp_build_skb_from_zc(&xdp->base); + if (unlikely(!skb)) { + libeth_xsk_buff_free_slow(xdp); + return true; + } + + if (unlikely(!populate(skb, xdp, rs))) { + napi_consume_skb(skb, true); + return true; + } + + napi_gro_receive(napi, skb); + + return true; +} + +/** + * libeth_xsk_run_pass - helper to run XDP program and handle the result + * @xdp: XSk buffer to process + * @bq: XDP Tx bulk to queue ``XDP_TX`` frames + * @napi: NAPI to build an skb and pass it up the stack + * @rs: onstack libeth RQ stats + * @desc: pointer to the HW descriptor for that frame + * @run: driver wrapper to run XDP program + * @populate: driver callback to populate an skb with the HW descriptor data + * + * Wrapper around the underscored version when "fill the descriptor metadata" + * means just writing the pointer to the HW descriptor as @xdp->desc. + */ +#define libeth_xsk_run_pass(xdp, bq, napi, rs, desc, run, populate) \ + __libeth_xsk_run_pass(xdp, bq, napi, rs, desc, libeth_xdp_prep_desc, \ + run, populate) + +/** + * libeth_xsk_finalize_rx - finalize XDPSQ after an XSk NAPI polling loop + * @bq: ``XDP_TX`` frame bulk + * @flush: driver callback to flush the bulk + * @finalize: driver callback to start sending the frames and run the timer + * + * Flush the bulk if there are frames left to send, kick the queue and flush + * the XDP maps. + */ +#define libeth_xsk_finalize_rx(bq, flush, finalize) \ + __libeth_xdp_finalize_rx(bq, LIBETH_XDP_TX_XSK, flush, finalize) + +/* + * Helpers to reduce boilerplate code in drivers. + * + * Typical driver XSk Rx flow would be (excl. bulk and buff init, frag attach): + * + * LIBETH_XDP_DEFINE_START(); + * LIBETH_XSK_DEFINE_FLUSH_TX(static driver_xsk_flush_tx, driver_xsk_tx_prep, + * driver_xdp_xmit); + * LIBETH_XSK_DEFINE_RUN(static driver_xsk_run, driver_xsk_run_prog, + * driver_xsk_flush_tx, driver_populate_skb); + * LIBETH_XSK_DEFINE_FINALIZE(static driver_xsk_finalize_rx, + * driver_xsk_flush_tx, driver_xdp_finalize_sq); + * LIBETH_XDP_DEFINE_END(); + * + * This will build a set of 4 static functions. The compiler is free to decide + * whether to inline them. + * Then, in the NAPI polling function: + * + * while (packets < budget) { + * // ... + * if (!driver_xsk_run(xdp, &bq, napi, &rs, desc)) + * break; + * } + * driver_xsk_finalize_rx(&bq); + */ + +/** + * LIBETH_XSK_DEFINE_FLUSH_TX - define a driver XSk ``XDP_TX`` flush function + * @name: name of the function to define + * @prep: driver callback to clean an XDPSQ + * @xmit: driver callback to write a HW Tx descriptor + */ +#define LIBETH_XSK_DEFINE_FLUSH_TX(name, prep, xmit) \ + __LIBETH_XDP_DEFINE_FLUSH_TX(name, prep, xmit, xsk) + +/** + * LIBETH_XSK_DEFINE_RUN_PROG - define a driver XDP program run function + * @name: name of the function to define + * @flush: driver callback to flush an XSk ``XDP_TX`` bulk + */ +#define LIBETH_XSK_DEFINE_RUN_PROG(name, flush) \ + u32 __LIBETH_XDP_DEFINE_RUN_PROG(name, flush, xsk) + +/** + * LIBETH_XSK_DEFINE_RUN_PASS - define a driver buffer process + pass function + * @name: name of the function to define + * @run: driver callback to run XDP program (above) + * @populate: driver callback to fill an skb with HW descriptor info + */ +#define LIBETH_XSK_DEFINE_RUN_PASS(name, run, populate) \ + bool __LIBETH_XDP_DEFINE_RUN_PASS(name, run, populate, xsk) + +/** + * LIBETH_XSK_DEFINE_RUN - define a driver buffer process, run + pass function + * @name: name of the function to define + * @run: name of the XDP prog run function to define + * @flush: driver callback to flush an XSk ``XDP_TX`` bulk + * @populate: driver callback to fill an skb with HW descriptor info + */ +#define LIBETH_XSK_DEFINE_RUN(name, run, flush, populate) \ + __LIBETH_XDP_DEFINE_RUN(name, run, flush, populate, XSK) + +/** + * LIBETH_XSK_DEFINE_FINALIZE - define a driver XSk NAPI poll finalize function + * @name: name of the function to define + * @flush: driver callback to flush an XSk ``XDP_TX`` bulk + * @finalize: driver callback to finalize an XDPSQ and run the timer + */ +#define LIBETH_XSK_DEFINE_FINALIZE(name, flush, finalize) \ + __LIBETH_XDP_DEFINE_FINALIZE(name, flush, finalize, xsk) + +/* Refilling */ + +/** + * struct libeth_xskfq - structure representing an XSk buffer (fill) queue + * @fp: hotpath part of the structure + * @pool: &xsk_buff_pool for buffer management + * @fqes: array of XSk buffer pointers + * @descs: opaque pointer to the HW descriptor array + * @ntu: index of the next buffer to poll + * @count: number of descriptors/buffers the queue has + * @pending: current number of XSkFQEs to refill + * @thresh: threshold below which the queue is refilled + * @buf_len: HW-writeable length per each buffer + * @nid: ID of the closest NUMA node with memory + */ +struct libeth_xskfq { + struct_group_tagged(libeth_xskfq_fp, fp, + struct xsk_buff_pool *pool; + struct libeth_xdp_buff **fqes; + void *descs; + + u32 ntu; + u32 count; + ); + + /* Cold fields */ + u32 pending; + u32 thresh; + + u32 buf_len; + int nid; +}; + +int libeth_xskfq_create(struct libeth_xskfq *fq); +void libeth_xskfq_destroy(struct libeth_xskfq *fq); + +/** + * libeth_xsk_buff_xdp_get_dma - get DMA address of XSk &libeth_xdp_buff + * @xdp: buffer to get the DMA addr for + */ +#define libeth_xsk_buff_xdp_get_dma(xdp) \ + xsk_buff_xdp_get_dma(&(xdp)->base) + +/** + * libeth_xskfqe_alloc - allocate @n XSk Rx buffers + * @fq: hotpath part of the XSkFQ, usually onstack + * @n: number of buffers to allocate + * @fill: driver callback to write DMA addresses to HW descriptors + * + * Note that @fq->ntu gets updated, but ::pending must be recalculated + * by the caller. + * + * Return: number of buffers refilled. + */ +static __always_inline u32 +libeth_xskfqe_alloc(struct libeth_xskfq_fp *fq, u32 n, + void (*fill)(const struct libeth_xskfq_fp *fq, u32 i)) +{ + u32 this, ret, done = 0; + struct xdp_buff **xskb; + + this = fq->count - fq->ntu; + if (likely(this > n)) + this = n; + +again: + xskb = (typeof(xskb))&fq->fqes[fq->ntu]; + ret = xsk_buff_alloc_batch(fq->pool, xskb, this); + + for (u32 i = 0, ntu = fq->ntu; likely(i < ret); i++) + fill(fq, ntu + i); + + done += ret; + fq->ntu += ret; + + if (likely(fq->ntu < fq->count) || unlikely(ret < this)) + goto out; + + fq->ntu = 0; + + if (this < n) { + this = n - this; + goto again; + } + +out: + return done; +} + +/* .ndo_xsk_wakeup */ + +void libeth_xsk_init_wakeup(call_single_data_t *csd, struct napi_struct *napi); +void libeth_xsk_wakeup(call_single_data_t *csd, u32 qid); + +/* Pool setup */ + +int libeth_xsk_setup_pool(struct net_device *dev, u32 qid, bool enable); + +#endif /* __LIBETH_XSK_H */ diff --git a/include/net/llc_c_st.h b/include/net/llc_c_st.h index 53823d61d8b6..a4bea0f33188 100644 --- a/include/net/llc_c_st.h +++ b/include/net/llc_c_st.h @@ -44,8 +44,8 @@ struct llc_conn_state_trans { }; struct llc_conn_state { - u8 current_state; - struct llc_conn_state_trans **transitions; + u8 current_state; + const struct llc_conn_state_trans **transitions; }; extern struct llc_conn_state llc_conn_state_table[]; diff --git a/include/net/llc_pdu.h b/include/net/llc_pdu.h index 7e73f8e5e497..86681f29bda7 100644 --- a/include/net/llc_pdu.h +++ b/include/net/llc_pdu.h @@ -254,7 +254,7 @@ static inline void llc_pdu_header_init(struct sk_buff *skb, u8 type, } /** - * llc_pdu_decode_sa - extracs source address (MAC) of input frame + * llc_pdu_decode_sa - extracts, source address (MAC) of input frame * @skb: input skb that source address must be extracted from it. * @sa: pointer to source address (6 byte array). * @@ -262,8 +262,7 @@ static inline void llc_pdu_header_init(struct sk_buff *skb, u8 type, */ static inline void llc_pdu_decode_sa(struct sk_buff *skb, u8 *sa) { - if (skb->protocol == htons(ETH_P_802_2)) - memcpy(sa, eth_hdr(skb)->h_source, ETH_ALEN); + memcpy(sa, eth_hdr(skb)->h_source, ETH_ALEN); } /** @@ -275,8 +274,7 @@ static inline void llc_pdu_decode_sa(struct sk_buff *skb, u8 *sa) */ static inline void llc_pdu_decode_da(struct sk_buff *skb, u8 *da) { - if (skb->protocol == htons(ETH_P_802_2)) - memcpy(da, eth_hdr(skb)->h_dest, ETH_ALEN); + memcpy(da, eth_hdr(skb)->h_dest, ETH_ALEN); } /** diff --git a/include/net/llc_s_st.h b/include/net/llc_s_st.h index ed5b2fa40d32..fca49d483d20 100644 --- a/include/net/llc_s_st.h +++ b/include/net/llc_s_st.h @@ -29,8 +29,8 @@ struct llc_sap_state_trans { }; struct llc_sap_state { - u8 curr_state; - struct llc_sap_state_trans **transitions; + u8 curr_state; + const struct llc_sap_state_trans **transitions; }; /* only access to SAP state table */ diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index 53bd2d02a4f0..26232f603e33 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -138,12 +138,12 @@ int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, static inline void lwtunnel_set_redirect(struct dst_entry *dst) { if (lwtunnel_output_redirect(dst->lwtstate)) { - dst->lwtstate->orig_output = dst->output; - dst->output = lwtunnel_output; + dst->lwtstate->orig_output = READ_ONCE(dst->output); + WRITE_ONCE(dst->output, lwtunnel_output); } if (lwtunnel_input_redirect(dst->lwtstate)) { - dst->lwtstate->orig_input = dst->input; - dst->input = lwtunnel_input; + dst->lwtstate->orig_input = READ_ONCE(dst->input); + WRITE_ONCE(dst->input, lwtunnel_input); } } #else @@ -206,6 +206,7 @@ static inline int lwtunnel_valid_encap_type(u16 encap_type, NL_SET_ERR_MSG(extack, "CONFIG_LWTUNNEL is not enabled in this kernel"); return -EOPNOTSUPP; } + static inline int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len, struct netlink_ext_ack *extack) { diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 7c707358d15c..c2e49542626c 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -7,7 +7,7 @@ * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2023 Intel Corporation + * Copyright (C) 2018 - 2025 Intel Corporation */ #ifndef MAC80211_H @@ -22,7 +22,7 @@ #include <net/cfg80211.h> #include <net/codel.h> #include <net/ieee80211_radiotap.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> /** * DOC: Introduction @@ -79,7 +79,7 @@ * helpers for sanity checking. Drivers must ensure all work added onto the * mac80211 workqueue should be cancelled on the driver stop() callback. * - * mac80211 will flushed the workqueue upon interface removal and during + * mac80211 will flush the workqueue upon interface removal and during * suspend. * * All work performed on the mac80211 workqueue must not acquire the RTNL lock. @@ -138,7 +138,7 @@ * field to the frame RX timestamp and report the ack TX timestamp in the * ieee80211_rx_status struct. * - * Similarly, To report hardware timestamps for Timing Measurement or Fine + * Similarly, to report hardware timestamps for Timing Measurement or Fine * Timing Measurement frame TX, the driver should set the SKB's hwtstamp field * to the frame TX timestamp and report the ack RX timestamp in the * ieee80211_tx_status struct. @@ -213,14 +213,31 @@ struct ieee80211_low_level_stats { * @IEEE80211_CHANCTX_CHANGE_RADAR: radar detection flag changed * @IEEE80211_CHANCTX_CHANGE_CHANNEL: switched to another operating channel, * this is used only with channel switching with CSA - * @IEEE80211_CHANCTX_CHANGE_MIN_WIDTH: The min required channel width changed + * @IEEE80211_CHANCTX_CHANGE_MIN_DEF: The min chandef changed + * @IEEE80211_CHANCTX_CHANGE_AP: The AP channel definition changed, so (wider + * bandwidth) OFDMA settings need to be changed + * @IEEE80211_CHANCTX_CHANGE_PUNCTURING: The punctured channel(s) bitmap + * was changed. */ enum ieee80211_chanctx_change { IEEE80211_CHANCTX_CHANGE_WIDTH = BIT(0), IEEE80211_CHANCTX_CHANGE_RX_CHAINS = BIT(1), IEEE80211_CHANCTX_CHANGE_RADAR = BIT(2), IEEE80211_CHANCTX_CHANGE_CHANNEL = BIT(3), - IEEE80211_CHANCTX_CHANGE_MIN_WIDTH = BIT(4), + IEEE80211_CHANCTX_CHANGE_MIN_DEF = BIT(4), + IEEE80211_CHANCTX_CHANGE_AP = BIT(5), + IEEE80211_CHANCTX_CHANGE_PUNCTURING = BIT(6), +}; + +/** + * struct ieee80211_chan_req - A channel "request" + * @oper: channel definition to use for operation + * @ap: the channel definition of the AP, if any + * (otherwise the chan member is %NULL) + */ +struct ieee80211_chan_req { + struct cfg80211_chan_def oper; + struct cfg80211_chan_def ap; }; /** @@ -231,6 +248,9 @@ enum ieee80211_chanctx_change { * * @def: the channel definition * @min_def: the minimum channel definition currently required. + * @ap: the channel definition the AP actually is operating as, + * for use with (wider bandwidth) OFDMA + * @radio_idx: index of the wiphy radio used used for this channel * @rx_chains_static: The number of RX chains that must always be * active on the channel to receive MIMO transmissions * @rx_chains_dynamic: The number of RX chains that must be enabled @@ -243,7 +263,9 @@ enum ieee80211_chanctx_change { struct ieee80211_chanctx_conf { struct cfg80211_chan_def def; struct cfg80211_chan_def min_def; + struct cfg80211_chan_def ap; + int radio_idx; u8 rx_chains_static, rx_chains_dynamic; bool radar_enabled; @@ -340,7 +362,9 @@ struct ieee80211_vif_chanctx_switch { * @BSS_CHANGED_FILS_DISCOVERY: FILS discovery status changed. * @BSS_CHANGED_UNSOL_BCAST_PROBE_RESP: Unsolicited broadcast probe response * status changed. - * @BSS_CHANGED_EHT_PUNCTURING: The channel puncturing bitmap changed. + * @BSS_CHANGED_MLD_VALID_LINKS: MLD valid links status changed. + * @BSS_CHANGED_MLD_TTLM: negotiated TID to link mapping was changed + * @BSS_CHANGED_TPE: transmit power envelope changed */ enum ieee80211_bss_change { BSS_CHANGED_ASSOC = 1<<0, @@ -374,8 +398,10 @@ enum ieee80211_bss_change { BSS_CHANGED_HE_OBSS_PD = 1<<28, BSS_CHANGED_HE_BSS_COLOR = 1<<29, BSS_CHANGED_FILS_DISCOVERY = 1<<30, - BSS_CHANGED_UNSOL_BCAST_PROBE_RESP = 1<<31, - BSS_CHANGED_EHT_PUNCTURING = BIT_ULL(32), + BSS_CHANGED_UNSOL_BCAST_PROBE_RESP = BIT_ULL(31), + BSS_CHANGED_MLD_VALID_LINKS = BIT_ULL(33), + BSS_CHANGED_MLD_TTLM = BIT_ULL(34), + BSS_CHANGED_TPE = BIT_ULL(35), /* when adding here, make sure to change ieee80211_reconfig */ }; @@ -474,9 +500,9 @@ struct ieee80211_ba_event { /** * struct ieee80211_event - event to be sent to the driver * @type: The event itself. See &enum ieee80211_event_type. - * @rssi: relevant if &type is %RSSI_EVENT - * @mlme: relevant if &type is %AUTH_EVENT - * @ba: relevant if &type is %BAR_RX_EVENT or %BA_FRAME_TIMEOUT + * @u.rssi: relevant if &type is %RSSI_EVENT + * @u.mlme: relevant if &type is %AUTH_EVENT + * @u.ba: relevant if &type is %BAR_RX_EVENT or %BA_FRAME_TIMEOUT * @u:union holding the fields above */ struct ieee80211_event { @@ -528,6 +554,39 @@ struct ieee80211_fils_discovery { u32 max_interval; }; +#define IEEE80211_TPE_EIRP_ENTRIES_320MHZ 5 +struct ieee80211_parsed_tpe_eirp { + bool valid; + s8 power[IEEE80211_TPE_EIRP_ENTRIES_320MHZ]; + u8 count; +}; + +#define IEEE80211_TPE_PSD_ENTRIES_320MHZ 16 +struct ieee80211_parsed_tpe_psd { + bool valid; + s8 power[IEEE80211_TPE_PSD_ENTRIES_320MHZ]; + u8 count, n; +}; + +/** + * struct ieee80211_parsed_tpe - parsed transmit power envelope information + * @max_local: maximum local EIRP, one value for 20, 40, 80, 160, 320 MHz each + * (indexed by TX power category) + * @max_reg_client: maximum regulatory client EIRP, one value for 20, 40, 80, + * 160, 320 MHz each + * (indexed by TX power category) + * @psd_local: maximum local power spectral density, one value for each 20 MHz + * subchannel per bss_conf's chanreq.oper + * (indexed by TX power category) + * @psd_reg_client: maximum regulatory power spectral density, one value for + * each 20 MHz subchannel per bss_conf's chanreq.oper + * (indexed by TX power category) + */ +struct ieee80211_parsed_tpe { + struct ieee80211_parsed_tpe_eirp max_local[2], max_reg_client[2]; + struct ieee80211_parsed_tpe_psd psd_local[2], psd_reg_client[2]; +}; + /** * struct ieee80211_bss_conf - holds the BSS's changing parameters * @@ -535,12 +594,14 @@ struct ieee80211_fils_discovery { * to that BSS) that can change during the lifetime of the BSS. * * @vif: reference to owning VIF + * @bss: the cfg80211 bss descriptor. Valid only for a station, and only + * when associated. Note: This contains information which is not + * necessarily authenticated. For example, information coming from probe + * responses. * @addr: (link) address used locally * @link_id: link ID, or 0 for non-MLO * @htc_trig_based_pkt_ext: default PE in 4us units, if BSS supports HE * @uora_exists: is the UORA element advertised by AP - * @ack_enabled: indicates support to receive a multi-TID that solicits either - * ACK, BACK or both * @uora_ocw_range: UORA element's OCW Range field * @frame_time_rts_th: HE duration RTS threshold, in units of 32us * @he_support: does this BSS support HE @@ -581,7 +642,7 @@ struct ieee80211_fils_discovery { * @mcast_rate: per-band multicast rate index + 1 (0: disabled) * @bssid: The BSSID for this BSS * @enable_beacon: whether beaconing should be enabled or not - * @chandef: Channel definition for this BSS -- the hardware might be + * @chanreq: Channel request for this BSS -- the hardware might be * configured a higher bandwidth than this BSS uses, for example. * @mu_group: VHT MU-MIMO group membership data * @ht_operation_mode: HT operation mode like in &struct ieee80211_ht_operation. @@ -621,6 +682,9 @@ struct ieee80211_fils_discovery { * responder functionality. * @ftmr_params: configurable lci/civic parameter when enabling FTM responder. * @nontransmitted: this BSS is a nontransmitted BSS profile + * @tx_bss_conf: Pointer to the BSS configuration of transmitting interface + * if MBSSID is enabled. This pointer is RCU-protected due to CSA finish + * and BSS color change flows accessing it. * @transmitter_bssid: the address of transmitter AP * @bssid_index: index inside the multiple BSSID set * @bssid_indicator: 2^bssid_indicator is the maximum number of APs in set @@ -638,24 +702,18 @@ struct ieee80211_fils_discovery { * @beacon_tx_rate: The configured beacon transmit rate that needs to be passed * to driver when rate control is offloaded to firmware. * @power_type: power type of BSS for 6 GHz - * @tx_pwr_env: transmit power envelope array of BSS. - * @tx_pwr_env_num: number of @tx_pwr_env. + * @tpe: transmit power envelope information * @pwr_reduction: power constraint of BSS. * @eht_support: does this BSS support EHT - * @eht_puncturing: bitmap to indicate which channels are punctured in this BSS - * @csa_active: marks whether a channel switch is going on. Internally it is - * write-protected by sdata_lock and local->mtx so holding either is fine - * for read access. - * @csa_punct_bitmap: new puncturing bitmap for channel switch + * @epcs_support: does this BSS support EPCS + * @csa_active: marks whether a channel switch is going on. * @mu_mimo_owner: indicates interface owns MU-MIMO capability * @chanctx_conf: The channel context this interface is assigned to, or %NULL * when it is not assigned. This pointer is RCU-protected due to the TX * path needing to access it; even though the netdev carrier will always * be off when it is %NULL there can still be races and packets could be * processed after it switches back to %NULL. - * @color_change_active: marks whether a color change is ongoing. Internally it is - * write-protected by sdata_lock and local->mtx so holding either is fine - * for read access. + * @color_change_active: marks whether a color change is ongoing. * @color_change_color: the bss color that will be used after the change. * @ht_ldpc: in AP mode, indicates interface has HT LDPC capability. * @vht_ldpc: in AP mode, indicates interface has VHT LDPC capability. @@ -683,9 +741,29 @@ struct ieee80211_fils_discovery { * beamformee * @eht_mu_beamformer: in AP-mode, does this BSS enable operation as an EHT MU * beamformer + * @eht_80mhz_full_bw_ul_mumimo: in AP-mode, does this BSS support the + * reception of an EHT TB PPDU on an RU that spans the entire PPDU + * bandwidth + * @eht_disable_mcs15: disable EHT-MCS 15 reception capability. + * @bss_param_ch_cnt: in BSS-mode, the BSS params change count. This + * information is the latest known value. It can come from this link's + * beacon or from a beacon sent by another link. + * @bss_param_ch_cnt_link_id: in BSS-mode, the link_id to which the beacon + * that updated &bss_param_ch_cnt belongs. E.g. if link 1 doesn't hear + * its beacons, and link 2 sent a beacon with an RNR element that updated + * link 1's BSS params change count, then, link 1's + * bss_param_ch_cnt_link_id will be 2. That means that link 1 knows that + * link 2 was the link that updated its bss_param_ch_cnt value. + * In case link 1 hears its beacon again, bss_param_ch_cnt_link_id will + * be updated to 1, even if bss_param_ch_cnt didn't change. This allows + * the link to know that it heard the latest value from its own beacon + * (as opposed to hearing its value from another link's beacon). + * @s1g_long_beacon_period: number of beacon intervals between each long + * beacon transmission. */ struct ieee80211_bss_conf { struct ieee80211_vif *vif; + struct cfg80211_bss *bss; const u8 *bssid; unsigned int link_id; @@ -718,7 +796,7 @@ struct ieee80211_bss_conf { u32 cqm_rssi_hyst; s32 cqm_rssi_low; s32 cqm_rssi_high; - struct cfg80211_chan_def chandef; + struct ieee80211_chan_req chanreq; struct ieee80211_mu_group_data mu_group; bool qos; bool hidden_ssid; @@ -732,6 +810,7 @@ struct ieee80211_bss_conf { struct ieee80211_ftm_responder_params *ftmr_params; /* Multiple BSSID data */ bool nontransmitted; + struct ieee80211_bss_conf __rcu *tx_bss_conf; u8 transmitter_bssid[ETH_ALEN]; u8 bssid_index; u8 bssid_indicator; @@ -747,14 +826,13 @@ struct ieee80211_bss_conf { u32 unsol_bcast_probe_resp_interval; struct cfg80211_bitrate_mask beacon_tx_rate; enum ieee80211_ap_reg_power power_type; - struct ieee80211_tx_pwr_env tx_pwr_env[IEEE80211_TPE_MAX_IE_COUNT]; - u8 tx_pwr_env_num; + + struct ieee80211_parsed_tpe tpe; + u8 pwr_reduction; bool eht_support; - u16 eht_puncturing; - + bool epcs_support; bool csa_active; - u16 csa_punct_bitmap; bool mu_mimo_owner; struct ieee80211_chanctx_conf __rcu *chanctx_conf; @@ -776,6 +854,13 @@ struct ieee80211_bss_conf { bool eht_su_beamformer; bool eht_su_beamformee; bool eht_mu_beamformer; + bool eht_80mhz_full_bw_ul_mumimo; + bool eht_disable_mcs15; + + u8 bss_param_ch_cnt; + u8 bss_param_ch_cnt_link_id; + + u8 s1g_long_beacon_period; }; /** @@ -936,6 +1021,9 @@ enum mac80211_tx_info_flags { * of their QoS TID or other priority field values. * @IEEE80211_TX_CTRL_MCAST_MLO_FIRST_TX: first MLO TX, used mostly internally * for sequence number assignment + * @IEEE80211_TX_CTRL_DONT_USE_RATE_MASK: Don't use rate mask for this frame + * which is transmitted due to scanning or offchannel TX, not in normal + * operation on the interface. * @IEEE80211_TX_CTRL_MLO_LINK: If not @IEEE80211_LINK_UNSPECIFIED, this * frame should be transmitted on the specific link. This really is * only relevant for frames that do not have data present, and is @@ -956,6 +1044,7 @@ enum mac80211_tx_control_flags { IEEE80211_TX_CTRL_NO_SEQNO = BIT(7), IEEE80211_TX_CTRL_DONT_REORDER = BIT(8), IEEE80211_TX_CTRL_MCAST_MLO_FIRST_TX = BIT(9), + IEEE80211_TX_CTRL_DONT_USE_RATE_MASK = BIT(10), IEEE80211_TX_CTRL_MLO_LINK = 0xf0000000, }; @@ -1082,6 +1171,11 @@ struct ieee80211_tx_rate { #define IEEE80211_MAX_TX_RETRY 31 +static inline bool ieee80211_rate_valid(struct ieee80211_tx_rate *rate) +{ + return rate->idx >= 0 && rate->count > 0; +} + static inline void ieee80211_rate_set_vht(struct ieee80211_tx_rate *rate, u8 mcs, u8 nss) { @@ -1115,7 +1209,9 @@ ieee80211_rate_get_vht_nss(const struct ieee80211_tx_rate *rate) * not valid if the interface is an MLD since we won't know which * link the frame will be transmitted on * @hw_queue: HW queue to put the frame on, skb_get_queue_mapping() gives the AC - * @ack_frame_id: internal frame ID for TX status, used internally + * @status_data: internal data for TX status handling, assigned privately, + * see also &enum ieee80211_status_data for the internal documentation + * @status_data_idr: indicates status data is IDR allocated ID for ack frame * @tx_time_est: TX time estimate in units of 4us, used internally * @control: union part for control data * @control.rates: TX rates array to try @@ -1145,20 +1241,16 @@ ieee80211_rate_get_vht_nss(const struct ieee80211_tx_rate *rate) * @ack: union part for pure ACK data * @ack.cookie: cookie for the ACK * @driver_data: array of driver_data pointers - * @ampdu_ack_len: number of acked aggregated frames. - * relevant only if IEEE80211_TX_STAT_AMPDU was set. - * @ampdu_len: number of aggregated frames. - * relevant only if IEEE80211_TX_STAT_AMPDU was set. - * @ack_signal: signal strength of the ACK frame */ struct ieee80211_tx_info { /* common information */ u32 flags; u32 band:3, - ack_frame_id:13, + status_data_idr:1, + status_data:13, hw_queue:4, tx_time_est:10; - /* 2 free bits */ + /* 1 free bit */ union { struct { @@ -1172,7 +1264,11 @@ struct ieee80211_tx_info { u8 use_cts_prot:1; u8 short_preamble:1; u8 skip_table:1; - /* 2 bytes free */ + + /* for injection only (bitmap) */ + u8 antennas:2; + + /* 14 bits free */ }; /* only needed before rate control */ unsigned long jiffies; @@ -1352,6 +1448,9 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * the frame. * @RX_FLAG_FAILED_PLCP_CRC: Set this flag if the PCLP check failed on * the frame. + * @RX_FLAG_MACTIME: The timestamp passed in the RX status (@mactime + * field) is valid if this field is non-zero, and the position + * where the timestamp was sampled depends on the value. * @RX_FLAG_MACTIME_START: The timestamp passed in the RX status (@mactime * field) is valid and contains the time the first symbol of the MPDU * was received. This is useful in monitor mode and for proper IBSS @@ -1361,6 +1460,11 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * (including FCS) was received. * @RX_FLAG_MACTIME_PLCP_START: The timestamp passed in the RX status (@mactime * field) is valid and contains the time the SYNC preamble was received. + * @RX_FLAG_MACTIME_IS_RTAP_TS64: The timestamp passed in the RX status @mactime + * is only for use in the radiotap timestamp header, not otherwise a valid + * @mactime value. Note this is a separate flag so that we continue to see + * %RX_FLAG_MACTIME as unset. Also note that in this case the timestamp is + * reported to be 64 bits wide, not just 32. * @RX_FLAG_NO_SIGNAL_VAL: The signal strength value is not present. * Valid only for data frames (mainly A-MPDU) * @RX_FLAG_AMPDU_DETAILS: A-MPDU details are known, in particular the reference @@ -1371,8 +1475,6 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * @RX_FLAG_AMPDU_IS_LAST: this subframe is the last subframe of the A-MPDU * @RX_FLAG_AMPDU_DELIM_CRC_ERROR: A delimiter CRC error has been detected * on this subframe - * @RX_FLAG_AMPDU_DELIM_CRC_KNOWN: The delimiter CRC field is known (the CRC - * is stored in the @ampdu_delimiter_crc field) * @RX_FLAG_MIC_STRIPPED: The mic was stripped of this packet. Decryption was * done by the hardware * @RX_FLAG_ONLY_MONITOR: Report frame only to monitor interfaces without @@ -1427,16 +1529,17 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * known the frame shouldn't be reported. * @RX_FLAG_8023: the frame has an 802.3 header (decap offload performed by * hardware or driver) + * @RX_FLAG_RADIOTAP_VHT: VHT radiotap data is present */ enum mac80211_rx_flags { RX_FLAG_MMIC_ERROR = BIT(0), RX_FLAG_DECRYPTED = BIT(1), - RX_FLAG_MACTIME_PLCP_START = BIT(2), + RX_FLAG_ONLY_MONITOR = BIT(2), RX_FLAG_MMIC_STRIPPED = BIT(3), RX_FLAG_IV_STRIPPED = BIT(4), RX_FLAG_FAILED_FCS_CRC = BIT(5), RX_FLAG_FAILED_PLCP_CRC = BIT(6), - RX_FLAG_MACTIME_START = BIT(7), + RX_FLAG_MACTIME_IS_RTAP_TS64 = BIT(7), RX_FLAG_NO_SIGNAL_VAL = BIT(8), RX_FLAG_AMPDU_DETAILS = BIT(9), RX_FLAG_PN_VALIDATED = BIT(10), @@ -1444,9 +1547,11 @@ enum mac80211_rx_flags { RX_FLAG_AMPDU_LAST_KNOWN = BIT(12), RX_FLAG_AMPDU_IS_LAST = BIT(13), RX_FLAG_AMPDU_DELIM_CRC_ERROR = BIT(14), - RX_FLAG_AMPDU_DELIM_CRC_KNOWN = BIT(15), - RX_FLAG_MACTIME_END = BIT(16), - RX_FLAG_ONLY_MONITOR = BIT(17), + /* one free bit at 15 */ + RX_FLAG_MACTIME = BIT(16) | BIT(17), + RX_FLAG_MACTIME_PLCP_START = 1 << 16, + RX_FLAG_MACTIME_START = 2 << 16, + RX_FLAG_MACTIME_END = 3 << 16, RX_FLAG_SKIP_MONITOR = BIT(18), RX_FLAG_AMSDU_MORE = BIT(19), RX_FLAG_RADIOTAP_TLV_AT_END = BIT(20), @@ -1460,6 +1565,7 @@ enum mac80211_rx_flags { RX_FLAG_RADIOTAP_LSIG = BIT(28), RX_FLAG_NO_PSDU = BIT(29), RX_FLAG_8023 = BIT(30), + RX_FLAG_RADIOTAP_VHT = BIT(31), }; /** @@ -1539,7 +1645,6 @@ enum mac80211_rx_encoding { * @rx_flags: internal RX flags for mac80211 * @ampdu_reference: A-MPDU reference number, must be a different value for * each A-MPDU but the same for each subframe within one A-MPDU - * @ampdu_delimiter_crc: A-MPDU delimiter CRC * @zero_length_psdu_type: radiotap type of the 0-length PSDU * @link_valid: if the link which is identified by @link_id is valid. This flag * is set only when connection is MLO. @@ -1577,7 +1682,6 @@ struct ieee80211_rx_status { s8 signal; u8 chains; s8 chain_signal[IEEE80211_MAX_CHAINS]; - u8 ampdu_delimiter_crc; u8 zero_length_psdu_type; u8 link_valid:1, link_id:4; }; @@ -1729,8 +1833,9 @@ struct ieee80211_conf { * @chandef: the new channel to switch to * @count: the number of TBTT's until the channel switch event * @delay: maximum delay between the time the AP transmitted the last beacon in - * current channel and the expected time of the first beacon in the new - * channel, expressed in TU. + * current channel and the expected time of the first beacon in the new + * channel, expressed in TU. + * @link_id: the link ID of the link doing the channel switch, 0 for non-MLO */ struct ieee80211_channel_switch { u64 timestamp; @@ -1738,6 +1843,7 @@ struct ieee80211_channel_switch { bool block_tx; struct cfg80211_chan_def chandef; u8 count; + u8 link_id; u32 delay; }; @@ -1757,15 +1863,24 @@ struct ieee80211_channel_switch { * @IEEE80211_VIF_GET_NOA_UPDATE: request to handle NOA attributes * and send P2P_PS notification to the driver if NOA changed, even * this is not pure P2P vif. - * @IEEE80211_VIF_DISABLE_SMPS_OVERRIDE: disable user configuration of - * SMPS mode via debugfs. + * @IEEE80211_VIF_EML_ACTIVE: The driver indicates that EML operation is + * enabled for the interface. + * @IEEE80211_VIF_IGNORE_OFDMA_WIDER_BW: Ignore wider bandwidth OFDMA + * operation on this interface and request a channel context without + * the AP definition. Use this e.g. because the device is able to + * handle OFDMA (downlink and trigger for uplink) on a per-AP basis. + * @IEEE80211_VIF_REMOVE_AP_AFTER_DISASSOC: indicates that the AP sta should + * be removed only after setting the vif as unassociated, and not the + * opposite. Only relevant for STA vifs. */ enum ieee80211_vif_flags { IEEE80211_VIF_BEACON_FILTER = BIT(0), IEEE80211_VIF_SUPPORTS_CQM_RSSI = BIT(1), IEEE80211_VIF_SUPPORTS_UAPSD = BIT(2), IEEE80211_VIF_GET_NOA_UPDATE = BIT(3), - IEEE80211_VIF_DISABLE_SMPS_OVERRIDE = BIT(4), + IEEE80211_VIF_EML_ACTIVE = BIT(4), + IEEE80211_VIF_IGNORE_OFDMA_WIDER_BW = BIT(5), + IEEE80211_VIF_REMOVE_AP_AFTER_DISASSOC = BIT(6), }; @@ -1795,9 +1910,11 @@ enum ieee80211_offload_flags { * @ps: power-save mode (STA only). This flag is NOT affected by * offchannel/dynamic_ps operations. * @aid: association ID number, valid only when @assoc is true - * @eml_cap: EML capabilities as described in P802.11be_D2.2 Figure 9-1002k. + * @eml_cap: EML capabilities as described in P802.11be_D4.1 Figure 9-1001j. * @eml_med_sync_delay: Medium Synchronization delay as described in - * P802.11be_D2.2 Figure 9-1002j. + * P802.11be_D4.1 Figure 9-1001i. + * @mld_capa_op: MLD Capabilities and Operations per P802.11be_D4.1 + * Figure 9-1001k * @arp_addr_list: List of IPv4 addresses for hardware ARP filtering. The * may filter ARP queries targeted for other addresses than listed here. * The driver must allow ARP queries targeted for all address listed here @@ -1822,6 +1939,7 @@ struct ieee80211_vif_cfg { u16 aid; u16 eml_cap; u16 eml_med_sync_delay; + u16 mld_capa_op; __be32 arp_addr_list[IEEE80211_BSS_ARP_ADDR_LIST_LEN]; int arp_addr_cnt; @@ -1832,6 +1950,35 @@ struct ieee80211_vif_cfg { u8 ap_addr[ETH_ALEN] __aligned(2); }; +#define IEEE80211_TTLM_NUM_TIDS 8 + +/** + * struct ieee80211_neg_ttlm - negotiated TID to link map info + * + * @downlink: bitmap of active links per TID for downlink, or 0 if mapping for + * this TID is not included. + * @uplink: bitmap of active links per TID for uplink, or 0 if mapping for this + * TID is not included. + * @valid: info is valid or not. + */ +struct ieee80211_neg_ttlm { + u16 downlink[IEEE80211_TTLM_NUM_TIDS]; + u16 uplink[IEEE80211_TTLM_NUM_TIDS]; + bool valid; +}; + +/** + * enum ieee80211_neg_ttlm_res - return value for negotiated TTLM handling + * @NEG_TTLM_RES_ACCEPT: accept the request + * @NEG_TTLM_RES_REJECT: reject the request + * @NEG_TTLM_RES_SUGGEST_PREFERRED: reject and suggest a new mapping + */ +enum ieee80211_neg_ttlm_res { + NEG_TTLM_RES_ACCEPT, + NEG_TTLM_RES_REJECT, + NEG_TTLM_RES_SUGGEST_PREFERRED +}; + /** * struct ieee80211_vif - per-interface data * @@ -1848,9 +1995,18 @@ struct ieee80211_vif_cfg { * @active_links: The bitmap of active links, or 0 for non-MLO. * The driver shouldn't change this directly, but use the * API calls meant for that purpose. - * @dormant_links: bitmap of valid but disabled links, or 0 for non-MLO. - * Must be a subset of valid_links. + * @dormant_links: subset of the valid links that are disabled/suspended + * due to advertised or negotiated TTLM respectively. + * 0 for non-MLO. + * @suspended_links: subset of dormant_links representing links that are + * suspended due to negotiated TTLM, and could be activated in the + * future by tearing down the TTLM negotiation. + * 0 for non-MLO. + * @neg_ttlm: negotiated TID to link mapping info. + * see &struct ieee80211_neg_ttlm. * @addr: address of this interface + * @addr_valid: indicates if the address is actively used. Set to false for + * passive monitor interfaces, true in all other cases. * @p2p: indicates whether this AP or STA interface is a p2p * interface, i.e. a GO or p2p-sta respectively * @netdev_features: tx netdev features supported by the hardware for this @@ -1880,15 +2036,16 @@ struct ieee80211_vif_cfg { * @txq: the multicast data TX queue * @offload_flags: 802.3 -> 802.11 enapsulation offload flags, see * &enum ieee80211_offload_flags. - * @mbssid_tx_vif: Pointer to the transmitting interface if MBSSID is enabled. */ struct ieee80211_vif { enum nl80211_iftype type; struct ieee80211_vif_cfg cfg; struct ieee80211_bss_conf bss_conf; struct ieee80211_bss_conf __rcu *link_conf[IEEE80211_MLD_MAX_NUM_LINKS]; - u16 valid_links, active_links, dormant_links; + u16 valid_links, active_links, dormant_links, suspended_links; + struct ieee80211_neg_ttlm neg_ttlm; u8 addr[ETH_ALEN] __aligned(2); + bool addr_valid; bool p2p; u8 cab_queue; @@ -1907,8 +2064,6 @@ struct ieee80211_vif { bool probe_req_reg; bool rx_mcast_action_reg; - struct ieee80211_vif *mbssid_tx_vif; - /* must be last */ u8 drv_priv[] __aligned(sizeof(void *)); }; @@ -1934,11 +2089,26 @@ static inline bool ieee80211_vif_is_mld(const struct ieee80211_vif *vif) return vif->valid_links != 0; } +/** + * ieee80211_vif_link_active - check if a given link is active + * @vif: the vif + * @link_id: the link ID to check + * Return: %true if the vif is an MLD and the link is active, or if + * the vif is not an MLD and the link ID is 0; %false otherwise. + */ +static inline bool ieee80211_vif_link_active(const struct ieee80211_vif *vif, + unsigned int link_id) +{ + if (!ieee80211_vif_is_mld(vif)) + return link_id == 0; + return vif->active_links & BIT(link_id); +} + #define for_each_vif_active_link(vif, link, link_id) \ for (link_id = 0; link_id < ARRAY_SIZE((vif)->link_conf); link_id++) \ if ((!(vif)->active_links || \ (vif)->active_links & BIT(link_id)) && \ - (link = rcu_dereference((vif)->link_conf[link_id]))) + (link = link_conf_dereference_check(vif, link_id))) static inline bool ieee80211_vif_is_mesh(struct ieee80211_vif *vif) { @@ -1955,7 +2125,7 @@ static inline bool ieee80211_vif_is_mesh(struct ieee80211_vif *vif) * This can be used by mac80211 drivers with direct cfg80211 APIs * (like the vendor commands) that get a wdev. * - * Note that this function may return %NULL if the given wdev isn't + * Return: pointer to the wdev, or %NULL if the given wdev isn't * associated with a vif that the driver knows about (e.g. monitor * or AP_VLAN interfaces.) */ @@ -1968,25 +2138,23 @@ struct ieee80211_vif *wdev_to_ieee80211_vif(struct wireless_dev *wdev); * This can be used by mac80211 drivers with direct cfg80211 APIs * (like the vendor commands) that needs to get the wdev for a vif. * This can also be useful to get the netdev associated to a vif. + * + * Return: pointer to the wdev */ struct wireless_dev *ieee80211_vif_to_wdev(struct ieee80211_vif *vif); -/** - * lockdep_vif_mutex_held - for lockdep checks on link poiners - * @vif: the interface to check - */ -static inline bool lockdep_vif_mutex_held(struct ieee80211_vif *vif) +static inline bool lockdep_vif_wiphy_mutex_held(struct ieee80211_vif *vif) { - return lockdep_is_held(&ieee80211_vif_to_wdev(vif)->mtx); + return lockdep_is_held(&ieee80211_vif_to_wdev(vif)->wiphy->mtx); } #define link_conf_dereference_protected(vif, link_id) \ rcu_dereference_protected((vif)->link_conf[link_id], \ - lockdep_vif_mutex_held(vif)) + lockdep_vif_wiphy_mutex_held(vif)) #define link_conf_dereference_check(vif, link_id) \ rcu_dereference_check((vif)->link_conf[link_id], \ - lockdep_vif_mutex_held(vif)) + lockdep_vif_wiphy_mutex_held(vif)) /** * enum ieee80211_key_flags - key flags @@ -2030,8 +2198,10 @@ static inline bool lockdep_vif_mutex_held(struct ieee80211_vif *vif) * @IEEE80211_KEY_FLAG_GENERATE_MMIC on the same key. * @IEEE80211_KEY_FLAG_NO_AUTO_TX: Key needs explicit Tx activation. * @IEEE80211_KEY_FLAG_GENERATE_MMIE: This flag should be set by the driver - * for a AES_CMAC key to indicate that it requires sequence number - * generation only + * for a AES_CMAC or a AES_GMAC key to indicate that it requires sequence + * number generation only + * @IEEE80211_KEY_FLAG_SPP_AMSDU: SPP A-MSDUs can be used with this key + * (set by mac80211 from the sta->spp_amsdu flag) */ enum ieee80211_key_flags { IEEE80211_KEY_FLAG_GENERATE_IV_MGMT = BIT(0), @@ -2045,6 +2215,7 @@ enum ieee80211_key_flags { IEEE80211_KEY_FLAG_PUT_MIC_SPACE = BIT(8), IEEE80211_KEY_FLAG_NO_AUTO_TX = BIT(9), IEEE80211_KEY_FLAG_GENERATE_MMIE = BIT(10), + IEEE80211_KEY_FLAG_SPP_AMSDU = BIT(11), }; /** @@ -2060,7 +2231,7 @@ enum ieee80211_key_flags { * @tx_pn: PN used for TX keys, may be used by the driver as well if it * needs to do software PN assignment by itself (e.g. due to TSO) * @flags: key flags, see &enum ieee80211_key_flags. - * @keyidx: the key index (0-3) + * @keyidx: the key index (0-7) * @keylen: key material length * @key: key material. For ALG_TKIP the key is encoded as a 256-bit (32 byte) * data block: @@ -2069,7 +2240,7 @@ enum ieee80211_key_flags { * - Temporal Authenticator Rx MIC Key (64 bits) * @icv_len: The ICV length for this key type * @iv_len: The IV length for this key type - * @link_id: the link ID for MLO, or -1 for non-MLO or pairwise keys + * @link_id: the link ID, 0 for non-MLO, or -1 for pairwise keys */ struct ieee80211_key_conf { atomic64_t tx_pn; @@ -2180,6 +2351,8 @@ enum ieee80211_sta_rx_bandwidth { IEEE80211_STA_RX_BW_320, }; +#define IEEE80211_STA_RX_BW_MAX IEEE80211_STA_RX_BW_320 + /** * struct ieee80211_sta_rates - station rate selection table * @@ -2261,6 +2434,7 @@ struct ieee80211_sta_aggregates { * @he_cap: HE capabilities of this STA * @he_6ghz_capa: on 6 GHz, holds the HE 6 GHz band capabilities * @eht_cap: EHT capabilities of this STA + * @s1g_cap: S1G capabilities of this STA * @agg: per-link data for multi-link aggregation * @bandwidth: current bandwidth the station can receive with * @rx_nss: in HT/VHT, the maximum number of spatial streams the @@ -2283,6 +2457,7 @@ struct ieee80211_link_sta { struct ieee80211_sta_he_cap he_cap; struct ieee80211_he_6ghz_capa he_6ghz_capa; struct ieee80211_sta_eht_cap eht_cap; + struct ieee80211_sta_s1g_cap s1g_cap; struct ieee80211_sta_aggregates agg; @@ -2325,6 +2500,7 @@ struct ieee80211_link_sta { * @max_amsdu_subframes: indicates the maximal number of MSDUs in a single * A-MSDU. Taken from the Extended Capabilities element. 0 means * unlimited. + * @eml_cap: EML capabilities of this MLO station * @cur: currently valid data as aggregated from the active links * For non MLO STA it will point to the deflink data. For MLO STA * ieee80211_sta_recalc_aggregates() must be called to update it. @@ -2343,9 +2519,10 @@ struct ieee80211_link_sta { * would be assigned to link[link_id] where link_id is the id assigned * by the AP. * @valid_links: bitmap of valid links, or 0 for non-MLO + * @spp_amsdu: indicates whether the STA uses SPP A-MSDU or not. */ struct ieee80211_sta { - u8 addr[ETH_ALEN]; + u8 addr[ETH_ALEN] __aligned(2); u16 aid; u16 max_rx_aggregation_subframes; bool wme; @@ -2356,7 +2533,9 @@ struct ieee80211_sta { bool tdls_initiator; bool mfp; bool mlo; + bool spp_amsdu; u8 max_amsdu_subframes; + u16 eml_cap; struct ieee80211_sta_aggregates *cur; @@ -2393,7 +2572,7 @@ static inline bool lockdep_sta_mutex_held(struct ieee80211_sta *pubsta) for (link_id = 0; link_id < ARRAY_SIZE((sta)->link); link_id++) \ if ((!(vif)->active_links || \ (vif)->active_links & BIT(link_id)) && \ - ((link_sta) = link_sta_dereference_protected(sta, link_id))) + ((link_sta) = link_sta_dereference_check(sta, link_id))) /** * enum sta_notify_cmd - sta notify command @@ -2539,6 +2718,11 @@ struct ieee80211_txq { * a virtual monitor interface when monitor interfaces are the only * active interfaces. * + * @IEEE80211_HW_NO_VIRTUAL_MONITOR: The driver would like to be informed + * of any monitor interface, as well as their configured channel. + * This is useful for supporting multiple monitor interfaces on different + * channels. + * * @IEEE80211_HW_NO_AUTO_VIF: The driver would like for no wlanX to * be created. It is expected user-space will create vifs as * desired (and thus have them named as desired). @@ -2626,14 +2810,6 @@ struct ieee80211_txq { * @IEEE80211_HW_SUPPORTS_TDLS_BUFFER_STA: Hardware supports buffer STA on * TDLS links. * - * @IEEE80211_HW_DEAUTH_NEED_MGD_TX_PREP: The driver requires the - * mgd_prepare_tx() callback to be called before transmission of a - * deauthentication frame in case the association was completed but no - * beacon was heard. This is required in multi-channel scenarios, where the - * virtual interface might not be given air time for the transmission of - * the frame, as it is not synced with the AP/P2P GO yet, and thus the - * deauthentication frame might not be transmitted. - * * @IEEE80211_HW_DOESNT_SUPPORT_QOS_NDP: The driver (or firmware) doesn't * support QoS NDP for AP probing - that's most likely a driver bug. * @@ -2680,6 +2856,19 @@ struct ieee80211_txq { * @IEEE80211_HW_MLO_MCAST_MULTI_LINK_TX: Hardware/driver handles transmitting * multicast frames on all links, mac80211 should not do that. * + * @IEEE80211_HW_DISALLOW_PUNCTURING: HW requires disabling puncturing in EHT + * and connecting with a lower bandwidth instead + * + * @IEEE80211_HW_HANDLES_QUIET_CSA: HW/driver handles quieting for CSA, so + * no need to stop queues. This really should be set by a driver that + * implements MLO, so operation can continue on other links when one + * link is switching. + * + * @IEEE80211_HW_STRICT: strictly enforce certain things mandated by the spec + * but otherwise ignored/worked around for interoperability. This is a + * HW flag so drivers can opt in according to their own control, e.g. in + * testing. + * * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays */ enum ieee80211_hw_flags { @@ -2696,6 +2885,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_SUPPORTS_DYNAMIC_PS, IEEE80211_HW_MFP_CAPABLE, IEEE80211_HW_WANT_MONITOR_VIF, + IEEE80211_HW_NO_VIRTUAL_MONITOR, IEEE80211_HW_NO_AUTO_VIF, IEEE80211_HW_SW_CRYPTO_CONTROL, IEEE80211_HW_SUPPORT_FAST_XMIT, @@ -2723,7 +2913,6 @@ enum ieee80211_hw_flags { IEEE80211_HW_REPORTS_LOW_ACK, IEEE80211_HW_SUPPORTS_TX_FRAG, IEEE80211_HW_SUPPORTS_TDLS_BUFFER_STA, - IEEE80211_HW_DEAUTH_NEED_MGD_TX_PREP, IEEE80211_HW_DOESNT_SUPPORT_QOS_NDP, IEEE80211_HW_BUFF_MMPDU_TXQ, IEEE80211_HW_SUPPORTS_VHT_EXT_NSS_BW, @@ -2737,6 +2926,9 @@ enum ieee80211_hw_flags { IEEE80211_HW_SUPPORTS_CONC_MON_RX_DECAP, IEEE80211_HW_DETECTS_COLOR_COLLISION, IEEE80211_HW_MLO_MCAST_MULTI_LINK_TX, + IEEE80211_HW_DISALLOW_PUNCTURING, + IEEE80211_HW_HANDLES_QUIET_CSA, + IEEE80211_HW_STRICT, /* keep last, obviously */ NUM_IEEE80211_HW_FLAGS @@ -2825,8 +3017,6 @@ enum ieee80211_hw_flags { * the default is _GI | _BANDWIDTH. * Use the %IEEE80211_RADIOTAP_VHT_KNOWN_\* values. * - * @radiotap_he: HE radiotap validity flags - * * @radiotap_timestamp: Information for the radiotap timestamp field; if the * @units_pos member is set to a non-negative value then the timestamp * field will be added and populated from the &struct ieee80211_rx_status @@ -3004,6 +3194,10 @@ ieee80211_get_tx_rate(const struct ieee80211_hw *hw, { if (WARN_ON_ONCE(c->control.rates[0].idx < 0)) return NULL; + + if (c->band >= NUM_NL80211_BANDS) + return NULL; + return &hw->wiphy->bands[c->band]->bitrates[c->control.rates[0].idx]; } @@ -3036,6 +3230,19 @@ ieee80211_get_alt_retry_rate(const struct ieee80211_hw *hw, void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb); /** + * ieee80211_purge_tx_queue - purge TX skb queue + * @hw: the hardware + * @skbs: the skbs + * + * Free a set of transmit skbs. Use this function when device is going to stop + * but some transmit skbs without TX status are still queued. + * This function does not take the list lock and the caller must hold the + * relevant locks to use it. + */ +void ieee80211_purge_tx_queue(struct ieee80211_hw *hw, + struct sk_buff_head *skbs); + +/** * DOC: Hardware crypto acceleration * * mac80211 is capable of taking advantage of many hardware @@ -3056,7 +3263,7 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb); * The set_key() call for the %SET_KEY command should return 0 if * the key is now in use, -%EOPNOTSUPP or -%ENOSPC if it couldn't be * added; if you return 0 then hw_key_idx must be assigned to the - * hardware key index, you are free to use the full u8 range. + * hardware key index. You are free to use the full u8 range. * * Note that in the case that the @IEEE80211_HW_SW_CRYPTO_CONTROL flag is * set, mac80211 will not automatically fall back to software crypto if @@ -3066,7 +3273,7 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb); * When the cmd is %DISABLE_KEY then it must succeed. * * Note that it is permissible to not decrypt a frame even if a key - * for it has been uploaded to hardware, the stack will not make any + * for it has been uploaded to hardware. The stack will not make any * decision based on whether a key has been uploaded or not but rather * based on the receive flags. * @@ -3081,7 +3288,7 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb); * The update_tkip_key() call updates the driver with the new phase 1 key. * This happens every time the iv16 wraps around (every 65536 packets). The * set_key() call will happen only once for each key (unless the AP did - * rekeying), it will not include a valid phase 1 key. The valid phase 1 key is + * rekeying); it will not include a valid phase 1 key. The valid phase 1 key is * provided by update_tkip_key only. The trigger that makes mac80211 call this * handler is software decryption with wrap around of iv16. * @@ -3108,7 +3315,7 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb); * * mac80211 has support for various powersave implementations. * - * First, it can support hardware that handles all powersaving by itself, + * First, it can support hardware that handles all powersaving by itself; * such hardware should simply set the %IEEE80211_HW_SUPPORTS_PS hardware * flag. In that case, it will be told about the desired powersave mode * with the %IEEE80211_CONF_PS flag depending on the association status. @@ -3133,12 +3340,12 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb); * %IEEE80211_HW_PS_NULLFUNC_STACK flags. The hardware is of course still * required to pass up beacons. The hardware is still required to handle * waking up for multicast traffic; if it cannot the driver must handle that - * as best as it can, mac80211 is too slow to do that. + * as best as it can; mac80211 is too slow to do that. * * Dynamic powersave is an extension to normal powersave in which the * hardware stays awake for a user-specified period of time after sending a * frame so that reply frames need not be buffered and therefore delayed to - * the next wakeup. It's compromise of getting good enough latency when + * the next wakeup. It's a compromise of getting good enough latency when * there's data traffic and still saving significantly power in idle * periods. * @@ -3203,7 +3410,7 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb); * Note that change, for the sake of simplification, also includes information * elements appearing or disappearing from the beacon. * - * Some hardware supports an "ignore list" instead, just make sure nothing + * Some hardware supports an "ignore list" instead. Just make sure nothing * that was requested is on the ignore list, and include commonly changing * information element IDs in the ignore list, for example 11 (BSS load) and * the various vendor-assigned IEs with unknown contents (128, 129, 133-136, @@ -3214,7 +3421,7 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb); * In addition to these capabilities, hardware should support notifying the * host of changes in the beacon RSSI. This is relevant to implement roaming * when no traffic is flowing (when traffic is flowing we see the RSSI of - * the received data packets). This can consist in notifying the host when + * the received data packets). This can consist of notifying the host when * the RSSI changes significantly or when it drops below or rises above * configurable thresholds. In the future these thresholds will also be * configured by mac80211 (which gets them from userspace) to implement @@ -3361,8 +3568,8 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb); * period starts for any reason, @release_buffered_frames is called * with the number of frames to be released and which TIDs they are * to come from. In this case, the driver is responsible for setting - * the EOSP (for uAPSD) and MORE_DATA bits in the released frames, - * to help the @more_data parameter is passed to tell the driver if + * the EOSP (for uAPSD) and MORE_DATA bits in the released frames. + * To help the @more_data parameter is passed to tell the driver if * there is more data on other TIDs -- the TIDs to release frames * from are ignored since mac80211 doesn't know how many frames the * buffers for those TIDs contain. @@ -3411,7 +3618,7 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb); * Additionally, the driver has to then use these HW queue IDs for the queue * management functions (ieee80211_stop_queue() et al.) * - * The driver is free to set up the queue mappings as needed, multiple virtual + * The driver is free to set up the queue mappings as needed; multiple virtual * interfaces may map to the same hardware queues if needed. The setup has to * happen during add_interface or change_interface callbacks. For example, a * driver supporting station+station and station+AP modes might decide to have @@ -3635,11 +3842,16 @@ enum ieee80211_reconfig_type { * @success: whether the frame exchange was successful, only * used with the mgd_complete_tx() method, and then only * valid for auth and (re)assoc. + * @was_assoc: set if this call is due to deauth/disassoc + * while just having been associated + * @link_id: the link id on which the frame will be TX'ed. + * 0 for a non-MLO connection. */ struct ieee80211_prep_tx_info { u16 duration; u16 subtype; - u8 success:1; + u8 success:1, was_assoc:1; + int link_id; }; /** @@ -3863,6 +4075,10 @@ struct ieee80211_prep_tx_info { * the station. See @sta_pre_rcu_remove if needed. * This callback can sleep. * + * @vif_add_debugfs: Drivers can use this callback to add a debugfs vif + * directory with its files. This callback should be within a + * CONFIG_MAC80211_DEBUGFS conditional. This callback can sleep. + * * @link_add_debugfs: Drivers can use this callback to add debugfs files * when a link is added to a mac80211 vif. This callback should be within * a CONFIG_MAC80211_DEBUGFS conditional. This callback can sleep. @@ -3907,8 +4123,8 @@ struct ieee80211_prep_tx_info { * in @sta_state. * The callback can sleep. * - * @sta_rc_update: Notifies the driver of changes to the bitrates that can be - * used to transmit to the station. The changes are advertised with bits + * @link_sta_rc_update: Notifies the driver of changes to the bitrates that can + * be used to transmit to the station. The changes are advertised with bits * from &enum ieee80211_rate_control_changed and the values are reflected * in the station data. This callback should only be used when the driver * uses hardware rate control (%IEEE80211_HW_HAS_RATE_CONTROL) since @@ -3926,6 +4142,15 @@ struct ieee80211_prep_tx_info { * Statistics that the driver doesn't fill will be filled by mac80211. * The callback can sleep. * + * @link_sta_statistics: Get link statistics for this station. For example with + * beacon filtering, the statistics kept by mac80211 might not be + * accurate, so let the driver pre-fill the statistics. The driver can + * fill most of the values (indicating which by setting the filled + * bitmap), but not all of them make sense - see the source for which + * ones are possible. + * Statistics that the driver doesn't fill will be filled by mac80211. + * The callback can sleep. + * * @conf_tx: Configure TX queue parameters (EDCF (aifs, cw_min, cw_max), * bursting) for a hardware TX queue. * Returns a negative error code on failure. @@ -4067,11 +4292,15 @@ struct ieee80211_prep_tx_info { * This callback must be atomic. * * @get_et_sset_count: Ethtool API to get string-set count. + * Note that the wiphy mutex is not held for this callback since it's + * expected to return a static value. * * @get_et_stats: Ethtool API to get a set of u64 stats. * * @get_et_strings: Ethtool API to get a set of strings to describe stats * and perhaps other supported types of ethtool data-sets. + * Note that the wiphy mutex is not held for this callback since it's + * expected to return a static value. * * @mgd_prepare_tx: Prepare for transmitting a management frame for association * before associated. In multi-channel scenarios, a virtual interface is @@ -4079,12 +4308,9 @@ struct ieee80211_prep_tx_info { * yet it need not necessarily be given airtime, in particular since any * transmission to a P2P GO needs to be synchronized against the GO's * powersave state. mac80211 will call this function before transmitting a - * management frame prior to having successfully associated to allow the - * driver to give it channel time for the transmission, to get a response - * and to be able to synchronize with the GO. - * For drivers that set %IEEE80211_HW_DEAUTH_NEED_MGD_TX_PREP, mac80211 - * would also call this function before transmitting a deauthentication - * frame in case that no beacon was heard from the AP/P2P GO. + * management frame prior to transmitting that frame to allow the driver + * to give it channel time for the transmission, to get a response and be + * able to synchronize with the GO. * The callback will be called before each transmission and upon return * mac80211 will transmit the frame right away. * Additional information is passed in the &struct ieee80211_prep_tx_info @@ -4094,6 +4320,8 @@ struct ieee80211_prep_tx_info { * @mgd_complete_tx: Notify the driver that the response frame for a previously * transmitted frame announced with @mgd_prepare_tx was received, the data * is filled similarly to @mgd_prepare_tx though the duration is not used. + * Note that this isn't always called for each mgd_prepare_tx() call, for + * example for SAE the 'confirm' messages can be on the air in any order. * * @mgd_protect_tdls_discover: Protect a TDLS discovery session. After sending * a TDLS discovery-request, we expect a reply to arrive on the AP's @@ -4160,7 +4388,7 @@ struct ieee80211_prep_tx_info { * after a channel switch procedure is completed, allowing the * driver to go back to a normal configuration. * @abort_channel_switch: This is an optional callback that is called - * when channel switch procedure was completed, allowing the + * when channel switch procedure was aborted, allowing the * driver to go back to a normal configuration. * @channel_switch_rx_beacon: This is an optional callback that is called * when channel switch procedure is in progress and additional beacon with @@ -4250,12 +4478,16 @@ struct ieee80211_prep_tx_info { * disable background CAC/radar detection. * @net_fill_forward_path: Called from .ndo_fill_forward_path in order to * resolve a path for hardware flow offloading + * @can_activate_links: Checks if a specific active_links bitmap is + * supported by the driver. * @change_vif_links: Change the valid links on an interface, note that while * removing the old link information is still valid (link_conf pointer), * but may immediately disappear after the function returns. The old or * new links bitmaps may be 0 if going from/to a non-MLO situation. * The @old array contains pointers to the old bss_conf structures * that were already removed, in case they're needed. + * Note that removal of link should always succeed, so the return value + * will be ignored in a removal only case. * This callback can sleep. * @change_sta_links: Change the valid links of a station, similar to * @change_vif_links. This callback can sleep. @@ -4269,13 +4501,23 @@ struct ieee80211_prep_tx_info { * flow offloading for flows originating from the vif. * Note that the driver must not assume that the vif driver_data is valid * at this point, since the callback can be called during netdev teardown. + * @can_neg_ttlm: for managed interface, requests the driver to determine + * if the requested TID-To-Link mapping can be accepted or not. + * If it's not accepted the driver may suggest a preferred mapping and + * modify @ttlm parameter with the suggested TID-to-Link mapping. + * @prep_add_interface: prepare for interface addition. This can be used by + * drivers to prepare for the addition of a new interface, e.g., allocate + * the needed resources etc. This callback doesn't guarantee that an + * interface with the specified type would be added, and thus drivers that + * implement this callback need to handle such cases. The type is the full + * &enum nl80211_iftype. */ struct ieee80211_ops { void (*tx)(struct ieee80211_hw *hw, struct ieee80211_tx_control *control, struct sk_buff *skb); int (*start)(struct ieee80211_hw *hw); - void (*stop)(struct ieee80211_hw *hw); + void (*stop)(struct ieee80211_hw *hw, bool suspend); #ifdef CONFIG_PM int (*suspend)(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan); int (*resume)(struct ieee80211_hw *hw); @@ -4288,7 +4530,7 @@ struct ieee80211_ops { enum nl80211_iftype new_type, bool p2p); void (*remove_interface)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); - int (*config)(struct ieee80211_hw *hw, u32 changed); + int (*config)(struct ieee80211_hw *hw, int radio_idx, u32 changed); void (*bss_info_changed)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *info, @@ -4351,13 +4593,17 @@ struct ieee80211_ops { void (*get_key_seq)(struct ieee80211_hw *hw, struct ieee80211_key_conf *key, struct ieee80211_key_seq *seq); - int (*set_frag_threshold)(struct ieee80211_hw *hw, u32 value); - int (*set_rts_threshold)(struct ieee80211_hw *hw, u32 value); + int (*set_frag_threshold)(struct ieee80211_hw *hw, int radio_idx, + u32 value); + int (*set_rts_threshold)(struct ieee80211_hw *hw, int radio_idx, + u32 value); int (*sta_add)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); int (*sta_remove)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); #ifdef CONFIG_MAC80211_DEBUGFS + void (*vif_add_debugfs)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif); void (*link_add_debugfs)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *link_conf, @@ -4383,10 +4629,10 @@ struct ieee80211_ops { void (*sta_pre_rcu_remove)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); - void (*sta_rc_update)(struct ieee80211_hw *hw, - struct ieee80211_vif *vif, - struct ieee80211_sta *sta, - u32 changed); + void (*link_sta_rc_update)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_link_sta *link_sta, + u32 changed); void (*sta_rate_tbl_update)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); @@ -4405,6 +4651,10 @@ struct ieee80211_ops { s64 offset); void (*reset_tsf)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); int (*tx_last_beacon)(struct ieee80211_hw *hw); + void (*link_sta_statistics)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_link_sta *link_sta, + struct link_station_info *link_sinfo); /** * @ampdu_action: @@ -4443,7 +4693,8 @@ struct ieee80211_ops { int (*get_survey)(struct ieee80211_hw *hw, int idx, struct survey_info *survey); void (*rfkill_poll)(struct ieee80211_hw *hw); - void (*set_coverage_class)(struct ieee80211_hw *hw, s16 coverage_class); + void (*set_coverage_class)(struct ieee80211_hw *hw, int radio_idx, + s16 coverage_class); #ifdef CONFIG_NL80211_TESTMODE int (*testmode_cmd)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, void *data, int len); @@ -4458,8 +4709,10 @@ struct ieee80211_ops { void (*channel_switch)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_channel_switch *ch_switch); - int (*set_antenna)(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant); - int (*get_antenna)(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant); + int (*set_antenna)(struct ieee80211_hw *hw, int radio_idx, + u32 tx_ant, u32 rx_ant); + int (*get_antenna)(struct ieee80211_hw *hw, int radio_idx, + u32 *tx_ant, u32 *rx_ant); int (*remain_on_channel)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, @@ -4506,7 +4759,8 @@ struct ieee80211_ops { struct ieee80211_prep_tx_info *info); void (*mgd_protect_tdls_discover)(struct ieee80211_hw *hw, - struct ieee80211_vif *vif); + struct ieee80211_vif *vif, + unsigned int link_id); int (*add_chanctx)(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *ctx); @@ -4544,9 +4798,11 @@ struct ieee80211_ops { struct ieee80211_channel_switch *ch_switch); int (*post_channel_switch)(struct ieee80211_hw *hw, - struct ieee80211_vif *vif); + struct ieee80211_vif *vif, + struct ieee80211_bss_conf *link_conf); void (*abort_channel_switch)(struct ieee80211_hw *hw, - struct ieee80211_vif *vif); + struct ieee80211_vif *vif, + struct ieee80211_bss_conf *link_conf); void (*channel_switch_rx_beacon)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_channel_switch *ch_switch); @@ -4556,7 +4812,7 @@ struct ieee80211_ops { u32 (*get_expected_throughput)(struct ieee80211_hw *hw, struct ieee80211_sta *sta); int (*get_txpower)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - int *dbm); + unsigned int link_id, int *dbm); int (*tdls_channel_switch)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, @@ -4626,6 +4882,9 @@ struct ieee80211_ops { struct ieee80211_sta *sta, struct net_device_path_ctx *ctx, struct net_device_path *path); + bool (*can_activate_links)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + u16 active_links); int (*change_vif_links)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u16 old_links, u16 new_links, @@ -4642,6 +4901,11 @@ struct ieee80211_ops { struct net_device *dev, enum tc_setup_type type, void *type_data); + enum ieee80211_neg_ttlm_res + (*can_neg_ttlm)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, + struct ieee80211_neg_ttlm *ttlm); + void (*prep_add_interface)(struct ieee80211_hw *hw, + enum nl80211_iftype type); }; /** @@ -4890,7 +5154,7 @@ void ieee80211_restart_hw(struct ieee80211_hw *hw); * for a single hardware must be synchronized against each other. Calls to * this function, ieee80211_rx_ni() and ieee80211_rx_irqsafe() may not be * mixed for a single hardware. Must not run concurrently with - * ieee80211_tx_status() or ieee80211_tx_status_ni(). + * ieee80211_tx_status_skb() or ieee80211_tx_status_ni(). * * This function must be called with BHs disabled and RCU read lock * @@ -4915,7 +5179,7 @@ void ieee80211_rx_list(struct ieee80211_hw *hw, struct ieee80211_sta *sta, * for a single hardware must be synchronized against each other. Calls to * this function, ieee80211_rx_ni() and ieee80211_rx_irqsafe() may not be * mixed for a single hardware. Must not run concurrently with - * ieee80211_tx_status() or ieee80211_tx_status_ni(). + * ieee80211_tx_status_skb() or ieee80211_tx_status_ni(). * * This function must be called with BHs disabled. * @@ -4940,7 +5204,7 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *sta, * for a single hardware must be synchronized against each other. Calls to * this function, ieee80211_rx_ni() and ieee80211_rx_irqsafe() may not be * mixed for a single hardware. Must not run concurrently with - * ieee80211_tx_status() or ieee80211_tx_status_ni(). + * ieee80211_tx_status_skb() or ieee80211_tx_status_ni(). * * In process context use instead ieee80211_rx_ni(). * @@ -4960,7 +5224,7 @@ static inline void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb) * * Calls to this function, ieee80211_rx() or ieee80211_rx_ni() may not * be mixed for a single hardware.Must not run concurrently with - * ieee80211_tx_status() or ieee80211_tx_status_ni(). + * ieee80211_tx_status_skb() or ieee80211_tx_status_ni(). * * @hw: the hardware this frame came in on * @skb: the buffer to receive, owned by mac80211 after this call @@ -4975,7 +5239,7 @@ void ieee80211_rx_irqsafe(struct ieee80211_hw *hw, struct sk_buff *skb); * * Calls to this function, ieee80211_rx() and ieee80211_rx_irqsafe() may * not be mixed for a single hardware. Must not run concurrently with - * ieee80211_tx_status() or ieee80211_tx_status_ni(). + * ieee80211_tx_status_skb() or ieee80211_tx_status_ni(). * * @hw: the hardware this frame came in on * @skb: the buffer to receive, owned by mac80211 after this call @@ -5120,22 +5384,6 @@ void ieee80211_get_tx_rates(struct ieee80211_vif *vif, int max_rates); /** - * ieee80211_sta_set_expected_throughput - set the expected tpt for a station - * - * Call this function to notify mac80211 about a change in expected throughput - * to a station. A driver for a device that does rate control in firmware can - * call this function when the expected throughput estimate towards a station - * changes. The information is used to tune the CoDel AQM applied to traffic - * going towards that station (which can otherwise be too aggressive and cause - * slow stations to starve). - * - * @pubsta: the station to set throughput for. - * @thr: the current expected throughput in kbps. - */ -void ieee80211_sta_set_expected_throughput(struct ieee80211_sta *pubsta, - u32 thr); - -/** * ieee80211_tx_rate_update - transmit rate update callback * * Drivers should call this functions with a non-NULL pub sta @@ -5151,7 +5399,7 @@ void ieee80211_tx_rate_update(struct ieee80211_hw *hw, struct ieee80211_tx_info *info); /** - * ieee80211_tx_status - transmit status callback + * ieee80211_tx_status_skb - transmit status callback * * Call this function for all transmitted frames after they have been * transmitted. It is permissible to not call this function for @@ -5166,13 +5414,13 @@ void ieee80211_tx_rate_update(struct ieee80211_hw *hw, * @hw: the hardware the frame was transmitted by * @skb: the frame that was transmitted, owned by mac80211 after this call */ -void ieee80211_tx_status(struct ieee80211_hw *hw, - struct sk_buff *skb); +void ieee80211_tx_status_skb(struct ieee80211_hw *hw, + struct sk_buff *skb); /** * ieee80211_tx_status_ext - extended transmit status callback * - * This function can be used as a replacement for ieee80211_tx_status + * This function can be used as a replacement for ieee80211_tx_status_skb() * in drivers that may want to provide extra information that does not * fit into &struct ieee80211_tx_info. * @@ -5189,7 +5437,7 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw, /** * ieee80211_tx_status_noskb - transmit status callback without skb * - * This function can be used as a replacement for ieee80211_tx_status + * This function can be used as a replacement for ieee80211_tx_status_skb() * in drivers that cannot reliably map tx status information back to * specific skbs. * @@ -5217,9 +5465,9 @@ static inline void ieee80211_tx_status_noskb(struct ieee80211_hw *hw, /** * ieee80211_tx_status_ni - transmit status callback (in process context) * - * Like ieee80211_tx_status() but can be called in process context. + * Like ieee80211_tx_status_skb() but can be called in process context. * - * Calls to this function, ieee80211_tx_status() and + * Calls to this function, ieee80211_tx_status_skb() and * ieee80211_tx_status_irqsafe() may not be mixed * for a single hardware. * @@ -5230,17 +5478,17 @@ static inline void ieee80211_tx_status_ni(struct ieee80211_hw *hw, struct sk_buff *skb) { local_bh_disable(); - ieee80211_tx_status(hw, skb); + ieee80211_tx_status_skb(hw, skb); local_bh_enable(); } /** * ieee80211_tx_status_irqsafe - IRQ-safe transmit status callback * - * Like ieee80211_tx_status() but can be called in IRQ context + * Like ieee80211_tx_status_skb() but can be called in IRQ context * (internally defers to a tasklet.) * - * Calls to this function, ieee80211_tx_status() and + * Calls to this function, ieee80211_tx_status_skb() and * ieee80211_tx_status_ni() may not be mixed for a single hardware. * * @hw: the hardware the frame was transmitted by @@ -5424,6 +5672,7 @@ static inline struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw, /** * ieee80211_beacon_update_cntdwn - request mac80211 to decrement the beacon countdown * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * @link_id: valid link_id during MLO or 0 for non-MLO * * The beacon counter should be updated after each beacon transmission. * This function is called implicitly when @@ -5433,7 +5682,8 @@ static inline struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw, * * Return: new countdown value */ -u8 ieee80211_beacon_update_cntdwn(struct ieee80211_vif *vif); +u8 ieee80211_beacon_update_cntdwn(struct ieee80211_vif *vif, + unsigned int link_id); /** * ieee80211_beacon_set_cntdwn - request mac80211 to set beacon countdown @@ -5451,30 +5701,34 @@ void ieee80211_beacon_set_cntdwn(struct ieee80211_vif *vif, u8 counter); /** * ieee80211_csa_finish - notify mac80211 about channel switch * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * @link_id: valid link_id during MLO or 0 for non-MLO * * After a channel switch announcement was scheduled and the counter in this * announcement hits 1, this function must be called by the driver to * notify mac80211 that the channel can be changed. */ -void ieee80211_csa_finish(struct ieee80211_vif *vif); +void ieee80211_csa_finish(struct ieee80211_vif *vif, unsigned int link_id); /** * ieee80211_beacon_cntdwn_is_complete - find out if countdown reached 1 * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * @link_id: valid link_id during MLO or 0 for non-MLO * - * This function returns whether the countdown reached zero. + * Return: %true if the countdown reached 1, %false otherwise */ -bool ieee80211_beacon_cntdwn_is_complete(struct ieee80211_vif *vif); +bool ieee80211_beacon_cntdwn_is_complete(struct ieee80211_vif *vif, + unsigned int link_id); /** * ieee80211_color_change_finish - notify mac80211 about color change * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * @link_id: valid link_id during MLO or 0 for non-MLO * * After a color change announcement was scheduled and the counter in this * announcement hits 1, this function must be called by the driver to * notify mac80211 that the color can be changed */ -void ieee80211_color_change_finish(struct ieee80211_vif *vif); +void ieee80211_color_change_finish(struct ieee80211_vif *vif, u8 link_id); /** * ieee80211_proberesp_get - retrieve a Probe Response template @@ -5785,29 +6039,20 @@ void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf, int tid, struct ieee80211_key_seq *seq); /** - * ieee80211_remove_key - remove the given key - * @keyconf: the parameter passed with the set key - * - * Remove the given key. If the key was uploaded to the hardware at the - * time this function is called, it is not deleted in the hardware but - * instead assumed to have been removed already. - * - * Note that due to locking considerations this function can (currently) - * only be called during key iteration (ieee80211_iter_keys().) - */ -void ieee80211_remove_key(struct ieee80211_key_conf *keyconf); - -/** * ieee80211_gtk_rekey_add - add a GTK key from rekeying during WoWLAN * @vif: the virtual interface to add the key on - * @keyconf: new key data + * @idx: the keyidx of the key + * @key_data: the key data + * @key_len: the key data. Might be bigger than the actual key length, + * but not smaller (for the driver convinence) + * @link_id: the link id of the key or -1 for non-MLO * * When GTK rekeying was done while the system was suspended, (a) new * key(s) will be available. These will be needed by mac80211 for proper * RX processing, so this function allows setting them. * - * The function returns the newly allocated key structure, which will - * have similar contents to the passed key configuration but point to + * Return: the newly allocated key structure, which will have + * similar contents to the passed key configuration but point to * mac80211-owned memory. In case of errors, the function returns an * ERR_PTR(), use IS_ERR() etc. * @@ -5822,13 +6067,12 @@ void ieee80211_remove_key(struct ieee80211_key_conf *keyconf); * for the new key for each TID to set up sequence counters properly. * * IMPORTANT: If this replaces a key that is present in the hardware, - * then it will attempt to remove it during this call. In many cases - * this isn't what you want, so call ieee80211_remove_key() first for - * the key that's being replaced. + * then it will attempt to remove it during this call. */ struct ieee80211_key_conf * ieee80211_gtk_rekey_add(struct ieee80211_vif *vif, - struct ieee80211_key_conf *keyconf); + u8 idx, u8 *key_data, u8 key_len, + int link_id); /** * ieee80211_gtk_rekey_notify - notify userspace supplicant of rekeying @@ -6063,6 +6307,24 @@ void ieee80211_iterate_stations_atomic(struct ieee80211_hw *hw, void (*iterator)(void *data, struct ieee80211_sta *sta), void *data); + +/** + * ieee80211_iterate_stations_mtx - iterate stations + * + * This function iterates over all stations associated with a given + * hardware that are currently uploaded to the driver and calls the callback + * function for them. This version can only be used while holding the wiphy + * mutex. + * + * @hw: the hardware struct of which the interfaces should be iterated over + * @iterator: the iterator function to call + * @data: first argument of the iterator function + */ +void ieee80211_iterate_stations_mtx(struct ieee80211_hw *hw, + void (*iterator)(void *data, + struct ieee80211_sta *sta), + void *data); + /** * ieee80211_queue_work - add work onto the mac80211 workqueue * @@ -6205,6 +6467,8 @@ struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw, * may be %NULL if the link ID is not needed * * Obtain the STA by link address, must use RCU protection. + * + * Return: pointer to STA if found, otherwise %NULL. */ struct ieee80211_sta * ieee80211_find_sta_by_link_addrs(struct ieee80211_hw *hw, @@ -6334,8 +6598,8 @@ void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid, * @hw: pointer obtained from ieee80211_alloc_hw() * @txq: pointer obtained from station or virtual interface * - * Return true if the AQL's airtime limit has not been reached and the txq can - * continue to send more packets to the device. Otherwise return false. + * Return: %true if the AQL's airtime limit has not been reached and the txq can + * continue to send more packets to the device. Otherwise return %false. */ bool ieee80211_txq_airtime_check(struct ieee80211_hw *hw, struct ieee80211_txq *txq); @@ -6347,12 +6611,12 @@ ieee80211_txq_airtime_check(struct ieee80211_hw *hw, struct ieee80211_txq *txq); * @iter: iterator function that will be called for each key * @iter_data: custom data to pass to the iterator function * + * Context: Must be called with wiphy mutex held; can sleep. + * * This function can be used to iterate all the keys known to * mac80211, even those that weren't previously programmed into * the device. This is intended for use in WoWLAN if the device - * needs reprogramming of the keys during suspend. Note that due - * to locking reasons, it is also only safe to call this at few - * spots since it must hold the RTNL and be able to sleep. + * needs reprogramming of the keys during suspend. * * The order in which the keys are iterated matches the order * in which they were originally installed and handed to the @@ -6418,6 +6682,31 @@ void ieee80211_iter_chan_contexts_atomic( void *iter_data); /** + * ieee80211_iter_chan_contexts_mtx - iterate channel contexts + * @hw: pointer obtained from ieee80211_alloc_hw(). + * @iter: iterator function + * @iter_data: data passed to iterator function + * + * Iterate all active channel contexts. This function can only be used while + * holding the wiphy mutex. + * + * The iterator will not find a context that's being added (during + * the driver callback to add it) but will find it while it's being + * removed. + * + * Note that during hardware restart, all contexts that existed + * before the restart are considered already present so will be + * found while iterating, whether they've been re-added already + * or not. + */ +void ieee80211_iter_chan_contexts_mtx( + struct ieee80211_hw *hw, + void (*iter)(struct ieee80211_hw *hw, + struct ieee80211_chanctx_conf *chanctx_conf, + void *data), + void *iter_data); + +/** * ieee80211_ap_probereq_get - retrieve a Probe Request template * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. @@ -6535,30 +6824,34 @@ void ieee80211_cqm_beacon_loss_notify(struct ieee80211_vif *vif, gfp_t gfp); * ieee80211_radar_detected - inform that a radar was detected * * @hw: pointer as obtained from ieee80211_alloc_hw() + * @chanctx_conf: Channel context on which radar is detected. Mandatory to + * pass a valid pointer during MLO. For non-MLO %NULL can be passed */ -void ieee80211_radar_detected(struct ieee80211_hw *hw); +void ieee80211_radar_detected(struct ieee80211_hw *hw, + struct ieee80211_chanctx_conf *chanctx_conf); /** * ieee80211_chswitch_done - Complete channel switch process * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @success: make the channel switch successful or not + * @link_id: the link_id on which the switch was done. Ignored if success is + * false. * * Complete the channel switch post-process: set the new operational channel * and wake up the suspended queues. */ -void ieee80211_chswitch_done(struct ieee80211_vif *vif, bool success); +void ieee80211_chswitch_done(struct ieee80211_vif *vif, bool success, + unsigned int link_id); /** * ieee80211_channel_switch_disconnect - disconnect due to channel switch error * @vif: &struct ieee80211_vif pointer from the add_interface callback. - * @block_tx: if %true, do not send deauth frame. * * Instruct mac80211 to disconnect due to a channel switch error. The channel * switch can request to block the tx and so, we need to make sure we do not send * a deauth frame in this case. */ -void ieee80211_channel_switch_disconnect(struct ieee80211_vif *vif, - bool block_tx); +void ieee80211_channel_switch_disconnect(struct ieee80211_vif *vif); /** * ieee80211_request_smps - request SM PS transition @@ -6835,6 +7128,8 @@ bool rate_usable_index_exists(struct ieee80211_supported_band *sband, * @hw: pointer as obtained from ieee80211_alloc_hw() * @pubsta: &struct ieee80211_sta pointer to the target destination. * @rates: new tx rate set to be used for this station. + * + * Return: 0 on success. An error code otherwise. */ int rate_control_set_rates(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, @@ -6928,7 +7223,7 @@ ieee80211_get_he_6ghz_capa_vif(const struct ieee80211_supported_band *sband, } /** - * ieee80211_get_eht_iftype_cap_vif - return ETH capabilities for sband/vif + * ieee80211_get_eht_iftype_cap_vif - return EHT capabilities for sband/vif * @sband: the sband to search for the iftype on * @vif: the vif to get the iftype from * @@ -6967,13 +7262,14 @@ void ieee80211_disable_rssi_reports(struct ieee80211_vif *vif); * ieee80211_ave_rssi - report the average RSSI for the specified interface * * @vif: the specified virtual interface + * @link_id: the link ID for MLO, or -1 for non-MLO * * Note: This function assumes that the given vif is valid. * * Return: The average RSSI value for the requested interface, or 0 if not * applicable. */ -int ieee80211_ave_rssi(struct ieee80211_vif *vif); +int ieee80211_ave_rssi(struct ieee80211_vif *vif, int link_id); /** * ieee80211_report_wowlan_wakeup - report WoWLAN wakeup @@ -6995,6 +7291,8 @@ void ieee80211_report_wowlan_wakeup(struct ieee80211_vif *vif, * @band: the band to transmit on * @sta: optional pointer to get the station to send the frame to * + * Return: %true if the skb was prepared, %false otherwise + * * Note: must be called under RCU lock */ bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw, @@ -7011,6 +7309,8 @@ bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw, * * @skb: packet injected by userspace * @dev: the &struct device of this 802.11 device + * + * Return: %true if the radiotap header was parsed, %false otherwise */ bool ieee80211_parse_tx_radiotap(struct sk_buff *skb, struct net_device *dev); @@ -7120,7 +7420,7 @@ void ieee80211_unreserve_tid(struct ieee80211_sta *sta, u8 tid); * @txq: pointer obtained from station or virtual interface, or from * ieee80211_next_txq() * - * Returns the skb if successful, %NULL if no frame was available. + * Return: the skb if successful, %NULL if no frame was available. * * Note that this must be called in an rcu_read_lock() critical section, * which can only be released after the SKB was handled. Some pointers in @@ -7146,6 +7446,8 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, * @hw: pointer as obtained from ieee80211_alloc_hw() * @txq: pointer obtained from station or virtual interface, or from * ieee80211_next_txq() + * + * Return: the skb if successful, %NULL if no frame was available. */ static inline struct sk_buff *ieee80211_tx_dequeue_ni(struct ieee80211_hw *hw, struct ieee80211_txq *txq) @@ -7177,7 +7479,7 @@ void ieee80211_handle_wake_tx_queue(struct ieee80211_hw *hw, * @hw: pointer as obtained from ieee80211_alloc_hw() * @ac: AC number to return packets from. * - * Returns the next txq if successful, %NULL if no queue is eligible. If a txq + * Return: the next txq if successful, %NULL if no queue is eligible. If a txq * is returned, it should be returned with ieee80211_return_txq() after the * driver has finished scheduling it. */ @@ -7242,7 +7544,7 @@ ieee80211_return_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq, * * This function is used to check whether given txq is allowed to transmit by * the airtime scheduler, and can be used by drivers to access the airtime - * fairness accounting without going using the scheduling order enfored by + * fairness accounting without using the scheduling order enforced by * next_txq(). * * Returns %true if the airtime scheduler thinks the TXQ should be allowed to @@ -7260,6 +7562,8 @@ ieee80211_return_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq, * * @hw: pointer as obtained from ieee80211_alloc_hw() * @txq: pointer obtained from station or virtual interface + * + * Return: %true if transmission is allowed, %false otherwise */ bool ieee80211_txq_may_transmit(struct ieee80211_hw *hw, struct ieee80211_txq *txq); @@ -7320,6 +7624,8 @@ void ieee80211_nan_func_match(struct ieee80211_vif *vif, * @status: &struct ieee80211_rx_status containing the transmission rate * information. * @len: frame length in bytes + * + * Return: the airtime estimate */ u32 ieee80211_calc_rx_airtime(struct ieee80211_hw *hw, struct ieee80211_rx_status *status, @@ -7334,23 +7640,13 @@ u32 ieee80211_calc_rx_airtime(struct ieee80211_hw *hw, * @hw: pointer as obtained from ieee80211_alloc_hw() * @info: &struct ieee80211_tx_info of the frame. * @len: frame length in bytes + * + * Return: the airtime estimate */ u32 ieee80211_calc_tx_airtime(struct ieee80211_hw *hw, struct ieee80211_tx_info *info, int len); /** - * ieee80211_set_hw_80211_encap - enable hardware encapsulation offloading. - * - * This function is used to notify mac80211 that a vif can be passed raw 802.3 - * frames. The driver needs to then handle the 802.11 encapsulation inside the - * hardware or firmware. - * - * @vif: &struct ieee80211_vif pointer from the add_interface callback. - * @enable: indicate if the feature should be turned on or off - */ -bool ieee80211_set_hw_80211_encap(struct ieee80211_vif *vif, bool enable); - -/** * ieee80211_get_fils_discovery_tmpl - Get FILS discovery template. * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. @@ -7379,15 +7675,15 @@ ieee80211_get_unsol_bcast_probe_resp_tmpl(struct ieee80211_hw *hw, /** * ieee80211_obss_color_collision_notify - notify userland about a BSS color * collision. + * @link_id: valid link_id during MLO or 0 for non-MLO * * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @color_bitmap: a 64 bit bitmap representing the colors that the local BSS is * aware of. - * @gfp: allocation flags */ void ieee80211_obss_color_collision_notify(struct ieee80211_vif *vif, - u64 color_bitmap, gfp_t gfp); + u64 color_bitmap, u8 link_id); /** * ieee80211_is_tx_data - check if frame is a data frame @@ -7396,6 +7692,8 @@ ieee80211_obss_color_collision_notify(struct ieee80211_vif *vif, * hardware encapsulation enabled are data frames. * * @skb: the frame to be transmitted. + * + * Return: %true if @skb is a data frame, %false otherwise */ static inline bool ieee80211_is_tx_data(struct sk_buff *skb) { @@ -7411,6 +7709,9 @@ static inline bool ieee80211_is_tx_data(struct sk_buff *skb) * @vif: interface to set active links on * @active_links: the new active links bitmap * + * Context: Must be called with wiphy mutex held; may sleep; calls + * back into the driver. + * * This changes the active links on an interface. The interface * must be in client mode (in AP mode, all links are always active), * and @active_links must be a subset of the vif's valid_links. @@ -7418,19 +7719,18 @@ static inline bool ieee80211_is_tx_data(struct sk_buff *skb) * If a link is switched off and another is switched on at the same * time (e.g. active_links going from 0x1 to 0x10) then you will get * a sequence of calls like + * * - change_vif_links(0x11) * - unassign_vif_chanctx(link_id=0) + * - assign_vif_chanctx(link_id=4) * - change_sta_links(0x11) for each affected STA (the AP) * (TDLS connections on now inactive links should be torn down) * - remove group keys on the old link (link_id 0) * - add new group keys (GTK/IGTK/BIGTK) on the new link (link_id 4) * - change_sta_links(0x10) for each affected STA (the AP) - * - assign_vif_chanctx(link_id=4) * - change_vif_links(0x10) * - * Note: This function acquires some mac80211 locks and must not - * be called with any driver locks held that could cause a - * lock dependency inversion. Best call it without locks. + * Return: 0 on success. An error code otherwise. */ int ieee80211_set_active_links(struct ieee80211_vif *vif, u16 active_links); @@ -7447,4 +7747,103 @@ int ieee80211_set_active_links(struct ieee80211_vif *vif, u16 active_links); void ieee80211_set_active_links_async(struct ieee80211_vif *vif, u16 active_links); +/** + * ieee80211_send_teardown_neg_ttlm - tear down a negotiated TTLM request + * @vif: the interface on which the tear down request should be sent. + * + * This function can be used to tear down a previously accepted negotiated + * TTLM request. + */ +void ieee80211_send_teardown_neg_ttlm(struct ieee80211_vif *vif); + +/** + * ieee80211_chan_width_to_rx_bw - convert channel width to STA RX bandwidth + * @width: the channel width value to convert + * Return: the STA RX bandwidth value for the channel width + */ +static inline enum ieee80211_sta_rx_bandwidth +ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width) +{ + switch (width) { + default: + WARN_ON_ONCE(1); + fallthrough; + case NL80211_CHAN_WIDTH_20_NOHT: + case NL80211_CHAN_WIDTH_20: + return IEEE80211_STA_RX_BW_20; + case NL80211_CHAN_WIDTH_40: + return IEEE80211_STA_RX_BW_40; + case NL80211_CHAN_WIDTH_80: + return IEEE80211_STA_RX_BW_80; + case NL80211_CHAN_WIDTH_160: + case NL80211_CHAN_WIDTH_80P80: + return IEEE80211_STA_RX_BW_160; + case NL80211_CHAN_WIDTH_320: + return IEEE80211_STA_RX_BW_320; + } +} + +/** + * ieee80211_prepare_rx_omi_bw - prepare for sending BW RX OMI + * @link_sta: the link STA the OMI is going to be sent to + * @bw: the bandwidth requested + * + * When the driver decides to do RX OMI to change bandwidth with a STA + * it calls this function to prepare, then sends the OMI, and finally + * calls ieee80211_finalize_rx_omi_bw(). + * + * Note that the (link) STA rate control is updated accordingly as well, + * but the chanctx might not be updated if there are other users. + * If the intention is to reduce the listen bandwidth, the driver must + * ensure there are no TDLS stations nor other uses of the chanctx. + * + * Also note that in order to sequence correctly, narrowing bandwidth + * will only happen in ieee80211_finalize_rx_omi_bw(), whereas widening + * again (e.g. going back to normal) will happen here. + * + * Note that we treat this symmetrically, so if the driver calls this + * and tells the peer to only send with a lower bandwidth, we assume + * that the driver also wants to only send at that lower bandwidth, to + * allow narrowing of the chanctx request for this station/interface. + * + * Finally, the driver must ensure that if the function returned %true, + * ieee80211_finalize_rx_omi_bw() is also called, even for example in + * case of HW restart. + * + * Context: Must be called with wiphy mutex held, and will call back + * into the driver, so ensure no driver locks are held. + * + * Return: %true if changes are going to be made, %false otherwise + */ +bool ieee80211_prepare_rx_omi_bw(struct ieee80211_link_sta *link_sta, + enum ieee80211_sta_rx_bandwidth bw); + +/** + * ieee80211_finalize_rx_omi_bw - finalize BW RX OMI update + * @link_sta: the link STA the OMI was sent to + * + * See ieee80211_client_prepare_rx_omi_bw(). Context is the same here + * as well. + */ +void ieee80211_finalize_rx_omi_bw(struct ieee80211_link_sta *link_sta); + +/* for older drivers - let's not document these ... */ +int ieee80211_emulate_add_chanctx(struct ieee80211_hw *hw, + struct ieee80211_chanctx_conf *ctx); +void ieee80211_emulate_remove_chanctx(struct ieee80211_hw *hw, + struct ieee80211_chanctx_conf *ctx); +void ieee80211_emulate_change_chanctx(struct ieee80211_hw *hw, + struct ieee80211_chanctx_conf *ctx, + u32 changed); +int ieee80211_emulate_switch_vif_chanctx(struct ieee80211_hw *hw, + struct ieee80211_vif_chanctx_switch *vifs, + int n_vifs, + enum ieee80211_chanctx_switch_mode mode); + +/** + * ieee80211_vif_nan_started - Return whether a NAN vif is started + * @vif: the vif + * Return: %true iff the vif is a NAN interface and NAN is started + */ +bool ieee80211_vif_nan_started(struct ieee80211_vif *vif); #endif /* MAC80211_H */ diff --git a/include/net/mac802154.h b/include/net/mac802154.h index 4a3a9de9da73..d72006a85f02 100644 --- a/include/net/mac802154.h +++ b/include/net/mac802154.h @@ -7,7 +7,7 @@ #ifndef NET_MAC802154_H #define NET_MAC802154_H -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <net/af_ieee802154.h> #include <linux/ieee802154.h> #include <linux/skbuff.h> @@ -140,7 +140,7 @@ enum ieee802154_hw_flags { * * xmit_sync: * Handler that 802.15.4 module calls for each transmitted frame. - * skb cntains the buffer starting from the IEEE 802.15.4 header. + * skb contains the buffer starting from the IEEE 802.15.4 header. * The low-level driver should send the frame based on available * configuration. This is called by a workqueue and useful for * synchronous 802.15.4 drivers. @@ -152,7 +152,7 @@ enum ieee802154_hw_flags { * * xmit_async: * Handler that 802.15.4 module calls for each transmitted frame. - * skb cntains the buffer starting from the IEEE 802.15.4 header. + * skb contains the buffer starting from the IEEE 802.15.4 header. * The low-level driver should send the frame based on available * configuration. * This function should return zero or negative errno. diff --git a/include/net/macsec.h b/include/net/macsec.h index 75a6f4863c83..bc7de5b53e54 100644 --- a/include/net/macsec.h +++ b/include/net/macsec.h @@ -38,8 +38,8 @@ struct metadata_dst; typedef union salt { struct { - u32 ssci; - u64 pn; + ssci_t ssci; + __be64 pn; } __packed; u8 bytes[MACSEC_SALT_LEN]; } __packed salt_t; @@ -247,6 +247,23 @@ struct macsec_secy { /** * struct macsec_context - MACsec context for hardware offloading + * @netdev: a valid pointer to a struct net_device if @offload == + * MACSEC_OFFLOAD_MAC + * @phydev: a valid pointer to a struct phy_device if @offload == + * MACSEC_OFFLOAD_PHY + * @offload: MACsec offload status + * @secy: pointer to a MACsec SecY + * @rx_sc: pointer to a RX SC + * @update_pn: when updating the SA, update the next PN + * @assoc_num: association number of the target SA + * @key: key of the target SA + * @rx_sa: pointer to an RX SA if a RX SA is added/updated/removed + * @tx_sa: pointer to an TX SA if a TX SA is added/updated/removed + * @tx_sc_stats: pointer to TX SC stats structure + * @tx_sa_stats: pointer to TX SA stats structure + * @rx_sc_stats: pointer to RX SC stats structure + * @rx_sa_stats: pointer to RX SA stats structure + * @dev_stats: pointer to dev stats structure */ struct macsec_context { union { @@ -258,6 +275,7 @@ struct macsec_context { struct macsec_secy *secy; struct macsec_rx_sc *rx_sc; struct { + bool update_pn; unsigned char assoc_num; u8 key[MACSEC_MAX_KEY_LEN]; union { @@ -276,6 +294,34 @@ struct macsec_context { /** * struct macsec_ops - MACsec offloading operations + * @mdo_dev_open: called when the MACsec interface transitions to the up state + * @mdo_dev_stop: called when the MACsec interface transitions to the down + * state + * @mdo_add_secy: called when a new SecY is added + * @mdo_upd_secy: called when the SecY flags are changed or the MAC address of + * the MACsec interface is changed + * @mdo_del_secy: called when the hw offload is disabled or the MACsec + * interface is removed + * @mdo_add_rxsc: called when a new RX SC is added + * @mdo_upd_rxsc: called when a certain RX SC is updated + * @mdo_del_rxsc: called when a certain RX SC is removed + * @mdo_add_rxsa: called when a new RX SA is added + * @mdo_upd_rxsa: called when a certain RX SA is updated + * @mdo_del_rxsa: called when a certain RX SA is removed + * @mdo_add_txsa: called when a new TX SA is added + * @mdo_upd_txsa: called when a certain TX SA is updated + * @mdo_del_txsa: called when a certain TX SA is removed + * @mdo_get_dev_stats: called when dev stats are read + * @mdo_get_tx_sc_stats: called when TX SC stats are read + * @mdo_get_tx_sa_stats: called when TX SA stats are read + * @mdo_get_rx_sc_stats: called when RX SC stats are read + * @mdo_get_rx_sa_stats: called when RX SA stats are read + * @mdo_insert_tx_tag: called to insert the TX tag + * @needed_headroom: number of bytes reserved at the beginning of the sk_buff + * for the TX tag + * @needed_tailroom: number of bytes reserved at the end of the sk_buff for the + * TX tag + * @rx_uses_md_dst: whether MACsec device offload supports sk_buff md_dst */ struct macsec_ops { /* Device wide */ @@ -302,6 +348,12 @@ struct macsec_ops { int (*mdo_get_tx_sa_stats)(struct macsec_context *ctx); int (*mdo_get_rx_sc_stats)(struct macsec_context *ctx); int (*mdo_get_rx_sa_stats)(struct macsec_context *ctx); + /* Offload tag */ + int (*mdo_insert_tx_tag)(struct phy_device *phydev, + struct sk_buff *skb); + unsigned int needed_headroom; + unsigned int needed_tailroom; + bool rx_uses_md_dst; }; void macsec_pn_wrapped(struct macsec_secy *secy, struct macsec_tx_sa *tx_sa); @@ -324,4 +376,9 @@ static inline void *macsec_netdev_priv(const struct net_device *dev) return netdev_priv(dev); } +static inline u64 sci_to_cpu(sci_t sci) +{ + return be64_to_cpu((__force __be64)sci); +} + #endif /* _NET_MACSEC_H_ */ diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 88b6ef7ce1a6..eaa27483f99b 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -10,6 +10,7 @@ #include "shm_channel.h" #define GDMA_STATUS_MORE_ENTRIES 0x00000105 +#define GDMA_STATUS_CMD_UNSUPPORTED 0xffffffff /* Structures labeled with "HW DATA" are exchanged with the hardware. All of * them are naturally aligned and hence don't need __packed. @@ -58,14 +59,30 @@ enum gdma_eqe_type { GDMA_EQE_HWC_INIT_EQ_ID_DB = 129, GDMA_EQE_HWC_INIT_DATA = 130, GDMA_EQE_HWC_INIT_DONE = 131, - GDMA_EQE_HWC_SOC_RECONFIG = 132, + GDMA_EQE_HWC_FPGA_RECONFIG = 132, GDMA_EQE_HWC_SOC_RECONFIG_DATA = 133, + GDMA_EQE_HWC_SOC_SERVICE = 134, + GDMA_EQE_HWC_RESET_REQUEST = 135, + GDMA_EQE_RNIC_QP_FATAL = 176, }; enum { GDMA_DEVICE_NONE = 0, GDMA_DEVICE_HWC = 1, GDMA_DEVICE_MANA = 2, + GDMA_DEVICE_MANA_IB = 3, +}; + +enum gdma_service_type { + GDMA_SERVICE_TYPE_NONE = 0, + GDMA_SERVICE_TYPE_RDMA_SUSPEND = 1, + GDMA_SERVICE_TYPE_RDMA_RESUME = 2, +}; + +struct mana_service_work { + struct work_struct work; + struct gdma_dev *gdma_dev; + enum gdma_service_type event; }; struct gdma_resource { @@ -149,6 +166,8 @@ struct gdma_general_req { #define GDMA_MESSAGE_V1 1 #define GDMA_MESSAGE_V2 2 +#define GDMA_MESSAGE_V3 3 +#define GDMA_MESSAGE_V4 4 struct gdma_general_resp { struct gdma_resp_hdr hdr; @@ -220,9 +239,19 @@ struct gdma_dev { void *driver_data; struct auxiliary_device *adev; + bool is_suspended; + bool rdma_teardown; }; -#define MINIMUM_SUPPORTED_PAGE_SIZE PAGE_SIZE +/* MANA_PAGE_SIZE is the DMA unit */ +#define MANA_PAGE_SHIFT 12 +#define MANA_PAGE_SIZE BIT(MANA_PAGE_SHIFT) +#define MANA_PAGE_ALIGN(x) ALIGN((x), MANA_PAGE_SIZE) +#define MANA_PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), MANA_PAGE_SIZE) +#define MANA_PFN(a) ((a) >> MANA_PAGE_SHIFT) + +/* Required by HW */ +#define MANA_MIN_QSIZE MANA_PAGE_SIZE #define GDMA_CQE_SIZE 64 #define GDMA_EQE_SIZE 16 @@ -256,7 +285,8 @@ struct gdma_event { struct gdma_queue; struct mana_eq { - struct gdma_queue *eq; + struct gdma_queue *eq; + struct dentry *mana_eq_debugfs; }; typedef void gdma_eq_callback(void *context, struct gdma_queue *q, @@ -293,6 +323,7 @@ struct gdma_queue { u32 head; u32 tail; + struct list_head entry; /* Extra fields specific to EQ/CQ. */ union { @@ -328,6 +359,7 @@ struct gdma_queue_spec { void *context; unsigned long log2_throttle_limit; + unsigned int msix_index; } eq; struct { @@ -344,19 +376,25 @@ struct gdma_queue_spec { struct gdma_irq_context { void (*handler)(void *arg); - void *arg; + /* Protect the eq_list */ + spinlock_t lock; + struct list_head eq_list; char name[MANA_IRQ_NAME_SZ]; }; +enum gdma_context_flags { + GC_PROBE_SUCCEEDED = 0, +}; + struct gdma_context { struct device *dev; + struct dentry *mana_pci_debugfs; /* Per-vPort max number of queues */ unsigned int max_num_queues; unsigned int max_num_msix; unsigned int num_msix_usable; - struct gdma_resource msix_resource; - struct gdma_irq_context *irq_contexts; + struct xarray irq_contexts; /* L2 MTU */ u16 adapter_mtu; @@ -371,6 +409,8 @@ struct gdma_context { u32 test_event_eq_id; bool is_pf; + bool in_service; + phys_addr_t bar0_pa; void __iomem *bar0_va; void __iomem *shm_base; @@ -387,9 +427,16 @@ struct gdma_context { /* Azure network adapter */ struct gdma_dev mana; -}; -#define MAX_NUM_GDMA_DEVICES 4 + /* Azure RDMA adapter */ + struct gdma_dev mana_ib; + + u64 pf_cap_flags1; + + struct workqueue_struct *service_wq; + + unsigned long flags; +}; static inline bool mana_gd_is_mana(struct gdma_dev *gd) { @@ -445,6 +492,8 @@ struct gdma_wqe { #define INLINE_OOB_SMALL_SIZE 8 #define INLINE_OOB_LARGE_SIZE 24 +#define MANA_MAX_TX_WQE_SGL_ENTRIES 30 + #define MAX_TX_WQE_SIZE 512 #define MAX_RX_WQE_SIZE 256 @@ -535,11 +584,44 @@ enum { */ #define GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX BIT(2) #define GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG BIT(3) +#define GDMA_DRV_CAP_FLAG_1_GDMA_PAGES_4MB_1GB_2GB BIT(4) +#define GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT BIT(5) + +/* Driver can handle holes (zeros) in the device list */ +#define GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP BIT(11) + +/* Driver supports dynamic MSI-X vector allocation */ +#define GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT BIT(13) + +/* Driver can self reset on EQE notification */ +#define GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE BIT(14) + +/* Driver can self reset on FPGA Reconfig EQE notification */ +#define GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE BIT(17) +#define GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE BIT(6) + +/* Driver supports linearizing the skb when num_sge exceeds hardware limit */ +#define GDMA_DRV_CAP_FLAG_1_SKB_LINEARIZE BIT(20) + +/* Driver can send HWC periodically to query stats */ +#define GDMA_DRV_CAP_FLAG_1_PERIODIC_STATS_QUERY BIT(21) + +/* Driver can handle hardware recovery events during probe */ +#define GDMA_DRV_CAP_FLAG_1_PROBE_RECOVERY BIT(22) #define GDMA_DRV_CAP_FLAGS1 \ (GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT | \ GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX | \ - GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG) + GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG | \ + GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT | \ + GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP | \ + GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT | \ + GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE | \ + GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE | \ + GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE | \ + GDMA_DRV_CAP_FLAG_1_PERIODIC_STATS_QUERY | \ + GDMA_DRV_CAP_FLAG_1_SKB_LINEARIZE | \ + GDMA_DRV_CAP_FLAG_1_PROBE_RECOVERY) #define GDMA_DRV_CAP_FLAGS2 0 @@ -600,11 +682,12 @@ struct gdma_query_max_resources_resp { }; /* HW DATA */ /* GDMA_LIST_DEVICES */ +#define GDMA_DEV_LIST_SIZE 64 struct gdma_list_devices_resp { struct gdma_resp_hdr hdr; u32 num_of_devs; u32 reserved; - struct gdma_dev_id devs[64]; + struct gdma_dev_id devs[GDMA_DEV_LIST_SIZE]; }; /* HW DATA */ /* GDMA_REGISTER_DEVICE */ @@ -682,20 +765,6 @@ struct gdma_query_hwc_timeout_resp { u32 reserved; }; -enum atb_page_size { - ATB_PAGE_SIZE_4K, - ATB_PAGE_SIZE_8K, - ATB_PAGE_SIZE_16K, - ATB_PAGE_SIZE_32K, - ATB_PAGE_SIZE_64K, - ATB_PAGE_SIZE_128K, - ATB_PAGE_SIZE_256K, - ATB_PAGE_SIZE_512K, - ATB_PAGE_SIZE_1M, - ATB_PAGE_SIZE_2M, - ATB_PAGE_SIZE_MAX, -}; - enum gdma_mr_access_flags { GDMA_ACCESS_FLAG_LOCAL_READ = BIT_ULL(0), GDMA_ACCESS_FLAG_LOCAL_WRITE = BIT_ULL(1), @@ -754,6 +823,7 @@ struct gdma_destroy_dma_region_req { enum gdma_pd_flags { GDMA_PD_FLAG_INVALID = 0, + GDMA_PD_FLAG_ALLOW_GPA_MR = 1, }; struct gdma_create_pd_req { @@ -779,11 +849,18 @@ struct gdma_destory_pd_resp { };/* HW DATA */ enum gdma_mr_type { + /* + * Guest Physical Address - MRs of this type allow access + * to any DMA-mapped memory using bus-logical address + */ + GDMA_MR_TYPE_GPA = 1, /* Guest Virtual Address - MRs of this type allow access * to memory mapped by PTEs associated with this MR using a virtual * address that is set up in the MST */ GDMA_MR_TYPE_GVA = 2, + /* Guest zero-based address MRs */ + GDMA_MR_TYPE_ZBVA = 4, }; struct gdma_create_mr_params { @@ -795,6 +872,10 @@ struct gdma_create_mr_params { u64 virtual_address; enum gdma_mr_access_flags access_flags; } gva; + struct { + u64 dma_region_handle; + enum gdma_mr_access_flags access_flags; + } zbva; }; }; @@ -810,7 +891,10 @@ struct gdma_create_mr_request { u64 virtual_address; enum gdma_mr_access_flags access_flags; } gva; - + struct { + u64 dma_region_handle; + enum gdma_mr_access_flags access_flags; + } zbva; }; u32 reserved_2; };/* HW DATA */ @@ -859,5 +943,14 @@ int mana_gd_send_request(struct gdma_context *gc, u32 req_len, const void *req, u32 resp_len, void *resp); int mana_gd_destroy_dma_region(struct gdma_context *gc, u64 dma_region_handle); +void mana_register_debugfs(void); +void mana_unregister_debugfs(void); + +int mana_rdma_service_event(struct gdma_context *gc, enum gdma_service_type event); + +int mana_gd_suspend(struct pci_dev *pdev, pm_message_t state); +int mana_gd_resume(struct pci_dev *pdev); + +bool mana_need_log(struct gdma_context *gc, int err); #endif /* _GDMA_H */ diff --git a/include/net/mana/hw_channel.h b/include/net/mana/hw_channel.h index 3d3b5c881bc1..16feb39616c1 100644 --- a/include/net/mana/hw_channel.h +++ b/include/net/mana/hw_channel.h @@ -24,6 +24,8 @@ #define HWC_INIT_DATA_PF_DEST_CQ_ID 11 #define HWC_DATA_CFG_HWC_TIMEOUT 1 +#define HWC_DATA_HW_LINK_CONNECT 2 +#define HWC_DATA_HW_LINK_DISCONNECT 3 #define HW_CHANNEL_WAIT_RESOURCE_TIMEOUT_MS 30000 @@ -49,6 +51,15 @@ union hwc_init_type_data { }; }; /* HW DATA */ +union hwc_init_soc_service_type { + u32 as_uint32; + + struct { + u32 value : 28; + u32 type : 4; + }; +}; /* HW DATA */ + struct hwc_rx_oob { u32 type : 6; u32 eom : 1; @@ -121,7 +132,7 @@ struct hwc_dma_buf { u32 gpa_mkey; u32 num_reqs; - struct hwc_work_request reqs[]; + struct hwc_work_request reqs[] __counted_by(num_reqs); }; typedef void hwc_rx_event_handler_t(void *ctx, u32 gdma_rxq_id, diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 9f70b4332238..d7e089c6b694 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -5,6 +5,7 @@ #define _MANA_H #include <net/xdp.h> +#include <net/net_shaper.h> #include "gdma.h" #include "hw_channel.h" @@ -30,20 +31,32 @@ enum TRI_STATE { }; /* Number of entries for hardware indirection table must be in power of 2 */ -#define MANA_INDIRECT_TABLE_SIZE 64 -#define MANA_INDIRECT_TABLE_MASK (MANA_INDIRECT_TABLE_SIZE - 1) +#define MANA_INDIRECT_TABLE_MAX_SIZE 512 +#define MANA_INDIRECT_TABLE_DEF_SIZE 64 /* The Toeplitz hash key's length in bytes: should be multiple of 8 */ #define MANA_HASH_KEY_SIZE 40 #define COMP_ENTRY_SIZE 64 -#define RX_BUFFERS_PER_QUEUE 512 -#define MANA_RX_DATA_ALIGN 64 +/* This Max value for RX buffers is derived from __alloc_page()'s max page + * allocation calculation. It allows maximum 2^(MAX_ORDER -1) pages. RX buffer + * size beyond this value gets rejected by __alloc_page() call. + */ +#define MAX_RX_BUFFERS_PER_QUEUE 8192 +#define DEF_RX_BUFFERS_PER_QUEUE 1024 +#define MIN_RX_BUFFERS_PER_QUEUE 128 + +/* This max value for TX buffers is derived as the maximum allocatable + * pages supported on host per guest through testing. TX buffer size beyond + * this value is rejected by the hardware. + */ +#define MAX_TX_BUFFERS_PER_QUEUE 16384 +#define DEF_TX_BUFFERS_PER_QUEUE 256 +#define MIN_TX_BUFFERS_PER_QUEUE 128 -#define MAX_SEND_BUFFERS_PER_QUEUE 256 +#define EQ_SIZE (8 * MANA_PAGE_SIZE) -#define EQ_SIZE (8 * PAGE_SIZE) #define LOG2_EQ_THROTTLE 3 #define MAX_PORTS_IN_MANA_DEV 256 @@ -52,6 +65,8 @@ enum TRI_STATE { #define MANA_STATS_RX_COUNT 5 #define MANA_STATS_TX_COUNT 11 +#define MANA_RX_FRAG_ALIGNMENT 64 + struct mana_stats_rx { u64 packets; u64 bytes; @@ -98,14 +113,17 @@ struct mana_txq { atomic_t pending_sends; + bool napi_initialized; + struct mana_stats_tx stats; }; /* skb data and frags dma mappings */ struct mana_skb_head { - dma_addr_t dma_handle[MAX_SKB_FRAGS + 1]; + /* GSO pkts may have 2 SGEs for the linear part*/ + dma_addr_t dma_handle[MAX_SKB_FRAGS + 2]; - u32 size[MAX_SKB_FRAGS + 1]; + u32 size[MAX_SKB_FRAGS + 2]; }; #define MANA_HEADROOM sizeof(struct mana_skb_head) @@ -274,6 +292,7 @@ struct mana_cq { /* NAPI data */ struct napi_struct napi; int work_done; + int work_done_since_doorbell; int budget; }; @@ -284,7 +303,7 @@ struct mana_recv_buf_oob { void *buf_va; bool from_pool; /* allocated from a page pool */ - /* SGL of the buffer going to be sent has part of the work request. */ + /* SGL of the buffer going to be sent as part of the work request. */ u32 num_sge; struct gdma_sge sgl[MAX_RX_WQE_SGL_ENTRIES]; @@ -311,6 +330,7 @@ struct mana_rxq { u32 datasize; u32 alloc_size; u32 headroom; + u32 frag_count; mana_handle_t rxobj; @@ -334,11 +354,12 @@ struct mana_rxq { int xdp_rc; /* XDP redirect return code */ struct page_pool *page_pool; + struct dentry *mana_rx_debugfs; /* MUST BE THE LAST MEMBER: * Each receive buffer has an associated mana_recv_buf_oob. */ - struct mana_recv_buf_oob rx_oobs[]; + struct mana_recv_buf_oob rx_oobs[] __counted_by(num_rx_buf); }; struct mana_tx_qp { @@ -347,11 +368,40 @@ struct mana_tx_qp { struct mana_cq tx_cq; mana_handle_t tx_object; + + struct dentry *mana_tx_debugfs; }; struct mana_ethtool_stats { u64 stop_queue; u64 wake_queue; + u64 tx_cqe_err; + u64 tx_cqe_unknown_type; + u64 tx_linear_pkt_cnt; + u64 rx_coalesced_err; + u64 rx_cqe_unknown_type; +}; + +struct mana_ethtool_hc_stats { + u64 hc_rx_discards_no_wqe; + u64 hc_rx_err_vport_disabled; + u64 hc_rx_bytes; + u64 hc_rx_ucast_pkts; + u64 hc_rx_ucast_bytes; + u64 hc_rx_bcast_pkts; + u64 hc_rx_bcast_bytes; + u64 hc_rx_mcast_pkts; + u64 hc_rx_mcast_bytes; + u64 hc_tx_err_gf_disabled; + u64 hc_tx_err_vport_disabled; + u64 hc_tx_err_inval_vportoffset_pkt; + u64 hc_tx_err_vlan_enforcement; + u64 hc_tx_err_eth_type_enforcement; + u64 hc_tx_err_sa_enforcement; + u64 hc_tx_err_sqpdid_enforcement; + u64 hc_tx_err_cqpdid_enforcement; + u64 hc_tx_err_mtu_violation; + u64 hc_tx_err_inval_oob; u64 hc_tx_bytes; u64 hc_tx_ucast_pkts; u64 hc_tx_ucast_bytes; @@ -359,20 +409,87 @@ struct mana_ethtool_stats { u64 hc_tx_bcast_bytes; u64 hc_tx_mcast_pkts; u64 hc_tx_mcast_bytes; - u64 tx_cqe_err; - u64 tx_cqe_unknown_type; - u64 rx_coalesced_err; - u64 rx_cqe_unknown_type; + u64 hc_tx_err_gdma; +}; + +struct mana_ethtool_phy_stats { + /* Drop Counters */ + u64 rx_pkt_drop_phy; + u64 tx_pkt_drop_phy; + + /* Per TC traffic Counters */ + u64 rx_pkt_tc0_phy; + u64 tx_pkt_tc0_phy; + u64 rx_pkt_tc1_phy; + u64 tx_pkt_tc1_phy; + u64 rx_pkt_tc2_phy; + u64 tx_pkt_tc2_phy; + u64 rx_pkt_tc3_phy; + u64 tx_pkt_tc3_phy; + u64 rx_pkt_tc4_phy; + u64 tx_pkt_tc4_phy; + u64 rx_pkt_tc5_phy; + u64 tx_pkt_tc5_phy; + u64 rx_pkt_tc6_phy; + u64 tx_pkt_tc6_phy; + u64 rx_pkt_tc7_phy; + u64 tx_pkt_tc7_phy; + + u64 rx_byte_tc0_phy; + u64 tx_byte_tc0_phy; + u64 rx_byte_tc1_phy; + u64 tx_byte_tc1_phy; + u64 rx_byte_tc2_phy; + u64 tx_byte_tc2_phy; + u64 rx_byte_tc3_phy; + u64 tx_byte_tc3_phy; + u64 rx_byte_tc4_phy; + u64 tx_byte_tc4_phy; + u64 rx_byte_tc5_phy; + u64 tx_byte_tc5_phy; + u64 rx_byte_tc6_phy; + u64 tx_byte_tc6_phy; + u64 rx_byte_tc7_phy; + u64 tx_byte_tc7_phy; + + /* Per TC pause Counters */ + u64 rx_pause_tc0_phy; + u64 tx_pause_tc0_phy; + u64 rx_pause_tc1_phy; + u64 tx_pause_tc1_phy; + u64 rx_pause_tc2_phy; + u64 tx_pause_tc2_phy; + u64 rx_pause_tc3_phy; + u64 tx_pause_tc3_phy; + u64 rx_pause_tc4_phy; + u64 tx_pause_tc4_phy; + u64 rx_pause_tc5_phy; + u64 tx_pause_tc5_phy; + u64 rx_pause_tc6_phy; + u64 tx_pause_tc6_phy; + u64 rx_pause_tc7_phy; + u64 tx_pause_tc7_phy; }; struct mana_context { struct gdma_dev *gdma_dev; u16 num_ports; + u8 bm_hostmode; + struct mana_ethtool_hc_stats hc_stats; struct mana_eq *eqs; + struct dentry *mana_eqs_debugfs; + + /* Workqueue for querying hardware stats */ + struct delayed_work gf_stats_work; + bool hwc_timeout_occurred; struct net_device *ports[MAX_PORTS_IN_MANA_DEV]; + + /* Link state change work */ + struct work_struct link_change_work; + u32 link_event; }; struct mana_port_context { @@ -390,10 +507,11 @@ struct mana_port_context { struct mana_tx_qp *tx_qp; /* Indirection Table for RX & TX. The values are queue indexes */ - u32 indir_table[MANA_INDIRECT_TABLE_SIZE]; + u32 *indir_table; + u32 indir_table_sz; /* Indirection table containing RxObject Handles */ - mana_handle_t rxobj_table[MANA_INDIRECT_TABLE_SIZE]; + mana_handle_t *rxobj_table; /* Hash key used by the NIC */ u8 hashkey[MANA_HASH_KEY_SIZE]; @@ -408,6 +526,7 @@ struct mana_port_context { u32 rxbpre_datasize; u32 rxbpre_alloc_size; u32 rxbpre_headroom; + u32 rxbpre_frag_count; struct bpf_prog *bpf_prog; @@ -415,6 +534,9 @@ struct mana_port_context { unsigned int max_queues; unsigned int num_queues; + unsigned int rx_queue_size; + unsigned int tx_queue_size; + mana_handle_t port_handle; mana_handle_t pf_filter_handle; @@ -422,12 +544,24 @@ struct mana_port_context { struct mutex vport_mutex; int vport_use_count; + /* Net shaper handle*/ + struct net_shaper_handle handle; + u16 port_idx; + /* Currently configured speed (mbps) */ + u32 speed; + /* Maximum speed supported by the SKU (mbps) */ + u32 max_speed; bool port_is_up; bool port_st_save; /* Saved port state */ struct mana_ethtool_stats eth_stats; + + struct mana_ethtool_phy_stats phy_stats; + + /* Debugfs */ + struct dentry *mana_port_debugfs; }; netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev); @@ -441,6 +575,9 @@ int mana_detach(struct net_device *ndev, bool from_close); int mana_probe(struct gdma_dev *gd, bool resuming); void mana_remove(struct gdma_dev *gd, bool suspending); +int mana_rdma_probe(struct gdma_dev *gd); +void mana_rdma_remove(struct gdma_dev *gd); + void mana_xdp_tx(struct sk_buff *skb, struct net_device *ndev); int mana_xdp_xmit(struct net_device *ndev, int n, struct xdp_frame **frames, u32 flags); @@ -449,9 +586,17 @@ u32 mana_run_xdp(struct net_device *ndev, struct mana_rxq *rxq, struct bpf_prog *mana_xdp_get(struct mana_port_context *apc); void mana_chn_setxdp(struct mana_port_context *apc, struct bpf_prog *prog); int mana_bpf(struct net_device *ndev, struct netdev_bpf *bpf); -void mana_query_gf_stats(struct mana_port_context *apc); +int mana_query_gf_stats(struct mana_context *ac); +int mana_query_link_cfg(struct mana_port_context *apc); +int mana_set_bw_clamp(struct mana_port_context *apc, u32 speed, + int enable_clamping); +void mana_query_phy_stats(struct mana_port_context *apc); +int mana_pre_alloc_rxbufs(struct mana_port_context *apc, int mtu, int num_queues); +void mana_pre_dealloc_rxbufs(struct mana_port_context *apc); +void mana_unmap_skb(struct sk_buff *skb, struct mana_port_context *apc); extern const struct ethtool_ops mana_ethtool_ops; +extern struct dentry *mana_debugfs_root; /* A CQ can be created not associated with any EQ */ #define GDMA_CQ_NO_EQ 0xffff @@ -473,6 +618,9 @@ enum mana_command_code { MANA_FENCE_RQ = 0x20006, MANA_CONFIG_VPORT_RX = 0x20007, MANA_QUERY_VPORT_CONFIG = 0x20008, + MANA_QUERY_LINK_CONFIG = 0x2000A, + MANA_SET_BW_CLAMP = 0x2000B, + MANA_QUERY_PHY_STAT = 0x2000c, /* Privileged commands for the PF mode */ MANA_REGISTER_FILTER = 0x28000, @@ -481,6 +629,35 @@ enum mana_command_code { MANA_DEREGISTER_HW_PORT = 0x28004, }; +/* Query Link Configuration*/ +struct mana_query_link_config_req { + struct gdma_req_hdr hdr; + mana_handle_t vport; +}; /* HW DATA */ + +struct mana_query_link_config_resp { + struct gdma_resp_hdr hdr; + u32 qos_speed_mbps; + u8 qos_unconfigured; + u8 reserved1[3]; + u32 link_speed_mbps; + u8 reserved2[4]; +}; /* HW DATA */ + +/* Set Bandwidth Clamp*/ +struct mana_set_bw_clamp_req { + struct gdma_req_hdr hdr; + mana_handle_t vport; + enum TRI_STATE enable_clamping; + u32 link_speed_mbps; +}; /* HW DATA */ + +struct mana_set_bw_clamp_resp { + struct gdma_resp_hdr hdr; + u8 qos_unconfigured; + u8 reserved[7]; +}; /* HW DATA */ + /* Query Device Configuration */ struct mana_query_device_cfg_req { struct gdma_req_hdr hdr; @@ -507,7 +684,8 @@ struct mana_query_device_cfg_resp { u64 pf_cap_flags4; u16 max_num_vports; - u16 reserved; + u8 bm_hostmode; /* response v3: Bare Metal Host Mode */ + u8 reserved; u32 max_num_eqs; /* response v2: */ @@ -601,8 +779,8 @@ struct mana_query_gf_stat_resp { struct gdma_resp_hdr hdr; u64 reported_stats; /* rx errors/discards */ - u64 discard_rx_nowqe; - u64 err_rx_vport_disabled; + u64 rx_discards_nowqe; + u64 rx_err_vport_disabled; /* rx bytes/packets */ u64 hc_rx_bytes; u64 hc_rx_ucast_pkts; @@ -612,16 +790,16 @@ struct mana_query_gf_stat_resp { u64 hc_rx_mcast_pkts; u64 hc_rx_mcast_bytes; /* tx errors */ - u64 err_tx_gf_disabled; - u64 err_tx_vport_disabled; - u64 err_tx_inval_vport_offset_pkt; - u64 err_tx_vlan_enforcement; - u64 err_tx_ethtype_enforcement; - u64 err_tx_SA_enforecement; - u64 err_tx_SQPDID_enforcement; - u64 err_tx_CQPDID_enforcement; - u64 err_tx_mtu_violation; - u64 err_tx_inval_oob; + u64 tx_err_gf_disabled; + u64 tx_err_vport_disabled; + u64 tx_err_inval_vport_offset_pkt; + u64 tx_err_vlan_enforcement; + u64 tx_err_ethtype_enforcement; + u64 tx_err_SA_enforcement; + u64 tx_err_SQPDID_enforcement; + u64 tx_err_CQPDID_enforcement; + u64 tx_err_mtu_violation; + u64 tx_err_inval_oob; /* tx bytes/packets */ u64 hc_tx_bytes; u64 hc_tx_ucast_pkts; @@ -631,7 +809,75 @@ struct mana_query_gf_stat_resp { u64 hc_tx_mcast_pkts; u64 hc_tx_mcast_bytes; /* tx error */ - u64 err_tx_gdma; + u64 tx_err_gdma; +}; /* HW DATA */ + +/* Query phy stats */ +struct mana_query_phy_stat_req { + struct gdma_req_hdr hdr; + u64 req_stats; +}; /* HW DATA */ + +struct mana_query_phy_stat_resp { + struct gdma_resp_hdr hdr; + u64 reported_stats; + + /* Aggregate Drop Counters */ + u64 rx_pkt_drop_phy; + u64 tx_pkt_drop_phy; + + /* Per TC(Traffic class) traffic Counters */ + u64 rx_pkt_tc0_phy; + u64 tx_pkt_tc0_phy; + u64 rx_pkt_tc1_phy; + u64 tx_pkt_tc1_phy; + u64 rx_pkt_tc2_phy; + u64 tx_pkt_tc2_phy; + u64 rx_pkt_tc3_phy; + u64 tx_pkt_tc3_phy; + u64 rx_pkt_tc4_phy; + u64 tx_pkt_tc4_phy; + u64 rx_pkt_tc5_phy; + u64 tx_pkt_tc5_phy; + u64 rx_pkt_tc6_phy; + u64 tx_pkt_tc6_phy; + u64 rx_pkt_tc7_phy; + u64 tx_pkt_tc7_phy; + + u64 rx_byte_tc0_phy; + u64 tx_byte_tc0_phy; + u64 rx_byte_tc1_phy; + u64 tx_byte_tc1_phy; + u64 rx_byte_tc2_phy; + u64 tx_byte_tc2_phy; + u64 rx_byte_tc3_phy; + u64 tx_byte_tc3_phy; + u64 rx_byte_tc4_phy; + u64 tx_byte_tc4_phy; + u64 rx_byte_tc5_phy; + u64 tx_byte_tc5_phy; + u64 rx_byte_tc6_phy; + u64 tx_byte_tc6_phy; + u64 rx_byte_tc7_phy; + u64 tx_byte_tc7_phy; + + /* Per TC(Traffic Class) pause Counters */ + u64 rx_pause_tc0_phy; + u64 tx_pause_tc0_phy; + u64 rx_pause_tc1_phy; + u64 tx_pause_tc1_phy; + u64 rx_pause_tc2_phy; + u64 tx_pause_tc2_phy; + u64 rx_pause_tc3_phy; + u64 tx_pause_tc3_phy; + u64 rx_pause_tc4_phy; + u64 tx_pause_tc4_phy; + u64 rx_pause_tc5_phy; + u64 tx_pause_tc5_phy; + u64 rx_pause_tc6_phy; + u64 tx_pause_tc6_phy; + u64 rx_pause_tc7_phy; + u64 tx_pause_tc7_phy; }; /* HW DATA */ /* Configure vPort Rx Steering */ @@ -650,6 +896,7 @@ struct mana_cfg_rx_steer_req_v2 { u8 hashkey[MANA_HASH_KEY_SIZE]; u8 cqe_coalescing_enable; u8 reserved2[7]; + mana_handle_t indir_tab[] __counted_by(num_indir_entries); }; /* HW DATA */ struct mana_cfg_rx_steer_resp { @@ -775,4 +1022,8 @@ void mana_destroy_wq_obj(struct mana_port_context *apc, u32 wq_type, int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id, u32 doorbell_pg_id); void mana_uncfg_vport(struct mana_port_context *apc); + +struct net_device *mana_get_primary_netdev(struct mana_context *ac, + u32 port_index, + netdevice_tracker *tracker); #endif /* _MANA_H */ diff --git a/include/net/mctp.h b/include/net/mctp.h index da86e106c91d..c3207ce98f07 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -69,7 +69,10 @@ struct mctp_sock { /* bind() params */ unsigned int bind_net; - mctp_eid_t bind_addr; + mctp_eid_t bind_local_addr; + mctp_eid_t bind_peer_addr; + unsigned int bind_peer_net; + bool bind_peer_set; __u8 bind_type; /* sendmsg()/recvmsg() uses struct sockaddr_mctp_ext */ @@ -87,7 +90,7 @@ struct mctp_sock { }; /* Key for matching incoming packets to sockets or reassembly contexts. - * Packets are matched on (src,dest,tag). + * Packets are matched on (peer EID, local EID, tag). * * Lifetime / locking requirements: * @@ -133,6 +136,7 @@ struct mctp_sock { * - through an expiry timeout, on a per-socket timer */ struct mctp_sk_key { + unsigned int net; mctp_eid_t peer_addr; mctp_eid_t local_addr; /* MCTP_ADDR_ANY for local owned tags */ __u8 tag; /* incoming tag match; invert TO for local */ @@ -182,8 +186,8 @@ struct mctp_sk_key { struct mctp_skb_cb { unsigned int magic; unsigned int net; - int ifindex; /* extended/direct addressing if set */ - mctp_eid_t src; + /* fields below provide extended addressing for ingress to recvmsg() */ + int ifindex; unsigned char halen; unsigned char haddr[MAX_ADDR_LEN]; }; @@ -211,7 +215,7 @@ static inline struct mctp_skb_cb *mctp_cb(struct sk_buff *skb) BUILD_BUG_ON(sizeof(struct mctp_skb_cb) > sizeof(skb->cb)); WARN_ON(cb->magic != 0x4d435450); - return (void *)(skb->cb); + return cb; } /* If CONFIG_MCTP_FLOWS, we may add one of these as a SKB extension, @@ -221,6 +225,8 @@ struct mctp_flow { struct mctp_sk_key *key; }; +struct mctp_dst; + /* Route definition. * * These are held in the pernet->mctp.routes list, with RCU protection for @@ -228,16 +234,25 @@ struct mctp_flow { * dropped on NETDEV_UNREGISTER events. * * Updates to the route table are performed under rtnl; all reads under RCU, - * so routes cannot be referenced over a RCU grace period. Specifically: A - * caller cannot block between mctp_route_lookup and mctp_route_release() + * so routes cannot be referenced over a RCU grace period. */ struct mctp_route { mctp_eid_t min, max; unsigned char type; + unsigned int mtu; - struct mctp_dev *dev; - int (*output)(struct mctp_route *route, + + enum { + MCTP_ROUTE_DIRECT, + MCTP_ROUTE_GATEWAY, + } dst_type; + union { + struct mctp_dev *dev; + struct mctp_fq_addr gateway; + }; + + int (*output)(struct mctp_dst *dst, struct sk_buff *skb); struct list_head list; @@ -245,16 +260,41 @@ struct mctp_route { struct rcu_head rcu; }; +/* Route lookup result: dst. Represents the results of a routing decision, + * but is only held over the individual routing operation. + * + * Will typically be stored on the caller stack, and must be released after + * usage. + */ +struct mctp_dst { + struct mctp_dev *dev; + unsigned int mtu; + mctp_eid_t nexthop; + + /* set for direct addressing */ + unsigned char halen; + unsigned char haddr[MAX_ADDR_LEN]; + + int (*output)(struct mctp_dst *dst, struct sk_buff *skb); +}; + +int mctp_dst_from_extaddr(struct mctp_dst *dst, struct net *net, int ifindex, + unsigned char halen, const unsigned char *haddr); + /* route interfaces */ -struct mctp_route *mctp_route_lookup(struct net *net, unsigned int dnet, - mctp_eid_t daddr); +int mctp_route_lookup(struct net *net, unsigned int dnet, + mctp_eid_t daddr, struct mctp_dst *dst); -int mctp_local_output(struct sock *sk, struct mctp_route *rt, +void mctp_dst_release(struct mctp_dst *dst); + +/* always takes ownership of skb */ +int mctp_local_output(struct sock *sk, struct mctp_dst *dst, struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag); void mctp_key_unref(struct mctp_sk_key *key); struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk, - mctp_eid_t daddr, mctp_eid_t saddr, + unsigned int netid, + mctp_eid_t local, mctp_eid_t peer, bool manual, u8 *tagp); /* routing <--> device interface */ @@ -292,7 +332,25 @@ void mctp_neigh_remove_dev(struct mctp_dev *mdev); int mctp_routes_init(void); void mctp_routes_exit(void); -void mctp_device_init(void); +int mctp_device_init(void); void mctp_device_exit(void); +/* MCTP IDs and Codes from DMTF specification + * "DSP0239 Management Component Transport Protocol (MCTP) IDs and Codes" + * https://www.dmtf.org/sites/default/files/standards/documents/DSP0239_1.11.1.pdf + */ +enum mctp_phys_binding { + MCTP_PHYS_BINDING_UNSPEC = 0x00, + MCTP_PHYS_BINDING_SMBUS = 0x01, + MCTP_PHYS_BINDING_PCIE_VDM = 0x02, + MCTP_PHYS_BINDING_USB = 0x03, + MCTP_PHYS_BINDING_KCS = 0x04, + MCTP_PHYS_BINDING_SERIAL = 0x05, + MCTP_PHYS_BINDING_I3C = 0x06, + MCTP_PHYS_BINDING_MMBI = 0x07, + MCTP_PHYS_BINDING_PCC = 0x08, + MCTP_PHYS_BINDING_UCIE = 0x09, + MCTP_PHYS_BINDING_VENDOR = 0xFF, +}; + #endif /* __NET_MCTP_H */ diff --git a/include/net/mctpdevice.h b/include/net/mctpdevice.h index 5c0d04b5c12c..957d9ef924c5 100644 --- a/include/net/mctpdevice.h +++ b/include/net/mctpdevice.h @@ -22,6 +22,7 @@ struct mctp_dev { refcount_t refs; unsigned int net; + enum mctp_phys_binding binding; const struct mctp_netdev_ops *ops; @@ -44,7 +45,8 @@ struct mctp_dev *mctp_dev_get_rtnl(const struct net_device *dev); struct mctp_dev *__mctp_dev_get(const struct net_device *dev); int mctp_register_netdev(struct net_device *dev, - const struct mctp_netdev_ops *ops); + const struct mctp_netdev_ops *ops, + enum mctp_phys_binding binding); void mctp_unregister_netdev(struct net_device *dev); void mctp_dev_hold(struct mctp_dev *mdev); diff --git a/include/net/mptcp.h b/include/net/mptcp.h index fb996124b3d5..f7263fe2a2e4 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -14,6 +14,7 @@ struct mptcp_info; struct mptcp_sock; +struct mptcp_pm_addr_entry; struct seq_file; /* MPTCP sk_buff extension data */ @@ -97,17 +98,12 @@ struct mptcp_out_options { }; #define MPTCP_SCHED_NAME_MAX 16 -#define MPTCP_SUBFLOWS_MAX 8 - -struct mptcp_sched_data { - bool reinject; - u8 subflows; - struct mptcp_subflow_context *contexts[MPTCP_SUBFLOWS_MAX]; -}; +#define MPTCP_SCHED_MAX 128 +#define MPTCP_SCHED_BUF_MAX (MPTCP_SCHED_NAME_MAX * MPTCP_SCHED_MAX) struct mptcp_sched_ops { - int (*get_subflow)(struct mptcp_sock *msk, - struct mptcp_sched_data *data); + int (*get_send)(struct mptcp_sock *msk); + int (*get_retrans)(struct mptcp_sock *msk); char name[MPTCP_SCHED_NAME_MAX]; struct module *owner; @@ -117,6 +113,19 @@ struct mptcp_sched_ops { void (*release)(struct mptcp_sock *msk); } ____cacheline_aligned_in_smp; +#define MPTCP_PM_NAME_MAX 16 +#define MPTCP_PM_MAX 128 +#define MPTCP_PM_BUF_MAX (MPTCP_PM_NAME_MAX * MPTCP_PM_MAX) + +struct mptcp_pm_ops { + char name[MPTCP_PM_NAME_MAX]; + struct module *owner; + struct list_head list; + + void (*init)(struct mptcp_sock *msk); + void (*release)(struct mptcp_sock *msk); +} ____cacheline_aligned_in_smp; + #ifdef CONFIG_MPTCP void mptcp_init(void); @@ -220,6 +229,8 @@ static inline __be32 mptcp_reset_option(const struct sk_buff *skb) return htonl(0u); } + +void mptcp_active_detect_blackhole(struct sock *sk, bool expired); #else static inline void mptcp_init(void) @@ -304,6 +315,8 @@ static inline struct request_sock *mptcp_subflow_reqsk_alloc(const struct reques } static inline __be32 mptcp_reset_option(const struct sk_buff *skb) { return htonl(0u); } + +static inline void mptcp_active_detect_blackhole(struct sock *sk, bool expired) { } #endif /* CONFIG_MPTCP */ #if IS_ENABLED(CONFIG_MPTCP_IPV6) diff --git a/include/net/ndisc.h b/include/net/ndisc.h index 9bbdf6eaa942..d38783a2ce57 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -60,15 +60,6 @@ enum { #include <net/neighbour.h> -/* Set to 3 to get tracing... */ -#define ND_DEBUG 1 - -#define ND_PRINTK(val, level, fmt, ...) \ -do { \ - if (val <= ND_DEBUG) \ - net_##level##_ratelimited(fmt, ##__VA_ARGS__); \ -} while (0) - struct ctl_table; struct inet6_dev; struct net_device; @@ -147,11 +138,6 @@ void __ndisc_fill_addr_option(struct sk_buff *skb, int type, const void *data, * The following hooks can be defined; unless noted otherwise, they are * optional and can be filled with a null pointer. * - * int (*is_useropt)(u8 nd_opt_type): - * This function is called when IPv6 decide RA userspace options. if - * this function returns 1 then the option given by nd_opt_type will - * be handled as userspace option additional to the IPv6 options. - * * int (*parse_options)(const struct net_device *dev, * struct nd_opt_hdr *nd_opt, * struct ndisc_options *ndopts): @@ -200,7 +186,6 @@ void __ndisc_fill_addr_option(struct sk_buff *skb, int type, const void *data, * addresses. E.g. 802.15.4 6LoWPAN. */ struct ndisc_ops { - int (*is_useropt)(u8 nd_opt_type); int (*parse_options)(const struct net_device *dev, struct nd_opt_hdr *nd_opt, struct ndisc_options *ndopts); @@ -224,15 +209,6 @@ struct ndisc_ops { }; #if IS_ENABLED(CONFIG_IPV6) -static inline int ndisc_ops_is_useropt(const struct net_device *dev, - u8 nd_opt_type) -{ - if (dev->ndisc_ops && dev->ndisc_ops->is_useropt) - return dev->ndisc_ops->is_useropt(nd_opt_type); - else - return 0; -} - static inline int ndisc_ops_parse_options(const struct net_device *dev, struct nd_opt_hdr *nd_opt, struct ndisc_options *ndopts) @@ -486,7 +462,7 @@ void igmp6_event_report(struct sk_buff *skb); #ifdef CONFIG_SYSCTL -int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, +int ndisc_ifinfo_sysctl_change(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); #endif diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 6da68886fabb..2dfee6d4258a 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -29,6 +29,7 @@ #include <linux/sysctl.h> #include <linux/workqueue.h> #include <net/rtnetlink.h> +#include <net/neighbour_tables.h> /* * NUD stands for "neighbor unreachability detection" @@ -91,15 +92,17 @@ struct neigh_parms { static inline void neigh_var_set(struct neigh_parms *p, int index, int val) { set_bit(index, p->data_state); - p->data[index] = val; + WRITE_ONCE(p->data[index], val); } -#define NEIGH_VAR(p, attr) ((p)->data[NEIGH_VAR_ ## attr]) +#define __NEIGH_VAR(p, attr) ((p)->data[NEIGH_VAR_ ## attr]) +#define NEIGH_VAR(p, attr) READ_ONCE(__NEIGH_VAR(p, attr)) +#define NEIGH_VAR_PTR(p, attr) (&(__NEIGH_VAR(p, attr))) /* In ndo_neigh_setup, NEIGH_VAR_INIT should be used. * In other cases, NEIGH_VAR_SET should be used. */ -#define NEIGH_VAR_INIT(p, attr, val) (NEIGH_VAR(p, attr) = val) +#define NEIGH_VAR_INIT(p, attr, val) (__NEIGH_VAR(p, attr) = val) #define NEIGH_VAR_SET(p, attr, val) neigh_var_set(p, NEIGH_VAR_ ## attr, val) static inline void neigh_parms_data_state_setall(struct neigh_parms *p) @@ -135,7 +138,8 @@ struct neigh_statistics { #define NEIGH_CACHE_STAT_INC(tbl, field) this_cpu_inc((tbl)->stats->field) struct neighbour { - struct neighbour __rcu *next; + struct hlist_node hash; + struct hlist_node dev_list; struct neigh_table *tbl; struct neigh_parms *parms; unsigned long confirmed; @@ -162,7 +166,7 @@ struct neighbour { struct rcu_head rcu; struct net_device *dev; netdevice_tracker dev_tracker; - u8 primary_key[0]; + u8 primary_key[]; } __randomize_layout; struct neigh_ops { @@ -174,12 +178,17 @@ struct neigh_ops { }; struct pneigh_entry { - struct pneigh_entry *next; + struct pneigh_entry __rcu *next; possible_net_t net; struct net_device *dev; netdevice_tracker dev_tracker; + union { + struct list_head free_node; + struct rcu_head rcu; + }; u32 flags; u8 protocol; + bool permanent; u32 key[]; }; @@ -190,7 +199,7 @@ struct pneigh_entry { #define NEIGH_NUM_HASH_RND 4 struct neigh_hash_table { - struct neighbour __rcu **hash_buckets; + struct hlist_head *hash_heads; unsigned int hash_shift; __u32 hash_rnd[NEIGH_NUM_HASH_RND]; struct rcu_head rcu; @@ -229,19 +238,12 @@ struct neigh_table { atomic_t gc_entries; struct list_head gc_list; struct list_head managed_list; - rwlock_t lock; + spinlock_t lock; unsigned long last_rand; struct neigh_statistics __percpu *stats; struct neigh_hash_table __rcu *nht; - struct pneigh_entry **phash_buckets; -}; - -enum { - NEIGH_ARP_TABLE = 0, - NEIGH_ND_TABLE = 1, - NEIGH_DN_TABLE = 2, - NEIGH_NR_TABLES, - NEIGH_LINK_TABLE = NEIGH_NR_TABLES /* Pseudo table for neigh_xmit */ + struct mutex phash_lock; + struct pneigh_entry __rcu **phash_buckets; }; static inline int neigh_parms_family(struct neigh_parms *p) @@ -266,16 +268,24 @@ static inline void *neighbour_priv(const struct neighbour *n) #define NEIGH_UPDATE_F_EXT_LEARNED BIT(5) #define NEIGH_UPDATE_F_ISROUTER BIT(6) #define NEIGH_UPDATE_F_ADMIN BIT(7) +#define NEIGH_UPDATE_F_EXT_VALIDATED BIT(8) /* In-kernel representation for NDA_FLAGS_EXT flags: */ #define NTF_OLD_MASK 0xff #define NTF_EXT_SHIFT 8 -#define NTF_EXT_MASK (NTF_EXT_MANAGED) +#define NTF_EXT_MASK (NTF_EXT_MANAGED | NTF_EXT_EXT_VALIDATED) #define NTF_MANAGED (NTF_EXT_MANAGED << NTF_EXT_SHIFT) +#define NTF_EXT_VALIDATED (NTF_EXT_EXT_VALIDATED << NTF_EXT_SHIFT) extern const struct nla_policy nda_policy[]; +#define neigh_for_each_in_bucket(pos, head) hlist_for_each_entry(pos, head, hash) +#define neigh_for_each_in_bucket_rcu(pos, head) \ + hlist_for_each_entry_rcu(pos, head, hash) +#define neigh_for_each_in_bucket_safe(pos, tmp, head) \ + hlist_for_each_entry_safe(pos, tmp, head, hash) + static inline bool neigh_key_eq32(const struct neighbour *n, const void *pkey) { return *(const u32 *)n->primary_key == *(const u32 *)pkey; @@ -304,12 +314,9 @@ static inline struct neighbour *___neigh_lookup_noref( u32 hash_val; hash_val = hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift); - for (n = rcu_dereference(nht->hash_buckets[hash_val]); - n != NULL; - n = rcu_dereference(n->next)) { + neigh_for_each_in_bucket_rcu(n, &nht->hash_heads[hash_val]) if (n->dev == dev && key_eq(n, pkey)) return n; - } return NULL; } @@ -350,7 +357,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid); void __neigh_set_probe_once(struct neighbour *neigh); -bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl); +bool neigh_remove_one(struct neighbour *ndel); void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev); int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev); int neigh_carrier_down(struct neigh_table *tbl, struct net_device *dev); @@ -373,13 +380,20 @@ struct net *neigh_parms_net(const struct neigh_parms *parms) unsigned long neigh_rand_reach_time(unsigned long base); +static inline void neigh_set_reach_time(struct neigh_parms *p) +{ + unsigned long base = NEIGH_VAR(p, BASE_REACHABLE_TIME); + + WRITE_ONCE(p->reachable_time, neigh_rand_reach_time(base)); +} + void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb); struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, - const void *key, struct net_device *dev, - int creat); -struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, struct net *net, - const void *key, struct net_device *dev); + const void *key, struct net_device *dev); +int pneigh_create(struct neigh_table *tbl, struct net *net, const void *key, + struct net_device *dev, u32 flags, u8 protocol, + bool permanent); int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); @@ -412,12 +426,12 @@ void *neigh_seq_start(struct seq_file *, loff_t *, struct neigh_table *, void *neigh_seq_next(struct seq_file *, void *, loff_t *); void neigh_seq_stop(struct seq_file *, void *); -int neigh_proc_dointvec(struct ctl_table *ctl, int write, +int neigh_proc_dointvec(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); -int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, +int neigh_proc_dointvec_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); -int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write, +int neigh_proc_dointvec_ms_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, @@ -539,7 +553,7 @@ static inline int neigh_output(struct neighbour *n, struct sk_buff *skb, READ_ONCE(hh->hh_len)) return neigh_hh_output(hh, skb); - return n->output(n, skb); + return READ_ONCE(n->output)(n, skb); } static inline struct neighbour * diff --git a/include/net/neighbour_tables.h b/include/net/neighbour_tables.h new file mode 100644 index 000000000000..bcffbe8f7601 --- /dev/null +++ b/include/net/neighbour_tables.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NET_NEIGHBOUR_TABLES_H +#define _NET_NEIGHBOUR_TABLES_H + +enum { + NEIGH_ARP_TABLE = 0, + NEIGH_ND_TABLE = 1, + NEIGH_NR_TABLES, + NEIGH_LINK_TABLE = NEIGH_NR_TABLES /* Pseudo table for neigh_xmit */ +}; + +#endif diff --git a/include/net/net_debug.h b/include/net/net_debug.h index 1e74684cbbdb..47f7a4a878b9 100644 --- a/include/net/net_debug.h +++ b/include/net/net_debug.h @@ -27,7 +27,7 @@ void netdev_info(const struct net_device *dev, const char *format, ...); #define netdev_level_once(level, dev, fmt, ...) \ do { \ - static bool __section(".data.once") __print_once; \ + static bool __section(".data..once") __print_once; \ \ if (!__print_once) { \ __print_once = true; \ @@ -149,9 +149,11 @@ do { \ #if defined(CONFIG_DEBUG_NET) -#define DEBUG_NET_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond) +#define DEBUG_NET_WARN_ON_ONCE(cond) ((void)WARN_ON_ONCE(cond)) +#define DEBUG_NET_WARN_ONCE(cond, format...) ((void)WARN_ONCE(cond, format)) #else #define DEBUG_NET_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) +#define DEBUG_NET_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) #endif #endif /* _LINUX_NET_DEBUG_H */ diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index eb6cd43b1746..cb664f6e3558 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -67,8 +67,6 @@ struct net { */ spinlock_t rules_mod_lock; - atomic_t dev_unreg_count; - unsigned int dev_base_seq; /* protected by rtnl_mutex */ u32 ifindex; @@ -82,8 +80,12 @@ struct net { * or to unregister pernet ops * (pernet_ops_rwsem write locked). */ + struct llist_node defer_free_list; struct llist_node cleanup_list; /* namespaces on death row */ + struct list_head ptype_all; + struct list_head ptype_specific; + #ifdef CONFIG_KEYS struct key_tag *key_domain; /* Key domain of operation tag */ #endif @@ -190,6 +192,10 @@ struct net { #if IS_ENABLED(CONFIG_SMC) struct netns_smc smc; #endif +#ifdef CONFIG_DEBUG_NET_SMALL_RTNL + /* Move to a better place when the config guard is removed. */ + struct mutex rtnl_mutex; +#endif } __randomize_layout; #include <linux/seq_file_net.h> @@ -198,7 +204,7 @@ struct net { extern struct net init_net; #ifdef CONFIG_NET_NS -struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns, +struct net *copy_net_ns(u64 flags, struct user_namespace *user_ns, struct net *old_net); void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid); @@ -207,10 +213,12 @@ void net_ns_barrier(void); struct ns_common *get_net_ns(struct ns_common *ns); struct net *get_net_ns_by_fd(int fd); +extern struct task_struct *cleanup_net_task; + #else /* CONFIG_NET_NS */ #include <linux/sched.h> #include <linux/nsproxy.h> -static inline struct net *copy_net_ns(unsigned long flags, +static inline struct net *copy_net_ns(u64 flags, struct user_namespace *user_ns, struct net *old_net) { if (flags & CLONE_NEWNET) @@ -254,10 +262,15 @@ void ipx_unregister_sysctl(void); #ifdef CONFIG_NET_NS void __put_net(struct net *net); +static inline struct net *to_net_ns(struct ns_common *ns) +{ + return container_of(ns, struct net, ns); +} + /* Try using get_net_track() instead */ static inline struct net *get_net(struct net *net) { - refcount_inc(&net->ns.count); + ns_ref_inc(net); return net; } @@ -268,7 +281,7 @@ static inline struct net *maybe_get_net(struct net *net) * exists. If the reference count is zero this * function fails and returns NULL. */ - if (!refcount_inc_not_zero(&net->ns.count)) + if (!ns_ref_get(net)) net = NULL; return net; } @@ -276,7 +289,7 @@ static inline struct net *maybe_get_net(struct net *net) /* Try using put_net_track() instead */ static inline void put_net(struct net *net) { - if (refcount_dec_and_test(&net->ns.count)) + if (ns_ref_put(net)) __put_net(net); } @@ -288,10 +301,11 @@ int net_eq(const struct net *net1, const struct net *net2) static inline int check_net(const struct net *net) { - return refcount_read(&net->ns.count) != 0; + return ns_ref_read(net) != 0; } void net_drop_ns(void *); +void net_passive_dec(struct net *net); #else @@ -321,8 +335,23 @@ static inline int check_net(const struct net *net) } #define net_drop_ns NULL + +static inline void net_passive_dec(struct net *net) +{ + refcount_dec(&net->passive); +} #endif +static inline void net_passive_inc(struct net *net) +{ + refcount_inc(&net->passive); +} + +/* Returns true if the netns initialization is completed successfully */ +static inline bool net_initialized(const struct net *net) +{ + return READ_ONCE(net->list.next); +} static inline void __netns_tracker_alloc(struct net *net, netns_tracker *tracker, @@ -368,21 +397,30 @@ static inline void put_net_track(struct net *net, netns_tracker *tracker) typedef struct { #ifdef CONFIG_NET_NS - struct net *net; + struct net __rcu *net; #endif } possible_net_t; static inline void write_pnet(possible_net_t *pnet, struct net *net) { #ifdef CONFIG_NET_NS - pnet->net = net; + rcu_assign_pointer(pnet->net, net); #endif } static inline struct net *read_pnet(const possible_net_t *pnet) { #ifdef CONFIG_NET_NS - return pnet->net; + return rcu_dereference_protected(pnet->net, true); +#else + return &init_net; +#endif +} + +static inline struct net *read_pnet_rcu(const possible_net_t *pnet) +{ +#ifdef CONFIG_NET_NS + return rcu_dereference(pnet->net); #else return &init_net; #endif @@ -441,8 +479,11 @@ struct pernet_operations { void (*pre_exit)(struct net *net); void (*exit)(struct net *net); void (*exit_batch)(struct list_head *net_exit_list); - unsigned int *id; - size_t size; + /* Following method is called with RTNL held. */ + void (*exit_rtnl)(struct net *net, + struct list_head *dev_kill_list); + unsigned int * const id; + const size_t size; }; /* diff --git a/include/net/net_shaper.h b/include/net/net_shaper.h new file mode 100644 index 000000000000..5c3f49b52fe9 --- /dev/null +++ b/include/net/net_shaper.h @@ -0,0 +1,120 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _NET_SHAPER_H_ +#define _NET_SHAPER_H_ + +#include <linux/types.h> + +#include <uapi/linux/net_shaper.h> + +struct net_device; +struct devlink; +struct netlink_ext_ack; + +enum net_shaper_binding_type { + NET_SHAPER_BINDING_TYPE_NETDEV, + /* NET_SHAPER_BINDING_TYPE_DEVLINK_PORT */ +}; + +struct net_shaper_binding { + enum net_shaper_binding_type type; + union { + struct net_device *netdev; + struct devlink *devlink; + }; +}; + +struct net_shaper_handle { + enum net_shaper_scope scope; + u32 id; +}; + +/** + * struct net_shaper - represents a shaping node on the NIC H/W + * zeroed field are considered not set. + * @parent: Unique identifier for the shaper parent, usually implied + * @handle: Unique identifier for this shaper + * @metric: Specify if the rate limits refers to PPS or BPS + * @bw_min: Minimum guaranteed rate for this shaper + * @bw_max: Maximum peak rate allowed for this shaper + * @burst: Maximum burst for the peek rate of this shaper + * @priority: Scheduling priority for this shaper + * @weight: Scheduling weight for this shaper + */ +struct net_shaper { + struct net_shaper_handle parent; + struct net_shaper_handle handle; + enum net_shaper_metric metric; + u64 bw_min; + u64 bw_max; + u64 burst; + u32 priority; + u32 weight; + + /* private: */ + u32 leaves; /* accounted only for NODE scope */ + struct rcu_head rcu; +}; + +/** + * struct net_shaper_ops - Operations on device H/W shapers + * + * The operations applies to either net_device and devlink objects. + * The initial shaping configuration at device initialization is empty: + * does not constraint the rate in any way. + * The network core keeps track of the applied user-configuration in + * the net_device or devlink structure. + * The operations are serialized via a per device lock. + * + * Device not supporting any kind of nesting should not provide the + * group operation. + * + * Each shaper is uniquely identified within the device with a 'handle' + * comprising the shaper scope and a scope-specific id. + */ +struct net_shaper_ops { + /** + * @group: create the specified shapers scheduling group + * + * Nest the @leaves shapers identified under the * @node shaper. + * All the shapers belong to the device specified by @binding. + * The @leaves arrays size is specified by @leaves_count. + * Create either the @leaves and the @node shaper; or if they already + * exists, links them together in the desired way. + * @leaves scope must be NET_SHAPER_SCOPE_QUEUE. + */ + int (*group)(struct net_shaper_binding *binding, int leaves_count, + const struct net_shaper *leaves, + const struct net_shaper *node, + struct netlink_ext_ack *extack); + + /** + * @set: Updates the specified shaper + * + * Updates or creates the @shaper on the device specified by @binding. + */ + int (*set)(struct net_shaper_binding *binding, + const struct net_shaper *shaper, + struct netlink_ext_ack *extack); + + /** + * @delete: Removes the specified shaper + * + * Removes the shaper configuration as identified by the given @handle + * on the device specified by @binding, restoring the default behavior. + */ + int (*delete)(struct net_shaper_binding *binding, + const struct net_shaper_handle *handle, + struct netlink_ext_ack *extack); + + /** + * @capabilities: get the shaper features supported by the device + * + * Fills the bitmask @cap with the supported capabilities for the + * specified @scope and device specified by @binding. + */ + void (*capabilities)(struct net_shaper_binding *binding, + enum net_shaper_scope scope, unsigned long *cap); +}; + +#endif diff --git a/include/net/netdev_lock.h b/include/net/netdev_lock.h new file mode 100644 index 000000000000..3d3aef80beac --- /dev/null +++ b/include/net/netdev_lock.h @@ -0,0 +1,138 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _NET_NETDEV_LOCK_H +#define _NET_NETDEV_LOCK_H + +#include <linux/lockdep.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> + +static inline bool netdev_trylock(struct net_device *dev) +{ + return mutex_trylock(&dev->lock); +} + +static inline void netdev_assert_locked(const struct net_device *dev) +{ + lockdep_assert_held(&dev->lock); +} + +static inline void +netdev_assert_locked_or_invisible(const struct net_device *dev) +{ + if (dev->reg_state == NETREG_REGISTERED || + dev->reg_state == NETREG_UNREGISTERING) + netdev_assert_locked(dev); +} + +static inline bool netdev_need_ops_lock(const struct net_device *dev) +{ + bool ret = dev->request_ops_lock || !!dev->queue_mgmt_ops; + +#if IS_ENABLED(CONFIG_NET_SHAPER) + ret |= !!dev->netdev_ops->net_shaper_ops; +#endif + + return ret; +} + +static inline void netdev_lock_ops(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_lock(dev); +} + +static inline void netdev_unlock_ops(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_unlock(dev); +} + +static inline void netdev_lock_ops_to_full(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_assert_locked(dev); + else + netdev_lock(dev); +} + +static inline void netdev_unlock_full_to_ops(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_assert_locked(dev); + else + netdev_unlock(dev); +} + +static inline void netdev_ops_assert_locked(const struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + lockdep_assert_held(&dev->lock); + else + ASSERT_RTNL(); +} + +static inline void +netdev_ops_assert_locked_or_invisible(const struct net_device *dev) +{ + if (dev->reg_state == NETREG_REGISTERED || + dev->reg_state == NETREG_UNREGISTERING) + netdev_ops_assert_locked(dev); +} + +static inline void netdev_lock_ops_compat(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_lock(dev); + else + rtnl_lock(); +} + +static inline void netdev_unlock_ops_compat(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_unlock(dev); + else + rtnl_unlock(); +} + +static inline int netdev_lock_cmp_fn(const struct lockdep_map *a, + const struct lockdep_map *b) +{ + if (a == b) + return 0; + + /* Allow locking multiple devices only under rtnl_lock, + * the exact order doesn't matter. + * Note that upper devices don't lock their ops, so nesting + * mostly happens in batched device removal for now. + */ + return lockdep_rtnl_is_held() ? -1 : 1; +} + +#define netdev_lockdep_set_classes(dev) \ +{ \ + static struct lock_class_key qdisc_tx_busylock_key; \ + static struct lock_class_key qdisc_xmit_lock_key; \ + static struct lock_class_key dev_addr_list_lock_key; \ + static struct lock_class_key dev_instance_lock_key; \ + unsigned int i; \ + \ + (dev)->qdisc_tx_busylock = &qdisc_tx_busylock_key; \ + lockdep_set_class(&(dev)->addr_list_lock, \ + &dev_addr_list_lock_key); \ + lockdep_set_class(&(dev)->lock, \ + &dev_instance_lock_key); \ + lock_set_cmp_fn(&dev->lock, netdev_lock_cmp_fn, NULL); \ + for (i = 0; i < (dev)->num_tx_queues; i++) \ + lockdep_set_class(&(dev)->_tx[i]._xmit_lock, \ + &qdisc_xmit_lock_key); \ +} + +#define netdev_lock_dereference(p, dev) \ + rcu_dereference_protected(p, lockdep_is_held(&(dev)->lock)) + +int netdev_debug_event(struct notifier_block *nb, unsigned long event, + void *ptr); + +#endif diff --git a/include/net/netdev_netlink.h b/include/net/netdev_netlink.h new file mode 100644 index 000000000000..075962dbe743 --- /dev/null +++ b/include/net/netdev_netlink.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NET_NETDEV_NETLINK_H +#define __NET_NETDEV_NETLINK_H + +#include <linux/list.h> + +struct netdev_nl_sock { + struct mutex lock; + struct list_head bindings; +}; + +#endif /* __NET_NETDEV_NETLINK_H */ diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h index d68b0a483431..cd00e0406cf4 100644 --- a/include/net/netdev_queues.h +++ b/include/net/netdev_queues.h @@ -5,6 +5,155 @@ #include <linux/netdevice.h> /** + * struct netdev_config - queue-related configuration for a netdev + * @hds_thresh: HDS Threshold value. + * @hds_config: HDS value from userspace. + */ +struct netdev_config { + u32 hds_thresh; + u8 hds_config; +}; + +/* See the netdev.yaml spec for definition of each statistic */ +struct netdev_queue_stats_rx { + u64 bytes; + u64 packets; + u64 alloc_fail; + + u64 hw_drops; + u64 hw_drop_overruns; + + u64 csum_complete; + u64 csum_unnecessary; + u64 csum_none; + u64 csum_bad; + + u64 hw_gro_packets; + u64 hw_gro_bytes; + u64 hw_gro_wire_packets; + u64 hw_gro_wire_bytes; + + u64 hw_drop_ratelimits; +}; + +struct netdev_queue_stats_tx { + u64 bytes; + u64 packets; + + u64 hw_drops; + u64 hw_drop_errors; + + u64 csum_none; + u64 needs_csum; + + u64 hw_gso_packets; + u64 hw_gso_bytes; + u64 hw_gso_wire_packets; + u64 hw_gso_wire_bytes; + + u64 hw_drop_ratelimits; + + u64 stop; + u64 wake; +}; + +/** + * struct netdev_stat_ops - netdev ops for fine grained stats + * @get_queue_stats_rx: get stats for a given Rx queue + * @get_queue_stats_tx: get stats for a given Tx queue + * @get_base_stats: get base stats (not belonging to any live instance) + * + * Query stats for a given object. The values of the statistics are undefined + * on entry (specifically they are *not* zero-initialized). Drivers should + * assign values only to the statistics they collect. Statistics which are not + * collected must be left undefined. + * + * Queue objects are not necessarily persistent, and only currently active + * queues are queried by the per-queue callbacks. This means that per-queue + * statistics will not generally add up to the total number of events for + * the device. The @get_base_stats callback allows filling in the delta + * between events for currently live queues and overall device history. + * @get_base_stats can also be used to report any miscellaneous packets + * transferred outside of the main set of queues used by the networking stack. + * When the statistics for the entire device are queried, first @get_base_stats + * is issued to collect the delta, and then a series of per-queue callbacks. + * Only statistics which are set in @get_base_stats will be reported + * at the device level, meaning that unlike in queue callbacks, setting + * a statistic to zero in @get_base_stats is a legitimate thing to do. + * This is because @get_base_stats has a second function of designating which + * statistics are in fact correct for the entire device (e.g. when history + * for some of the events is not maintained, and reliable "total" cannot + * be provided). + * + * Ops are called under the instance lock if netdev_need_ops_lock() + * returns true, otherwise under rtnl_lock. + * Device drivers can assume that when collecting total device stats, + * the @get_base_stats and subsequent per-queue calls are performed + * "atomically" (without releasing the relevant lock). + * + * Device drivers are encouraged to reset the per-queue statistics when + * number of queues change. This is because the primary use case for + * per-queue statistics is currently to detect traffic imbalance. + */ +struct netdev_stat_ops { + void (*get_queue_stats_rx)(struct net_device *dev, int idx, + struct netdev_queue_stats_rx *stats); + void (*get_queue_stats_tx)(struct net_device *dev, int idx, + struct netdev_queue_stats_tx *stats); + void (*get_base_stats)(struct net_device *dev, + struct netdev_queue_stats_rx *rx, + struct netdev_queue_stats_tx *tx); +}; + +void netdev_stat_queue_sum(struct net_device *netdev, + int rx_start, int rx_end, + struct netdev_queue_stats_rx *rx_sum, + int tx_start, int tx_end, + struct netdev_queue_stats_tx *tx_sum); + +/** + * struct netdev_queue_mgmt_ops - netdev ops for queue management + * + * @ndo_queue_mem_size: Size of the struct that describes a queue's memory. + * + * @ndo_queue_mem_alloc: Allocate memory for an RX queue at the specified index. + * The new memory is written at the specified address. + * + * @ndo_queue_mem_free: Free memory from an RX queue. + * + * @ndo_queue_start: Start an RX queue with the specified memory and at the + * specified index. + * + * @ndo_queue_stop: Stop the RX queue at the specified index. The stopped + * queue's memory is written at the specified address. + * + * @ndo_queue_get_dma_dev: Get dma device for zero-copy operations to be used + * for this queue. Return NULL on error. + * + * Note that @ndo_queue_mem_alloc and @ndo_queue_mem_free may be called while + * the interface is closed. @ndo_queue_start and @ndo_queue_stop will only + * be called for an interface which is open. + */ +struct netdev_queue_mgmt_ops { + size_t ndo_queue_mem_size; + int (*ndo_queue_mem_alloc)(struct net_device *dev, + void *per_queue_mem, + int idx); + void (*ndo_queue_mem_free)(struct net_device *dev, + void *per_queue_mem); + int (*ndo_queue_start)(struct net_device *dev, + void *per_queue_mem, + int idx); + int (*ndo_queue_stop)(struct net_device *dev, + void *per_queue_mem, + int idx); + struct device * (*ndo_queue_get_dma_dev)(struct net_device *dev, + int idx); +}; + +bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx); + +/** * DOC: Lockless queue stopping / waking helpers. * * The netif_txq_maybe_stop() and __netif_txq_completed_wake() @@ -128,7 +277,7 @@ netdev_txq_completed_mb(struct netdev_queue *dev_queue, netdev_txq_completed_mb(txq, pkts, bytes); \ \ _res = -1; \ - if (pkts && likely(get_desc > start_thrs)) { \ + if (pkts && likely(get_desc >= start_thrs)) { \ _res = 1; \ if (unlikely(netif_tx_queue_stopped(txq)) && \ !(down_cond)) { \ @@ -146,28 +295,39 @@ netdev_txq_completed_mb(struct netdev_queue *dev_queue, #define netif_subqueue_try_stop(dev, idx, get_desc, start_thrs) \ ({ \ - struct netdev_queue *txq; \ + struct netdev_queue *_txq; \ \ - txq = netdev_get_tx_queue(dev, idx); \ - netif_txq_try_stop(txq, get_desc, start_thrs); \ + _txq = netdev_get_tx_queue(dev, idx); \ + netif_txq_try_stop(_txq, get_desc, start_thrs); \ }) +static inline void netif_subqueue_sent(const struct net_device *dev, + unsigned int idx, unsigned int bytes) +{ + struct netdev_queue *txq; + + txq = netdev_get_tx_queue(dev, idx); + netdev_tx_sent_queue(txq, bytes); +} + #define netif_subqueue_maybe_stop(dev, idx, get_desc, stop_thrs, start_thrs) \ ({ \ - struct netdev_queue *txq; \ + struct netdev_queue *_txq; \ \ - txq = netdev_get_tx_queue(dev, idx); \ - netif_txq_maybe_stop(txq, get_desc, stop_thrs, start_thrs); \ + _txq = netdev_get_tx_queue(dev, idx); \ + netif_txq_maybe_stop(_txq, get_desc, stop_thrs, start_thrs); \ }) #define netif_subqueue_completed_wake(dev, idx, pkts, bytes, \ get_desc, start_thrs) \ ({ \ - struct netdev_queue *txq; \ + struct netdev_queue *_txq; \ \ - txq = netdev_get_tx_queue(dev, idx); \ - netif_txq_completed_wake(txq, pkts, bytes, \ + _txq = netdev_get_tx_queue(dev, idx); \ + netif_txq_completed_wake(_txq, pkts, bytes, \ get_desc, start_thrs); \ }) +struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx); + #endif diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h index cdcafb30d437..8cdcd138b33f 100644 --- a/include/net/netdev_rx_queue.h +++ b/include/net/netdev_rx_queue.h @@ -6,6 +6,7 @@ #include <linux/netdevice.h> #include <linux/sysfs.h> #include <net/xdp.h> +#include <net/page_pool/types.h> /* This structure contains an instance of an RX queue. */ struct netdev_rx_queue { @@ -15,12 +16,18 @@ struct netdev_rx_queue { struct rps_dev_flow_table __rcu *rps_flow_table; #endif struct kobject kobj; + const struct attribute_group **groups; struct net_device *dev; netdevice_tracker dev_tracker; + /* All fields below are "ops protected", + * see comment about net_device::lock + */ #ifdef CONFIG_XDP_SOCKETS struct xsk_buff_pool *pool; #endif + struct napi_struct *napi; + struct pp_memory_provider_params mp_params; } ____cacheline_aligned_in_smp; /* @@ -39,7 +46,6 @@ __netif_get_rx_queue(struct net_device *dev, unsigned int rxq) return dev->_rx + rxq; } -#ifdef CONFIG_SYSFS static inline unsigned int get_netdev_rx_queue_index(struct netdev_rx_queue *queue) { @@ -49,5 +55,7 @@ get_netdev_rx_queue_index(struct netdev_rx_queue *queue) BUG_ON(index >= dev->num_rx_queues); return index; } -#endif + +int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq); + #endif diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h index 2c8c2b023848..8d65ffbf57de 100644 --- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h +++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h @@ -13,9 +13,6 @@ extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp; extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp; extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp; -#ifdef CONFIG_NF_CT_PROTO_DCCP -extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp; -#endif #ifdef CONFIG_NF_CT_PROTO_SCTP extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp; #endif diff --git a/include/net/netfilter/ipv4/nf_reject.h b/include/net/netfilter/ipv4/nf_reject.h index c653fcb88354..09de2f2686b5 100644 --- a/include/net/netfilter/ipv4/nf_reject.h +++ b/include/net/netfilter/ipv4/nf_reject.h @@ -10,14 +10,6 @@ void nf_send_unreach(struct sk_buff *skb_in, int code, int hook); void nf_send_reset(struct net *net, struct sock *, struct sk_buff *oldskb, int hook); -const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, - struct tcphdr *_oth, int hook); -struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, - const struct sk_buff *oldskb, - __u8 protocol, int ttl); -void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, - const struct tcphdr *oth); - struct sk_buff *nf_reject_skb_v4_unreach(struct net *net, struct sk_buff *oldskb, const struct net_device *dev, diff --git a/include/net/netfilter/ipv6/nf_reject.h b/include/net/netfilter/ipv6/nf_reject.h index d729344ba644..94ec0b9f2838 100644 --- a/include/net/netfilter/ipv6/nf_reject.h +++ b/include/net/netfilter/ipv6/nf_reject.h @@ -9,16 +9,6 @@ void nf_send_unreach6(struct net *net, struct sk_buff *skb_in, unsigned char cod unsigned int hooknum); void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, int hook); -const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, - struct tcphdr *otcph, - unsigned int *otcplen, int hook); -struct ipv6hdr *nf_reject_ip6hdr_put(struct sk_buff *nskb, - const struct sk_buff *oldskb, - __u8 protocol, int hoplimit); -void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, - const struct sk_buff *oldskb, - const struct tcphdr *oth, unsigned int otcplen); - struct sk_buff *nf_reject_skb_v6_tcp_reset(struct net *net, struct sk_buff *oldskb, const struct net_device *dev, diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 4085765c3370..aa0a7c82199e 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -18,7 +18,6 @@ #include <linux/netfilter/nf_conntrack_common.h> #include <linux/netfilter/nf_conntrack_tcp.h> -#include <linux/netfilter/nf_conntrack_dccp.h> #include <linux/netfilter/nf_conntrack_sctp.h> #include <linux/netfilter/nf_conntrack_proto_gre.h> @@ -31,7 +30,6 @@ struct nf_ct_udp { /* per conntrack: protocol private data */ union nf_conntrack_proto { /* insert conntrack proto private data here */ - struct nf_ct_dccp dccp; struct ip_ct_sctp sctp; struct ip_ct_tcp tcp; struct nf_ct_udp udp; @@ -160,10 +158,6 @@ static inline struct net *nf_ct_net(const struct nf_conn *ct) return read_pnet(&ct->ct_net); } -/* Alter reply tuple (maybe alter helper). */ -void nf_conntrack_alter_reply(struct nf_conn *ct, - const struct nf_conntrack_tuple *newreply); - /* Is this tuple taken? (ignoring any belonging to the given conntrack). */ int nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, @@ -208,8 +202,7 @@ bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, struct nf_conntrack_tuple *tuple); void __nf_ct_refresh_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo, - const struct sk_buff *skb, - u32 extra_jiffies, bool do_acct); + u32 extra_jiffies, unsigned int bytes); /* Refresh conntrack for this many jiffies and do accounting */ static inline void nf_ct_refresh_acct(struct nf_conn *ct, @@ -217,15 +210,14 @@ static inline void nf_ct_refresh_acct(struct nf_conn *ct, const struct sk_buff *skb, u32 extra_jiffies) { - __nf_ct_refresh_acct(ct, ctinfo, skb, extra_jiffies, true); + __nf_ct_refresh_acct(ct, ctinfo, extra_jiffies, skb->len); } /* Refresh conntrack for this many jiffies */ static inline void nf_ct_refresh(struct nf_conn *ct, - const struct sk_buff *skb, u32 extra_jiffies) { - __nf_ct_refresh_acct(ct, 0, skb, extra_jiffies, false); + __nf_ct_refresh_acct(ct, 0, extra_jiffies, 0); } /* kill conntrack and do accounting */ @@ -284,6 +276,16 @@ static inline bool nf_is_loopback_packet(const struct sk_buff *skb) return skb->dev && skb->skb_iif && skb->dev->flags & IFF_LOOPBACK; } +static inline void nf_conntrack_alter_reply(struct nf_conn *ct, + const struct nf_conntrack_tuple *newreply) +{ + /* Must be unconfirmed, so not in hash table yet */ + if (WARN_ON(nf_ct_is_confirmed(ct))) + return; + + ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; +} + #define nfct_time_stamp ((u32)(jiffies)) /* jiffies until ct expires, 0 if already expired */ @@ -302,22 +304,23 @@ static inline bool nf_ct_is_expired(const struct nf_conn *ct) /* use after obtaining a reference count */ static inline bool nf_ct_should_gc(const struct nf_conn *ct) { - return nf_ct_is_expired(ct) && nf_ct_is_confirmed(ct) && - !nf_ct_is_dying(ct); + if (!nf_ct_is_confirmed(ct)) + return false; + + /* load ct->timeout after is_confirmed() test. + * Pairs with __nf_conntrack_confirm() which: + * 1. Increases ct->timeout value + * 2. Inserts ct into rcu hlist + * 3. Sets the confirmed bit + * 4. Unlocks the hlist lock + */ + smp_acquire__after_ctrl_dep(); + + return nf_ct_is_expired(ct) && !nf_ct_is_dying(ct); } #define NF_CT_DAY (86400 * HZ) -/* Set an arbitrary timeout large enough not to ever expire, this save - * us a check for the IPS_OFFLOAD_BIT from the packet path via - * nf_ct_is_expired(). - */ -static inline void nf_ct_offload_timeout(struct nf_conn *ct) -{ - if (nf_ct_expires(ct) < NF_CT_DAY / 2) - WRITE_ONCE(ct->timeout, nfct_time_stamp + NF_CT_DAY); -} - struct kernel_param; int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp); diff --git a/include/net/netfilter/nf_conntrack_act_ct.h b/include/net/netfilter/nf_conntrack_act_ct.h index 078d3c52c03f..e5f2f0b73a9a 100644 --- a/include/net/netfilter/nf_conntrack_act_ct.h +++ b/include/net/netfilter/nf_conntrack_act_ct.h @@ -20,7 +20,22 @@ static inline struct nf_conn_act_ct_ext *nf_conn_act_ct_ext_find(const struct nf #endif } -static inline struct nf_conn_act_ct_ext *nf_conn_act_ct_ext_add(struct nf_conn *ct) +static inline void nf_conn_act_ct_ext_fill(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ +#if IS_ENABLED(CONFIG_NET_ACT_CT) + struct nf_conn_act_ct_ext *act_ct_ext; + + act_ct_ext = nf_conn_act_ct_ext_find(ct); + if (dev_net(skb->dev) == &init_net && act_ct_ext) + act_ct_ext->ifindex[CTINFO2DIR(ctinfo)] = skb->dev->ifindex; +#endif +} + +static inline struct +nf_conn_act_ct_ext *nf_conn_act_ct_ext_add(struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) { #if IS_ENABLED(CONFIG_NET_ACT_CT) struct nf_conn_act_ct_ext *act_ct = nf_ct_ext_find(ct, NF_CT_EXT_ACT_CT); @@ -29,22 +44,11 @@ static inline struct nf_conn_act_ct_ext *nf_conn_act_ct_ext_add(struct nf_conn * return act_ct; act_ct = nf_ct_ext_add(ct, NF_CT_EXT_ACT_CT, GFP_ATOMIC); + nf_conn_act_ct_ext_fill(skb, ct, ctinfo); return act_ct; #else return NULL; #endif } -static inline void nf_conn_act_ct_ext_fill(struct sk_buff *skb, struct nf_conn *ct, - enum ip_conntrack_info ctinfo) -{ -#if IS_ENABLED(CONFIG_NET_ACT_CT) - struct nf_conn_act_ct_ext *act_ct_ext; - - act_ct_ext = nf_conn_act_ct_ext_find(ct); - if (dev_net(skb->dev) == &init_net && act_ct_ext) - act_ct_ext->ifindex[CTINFO2DIR(ctinfo)] = skb->dev->ifindex; -#endif -} - #endif /* _NF_CONNTRACK_ACT_CT_H */ diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h index e227d997fc71..52a06de41aa0 100644 --- a/include/net/netfilter/nf_conntrack_count.h +++ b/include/net/netfilter/nf_conntrack_count.h @@ -15,20 +15,17 @@ struct nf_conncount_list { unsigned int count; /* length of list */ }; -struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family, - unsigned int keylen); -void nf_conncount_destroy(struct net *net, unsigned int family, - struct nf_conncount_data *data); - -unsigned int nf_conncount_count(struct net *net, - struct nf_conncount_data *data, - const u32 *key, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone); - -int nf_conncount_add(struct net *net, struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone); +struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen); +void nf_conncount_destroy(struct net *net, struct nf_conncount_data *data); + +unsigned int nf_conncount_count_skb(struct net *net, + const struct sk_buff *skb, + u16 l3num, + struct nf_conncount_data *data, + const u32 *key); + +int nf_conncount_add_skb(struct net *net, const struct sk_buff *skb, + u16 l3num, struct nf_conncount_list *list); void nf_conncount_list_init(struct nf_conncount_list *list); diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h index 0c1dac318e02..8dcf7c371ee9 100644 --- a/include/net/netfilter/nf_conntrack_ecache.h +++ b/include/net/netfilter/nf_conntrack_ecache.h @@ -12,6 +12,7 @@ #include <linux/netfilter/nf_conntrack_common.h> #include <linux/netfilter/nf_conntrack_tuple_common.h> #include <net/netfilter/nf_conntrack_extend.h> +#include <asm/local64.h> enum nf_ct_ecache_state { NFCT_ECACHE_DESTROY_FAIL, /* tried but failed to send destroy event */ @@ -20,6 +21,9 @@ enum nf_ct_ecache_state { struct nf_conntrack_ecache { unsigned long cache; /* bitops want long */ +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + local64_t timestamp; /* event timestamp, in nanoseconds */ +#endif u16 ctmask; /* bitmask of ct events to be delivered */ u16 expmask; /* bitmask of expect events to be delivered */ u32 missed; /* missed events */ @@ -108,6 +112,14 @@ nf_conntrack_event_cache(enum ip_conntrack_events event, struct nf_conn *ct) if (e == NULL) return; +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + /* renew only if this is the first cached event, so that the + * timestamp reflects the first, not the last, generated event. + */ + if (local64_read(&e->timestamp) && READ_ONCE(e->cache) == 0) + local64_set(&e->timestamp, ktime_get_real_ns()); +#endif + set_bit(event, &e->cache); #endif } diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 1f47bef51722..cd5020835a6d 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -30,7 +30,7 @@ struct nf_conntrack_l4proto { /* called by gc worker if table is full */ bool (*can_early_drop)(const struct nf_conn *ct); - /* convert protoinfo to nfnetink attributes */ + /* convert protoinfo to nfnetlink attributes */ int (*to_nlattr)(struct sk_buff *skb, struct nlattr *nla, struct nf_conn *ct, bool destroy); @@ -117,11 +117,6 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); -int nf_conntrack_dccp_packet(struct nf_conn *ct, - struct sk_buff *skb, - unsigned int dataoff, - enum ip_conntrack_info ctinfo, - const struct nf_hook_state *state); int nf_conntrack_sctp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, @@ -137,7 +132,6 @@ void nf_conntrack_generic_init_net(struct net *net); void nf_conntrack_tcp_init_net(struct net *net); void nf_conntrack_udp_init_net(struct net *net); void nf_conntrack_gre_init_net(struct net *net); -void nf_conntrack_dccp_init_net(struct net *net); void nf_conntrack_sctp_init_net(struct net *net); void nf_conntrack_icmp_init_net(struct net *net); void nf_conntrack_icmpv6_init_net(struct net *net); @@ -223,13 +217,6 @@ static inline bool nf_conntrack_tcp_established(const struct nf_conn *ct) } #endif -#ifdef CONFIG_NF_CT_PROTO_DCCP -static inline struct nf_dccp_net *nf_dccp_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.dccp; -} -#endif - #ifdef CONFIG_NF_CT_PROTO_SCTP static inline struct nf_sctp_net *nf_sctp_pernet(struct net *net) { diff --git a/include/net/netfilter/nf_conntrack_labels.h b/include/net/netfilter/nf_conntrack_labels.h index fcb19a4e8f2b..6903f72bcc15 100644 --- a/include/net/netfilter/nf_conntrack_labels.h +++ b/include/net/netfilter/nf_conntrack_labels.h @@ -39,7 +39,7 @@ static inline struct nf_conn_labels *nf_ct_labels_ext_add(struct nf_conn *ct) #ifdef CONFIG_NF_CONNTRACK_LABELS struct net *net = nf_ct_net(ct); - if (net->ct.labels_used == 0) + if (atomic_read(&net->ct.labels_used) == 0) return NULL; return nf_ct_ext_add(ct, NF_CT_EXT_LABELS, GFP_ATOMIC); diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index d466e1a3b0b1..b09c11c048d5 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -53,6 +53,7 @@ struct nf_flowtable_type { struct list_head list; int family; int (*init)(struct nf_flowtable *ft); + bool (*gc)(const struct flow_offload *flow); int (*setup)(struct nf_flowtable *ft, struct net_device *dev, enum flow_block_command cmd); @@ -61,6 +62,8 @@ struct nf_flowtable_type { enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule); void (*free)(struct nf_flowtable *ft); + void (*get)(struct nf_flowtable *ft); + void (*put)(struct nf_flowtable *ft); nf_hookfn *hook; struct module *owner; }; @@ -71,12 +74,13 @@ enum nf_flowtable_flags { }; struct nf_flowtable { - struct list_head list; - struct rhashtable rhashtable; - int priority; + unsigned int flags; /* readonly in datapath */ + int priority; /* control path (padding hole) */ + struct rhashtable rhashtable; /* datapath, read-mostly members come first */ + + struct list_head list; /* slowpath parts */ const struct nf_flowtable_type *type; struct delayed_work gc_work; - unsigned int flags; struct flow_block flow_block; struct rw_semaphore flow_block_lock; /* Guards flow_block */ possible_net_t net; @@ -103,6 +107,19 @@ enum flow_offload_xmit_type { #define NF_FLOW_TABLE_ENCAP_MAX 2 +struct flow_offload_tunnel { + union { + struct in_addr src_v4; + struct in6_addr src_v6; + }; + union { + struct in_addr dst_v4; + struct in6_addr dst_v6; + }; + + u8 l3_proto; +}; + struct flow_offload_tuple { union { struct in_addr src_v4; @@ -126,22 +143,25 @@ struct flow_offload_tuple { __be16 proto; } encap[NF_FLOW_TABLE_ENCAP_MAX]; + struct flow_offload_tunnel tun; + /* All members above are keys for lookups, see flow_offload_hash(). */ struct { } __hash; u8 dir:2, xmit_type:3, encap_num:2, + tun_num:2, in_vlan_ingress:2; u16 mtu; union { struct { struct dst_entry *dst_cache; + u32 ifidx; u32 dst_cookie; }; struct { u32 ifidx; - u32 hw_ifidx; u8 h_source[ETH_ALEN]; u8 h_dest[ETH_ALEN]; } out; @@ -159,6 +179,7 @@ struct flow_offload_tuple_rhash { enum nf_flow_flags { NF_FLOW_SNAT, NF_FLOW_DNAT, + NF_FLOW_CLOSING, NF_FLOW_TEARDOWN, NF_FLOW_HW, NF_FLOW_HW_DYING, @@ -201,7 +222,9 @@ struct nf_flow_route { u16 id; __be16 proto; } encap[NF_FLOW_TABLE_ENCAP_MAX]; + struct flow_offload_tunnel tun; u8 num_encaps:2, + num_tuns:2, ingress_vlans:2; } in; struct { @@ -217,6 +240,12 @@ struct nf_flow_route { struct flow_offload *flow_offload_alloc(struct nf_conn *ct); void flow_offload_free(struct flow_offload *flow); +struct nft_flowtable; +struct nft_pktinfo; +int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct, + struct nf_flow_route *route, enum ip_conntrack_dir dir, + struct nft_flowtable *ft); + static inline int nf_flow_table_offload_add_cb(struct nf_flowtable *flow_table, flow_setup_cb_t *cb, void *cb_priv) @@ -239,6 +268,11 @@ nf_flow_table_offload_add_cb(struct nf_flowtable *flow_table, } list_add_tail(&block_cb->list, &block->cb_list); + up_write(&flow_table->flow_block_lock); + + if (flow_table->type->get) + flow_table->type->get(flow_table); + return 0; unlock: up_write(&flow_table->flow_block_lock); @@ -261,10 +295,13 @@ nf_flow_table_offload_del_cb(struct nf_flowtable *flow_table, WARN_ON(true); } up_write(&flow_table->flow_block_lock); + + if (flow_table->type->put) + flow_table->type->put(flow_table); } void flow_offload_route_init(struct flow_offload *flow, - const struct nf_flow_route *route); + struct nf_flow_route *route); int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow); void flow_offload_refresh(struct nf_flowtable *flow_table, @@ -293,11 +330,26 @@ struct flow_ports { __be16 source, dest; }; +struct nf_flowtable *nf_flowtable_by_dev(const struct net_device *dev); +int nf_flow_offload_xdp_setup(struct nf_flowtable *flowtable, + struct net_device *dev, + enum flow_block_command cmd); + unsigned int nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); unsigned int nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); +#if (IS_BUILTIN(CONFIG_NF_FLOW_TABLE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \ + (IS_MODULE(CONFIG_NF_FLOW_TABLE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) +extern int nf_flow_register_bpf(void); +#else +static inline int nf_flow_register_bpf(void) +{ + return 0; +} +#endif + #define MODULE_ALIAS_NF_FLOWTABLE(family) \ MODULE_ALIAS("nf-flowtable-" __stringify(family)) @@ -324,7 +376,7 @@ int nf_flow_rule_route_ipv6(struct net *net, struct flow_offload *flow, int nf_flow_table_offload_init(void); void nf_flow_table_offload_exit(void); -static inline __be16 nf_flow_pppoe_proto(const struct sk_buff *skb) +static inline __be16 __nf_flow_pppoe_proto(const struct sk_buff *skb) { __be16 proto; @@ -340,6 +392,16 @@ static inline __be16 nf_flow_pppoe_proto(const struct sk_buff *skb) return 0; } +static inline bool nf_flow_pppoe_proto(struct sk_buff *skb, __be16 *inner_proto) +{ + if (!pskb_may_pull(skb, ETH_HLEN + PPPOE_SES_HLEN)) + return false; + + *inner_proto = __nf_flow_pppoe_proto(skb); + + return true; +} + #define NF_FLOW_TABLE_STAT_INC(net, count) __this_cpu_inc((net)->ft.stat->count) #define NF_FLOW_TABLE_STAT_DEC(net, count) __this_cpu_dec((net)->ft.stat->count) #define NF_FLOW_TABLE_STAT_INC_ATOMIC(net, count) \ diff --git a/include/net/netfilter/nf_hooks_lwtunnel.h b/include/net/netfilter/nf_hooks_lwtunnel.h index 52e27920f829..cef7a4eb8f97 100644 --- a/include/net/netfilter/nf_hooks_lwtunnel.h +++ b/include/net/netfilter/nf_hooks_lwtunnel.h @@ -2,6 +2,6 @@ #include <linux/types.h> #ifdef CONFIG_SYSCTL -int nf_hooks_lwtunnel_sysctl_handler(struct ctl_table *table, int write, +int nf_hooks_lwtunnel_sysctl_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); #endif diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h index e55eedc84ed7..00506792a06d 100644 --- a/include/net/netfilter/nf_log.h +++ b/include/net/netfilter/nf_log.h @@ -59,6 +59,9 @@ extern int sysctl_nf_log_all_netns; int nf_log_register(u_int8_t pf, struct nf_logger *logger); void nf_log_unregister(struct nf_logger *logger); +/* Check if any logger is registered for a given protocol family. */ +bool nf_log_is_registered(u_int8_t pf); + int nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger); void nf_log_unset(struct net *net, const struct nf_logger *logger); diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index c81021ab07aa..4aeffddb7586 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -35,7 +35,6 @@ struct nf_queue_handler { void nf_register_queue_handler(const struct nf_queue_handler *qh); void nf_unregister_queue_handler(void); -void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict); bool nf_queue_entry_get_refs(struct nf_queue_entry *entry); void nf_queue_entry_free(struct nf_queue_entry *entry); diff --git a/include/net/netfilter/nf_reject.h b/include/net/netfilter/nf_reject.h index 7c669792fb9c..f1db33bc6bf8 100644 --- a/include/net/netfilter/nf_reject.h +++ b/include/net/netfilter/nf_reject.h @@ -34,7 +34,6 @@ static inline bool nf_reject_verify_csum(struct sk_buff *skb, int dataoff, /* Protocols with partial checksums. */ case IPPROTO_UDPLITE: - case IPPROTO_DCCP: return false; } return true; diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index dd40c75011d2..fab7dc73f738 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -2,7 +2,7 @@ #ifndef _NET_NF_TABLES_H #define _NET_NF_TABLES_H -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <linux/list.h> #include <linux/netfilter.h> #include <linux/netfilter/nfnetlink.h> @@ -178,9 +178,9 @@ static inline __be32 nft_reg_load_be32(const u32 *sreg) return *(__force __be32 *)sreg; } -static inline void nft_reg_store64(u32 *dreg, u64 val) +static inline void nft_reg_store64(u64 *dreg, u64 val) { - put_unaligned(val, (u64 *)dreg); + put_unaligned(val, dreg); } static inline u64 nft_reg_load64(const u32 *sreg) @@ -205,9 +205,11 @@ static inline void nft_data_copy(u32 *dst, const struct nft_data *src, * @nla: netlink attributes * @portid: netlink portID of the original message * @seq: netlink sequence number + * @flags: modifiers to new request * @family: protocol family * @level: depth of the chains * @report: notify via unicast netlink message + * @reg_inited: bitmap of initialised registers */ struct nft_ctx { struct net *net; @@ -220,6 +222,7 @@ struct nft_ctx { u8 family; u8 level; bool report; + DECLARE_BITMAP(reg_inited, NFT_REG32_NUM); }; enum nft_data_desc_flags { @@ -253,7 +256,8 @@ static inline enum nft_registers nft_type_to_reg(enum nft_data_types type) int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest); int nft_dump_register(struct sk_buff *skb, unsigned int attr, unsigned int reg); -int nft_parse_register_load(const struct nlattr *attr, u8 *sreg, u32 len); +int nft_parse_register_load(const struct nft_ctx *ctx, + const struct nlattr *attr, u8 *sreg, u32 len); int nft_parse_register_store(const struct nft_ctx *ctx, const struct nlattr *attr, u8 *dreg, const struct nft_data *data, @@ -274,11 +278,15 @@ struct nft_userdata { unsigned char data[]; }; +/* placeholder structure for opaque set element backend representation. */ +struct nft_elem_priv { }; + /** * struct nft_set_elem - generic representation of set elements * * @key: element key * @key_end: closing element key + * @data: element data * @priv: element private data and extensions */ struct nft_set_elem { @@ -294,19 +302,39 @@ struct nft_set_elem { u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; struct nft_data val; } data; - void *priv; + struct nft_elem_priv *priv; +}; + +static inline void *nft_elem_priv_cast(const struct nft_elem_priv *priv) +{ + return (void *)priv; +} + + +/** + * enum nft_iter_type - nftables set iterator type + * + * @NFT_ITER_UNSPEC: unspecified, to catch errors + * @NFT_ITER_READ: read-only iteration over set elements + * @NFT_ITER_UPDATE: iteration under mutex to update set element state + */ +enum nft_iter_type { + NFT_ITER_UNSPEC, + NFT_ITER_READ, + NFT_ITER_UPDATE, }; struct nft_set; struct nft_set_iter { u8 genmask; + enum nft_iter_type type:8; unsigned int count; unsigned int skip; int err; int (*fn)(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, - struct nft_set_elem *elem); + struct nft_elem_priv *elem_priv); }; /** @@ -317,10 +345,10 @@ struct nft_set_iter { * @dtype: data type * @dlen: data length * @objtype: object type - * @flags: flags * @size: number of set elements * @policy: set policy * @gc_int: garbage collector interval + * @timeout: element timeout * @field_len: length of each field in concatenation, bytes * @field_count: number of concatenated fields in element * @expr: set must support for expressions @@ -343,9 +371,9 @@ struct nft_set_desc { /** * enum nft_set_class - performance class * - * @NFT_LOOKUP_O_1: constant, O(1) - * @NFT_LOOKUP_O_LOG_N: logarithmic, O(log N) - * @NFT_LOOKUP_O_N: linear, O(N) + * @NFT_SET_CLASS_O_1: constant, O(1) + * @NFT_SET_CLASS_O_LOG_N: logarithmic, O(log N) + * @NFT_SET_CLASS_O_N: linear, O(N) */ enum nft_set_class { NFT_SET_CLASS_O_1, @@ -392,7 +420,7 @@ struct nft_expr_info; int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla, struct nft_expr_info *info); -int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src); +int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src, gfp_t gfp); void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr); int nft_expr_dump(struct sk_buff *skb, unsigned int attr, const struct nft_expr *expr, bool reset); @@ -414,9 +442,16 @@ struct nft_set_ext; * @remove: remove element from set * @walk: iterate over all set elements * @get: get set elements + * @ksize: kernel set size + * @usize: userspace set size + * @adjust_maxsize: delta to adjust maximum set size + * @commit: commit set elements + * @abort: abort set elements * @privsize: function to return size of set private data + * @estimate: estimate the required memory size and the lookup complexity class * @init: initialize private data of new set instance * @destroy: destroy private data of set instance + * @gc_init: initialize garbage collection * @elemsize: element private size * * Operations lookup, update and delete have simpler interfaces, are faster @@ -424,45 +459,43 @@ struct nft_set_ext; * control plane functions. */ struct nft_set_ops { - bool (*lookup)(const struct net *net, + const struct nft_set_ext * (*lookup)(const struct net *net, const struct nft_set *set, + const u32 *key); + const struct nft_set_ext * (*update)(struct nft_set *set, const u32 *key, - const struct nft_set_ext **ext); - bool (*update)(struct nft_set *set, - const u32 *key, - void *(*new)(struct nft_set *, - const struct nft_expr *, - struct nft_regs *), const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_set_ext **ext); + struct nft_regs *regs); bool (*delete)(const struct nft_set *set, const u32 *key); int (*insert)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, - struct nft_set_ext **ext); + struct nft_elem_priv **priv); void (*activate)(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem); - void * (*deactivate)(const struct net *net, + struct nft_elem_priv *elem_priv); + struct nft_elem_priv * (*deactivate)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem); - bool (*flush)(const struct net *net, + void (*flush)(const struct net *net, const struct nft_set *set, - void *priv); + struct nft_elem_priv *priv); void (*remove)(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem); + struct nft_elem_priv *elem_priv); void (*walk)(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_iter *iter); - void * (*get)(const struct net *net, + struct nft_elem_priv * (*get)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags); - void (*commit)(const struct nft_set *set); + u32 (*ksize)(u32 size); + u32 (*usize)(u32 size); + u32 (*adjust_maxsize)(const struct nft_set *set); + void (*commit)(struct nft_set *set); void (*abort)(const struct nft_set *set); u64 (*privsize)(const struct nlattr * const nla[], const struct nft_set_desc *desc); @@ -523,6 +556,7 @@ struct nft_set_elem_expr { * @size: maximum set size * @field_len: length of each field in concatenation, bytes * @field_count: number of concatenated fields in element + * @in_update_walk: true during ->walk() in transaction phase * @use: number of rules references to this set * @nelems: number of elements * @ndeact: number of deactivated elements queued for removal @@ -531,13 +565,16 @@ struct nft_set_elem_expr { * @policy: set parameterization (see enum nft_set_policies) * @udlen: user data length * @udata: user data - * @expr: stateful expression + * @pending_update: list of pending update set element * @ops: set ops * @flags: set flags * @dead: set will be freed, never cleared * @genmask: generation mask * @klen: key length * @dlen: data length + * @num_exprs: numbers of exprs + * @exprs: stateful expression + * @catchall_list: list of catch-all set element * @data: private set data */ struct nft_set { @@ -554,6 +591,7 @@ struct nft_set { u32 size; u8 field_len[NFT_REG32_COUNT]; u8 field_count; + bool in_update_walk; u32 use; atomic_t nelems; u32 ndeact; @@ -587,6 +625,11 @@ static inline void *nft_set_priv(const struct nft_set *set) return (void *)set->data; } +static inline enum nft_data_types nft_set_datatype(const struct nft_set *set) +{ + return set->dtype == NFT_DATA_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE; +} + static inline bool nft_set_gc_is_pending(const struct nft_set *s) { return refcount_read(&s->refs) != 1; @@ -646,9 +689,8 @@ void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set); * @NFT_SET_EXT_DATA: mapping data * @NFT_SET_EXT_FLAGS: element flags * @NFT_SET_EXT_TIMEOUT: element timeout - * @NFT_SET_EXT_EXPIRATION: element expiration time * @NFT_SET_EXT_USERDATA: user data associated with the element - * @NFT_SET_EXT_EXPRESSIONS: expressions assiciated with the element + * @NFT_SET_EXT_EXPRESSIONS: expressions associated with the element * @NFT_SET_EXT_OBJREF: stateful object reference associated with element * @NFT_SET_EXT_NUM: number of extension types */ @@ -658,7 +700,6 @@ enum nft_set_extensions { NFT_SET_EXT_DATA, NFT_SET_EXT_FLAGS, NFT_SET_EXT_TIMEOUT, - NFT_SET_EXT_EXPIRATION, NFT_SET_EXT_USERDATA, NFT_SET_EXT_EXPRESSIONS, NFT_SET_EXT_OBJREF, @@ -683,6 +724,7 @@ extern const struct nft_set_ext_type nft_set_ext_types[]; * * @len: length of extension area * @offset: offsets of individual extension types + * @ext_len: length of the expected extension(used to sanity check) */ struct nft_set_ext_tmpl { u16 len; @@ -693,15 +735,18 @@ struct nft_set_ext_tmpl { /** * struct nft_set_ext - set extensions * - * @genmask: generation mask + * @genmask: generation mask, but also flags (see NFT_SET_ELEM_DEAD_BIT) * @offset: offsets of individual extension types * @data: beginning of extension data + * + * This structure must be aligned to word size, otherwise atomic bitops + * on genmask field can cause alignment failure on some archs. */ struct nft_set_ext { u8 genmask; u8 offset[NFT_SET_EXT_NUM]; char data[]; -}; +} __aligned(BITS_PER_LONG / 8); static inline void nft_set_ext_prepare(struct nft_set_ext_tmpl *tmpl) { @@ -769,14 +814,14 @@ static inline u8 *nft_set_ext_flags(const struct nft_set_ext *ext) return nft_set_ext(ext, NFT_SET_EXT_FLAGS); } -static inline u64 *nft_set_ext_timeout(const struct nft_set_ext *ext) -{ - return nft_set_ext(ext, NFT_SET_EXT_TIMEOUT); -} +struct nft_timeout { + u64 timeout; + u64 expiration; +}; -static inline u64 *nft_set_ext_expiration(const struct nft_set_ext *ext) +static inline struct nft_timeout *nft_set_ext_timeout(const struct nft_set_ext *ext) { - return nft_set_ext(ext, NFT_SET_EXT_EXPIRATION); + return nft_set_ext(ext, NFT_SET_EXT_TIMEOUT); } static inline struct nft_userdata *nft_set_ext_userdata(const struct nft_set_ext *ext) @@ -789,16 +834,25 @@ static inline struct nft_set_elem_expr *nft_set_ext_expr(const struct nft_set_ex return nft_set_ext(ext, NFT_SET_EXT_EXPRESSIONS); } +static inline bool __nft_set_elem_expired(const struct nft_set_ext *ext, + u64 tstamp) +{ + if (!nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) || + READ_ONCE(nft_set_ext_timeout(ext)->timeout) == 0) + return false; + + return time_after_eq64(tstamp, READ_ONCE(nft_set_ext_timeout(ext)->expiration)); +} + static inline bool nft_set_elem_expired(const struct nft_set_ext *ext) { - return nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION) && - time_is_before_eq_jiffies64(*nft_set_ext_expiration(ext)); + return __nft_set_elem_expired(ext, get_jiffies_64()); } static inline struct nft_set_ext *nft_set_elem_ext(const struct nft_set *set, - void *elem) + const struct nft_elem_priv *elem_priv) { - return elem + set->ops->elemsize; + return (void *)elem_priv + set->ops->elemsize; } static inline struct nft_object **nft_set_ext_obj(const struct nft_set_ext *ext) @@ -810,16 +864,19 @@ struct nft_expr *nft_set_elem_expr_alloc(const struct nft_ctx *ctx, const struct nft_set *set, const struct nlattr *attr); -void *nft_set_elem_init(const struct nft_set *set, - const struct nft_set_ext_tmpl *tmpl, - const u32 *key, const u32 *key_end, const u32 *data, - u64 timeout, u64 expiration, gfp_t gfp); +struct nft_elem_priv *nft_set_elem_init(const struct nft_set *set, + const struct nft_set_ext_tmpl *tmpl, + const u32 *key, const u32 *key_end, + const u32 *data, + u64 timeout, u64 expiration, gfp_t gfp); int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set, struct nft_expr *expr_array[]); -void nft_set_elem_destroy(const struct nft_set *set, void *elem, +void nft_set_elem_destroy(const struct nft_set *set, + const struct nft_elem_priv *elem_priv, bool destroy_expr); void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, - const struct nft_set *set, void *elem); + const struct nft_set *set, + const struct nft_elem_priv *elem_priv); struct nft_expr_ops; /** @@ -828,6 +885,7 @@ struct nft_expr_ops; * @select_ops: function to select nft_expr_ops * @release_ops: release nft_expr_ops * @ops: default ops, used when no select_ops functions is present + * @inner_ops: inner ops, used for inner packet operation * @list: used internally * @name: Identifier * @owner: module reference @@ -869,14 +927,22 @@ struct nft_offload_ctx; * struct nft_expr_ops - nf_tables expression operations * * @eval: Expression evaluation function + * @clone: Expression clone function * @size: full expression size, including private data size * @init: initialization function * @activate: activate expression in the next generation * @deactivate: deactivate expression in next generation * @destroy: destruction function, called after synchronize_rcu + * @destroy_clone: destruction clone function * @dump: function to dump parameters - * @type: expression type * @validate: validate expression, called during loop detection + * @reduce: reduce expression + * @gc: garbage collection expression + * @offload: hardware offload expression + * @offload_action: function to report true/false to allocate one slot or not in the flow + * offload array + * @offload_stats: function to synchronize hardware stats via updating the counter expression + * @type: expression type * @data: extra data to attach to this expression operation */ struct nft_expr_ops { @@ -884,7 +950,7 @@ struct nft_expr_ops { struct nft_regs *regs, const struct nft_pktinfo *pkt); int (*clone)(struct nft_expr *dst, - const struct nft_expr *src); + const struct nft_expr *src, gfp_t gfp); unsigned int size; int (*init)(const struct nft_ctx *ctx, @@ -903,8 +969,7 @@ struct nft_expr_ops { const struct nft_expr *expr, bool reset); int (*validate)(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data); + const struct nft_expr *expr); bool (*reduce)(struct nft_regs_track *track, const struct nft_expr *expr); bool (*gc)(struct net *net, @@ -1029,14 +1094,21 @@ struct nft_rule_blob { /** * struct nft_chain - nf_tables chain * + * @blob_gen_0: rule blob pointer to the current generation + * @blob_gen_1: rule blob pointer to the future generation * @rules: list of rules in the chain * @list: used internally * @rhlhead: used internally * @table: table that this chain belongs to * @handle: chain handle * @use: number of jump references to this chain - * @flags: bitmask of enum nft_chain_flags + * @flags: bitmask of enum NFTA_CHAIN_FLAGS + * @bound: bind or not + * @genmask: generation mask * @name: name of the chain + * @udlen: user data length + * @udata: user data in the chain + * @blob_next: rule blob pointer to the next in the chain */ struct nft_chain { struct nft_rule_blob __rcu *blob_gen_0; @@ -1061,7 +1133,7 @@ struct nft_chain { int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain); int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, - struct nft_set_elem *elem); + struct nft_elem_priv *elem_priv); int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set); int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); @@ -1113,7 +1185,7 @@ static inline bool nft_chain_is_bound(struct nft_chain *chain) int nft_chain_add(struct nft_table *table, struct nft_chain *chain); void nft_chain_del(struct nft_chain *chain); -void nf_tables_chain_destroy(struct nft_ctx *ctx); +void nf_tables_chain_destroy(struct nft_chain *chain); struct nft_stats { u64 bytes; @@ -1123,10 +1195,17 @@ struct nft_stats { struct nft_hook { struct list_head list; - struct nf_hook_ops ops; + struct list_head ops_list; struct rcu_head rcu; + char ifname[IFNAMSIZ]; + u8 ifnamelen; }; +struct nf_hook_ops *nft_hook_find_ops(const struct nft_hook *hook, + const struct net_device *dev); +struct nf_hook_ops *nft_hook_find_ops_rcu(const struct nft_hook *hook, + const struct net_device *dev); + /** * struct nft_base_chain - nf_tables base chain * @@ -1134,6 +1213,7 @@ struct nft_hook { * @hook_list: list of netfilter hooks (for NFPROTO_NETDEV family) * @type: chain type * @policy: default policy + * @flags: indicate the base chain disabled or not * @stats: per-cpu chain stats * @chain: the chain * @flow_block: flow block (for hardware offload) @@ -1159,8 +1239,6 @@ static inline bool nft_is_base_chain(const struct nft_chain *chain) return chain->flags & NFT_CHAIN_BASE; } -int __nft_release_basechain(struct nft_ctx *ctx); - unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv); static inline bool nft_use_inc(u32 *use) @@ -1198,10 +1276,13 @@ static inline void nft_use_inc_restore(u32 *use) * @hgenerator: handle generator state * @handle: table handle * @use: number of chain references to this table + * @family:address family * @flags: table flag (see enum nft_table_flags) * @genmask: generation mask - * @afinfo: address family info + * @nlpid: netlink port ID * @name: name of the table + * @udlen: length of the user data + * @udata: user data * @validate_state: internal, set when transaction adds jumps */ struct nft_table { @@ -1229,6 +1310,12 @@ static inline bool nft_table_has_owner(const struct nft_table *table) return table->flags & NFT_TABLE_F_OWNER; } +static inline bool nft_table_is_orphan(const struct nft_table *table) +{ + return (table->flags & (NFT_TABLE_F_OWNER | NFT_TABLE_F_PERSIST)) == + NFT_TABLE_F_PERSIST; +} + static inline bool nft_base_chain_netdev(int family, u32 hooknum) { return family == NFPROTO_NETDEV || @@ -1259,11 +1346,13 @@ struct nft_object_hash_key { * struct nft_object - nf_tables stateful object * * @list: table stateful object list node - * @key: keys that identify this object * @rhlhead: nft_objname_ht node + * @key: keys that identify this object * @genmask: generation mask * @use: number of references to this stateful object * @handle: unique object handle + * @udlen: length of user data + * @udata: user data * @ops: object operations * @data: object data, layout depends on type */ @@ -1307,6 +1396,7 @@ void nft_obj_notify(struct net *net, const struct nft_table *table, * @type: stateful object numeric type * @owner: module owner * @maxattr: maximum netlink attribute + * @family: address family for AF-specific object types * @policy: netlink attribute policy */ struct nft_object_type { @@ -1316,6 +1406,7 @@ struct nft_object_type { struct list_head list; u32 type; unsigned int maxattr; + u8 family; struct module *owner; const struct nla_policy *policy; }; @@ -1329,6 +1420,7 @@ struct nft_object_type { * @destroy: release existing stateful object * @dump: netlink dump stateful object * @update: update stateful object + * @type: pointer to object type */ struct nft_object_ops { void (*eval)(struct nft_object *obj, @@ -1364,9 +1456,8 @@ void nft_unregister_obj(struct nft_object_type *obj_type); * @genmask: generation mask * @use: number of references to this flow table * @handle: unique object handle - * @dev_name: array of device names + * @hook_list: hook list for hooks per net_device in flowtables * @data: rhashtable and garbage collector - * @ops: array of hooks */ struct nft_flowtable { struct list_head list; @@ -1382,7 +1473,8 @@ struct nft_flowtable { struct nf_flowtable data; }; -struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, +struct nft_flowtable *nft_flowtable_lookup(const struct net *net, + const struct nft_table *table, const struct nlattr *nla, u8 genmask); @@ -1536,41 +1628,68 @@ static inline int nft_set_elem_is_dead(const struct nft_set_ext *ext) } /** - * struct nft_trans - nf_tables object update in transaction + * struct nft_trans - nf_tables object update in transaction * - * @list: used internally - * @binding_list: list of objects with possible bindings - * @msg_type: message type - * @put_net: ctx->net needs to be put - * @ctx: transaction context - * @data: internal information related to the transaction + * @list: used internally + * @net: struct net + * @table: struct nft_table the object resides in + * @msg_type: message type + * @seq: netlink sequence number + * @flags: modifiers to new request + * @report: notify via unicast netlink message + * @put_net: net needs to be put + * + * This is the information common to all objects in the transaction, + * this must always be the first member of derived sub-types. */ struct nft_trans { struct list_head list; - struct list_head binding_list; + struct net *net; + struct nft_table *table; int msg_type; - bool put_net; - struct nft_ctx ctx; - char data[]; + u32 seq; + u16 flags; + u8 report:1; + u8 put_net:1; +}; + +/** + * struct nft_trans_binding - nf_tables object with binding support in transaction + * @nft_trans: base structure, MUST be first member + * @binding_list: list of objects with possible bindings + * + * This is the base type used by objects that can be bound to a chain. + */ +struct nft_trans_binding { + struct nft_trans nft_trans; + struct list_head binding_list; }; struct nft_trans_rule { + struct nft_trans nft_trans; struct nft_rule *rule; + struct nft_chain *chain; struct nft_flow_rule *flow; u32 rule_id; bool bound; }; -#define nft_trans_rule(trans) \ - (((struct nft_trans_rule *)trans->data)->rule) -#define nft_trans_flow_rule(trans) \ - (((struct nft_trans_rule *)trans->data)->flow) -#define nft_trans_rule_id(trans) \ - (((struct nft_trans_rule *)trans->data)->rule_id) -#define nft_trans_rule_bound(trans) \ - (((struct nft_trans_rule *)trans->data)->bound) +#define nft_trans_container_rule(trans) \ + container_of(trans, struct nft_trans_rule, nft_trans) +#define nft_trans_rule(trans) \ + nft_trans_container_rule(trans)->rule +#define nft_trans_flow_rule(trans) \ + nft_trans_container_rule(trans)->flow +#define nft_trans_rule_id(trans) \ + nft_trans_container_rule(trans)->rule_id +#define nft_trans_rule_bound(trans) \ + nft_trans_container_rule(trans)->bound +#define nft_trans_rule_chain(trans) \ + nft_trans_container_rule(trans)->chain struct nft_trans_set { + struct nft_trans_binding nft_trans_binding; + struct list_head list_trans_newset; struct nft_set *set; u32 set_id; u32 gc_int; @@ -1580,100 +1699,132 @@ struct nft_trans_set { u32 size; }; -#define nft_trans_set(trans) \ - (((struct nft_trans_set *)trans->data)->set) -#define nft_trans_set_id(trans) \ - (((struct nft_trans_set *)trans->data)->set_id) -#define nft_trans_set_bound(trans) \ - (((struct nft_trans_set *)trans->data)->bound) -#define nft_trans_set_update(trans) \ - (((struct nft_trans_set *)trans->data)->update) -#define nft_trans_set_timeout(trans) \ - (((struct nft_trans_set *)trans->data)->timeout) -#define nft_trans_set_gc_int(trans) \ - (((struct nft_trans_set *)trans->data)->gc_int) -#define nft_trans_set_size(trans) \ - (((struct nft_trans_set *)trans->data)->size) +#define nft_trans_container_set(t) \ + container_of(t, struct nft_trans_set, nft_trans_binding.nft_trans) +#define nft_trans_set(trans) \ + nft_trans_container_set(trans)->set +#define nft_trans_set_id(trans) \ + nft_trans_container_set(trans)->set_id +#define nft_trans_set_bound(trans) \ + nft_trans_container_set(trans)->bound +#define nft_trans_set_update(trans) \ + nft_trans_container_set(trans)->update +#define nft_trans_set_timeout(trans) \ + nft_trans_container_set(trans)->timeout +#define nft_trans_set_gc_int(trans) \ + nft_trans_container_set(trans)->gc_int +#define nft_trans_set_size(trans) \ + nft_trans_container_set(trans)->size struct nft_trans_chain { + struct nft_trans_binding nft_trans_binding; struct nft_chain *chain; - bool update; char *name; struct nft_stats __percpu *stats; u8 policy; + bool update; bool bound; u32 chain_id; struct nft_base_chain *basechain; struct list_head hook_list; }; -#define nft_trans_chain(trans) \ - (((struct nft_trans_chain *)trans->data)->chain) -#define nft_trans_chain_update(trans) \ - (((struct nft_trans_chain *)trans->data)->update) -#define nft_trans_chain_name(trans) \ - (((struct nft_trans_chain *)trans->data)->name) -#define nft_trans_chain_stats(trans) \ - (((struct nft_trans_chain *)trans->data)->stats) -#define nft_trans_chain_policy(trans) \ - (((struct nft_trans_chain *)trans->data)->policy) -#define nft_trans_chain_bound(trans) \ - (((struct nft_trans_chain *)trans->data)->bound) -#define nft_trans_chain_id(trans) \ - (((struct nft_trans_chain *)trans->data)->chain_id) -#define nft_trans_basechain(trans) \ - (((struct nft_trans_chain *)trans->data)->basechain) -#define nft_trans_chain_hooks(trans) \ - (((struct nft_trans_chain *)trans->data)->hook_list) +#define nft_trans_container_chain(t) \ + container_of(t, struct nft_trans_chain, nft_trans_binding.nft_trans) +#define nft_trans_chain(trans) \ + nft_trans_container_chain(trans)->chain +#define nft_trans_chain_update(trans) \ + nft_trans_container_chain(trans)->update +#define nft_trans_chain_name(trans) \ + nft_trans_container_chain(trans)->name +#define nft_trans_chain_stats(trans) \ + nft_trans_container_chain(trans)->stats +#define nft_trans_chain_policy(trans) \ + nft_trans_container_chain(trans)->policy +#define nft_trans_chain_bound(trans) \ + nft_trans_container_chain(trans)->bound +#define nft_trans_chain_id(trans) \ + nft_trans_container_chain(trans)->chain_id +#define nft_trans_basechain(trans) \ + nft_trans_container_chain(trans)->basechain +#define nft_trans_chain_hooks(trans) \ + nft_trans_container_chain(trans)->hook_list struct nft_trans_table { + struct nft_trans nft_trans; bool update; }; -#define nft_trans_table_update(trans) \ - (((struct nft_trans_table *)trans->data)->update) +#define nft_trans_container_table(trans) \ + container_of(trans, struct nft_trans_table, nft_trans) +#define nft_trans_table_update(trans) \ + nft_trans_container_table(trans)->update + +enum nft_trans_elem_flags { + NFT_TRANS_UPD_TIMEOUT = (1 << 0), + NFT_TRANS_UPD_EXPIRATION = (1 << 1), +}; + +struct nft_elem_update { + u64 timeout; + u64 expiration; + u8 flags; +}; + +struct nft_trans_one_elem { + struct nft_elem_priv *priv; + struct nft_elem_update *update; +}; struct nft_trans_elem { + struct nft_trans nft_trans; struct nft_set *set; - struct nft_set_elem elem; bool bound; + unsigned int nelems; + struct nft_trans_one_elem elems[] __counted_by(nelems); }; -#define nft_trans_elem_set(trans) \ - (((struct nft_trans_elem *)trans->data)->set) -#define nft_trans_elem(trans) \ - (((struct nft_trans_elem *)trans->data)->elem) -#define nft_trans_elem_set_bound(trans) \ - (((struct nft_trans_elem *)trans->data)->bound) +#define nft_trans_container_elem(t) \ + container_of(t, struct nft_trans_elem, nft_trans) +#define nft_trans_elem_set(trans) \ + nft_trans_container_elem(trans)->set +#define nft_trans_elem_set_bound(trans) \ + nft_trans_container_elem(trans)->bound struct nft_trans_obj { + struct nft_trans nft_trans; struct nft_object *obj; struct nft_object *newobj; bool update; }; -#define nft_trans_obj(trans) \ - (((struct nft_trans_obj *)trans->data)->obj) -#define nft_trans_obj_newobj(trans) \ - (((struct nft_trans_obj *)trans->data)->newobj) -#define nft_trans_obj_update(trans) \ - (((struct nft_trans_obj *)trans->data)->update) +#define nft_trans_container_obj(t) \ + container_of(t, struct nft_trans_obj, nft_trans) +#define nft_trans_obj(trans) \ + nft_trans_container_obj(trans)->obj +#define nft_trans_obj_newobj(trans) \ + nft_trans_container_obj(trans)->newobj +#define nft_trans_obj_update(trans) \ + nft_trans_container_obj(trans)->update struct nft_trans_flowtable { + struct nft_trans nft_trans; struct nft_flowtable *flowtable; - bool update; struct list_head hook_list; u32 flags; + bool update; }; -#define nft_trans_flowtable(trans) \ - (((struct nft_trans_flowtable *)trans->data)->flowtable) -#define nft_trans_flowtable_update(trans) \ - (((struct nft_trans_flowtable *)trans->data)->update) -#define nft_trans_flowtable_hooks(trans) \ - (((struct nft_trans_flowtable *)trans->data)->hook_list) -#define nft_trans_flowtable_flags(trans) \ - (((struct nft_trans_flowtable *)trans->data)->flags) +#define nft_trans_container_flowtable(t) \ + container_of(t, struct nft_trans_flowtable, nft_trans) +#define nft_trans_flowtable(trans) \ + nft_trans_container_flowtable(trans)->flowtable +#define nft_trans_flowtable_update(trans) \ + nft_trans_container_flowtable(trans)->update +#define nft_trans_flowtable_hooks(trans) \ + nft_trans_container_flowtable(trans)->hook_list +#define nft_trans_flowtable_flags(trans) \ + nft_trans_container_flowtable(trans)->flags #define NFT_TRANS_GC_BATCHCOUNT 256 @@ -1682,11 +1833,38 @@ struct nft_trans_gc { struct net *net; struct nft_set *set; u32 seq; - u8 count; - void *priv[NFT_TRANS_GC_BATCHCOUNT]; + u16 count; + struct nft_elem_priv *priv[NFT_TRANS_GC_BATCHCOUNT]; struct rcu_head rcu; }; +static inline void nft_ctx_update(struct nft_ctx *ctx, + const struct nft_trans *trans) +{ + switch (trans->msg_type) { + case NFT_MSG_NEWRULE: + case NFT_MSG_DELRULE: + case NFT_MSG_DESTROYRULE: + ctx->chain = nft_trans_rule_chain(trans); + break; + case NFT_MSG_NEWCHAIN: + case NFT_MSG_DELCHAIN: + case NFT_MSG_DESTROYCHAIN: + ctx->chain = nft_trans_chain(trans); + break; + default: + ctx->chain = NULL; + break; + } + + ctx->net = trans->net; + ctx->table = trans->table; + ctx->family = trans->table->family; + ctx->report = trans->report; + ctx->flags = trans->flags; + ctx->seq = trans->seq; +} + struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, unsigned int gc_seq, gfp_t gfp); void nft_trans_gc_destroy(struct nft_trans_gc *trans); @@ -1700,12 +1878,13 @@ void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans); void nft_trans_gc_elem_add(struct nft_trans_gc *gc, void *priv); -struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, - unsigned int gc_seq); +struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc, + unsigned int gc_seq); +struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc); void nft_setelem_data_deactivate(const struct net *net, const struct nft_set *set, - struct nft_set_elem *elem); + struct nft_elem_priv *elem_priv); int __init nft_chain_filter_init(void); void nft_chain_filter_fini(void); @@ -1713,7 +1892,7 @@ void nft_chain_filter_fini(void); void __init nft_chain_route_init(void); void nft_chain_route_fini(void); -void nf_tables_trans_destroy_flush_work(void); +void nf_tables_trans_destroy_flush_work(struct net *net); int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result); __be64 nf_jiffies64_to_msecs(u64 input); @@ -1727,14 +1906,17 @@ static inline int nft_request_module(struct net *net, const char *fmt, ...) { re struct nftables_pernet { struct list_head tables; struct list_head commit_list; + struct list_head destroy_list; + struct list_head commit_set_list; struct list_head binding_list; struct list_head module_list; struct list_head notify_list; struct mutex commit_mutex; u64 table_handle; - unsigned int base_seq; + u64 tstamp; unsigned int gc_seq; u8 validate_state; + struct work_struct destroy_work; }; extern unsigned int nf_tables_net_id; @@ -1744,14 +1926,14 @@ static inline struct nftables_pernet *nft_pernet(const struct net *net) return net_generic(net, nf_tables_net_id); } -#define __NFT_REDUCE_READONLY 1UL -#define NFT_REDUCE_READONLY (void *)__NFT_REDUCE_READONLY - -static inline bool nft_reduce_is_readonly(const struct nft_expr *expr) +static inline u64 nft_net_tstamp(const struct net *net) { - return expr->ops->reduce == NFT_REDUCE_READONLY; + return nft_pernet(net)->tstamp; } +#define __NFT_REDUCE_READONLY 1UL +#define NFT_REDUCE_READONLY (void *)__NFT_REDUCE_READONLY + void nft_reg_track_update(struct nft_regs_track *track, const struct nft_expr *expr, u8 dreg, u8 len); void nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg, u8 len); diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 780a5f6ad4a6..b8df5acbb723 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -73,7 +73,7 @@ struct nft_ct { struct nft_payload { enum nft_payload_bases base:8; - u8 offset; + u16 offset; u8 len; u8 dreg; }; @@ -93,35 +93,36 @@ extern const struct nft_set_type nft_set_bitmap_type; extern const struct nft_set_type nft_set_pipapo_type; extern const struct nft_set_type nft_set_pipapo_avx2_type; -#ifdef CONFIG_RETPOLINE -bool nft_rhash_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); -bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); -bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); -bool nft_hash_lookup_fast(const struct net *net, - const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); -bool nft_hash_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); -bool nft_set_do_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); -#else -static inline bool -nft_set_do_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) -{ - return set->ops->lookup(net, set, key, ext); -} +#ifdef CONFIG_MITIGATION_RETPOLINE +const struct nft_set_ext * +nft_rhash_lookup(const struct net *net, const struct nft_set *set, + const u32 *key); +const struct nft_set_ext * +nft_rbtree_lookup(const struct net *net, const struct nft_set *set, + const u32 *key); +const struct nft_set_ext * +nft_bitmap_lookup(const struct net *net, const struct nft_set *set, + const u32 *key); +const struct nft_set_ext * +nft_hash_lookup_fast(const struct net *net, const struct nft_set *set, + const u32 *key); +const struct nft_set_ext * +nft_hash_lookup(const struct net *net, const struct nft_set *set, + const u32 *key); #endif +const struct nft_set_ext * +nft_set_do_lookup(const struct net *net, const struct nft_set *set, + const u32 *key); + /* called from nft_pipapo_avx2.c */ -bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); +const struct nft_set_ext * +nft_pipapo_lookup(const struct net *net, const struct nft_set *set, + const u32 *key); /* called from nft_set_pipapo.c */ -bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); +const struct nft_set_ext * +nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, + const u32 *key); void nft_counter_init_seqcount(void); @@ -161,6 +162,7 @@ enum { }; struct nft_inner_tun_ctx { + unsigned long cookie; u16 type; u16 inner_tunoff; u16 inner_lloff; @@ -180,4 +182,7 @@ void nft_objref_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); void nft_objref_map_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); +struct nft_elem_priv *nft_dynset_new(struct nft_set *set, + const struct nft_expr *expr, + struct nft_regs *regs); #endif /* _NET_NF_TABLES_CORE_H */ diff --git a/include/net/netfilter/nf_tables_ipv4.h b/include/net/netfilter/nf_tables_ipv4.h index 947973623dc7..fcf967286e37 100644 --- a/include/net/netfilter/nf_tables_ipv4.h +++ b/include/net/netfilter/nf_tables_ipv4.h @@ -19,7 +19,7 @@ static inline void nft_set_pktinfo_ipv4(struct nft_pktinfo *pkt) static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) { struct iphdr *iph, _iph; - u32 len, thoff; + u32 len, thoff, skb_len; iph = skb_header_pointer(pkt->skb, skb_network_offset(pkt->skb), sizeof(*iph), &_iph); @@ -31,7 +31,9 @@ static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) len = iph_totlen(pkt->skb, iph); thoff = iph->ihl * 4; - if (pkt->skb->len < len) + skb_len = pkt->skb->len - skb_network_offset(pkt->skb); + + if (skb_len < len) return -1; else if (len < thoff) return -1; @@ -40,7 +42,7 @@ static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = iph->protocol; - pkt->thoff = thoff; + pkt->thoff = skb_network_offset(pkt->skb) + thoff; pkt->fragoff = ntohs(iph->frag_off) & IP_OFFSET; return 0; diff --git a/include/net/netfilter/nf_tables_ipv6.h b/include/net/netfilter/nf_tables_ipv6.h index 467d59b9e533..a0633eeaec97 100644 --- a/include/net/netfilter/nf_tables_ipv6.h +++ b/include/net/netfilter/nf_tables_ipv6.h @@ -31,8 +31,8 @@ static inline int __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt) struct ipv6hdr *ip6h, _ip6h; unsigned int thoff = 0; unsigned short frag_off; + u32 pkt_len, skb_len; int protohdr; - u32 pkt_len; ip6h = skb_header_pointer(pkt->skb, skb_network_offset(pkt->skb), sizeof(*ip6h), &_ip6h); @@ -43,7 +43,8 @@ static inline int __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt) return -1; pkt_len = ntohs(ip6h->payload_len); - if (pkt_len + sizeof(*ip6h) > pkt->skb->len) + skb_len = pkt->skb->len - skb_network_offset(pkt->skb); + if (pkt_len + sizeof(*ip6h) > skb_len) return -1; protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); diff --git a/include/net/netfilter/nf_tproxy.h b/include/net/netfilter/nf_tproxy.h index faa108b1ba67..06985530517b 100644 --- a/include/net/netfilter/nf_tproxy.h +++ b/include/net/netfilter/nf_tproxy.h @@ -36,6 +36,7 @@ __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr); /** * nf_tproxy_handle_time_wait4 - handle IPv4 TCP TIME_WAIT reopen redirections + * @net: The network namespace. * @skb: The skb being processed. * @laddr: IPv4 address to redirect to or zero. * @lport: TCP port to redirect to or zero. @@ -48,7 +49,7 @@ __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr); * * nf_tproxy_handle_time_wait4() consumes the socket reference passed in. * - * Returns the listener socket if there's one, the TIME_WAIT socket if + * Returns: the listener socket if there's one, the TIME_WAIT socket if * no such listener is found, or NULL if the TCP header is incomplete. */ struct sock * @@ -107,7 +108,7 @@ nf_tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr, * * nf_tproxy_handle_time_wait6() consumes the socket reference passed in. * - * Returns the listener socket if there's one, the TIME_WAIT socket if + * Returns: the listener socket if there's one, the TIME_WAIT socket if * no such listener is found, or NULL if the TCP header is incomplete. */ struct sock * diff --git a/include/net/netfilter/nft_fib.h b/include/net/netfilter/nft_fib.h index 167640b843ef..7370fba844ef 100644 --- a/include/net/netfilter/nft_fib.h +++ b/include/net/netfilter/nft_fib.h @@ -2,6 +2,7 @@ #ifndef _NFT_FIB_H_ #define _NFT_FIB_H_ +#include <net/l3mdev.h> #include <net/netfilter/nf_tables.h> struct nft_fib { @@ -18,12 +19,39 @@ nft_fib_is_loopback(const struct sk_buff *skb, const struct net_device *in) return skb->pkt_type == PACKET_LOOPBACK || in->flags & IFF_LOOPBACK; } +static inline bool nft_fib_can_skip(const struct nft_pktinfo *pkt) +{ + const struct net_device *indev = nft_in(pkt); + const struct sock *sk; + + switch (nft_hook(pkt)) { + case NF_INET_PRE_ROUTING: + case NF_INET_INGRESS: + case NF_INET_LOCAL_IN: + break; + default: + return false; + } + + sk = pkt->skb->sk; + if (sk && sk_fullsock(sk)) + return sk->sk_rx_dst_ifindex == indev->ifindex; + + return nft_fib_is_loopback(pkt->skb, indev); +} + +static inline int nft_fib_l3mdev_master_ifindex_rcu(const struct nft_pktinfo *pkt, + const struct net_device *iif) +{ + const struct net_device *dev = iif ? iif : pkt->skb->dev; + + return l3mdev_master_ifindex_rcu(dev); +} + int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset); int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]); -int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, - const struct nft_data **data); - +int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr); void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h index ba1238f12a48..d602263590fe 100644 --- a/include/net/netfilter/nft_meta.h +++ b/include/net/netfilter/nft_meta.h @@ -41,8 +41,7 @@ void nft_meta_set_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr); int nft_meta_set_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data); + const struct nft_expr *expr); bool nft_meta_get_reduce(struct nft_regs_track *track, const struct nft_expr *expr); diff --git a/include/net/netfilter/nft_reject.h b/include/net/netfilter/nft_reject.h index 6d9ba62efd75..19060212988a 100644 --- a/include/net/netfilter/nft_reject.h +++ b/include/net/netfilter/nft_reject.h @@ -15,8 +15,7 @@ struct nft_reject { extern const struct nla_policy nft_reject_policy[]; int nft_reject_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data); + const struct nft_expr *expr); int nft_reject_init(const struct nft_ctx *ctx, const struct nft_expr *expr, diff --git a/include/net/netkit.h b/include/net/netkit.h new file mode 100644 index 000000000000..9ec0163739f4 --- /dev/null +++ b/include/net/netkit.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2023 Isovalent */ +#ifndef __NET_NETKIT_H +#define __NET_NETKIT_H + +#include <linux/bpf.h> + +#ifdef CONFIG_NETKIT +int netkit_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog); +int netkit_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); +int netkit_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog); +int netkit_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); +INDIRECT_CALLABLE_DECLARE(struct net_device *netkit_peer_dev(struct net_device *dev)); +#else +static inline int netkit_prog_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EINVAL; +} + +static inline int netkit_link_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EINVAL; +} + +static inline int netkit_prog_detach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EINVAL; +} + +static inline int netkit_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return -EINVAL; +} + +static inline struct net_device *netkit_peer_dev(struct net_device *dev) +{ + return NULL; +} +#endif /* CONFIG_NETKIT */ +#endif /* __NET_NETKIT_H */ diff --git a/include/net/netlabel.h b/include/net/netlabel.h index 43ae50337685..02914b1df38b 100644 --- a/include/net/netlabel.h +++ b/include/net/netlabel.h @@ -30,7 +30,7 @@ struct calipso_doi; /* * NetLabel - A management interface for maintaining network packet label - * mapping tables for explicit packet labling protocols. + * mapping tables for explicit packet labeling protocols. * * Network protocols such as CIPSO and RIPSO require a label translation layer * to convert the label on the packet into something meaningful on the host @@ -97,7 +97,7 @@ struct calipso_doi; /* NetLabel audit information */ struct netlbl_audit { - u32 secid; + struct lsm_prop prop; kuid_t loginuid; unsigned int sessionid; }; @@ -145,15 +145,14 @@ struct netlbl_lsm_cache { * processing. * */ -#define NETLBL_CATMAP_MAPTYPE u64 #define NETLBL_CATMAP_MAPCNT 4 -#define NETLBL_CATMAP_MAPSIZE (sizeof(NETLBL_CATMAP_MAPTYPE) * 8) +#define NETLBL_CATMAP_MAPSIZE (sizeof(u64) * 8) #define NETLBL_CATMAP_SIZE (NETLBL_CATMAP_MAPSIZE * \ NETLBL_CATMAP_MAPCNT) -#define NETLBL_CATMAP_BIT (NETLBL_CATMAP_MAPTYPE)0x01 +#define NETLBL_CATMAP_BIT ((u64)0x01) struct netlbl_lsm_catmap { u32 startbit; - NETLBL_CATMAP_MAPTYPE bitmap[NETLBL_CATMAP_MAPCNT]; + u64 bitmap[NETLBL_CATMAP_MAPCNT]; struct netlbl_lsm_catmap *next; }; @@ -209,6 +208,7 @@ struct netlbl_lsm_secattr { * struct netlbl_calipso_ops - NetLabel CALIPSO operations * @doi_add: add a CALIPSO DOI * @doi_free: free a CALIPSO DOI + * @doi_remove: remove a CALIPSO DOI * @doi_getdef: returns a reference to a DOI * @doi_putdef: releases a reference of a DOI * @doi_walk: enumerate the DOI list @@ -275,15 +275,17 @@ struct netlbl_calipso_ops { * on success, NULL on failure. * */ -static inline struct netlbl_lsm_cache *netlbl_secattr_cache_alloc(gfp_t flags) +static inline struct netlbl_lsm_cache *netlbl_secattr_cache_alloc_noprof(gfp_t flags) { struct netlbl_lsm_cache *cache; - cache = kzalloc(sizeof(*cache), flags); + cache = kzalloc_noprof(sizeof(*cache), flags); if (cache) refcount_set(&cache->refcount, 1); return cache; } +#define netlbl_secattr_cache_alloc(...) \ + alloc_hooks(netlbl_secattr_cache_alloc_noprof(__VA_ARGS__)) /** * netlbl_secattr_cache_free - Frees a netlbl_lsm_cache struct @@ -312,10 +314,11 @@ static inline void netlbl_secattr_cache_free(struct netlbl_lsm_cache *cache) * on failure. * */ -static inline struct netlbl_lsm_catmap *netlbl_catmap_alloc(gfp_t flags) +static inline struct netlbl_lsm_catmap *netlbl_catmap_alloc_noprof(gfp_t flags) { - return kzalloc(sizeof(struct netlbl_lsm_catmap), flags); + return kzalloc_noprof(sizeof(struct netlbl_lsm_catmap), flags); } +#define netlbl_catmap_alloc(...) alloc_hooks(netlbl_catmap_alloc_noprof(__VA_ARGS__)) /** * netlbl_catmap_free - Free a LSM secattr catmap @@ -377,10 +380,11 @@ static inline void netlbl_secattr_destroy(struct netlbl_lsm_secattr *secattr) * pointer on success, or NULL on failure. * */ -static inline struct netlbl_lsm_secattr *netlbl_secattr_alloc(gfp_t flags) +static inline struct netlbl_lsm_secattr *netlbl_secattr_alloc_noprof(gfp_t flags) { - return kzalloc(sizeof(struct netlbl_lsm_secattr), flags); + return kzalloc_noprof(sizeof(struct netlbl_lsm_secattr), flags); } +#define netlbl_secattr_alloc(...) alloc_hooks(netlbl_secattr_alloc_noprof(__VA_ARGS__)) /** * netlbl_secattr_free - Frees a netlbl_lsm_secattr struct @@ -471,7 +475,8 @@ void netlbl_bitmap_setbit(unsigned char *bitmap, u32 bit, u8 state); int netlbl_enabled(void); int netlbl_sock_setattr(struct sock *sk, u16 family, - const struct netlbl_lsm_secattr *secattr); + const struct netlbl_lsm_secattr *secattr, + bool sk_locked); void netlbl_sock_delattr(struct sock *sk); int netlbl_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr); @@ -488,6 +493,7 @@ int netlbl_skbuff_getattr(const struct sk_buff *skb, u16 family, struct netlbl_lsm_secattr *secattr); void netlbl_skbuff_err(struct sk_buff *skb, u16 family, int error, int gateway); +bool netlbl_sk_lock_check(struct sock *sk); /* * LSM label mapping cache operations @@ -615,7 +621,8 @@ static inline int netlbl_enabled(void) } static inline int netlbl_sock_setattr(struct sock *sk, u16 family, - const struct netlbl_lsm_secattr *secattr) + const struct netlbl_lsm_secattr *secattr, + bool sk_locked) { return -ENOSYS; } @@ -674,6 +681,11 @@ static inline struct audit_buffer *netlbl_audit_start(int type, { return NULL; } + +static inline bool netlbl_sk_lock_check(struct sock *sk) +{ + return true; +} #endif /* CONFIG_NETLABEL */ const struct netlbl_calipso_ops * diff --git a/include/net/netlink.h b/include/net/netlink.h index 8a7cd1170e1f..1a8356ca4b78 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -41,7 +41,8 @@ * nlmsg_get_pos() return current position in message * nlmsg_trim() trim part of message * nlmsg_cancel() cancel message construction - * nlmsg_free() free a netlink message + * nlmsg_consume() free a netlink message (expected) + * nlmsg_free() free a netlink message (drop) * * Message Sending: * nlmsg_multicast() multicast message to several groups @@ -67,6 +68,8 @@ * nlmsg_for_each_msg() loop over all messages * nlmsg_validate() validate netlink message incl. attrs * nlmsg_for_each_attr() loop over all attributes + * nlmsg_for_each_attr_type() loop over all attributes with the + * given type * * Misc: * nlmsg_report() report back to application? @@ -117,6 +120,7 @@ * nla_nest_start(skb, type) start a nested attribute * nla_nest_end(skb, nla) finalize a nested attribute * nla_nest_cancel(skb, nla) cancel nested attribute construction + * nla_put_empty_nest(skb, type) create an empty nest * * Attribute Length Calculations: * nla_attr_size(payload) length of attribute w/o padding @@ -128,6 +132,8 @@ * nla_len(nla) length of attribute payload * * Attribute Payload Access for Basic Types: + * nla_get_uint(nla) get payload for a uint attribute + * nla_get_sint(nla) get payload for a sint attribute * nla_get_u8(nla) get payload for a u8 attribute * nla_get_u16(nla) get payload for a u16 attribute * nla_get_u32(nla) get payload for a u32 attribute @@ -139,6 +145,8 @@ * nla_get_flag(nla) return 1 if flag is true * nla_get_msecs(nla) get payload for a msecs attribute * + * The same functions also exist with _default(). + * * Attribute Misc: * nla_memcpy(dest, nla, count) copy attribute into memory * nla_memcmp(nla, data, size) compare attribute with memory area @@ -155,7 +163,11 @@ * nla_parse() parse and validate stream of attrs * nla_parse_nested() parse nested attributes * nla_for_each_attr() loop over all attributes + * nla_for_each_attr_type() loop over all attributes with the + * given type * nla_for_each_nested() loop over the nested attributes + * nla_for_each_nested_type() loop over the nested attributes with + * the given type *========================================================================= */ @@ -183,6 +195,8 @@ enum { NLA_REJECT, NLA_BE16, NLA_BE32, + NLA_SINT, + NLA_UINT, __NLA_TYPE_MAX, }; @@ -229,6 +243,7 @@ enum nla_policy_validation { * nested header (or empty); len field is used if * nested_policy is also used, for the max attr * number in the nested policy. + * NLA_SINT, NLA_UINT, * NLA_U8, NLA_U16, * NLA_U32, NLA_U64, * NLA_S8, NLA_S16, @@ -260,12 +275,14 @@ enum nla_policy_validation { * while an array has the nested attributes at another * level down and the attribute types directly in the * nesting don't matter. + * NLA_UINT, * NLA_U8, * NLA_U16, * NLA_U32, * NLA_U64, * NLA_BE16, * NLA_BE32, + * NLA_SINT, * NLA_S8, * NLA_S16, * NLA_S32, @@ -280,6 +297,7 @@ enum nla_policy_validation { * or NLA_POLICY_FULL_RANGE_SIGNED() macros instead. * Use the NLA_POLICY_MIN(), NLA_POLICY_MAX() and * NLA_POLICY_RANGE() macros. + * NLA_UINT, * NLA_U8, * NLA_U16, * NLA_U32, @@ -288,6 +306,7 @@ enum nla_policy_validation { * to a struct netlink_range_validation that indicates * the min/max values. * Use NLA_POLICY_FULL_RANGE(). + * NLA_SINT, * NLA_S8, * NLA_S16, * NLA_S32, @@ -304,7 +323,13 @@ enum nla_policy_validation { * All other Unused - but note that it's a union * * Meaning of `validate' field, use via NLA_POLICY_VALIDATE_FN: + * NLA_U8, NLA_U16, + * NLA_U32, NLA_U64, + * NLA_S8, NLA_S16, + * NLA_S32, NLA_S64, + * NLA_MSECS, * NLA_BINARY Validation function called for the attribute. + * * All other Unused - but note that it's a union * * Example: @@ -351,8 +376,8 @@ struct nla_policy { const u32 mask; const char *reject_message; const struct nla_policy *nested_policy; - struct netlink_range_validation *range; - struct netlink_range_validation_signed *range_signed; + const struct netlink_range_validation *range; + const struct netlink_range_validation_signed *range_signed; struct { s16 min, max; }; @@ -377,9 +402,11 @@ struct nla_policy { #define __NLA_IS_UINT_TYPE(tp) \ (tp == NLA_U8 || tp == NLA_U16 || tp == NLA_U32 || \ - tp == NLA_U64 || tp == NLA_BE16 || tp == NLA_BE32) + tp == NLA_U64 || tp == NLA_UINT || \ + tp == NLA_BE16 || tp == NLA_BE32) #define __NLA_IS_SINT_TYPE(tp) \ - (tp == NLA_S8 || tp == NLA_S16 || tp == NLA_S32 || tp == NLA_S64) + (tp == NLA_S8 || tp == NLA_S16 || tp == NLA_S32 || tp == NLA_S64 || \ + tp == NLA_SINT) #define __NLA_ENSURE(condition) BUILD_BUG_ON_ZERO(!(condition)) #define NLA_ENSURE_UINT_TYPE(tp) \ @@ -453,6 +480,7 @@ struct nla_policy { .max = _len \ } #define NLA_POLICY_MIN_LEN(_len) NLA_POLICY_MIN(NLA_BINARY, _len) +#define NLA_POLICY_MAX_LEN(_len) NLA_POLICY_MAX(NLA_BINARY, _len) /** * struct nl_info - netlink source information @@ -592,6 +620,22 @@ static inline int nlmsg_len(const struct nlmsghdr *nlh) } /** + * nlmsg_payload - message payload if the data fits in the len + * @nlh: netlink message header + * @len: struct length + * + * Returns: The netlink message payload/data if the length is sufficient, + * otherwise NULL. + */ +static inline void *nlmsg_payload(const struct nlmsghdr *nlh, size_t len) +{ + if (nlh->nlmsg_len < nlmsg_msg_size(len)) + return NULL; + + return nlmsg_data(nlh); +} + +/** * nlmsg_attrdata - head of attributes data * @nlh: netlink message header * @hdrlen: length of family specific header @@ -630,7 +674,7 @@ static inline int nlmsg_ok(const struct nlmsghdr *nlh, int remaining) * @nlh: netlink message header * @remaining: number of bytes remaining in message stream * - * Returns the next netlink message in the message stream and + * Returns: the next netlink message in the message stream and * decrements remaining by the size of the current message. */ static inline struct nlmsghdr * @@ -657,7 +701,7 @@ nlmsg_next(const struct nlmsghdr *nlh, int *remaining) * exceeding maxtype will be rejected, policy must be specified, attributes * will be validated in the strictest way possible. * - * Returns 0 on success or a negative error code. + * Returns: 0 on success or a negative error code. */ static inline int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, @@ -682,7 +726,7 @@ static inline int nla_parse(struct nlattr **tb, int maxtype, * exceeding maxtype will be ignored and attributes from the policy are not * always strictly validated (only for new attributes). * - * Returns 0 on success or a negative error code. + * Returns: 0 on success or a negative error code. */ static inline int nla_parse_deprecated(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, @@ -707,7 +751,7 @@ static inline int nla_parse_deprecated(struct nlattr **tb, int maxtype, * exceeding maxtype will be rejected as well as trailing data, but the * policy is not completely strictly validated (only for new attributes). * - * Returns 0 on success or a negative error code. + * Returns: 0 on success or a negative error code. */ static inline int nla_parse_deprecated_strict(struct nlattr **tb, int maxtype, const struct nlattr *head, @@ -811,10 +855,10 @@ nlmsg_parse_deprecated_strict(const struct nlmsghdr *nlh, int hdrlen, /** * nlmsg_find_attr - find a specific attribute in a netlink message * @nlh: netlink message header - * @hdrlen: length of familiy specific header + * @hdrlen: length of family specific header * @attrtype: type of attribute to look for * - * Returns the first attribute which matches the specified type. + * Returns: the first attribute which matches the specified type. */ static inline struct nlattr *nlmsg_find_attr(const struct nlmsghdr *nlh, int hdrlen, int attrtype) @@ -833,9 +877,9 @@ static inline struct nlattr *nlmsg_find_attr(const struct nlmsghdr *nlh, * * Validates all attributes in the specified attribute stream against the * specified policy. Validation is done in liberal mode. - * See documenation of struct nla_policy for more details. + * See documentation of struct nla_policy for more details. * - * Returns 0 on success or a negative error code. + * Returns: 0 on success or a negative error code. */ static inline int nla_validate_deprecated(const struct nlattr *head, int len, int maxtype, @@ -856,9 +900,9 @@ static inline int nla_validate_deprecated(const struct nlattr *head, int len, * * Validates all attributes in the specified attribute stream against the * specified policy. Validation is done in strict mode. - * See documenation of struct nla_policy for more details. + * See documentation of struct nla_policy for more details. * - * Returns 0 on success or a negative error code. + * Returns: 0 on success or a negative error code. */ static inline int nla_validate(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, @@ -871,7 +915,7 @@ static inline int nla_validate(const struct nlattr *head, int len, int maxtype, /** * nlmsg_validate_deprecated - validate a netlink message including attributes * @nlh: netlinket message header - * @hdrlen: length of familiy specific header + * @hdrlen: length of family specific header * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct @@ -895,7 +939,7 @@ static inline int nlmsg_validate_deprecated(const struct nlmsghdr *nlh, * nlmsg_report - need to report back to application? * @nlh: netlink message header * - * Returns 1 if a report back to the application is requested. + * Returns: 1 if a report back to the application is requested. */ static inline int nlmsg_report(const struct nlmsghdr *nlh) { @@ -906,7 +950,7 @@ static inline int nlmsg_report(const struct nlmsghdr *nlh) * nlmsg_seq - return the seq number of netlink message * @nlh: netlink message header * - * Returns 0 if netlink message is NULL + * Returns: 0 if netlink message is NULL */ static inline u32 nlmsg_seq(const struct nlmsghdr *nlh) { @@ -917,7 +961,7 @@ static inline u32 nlmsg_seq(const struct nlmsghdr *nlh) * nlmsg_for_each_attr - iterate over a stream of attributes * @pos: loop counter, set to current attribute * @nlh: netlink message header - * @hdrlen: length of familiy specific header + * @hdrlen: length of family specific header * @rem: initialized to len, holds bytes currently remaining in stream */ #define nlmsg_for_each_attr(pos, nlh, hdrlen, rem) \ @@ -925,6 +969,18 @@ static inline u32 nlmsg_seq(const struct nlmsghdr *nlh) nlmsg_attrlen(nlh, hdrlen), rem) /** + * nlmsg_for_each_attr_type - iterate over a stream of attributes + * @pos: loop counter, set to the current attribute + * @type: required attribute type for @pos + * @nlh: netlink message header + * @hdrlen: length of the family specific header + * @rem: initialized to len, holds bytes currently remaining in stream + */ +#define nlmsg_for_each_attr_type(pos, type, nlh, hdrlen, rem) \ + nlmsg_for_each_attr(pos, nlh, hdrlen, rem) \ + if (nla_type(pos) == type) + +/** * nlmsg_put - Add a new netlink message to an skb * @skb: socket buffer to store message in * @portid: netlink PORTID of requesting application @@ -933,7 +989,7 @@ static inline u32 nlmsg_seq(const struct nlmsghdr *nlh) * @payload: length of message payload * @flags: message flags * - * Returns NULL if the tailroom of the skb is insufficient to store + * Returns: NULL if the tailroom of the skb is insufficient to store * the message header and payload. */ static inline struct nlmsghdr *nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, @@ -952,7 +1008,7 @@ static inline struct nlmsghdr *nlmsg_put(struct sk_buff *skb, u32 portid, u32 se * * Append data to an existing nlmsg, used when constructing a message * with multiple fixed-format headers (which is rare). - * Returns NULL if the tailroom of the skb is insufficient to store + * Returns: NULL if the tailroom of the skb is insufficient to store * the extra payload. */ static inline void *nlmsg_append(struct sk_buff *skb, u32 size) @@ -974,7 +1030,7 @@ static inline void *nlmsg_append(struct sk_buff *skb, u32 size) * @payload: length of message payload * @flags: message flags * - * Returns NULL if the tailroom of the skb is insufficient to store + * Returns: NULL if the tailroom of the skb is insufficient to store * the message header and payload. */ static inline struct nlmsghdr *nlmsg_put_answer(struct sk_buff *skb, @@ -1000,11 +1056,25 @@ static inline struct sk_buff *nlmsg_new(size_t payload, gfp_t flags) } /** + * nlmsg_new_large - Allocate a new netlink message with non-contiguous + * physical memory + * @payload: size of the message payload + * + * The allocated skb is unable to have frag page for shinfo->frags*, + * as the NULL setting for skb->head in netlink_skb_destructor() will + * bypass most of the handling in skb_release_data() + */ +static inline struct sk_buff *nlmsg_new_large(size_t payload) +{ + return netlink_alloc_large_skb(nlmsg_total_size(payload), 0); +} + +/** * nlmsg_end - Finalize a netlink message * @skb: socket buffer the message is stored in * @nlh: netlink message header * - * Corrects the netlink message header to include the appeneded + * Corrects the netlink message header to include the appended * attributes. Only necessary if attributes have been added to * the message. */ @@ -1017,7 +1087,7 @@ static inline void nlmsg_end(struct sk_buff *skb, struct nlmsghdr *nlh) * nlmsg_get_pos - return current position in netlink message * @skb: socket buffer the message is stored in * - * Returns a pointer to the current tail of the message. + * Returns: a pointer to the current tail of the message. */ static inline void *nlmsg_get_pos(struct sk_buff *skb) { @@ -1053,7 +1123,7 @@ static inline void nlmsg_cancel(struct sk_buff *skb, struct nlmsghdr *nlh) } /** - * nlmsg_free - free a netlink message + * nlmsg_free - drop a netlink message * @skb: socket buffer of netlink message */ static inline void nlmsg_free(struct sk_buff *skb) @@ -1062,21 +1132,38 @@ static inline void nlmsg_free(struct sk_buff *skb) } /** - * nlmsg_multicast - multicast a netlink message + * nlmsg_consume - free a netlink message + * @skb: socket buffer of netlink message + */ +static inline void nlmsg_consume(struct sk_buff *skb) +{ + consume_skb(skb); +} + +/** + * nlmsg_multicast_filtered - multicast a netlink message with filter function * @sk: netlink socket to spread messages to * @skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: multicast group id * @flags: allocation flags + * @filter: filter function + * @filter_data: filter function private data + * + * Return: 0 on success, negative error code for failure. */ -static inline int nlmsg_multicast(struct sock *sk, struct sk_buff *skb, - u32 portid, unsigned int group, gfp_t flags) +static inline int nlmsg_multicast_filtered(struct sock *sk, struct sk_buff *skb, + u32 portid, unsigned int group, + gfp_t flags, + netlink_filter_fn filter, + void *filter_data) { int err; NETLINK_CB(skb).dst_group = group; - err = netlink_broadcast(sk, skb, portid, group, flags); + err = netlink_broadcast_filtered(sk, skb, portid, group, flags, + filter, filter_data); if (err > 0) err = 0; @@ -1084,6 +1171,21 @@ static inline int nlmsg_multicast(struct sock *sk, struct sk_buff *skb, } /** + * nlmsg_multicast - multicast a netlink message + * @sk: netlink socket to spread messages to + * @skb: netlink message as socket buffer + * @portid: own netlink portid to avoid sending to yourself + * @group: multicast group id + * @flags: allocation flags + */ +static inline int nlmsg_multicast(struct sock *sk, struct sk_buff *skb, + u32 portid, unsigned int group, gfp_t flags) +{ + return nlmsg_multicast_filtered(sk, skb, portid, group, flags, + NULL, NULL); +} + +/** * nlmsg_unicast - unicast a netlink message * @sk: netlink socket to spread message to * @skb: netlink message as socket buffer @@ -1189,7 +1291,7 @@ static inline void *nla_data(const struct nlattr *nla) * nla_len - length of payload * @nla: netlink attribute */ -static inline int nla_len(const struct nlattr *nla) +static inline u16 nla_len(const struct nlattr *nla) { return nla->nla_len - NLA_HDRLEN; } @@ -1211,7 +1313,7 @@ static inline int nla_ok(const struct nlattr *nla, int remaining) * @nla: netlink attribute * @remaining: number of bytes remaining in attribute stream * - * Returns the next netlink attribute in the attribute stream and + * Returns: the next netlink attribute in the attribute stream and * decrements remaining by the size of the current attribute. */ static inline struct nlattr *nla_next(const struct nlattr *nla, int *remaining) @@ -1227,7 +1329,7 @@ static inline struct nlattr *nla_next(const struct nlattr *nla, int *remaining) * @nla: attribute containing the nested attributes * @attrtype: type of attribute to look for * - * Returns the first attribute which matches the specified type. + * Returns: the first attribute which matches the specified type. */ static inline struct nlattr * nla_find_nested(const struct nlattr *nla, int attrtype) @@ -1358,6 +1460,22 @@ static inline int nla_put_u32(struct sk_buff *skb, int attrtype, u32 value) } /** + * nla_put_uint - Add a variable-size unsigned int to a socket buffer + * @skb: socket buffer to add attribute to + * @attrtype: attribute type + * @value: numeric value + */ +static inline int nla_put_uint(struct sk_buff *skb, int attrtype, u64 value) +{ + u64 tmp64 = value; + u32 tmp32 = value; + + if (tmp64 == tmp32) + return nla_put_u32(skb, attrtype, tmp32); + return nla_put(skb, attrtype, sizeof(u64), &tmp64); +} + +/** * nla_put_be32 - Add a __be32 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type @@ -1512,6 +1630,22 @@ static inline int nla_put_s64(struct sk_buff *skb, int attrtype, s64 value, } /** + * nla_put_sint - Add a variable-size signed int to a socket buffer + * @skb: socket buffer to add attribute to + * @attrtype: attribute type + * @value: numeric value + */ +static inline int nla_put_sint(struct sk_buff *skb, int attrtype, s64 value) +{ + s64 tmp64 = value; + s32 tmp32 = value; + + if (tmp64 == tmp32) + return nla_put_s32(skb, attrtype, tmp32); + return nla_put(skb, attrtype, sizeof(s64), &tmp64); +} + +/** * nla_put_string - Add a string netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type @@ -1601,6 +1735,20 @@ static inline u32 nla_get_u32(const struct nlattr *nla) } /** + * nla_get_u32_default - return payload of u32 attribute or default + * @nla: u32 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline u32 nla_get_u32_default(const struct nlattr *nla, u32 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_u32(nla); +} + +/** * nla_get_be32 - return payload of __be32 attribute * @nla: __be32 netlink attribute */ @@ -1610,6 +1758,21 @@ static inline __be32 nla_get_be32(const struct nlattr *nla) } /** + * nla_get_be32_default - return payload of be32 attribute or default + * @nla: __be32 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline __be32 nla_get_be32_default(const struct nlattr *nla, + __be32 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_be32(nla); +} + +/** * nla_get_le32 - return payload of __le32 attribute * @nla: __le32 netlink attribute */ @@ -1619,6 +1782,21 @@ static inline __le32 nla_get_le32(const struct nlattr *nla) } /** + * nla_get_le32_default - return payload of le32 attribute or default + * @nla: __le32 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline __le32 nla_get_le32_default(const struct nlattr *nla, + __le32 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_le32(nla); +} + +/** * nla_get_u16 - return payload of u16 attribute * @nla: u16 netlink attribute */ @@ -1628,6 +1806,20 @@ static inline u16 nla_get_u16(const struct nlattr *nla) } /** + * nla_get_u16_default - return payload of u16 attribute or default + * @nla: u16 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline u16 nla_get_u16_default(const struct nlattr *nla, u16 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_u16(nla); +} + +/** * nla_get_be16 - return payload of __be16 attribute * @nla: __be16 netlink attribute */ @@ -1637,6 +1829,21 @@ static inline __be16 nla_get_be16(const struct nlattr *nla) } /** + * nla_get_be16_default - return payload of be16 attribute or default + * @nla: __be16 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline __be16 nla_get_be16_default(const struct nlattr *nla, + __be16 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_be16(nla); +} + +/** * nla_get_le16 - return payload of __le16 attribute * @nla: __le16 netlink attribute */ @@ -1646,6 +1853,21 @@ static inline __le16 nla_get_le16(const struct nlattr *nla) } /** + * nla_get_le16_default - return payload of le16 attribute or default + * @nla: __le16 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline __le16 nla_get_le16_default(const struct nlattr *nla, + __le16 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_le16(nla); +} + +/** * nla_get_u8 - return payload of u8 attribute * @nla: u8 netlink attribute */ @@ -1655,6 +1877,20 @@ static inline u8 nla_get_u8(const struct nlattr *nla) } /** + * nla_get_u8_default - return payload of u8 attribute or default + * @nla: u8 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline u8 nla_get_u8_default(const struct nlattr *nla, u8 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_u8(nla); +} + +/** * nla_get_u64 - return payload of u64 attribute * @nla: u64 netlink attribute */ @@ -1668,6 +1904,45 @@ static inline u64 nla_get_u64(const struct nlattr *nla) } /** + * nla_get_u64_default - return payload of u64 attribute or default + * @nla: u64 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline u64 nla_get_u64_default(const struct nlattr *nla, u64 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_u64(nla); +} + +/** + * nla_get_uint - return payload of uint attribute + * @nla: uint netlink attribute + */ +static inline u64 nla_get_uint(const struct nlattr *nla) +{ + if (nla_len(nla) == sizeof(u32)) + return nla_get_u32(nla); + return nla_get_u64(nla); +} + +/** + * nla_get_uint_default - return payload of uint attribute or default + * @nla: uint netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline u64 nla_get_uint_default(const struct nlattr *nla, u64 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_uint(nla); +} + +/** * nla_get_be64 - return payload of __be64 attribute * @nla: __be64 netlink attribute */ @@ -1681,6 +1956,21 @@ static inline __be64 nla_get_be64(const struct nlattr *nla) } /** + * nla_get_be64_default - return payload of be64 attribute or default + * @nla: __be64 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline __be64 nla_get_be64_default(const struct nlattr *nla, + __be64 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_be64(nla); +} + +/** * nla_get_le64 - return payload of __le64 attribute * @nla: __le64 netlink attribute */ @@ -1690,6 +1980,21 @@ static inline __le64 nla_get_le64(const struct nlattr *nla) } /** + * nla_get_le64_default - return payload of le64 attribute or default + * @nla: __le64 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline __le64 nla_get_le64_default(const struct nlattr *nla, + __le64 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_le64(nla); +} + +/** * nla_get_s32 - return payload of s32 attribute * @nla: s32 netlink attribute */ @@ -1699,6 +2004,20 @@ static inline s32 nla_get_s32(const struct nlattr *nla) } /** + * nla_get_s32_default - return payload of s32 attribute or default + * @nla: s32 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline s32 nla_get_s32_default(const struct nlattr *nla, s32 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_s32(nla); +} + +/** * nla_get_s16 - return payload of s16 attribute * @nla: s16 netlink attribute */ @@ -1708,6 +2027,20 @@ static inline s16 nla_get_s16(const struct nlattr *nla) } /** + * nla_get_s16_default - return payload of s16 attribute or default + * @nla: s16 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline s16 nla_get_s16_default(const struct nlattr *nla, s16 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_s16(nla); +} + +/** * nla_get_s8 - return payload of s8 attribute * @nla: s8 netlink attribute */ @@ -1717,6 +2050,20 @@ static inline s8 nla_get_s8(const struct nlattr *nla) } /** + * nla_get_s8_default - return payload of s8 attribute or default + * @nla: s8 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline s8 nla_get_s8_default(const struct nlattr *nla, s8 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_s8(nla); +} + +/** * nla_get_s64 - return payload of s64 attribute * @nla: s64 netlink attribute */ @@ -1730,6 +2077,45 @@ static inline s64 nla_get_s64(const struct nlattr *nla) } /** + * nla_get_s64_default - return payload of s64 attribute or default + * @nla: s64 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline s64 nla_get_s64_default(const struct nlattr *nla, s64 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_s64(nla); +} + +/** + * nla_get_sint - return payload of uint attribute + * @nla: uint netlink attribute + */ +static inline s64 nla_get_sint(const struct nlattr *nla) +{ + if (nla_len(nla) == sizeof(s32)) + return nla_get_s32(nla); + return nla_get_s64(nla); +} + +/** + * nla_get_sint_default - return payload of sint attribute or default + * @nla: sint netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline s64 nla_get_sint_default(const struct nlattr *nla, s64 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_sint(nla); +} + +/** * nla_get_flag - return payload of flag attribute * @nla: flag netlink attribute */ @@ -1742,7 +2128,7 @@ static inline int nla_get_flag(const struct nlattr *nla) * nla_get_msecs - return payload of msecs attribute * @nla: msecs netlink attribute * - * Returns the number of milliseconds in jiffies. + * Returns: the number of milliseconds in jiffies. */ static inline unsigned long nla_get_msecs(const struct nlattr *nla) { @@ -1752,6 +2138,21 @@ static inline unsigned long nla_get_msecs(const struct nlattr *nla) } /** + * nla_get_msecs_default - return payload of msecs attribute or default + * @nla: msecs netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline unsigned long nla_get_msecs_default(const struct nlattr *nla, + unsigned long defvalue) +{ + if (!nla) + return defvalue; + return nla_get_msecs(nla); +} + +/** * nla_get_in_addr - return payload of IPv4 address attribute * @nla: IPv4 address netlink attribute */ @@ -1761,6 +2162,21 @@ static inline __be32 nla_get_in_addr(const struct nlattr *nla) } /** + * nla_get_in_addr_default - return payload of be32 attribute or default + * @nla: IPv4 address netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline __be32 nla_get_in_addr_default(const struct nlattr *nla, + __be32 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_in_addr(nla); +} + +/** * nla_get_in6_addr - return payload of IPv6 address attribute * @nla: IPv6 address netlink attribute */ @@ -1789,10 +2205,11 @@ static inline struct nla_bitfield32 nla_get_bitfield32(const struct nlattr *nla) * @src: netlink attribute to duplicate from * @gfp: GFP mask */ -static inline void *nla_memdup(const struct nlattr *src, gfp_t gfp) +static inline void *nla_memdup_noprof(const struct nlattr *src, gfp_t gfp) { - return kmemdup(nla_data(src), nla_len(src), gfp); + return kmemdup_noprof(nla_data(src), nla_len(src), gfp); } +#define nla_memdup(...) alloc_hooks(nla_memdup_noprof(__VA_ARGS__)) /** * nla_nest_start_noflag - Start a new level of nested attributes @@ -1803,7 +2220,7 @@ static inline void *nla_memdup(const struct nlattr *src, gfp_t gfp) * marked their nest attributes with NLA_F_NESTED flag. New APIs should use * nla_nest_start() which sets the flag. * - * Returns the container attribute or NULL on error + * Returns: the container attribute or NULL on error */ static inline struct nlattr *nla_nest_start_noflag(struct sk_buff *skb, int attrtype) @@ -1824,7 +2241,7 @@ static inline struct nlattr *nla_nest_start_noflag(struct sk_buff *skb, * Unlike nla_nest_start_noflag(), mark the nest attribute with NLA_F_NESTED * flag. This is the preferred function to use in new code. * - * Returns the container attribute or NULL on error + * Returns: the container attribute or NULL on error */ static inline struct nlattr *nla_nest_start(struct sk_buff *skb, int attrtype) { @@ -1837,9 +2254,9 @@ static inline struct nlattr *nla_nest_start(struct sk_buff *skb, int attrtype) * @start: container attribute * * Corrects the container attribute header to include the all - * appeneded attributes. + * appended attributes. * - * Returns the total data length of the skb. + * Returns: the total data length of the skb. */ static inline int nla_nest_end(struct sk_buff *skb, struct nlattr *start) { @@ -1861,6 +2278,20 @@ static inline void nla_nest_cancel(struct sk_buff *skb, struct nlattr *start) } /** + * nla_put_empty_nest - Create an empty nest + * @skb: socket buffer the message is stored in + * @attrtype: attribute type of the container + * + * This function is a helper for creating empty nests. + * + * Returns: 0 when successful or -EMSGSIZE on failure. + */ +static inline int nla_put_empty_nest(struct sk_buff *skb, int attrtype) +{ + return nla_nest_start(skb, attrtype) ? 0 : -EMSGSIZE; +} + +/** * __nla_validate_nested - Validate a stream of nested attributes * @start: container attribute * @maxtype: maximum attribute type to be expected @@ -1870,9 +2301,9 @@ static inline void nla_nest_cancel(struct sk_buff *skb, struct nlattr *start) * * Validates all attributes in the nested attribute stream against the * specified policy. Attributes with a type exceeding maxtype will be - * ignored. See documenation of struct nla_policy for more details. + * ignored. See documentation of struct nla_policy for more details. * - * Returns 0 on success or a negative error code. + * Returns: 0 on success or a negative error code. */ static inline int __nla_validate_nested(const struct nlattr *start, int maxtype, const struct nla_policy *policy, @@ -1905,7 +2336,7 @@ nla_validate_nested_deprecated(const struct nlattr *start, int maxtype, * nla_need_padding_for_64bit - test 64-bit alignment of the next attribute * @skb: socket buffer the message is stored in * - * Return true if padding is needed to align the next attribute (nla_data()) to + * Return: true if padding is needed to align the next attribute (nla_data()) to * a 64-bit aligned area. */ static inline bool nla_need_padding_for_64bit(struct sk_buff *skb) @@ -1932,7 +2363,7 @@ static inline bool nla_need_padding_for_64bit(struct sk_buff *skb) * This will only be done in architectures which do not have * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS defined. * - * Returns zero on success or a negative error code. + * Returns: zero on success or a negative error code. */ static inline int nla_align_64bit(struct sk_buff *skb, int padattr) { @@ -1969,6 +2400,18 @@ static inline int nla_total_size_64bit(int payload) pos = nla_next(pos, &(rem))) /** + * nla_for_each_attr_type - iterate over a stream of attributes + * @pos: loop counter, set to current attribute + * @type: required attribute type for @pos + * @head: head of attribute stream + * @len: length of attribute stream + * @rem: initialized to len, holds bytes currently remaining in stream + */ +#define nla_for_each_attr_type(pos, type, head, len, rem) \ + nla_for_each_attr(pos, head, len, rem) \ + if (nla_type(pos) == type) + +/** * nla_for_each_nested - iterate over nested attributes * @pos: loop counter, set to current attribute * @nla: attribute containing the nested attributes @@ -1978,6 +2421,17 @@ static inline int nla_total_size_64bit(int payload) nla_for_each_attr(pos, nla_data(nla), nla_len(nla), rem) /** + * nla_for_each_nested_type - iterate over nested attributes + * @pos: loop counter, set to current attribute + * @type: required attribute type for @pos + * @nla: attribute containing the nested attributes + * @rem: initialized to len, holds bytes currently remaining in stream + */ +#define nla_for_each_nested_type(pos, type, nla, rem) \ + nla_for_each_nested(pos, nla, rem) \ + if (nla_type(pos) == type) + +/** * nla_is_last - Test if attribute is last in stream * @nla: attribute to test * @rem: bytes remaining in stream diff --git a/include/net/netmem.h b/include/net/netmem.h new file mode 100644 index 000000000000..9e10f4ac50c3 --- /dev/null +++ b/include/net/netmem.h @@ -0,0 +1,414 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Network memory + * + * Author: Mina Almasry <almasrymina@google.com> + */ + +#ifndef _NET_NETMEM_H +#define _NET_NETMEM_H + +#include <linux/dma-mapping.h> +#include <linux/mm.h> +#include <net/net_debug.h> + +/* These fields in struct page are used by the page_pool and net stack: + * + * struct { + * unsigned long pp_magic; + * struct page_pool *pp; + * unsigned long _pp_mapping_pad; + * unsigned long dma_addr; + * atomic_long_t pp_ref_count; + * }; + * + * We mirror the page_pool fields here so the page_pool can access these + * fields without worrying whether the underlying fields belong to a + * page or netmem_desc. + * + * CAUTION: Do not update the fields in netmem_desc without also + * updating the anonymous aliasing union in struct net_iov. + */ +struct netmem_desc { + unsigned long _flags; + unsigned long pp_magic; + struct page_pool *pp; + unsigned long _pp_mapping_pad; + unsigned long dma_addr; + atomic_long_t pp_ref_count; +}; + +#define NETMEM_DESC_ASSERT_OFFSET(pg, desc) \ + static_assert(offsetof(struct page, pg) == \ + offsetof(struct netmem_desc, desc)) +NETMEM_DESC_ASSERT_OFFSET(flags, _flags); +NETMEM_DESC_ASSERT_OFFSET(pp_magic, pp_magic); +NETMEM_DESC_ASSERT_OFFSET(pp, pp); +NETMEM_DESC_ASSERT_OFFSET(_pp_mapping_pad, _pp_mapping_pad); +NETMEM_DESC_ASSERT_OFFSET(dma_addr, dma_addr); +NETMEM_DESC_ASSERT_OFFSET(pp_ref_count, pp_ref_count); +#undef NETMEM_DESC_ASSERT_OFFSET + +/* + * Since struct netmem_desc uses the space in struct page, the size + * should be checked, until struct netmem_desc has its own instance from + * slab, to avoid conflicting with other members within struct page. + */ +static_assert(sizeof(struct netmem_desc) <= offsetof(struct page, _refcount)); + +/* net_iov */ + +DECLARE_STATIC_KEY_FALSE(page_pool_mem_providers); + +/* We overload the LSB of the struct page pointer to indicate whether it's + * a page or net_iov. + */ +#define NET_IOV 0x01UL + +enum net_iov_type { + NET_IOV_DMABUF, + NET_IOV_IOURING, +}; + +/* A memory descriptor representing abstract networking I/O vectors, + * generally for non-pages memory that doesn't have its corresponding + * struct page and needs to be explicitly allocated through slab. + * + * net_iovs are allocated and used by networking code, and the size of + * the chunk is PAGE_SIZE. + * + * This memory can be any form of non-struct paged memory. Examples + * include imported dmabuf memory and imported io_uring memory. See + * net_iov_type for all the supported types. + * + * @pp_magic: pp field, similar to the one in struct page/struct + * netmem_desc. + * @pp: the pp this net_iov belongs to, if any. + * @dma_addr: the dma addrs of the net_iov. Needed for the network + * card to send/receive this net_iov. + * @pp_ref_count: the pp ref count of this net_iov, exactly the same + * usage as struct page/struct netmem_desc. + * @owner: the net_iov_area this net_iov belongs to, if any. + * @type: the type of the memory. Different types of net_iovs are + * supported. + */ +struct net_iov { + union { + struct netmem_desc desc; + + /* XXX: The following part should be removed once all + * the references to them are converted so as to be + * accessed via netmem_desc e.g. niov->desc.pp instead + * of niov->pp. + */ + struct { + unsigned long _flags; + unsigned long pp_magic; + struct page_pool *pp; + unsigned long _pp_mapping_pad; + unsigned long dma_addr; + atomic_long_t pp_ref_count; + }; + }; + struct net_iov_area *owner; + enum net_iov_type type; +}; + +struct net_iov_area { + /* Array of net_iovs for this area. */ + struct net_iov *niovs; + size_t num_niovs; + + /* Offset into the dma-buf where this chunk starts. */ + unsigned long base_virtual; +}; + +/* net_iov is union'ed with struct netmem_desc mirroring struct page, so + * the page_pool can access these fields without worrying whether the + * underlying fields are accessed via netmem_desc or directly via + * net_iov, until all the references to them are converted so as to be + * accessed via netmem_desc e.g. niov->desc.pp instead of niov->pp. + * + * The non-net stack fields of struct page are private to the mm stack + * and must never be mirrored to net_iov. + */ +#define NET_IOV_ASSERT_OFFSET(desc, iov) \ + static_assert(offsetof(struct netmem_desc, desc) == \ + offsetof(struct net_iov, iov)) +NET_IOV_ASSERT_OFFSET(_flags, _flags); +NET_IOV_ASSERT_OFFSET(pp_magic, pp_magic); +NET_IOV_ASSERT_OFFSET(pp, pp); +NET_IOV_ASSERT_OFFSET(_pp_mapping_pad, _pp_mapping_pad); +NET_IOV_ASSERT_OFFSET(dma_addr, dma_addr); +NET_IOV_ASSERT_OFFSET(pp_ref_count, pp_ref_count); +#undef NET_IOV_ASSERT_OFFSET + +static inline struct net_iov_area *net_iov_owner(const struct net_iov *niov) +{ + return niov->owner; +} + +static inline unsigned int net_iov_idx(const struct net_iov *niov) +{ + return niov - net_iov_owner(niov)->niovs; +} + +/* netmem */ + +/** + * typedef netmem_ref - a nonexistent type marking a reference to generic + * network memory. + * + * A netmem_ref can be a struct page* or a struct net_iov* underneath. + * + * Use the supplied helpers to obtain the underlying memory pointer and fields. + */ +typedef unsigned long __bitwise netmem_ref; + +static inline bool netmem_is_net_iov(const netmem_ref netmem) +{ + return (__force unsigned long)netmem & NET_IOV; +} + +/** + * __netmem_to_page - unsafely get pointer to the &page backing @netmem + * @netmem: netmem reference to convert + * + * Unsafe version of netmem_to_page(). When @netmem is always page-backed, + * e.g. when it's a header buffer, performs faster and generates smaller + * object code (no check for the LSB, no WARN). When @netmem points to IOV, + * provokes undefined behaviour. + * + * Return: pointer to the &page (garbage if @netmem is not page-backed). + */ +static inline struct page *__netmem_to_page(netmem_ref netmem) +{ + return (__force struct page *)netmem; +} + +static inline struct page *netmem_to_page(netmem_ref netmem) +{ + if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) + return NULL; + + return __netmem_to_page(netmem); +} + +static inline struct net_iov *netmem_to_net_iov(netmem_ref netmem) +{ + if (netmem_is_net_iov(netmem)) + return (struct net_iov *)((__force unsigned long)netmem & + ~NET_IOV); + + DEBUG_NET_WARN_ON_ONCE(true); + return NULL; +} + +static inline netmem_ref net_iov_to_netmem(struct net_iov *niov) +{ + return (__force netmem_ref)((unsigned long)niov | NET_IOV); +} + +#define page_to_netmem(p) (_Generic((p), \ + const struct page * : (__force const netmem_ref)(p), \ + struct page * : (__force netmem_ref)(p))) + +/** + * virt_to_netmem - convert virtual memory pointer to a netmem reference + * @data: host memory pointer to convert + * + * Return: netmem reference to the &page backing this virtual address. + */ +static inline netmem_ref virt_to_netmem(const void *data) +{ + return page_to_netmem(virt_to_page(data)); +} + +static inline int netmem_ref_count(netmem_ref netmem) +{ + /* The non-pp refcount of net_iov is always 1. On net_iov, we only + * support pp refcounting which uses the pp_ref_count field. + */ + if (netmem_is_net_iov(netmem)) + return 1; + + return page_ref_count(netmem_to_page(netmem)); +} + +static inline unsigned long netmem_pfn_trace(netmem_ref netmem) +{ + if (netmem_is_net_iov(netmem)) + return 0; + + return page_to_pfn(netmem_to_page(netmem)); +} + +/* XXX: How to extract netmem_desc from page must be changed, once + * netmem_desc no longer overlays on page and will be allocated through + * slab. + */ +#define __pp_page_to_nmdesc(p) (_Generic((p), \ + const struct page * : (const struct netmem_desc *)(p), \ + struct page * : (struct netmem_desc *)(p))) + +/* CAUTION: Check if the page is a pp page before calling this helper or + * know it's a pp page. + */ +#define pp_page_to_nmdesc(p) \ +({ \ + DEBUG_NET_WARN_ON_ONCE(!page_pool_page_is_pp(p)); \ + __pp_page_to_nmdesc(p); \ +}) + +/** + * __netmem_to_nmdesc - unsafely get pointer to the &netmem_desc backing + * @netmem + * @netmem: netmem reference to convert + * + * Unsafe version that can be used only when @netmem is always backed by + * system memory, performs faster and generates smaller object code (no + * check for the LSB, no WARN). When @netmem points to IOV, provokes + * undefined behaviour. + * + * Return: pointer to the &netmem_desc (garbage if @netmem is not backed + * by system memory). + */ +static inline struct netmem_desc *__netmem_to_nmdesc(netmem_ref netmem) +{ + return (__force struct netmem_desc *)netmem; +} + +/* netmem_to_nmdesc - convert netmem_ref to struct netmem_desc * for + * access to common fields. + * @netmem: netmem reference to get netmem_desc. + * + * All the sub types of netmem_ref (netmem_desc, net_iov) have the same + * pp, pp_magic, dma_addr, and pp_ref_count fields via netmem_desc. + * + * Return: the pointer to struct netmem_desc * regardless of its + * underlying type. + */ +static inline struct netmem_desc *netmem_to_nmdesc(netmem_ref netmem) +{ + void *p = (void *)((__force unsigned long)netmem & ~NET_IOV); + + if (netmem_is_net_iov(netmem)) + return &((struct net_iov *)p)->desc; + + return __pp_page_to_nmdesc((struct page *)p); +} + +/** + * __netmem_get_pp - unsafely get pointer to the &page_pool backing @netmem + * @netmem: netmem reference to get the pointer from + * + * Unsafe version of netmem_get_pp(). When @netmem is always page-backed, + * e.g. when it's a header buffer, performs faster and generates smaller + * object code (avoids clearing the LSB). When @netmem points to IOV, + * provokes invalid memory access. + * + * Return: pointer to the &page_pool (garbage if @netmem is not page-backed). + */ +static inline struct page_pool *__netmem_get_pp(netmem_ref netmem) +{ + return __netmem_to_nmdesc(netmem)->pp; +} + +static inline struct page_pool *netmem_get_pp(netmem_ref netmem) +{ + return netmem_to_nmdesc(netmem)->pp; +} + +static inline atomic_long_t *netmem_get_pp_ref_count_ref(netmem_ref netmem) +{ + return &netmem_to_nmdesc(netmem)->pp_ref_count; +} + +static inline bool netmem_is_pref_nid(netmem_ref netmem, int pref_nid) +{ + /* NUMA node preference only makes sense if we're allocating + * system memory. Memory providers (which give us net_iovs) + * choose for us. + */ + if (netmem_is_net_iov(netmem)) + return true; + + return page_to_nid(netmem_to_page(netmem)) == pref_nid; +} + +static inline netmem_ref netmem_compound_head(netmem_ref netmem) +{ + /* niov are never compounded */ + if (netmem_is_net_iov(netmem)) + return netmem; + + return page_to_netmem(compound_head(netmem_to_page(netmem))); +} + +/** + * __netmem_address - unsafely get pointer to the memory backing @netmem + * @netmem: netmem reference to get the pointer for + * + * Unsafe version of netmem_address(). When @netmem is always page-backed, + * e.g. when it's a header buffer, performs faster and generates smaller + * object code (no check for the LSB). When @netmem points to IOV, provokes + * undefined behaviour. + * + * Return: pointer to the memory (garbage if @netmem is not page-backed). + */ +static inline void *__netmem_address(netmem_ref netmem) +{ + return page_address(__netmem_to_page(netmem)); +} + +static inline void *netmem_address(netmem_ref netmem) +{ + if (netmem_is_net_iov(netmem)) + return NULL; + + return __netmem_address(netmem); +} + +/** + * netmem_is_pfmemalloc - check if @netmem was allocated under memory pressure + * @netmem: netmem reference to check + * + * Return: true if @netmem is page-backed and the page was allocated under + * memory pressure, false otherwise. + */ +static inline bool netmem_is_pfmemalloc(netmem_ref netmem) +{ + if (netmem_is_net_iov(netmem)) + return false; + + return page_is_pfmemalloc(netmem_to_page(netmem)); +} + +static inline unsigned long netmem_get_dma_addr(netmem_ref netmem) +{ + return netmem_to_nmdesc(netmem)->dma_addr; +} + +void get_netmem(netmem_ref netmem); +void put_netmem(netmem_ref netmem); + +#define netmem_dma_unmap_addr_set(NETMEM, PTR, ADDR_NAME, VAL) \ + do { \ + if (!netmem_is_net_iov(NETMEM)) \ + dma_unmap_addr_set(PTR, ADDR_NAME, VAL); \ + else \ + dma_unmap_addr_set(PTR, ADDR_NAME, 0); \ + } while (0) + +static inline void netmem_dma_unmap_page_attrs(struct device *dev, + dma_addr_t addr, size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + if (!addr) + return; + + dma_unmap_page_attrs(dev, addr, size, dir, attrs); +} + +#endif /* _NET_NETMEM_H */ diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index 1f463b3957c7..ab74b5ed0b01 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -7,9 +7,6 @@ #include <linux/atomic.h> #include <linux/workqueue.h> #include <linux/netfilter/nf_conntrack_tcp.h> -#ifdef CONFIG_NF_CT_PROTO_DCCP -#include <linux/netfilter/nf_conntrack_dccp.h> -#endif #ifdef CONFIG_NF_CT_PROTO_SCTP #include <linux/netfilter/nf_conntrack_sctp.h> #endif @@ -50,13 +47,6 @@ struct nf_icmp_net { unsigned int timeout; }; -#ifdef CONFIG_NF_CT_PROTO_DCCP -struct nf_dccp_net { - u8 dccp_loose; - unsigned int dccp_timeout[CT_DCCP_MAX + 1]; -}; -#endif - #ifdef CONFIG_NF_CT_PROTO_SCTP struct nf_sctp_net { unsigned int timeouts[SCTP_CONNTRACK_MAX]; @@ -82,9 +72,6 @@ struct nf_ip_net { struct nf_udp_net udp; struct nf_icmp_net icmp; struct nf_icmp_net icmpv6; -#ifdef CONFIG_NF_CT_PROTO_DCCP - struct nf_dccp_net dccp; -#endif #ifdef CONFIG_NF_CT_PROTO_SCTP struct nf_sctp_net sctp; #endif @@ -107,7 +94,7 @@ struct netns_ct { struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb; struct nf_ip_net nf_ct_proto; #if defined(CONFIG_NF_CONNTRACK_LABELS) - unsigned int labels_used; + atomic_t labels_used; #endif }; #endif diff --git a/include/net/netns/core.h b/include/net/netns/core.h index a91ef9f8de60..9ef3d70e5e9c 100644 --- a/include/net/netns/core.h +++ b/include/net/netns/core.h @@ -13,7 +13,11 @@ struct netns_core { struct ctl_table_header *sysctl_hdr; int sysctl_somaxconn; + int sysctl_txq_reselection; + int sysctl_optmem_max; u8 sysctl_txrehash; + u8 sysctl_tstamp_allow_data; + u8 sysctl_bypass_prot_mem; #ifdef CONFIG_PROC_FS struct prot_inuse __percpu *prot_inuse; diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 7a41c4791536..2dbd46fc4734 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -19,8 +19,7 @@ struct hlist_head; struct fib_table; struct sock; struct local_ports { - seqlock_t lock; - int range[2]; + u32 range; /* high << 16 | low */ bool warned; }; @@ -41,10 +40,62 @@ struct inet_timewait_death_row { struct tcp_fastopen_context; +#ifdef CONFIG_IP_ROUTE_MULTIPATH +struct sysctl_fib_multipath_hash_seed { + u32 user_seed; + u32 mp_seed; +}; +#endif + +struct udp_tunnel_gro { + struct sock __rcu *sk; + struct hlist_head list; +}; + struct netns_ipv4 { + /* Cacheline organization can be found documented in + * Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst. + * Please update the document when adding new fields. + */ + + /* TX readonly hotpath cache lines */ + __cacheline_group_begin(netns_ipv4_read_tx); + u8 sysctl_tcp_early_retrans; + u8 sysctl_tcp_tso_win_divisor; + u8 sysctl_tcp_tso_rtt_log; + u8 sysctl_tcp_autocorking; + int sysctl_tcp_min_snd_mss; + unsigned int sysctl_tcp_notsent_lowat; + int sysctl_tcp_limit_output_bytes; + int sysctl_tcp_min_rtt_wlen; + int sysctl_tcp_wmem[3]; + u8 sysctl_ip_fwd_use_pmtu; + __cacheline_group_end(netns_ipv4_read_tx); + + /* TXRX readonly hotpath cache lines */ + __cacheline_group_begin(netns_ipv4_read_txrx); + __cacheline_group_end(netns_ipv4_read_txrx); + + /* RX readonly hotpath cache line */ + __cacheline_group_begin(netns_ipv4_read_rx); + u8 sysctl_tcp_moderate_rcvbuf; + u8 sysctl_ip_early_demux; + u8 sysctl_tcp_early_demux; + u8 sysctl_tcp_l3mdev_accept; + /* 3 bytes hole, try to pack */ + int sysctl_tcp_reordering; + int sysctl_tcp_rmem[3]; + int sysctl_tcp_rcvbuf_low_rtt; + __cacheline_group_end(netns_ipv4_read_rx); + struct inet_timewait_death_row tcp_death_row; struct udp_table *udp_table; +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) + /* Not in a pernet subsys because need to be available at GRO stage */ + struct udp_tunnel_gro udp_tunnel_gro[2]; +#endif + #ifdef CONFIG_SYSCTL struct ctl_table_header *forw_hdr; struct ctl_table_header *frags_hdr; @@ -71,6 +122,9 @@ struct netns_ipv4 { #endif struct hlist_head *fib_table_hash; struct sock *fibnl; + struct hlist_head *fib_info_hash; + unsigned int fib_info_hash_bits; + unsigned int fib_info_cnt; struct sock *mc_autojoin_sk; @@ -82,9 +136,13 @@ struct netns_ipv4 { u8 sysctl_icmp_echo_ignore_broadcasts; u8 sysctl_icmp_ignore_bogus_error_responses; u8 sysctl_icmp_errors_use_inbound_ifaddr; + u8 sysctl_icmp_errors_extension_mask; int sysctl_icmp_ratelimit; int sysctl_icmp_ratemask; - + int sysctl_icmp_msgs_per_sec; + int sysctl_icmp_msgs_burst; + atomic_t icmp_global_credit; + u32 icmp_global_stamp; u32 ip_rt_min_pmtu; int ip_rt_mtu_expires; int ip_rt_min_advmss; @@ -92,34 +150,29 @@ struct netns_ipv4 { struct local_ports ip_local_ports; u8 sysctl_tcp_ecn; + u8 sysctl_tcp_ecn_option; + u8 sysctl_tcp_ecn_option_beacon; u8 sysctl_tcp_ecn_fallback; u8 sysctl_ip_default_ttl; u8 sysctl_ip_no_pmtu_disc; - u8 sysctl_ip_fwd_use_pmtu; u8 sysctl_ip_fwd_update_priority; u8 sysctl_ip_nonlocal_bind; u8 sysctl_ip_autobind_reuse; /* Shall we try to damage output packets if routing dev changes? */ u8 sysctl_ip_dynaddr; - u8 sysctl_ip_early_demux; #ifdef CONFIG_NET_L3_MASTER_DEV u8 sysctl_raw_l3mdev_accept; #endif - u8 sysctl_tcp_early_demux; u8 sysctl_udp_early_demux; u8 sysctl_nexthop_compat_mode; u8 sysctl_fwmark_reflect; u8 sysctl_tcp_fwmark_accept; -#ifdef CONFIG_NET_L3_MASTER_DEV - u8 sysctl_tcp_l3mdev_accept; -#endif u8 sysctl_tcp_mtu_probing; int sysctl_tcp_mtu_probe_floor; int sysctl_tcp_base_mss; - int sysctl_tcp_min_snd_mss; int sysctl_tcp_probe_threshold; u32 sysctl_tcp_probe_interval; @@ -132,17 +185,20 @@ struct netns_ipv4 { u8 sysctl_tcp_syncookies; u8 sysctl_tcp_migrate_req; u8 sysctl_tcp_comp_sack_nr; - int sysctl_tcp_reordering; + u8 sysctl_tcp_backlog_ack_defer; + u8 sysctl_tcp_pingpong_thresh; + u8 sysctl_tcp_retries1; u8 sysctl_tcp_retries2; u8 sysctl_tcp_orphan_retries; u8 sysctl_tcp_tw_reuse; + unsigned int sysctl_tcp_tw_reuse_delay; int sysctl_tcp_fin_timeout; - unsigned int sysctl_tcp_notsent_lowat; u8 sysctl_tcp_sack; u8 sysctl_tcp_window_scaling; u8 sysctl_tcp_timestamps; - u8 sysctl_tcp_early_retrans; + int sysctl_tcp_rto_min_us; + int sysctl_tcp_rto_max_ms; u8 sysctl_tcp_recovery; u8 sysctl_tcp_thin_linear_timeouts; u8 sysctl_tcp_slow_start_after_idle; @@ -158,22 +214,15 @@ struct netns_ipv4 { u8 sysctl_tcp_frto; u8 sysctl_tcp_nometrics_save; u8 sysctl_tcp_no_ssthresh_metrics_save; - u8 sysctl_tcp_moderate_rcvbuf; - u8 sysctl_tcp_tso_win_divisor; u8 sysctl_tcp_workaround_signed_windows; - int sysctl_tcp_limit_output_bytes; int sysctl_tcp_challenge_ack_limit; - int sysctl_tcp_min_rtt_wlen; u8 sysctl_tcp_min_tso_segs; - u8 sysctl_tcp_tso_rtt_log; - u8 sysctl_tcp_autocorking; u8 sysctl_tcp_reflect_tos; int sysctl_tcp_invalid_ratelimit; int sysctl_tcp_pacing_ss_ratio; int sysctl_tcp_pacing_ca_ratio; - int sysctl_tcp_wmem[3]; - int sysctl_tcp_rmem[3]; unsigned int sysctl_tcp_child_ehash_entries; + int sysctl_tcp_comp_sack_rtt_percent; unsigned long sysctl_tcp_comp_sack_delay_ns; unsigned long sysctl_tcp_comp_sack_slack_ns; int sysctl_max_syn_backlog; @@ -207,6 +256,7 @@ struct netns_ipv4 { int sysctl_igmp_qrv; struct ping_group_range ping_group_range; + u16 ping_port_rover; atomic_t dev_addr_genid; @@ -226,18 +276,22 @@ struct netns_ipv4 { #endif #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH + struct sysctl_fib_multipath_hash_seed sysctl_fib_multipath_hash_seed; u32 sysctl_fib_multipath_hash_fields; u8 sysctl_fib_multipath_use_neigh; u8 sysctl_fib_multipath_hash_policy; #endif struct fib_notifier_ops *notifier_ops; - unsigned int fib_seq; /* protected by rtnl_mutex */ + unsigned int fib_seq; /* writes protected by rtnl_mutex */ struct fib_notifier_ops *ipmr_notifier_ops; unsigned int ipmr_seq; /* protected by rtnl_mutex */ atomic_t rt_genid; siphash_key_t ip_id_key; + struct hlist_head *inet_addr_lst; + struct delayed_work addr_chk_work; }; + #endif diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 5f2cfd84570a..08d2ecc96e2b 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -56,6 +56,7 @@ struct netns_sysctl_ipv6 { u8 skip_notify_on_dev_down; u8 fib_notify_on_flag_change; u8 icmpv6_error_anycast_as_unicast; + u8 icmpv6_errors_extension_mask; }; struct netns_ipv6 { @@ -72,6 +73,7 @@ struct netns_ipv6 { struct rt6_statistics *rt6_stats; struct timer_list ip6_fib_timer; struct hlist_head *fib_table_hash; + spinlock_t fib_table_hash_lock; struct fib6_table *fib6_main_tbl; struct list_head fib6_walkers; rwlock_t fib6_walker_lock; diff --git a/include/net/netns/mctp.h b/include/net/netns/mctp.h index 1db8f9aaddb4..89555f90b97b 100644 --- a/include/net/netns/mctp.h +++ b/include/net/netns/mctp.h @@ -6,19 +6,25 @@ #ifndef __NETNS_MCTP_H__ #define __NETNS_MCTP_H__ +#include <linux/hash.h> +#include <linux/hashtable.h> #include <linux/mutex.h> #include <linux/types.h> +#define MCTP_BINDS_BITS 7 + struct netns_mctp { /* Only updated under RTNL, entries freed via RCU */ struct list_head routes; - /* Bound sockets: list of sockets bound by type. - * This list is updated from non-atomic contexts (under bind_lock), - * and read (under rcu) in packet rx + /* Bound sockets: hash table of sockets, keyed by + * (type, src_eid, dest_eid). + * Specific src_eid/dest_eid entries also have an entry for + * MCTP_ADDR_ANY. This list is updated from non-atomic contexts + * (under bind_lock), and read (under rcu) in packet rx. */ struct mutex bind_lock; - struct hlist_head binds; + DECLARE_HASHTABLE(binds, MCTP_BINDS_BITS); /* tag allocations. This list is read and updated from atomic contexts, * but elements are free()ed after a RCU grace-period @@ -34,4 +40,10 @@ struct netns_mctp { struct list_head neighbours; }; +static inline u32 mctp_bind_hash(u8 type, u8 local_addr, u8 peer_addr) +{ + return hash_32(type | (u32)local_addr << 8 | (u32)peer_addr << 16, + MCTP_BINDS_BITS); +} + #endif /* __NETNS_MCTP_H__ */ diff --git a/include/net/netns/mpls.h b/include/net/netns/mpls.h index 19ad2574b267..6682e51513ef 100644 --- a/include/net/netns/mpls.h +++ b/include/net/netns/mpls.h @@ -16,6 +16,7 @@ struct netns_mpls { int default_ttl; size_t platform_labels; struct mpls_route __rcu * __rcu *platform_label; + struct mutex platform_mutex; struct ctl_table_header *ctl; }; diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h index 02bbdc577f8e..a6a0bf4a247e 100644 --- a/include/net/netns/netfilter.h +++ b/include/net/netns/netfilter.h @@ -15,6 +15,9 @@ struct netns_nf { const struct nf_logger __rcu *nf_loggers[NFPROTO_NUMPROTO]; #ifdef CONFIG_SYSCTL struct ctl_table_header *nf_log_dir_header; +#ifdef CONFIG_LWTUNNEL + struct ctl_table_header *nf_lwtnl_dir_header; +#endif #endif struct nf_hook_entries __rcu *hooks_ipv4[NF_INET_NUMHOOKS]; struct nf_hook_entries __rcu *hooks_ipv6[NF_INET_NUMHOOKS]; diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h index cc8060c017d5..99dd166c5d07 100644 --- a/include/net/netns/nftables.h +++ b/include/net/netns/nftables.h @@ -3,6 +3,7 @@ #define _NETNS_NFTABLES_H_ struct netns_nftables { + unsigned int base_seq; u8 gencursor; }; diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h index 7eff3d981b89..c0f97f36389e 100644 --- a/include/net/netns/sctp.h +++ b/include/net/netns/sctp.h @@ -75,8 +75,8 @@ struct netns_sctp { /* Whether Cookie Preservative is enabled(1) or not(0) */ int cookie_preserve_enable; - /* The namespace default hmac alg */ - char *sctp_hmac_alg; + /* Whether cookie authentication is enabled(1) or not(0) */ + int cookie_auth_enable; /* Valid.Cookie.Life - 60 seconds */ unsigned int valid_cookie_life; @@ -125,14 +125,14 @@ struct netns_sctp { int pf_expose; /* - * Policy for preforming sctp/socket accounting + * Policy for performing sctp/socket accounting * 0 - do socket level accounting, all assocs share sk_sndbuf * 1 - do sctp accounting, each asoc may use sk_sndbuf bytes */ int sndbuf_policy; /* - * Policy for preforming sctp/socket accounting + * Policy for performing sctp/socket accounting * 0 - do socket level accounting, all assocs share sk_rcvbuf * 1 - do sctp accounting, each asoc may use sk_rcvbuf bytes */ diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 582212ada3ba..ed24c9f638ee 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -17,10 +17,17 @@ struct netns_smc { #ifdef CONFIG_SYSCTL struct ctl_table_header *smc_hdr; #endif +#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) + struct smc_hs_ctrl __rcu *hs_ctrl; +#endif /* CONFIG_SMC_HS_CTRL_BPF */ unsigned int sysctl_autocorking_size; unsigned int sysctl_smcr_buf_type; int sysctl_smcr_testlink_time; int sysctl_wmem; int sysctl_rmem; + int sysctl_max_links_per_lgr; + int sysctl_max_conns_per_lgr; + unsigned int sysctl_smcr_max_send_wr; + unsigned int sysctl_smcr_max_recv_wr; }; #endif diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index bd7c3be4af5d..23dd647fe024 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -43,6 +43,7 @@ struct netns_xfrm { struct hlist_head __rcu *state_bysrc; struct hlist_head __rcu *state_byspi; struct hlist_head __rcu *state_byseq; + struct hlist_head __percpu *state_cache_input; unsigned int state_hmask; unsigned int state_num; struct work_struct state_hash_work; @@ -50,7 +51,7 @@ struct netns_xfrm { struct list_head policy_all; struct hlist_head *policy_byidx; unsigned int policy_idx_hmask; - struct hlist_head policy_inexact[XFRM_POLICY_MAX]; + unsigned int idx_generator; struct xfrm_policy_hash policy_bydst[XFRM_POLICY_MAX]; unsigned int policy_count[XFRM_POLICY_MAX * 2]; struct work_struct policy_hash_work; @@ -82,6 +83,7 @@ struct netns_xfrm { spinlock_t xfrm_policy_lock; struct mutex xfrm_cfg_mutex; + struct delayed_work nat_keepalive_work; }; #endif diff --git a/include/net/nexthop.h b/include/net/nexthop.h index 2b12725de9c0..572e69cda476 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -47,6 +47,8 @@ struct nh_config { bool nh_grp_res_has_idle_timer; bool nh_grp_res_has_unbalanced_timer; + bool nh_hw_stats; + struct nlattr *nh_encap; u16 nh_encap_type; @@ -92,12 +94,18 @@ struct nh_res_table { u32 unbalanced_timer; u16 num_nh_buckets; - struct nh_res_bucket nh_buckets[]; + struct nh_res_bucket nh_buckets[] __counted_by(num_nh_buckets); +}; + +struct nh_grp_entry_stats { + u64_stats_t packets; + struct u64_stats_sync syncp; }; struct nh_grp_entry { struct nexthop *nh; - u8 weight; + struct nh_grp_entry_stats __percpu *stats; + u16 weight; union { struct { @@ -114,6 +122,7 @@ struct nh_grp_entry { struct list_head nh_list; struct nexthop *nh_parent; /* nexthop of group with this entry */ + u64 packets_hw; }; struct nh_group { @@ -124,9 +133,10 @@ struct nh_group { bool resilient; bool fdb_nh; bool has_v4; + bool hw_stats; struct nh_res_table __rcu *res_table; - struct nh_grp_entry nh_entries[]; + struct nh_grp_entry nh_entries[] __counted_by(num_nh); }; struct nexthop { @@ -142,6 +152,8 @@ struct nexthop { u8 protocol; /* app managing this nh */ u8 nh_flags; bool is_group; + bool dead; + spinlock_t lock; /* protect dead and f6i_list */ refcount_t refcnt; struct rcu_head rcu; @@ -157,6 +169,7 @@ enum nexthop_event_type { NEXTHOP_EVENT_REPLACE, NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE, NEXTHOP_EVENT_BUCKET_REPLACE, + NEXTHOP_EVENT_HW_STATS_REPORT_DELTA, }; enum nh_notifier_info_type { @@ -164,6 +177,7 @@ enum nh_notifier_info_type { NH_NOTIFIER_INFO_TYPE_GRP, NH_NOTIFIER_INFO_TYPE_RES_TABLE, NH_NOTIFIER_INFO_TYPE_RES_BUCKET, + NH_NOTIFIER_INFO_TYPE_GRP_HW_STATS, }; struct nh_notifier_single_info { @@ -173,21 +187,22 @@ struct nh_notifier_single_info { __be32 ipv4; struct in6_addr ipv6; }; + u32 id; u8 is_reject:1, is_fdb:1, has_encap:1; }; struct nh_notifier_grp_entry_info { - u8 weight; - u32 id; + u16 weight; struct nh_notifier_single_info nh; }; struct nh_notifier_grp_info { u16 num_nh; bool is_fdb; - struct nh_notifier_grp_entry_info nh_entries[]; + bool hw_stats; + struct nh_notifier_grp_entry_info nh_entries[] __counted_by(num_nh); }; struct nh_notifier_res_bucket_info { @@ -200,7 +215,19 @@ struct nh_notifier_res_bucket_info { struct nh_notifier_res_table_info { u16 num_nh_buckets; - struct nh_notifier_single_info nhs[]; + bool hw_stats; + struct nh_notifier_single_info nhs[] __counted_by(num_nh_buckets); +}; + +struct nh_notifier_grp_hw_stats_entry_info { + u32 id; + u64 packets; +}; + +struct nh_notifier_grp_hw_stats_info { + u16 num_nh; + bool hw_stats_used; + struct nh_notifier_grp_hw_stats_entry_info stats[] __counted_by(num_nh); }; struct nh_notifier_info { @@ -213,17 +240,22 @@ struct nh_notifier_info { struct nh_notifier_grp_info *nh_grp; struct nh_notifier_res_table_info *nh_res_table; struct nh_notifier_res_bucket_info *nh_res_bucket; + struct nh_notifier_grp_hw_stats_info *nh_grp_hw_stats; }; }; int register_nexthop_notifier(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); +int __unregister_nexthop_notifier(struct net *net, struct notifier_block *nb); int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb); void nexthop_set_hw_flags(struct net *net, u32 id, bool offload, bool trap); void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index, bool offload, bool trap); void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets, unsigned long *activity); +void nh_grp_hw_stats_report_delta(struct nh_notifier_grp_hw_stats_info *info, + unsigned int nh_idx, + u64 delta_packets); /* caller is holding rcu or rtnl; no reference taken to nexthop */ struct nexthop *nexthop_find_by_id(struct net *net, u32 id); @@ -237,7 +269,7 @@ static inline bool nexthop_get(struct nexthop *nh) static inline void nexthop_put(struct nexthop *nh) { if (refcount_dec_and_test(&nh->refcnt)) - call_rcu(&nh->rcu, nexthop_free_rcu); + call_rcu_hurry(&nh->rcu, nexthop_free_rcu); } static inline bool nexthop_cmp(const struct nexthop *nh1, @@ -316,7 +348,7 @@ static inline int nexthop_mpath_fill_node(struct sk_buff *skb, struct nexthop *nh, u8 rt_family) { - struct nh_group *nhg = rtnl_dereference(nh->nh_grp); + struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp); int i; for (i = 0; i < nhg->num_nh; i++) { diff --git a/include/net/nfc/nci.h b/include/net/nfc/nci.h index e82f55f543bb..09efcaed7c3f 100644 --- a/include/net/nfc/nci.h +++ b/include/net/nfc/nci.h @@ -332,7 +332,7 @@ struct nci_core_init_rsp_1 { __le32 nfcc_features; __u8 num_supported_rf_interfaces; __u8 supported_rf_interfaces[]; /* variable size array */ - /* continuted in nci_core_init_rsp_2 */ + /* continued in nci_core_init_rsp_2 */ } __packed; struct nci_core_init_rsp_2 { @@ -475,7 +475,7 @@ struct nci_rf_discover_ntf { #define NCI_OP_RF_INTF_ACTIVATED_NTF nci_opcode_pack(NCI_GID_RF_MGMT, 0x05) struct activation_params_nfca_poll_iso_dep { __u8 rats_res_len; - __u8 rats_res[20]; + __u8 rats_res[NFC_ATS_MAXSIZE]; }; struct activation_params_nfcb_poll_iso_dep { diff --git a/include/net/nfc/nci_core.h b/include/net/nfc/nci_core.h index ea8595651c38..664d5058e66e 100644 --- a/include/net/nfc/nci_core.h +++ b/include/net/nfc/nci_core.h @@ -52,7 +52,7 @@ enum nci_state { #define NCI_RF_DISC_SELECT_TIMEOUT 5000 #define NCI_RF_DEACTIVATE_TIMEOUT 30000 #define NCI_CMD_TIMEOUT 5000 -#define NCI_DATA_TIMEOUT 700 +#define NCI_DATA_TIMEOUT 3000 struct nci_dev; @@ -265,6 +265,10 @@ struct nci_dev { /* stored during intf_activated_ntf */ __u8 remote_gb[NFC_MAX_GT_LEN]; __u8 remote_gb_len; + + /* stored during intf_activated_ntf */ + __u8 target_ats[NFC_ATS_MAXSIZE]; + __u8 target_ats_len; }; /* ----- NCI Devices ----- */ diff --git a/include/net/nfc/nfc.h b/include/net/nfc/nfc.h index 5dee575fbe86..127e6c7d910d 100644 --- a/include/net/nfc/nfc.h +++ b/include/net/nfc/nfc.h @@ -80,12 +80,14 @@ struct nfc_ops { #define NFC_ATR_REQ_GT_OFFSET 14 /** - * struct nfc_target - NFC target descriptiom + * struct nfc_target - NFC target description * * @sens_res: 2 bytes describing the target SENS_RES response, if the target * is a type A one. The %sens_res most significant byte must be byte 2 * as described by the NFC Forum digital specification (i.e. the platform * configuration one) while %sens_res least significant byte is byte 1. + * @ats_len: length of Answer To Select in bytes + * @ats: Answer To Select returned by an ISO 14443 Type A target upon activation */ struct nfc_target { u32 idx; @@ -105,6 +107,8 @@ struct nfc_target { u8 is_iso15693; u8 iso15693_dsfid; u8 iso15693_uid[NFC_ISO15693_UID_MAXSIZE]; + u8 ats_len; + u8 ats[NFC_ATS_MAXSIZE]; }; /** @@ -196,7 +200,7 @@ struct nfc_dev { }; #define to_nfc_dev(_dev) container_of(_dev, struct nfc_dev, dev) -extern struct class nfc_class; +extern const struct class nfc_class; struct nfc_dev *nfc_allocate_device(const struct nfc_ops *ops, u32 supported_protocols, @@ -230,10 +234,10 @@ static inline void nfc_set_parent_dev(struct nfc_dev *nfc_dev, } /** - * nfc_set_drvdata - set driver specifc data + * nfc_set_drvdata - set driver specific data * * @dev: The nfc device - * @data: Pointer to driver specifc data + * @data: Pointer to driver specific data */ static inline void nfc_set_drvdata(struct nfc_dev *dev, void *data) { @@ -241,7 +245,7 @@ static inline void nfc_set_drvdata(struct nfc_dev *dev, void *data) } /** - * nfc_get_drvdata - get driver specifc data + * nfc_get_drvdata - get driver specific data * * @dev: The nfc device */ diff --git a/include/net/nl802154.h b/include/net/nl802154.h index 8cd9d141f5af..442822746e92 100644 --- a/include/net/nl802154.h +++ b/include/net/nl802154.h @@ -78,6 +78,10 @@ enum nl802154_commands { NL802154_CMD_SCAN_DONE, NL802154_CMD_SEND_BEACONS, NL802154_CMD_STOP_BEACONS, + NL802154_CMD_ASSOCIATE, + NL802154_CMD_DISASSOCIATE, + NL802154_CMD_SET_MAX_ASSOCIATIONS, + NL802154_CMD_LIST_ASSOCIATIONS, /* add new commands above here */ @@ -147,6 +151,8 @@ enum nl802154_attrs { NL802154_ATTR_SCAN_DURATION, NL802154_ATTR_SCAN_DONE_REASON, NL802154_ATTR_BEACON_INTERVAL, + NL802154_ATTR_MAX_ASSOCIATIONS, + NL802154_ATTR_PEER, /* add attributes here, update the policy in nl802154.c */ @@ -185,14 +191,12 @@ enum nl802154_iftype { * @NL802154_CAP_ATTR_CHANNELS: a nested attribute for nl802154_channel_attr * @NL802154_CAP_ATTR_TX_POWERS: a nested attribute for * nl802154_wpan_phy_tx_power - * @NL802154_CAP_ATTR_MIN_CCA_ED_LEVEL: minimum value for cca_ed_level - * @NL802154_CAP_ATTR_MAX_CCA_ED_LEVEL: maxmimum value for cca_ed_level * @NL802154_CAP_ATTR_CCA_MODES: nl802154_cca_modes flags * @NL802154_CAP_ATTR_CCA_OPTS: nl802154_cca_opts flags * @NL802154_CAP_ATTR_MIN_MINBE: minimum of minbe value * @NL802154_CAP_ATTR_MAX_MINBE: maximum of minbe value * @NL802154_CAP_ATTR_MIN_MAXBE: minimum of maxbe value - * @NL802154_CAP_ATTR_MAX_MINBE: maximum of maxbe value + * @NL802154_CAP_ATTR_MAX_MAXBE: maximum of maxbe value * @NL802154_CAP_ATTR_MIN_CSMA_BACKOFFS: minimum of csma backoff value * @NL802154_CAP_ATTR_MAX_CSMA_BACKOFFS: maximum of csma backoffs value * @NL802154_CAP_ATTR_MIN_FRAME_RETRIES: minimum of frame retries value @@ -358,6 +362,7 @@ enum nl802154_cca_opts { NL802154_CCA_OPT_ENERGY_CARRIER_AND, NL802154_CCA_OPT_ENERGY_CARRIER_OR, + /* private: */ /* keep last */ __NL802154_CCA_OPT_ATTR_AFTER_LAST, NL802154_CCA_OPT_ATTR_MAX = __NL802154_CCA_OPT_ATTR_AFTER_LAST - 1 @@ -385,8 +390,6 @@ enum nl802154_supported_bool_states { NL802154_SUPPORTED_BOOL_MAX = __NL802154_SUPPORTED_BOOL_AFTER_LAST - 1 }; -#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL - enum nl802154_dev_addr_modes { NL802154_DEV_ADDR_NONE, __NL802154_DEV_ADDR_INVALID, @@ -406,12 +409,26 @@ enum nl802154_dev_addr_attrs { NL802154_DEV_ADDR_ATTR_SHORT, NL802154_DEV_ADDR_ATTR_EXTENDED, NL802154_DEV_ADDR_ATTR_PAD, + NL802154_DEV_ADDR_ATTR_PEER_TYPE, /* keep last */ __NL802154_DEV_ADDR_ATTR_AFTER_LAST, NL802154_DEV_ADDR_ATTR_MAX = __NL802154_DEV_ADDR_ATTR_AFTER_LAST - 1 }; +enum nl802154_peer_type { + NL802154_PEER_TYPE_UNSPEC, + + NL802154_PEER_TYPE_PARENT, + NL802154_PEER_TYPE_CHILD, + + /* keep last */ + __NL802154_PEER_TYPE_AFTER_LAST, + NL802154_PEER_TYPE_MAX = __NL802154_PEER_TYPE_AFTER_LAST - 1 +}; + +#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL + enum nl802154_key_id_modes { NL802154_KEY_ID_MODE_IMPLICIT, NL802154_KEY_ID_MODE_INDEX, diff --git a/include/net/p8022.h b/include/net/p8022.h deleted file mode 100644 index a29e224ac498..000000000000 --- a/include/net/p8022.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NET_P8022_H -#define _NET_P8022_H - -struct net_device; -struct packet_type; -struct sk_buff; - -struct datalink_proto * -register_8022_client(unsigned char type, - int (*func)(struct sk_buff *skb, - struct net_device *dev, - struct packet_type *pt, - struct net_device *orig_dev)); -void unregister_8022_client(struct datalink_proto *proto); -#endif diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index 94231533a369..3247026e096a 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -8,40 +8,63 @@ /** * DOC: page_pool allocator * - * The page_pool allocator is optimized for the XDP mode that - * uses one frame per-page, but it can fallback on the - * regular page allocator APIs. - * - * Basic use involves replacing alloc_pages() calls with the - * page_pool_alloc_pages() call. Drivers should use - * page_pool_dev_alloc_pages() replacing dev_alloc_pages(). - * - * API keeps track of in-flight pages, in order to let API user know - * when it is safe to free a page_pool object. Thus, API users - * must call page_pool_put_page() to free the page, or attach - * the page to a page_pool-aware objects like skbs marked with + * The page_pool allocator is optimized for recycling page or page fragment used + * by skb packet and xdp frame. + * + * Basic use involves replacing any alloc_pages() calls with page_pool_alloc(), + * which allocate memory with or without page splitting depending on the + * requested memory size. + * + * If the driver knows that it always requires full pages or its allocations are + * always smaller than half a page, it can use one of the more specific API + * calls: + * + * 1. page_pool_alloc_pages(): allocate memory without page splitting when + * driver knows that the memory it need is always bigger than half of the page + * allocated from page pool. There is no cache line dirtying for 'struct page' + * when a page is recycled back to the page pool. + * + * 2. page_pool_alloc_frag(): allocate memory with page splitting when driver + * knows that the memory it need is always smaller than or equal to half of the + * page allocated from page pool. Page splitting enables memory saving and thus + * avoids TLB/cache miss for data access, but there also is some cost to + * implement page splitting, mainly some cache line dirtying/bouncing for + * 'struct page' and atomic operation for page->pp_ref_count. + * + * The API keeps track of in-flight pages, in order to let API users know when + * it is safe to free a page_pool object, the API users must call + * page_pool_put_page() or page_pool_free_va() to free the page_pool object, or + * attach the page_pool object to a page_pool-aware object like skbs marked with * skb_mark_for_recycle(). * - * API user must call page_pool_put_page() once on a page, as it - * will either recycle the page, or in case of refcnt > 1, it will - * release the DMA mapping and in-flight state accounting. + * page_pool_put_page() may be called multiple times on the same page if a page + * is split into multiple fragments. For the last fragment, it will either + * recycle the page, or in case of page->_refcount > 1, it will release the DMA + * mapping and in-flight state accounting. + * + * dma_sync_single_range_for_device() is only called for the last fragment when + * page_pool is created with PP_FLAG_DMA_SYNC_DEV flag, so it depends on the + * last freed fragment to do the sync_for_device operation for all fragments in + * the same page when a page is split. The API user must setup pool->p.max_len + * and pool->p.offset correctly and ensure that page_pool_put_page() is called + * with dma_sync_size being -1 for fragment API. */ #ifndef _NET_PAGE_POOL_HELPERS_H #define _NET_PAGE_POOL_HELPERS_H +#include <linux/dma-mapping.h> + #include <net/page_pool/types.h> +#include <net/net_debug.h> +#include <net/netmem.h> #ifdef CONFIG_PAGE_POOL_STATS +/* Deprecated driver-facing API, use netlink instead */ int page_pool_ethtool_stats_get_count(void); u8 *page_pool_ethtool_stats_get_strings(u8 *data); -u64 *page_pool_ethtool_stats_get(u64 *data, void *stats); +u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats); -/* - * Drivers that wish to harvest page pool stats and report them to users - * (perhaps via ethtool, debugfs, or another mechanism) can allocate a - * struct page_pool_stats call page_pool_get_stats to get stats for the specified pool. - */ -bool page_pool_get_stats(struct page_pool *pool, +bool page_pool_get_stats(const struct page_pool *pool, struct page_pool_stats *stats); #else static inline int page_pool_ethtool_stats_get_count(void) @@ -54,7 +77,7 @@ static inline u8 *page_pool_ethtool_stats_get_strings(u8 *data) return data; } -static inline u64 *page_pool_ethtool_stats_get(u64 *data, void *stats) +static inline u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats) { return data; } @@ -73,6 +96,16 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool) return page_pool_alloc_pages(pool, gfp); } +/** + * page_pool_dev_alloc_frag() - allocate a page fragment. + * @pool: pool from which to allocate + * @offset: offset to the allocated page + * @size: requested size + * + * Get a page fragment from the page allocator or page_pool caches. + * + * Return: allocated page fragment, otherwise return NULL. + */ static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool, unsigned int *offset, unsigned int size) @@ -82,6 +115,112 @@ static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool, return page_pool_alloc_frag(pool, offset, size, gfp); } +static inline netmem_ref page_pool_alloc_netmem(struct page_pool *pool, + unsigned int *offset, + unsigned int *size, gfp_t gfp) +{ + unsigned int max_size = PAGE_SIZE << pool->p.order; + netmem_ref netmem; + + if ((*size << 1) > max_size) { + *size = max_size; + *offset = 0; + return page_pool_alloc_netmems(pool, gfp); + } + + netmem = page_pool_alloc_frag_netmem(pool, offset, *size, gfp); + if (unlikely(!netmem)) + return 0; + + /* There is very likely not enough space for another fragment, so append + * the remaining size to the current fragment to avoid truesize + * underestimate problem. + */ + if (pool->frag_offset + *size > max_size) { + *size = max_size - *offset; + pool->frag_offset = max_size; + } + + return netmem; +} + +static inline netmem_ref page_pool_dev_alloc_netmem(struct page_pool *pool, + unsigned int *offset, + unsigned int *size) +{ + gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; + + return page_pool_alloc_netmem(pool, offset, size, gfp); +} + +static inline netmem_ref page_pool_dev_alloc_netmems(struct page_pool *pool) +{ + gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; + + return page_pool_alloc_netmems(pool, gfp); +} + +static inline struct page *page_pool_alloc(struct page_pool *pool, + unsigned int *offset, + unsigned int *size, gfp_t gfp) +{ + return netmem_to_page(page_pool_alloc_netmem(pool, offset, size, gfp)); +} + +/** + * page_pool_dev_alloc() - allocate a page or a page fragment. + * @pool: pool from which to allocate + * @offset: offset to the allocated page + * @size: in as the requested size, out as the allocated size + * + * Get a page or a page fragment from the page allocator or page_pool caches + * depending on the requested size in order to allocate memory with least memory + * utilization and performance penalty. + * + * Return: allocated page or page fragment, otherwise return NULL. + */ +static inline struct page *page_pool_dev_alloc(struct page_pool *pool, + unsigned int *offset, + unsigned int *size) +{ + gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); + + return page_pool_alloc(pool, offset, size, gfp); +} + +static inline void *page_pool_alloc_va(struct page_pool *pool, + unsigned int *size, gfp_t gfp) +{ + unsigned int offset; + struct page *page; + + /* Mask off __GFP_HIGHMEM to ensure we can use page_address() */ + page = page_pool_alloc(pool, &offset, size, gfp & ~__GFP_HIGHMEM); + if (unlikely(!page)) + return NULL; + + return page_address(page) + offset; +} + +/** + * page_pool_dev_alloc_va() - allocate a page or a page fragment and return its + * va. + * @pool: pool from which to allocate + * @size: in as the requested size, out as the allocated size + * + * This is just a thin wrapper around the page_pool_alloc() API, and + * it returns va of the allocated page or page fragment. + * + * Return: the va for the allocated page or page fragment, otherwise return NULL. + */ +static inline void *page_pool_dev_alloc_va(struct page_pool *pool, + unsigned int *size) +{ + gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); + + return page_pool_alloc_va(pool, size, gfp); +} + /** * page_pool_get_dma_dir() - Retrieve the stored DMA direction. * @pool: pool from which page was allocated @@ -89,54 +228,120 @@ static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool, * Get the stored dma direction. A driver might decide to store this locally * and avoid the extra cache line from page_pool to determine the direction. */ -static -inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool) +static inline enum dma_data_direction +page_pool_get_dma_dir(const struct page_pool *pool) { return pool->p.dma_dir; } -/* pp_frag_count represents the number of writers who can update the page - * either by updating skb->data or via DMA mappings for the device. - * We can't rely on the page refcnt for that as we don't know who might be - * holding page references and we can't reliably destroy or sync DMA mappings - * of the fragments. +static inline void page_pool_fragment_netmem(netmem_ref netmem, long nr) +{ + atomic_long_set(netmem_get_pp_ref_count_ref(netmem), nr); +} + +/** + * page_pool_fragment_page() - split a fresh page into fragments + * @page: page to split + * @nr: references to set * - * When pp_frag_count reaches 0 we can either recycle the page if the page - * refcnt is 1 or return it back to the memory allocator and destroy any - * mappings we have. + * pp_ref_count represents the number of outstanding references to the page, + * which will be freed using page_pool APIs (rather than page allocator APIs + * like put_page()). Such references are usually held by page_pool-aware + * objects like skbs marked for page pool recycling. + * + * This helper allows the caller to take (set) multiple references to a + * freshly allocated page. The page must be freshly allocated (have a + * pp_ref_count of 1). This is commonly done by drivers and + * "fragment allocators" to save atomic operations - either when they know + * upfront how many references they will need; or to take MAX references and + * return the unused ones with a single atomic dec(), instead of performing + * multiple atomic inc() operations. */ static inline void page_pool_fragment_page(struct page *page, long nr) { - atomic_long_set(&page->pp_frag_count, nr); + page_pool_fragment_netmem(page_to_netmem(page), nr); } -static inline long page_pool_defrag_page(struct page *page, long nr) +static inline long page_pool_unref_netmem(netmem_ref netmem, long nr) { + atomic_long_t *pp_ref_count = netmem_get_pp_ref_count_ref(netmem); long ret; - /* If nr == pp_frag_count then we have cleared all remaining - * references to the page. No need to actually overwrite it, instead - * we can leave this to be overwritten by the calling function. + /* If nr == pp_ref_count then we have cleared all remaining + * references to the page: + * 1. 'n == 1': no need to actually overwrite it. + * 2. 'n != 1': overwrite it with one, which is the rare case + * for pp_ref_count draining. * - * The main advantage to doing this is that an atomic_read is - * generally a much cheaper operation than an atomic update, - * especially when dealing with a page that may be partitioned - * into only 2 or 3 pieces. + * The main advantage to doing this is that not only we avoid a atomic + * update, as an atomic_read is generally a much cheaper operation than + * an atomic update, especially when dealing with a page that may be + * referenced by only 2 or 3 users; but also unify the pp_ref_count + * handling by ensuring all pages have partitioned into only 1 piece + * initially, and only overwrite it when the page is partitioned into + * more than one piece. */ - if (atomic_long_read(&page->pp_frag_count) == nr) + if (atomic_long_read(pp_ref_count) == nr) { + /* As we have ensured nr is always one for constant case using + * the BUILD_BUG_ON(), only need to handle the non-constant case + * here for pp_ref_count draining, which is a rare case. + */ + BUILD_BUG_ON(__builtin_constant_p(nr) && nr != 1); + if (!__builtin_constant_p(nr)) + atomic_long_set(pp_ref_count, 1); + return 0; + } - ret = atomic_long_sub_return(nr, &page->pp_frag_count); + ret = atomic_long_sub_return(nr, pp_ref_count); WARN_ON(ret < 0); + + /* We are the last user here too, reset pp_ref_count back to 1 to + * ensure all pages have been partitioned into 1 piece initially, + * this should be the rare case when the last two fragment users call + * page_pool_unref_page() currently. + */ + if (unlikely(!ret)) + atomic_long_set(pp_ref_count, 1); + return ret; } -static inline bool page_pool_is_last_frag(struct page_pool *pool, - struct page *page) +static inline long page_pool_unref_page(struct page *page, long nr) +{ + return page_pool_unref_netmem(page_to_netmem(page), nr); +} + +static inline void page_pool_ref_netmem(netmem_ref netmem) +{ + atomic_long_inc(netmem_get_pp_ref_count_ref(netmem)); +} + +static inline void page_pool_ref_page(struct page *page) +{ + page_pool_ref_netmem(page_to_netmem(page)); +} + +static inline bool page_pool_unref_and_test(netmem_ref netmem) +{ + /* If page_pool_unref_page() returns 0, we were the last user */ + return page_pool_unref_netmem(netmem, 1) == 0; +} + +static inline void page_pool_put_netmem(struct page_pool *pool, + netmem_ref netmem, + unsigned int dma_sync_size, + bool allow_direct) { - /* If fragments aren't enabled or count is 0 we were the last user */ - return !(pool->p.flags & PP_FLAG_PAGE_FRAG) || - (page_pool_defrag_page(page, 1) == 0); + /* When page_pool isn't compiled-in, net/core/xdp.c doesn't + * allow registering MEM_TYPE_PAGE_POOL, but shield linker. + */ +#ifdef CONFIG_PAGE_POOL + if (!page_pool_unref_and_test(netmem)) + return; + + page_pool_put_unrefed_netmem(pool, netmem, dma_sync_size, allow_direct); +#endif } /** @@ -157,15 +362,15 @@ static inline void page_pool_put_page(struct page_pool *pool, unsigned int dma_sync_size, bool allow_direct) { - /* When page_pool isn't compiled-in, net/core/xdp.c doesn't - * allow registering MEM_TYPE_PAGE_POOL, but shield linker. - */ -#ifdef CONFIG_PAGE_POOL - if (!page_pool_is_last_frag(pool, page)) - return; + page_pool_put_netmem(pool, page_to_netmem(page), dma_sync_size, + allow_direct); +} - page_pool_put_defragged_page(pool, page, dma_sync_size, allow_direct); -#endif +static inline void page_pool_put_full_netmem(struct page_pool *pool, + netmem_ref netmem, + bool allow_direct) +{ + page_pool_put_netmem(pool, netmem, -1, allow_direct); } /** @@ -180,7 +385,7 @@ static inline void page_pool_put_page(struct page_pool *pool, static inline void page_pool_put_full_page(struct page_pool *pool, struct page *page, bool allow_direct) { - page_pool_put_page(pool, page, -1, allow_direct); + page_pool_put_netmem(pool, page_to_netmem(page), -1, allow_direct); } /** @@ -197,31 +402,96 @@ static inline void page_pool_recycle_direct(struct page_pool *pool, page_pool_put_full_page(pool, page, true); } -#define PAGE_POOL_DMA_USE_PP_FRAG_COUNT \ +static inline void page_pool_recycle_direct_netmem(struct page_pool *pool, + netmem_ref netmem) +{ + page_pool_put_full_netmem(pool, netmem, true); +} + +#define PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA \ (sizeof(dma_addr_t) > sizeof(unsigned long)) /** + * page_pool_free_va() - free a va into the page_pool + * @pool: pool from which va was allocated + * @va: va to be freed + * @allow_direct: freed by the consumer, allow lockless caching + * + * Free a va allocated from page_pool_allo_va(). + */ +static inline void page_pool_free_va(struct page_pool *pool, void *va, + bool allow_direct) +{ + page_pool_put_page(pool, virt_to_head_page(va), -1, allow_direct); +} + +static inline dma_addr_t page_pool_get_dma_addr_netmem(netmem_ref netmem) +{ + dma_addr_t ret = netmem_get_dma_addr(netmem); + + if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) + ret <<= PAGE_SHIFT; + + return ret; +} + +/** * page_pool_get_dma_addr() - Retrieve the stored DMA address. * @page: page allocated from a page pool * * Fetch the DMA address of the page. The page pool to which the page belongs * must had been created with PP_FLAG_DMA_MAP. */ -static inline dma_addr_t page_pool_get_dma_addr(struct page *page) +static inline dma_addr_t page_pool_get_dma_addr(const struct page *page) { - dma_addr_t ret = page->dma_addr; + return page_pool_get_dma_addr_netmem(page_to_netmem(page)); +} - if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) - ret |= (dma_addr_t)page->dma_addr_upper << 16 << 16; +static inline void __page_pool_dma_sync_for_cpu(const struct page_pool *pool, + const dma_addr_t dma_addr, + u32 offset, u32 dma_sync_size) +{ + dma_sync_single_range_for_cpu(pool->p.dev, dma_addr, + offset + pool->p.offset, dma_sync_size, + page_pool_get_dma_dir(pool)); +} - return ret; +/** + * page_pool_dma_sync_for_cpu - sync Rx page for CPU after it's written by HW + * @pool: &page_pool the @page belongs to + * @page: page to sync + * @offset: offset from page start to "hard" start if using PP frags + * @dma_sync_size: size of the data written to the page + * + * Can be used as a shorthand to sync Rx pages before accessing them in the + * driver. Caller must ensure the pool was created with ``PP_FLAG_DMA_MAP``. + * Note that this version performs DMA sync unconditionally, even if the + * associated PP doesn't perform sync-for-device. + */ +static inline void page_pool_dma_sync_for_cpu(const struct page_pool *pool, + const struct page *page, + u32 offset, u32 dma_sync_size) +{ + __page_pool_dma_sync_for_cpu(pool, page_pool_get_dma_addr(page), offset, + dma_sync_size); +} + +static inline void +page_pool_dma_sync_netmem_for_cpu(const struct page_pool *pool, + const netmem_ref netmem, u32 offset, + u32 dma_sync_size) +{ + if (!pool->dma_sync_for_cpu) + return; + + __page_pool_dma_sync_for_cpu(pool, + page_pool_get_dma_addr_netmem(netmem), + offset, dma_sync_size); } -static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr) +static inline void page_pool_get(struct page_pool *pool) { - page->dma_addr = addr; - if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) - page->dma_addr_upper = upper_32_bits(addr); + refcount_inc(&pool->user_cnt); } static inline bool page_pool_put(struct page_pool *pool) @@ -235,4 +505,21 @@ static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid) page_pool_update_nid(pool, new_nid); } +/** + * page_pool_is_unreadable() - will allocated buffers be unreadable for the CPU + * @pool: queried page pool + * + * Check if page pool will return buffers which are unreadable to the CPU / + * kernel. This will only be the case if user space bound a memory provider (mp) + * which returns unreadable memory to the queue served by the page pool. + * If %PP_FLAG_ALLOW_UNREADABLE_NETMEM was set but there is no mp bound + * this helper will return false. See also netif_rxq_has_unreadable_mp(). + * + * Return: true if memory allocated by the page pool may be unreadable + */ +static inline bool page_pool_is_unreadable(struct page_pool *pool) +{ + return !!pool->mp_ops; +} + #endif /* _NET_PAGE_POOL_HELPERS_H */ diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h new file mode 100644 index 000000000000..ada4f968960a --- /dev/null +++ b/include/net/page_pool/memory_provider.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NET_PAGE_POOL_MEMORY_PROVIDER_H +#define _NET_PAGE_POOL_MEMORY_PROVIDER_H + +#include <net/netmem.h> +#include <net/page_pool/types.h> + +struct netdev_rx_queue; +struct netlink_ext_ack; +struct sk_buff; + +struct memory_provider_ops { + netmem_ref (*alloc_netmems)(struct page_pool *pool, gfp_t gfp); + bool (*release_netmem)(struct page_pool *pool, netmem_ref netmem); + int (*init)(struct page_pool *pool); + void (*destroy)(struct page_pool *pool); + int (*nl_fill)(void *mp_priv, struct sk_buff *rsp, + struct netdev_rx_queue *rxq); + void (*uninstall)(void *mp_priv, struct netdev_rx_queue *rxq); +}; + +bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr); +void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov); +void net_mp_niov_clear_page_pool(struct net_iov *niov); + +int net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *p); +int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, + const struct pp_memory_provider_params *p, + struct netlink_ext_ack *extack); +void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *old_p); +void __net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, + const struct pp_memory_provider_params *old_p); + +/** + * net_mp_netmem_place_in_cache() - give a netmem to a page pool + * @pool: the page pool to place the netmem into + * @netmem: netmem to give + * + * Push an accounted netmem into the page pool's allocation cache. The caller + * must ensure that there is space in the cache. It should only be called off + * the mp_ops->alloc_netmems() path. + */ +static inline void net_mp_netmem_place_in_cache(struct page_pool *pool, + netmem_ref netmem) +{ + pool->alloc.cache[pool->alloc.count++] = netmem; +} + +#endif diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index 887e7946a597..1509a536cb85 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -5,6 +5,9 @@ #include <linux/dma-direction.h> #include <linux/ptr_ring.h> +#include <linux/types.h> +#include <linux/xarray.h> +#include <net/netmem.h> #define PP_FLAG_DMA_MAP BIT(0) /* Should page_pool do the DMA * map/unmap @@ -17,10 +20,22 @@ * Please note DMA-sync-for-CPU is still * device driver responsibility */ -#define PP_FLAG_PAGE_FRAG BIT(2) /* for page frag feature */ -#define PP_FLAG_ALL (PP_FLAG_DMA_MAP |\ - PP_FLAG_DMA_SYNC_DEV |\ - PP_FLAG_PAGE_FRAG) +#define PP_FLAG_SYSTEM_POOL BIT(2) /* Global system page_pool */ + +/* Allow unreadable (net_iov backed) netmem in this page_pool. Drivers setting + * this must be able to support unreadable netmem, where netmem_address() would + * return NULL. This flag should not be set for header page_pools. + * + * If the driver sets PP_FLAG_ALLOW_UNREADABLE_NETMEM, it should also set + * page_pool_params.slow.queue_idx. + */ +#define PP_FLAG_ALLOW_UNREADABLE_NETMEM BIT(3) + +#define PP_FLAG_ALL (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV | \ + PP_FLAG_SYSTEM_POOL | PP_FLAG_ALLOW_UNREADABLE_NETMEM) + +/* Index limit to stay within PP_DMA_INDEX_BITS for DMA indices */ +#define PP_DMA_INDEX_LIMIT XA_LIMIT(1, BIT(PP_DMA_INDEX_BITS) - 1) /* * Fast allocation side cache array/stack @@ -40,12 +55,12 @@ #define PP_ALLOC_CACHE_REFILL 64 struct pp_alloc_cache { u32 count; - struct page *cache[PP_ALLOC_CACHE_SIZE]; + netmem_ref cache[PP_ALLOC_CACHE_SIZE]; }; /** * struct page_pool_params - page pool parameters - * @flags: PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV, PP_FLAG_PAGE_FRAG + * @fast: params accessed frequently on hotpath * @order: 2^order pages on allocation * @pool_size: size of the ptr_ring * @nid: NUMA node id to allocate from pages from @@ -54,20 +69,31 @@ struct pp_alloc_cache { * @dma_dir: DMA mapping direction * @max_len: max DMA sync memory size for PP_FLAG_DMA_SYNC_DEV * @offset: DMA sync address offset for PP_FLAG_DMA_SYNC_DEV + * @slow: params with slowpath access only (initialization and Netlink) + * @netdev: netdev this pool will serve (leave as NULL if none or multiple) + * @queue_idx: queue idx this page_pool is being created for. + * @flags: PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV, PP_FLAG_SYSTEM_POOL, + * PP_FLAG_ALLOW_UNREADABLE_NETMEM. */ struct page_pool_params { - unsigned int flags; - unsigned int order; - unsigned int pool_size; - int nid; - struct device *dev; - struct napi_struct *napi; - enum dma_data_direction dma_dir; - unsigned int max_len; - unsigned int offset; + struct_group_tagged(page_pool_params_fast, fast, + unsigned int order; + unsigned int pool_size; + int nid; + struct device *dev; + struct napi_struct *napi; + enum dma_data_direction dma_dir; + unsigned int max_len; + unsigned int offset; + ); + struct_group_tagged(page_pool_params_slow, slow, + struct net_device *netdev; + unsigned int queue_idx; + unsigned int flags; /* private: used by test code only */ - void (*init_callback)(struct page *page, void *arg); - void *init_arg; + void (*init_callback)(netmem_ref netmem, void *arg); + void *init_arg; + ); }; #ifdef CONFIG_PAGE_POOL_STATS @@ -120,13 +146,42 @@ struct page_pool_stats { }; #endif +/* The whole frag API block must stay within one cacheline. On 32-bit systems, + * sizeof(long) == sizeof(int), so that the block size is ``3 * sizeof(long)``. + * On 64-bit systems, the actual size is ``2 * sizeof(long) + sizeof(int)``. + * The closest pow-2 to both of them is ``4 * sizeof(long)``, so just use that + * one for simplicity. + * Having it aligned to a cacheline boundary may be excessive and doesn't bring + * any good. + */ +#define PAGE_POOL_FRAG_GROUP_ALIGN (4 * sizeof(long)) + +struct memory_provider_ops; + +struct pp_memory_provider_params { + void *mp_priv; + const struct memory_provider_ops *mp_ops; +}; + struct page_pool { - struct page_pool_params p; + struct page_pool_params_fast p; + + int cpuid; + u32 pages_state_hold_cnt; + + bool has_init_callback:1; /* slow::init_callback is set */ + bool dma_map:1; /* Perform DMA mapping */ + bool dma_sync:1; /* Perform DMA sync for device */ + bool dma_sync_for_cpu:1; /* Perform DMA sync for cpu */ +#ifdef CONFIG_PAGE_POOL_STATS + bool system:1; /* This is a global percpu pool */ +#endif + __cacheline_group_begin_aligned(frag, PAGE_POOL_FRAG_GROUP_ALIGN); long frag_users; - struct page *frag_page; + netmem_ref frag_page; unsigned int frag_offset; - u32 pages_state_hold_cnt; + __cacheline_group_end_aligned(frag, PAGE_POOL_FRAG_GROUP_ALIGN); struct delayed_work release_dw; void (*disconnect)(void *pool); @@ -167,6 +222,11 @@ struct page_pool { */ struct ptr_ring ring; + void *mp_priv; + const struct memory_provider_ops *mp_ops; + + struct xarray dma_mapped; + #ifdef CONFIG_PAGE_POOL_STATS /* recycle stats are per-cpu to avoid locking */ struct page_pool_recycle_stats __percpu *recycle_stats; @@ -180,46 +240,60 @@ struct page_pool { refcount_t user_cnt; u64 destroy_cnt; + + /* Slow/Control-path information follows */ + struct page_pool_params_slow slow; + /* User-facing fields, protected by page_pools_lock */ + struct { + struct hlist_node list; + u64 detach_time; + u32 id; + } user; }; struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp); +netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp); struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset, unsigned int size, gfp_t gfp); +netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool, + unsigned int *offset, unsigned int size, + gfp_t gfp); struct page_pool *page_pool_create(const struct page_pool_params *params); +struct page_pool *page_pool_create_percpu(const struct page_pool_params *params, + int cpuid); struct xdp_mem_info; #ifdef CONFIG_PAGE_POOL -void page_pool_unlink_napi(struct page_pool *pool); +void page_pool_enable_direct_recycling(struct page_pool *pool, + struct napi_struct *napi); +void page_pool_disable_direct_recycling(struct page_pool *pool); void page_pool_destroy(struct page_pool *pool); void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), - struct xdp_mem_info *mem); -void page_pool_put_page_bulk(struct page_pool *pool, void **data, - int count); + const struct xdp_mem_info *mem); +void page_pool_put_netmem_bulk(netmem_ref *data, u32 count); #else -static inline void page_pool_unlink_napi(struct page_pool *pool) -{ -} - static inline void page_pool_destroy(struct page_pool *pool) { } static inline void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), - struct xdp_mem_info *mem) + const struct xdp_mem_info *mem) { } -static inline void page_pool_put_page_bulk(struct page_pool *pool, void **data, - int count) +static inline void page_pool_put_netmem_bulk(netmem_ref *data, u32 count) { } #endif -void page_pool_put_defragged_page(struct page_pool *pool, struct page *page, +void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem, unsigned int dma_sync_size, bool allow_direct); +void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, + bool allow_direct); static inline bool is_page_pool_compiled_in(void) { diff --git a/include/net/pfcp.h b/include/net/pfcp.h new file mode 100644 index 000000000000..639553797d3e --- /dev/null +++ b/include/net/pfcp.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _PFCP_H_ +#define _PFCP_H_ + +#include <uapi/linux/if_ether.h> +#include <net/dst_metadata.h> +#include <linux/netdevice.h> +#include <uapi/linux/ipv6.h> +#include <net/udp_tunnel.h> +#include <uapi/linux/udp.h> +#include <uapi/linux/ip.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/bits.h> + +#define PFCP_PORT 8805 + +/* PFCP protocol header */ +struct pfcphdr { + u8 flags; + u8 message_type; + __be16 message_length; +}; + +/* PFCP header flags */ +#define PFCP_SEID_FLAG BIT(0) +#define PFCP_MP_FLAG BIT(1) + +#define PFCP_VERSION_MASK GENMASK(4, 0) + +#define PFCP_HLEN (sizeof(struct udphdr) + sizeof(struct pfcphdr)) + +/* PFCP node related messages */ +struct pfcphdr_node { + u8 seq_number[3]; + u8 reserved; +}; + +/* PFCP session related messages */ +struct pfcphdr_session { + __be64 seid; + u8 seq_number[3]; +#ifdef __LITTLE_ENDIAN_BITFIELD + u8 message_priority:4, + reserved:4; +#elif defined(__BIG_ENDIAN_BITFIELD) + u8 reserved:4, + message_priority:4; +#else +#error "Please fix <asm/byteorder>" +#endif +}; + +struct pfcp_metadata { + u8 type; + __be64 seid; +} __packed; + +enum { + PFCP_TYPE_NODE = 0, + PFCP_TYPE_SESSION = 1, +}; + +#define PFCP_HEADROOM (sizeof(struct iphdr) + sizeof(struct udphdr) + \ + sizeof(struct pfcphdr) + sizeof(struct ethhdr)) +#define PFCP6_HEADROOM (sizeof(struct ipv6hdr) + sizeof(struct udphdr) + \ + sizeof(struct pfcphdr) + sizeof(struct ethhdr)) + +static inline struct pfcphdr *pfcp_hdr(struct sk_buff *skb) +{ + return (struct pfcphdr *)(udp_hdr(skb) + 1); +} + +static inline struct pfcphdr_node *pfcp_hdr_node(struct sk_buff *skb) +{ + return (struct pfcphdr_node *)(pfcp_hdr(skb) + 1); +} + +static inline struct pfcphdr_session *pfcp_hdr_session(struct sk_buff *skb) +{ + return (struct pfcphdr_session *)(pfcp_hdr(skb) + 1); +} + +static inline bool netif_is_pfcp(const struct net_device *dev) +{ + return dev->rtnl_link_ops && + !strcmp(dev->rtnl_link_ops->kind, "pfcp"); +} + +#endif diff --git a/include/net/phonet/pn_dev.h b/include/net/phonet/pn_dev.h index e9dc8dca5817..37a3e83531c6 100644 --- a/include/net/phonet/pn_dev.h +++ b/include/net/phonet/pn_dev.h @@ -11,13 +11,13 @@ #define PN_DEV_H #include <linux/list.h> -#include <linux/mutex.h> +#include <linux/spinlock.h> struct net; struct phonet_device_list { struct list_head list; - struct mutex lock; + spinlock_t lock; }; struct phonet_device_list *phonet_device_list(struct net *net); @@ -38,11 +38,11 @@ int phonet_address_add(struct net_device *dev, u8 addr); int phonet_address_del(struct net_device *dev, u8 addr); u8 phonet_address_get(struct net_device *dev, u8 addr); int phonet_address_lookup(struct net *net, u8 addr); -void phonet_address_notify(int event, struct net_device *dev, u8 addr); +void phonet_address_notify(struct net *net, int event, u32 ifindex, u8 addr); int phonet_route_add(struct net_device *dev, u8 daddr); int phonet_route_del(struct net_device *dev, u8 daddr); -void rtm_phonet_notify(int event, struct net_device *dev, u8 dst); +void rtm_phonet_notify(struct net *net, int event, u32 ifindex, u8 dst); struct net_device *phonet_route_get_rcu(struct net *net, u8 daddr); struct net_device *phonet_route_output(struct net *net, u8 daddr); diff --git a/include/net/ping.h b/include/net/ping.h index bc7779262e60..05bfd594a64c 100644 --- a/include/net/ping.h +++ b/include/net/ping.h @@ -54,12 +54,11 @@ struct pingfakehdr { }; int ping_get_port(struct sock *sk, unsigned short ident); -int ping_hash(struct sock *sk); void ping_unhash(struct sock *sk); int ping_init_sock(struct sock *sk); void ping_close(struct sock *sk, long timeout); -int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len); +int ping_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len); void ping_err(struct sk_buff *skb, int offset, u32 info); int ping_getfrag(void *from, char *to, int offset, int fraglen, int odd, struct sk_buff *); diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index f308e8268651..99ac747b7906 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -24,6 +24,8 @@ struct tcf_walker { int register_tcf_proto_ops(struct tcf_proto_ops *ops); void unregister_tcf_proto_ops(struct tcf_proto_ops *ops); +#define NET_CLS_ALIAS_PREFIX "net-cls-" +#define MODULE_ALIAS_NET_CLS(kind) MODULE_ALIAS(NET_CLS_ALIAS_PREFIX kind) struct tcf_block_ext_info { enum flow_block_binder_type binder_type; @@ -72,6 +74,15 @@ static inline bool tcf_block_non_null_shared(struct tcf_block *block) return block && block->index; } +#ifdef CONFIG_NET_CLS_ACT +DECLARE_STATIC_KEY_FALSE(tcf_sw_enabled_key); + +static inline bool tcf_block_bypass_sw(struct tcf_block *block) +{ + return block && !atomic_read(&block->useswcnt); +} +#endif + static inline struct Qdisc *tcf_block_q(struct tcf_block *block) { WARN_ON(tcf_block_shared(block)); @@ -308,7 +319,7 @@ tcf_exts_hw_stats_update(const struct tcf_exts *exts, * tcf_exts_has_actions - check if at least one action is present * @exts: tc filter extensions handle * - * Returns true if at least one action is present. + * Returns: true if at least one action is present. */ static inline bool tcf_exts_has_actions(struct tcf_exts *exts) { @@ -480,7 +491,7 @@ int __tcf_em_tree_match(struct sk_buff *, struct tcf_ematch_tree *, struct tcf_pkt_info *); /** - * tcf_em_tree_match - evaulate an ematch tree + * tcf_em_tree_match - evaluate an ematch tree * * @skb: socket buffer of the packet in question * @tree: ematch tree to be used for evaluation @@ -490,7 +501,7 @@ int __tcf_em_tree_match(struct sk_buff *, struct tcf_ematch_tree *, * through all ematches respecting their logic relations returning * as soon as the result is obvious. * - * Returns 1 if the ematch tree as-one matches, no ematches are configured + * Returns: 1 if the ematch tree as-one matches, no ematches are configured * or ematch is not enabled in the kernel, otherwise 0 is returned. */ static inline int tcf_em_tree_match(struct sk_buff *skb, @@ -525,6 +536,8 @@ static inline unsigned char * tcf_get_base_ptr(struct sk_buff *skb, int layer) case TCF_LAYER_NETWORK: return skb_network_header(skb); case TCF_LAYER_TRANSPORT: + if (!skb_transport_header_was_set(skb)) + break; return skb_transport_header(skb); } @@ -744,10 +757,20 @@ tc_cls_common_offload_init(struct flow_cls_common_offload *cls_common, cls_common->chain_index = tp->chain->index; cls_common->protocol = tp->protocol; cls_common->prio = tp->prio >> 16; + cls_common->skip_sw = tc_skip_sw(flags); if (tc_skip_sw(flags) || flags & TCA_CLS_FLAGS_VERBOSE) cls_common->extack = extack; } +static inline void tcf_proto_update_usesw(struct tcf_proto *tp, u32 flags) +{ + if (tp->usesw) + return; + if (tc_skip_sw(flags) && tc_in_hw(flags)) + return; + tp->usesw = true; +} + #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) static inline struct tc_skb_ext *tc_skb_ext_alloc(struct sk_buff *skb) { diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 15960564e0c3..e703c507d0da 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -20,15 +20,10 @@ struct qdisc_walker { int (*fn)(struct Qdisc *, unsigned long cl, struct qdisc_walker *); }; -static inline void *qdisc_priv(struct Qdisc *q) -{ - return &q->privdata; -} - -static inline struct Qdisc *qdisc_from_priv(void *priv) -{ - return container_of(priv, struct Qdisc, privdata); -} +#define qdisc_priv(q) \ + _Generic(q, \ + const struct Qdisc * : (const void *)&q->privdata, \ + struct Qdisc * : (void *)&q->privdata) /* Timer resolution MUST BE < 10% of min_schedulable_packet_size/bandwidth @@ -48,7 +43,6 @@ static inline struct Qdisc *qdisc_from_priv(void *priv) */ typedef u64 psched_time_t; -typedef long psched_tdiff_t; /* Avoid doing 64 bit divide */ #define PSCHED_SHIFT 6 @@ -100,6 +94,8 @@ struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops, int register_qdisc(struct Qdisc_ops *qops); void unregister_qdisc(struct Qdisc_ops *qops); +#define NET_SCH_ALIAS_PREFIX "net-sch-" +#define MODULE_ALIAS_NET_SCH(id) MODULE_ALIAS(NET_SCH_ALIAS_PREFIX id) void qdisc_get_default(char *id, size_t len); int qdisc_set_default(const char *id); @@ -112,19 +108,19 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct netlink_ext_ack *extack); void qdisc_put_rtab(struct qdisc_rate_table *tab); void qdisc_put_stab(struct qdisc_size_table *tab); -void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc); bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq, spinlock_t *root_lock, bool validate); void __qdisc_run(struct Qdisc *q); -static inline void qdisc_run(struct Qdisc *q) +static inline struct sk_buff *qdisc_run(struct Qdisc *q) { if (qdisc_run_begin(q)) { __qdisc_run(q); - qdisc_run_end(q); + return qdisc_run_end(q); } + return NULL; } extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1]; @@ -275,24 +271,6 @@ static inline void skb_txtime_consumed(struct sk_buff *skb) skb->tstamp = ktime_set(0, 0); } -struct tc_skb_cb { - struct qdisc_skb_cb qdisc_cb; - - u16 mru; - u8 post_ct:1; - u8 post_ct_snat:1; - u8 post_ct_dnat:1; - u16 zone; /* Only valid if post_ct = true */ -}; - -static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb) -{ - struct tc_skb_cb *cb = (struct tc_skb_cb *)skb->cb; - - BUILD_BUG_ON(sizeof(*cb) > sizeof_field(struct sk_buff, cb)); - return cb; -} - static inline bool tc_qdisc_stats_dump(struct Qdisc *sch, unsigned long cl, struct qdisc_walker *arg) @@ -306,4 +284,28 @@ static inline bool tc_qdisc_stats_dump(struct Qdisc *sch, return true; } +static inline void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc) +{ + if (!(qdisc->flags & TCQ_F_WARN_NONWC)) { + pr_warn("%s: %s qdisc %X: is non-work-conserving?\n", + txt, qdisc->ops->id, qdisc->handle >> 16); + qdisc->flags |= TCQ_F_WARN_NONWC; + } +} + +static inline unsigned int qdisc_peek_len(struct Qdisc *sch) +{ + struct sk_buff *skb; + unsigned int len; + + skb = sch->ops->peek(sch); + if (unlikely(skb == NULL)) { + qdisc_warn_nonwc("qdisc_peek_len", sch); + return 0; + } + len = qdisc_pkt_len(skb); + + return len; +} + #endif diff --git a/include/net/proto_memory.h b/include/net/proto_memory.h new file mode 100644 index 000000000000..ad6d703ce6fe --- /dev/null +++ b/include/net/proto_memory.h @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _PROTO_MEMORY_H +#define _PROTO_MEMORY_H + +#include <net/sock.h> +#include <net/hotdata.h> + +/* 1 MB per cpu, in page units */ +#define SK_MEMORY_PCPU_RESERVE (1 << (20 - PAGE_SHIFT)) + +static inline bool sk_has_memory_pressure(const struct sock *sk) +{ + return sk->sk_prot->memory_pressure != NULL; +} + +static inline bool +proto_memory_pressure(const struct proto *prot) +{ + if (!prot->memory_pressure) + return false; + return !!READ_ONCE(*prot->memory_pressure); +} + +static inline bool sk_under_global_memory_pressure(const struct sock *sk) +{ + return proto_memory_pressure(sk->sk_prot); +} + +static inline bool sk_under_memory_pressure(const struct sock *sk) +{ + if (!sk->sk_prot->memory_pressure) + return false; + + if (mem_cgroup_sk_enabled(sk) && + mem_cgroup_sk_under_memory_pressure(sk)) + return true; + + if (sk->sk_bypass_prot_mem) + return false; + + return !!READ_ONCE(*sk->sk_prot->memory_pressure); +} + +static inline long +proto_memory_allocated(const struct proto *prot) +{ + return max(0L, atomic_long_read(prot->memory_allocated)); +} + +static inline long +sk_memory_allocated(const struct sock *sk) +{ + return proto_memory_allocated(sk->sk_prot); +} + +static inline void proto_memory_pcpu_drain(struct proto *proto) +{ + int val = this_cpu_xchg(*proto->per_cpu_fw_alloc, 0); + + if (val) + atomic_long_add(val, proto->memory_allocated); +} + +static inline void +sk_memory_allocated_add(const struct sock *sk, int val) +{ + struct proto *proto = sk->sk_prot; + + val = this_cpu_add_return(*proto->per_cpu_fw_alloc, val); + + if (unlikely(val >= READ_ONCE(net_hotdata.sysctl_mem_pcpu_rsv))) + proto_memory_pcpu_drain(proto); +} + +static inline void +sk_memory_allocated_sub(const struct sock *sk, int val) +{ + struct proto *proto = sk->sk_prot; + + val = this_cpu_sub_return(*proto->per_cpu_fw_alloc, val); + + if (unlikely(val <= -READ_ONCE(net_hotdata.sysctl_mem_pcpu_rsv))) + proto_memory_pcpu_drain(proto); +} + +#endif /* _PROTO_MEMORY_H */ diff --git a/include/net/protocol.h b/include/net/protocol.h index 6aef8cb11cc8..b2499f88f8f8 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -46,6 +46,7 @@ struct net_protocol { * socket lookup? */ icmp_strict_tag_validation:1; + u32 secret; }; #if IS_ENABLED(CONFIG_IPV6) @@ -59,6 +60,7 @@ struct inet6_protocol { __be32 info); unsigned int flags; /* INET6_PROTO_xxx */ + u32 secret; }; #define INET6_PROTO_NOPOLICY 0x1 @@ -68,6 +70,7 @@ struct inet6_protocol { struct net_offload { struct offload_callbacks callbacks; unsigned int flags; /* Flags used by IPv6 for now */ + u32 secret; }; /* This should be set for any extension header which is compatible with GSO. */ #define INET6_PROTO_GSO_EXTHDR 0x1 diff --git a/include/net/psample.h b/include/net/psample.h index 0509d2d6be67..5071b5fc2b59 100644 --- a/include/net/psample.h +++ b/include/net/psample.h @@ -24,7 +24,10 @@ struct psample_metadata { u8 out_tc_valid:1, out_tc_occ_valid:1, latency_valid:1, - unused:5; + rate_as_probability:1, + unused:4; + const u8 *user_cookie; + u32 user_cookie_len; }; struct psample_group *psample_group_get(struct net *net, u32 group_num); @@ -35,13 +38,15 @@ struct sk_buff; #if IS_ENABLED(CONFIG_PSAMPLE) -void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, - u32 sample_rate, const struct psample_metadata *md); +void psample_sample_packet(struct psample_group *group, + const struct sk_buff *skb, u32 sample_rate, + const struct psample_metadata *md); #else static inline void psample_sample_packet(struct psample_group *group, - struct sk_buff *skb, u32 sample_rate, + const struct sk_buff *skb, + u32 sample_rate, const struct psample_metadata *md) { } diff --git a/include/net/psp.h b/include/net/psp.h new file mode 100644 index 000000000000..33bb4d1dc46e --- /dev/null +++ b/include/net/psp.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __NET_PSP_ALL_H +#define __NET_PSP_ALL_H + +#include <uapi/linux/psp.h> +#include <net/psp/functions.h> +#include <net/psp/types.h> + +/* Do not add any code here. Put it in the sub-headers instead. */ + +#endif /* __NET_PSP_ALL_H */ diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h new file mode 100644 index 000000000000..c5c23a54774e --- /dev/null +++ b/include/net/psp/functions.h @@ -0,0 +1,209 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __NET_PSP_HELPERS_H +#define __NET_PSP_HELPERS_H + +#include <linux/skbuff.h> +#include <linux/rcupdate.h> +#include <linux/udp.h> +#include <net/sock.h> +#include <net/tcp.h> +#include <net/psp/types.h> + +struct inet_timewait_sock; + +/* Driver-facing API */ +struct psp_dev * +psp_dev_create(struct net_device *netdev, struct psp_dev_ops *psd_ops, + struct psp_dev_caps *psd_caps, void *priv_ptr); +void psp_dev_unregister(struct psp_dev *psd); +bool psp_dev_encapsulate(struct net *net, struct sk_buff *skb, __be32 spi, + u8 ver, __be16 sport); +int psp_dev_rcv(struct sk_buff *skb, u16 dev_id, u8 generation, bool strip_icv); + +/* Kernel-facing API */ +void psp_assoc_put(struct psp_assoc *pas); + +static inline void *psp_assoc_drv_data(struct psp_assoc *pas) +{ + return pas->drv_data; +} + +#if IS_ENABLED(CONFIG_INET_PSP) +unsigned int psp_key_size(u32 version); +void psp_sk_assoc_free(struct sock *sk); +void psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk); +void psp_twsk_assoc_free(struct inet_timewait_sock *tw); +void psp_reply_set_decrypted(const struct sock *sk, struct sk_buff *skb); + +static inline struct psp_assoc *psp_sk_assoc(const struct sock *sk) +{ + return rcu_dereference_check(sk->psp_assoc, lockdep_sock_is_held(sk)); +} + +static inline void +psp_enqueue_set_decrypted(struct sock *sk, struct sk_buff *skb) +{ + struct psp_assoc *pas; + + pas = psp_sk_assoc(sk); + if (pas && pas->tx.spi) + skb->decrypted = 1; +} + +static inline unsigned long +__psp_skb_coalesce_diff(const struct sk_buff *one, const struct sk_buff *two, + unsigned long diffs) +{ + struct psp_skb_ext *a, *b; + + a = skb_ext_find(one, SKB_EXT_PSP); + b = skb_ext_find(two, SKB_EXT_PSP); + + diffs |= (!!a) ^ (!!b); + if (!diffs && unlikely(a)) + diffs |= memcmp(a, b, sizeof(*a)); + return diffs; +} + +static inline bool +psp_is_allowed_nondata(struct sk_buff *skb, struct psp_assoc *pas) +{ + bool fin = !!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN); + u32 end_seq = TCP_SKB_CB(skb)->end_seq; + u32 seq = TCP_SKB_CB(skb)->seq; + bool pure_fin; + + pure_fin = fin && end_seq - seq == 1; + + return seq == end_seq || (pure_fin && seq == pas->upgrade_seq); +} + +static inline bool +psp_pse_matches_pas(struct psp_skb_ext *pse, struct psp_assoc *pas) +{ + return pse && pas->rx.spi == pse->spi && + pas->generation == pse->generation && + pas->version == pse->version && + pas->dev_id == pse->dev_id; +} + +static inline enum skb_drop_reason +__psp_sk_rx_policy_check(struct sk_buff *skb, struct psp_assoc *pas) +{ + struct psp_skb_ext *pse = skb_ext_find(skb, SKB_EXT_PSP); + + if (!pas) + return pse ? SKB_DROP_REASON_PSP_INPUT : 0; + + if (likely(psp_pse_matches_pas(pse, pas))) { + if (unlikely(!pas->peer_tx)) + pas->peer_tx = 1; + + return 0; + } + + if (!pse) { + if (!pas->tx.spi || + (!pas->peer_tx && psp_is_allowed_nondata(skb, pas))) + return 0; + } + + return SKB_DROP_REASON_PSP_INPUT; +} + +static inline enum skb_drop_reason +psp_sk_rx_policy_check(struct sock *sk, struct sk_buff *skb) +{ + return __psp_sk_rx_policy_check(skb, psp_sk_assoc(sk)); +} + +static inline enum skb_drop_reason +psp_twsk_rx_policy_check(struct inet_timewait_sock *tw, struct sk_buff *skb) +{ + return __psp_sk_rx_policy_check(skb, rcu_dereference(tw->psp_assoc)); +} + +static inline struct psp_assoc *psp_sk_get_assoc_rcu(const struct sock *sk) +{ + struct psp_assoc *pas; + int state; + + state = READ_ONCE(sk->sk_state); + if (!sk_is_inet(sk) || state == TCP_NEW_SYN_RECV) + return NULL; + + pas = state == TCP_TIME_WAIT ? + rcu_dereference(inet_twsk(sk)->psp_assoc) : + rcu_dereference(sk->psp_assoc); + return pas; +} + +static inline struct psp_assoc *psp_skb_get_assoc_rcu(struct sk_buff *skb) +{ + if (!skb->decrypted || !skb->sk) + return NULL; + + return psp_sk_get_assoc_rcu(skb->sk); +} + +static inline unsigned int psp_sk_overhead(const struct sock *sk) +{ + int psp_encap = sizeof(struct udphdr) + PSP_HDR_SIZE + PSP_TRL_SIZE; + bool has_psp = rcu_access_pointer(sk->psp_assoc); + + return has_psp ? psp_encap : 0; +} +#else +static inline void psp_sk_assoc_free(struct sock *sk) { } +static inline void +psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk) { } +static inline void psp_twsk_assoc_free(struct inet_timewait_sock *tw) { } +static inline void +psp_reply_set_decrypted(const struct sock *sk, struct sk_buff *skb) { } + +static inline struct psp_assoc *psp_sk_assoc(const struct sock *sk) +{ + return NULL; +} + +static inline void +psp_enqueue_set_decrypted(struct sock *sk, struct sk_buff *skb) { } + +static inline unsigned long +__psp_skb_coalesce_diff(const struct sk_buff *one, const struct sk_buff *two, + unsigned long diffs) +{ + return diffs; +} + +static inline enum skb_drop_reason +psp_sk_rx_policy_check(struct sock *sk, struct sk_buff *skb) +{ + return 0; +} + +static inline enum skb_drop_reason +psp_twsk_rx_policy_check(struct inet_timewait_sock *tw, struct sk_buff *skb) +{ + return 0; +} + +static inline struct psp_assoc *psp_skb_get_assoc_rcu(struct sk_buff *skb) +{ + return NULL; +} + +static inline unsigned int psp_sk_overhead(const struct sock *sk) +{ + return 0; +} +#endif + +static inline unsigned long +psp_skb_coalesce_diff(const struct sk_buff *one, const struct sk_buff *two) +{ + return __psp_skb_coalesce_diff(one, two, 0); +} + +#endif /* __NET_PSP_HELPERS_H */ diff --git a/include/net/psp/types.h b/include/net/psp/types.h new file mode 100644 index 000000000000..25a9096d4e7d --- /dev/null +++ b/include/net/psp/types.h @@ -0,0 +1,216 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __NET_PSP_H +#define __NET_PSP_H + +#include <linux/mutex.h> +#include <linux/refcount.h> + +struct netlink_ext_ack; + +#define PSP_DEFAULT_UDP_PORT 1000 + +struct psphdr { + u8 nexthdr; + u8 hdrlen; + u8 crypt_offset; + u8 verfl; + __be32 spi; + __be64 iv; + __be64 vc[]; /* optional */ +}; + +#define PSP_ENCAP_HLEN (sizeof(struct udphdr) + sizeof(struct psphdr)) + +#define PSP_SPI_KEY_ID GENMASK(30, 0) +#define PSP_SPI_KEY_PHASE BIT(31) + +#define PSPHDR_CRYPT_OFFSET GENMASK(5, 0) + +#define PSPHDR_VERFL_SAMPLE BIT(7) +#define PSPHDR_VERFL_DROP BIT(6) +#define PSPHDR_VERFL_VERSION GENMASK(5, 2) +#define PSPHDR_VERFL_VIRT BIT(1) +#define PSPHDR_VERFL_ONE BIT(0) + +#define PSP_HDRLEN_NOOPT ((sizeof(struct psphdr) - 8) / 8) + +/** + * struct psp_dev_config - PSP device configuration + * @versions: PSP versions enabled on the device + */ +struct psp_dev_config { + u32 versions; +}; + +/** + * struct psp_dev - PSP device struct + * @main_netdev: original netdevice of this PSP device + * @ops: driver callbacks + * @caps: device capabilities + * @drv_priv: driver priv pointer + * @lock: instance lock, protects all fields + * @refcnt: reference count for the instance + * @id: instance id + * @generation: current generation of the device key + * @config: current device configuration + * @active_assocs: list of registered associations + * @prev_assocs: associations which use old (but still usable) + * device key + * @stale_assocs: associations which use a rotated out key + * + * @stats: statistics maintained by the core + * @stats.rotations: See stats attr key-rotations + * @stats.stales: See stats attr stale-events + * + * @rcu: RCU head for freeing the structure + */ +struct psp_dev { + struct net_device *main_netdev; + + struct psp_dev_ops *ops; + struct psp_dev_caps *caps; + void *drv_priv; + + struct mutex lock; + refcount_t refcnt; + + u32 id; + + u8 generation; + + struct psp_dev_config config; + + struct list_head active_assocs; + struct list_head prev_assocs; + struct list_head stale_assocs; + + struct { + unsigned long rotations; + unsigned long stales; + } stats; + + struct rcu_head rcu; +}; + +#define PSP_GEN_VALID_MASK 0x7f + +/** + * struct psp_dev_caps - PSP device capabilities + */ +struct psp_dev_caps { + /** + * @versions: mask of supported PSP versions + * Set this field to 0 to indicate PSP is not supported at all. + */ + u32 versions; + + /** + * @assoc_drv_spc: size of driver-specific state in Tx assoc + * Determines the size of struct psp_assoc::drv_data + */ + u32 assoc_drv_spc; +}; + +#define PSP_MAX_KEY 32 + +#define PSP_HDR_SIZE 16 /* We don't support optional fields, yet */ +#define PSP_TRL_SIZE 16 /* AES-GCM/GMAC trailer size */ + +struct psp_skb_ext { + __be32 spi; + u16 dev_id; + u8 generation; + u8 version; +}; + +struct psp_key_parsed { + __be32 spi; + u8 key[PSP_MAX_KEY]; +}; + +struct psp_assoc { + struct psp_dev *psd; + + u16 dev_id; + u8 generation; + u8 version; + u8 peer_tx; + + u32 upgrade_seq; + + struct psp_key_parsed tx; + struct psp_key_parsed rx; + + refcount_t refcnt; + struct rcu_head rcu; + struct work_struct work; + struct list_head assocs_list; + + u8 drv_data[] __aligned(8); +}; + +struct psp_dev_stats { + union { + struct { + u64 rx_packets; + u64 rx_bytes; + u64 rx_auth_fail; + u64 rx_error; + u64 rx_bad; + u64 tx_packets; + u64 tx_bytes; + u64 tx_error; + }; + DECLARE_FLEX_ARRAY(u64, required); + }; +}; + +/** + * struct psp_dev_ops - netdev driver facing PSP callbacks + */ +struct psp_dev_ops { + /** + * @set_config: set configuration of a PSP device + * Driver can inspect @psd->config for the previous configuration. + * Core will update @psd->config with @config on success. + */ + int (*set_config)(struct psp_dev *psd, struct psp_dev_config *conf, + struct netlink_ext_ack *extack); + + /** + * @key_rotate: rotate the device key + */ + int (*key_rotate)(struct psp_dev *psd, struct netlink_ext_ack *extack); + + /** + * @rx_spi_alloc: allocate an Rx SPI+key pair + * Allocate an Rx SPI and resulting derived key. + * This key should remain valid until key rotation. + */ + int (*rx_spi_alloc)(struct psp_dev *psd, u32 version, + struct psp_key_parsed *assoc, + struct netlink_ext_ack *extack); + + /** + * @tx_key_add: add a Tx key to the device + * Install an association in the device. Core will allocate space + * for the driver to use at drv_data. + */ + int (*tx_key_add)(struct psp_dev *psd, struct psp_assoc *pas, + struct netlink_ext_ack *extack); + /** + * @tx_key_del: remove a Tx key from the device + * Remove an association from the device. + */ + void (*tx_key_del)(struct psp_dev *psd, struct psp_assoc *pas); + + /** + * @get_stats: get statistics from the device + * Stats required by the spec must be maintained and filled in. + * Stats must be filled in member-by-member, never memset the struct. + */ + void (*get_stats)(struct psp_dev *psd, struct psp_dev_stats *stats); +}; + +#endif /* __NET_PSP_H */ diff --git a/include/net/raw.h b/include/net/raw.h index 32a61481a253..66c0ffeada2e 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -81,6 +81,7 @@ struct raw_sock { struct inet_sock inet; struct icmp_filter filter; u32 ipmr_table; + struct numa_drop_counters drop_counters; }; #define raw_sk(ptr) container_of_const(ptr, struct raw_sock, inet.sk) diff --git a/include/net/red.h b/include/net/red.h index 425364de0df7..159a09359fc0 100644 --- a/include/net/red.h +++ b/include/net/red.h @@ -40,7 +40,7 @@ max_P should be small (not 1), usually 0.01..0.02 is good value. max_P is chosen as a number, so that max_P/(th_max-th_min) - is a negative power of two in order arithmetics to contain + is a negative power of two in order arithmetic to contain only shifts. @@ -159,7 +159,7 @@ static inline u32 red_maxp(u8 Plog) static inline void red_set_vars(struct red_vars *v) { /* Reset average queue length, the value is strictly bound - * to the parameters below, reseting hurts a bit but leaving + * to the parameters below, resetting hurts a bit but leaving * it might result in an unreasonable qavg for a while. --TGR */ v->qavg = 0; @@ -233,10 +233,10 @@ static inline void red_set_parms(struct red_parms *p, int delta = qth_max - qth_min; u32 max_p_delta; - p->qth_min = qth_min << Wlog; - p->qth_max = qth_max << Wlog; - p->Wlog = Wlog; - p->Plog = Plog; + WRITE_ONCE(p->qth_min, qth_min << Wlog); + WRITE_ONCE(p->qth_max, qth_max << Wlog); + WRITE_ONCE(p->Wlog, Wlog); + WRITE_ONCE(p->Plog, Plog); if (delta <= 0) delta = 1; p->qth_delta = delta; @@ -244,7 +244,7 @@ static inline void red_set_parms(struct red_parms *p, max_P = red_maxp(Plog); max_P *= delta; /* max_P = (qth_max - qth_min)/2^Plog */ } - p->max_P = max_P; + WRITE_ONCE(p->max_P, max_P); max_p_delta = max_P / delta; max_p_delta = max(max_p_delta, 1U); p->max_P_reciprocal = reciprocal_value(max_p_delta); @@ -257,7 +257,7 @@ static inline void red_set_parms(struct red_parms *p, p->target_min = qth_min + 2*delta; p->target_max = qth_min + 3*delta; - p->Scell_log = Scell_log; + WRITE_ONCE(p->Scell_log, Scell_log); p->Scell_max = (255 << Scell_log); if (stab) @@ -340,7 +340,7 @@ static inline unsigned long red_calc_qavg_no_idle_time(const struct red_parms *p { /* * NOTE: v->qavg is fixed point number with point at Wlog. - * The formula below is equvalent to floating point + * The formula below is equivalent to floating point * version: * * qavg = qavg*(1-W) + backlog*W; @@ -375,7 +375,7 @@ static inline int red_mark_probability(const struct red_parms *p, OK. qR is random number in the interval (0..1/max_P)*(qth_max-qth_min) i.e. 0..(2^Plog). If we used floating point - arithmetics, it would be: (2^Plog)*rnd_num, + arithmetic, it would be: (2^Plog)*rnd_num, where rnd_num is less 1. Taking into account, that qavg have fixed diff --git a/include/net/regulatory.h b/include/net/regulatory.h index b2cb4a9eb04d..6633627f6e76 100644 --- a/include/net/regulatory.h +++ b/include/net/regulatory.h @@ -71,8 +71,6 @@ enum environment_cap { * CRDA and can be used by other regulatory requests. When a * the last request is not yet processed we must yield until it * is processed before processing any new requests. - * @country_ie_checksum: checksum of the last processed and accepted - * country IE * @country_ie_env: lets us know if the AP is telling us we are outdoor, * indoor, or if it doesn't matter * @list: used to insert into the reg_requests_list linked list @@ -123,7 +121,7 @@ struct regulatory_request { * @REGULATORY_DISABLE_BEACON_HINTS: enable this if your driver needs to * ensure that passive scan flags and beaconing flags may not be lifted by * cfg80211 due to regulatory beacon hints. For more information on beacon - * hints read the documenation for regulatory_hint_found_beacon() + * hints read the documentation for regulatory_hint_found_beacon() * @REGULATORY_COUNTRY_IE_FOLLOW_POWER: for devices that have a preference * that even though they may have programmed their own custom power * setting prior to wiphy registration, they want to ensure their channel @@ -213,6 +211,7 @@ struct ieee80211_reg_rule { u32 flags; u32 dfs_cac_ms; bool has_wmm; + s8 psd; }; struct ieee80211_regdomain { diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 144c39db9898..9b9e04f6bb89 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -18,6 +18,7 @@ #include <linux/refcount.h> #include <net/sock.h> +#include <net/rstreason.h> struct request_sock; struct sk_buff; @@ -29,18 +30,14 @@ struct request_sock_ops { unsigned int obj_size; struct kmem_cache *slab; char *slab_name; - int (*rtx_syn_ack)(const struct sock *sk, - struct request_sock *req); void (*send_ack)(const struct sock *sk, struct sk_buff *skb, struct request_sock *req); void (*send_reset)(const struct sock *sk, - struct sk_buff *skb); + struct sk_buff *skb, + enum sk_rst_reason reason); void (*destructor)(struct request_sock *req); - void (*syn_ack_timeout)(const struct request_sock *req); }; -int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req); - struct saved_syn { u32 mac_hdrlen; u32 network_hdrlen; @@ -61,7 +58,11 @@ struct request_sock { struct request_sock *dl_next; u16 mss; u8 num_retrans; /* number of retransmits */ - u8 syncookie:1; /* syncookie: encode tcpopts in timestamp */ + u8 syncookie:1; /* True if + * 1) tcpopts needs to be encoded in + * TS of SYN+ACK + * 2) ACK is validated by BPF kfunc. + */ u8 num_timeout:7; /* number of timeouts */ u32 ts_recent; struct timer_list rsk_timer; @@ -83,35 +84,43 @@ static inline struct sock *req_to_sk(struct request_sock *req) return (struct sock *)req; } -static inline struct request_sock * -reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, - bool attach_listener) +/** + * skb_steal_sock - steal a socket from an sk_buff + * @skb: sk_buff to steal the socket from + * @refcounted: is set to true if the socket is reference-counted + * @prefetched: is set to true if the socket was assigned from bpf + */ +static inline struct sock *skb_steal_sock(struct sk_buff *skb, + bool *refcounted, bool *prefetched) { - struct request_sock *req; + struct sock *sk = skb->sk; - req = kmem_cache_alloc(ops->slab, GFP_ATOMIC | __GFP_NOWARN); - if (!req) + if (!sk) { + *prefetched = false; + *refcounted = false; return NULL; - req->rsk_listener = NULL; - if (attach_listener) { - if (unlikely(!refcount_inc_not_zero(&sk_listener->sk_refcnt))) { - kmem_cache_free(ops->slab, req); - return NULL; + } + + *prefetched = skb_sk_is_prefetched(skb); + if (*prefetched) { +#if IS_ENABLED(CONFIG_SYN_COOKIES) + if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) { + struct request_sock *req = inet_reqsk(sk); + + *refcounted = false; + sk = req->rsk_listener; + req->rsk_listener = NULL; + return sk; } - req->rsk_listener = sk_listener; +#endif + *refcounted = sk_is_refcounted(sk); + } else { + *refcounted = true; } - req->rsk_ops = ops; - req_to_sk(req)->sk_prot = sk_listener->sk_prot; - sk_node_init(&req_to_sk(req)->sk_node); - sk_tx_queue_clear(req_to_sk(req)); - req->saved_syn = NULL; - req->timeout = 0; - req->num_timeout = 0; - req->num_retrans = 0; - req->sk = NULL; - refcount_set(&req->rsk_refcnt, 0); - return req; + skb->destructor = NULL; + skb->sk = NULL; + return sk; } static inline void __reqsk_free(struct request_sock *req) @@ -125,14 +134,14 @@ static inline void __reqsk_free(struct request_sock *req) static inline void reqsk_free(struct request_sock *req) { - WARN_ON_ONCE(refcount_read(&req->rsk_refcnt) != 0); + DEBUG_NET_WARN_ON_ONCE(refcount_read(&req->rsk_refcnt) != 0); __reqsk_free(req); } static inline void reqsk_put(struct request_sock *req) { if (refcount_dec_and_test(&req->rsk_refcnt)) - reqsk_free(req); + __reqsk_free(req); } /* @@ -175,8 +184,8 @@ struct fastopen_queue { struct request_sock_queue { spinlock_t rskq_lock; u8 rskq_defer_accept; + u8 synflood_warned; - u32 synflood_warned; atomic_t qlen; atomic_t young; @@ -238,4 +247,16 @@ static inline int reqsk_queue_len_young(const struct request_sock_queue *queue) return atomic_read(&queue->young); } +/* RFC 7323 2.3 Using the Window Scale Option + * The window field (SEG.WND) of every outgoing segment, with the + * exception of <SYN> segments, MUST be right-shifted by + * Rcv.Wind.Shift bits. + * + * This means the SEG.WND carried in SYNACK can not exceed 65535. + * We use this property to harden TCP stack while in NEW_SYN_RECV state. + */ +static inline u32 tcp_synack_window(const struct request_sock *req) +{ + return min(req->rsk_rcv_wnd, 65535U); +} #endif /* _REQUEST_SOCK_H */ diff --git a/include/net/rose.h b/include/net/rose.h index 23267b4efcfa..2b5491bbf39a 100644 --- a/include/net/rose.h +++ b/include/net/rose.h @@ -8,6 +8,7 @@ #ifndef _ROSE_H #define _ROSE_H +#include <linux/refcount.h> #include <linux/rose.h> #include <net/ax25.h> #include <net/sock.h> @@ -96,7 +97,7 @@ struct rose_neigh { ax25_cb *ax25; struct net_device *dev; unsigned short count; - unsigned short use; + refcount_t use; unsigned int number; char restarted; char dce_mode; @@ -151,6 +152,21 @@ struct rose_sock { #define rose_sk(sk) ((struct rose_sock *)(sk)) +static inline void rose_neigh_hold(struct rose_neigh *rose_neigh) +{ + refcount_inc(&rose_neigh->use); +} + +static inline void rose_neigh_put(struct rose_neigh *rose_neigh) +{ + if (refcount_dec_and_test(&rose_neigh->use)) { + if (rose_neigh->ax25) + ax25_cb_put(rose_neigh->ax25); + kfree(rose_neigh->digipeat); + kfree(rose_neigh); + } +} + /* af_rose.c */ extern ax25_address rose_callsign; extern int sysctl_rose_restart_request_timeout; diff --git a/include/net/route.h b/include/net/route.h index 51a45b1887b5..f90106f383c5 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -27,6 +27,8 @@ #include <net/ip_fib.h> #include <net/arp.h> #include <net/ndisc.h> +#include <net/inet_dscp.h> +#include <net/sock.h> #include <linux/in_route.h> #include <linux/rtnetlink.h> #include <linux/rcupdate.h> @@ -35,11 +37,6 @@ #include <linux/cache.h> #include <linux/security.h> -#define RTO_ONLINK 0x01 - -#define RT_CONN_FLAGS(sk) (RT_TOS(inet_sk(sk)->tos) | sock_flag(sk, SOCK_LOCALROUTE)) -#define RT_CONN_FLAGS_TOS(sk,tos) (RT_TOS(tos) | sock_flag(sk, SOCK_LOCALROUTE)) - static inline __u8 ip_sock_rt_scope(const struct sock *sk) { if (sock_flag(sk, SOCK_LOCALROUTE)) @@ -50,7 +47,7 @@ static inline __u8 ip_sock_rt_scope(const struct sock *sk) static inline __u8 ip_sock_rt_tos(const struct sock *sk) { - return RT_TOS(inet_sk(sk)->tos); + return READ_ONCE(inet_sk(sk)->tos) & INET_DSCP_MASK; } struct ip_tunnel_info; @@ -80,6 +77,17 @@ struct rtable { rt_pmtu:31; }; +#define dst_rtable(_ptr) container_of_const(_ptr, struct rtable, dst) + +/** + * skb_rtable - Returns the skb &rtable + * @skb: buffer + */ +static inline struct rtable *skb_rtable(const struct sk_buff *skb) +{ + return dst_rtable(skb_dst(skb)); +} + static inline bool rt_is_input_route(const struct rtable *rt) { return rt->rt_is_input != 0; @@ -122,6 +130,33 @@ struct in_device; int ip_rt_init(void); void rt_cache_flush(struct net *net); void rt_flush_dev(struct net_device *dev); + +static inline void inet_sk_init_flowi4(const struct inet_sock *inet, + struct flowi4 *fl4) +{ + const struct ip_options_rcu *ip4_opt; + const struct sock *sk; + __be32 daddr; + + rcu_read_lock(); + ip4_opt = rcu_dereference(inet->inet_opt); + + /* Source routing option overrides the socket destination address */ + if (ip4_opt && ip4_opt->opt.srr) + daddr = ip4_opt->opt.faddr; + else + daddr = inet->inet_daddr; + rcu_read_unlock(); + + sk = &inet->sk; + flowi4_init_output(fl4, sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark), + ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), + sk->sk_protocol, inet_sk_flowi_flags(sk), daddr, + inet->inet_saddr, inet->inet_dport, + inet->inet_sport, sk_uid(sk)); + security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); +} + struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *flp, const struct sk_buff *skb); struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *flp, @@ -136,12 +171,6 @@ static inline struct rtable *__ip_route_output_key(struct net *net, struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp, const struct sock *sk); -struct rtable *ip_route_output_tunnel(struct sk_buff *skb, - struct net_device *dev, - struct net *net, __be32 *saddr, - const struct ip_tunnel_info *info, - u8 protocol, bool use_cache); - struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig); @@ -150,15 +179,22 @@ static inline struct rtable *ip_route_output_key(struct net *net, struct flowi4 return ip_route_output_flow(net, flp, NULL); } +/* Simplistic IPv4 route lookup function. + * This is only suitable for some particular use cases: since the flowi4 + * structure is only partially set, it may bypass some fib-rules. + */ static inline struct rtable *ip_route_output(struct net *net, __be32 daddr, - __be32 saddr, u8 tos, int oif) + __be32 saddr, dscp_t dscp, + int oif, __u8 scope) { struct flowi4 fl4 = { .flowi4_oif = oif, - .flowi4_tos = tos, + .flowi4_dscp = dscp, + .flowi4_scope = scope, .daddr = daddr, .saddr = saddr, }; + return ip_route_output_key(net, &fl4); } @@ -169,51 +205,42 @@ static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi __u8 proto, __u8 tos, int oif) { flowi4_init_output(fl4, oif, sk ? READ_ONCE(sk->sk_mark) : 0, tos, - RT_SCOPE_UNIVERSE, proto, - sk ? inet_sk_flowi_flags(sk) : 0, + sk ? ip_sock_rt_scope(sk) : RT_SCOPE_UNIVERSE, + proto, sk ? inet_sk_flowi_flags(sk) : 0, daddr, saddr, dport, sport, sock_net_uid(net, sk)); if (sk) security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); return ip_route_output_flow(net, fl4, sk); } -static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4 *fl4, - __be32 daddr, __be32 saddr, - __be32 gre_key, __u8 tos, int oif) +enum skb_drop_reason +ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev, + struct in_device *in_dev, u32 *itag); +enum skb_drop_reason +ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev); +enum skb_drop_reason +ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev, + const struct sk_buff *hint); + +static inline enum skb_drop_reason +ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, dscp_t dscp, + struct net_device *devin) { - memset(fl4, 0, sizeof(*fl4)); - fl4->flowi4_oif = oif; - fl4->daddr = daddr; - fl4->saddr = saddr; - fl4->flowi4_tos = tos; - fl4->flowi4_proto = IPPROTO_GRE; - fl4->fl4_gre_key = gre_key; - return ip_route_output_key(net, fl4); -} -int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev, - struct in_device *in_dev, u32 *itag); -int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src, - u8 tos, struct net_device *devin); -int ip_route_use_hint(struct sk_buff *skb, __be32 dst, __be32 src, - u8 tos, struct net_device *devin, - const struct sk_buff *hint); - -static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, - u8 tos, struct net_device *devin) -{ - int err; + enum skb_drop_reason reason; rcu_read_lock(); - err = ip_route_input_noref(skb, dst, src, tos, devin); - if (!err) { + reason = ip_route_input_noref(skb, dst, src, dscp, devin); + if (!reason) { skb_dst_force(skb); if (!skb_dst(skb)) - err = -EINVAL; + reason = SKB_DROP_REASON_NOT_SPECIFIED; } rcu_read_unlock(); - return err; + return reason; } void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, int oif, @@ -258,8 +285,6 @@ static inline void ip_rt_put(struct rtable *rt) dst_release(&rt->dst); } -#define IPTOS_RT_MASK (IPTOS_TOS_MASK & ~3) - extern const __u8 ip_tos2prio[16]; static inline char rt_tos2priority(u8 tos) @@ -301,9 +326,12 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, if (inet_test_bit(TRANSPARENT, sk)) flow_flags |= FLOWI_FLAG_ANYSRC; + if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !sport) + flow_flags |= FLOWI_FLAG_ANY_SPORT; + flowi4_init_output(fl4, oif, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), protocol, flow_flags, dst, - src, dport, sport, sk->sk_uid); + src, dport, sport, sk_uid(sk)); } static inline struct rtable *ip_route_connect(struct flowi4 *fl4, __be32 dst, @@ -357,10 +385,15 @@ static inline int inet_iif(const struct sk_buff *skb) static inline int ip4_dst_hoplimit(const struct dst_entry *dst) { int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT); - struct net *net = dev_net(dst->dev); - if (hoplimit == 0) + if (hoplimit == 0) { + const struct net *net; + + rcu_read_lock(); + net = dst_dev_net_rcu(dst); hoplimit = READ_ONCE(net->ipv4.sysctl_ip_default_ttl); + rcu_read_unlock(); + } return hoplimit; } diff --git a/include/net/rps.h b/include/net/rps.h new file mode 100644 index 000000000000..f1794cd2e7fb --- /dev/null +++ b/include/net/rps.h @@ -0,0 +1,208 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _NET_RPS_H +#define _NET_RPS_H + +#include <linux/types.h> +#include <linux/static_key.h> +#include <net/sock.h> +#include <net/hotdata.h> + +#ifdef CONFIG_RPS + +extern struct static_key_false rps_needed; +extern struct static_key_false rfs_needed; + +/* + * This structure holds an RPS map which can be of variable length. The + * map is an array of CPUs. + */ +struct rps_map { + unsigned int len; + struct rcu_head rcu; + u16 cpus[]; +}; +#define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + ((_num) * sizeof(u16))) + +/* + * The rps_dev_flow structure contains the mapping of a flow to a CPU, the + * tail pointer for that CPU's input queue at the time of last enqueue, a + * hardware filter index, and the hash of the flow if aRFS is enabled. + */ +struct rps_dev_flow { + u16 cpu; + u16 filter; + unsigned int last_qtail; +#ifdef CONFIG_RFS_ACCEL + u32 hash; +#endif +}; +#define RPS_NO_FILTER 0xffff + +/* + * The rps_dev_flow_table structure contains a table of flow mappings. + */ +struct rps_dev_flow_table { + u8 log; + struct rcu_head rcu; + struct rps_dev_flow flows[]; +}; +#define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \ + ((_num) * sizeof(struct rps_dev_flow))) + +/* + * The rps_sock_flow_table contains mappings of flows to the last CPU + * on which they were processed by the application (set in recvmsg). + * Each entry is a 32bit value. Upper part is the high-order bits + * of flow hash, lower part is CPU number. + * rps_cpu_mask is used to partition the space, depending on number of + * possible CPUs : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1 + * For example, if 64 CPUs are possible, rps_cpu_mask = 0x3f, + * meaning we use 32-6=26 bits for the hash. + */ +struct rps_sock_flow_table { + struct rcu_head rcu; + u32 mask; + + u32 ents[] ____cacheline_aligned_in_smp; +}; +#define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num])) + +#define RPS_NO_CPU 0xffff + +static inline void rps_record_sock_flow(struct rps_sock_flow_table *table, + u32 hash) +{ + unsigned int index = hash & table->mask; + u32 val = hash & ~net_hotdata.rps_cpu_mask; + + /* We only give a hint, preemption can change CPU under us */ + val |= raw_smp_processor_id(); + + /* The following WRITE_ONCE() is paired with the READ_ONCE() + * here, and another one in get_rps_cpu(). + */ + if (READ_ONCE(table->ents[index]) != val) + WRITE_ONCE(table->ents[index], val); +} + +static inline void _sock_rps_record_flow_hash(__u32 hash) +{ + struct rps_sock_flow_table *sock_flow_table; + + if (!hash) + return; + rcu_read_lock(); + sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table); + if (sock_flow_table) + rps_record_sock_flow(sock_flow_table, hash); + rcu_read_unlock(); +} + +static inline void _sock_rps_record_flow(const struct sock *sk) +{ + /* Reading sk->sk_rxhash might incur an expensive cache line + * miss. + * + * TCP_ESTABLISHED does cover almost all states where RFS + * might be useful, and is cheaper [1] than testing : + * IPv4: inet_sk(sk)->inet_daddr + * IPv6: ipv6_addr_any(&sk->sk_v6_daddr) + * OR an additional socket flag + * [1] : sk_state and sk_prot are in the same cache line. + */ + if (sk->sk_state == TCP_ESTABLISHED) { + /* This READ_ONCE() is paired with the WRITE_ONCE() + * from sock_rps_save_rxhash() and sock_rps_reset_rxhash(). + */ + _sock_rps_record_flow_hash(READ_ONCE(sk->sk_rxhash)); + } +} + +static inline void _sock_rps_delete_flow(const struct sock *sk) +{ + struct rps_sock_flow_table *table; + u32 hash, index; + + hash = READ_ONCE(sk->sk_rxhash); + if (!hash) + return; + + rcu_read_lock(); + table = rcu_dereference(net_hotdata.rps_sock_flow_table); + if (table) { + index = hash & table->mask; + if (READ_ONCE(table->ents[index]) != RPS_NO_CPU) + WRITE_ONCE(table->ents[index], RPS_NO_CPU); + } + rcu_read_unlock(); +} +#endif /* CONFIG_RPS */ + +static inline bool rfs_is_needed(void) +{ +#ifdef CONFIG_RPS + return static_branch_unlikely(&rfs_needed); +#else + return false; +#endif +} + +static inline void sock_rps_record_flow_hash(__u32 hash) +{ +#ifdef CONFIG_RPS + if (!rfs_is_needed()) + return; + + _sock_rps_record_flow_hash(hash); +#endif +} + +static inline void sock_rps_record_flow(const struct sock *sk) +{ +#ifdef CONFIG_RPS + if (!rfs_is_needed()) + return; + + _sock_rps_record_flow(sk); +#endif +} + +static inline void sock_rps_delete_flow(const struct sock *sk) +{ +#ifdef CONFIG_RPS + if (!rfs_is_needed()) + return; + + _sock_rps_delete_flow(sk); +#endif +} + +static inline u32 rps_input_queue_tail_incr(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + return ++sd->input_queue_tail; +#else + return 0; +#endif +} + +static inline void rps_input_queue_tail_save(u32 *dest, u32 tail) +{ +#ifdef CONFIG_RPS + WRITE_ONCE(*dest, tail); +#endif +} + +static inline void rps_input_queue_head_add(struct softnet_data *sd, int val) +{ +#ifdef CONFIG_RPS + WRITE_ONCE(sd->input_queue_head, sd->input_queue_head + val); +#endif +} + +static inline void rps_input_queue_head_incr(struct softnet_data *sd) +{ + rps_input_queue_head_add(sd, 1); +} + +#endif /* _NET_RPS_H */ diff --git a/include/net/rstreason.h b/include/net/rstreason.h new file mode 100644 index 000000000000..979ac87b5d99 --- /dev/null +++ b/include/net/rstreason.h @@ -0,0 +1,221 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _LINUX_RSTREASON_H +#define _LINUX_RSTREASON_H +#include <net/dropreason-core.h> +#include <uapi/linux/mptcp.h> + +#define DEFINE_RST_REASON(FN, FNe) \ + FN(NOT_SPECIFIED) \ + FN(NO_SOCKET) \ + FN(TCP_INVALID_ACK_SEQUENCE) \ + FN(TCP_RFC7323_PAWS) \ + FN(TCP_TOO_OLD_ACK) \ + FN(TCP_ACK_UNSENT_DATA) \ + FN(TCP_FLAGS) \ + FN(TCP_OLD_ACK) \ + FN(TCP_ABORT_ON_DATA) \ + FN(TCP_TIMEWAIT_SOCKET) \ + FN(INVALID_SYN) \ + FN(TCP_ABORT_ON_CLOSE) \ + FN(TCP_ABORT_ON_LINGER) \ + FN(TCP_ABORT_ON_MEMORY) \ + FN(TCP_STATE) \ + FN(TCP_KEEPALIVE_TIMEOUT) \ + FN(TCP_DISCONNECT_WITH_DATA) \ + FN(MPTCP_RST_EUNSPEC) \ + FN(MPTCP_RST_EMPTCP) \ + FN(MPTCP_RST_ERESOURCE) \ + FN(MPTCP_RST_EPROHIBIT) \ + FN(MPTCP_RST_EWQ2BIG) \ + FN(MPTCP_RST_EBADPERF) \ + FN(MPTCP_RST_EMIDDLEBOX) \ + FN(ERROR) \ + FNe(MAX) + +/** + * enum sk_rst_reason - the reasons of socket reset + * + * The reasons of sk reset, which are used in TCP/MPTCP protocols. + * + * There are three parts in order: + * 1) skb drop reasons: relying on drop reasons for such as passive reset + * 2) independent reset reasons: such as active reset reasons + * 3) reset reasons in MPTCP: only for MPTCP use + */ +enum sk_rst_reason { + /* Refer to include/net/dropreason-core.h + * Rely on skb drop reasons because it indicates exactly why RST + * could happen. + */ + /** @SK_RST_REASON_NOT_SPECIFIED: reset reason is not specified */ + SK_RST_REASON_NOT_SPECIFIED, + /** @SK_RST_REASON_NO_SOCKET: no valid socket that can be used */ + SK_RST_REASON_NO_SOCKET, + /** + * @SK_RST_REASON_TCP_INVALID_ACK_SEQUENCE: Not acceptable ACK SEQ + * field because ack sequence is not in the window between snd_una + * and snd_nxt + */ + SK_RST_REASON_TCP_INVALID_ACK_SEQUENCE, + /** + * @SK_RST_REASON_TCP_RFC7323_PAWS: PAWS check, corresponding to + * LINUX_MIB_PAWSESTABREJECTED, LINUX_MIB_PAWSACTIVEREJECTED + */ + SK_RST_REASON_TCP_RFC7323_PAWS, + /** @SK_RST_REASON_TCP_TOO_OLD_ACK: TCP ACK is too old */ + SK_RST_REASON_TCP_TOO_OLD_ACK, + /** + * @SK_RST_REASON_TCP_ACK_UNSENT_DATA: TCP ACK for data we haven't + * sent yet + */ + SK_RST_REASON_TCP_ACK_UNSENT_DATA, + /** @SK_RST_REASON_TCP_FLAGS: TCP flags invalid */ + SK_RST_REASON_TCP_FLAGS, + /** @SK_RST_REASON_TCP_OLD_ACK: TCP ACK is old, but in window */ + SK_RST_REASON_TCP_OLD_ACK, + /** + * @SK_RST_REASON_TCP_ABORT_ON_DATA: abort on data + * corresponding to LINUX_MIB_TCPABORTONDATA + */ + SK_RST_REASON_TCP_ABORT_ON_DATA, + + /* Here start with the independent reasons */ + /** @SK_RST_REASON_TCP_TIMEWAIT_SOCKET: happen on the timewait socket */ + SK_RST_REASON_TCP_TIMEWAIT_SOCKET, + /** + * @SK_RST_REASON_INVALID_SYN: receive bad syn packet + * RFC 793 says if the state is not CLOSED/LISTEN/SYN-SENT then + * "fourth, check the SYN bit,...If the SYN is in the window it is + * an error, send a reset" + */ + SK_RST_REASON_INVALID_SYN, + /** + * @SK_RST_REASON_TCP_ABORT_ON_CLOSE: abort on close + * corresponding to LINUX_MIB_TCPABORTONCLOSE + */ + SK_RST_REASON_TCP_ABORT_ON_CLOSE, + /** + * @SK_RST_REASON_TCP_ABORT_ON_LINGER: abort on linger + * corresponding to LINUX_MIB_TCPABORTONLINGER + */ + SK_RST_REASON_TCP_ABORT_ON_LINGER, + /** + * @SK_RST_REASON_TCP_ABORT_ON_MEMORY: abort on memory + * corresponding to LINUX_MIB_TCPABORTONMEMORY + */ + SK_RST_REASON_TCP_ABORT_ON_MEMORY, + /** + * @SK_RST_REASON_TCP_STATE: abort on tcp state + * Please see RFC 9293 for all possible reset conditions + */ + SK_RST_REASON_TCP_STATE, + /** + * @SK_RST_REASON_TCP_KEEPALIVE_TIMEOUT: time to timeout + * When we have already run out of all the chances, which means + * keepalive timeout, we have to reset the connection + */ + SK_RST_REASON_TCP_KEEPALIVE_TIMEOUT, + /** + * @SK_RST_REASON_TCP_DISCONNECT_WITH_DATA: disconnect when write + * queue is not empty + * It means user has written data into the write queue when doing + * disconnecting, so we have to send an RST. + */ + SK_RST_REASON_TCP_DISCONNECT_WITH_DATA, + + /* Copy from include/uapi/linux/mptcp.h. + * These reset fields will not be changed since they adhere to + * RFC 8684. So do not touch them. I'm going to list each definition + * of them respectively. + */ + /** + * @SK_RST_REASON_MPTCP_RST_EUNSPEC: Unspecified error. + * This is the default error; it implies that the subflow is no + * longer available. The presence of this option shows that the + * RST was generated by an MPTCP-aware device. + */ + SK_RST_REASON_MPTCP_RST_EUNSPEC, + /** + * @SK_RST_REASON_MPTCP_RST_EMPTCP: MPTCP-specific error. + * An error has been detected in the processing of MPTCP options. + * This is the usual reason code to return in the cases where a RST + * is being sent to close a subflow because of an invalid response. + */ + SK_RST_REASON_MPTCP_RST_EMPTCP, + /** + * @SK_RST_REASON_MPTCP_RST_ERESOURCE: Lack of resources. + * This code indicates that the sending host does not have enough + * resources to support the terminated subflow. + */ + SK_RST_REASON_MPTCP_RST_ERESOURCE, + /** + * @SK_RST_REASON_MPTCP_RST_EPROHIBIT: Administratively prohibited. + * This code indicates that the requested subflow is prohibited by + * the policies of the sending host. + */ + SK_RST_REASON_MPTCP_RST_EPROHIBIT, + /** + * @SK_RST_REASON_MPTCP_RST_EWQ2BIG: Too much outstanding data. + * This code indicates that there is an excessive amount of data + * that needs to be transmitted over the terminated subflow while + * having already been acknowledged over one or more other subflows. + * This may occur if a path has been unavailable for a short period + * and it is more efficient to reset and start again than it is to + * retransmit the queued data. + */ + SK_RST_REASON_MPTCP_RST_EWQ2BIG, + /** + * @SK_RST_REASON_MPTCP_RST_EBADPERF: Unacceptable performance. + * This code indicates that the performance of this subflow was + * too low compared to the other subflows of this Multipath TCP + * connection. + */ + SK_RST_REASON_MPTCP_RST_EBADPERF, + /** + * @SK_RST_REASON_MPTCP_RST_EMIDDLEBOX: Middlebox interference. + * Middlebox interference has been detected over this subflow, + * making MPTCP signaling invalid. For example, this may be sent + * if the checksum does not validate. + */ + SK_RST_REASON_MPTCP_RST_EMIDDLEBOX, + + /** @SK_RST_REASON_ERROR: unexpected error happens */ + SK_RST_REASON_ERROR, + + /** + * @SK_RST_REASON_MAX: Maximum of socket reset reasons. + * It shouldn't be used as a real 'reason'. + */ + SK_RST_REASON_MAX, +}; + +/* Convert skb drop reasons to enum sk_rst_reason type */ +static inline enum sk_rst_reason +sk_rst_convert_drop_reason(enum skb_drop_reason reason) +{ + switch (reason) { + case SKB_DROP_REASON_NOT_SPECIFIED: + return SK_RST_REASON_NOT_SPECIFIED; + case SKB_DROP_REASON_NO_SOCKET: + return SK_RST_REASON_NO_SOCKET; + case SKB_DROP_REASON_TCP_INVALID_ACK_SEQUENCE: + return SK_RST_REASON_TCP_INVALID_ACK_SEQUENCE; + case SKB_DROP_REASON_TCP_RFC7323_PAWS: + return SK_RST_REASON_TCP_RFC7323_PAWS; + case SKB_DROP_REASON_TCP_TOO_OLD_ACK: + return SK_RST_REASON_TCP_TOO_OLD_ACK; + case SKB_DROP_REASON_TCP_ACK_UNSENT_DATA: + return SK_RST_REASON_TCP_ACK_UNSENT_DATA; + case SKB_DROP_REASON_TCP_FLAGS: + return SK_RST_REASON_TCP_FLAGS; + case SKB_DROP_REASON_TCP_OLD_ACK: + return SK_RST_REASON_TCP_OLD_ACK; + case SKB_DROP_REASON_TCP_ABORT_ON_DATA: + return SK_RST_REASON_TCP_ABORT_ON_DATA; + default: + /* If we don't have our own corresponding reason */ + return SK_RST_REASON_NOT_SPECIFIED; + } +} +#endif diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 6506221c5fe3..ec65a8cebb99 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -3,6 +3,7 @@ #define __NET_RTNETLINK_H #include <linux/rtnetlink.h> +#include <linux/srcu.h> #include <net/netlink.h> typedef int (*rtnl_doit_func)(struct sk_buff *, struct nlmsghdr *, @@ -11,7 +12,11 @@ typedef int (*rtnl_dumpit_func)(struct sk_buff *, struct netlink_callback *); enum rtnl_link_flags { RTNL_FLAG_DOIT_UNLOCKED = BIT(0), +#define RTNL_FLAG_DOIT_PERNET RTNL_FLAG_DOIT_UNLOCKED +#define RTNL_FLAG_DOIT_PERNET_WIP RTNL_FLAG_DOIT_UNLOCKED RTNL_FLAG_BULK_DEL_SUPPORTED = BIT(1), + RTNL_FLAG_DUMP_UNLOCKED = BIT(2), + RTNL_FLAG_DUMP_SPLIT_NLM_DONE = BIT(3), /* legacy behavior */ }; enum rtnl_kinds { @@ -27,13 +32,35 @@ static inline enum rtnl_kinds rtnl_msgtype_kind(int msgtype) return msgtype & RTNL_KIND_MASK; } -void rtnl_register(int protocol, int msgtype, - rtnl_doit_func, rtnl_dumpit_func, unsigned int flags); -int rtnl_register_module(struct module *owner, int protocol, int msgtype, - rtnl_doit_func, rtnl_dumpit_func, unsigned int flags); -int rtnl_unregister(int protocol, int msgtype); +/** + * struct rtnl_msg_handler - rtnetlink message type and handlers + * + * @owner: NULL for built-in, THIS_MODULE for module + * @protocol: Protocol family or PF_UNSPEC + * @msgtype: rtnetlink message type + * @doit: Function pointer called for each request message + * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message + * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions + */ +struct rtnl_msg_handler { + struct module *owner; + int protocol; + int msgtype; + rtnl_doit_func doit; + rtnl_dumpit_func dumpit; + int flags; +}; + void rtnl_unregister_all(int protocol); +int __rtnl_register_many(const struct rtnl_msg_handler *handlers, int n); +void __rtnl_unregister_many(const struct rtnl_msg_handler *handlers, int n); + +#define rtnl_register_many(handlers) \ + __rtnl_register_many(handlers, ARRAY_SIZE(handlers)) +#define rtnl_unregister_many(handlers) \ + __rtnl_unregister_many(handlers, ARRAY_SIZE(handlers)) + static inline int rtnl_msg_family(const struct nlmsghdr *nlh) { if (nlmsg_len(nlh) >= sizeof(struct rtgenmsg)) @@ -43,11 +70,47 @@ static inline int rtnl_msg_family(const struct nlmsghdr *nlh) } /** + * struct rtnl_newlink_params - parameters of rtnl_link_ops::newlink() + * + * @src_net: Source netns of rtnetlink socket + * @link_net: Link netns by IFLA_LINK_NETNSID, NULL if not specified + * @peer_net: Peer netns + * @tb: IFLA_* attributes + * @data: IFLA_INFO_DATA attributes + */ +struct rtnl_newlink_params { + struct net *src_net; + struct net *link_net; + struct net *peer_net; + struct nlattr **tb; + struct nlattr **data; +}; + +/* Get effective link netns from newlink params. Generally, this is link_net + * and falls back to src_net. But for compatibility, a driver may * choose to + * use dev_net(dev) instead. + */ +static inline struct net *rtnl_newlink_link_net(struct rtnl_newlink_params *p) +{ + return p->link_net ? : p->src_net; +} + +/* Get peer netns from newlink params. Fallback to link netns if peer netns is + * not specified explicitly. + */ +static inline struct net *rtnl_newlink_peer_net(struct rtnl_newlink_params *p) +{ + return p->peer_net ? : rtnl_newlink_link_net(p); +} + +/** * struct rtnl_link_ops - rtnetlink link operations * - * @list: Used internally + * @list: Used internally, protected by link_ops_mutex and SRCU + * @srcu: Used internally * @kind: Identifier * @netns_refund: Physical device, move to init_net on netns exit + * @peer_type: Peer device specific netlink attribute number (e.g. VETH_INFO_PEER) * @maxtype: Highest device specific netlink attribute number * @policy: Netlink policy for device specific attribute validation * @validate: Optional validation function for netlink/changelink parameters @@ -76,6 +139,7 @@ static inline int rtnl_msg_family(const struct nlmsghdr *nlh) */ struct rtnl_link_ops { struct list_head list; + struct srcu_struct srcu; const char *kind; @@ -88,16 +152,15 @@ struct rtnl_link_ops { void (*setup)(struct net_device *dev); bool netns_refund; + const u16 peer_type; unsigned int maxtype; const struct nla_policy *policy; int (*validate)(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack); - int (*newlink)(struct net *src_net, - struct net_device *dev, - struct nlattr *tb[], - struct nlattr *data[], + int (*newlink)(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack); int (*changelink)(struct net_device *dev, struct nlattr *tb[], @@ -136,16 +199,14 @@ struct rtnl_link_ops { int *prividx, int attr); }; -int __rtnl_link_register(struct rtnl_link_ops *ops); -void __rtnl_link_unregister(struct rtnl_link_ops *ops); - int rtnl_link_register(struct rtnl_link_ops *ops); void rtnl_link_unregister(struct rtnl_link_ops *ops); /** * struct rtnl_af_ops - rtnetlink address family operations * - * @list: Used internally + * @list: Used internally, protected by RTNL and SRCU + * @srcu: Used internally * @family: Address family * @fill_link_af: Function to fill IFLA_AF_SPEC with address family * specific netlink attributes. @@ -158,6 +219,8 @@ void rtnl_link_unregister(struct rtnl_link_ops *ops); */ struct rtnl_af_ops { struct list_head list; + struct srcu_struct srcu; + int family; int (*fill_link_af)(struct sk_buff *skb, @@ -177,7 +240,7 @@ struct rtnl_af_ops { size_t (*get_stats_af_size)(const struct net_device *dev); }; -void rtnl_af_register(struct rtnl_af_ops *ops); +int rtnl_af_register(struct rtnl_af_ops *ops); void rtnl_af_unregister(struct rtnl_af_ops *ops); struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]); diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index f232512505f8..c3a7268b567e 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -19,6 +19,7 @@ #include <net/gen_stats.h> #include <net/rtnetlink.h> #include <net/flow_offload.h> +#include <linux/xarray.h> struct Qdisc_ops; struct qdisc_walker; @@ -40,13 +41,6 @@ enum qdisc_state_t { __QDISC_STATE_DRAINING, }; -enum qdisc_state2_t { - /* Only for !TCQ_F_NOLOCK qdisc. Never access it directly. - * Use qdisc_run_begin/end() or qdisc_is_running() instead. - */ - __QDISC_STATE2_RUNNING, -}; - #define QDISC_STATE_MISSED BIT(__QDISC_STATE_MISSED) #define QDISC_STATE_DRAINING BIT(__QDISC_STATE_DRAINING) @@ -94,6 +88,8 @@ struct Qdisc { #define TCQ_F_INVISIBLE 0x80 /* invisible by default in dump */ #define TCQ_F_NOLOCK 0x100 /* qdisc does not require locking */ #define TCQ_F_OFFLOADED 0x200 /* qdisc is offloaded to HW */ +#define TCQ_F_DEQUEUE_DROPS 0x400 /* ->dequeue() can drop packets in q->to_free */ + u32 limit; const struct Qdisc_ops *ops; struct qdisc_size_table __rcu *stab; @@ -109,23 +105,35 @@ struct Qdisc { int pad; refcount_t refcnt; - /* - * For performance sake on SMP, we put highly modified fields at the end - */ - struct sk_buff_head gso_skb ____cacheline_aligned_in_smp; - struct qdisc_skb_head q; - struct gnet_stats_basic_sync bstats; - struct gnet_stats_queue qstats; - unsigned long state; - unsigned long state2; /* must be written under qdisc spinlock */ - struct Qdisc *next_sched; - struct sk_buff_head skb_bad_txq; - - spinlock_t busylock ____cacheline_aligned_in_smp; + /* Cache line potentially dirtied in dequeue() or __netif_reschedule(). */ + __cacheline_group_begin(Qdisc_read_mostly) ____cacheline_aligned; + struct sk_buff_head gso_skb; + struct Qdisc *next_sched; + struct sk_buff_head skb_bad_txq; + __cacheline_group_end(Qdisc_read_mostly); + + /* Fields dirtied in dequeue() fast path. */ + __cacheline_group_begin(Qdisc_write) ____cacheline_aligned; + struct qdisc_skb_head q; + unsigned long state; + struct gnet_stats_basic_sync bstats; + bool running; /* must be written under qdisc spinlock */ + + /* Note : we only change qstats.backlog in fast path. */ + struct gnet_stats_queue qstats; + + struct sk_buff *to_free; + __cacheline_group_end(Qdisc_write); + + + atomic_long_t defer_count ____cacheline_aligned_in_smp; + struct llist_head defer_list; + spinlock_t seqlock; struct rcu_head rcu; netdevice_tracker dev_tracker; + struct lock_class_key root_lock_key; /* private data */ long privdata[] ____cacheline_aligned; }; @@ -165,7 +173,7 @@ static inline bool qdisc_is_running(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_NOLOCK) return spin_is_locked(&qdisc->seqlock); - return test_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); + return READ_ONCE(qdisc->running); } static inline bool nolock_qdisc_is_empty(const struct Qdisc *qdisc) @@ -208,11 +216,16 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc) */ return spin_trylock(&qdisc->seqlock); } - return !__test_and_set_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); + if (READ_ONCE(qdisc->running)) + return false; + WRITE_ONCE(qdisc->running, true); + return true; } -static inline void qdisc_run_end(struct Qdisc *qdisc) +static inline struct sk_buff *qdisc_run_end(struct Qdisc *qdisc) { + struct sk_buff *to_free = NULL; + if (qdisc->flags & TCQ_F_NOLOCK) { spin_unlock(&qdisc->seqlock); @@ -225,9 +238,16 @@ static inline void qdisc_run_end(struct Qdisc *qdisc) if (unlikely(test_bit(__QDISC_STATE_MISSED, &qdisc->state))) __netif_schedule(qdisc); - } else { - __clear_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); + return NULL; + } + + if (qdisc->flags & TCQ_F_DEQUEUE_DROPS) { + to_free = qdisc->to_free; + if (to_free) + qdisc->to_free = NULL; } + WRITE_ONCE(qdisc->running, false); + return to_free; } static inline bool qdisc_may_bulk(const struct Qdisc *qdisc) @@ -237,12 +257,7 @@ static inline bool qdisc_may_bulk(const struct Qdisc *qdisc) static inline int qdisc_avail_bulklimit(const struct netdev_queue *txq) { -#ifdef CONFIG_BQL - /* Non-BQL migrated drivers will return 0, too. */ - return dql_avail(&txq->dql); -#else - return 0; -#endif + return netdev_queue_dql_avail(txq); } struct Qdisc_class_ops { @@ -324,7 +339,6 @@ struct Qdisc_ops { struct module *owner; }; - struct tcf_result { union { struct { @@ -332,7 +346,6 @@ struct tcf_result { u32 classid; }; const struct tcf_proto *goto_tp; - }; }; @@ -376,6 +389,10 @@ struct tcf_proto_ops { struct nlattr **tca, struct netlink_ext_ack *extack); void (*tmplt_destroy)(void *tmplt_priv); + void (*tmplt_reoffload)(struct tcf_chain *chain, + bool add, + flow_setup_cb_t *cb, + void *cb_priv); struct tcf_exts * (*get_exts)(const struct tcf_proto *tp, u32 handle); @@ -424,19 +441,24 @@ struct tcf_proto { */ spinlock_t lock; bool deleting; + bool counted; + bool usesw; refcount_t refcnt; struct rcu_head rcu; struct hlist_node destroy_ht_node; }; struct qdisc_skb_cb { - struct { - unsigned int pkt_len; - u16 slave_dev_queue_mapping; - u16 tc_classid; - }; + unsigned int pkt_len; + u16 pkt_segs; + u16 tc_classid; #define QDISC_CB_PRIV_LEN 20 unsigned char data[QDISC_CB_PRIV_LEN]; + + u16 slave_dev_queue_mapping; + u8 post_ct:1; + u8 post_ct_snat:1; + u8 post_ct_dnat:1; }; typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv); @@ -458,6 +480,7 @@ struct tcf_chain { }; struct tcf_block { + struct xarray ports; /* datapath accessible */ /* Lock protects tcf_block and lifetime-management data of chains * attached to the block (refcnt, action_refcnt, explicitly_created). */ @@ -472,6 +495,7 @@ struct tcf_block { struct flow_block flow_block; struct list_head owner_list; bool keep_dst; + atomic_t useswcnt; atomic_t offloadcnt; /* Number of oddloaded filters */ unsigned int nooffloaddevcnt; /* Number of devs unable to do offload */ unsigned int lockeddevcnt; /* Number of devs that require rtnl lock. */ @@ -484,6 +508,8 @@ struct tcf_block { struct mutex proto_destroy_lock; /* Lock for proto_destroy hashtable. */ }; +struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index); + static inline bool lockdep_tcf_chain_is_locked(struct tcf_chain *chain) { return lockdep_is_held(&chain->filter_chain_lock); @@ -587,6 +613,7 @@ static inline void sch_tree_unlock(struct Qdisc *q) extern struct Qdisc noop_qdisc; extern struct Qdisc_ops noop_qdisc_ops; extern struct Qdisc_ops pfifo_fast_ops; +extern const u8 sch_default_prio2band[TC_PRIO_MAX + 1]; extern struct Qdisc_ops mq_qdisc_ops; extern struct Qdisc_ops noqueue_qdisc_ops; extern const struct Qdisc_ops *default_qdisc_ops; @@ -796,6 +823,14 @@ static inline bool qdisc_tx_changing(const struct net_device *dev) return false; } +/* "noqueue" qdisc identified by not having any enqueue, see noqueue_init() */ +static inline bool qdisc_txq_has_no_queue(const struct netdev_queue *txq) +{ + struct Qdisc *qdisc = rcu_access_pointer(txq->qdisc); + + return qdisc->enqueue == NULL; +} + /* Is the device using the noop qdisc on all queues? */ static inline bool qdisc_tx_is_noop(const struct net_device *dev) { @@ -814,6 +849,15 @@ static inline unsigned int qdisc_pkt_len(const struct sk_buff *skb) return qdisc_skb_cb(skb)->pkt_len; } +static inline unsigned int qdisc_pkt_segs(const struct sk_buff *skb) +{ + u32 pkt_segs = qdisc_skb_cb(skb)->pkt_segs; + + DEBUG_NET_WARN_ON_ONCE(pkt_segs != + (skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1)); + return pkt_segs; +} + /* additional qdisc xmit flags (NET_XMIT_MASK in linux/netdevice.h) */ enum net_xmit_qdisc_t { __NET_XMIT_STOLEN = 0x00010000, @@ -840,12 +884,11 @@ static inline void qdisc_calculate_pkt_len(struct sk_buff *skb, static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { - qdisc_calculate_pkt_len(skb, sch); return sch->enqueue(skb, sch, to_free); } static inline void _bstats_update(struct gnet_stats_basic_sync *bstats, - __u64 bytes, __u32 packets) + __u64 bytes, __u64 packets) { u64_stats_update_begin(&bstats->syncp); u64_stats_add(&bstats->bytes, bytes); @@ -856,9 +899,7 @@ static inline void _bstats_update(struct gnet_stats_basic_sync *bstats, static inline void bstats_update(struct gnet_stats_basic_sync *bstats, const struct sk_buff *skb) { - _bstats_update(bstats, - qdisc_pkt_len(skb), - skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1); + _bstats_update(bstats, qdisc_pkt_len(skb), qdisc_pkt_segs(skb)); } static inline void qdisc_bstats_cpu_update(struct Qdisc *sch, @@ -959,14 +1000,6 @@ static inline void qdisc_qstats_qlen_backlog(struct Qdisc *sch, __u32 *qlen, *backlog = qstats.backlog; } -static inline void qdisc_tree_flush_backlog(struct Qdisc *sch) -{ - __u32 qlen, backlog; - - qdisc_qstats_qlen_backlog(sch, &qlen, &backlog); - qdisc_tree_reduce_backlog(sch, qlen, backlog); -} - static inline void qdisc_purge_queue(struct Qdisc *sch) { __u32 qlen, backlog; @@ -1025,6 +1058,26 @@ static inline struct sk_buff *__qdisc_dequeue_head(struct qdisc_skb_head *qh) return skb; } +static inline struct sk_buff *qdisc_dequeue_internal(struct Qdisc *sch, bool direct) +{ + struct sk_buff *skb; + + skb = __skb_dequeue(&sch->gso_skb); + if (skb) { + sch->q.qlen--; + qdisc_qstats_backlog_dec(sch, skb); + return skb; + } + if (direct) { + skb = __qdisc_dequeue_head(&sch->q); + if (skb) + qdisc_qstats_backlog_dec(sch, skb); + return skb; + } else { + return sch->dequeue(sch); + } +} + static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch) { struct sk_buff *skb = __qdisc_dequeue_head(&sch->q); @@ -1037,6 +1090,56 @@ static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch) return skb; } +struct tc_skb_cb { + struct qdisc_skb_cb qdisc_cb; + u32 drop_reason; + + u16 zone; /* Only valid if qdisc_skb_cb(skb)->post_ct = true */ + u16 mru; +}; + +static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb) +{ + struct tc_skb_cb *cb = (struct tc_skb_cb *)skb->cb; + + BUILD_BUG_ON(sizeof(*cb) > sizeof_field(struct sk_buff, cb)); + return cb; +} + +static inline enum skb_drop_reason +tcf_get_drop_reason(const struct sk_buff *skb) +{ + return tc_skb_cb(skb)->drop_reason; +} + +static inline void tcf_set_drop_reason(const struct sk_buff *skb, + enum skb_drop_reason reason) +{ + tc_skb_cb(skb)->drop_reason = reason; +} + +static inline void tcf_kfree_skb_list(struct sk_buff *skb) +{ + while (unlikely(skb)) { + struct sk_buff *next = skb->next; + + prefetch(next); + kfree_skb_reason(skb, tcf_get_drop_reason(skb)); + skb = next; + } +} + +static inline void qdisc_dequeue_drop(struct Qdisc *q, struct sk_buff *skb, + enum skb_drop_reason reason) +{ + DEBUG_NET_WARN_ON_ONCE(!(q->flags & TCQ_F_DEQUEUE_DROPS)); + DEBUG_NET_WARN_ON_ONCE(q->flags & TCQ_F_NOLOCK); + + tcf_set_drop_reason(skb, reason); + skb->next = q->to_free; + q->to_free = skb; +} + /* Instead of calling kfree_skb() while root qdisc lock is held, * queue the skb for future freeing at end of __dev_xmit_skb() */ @@ -1207,6 +1310,14 @@ static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch, return NET_XMIT_DROP; } +static inline int qdisc_drop_reason(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free, + enum skb_drop_reason reason) +{ + tcf_set_drop_reason(skb, reason); + return qdisc_drop(skb, sch, to_free); +} + static inline int qdisc_drop_all(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { diff --git a/include/net/scm.h b/include/net/scm.h index c5bcdf65f55c..c52519669349 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -5,10 +5,12 @@ #include <linux/limits.h> #include <linux/net.h> #include <linux/cred.h> +#include <linux/file.h> #include <linux/security.h> #include <linux/pid.h> #include <linux/nsproxy.h> #include <linux/sched/signal.h> +#include <net/compat.h> /* Well, we should have at least one descriptor open * to accept passed FDs 8) @@ -21,9 +23,20 @@ struct scm_creds { kgid_t gid; }; +#ifdef CONFIG_UNIX +struct unix_edge; +#endif + struct scm_fp_list { short count; + short count_unix; short max; +#ifdef CONFIG_UNIX + bool inflight; + bool dead; + struct list_head vertices; + struct unix_edge *edges; +#endif struct user_struct *user; struct file *fp[SCM_MAX_FD]; }; @@ -56,7 +69,7 @@ static __inline__ void unix_get_peersec_dgram(struct socket *sock, struct scm_co static __inline__ void scm_set_cred(struct scm_cookie *scm, struct pid *pid, kuid_t uid, kgid_t gid) { - scm->pid = get_pid(pid); + scm->pid = get_pid(pid); scm->creds.pid = pid_vnr(pid); scm->creds.uid = uid; scm->creds.gid = gid; @@ -65,7 +78,7 @@ static __inline__ void scm_set_cred(struct scm_cookie *scm, static __inline__ void scm_destroy_cred(struct scm_cookie *scm) { put_pid(scm->pid); - scm->pid = NULL; + scm->pid = NULL; } static __inline__ void scm_destroy(struct scm_cookie *scm) @@ -89,119 +102,17 @@ static __inline__ int scm_send(struct socket *sock, struct msghdr *msg, return __scm_send(sock, msg, scm); } -#ifdef CONFIG_SECURITY_NETWORK -static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm) -{ - char *secdata; - u32 seclen; - int err; - - if (test_bit(SOCK_PASSSEC, &sock->flags)) { - err = security_secid_to_secctx(scm->secid, &secdata, &seclen); - - if (!err) { - put_cmsg(msg, SOL_SOCKET, SCM_SECURITY, seclen, secdata); - security_release_secctx(secdata, seclen); - } - } -} - -static inline bool scm_has_secdata(struct socket *sock) -{ - return test_bit(SOCK_PASSSEC, &sock->flags); -} -#else -static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm) -{ } - -static inline bool scm_has_secdata(struct socket *sock) -{ - return false; -} -#endif /* CONFIG_SECURITY_NETWORK */ - -static __inline__ void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm) -{ - struct file *pidfd_file = NULL; - int pidfd; - - /* - * put_cmsg() doesn't return an error if CMSG is truncated, - * that's why we need to opencode these checks here. - */ - if ((msg->msg_controllen <= sizeof(struct cmsghdr)) || - (msg->msg_controllen - sizeof(struct cmsghdr)) < sizeof(int)) { - msg->msg_flags |= MSG_CTRUNC; - return; - } - - if (!scm->pid) - return; - - pidfd = pidfd_prepare(scm->pid, 0, &pidfd_file); - - if (put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) { - if (pidfd_file) { - put_unused_fd(pidfd); - fput(pidfd_file); - } - - return; - } - - if (pidfd_file) - fd_install(pidfd, pidfd_file); -} - -static inline bool __scm_recv_common(struct socket *sock, struct msghdr *msg, - struct scm_cookie *scm, int flags) -{ - if (!msg->msg_control) { - if (test_bit(SOCK_PASSCRED, &sock->flags) || - test_bit(SOCK_PASSPIDFD, &sock->flags) || - scm->fp || scm_has_secdata(sock)) - msg->msg_flags |= MSG_CTRUNC; - scm_destroy(scm); - return false; - } - - if (test_bit(SOCK_PASSCRED, &sock->flags)) { - struct user_namespace *current_ns = current_user_ns(); - struct ucred ucreds = { - .pid = scm->creds.pid, - .uid = from_kuid_munged(current_ns, scm->creds.uid), - .gid = from_kgid_munged(current_ns, scm->creds.gid), - }; - put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(ucreds), &ucreds); - } - - scm_passec(sock, msg, scm); - - if (scm->fp) - scm_detach_fds(msg, scm); - - return true; -} - -static inline void scm_recv(struct socket *sock, struct msghdr *msg, - struct scm_cookie *scm, int flags) -{ - if (!__scm_recv_common(sock, msg, scm, flags)) - return; - - scm_destroy_cred(scm); -} +void scm_recv(struct socket *sock, struct msghdr *msg, + struct scm_cookie *scm, int flags); +void scm_recv_unix(struct socket *sock, struct msghdr *msg, + struct scm_cookie *scm, int flags); -static inline void scm_recv_unix(struct socket *sock, struct msghdr *msg, - struct scm_cookie *scm, int flags) +static inline int scm_recv_one_fd(struct file *f, int __user *ufd, + unsigned int flags) { - if (!__scm_recv_common(sock, msg, scm, flags)) - return; - - if (test_bit(SOCK_PASSPIDFD, &sock->flags)) - scm_pidfd_recv(msg, scm); - - scm_destroy_cred(scm); + if (!ufd) + return -EFAULT; + return receive_fd(f, ufd, flags); } #endif /* __LINUX_NET_SCM_H */ diff --git a/include/net/sctp/auth.h b/include/net/sctp/auth.h index d4b3b2dcd15b..6f2cd562b1de 100644 --- a/include/net/sctp/auth.h +++ b/include/net/sctp/auth.h @@ -22,16 +22,11 @@ struct sctp_endpoint; struct sctp_association; struct sctp_authkey; struct sctp_hmacalgo; -struct crypto_shash; -/* - * Define a generic struct that will hold all the info - * necessary for an HMAC transform - */ +/* Defines an HMAC algorithm supported by SCTP chunk authentication */ struct sctp_hmac { - __u16 hmac_id; /* one of the above ids */ - char *hmac_name; /* name for loading */ - __u16 hmac_len; /* length of the signature */ + __u16 hmac_id; /* one of SCTP_AUTH_HMAC_ID_* */ + __u16 hmac_len; /* length of the HMAC value in bytes */ }; /* This is generic structure that containst authentication bytes used @@ -77,10 +72,9 @@ struct sctp_shared_key *sctp_auth_get_shkey( int sctp_auth_asoc_copy_shkeys(const struct sctp_endpoint *ep, struct sctp_association *asoc, gfp_t gfp); -int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp); -void sctp_auth_destroy_hmacs(struct crypto_shash *auth_hmacs[]); -struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id); -struct sctp_hmac *sctp_auth_asoc_get_hmac(const struct sctp_association *asoc); +const struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id); +const struct sctp_hmac * +sctp_auth_asoc_get_hmac(const struct sctp_association *asoc); void sctp_auth_asoc_set_default_hmac(struct sctp_association *asoc, struct sctp_hmac_algo_param *hmacs); int sctp_auth_asoc_verify_hmac_id(const struct sctp_association *asoc, diff --git a/include/net/sctp/checksum.h b/include/net/sctp/checksum.h index f514a0aa849e..654d37ec0402 100644 --- a/include/net/sctp/checksum.h +++ b/include/net/sctp/checksum.h @@ -15,8 +15,6 @@ * Dinakaran Joseph * Jon Grimm <jgrimm@us.ibm.com> * Sridhar Samudrala <sri@us.ibm.com> - * - * Rewritten to use libcrc32c by: * Vlad Yasevich <vladislav.yasevich@hp.com> */ @@ -25,42 +23,18 @@ #include <linux/types.h> #include <linux/sctp.h> -#include <linux/crc32c.h> -#include <linux/crc32.h> - -static inline __wsum sctp_csum_update(const void *buff, int len, __wsum sum) -{ - /* This uses the crypto implementation of crc32c, which is either - * implemented w/ hardware support or resolves to __crc32c_le(). - */ - return (__force __wsum)crc32c((__force __u32)sum, buff, len); -} - -static inline __wsum sctp_csum_combine(__wsum csum, __wsum csum2, - int offset, int len) -{ - return (__force __wsum)__crc32c_le_combine((__force __u32)csum, - (__force __u32)csum2, len); -} - -static const struct skb_checksum_ops sctp_csum_ops = { - .update = sctp_csum_update, - .combine = sctp_csum_combine, -}; static inline __le32 sctp_compute_cksum(const struct sk_buff *skb, unsigned int offset) { struct sctphdr *sh = (struct sctphdr *)(skb->data + offset); __le32 old = sh->checksum; - __wsum new; + u32 new; sh->checksum = 0; - new = ~__skb_checksum(skb, offset, skb->len - offset, ~(__wsum)0, - &sctp_csum_ops); + new = ~skb_crc32c(skb, offset, skb->len - offset, ~0); sh->checksum = old; - - return cpu_to_le32((__force __u32)new); + return cpu_to_le32(new); } #endif /* __sctp_checksum_h__ */ diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index 5859e0a16a58..ae3376ba0b99 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -296,9 +296,8 @@ enum { SCTP_MAX_GABS = 16 }; */ #define SCTP_DEFAULT_MINSEGMENT 512 /* MTU size ... if no mtu disc */ -#define SCTP_SECRET_SIZE 32 /* Number of octets in a 256 bits. */ - -#define SCTP_SIGNATURE_SIZE 20 /* size of a SLA-1 signature */ +#define SCTP_COOKIE_KEY_SIZE 32 /* size of cookie HMAC key */ +#define SCTP_COOKIE_MAC_SIZE 32 /* size of HMAC field in cookies */ #define SCTP_COOKIE_MULTIPLE 32 /* Pad out our cookie to make our hash * functions simpler to write. @@ -417,16 +416,12 @@ enum { SCTP_AUTH_HMAC_ID_RESERVED_0, SCTP_AUTH_HMAC_ID_SHA1, SCTP_AUTH_HMAC_ID_RESERVED_2, -#if defined (CONFIG_CRYPTO_SHA256) || defined (CONFIG_CRYPTO_SHA256_MODULE) SCTP_AUTH_HMAC_ID_SHA256, -#endif __SCTP_AUTH_HMAC_MAX }; #define SCTP_AUTH_HMAC_ID_MAX __SCTP_AUTH_HMAC_MAX - 1 #define SCTP_AUTH_NUM_HMACS __SCTP_AUTH_HMAC_MAX -#define SCTP_SHA1_SIG_SIZE 20 -#define SCTP_SHA256_SIG_SIZE 32 /* SCTP-AUTH, Section 3.2 * The chunk types for INIT, INIT-ACK, SHUTDOWN-COMPLETE and AUTH chunks diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index a2310fa995f6..58242b37b47a 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -28,7 +28,7 @@ #define __net_sctp_h__ /* Header Strategy. - * Start getting some control over the header file depencies: + * Start getting some control over the header file dependencies: * includes * constants * structs @@ -85,7 +85,7 @@ void sctp_udp_sock_stop(struct net *net); /* * sctp/socket.c */ -int sctp_inet_connect(struct socket *sock, struct sockaddr *uaddr, +int sctp_inet_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags); int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb); int sctp_inet_listen(struct socket *sock, int backlog); @@ -94,8 +94,7 @@ void sctp_data_ready(struct sock *sk); __poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait); void sctp_sock_rfree(struct sk_buff *skb); -void sctp_copy_sock(struct sock *newsk, struct sock *sk, - struct sctp_association *asoc); + extern struct percpu_counter sctp_sockets_allocated; int sctp_asconf_mgmt(struct sctp_sock *, struct sctp_sockaddr_entry *); struct sk_buff *sctp_skb_recv_datagram(struct sock *, int, int *); @@ -364,8 +363,6 @@ sctp_assoc_to_state(const struct sctp_association *asoc) /* Look up the association by its id. */ struct sctp_association *sctp_id2assoc(struct sock *sk, sctp_assoc_t id); -int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp); - /* A macro to walk a list of skbs. */ #define sctp_skb_for_each(pos, head, tmp) \ skb_queue_walk_safe(head, pos, tmp) @@ -636,7 +633,7 @@ static inline void sctp_transport_pl_reset(struct sctp_transport *t) } } else { if (t->pl.state != SCTP_PL_DISABLED) { - if (del_timer(&t->probe_timer)) + if (timer_delete(&t->probe_timer)) sctp_transport_put(t); t->pl.state = SCTP_PL_DISABLED; } diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 64c42bd56bb2..3bfd261a53cc 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -161,7 +161,6 @@ const struct sctp_sm_table_entry *sctp_sm_lookup_event( enum sctp_event_type event_type, enum sctp_state state, union sctp_subtype event_subtype); -int sctp_chunk_iif(const struct sctp_chunk *); struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *, struct sctp_chunk *, gfp_t gfp); diff --git a/include/net/sctp/stream_sched.h b/include/net/sctp/stream_sched.h index 572d73fdcd5e..77806ef1cb70 100644 --- a/include/net/sctp/stream_sched.h +++ b/include/net/sctp/stream_sched.h @@ -35,10 +35,10 @@ struct sctp_sched_ops { struct sctp_chunk *(*dequeue)(struct sctp_outq *q); /* Called only if the chunk fit the packet */ void (*dequeue_done)(struct sctp_outq *q, struct sctp_chunk *chunk); - /* Sched all chunks already enqueued */ - void (*sched_all)(struct sctp_stream *steam); - /* Unched all chunks already enqueued */ - void (*unsched_all)(struct sctp_stream *steam); + /* Schedule all chunks already enqueued */ + void (*sched_all)(struct sctp_stream *stream); + /* Unschedule all chunks already enqueued */ + void (*unsched_all)(struct sctp_stream *stream); }; int sctp_sched_set_sched(struct sctp_association *asoc, @@ -52,10 +52,10 @@ void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch); void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch); int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp); -struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream); +const struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream); void sctp_sched_ops_register(enum sctp_sched_type sched, - struct sctp_sched_ops *sched_ops); + const struct sctp_sched_ops *sched_ops); void sctp_sched_ops_prio_init(void); void sctp_sched_ops_rr_init(void); void sctp_sched_ops_fc_init(void); diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 5a24d6d8522a..affee44bd38e 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -32,6 +32,7 @@ #ifndef __sctp_structs_h__ #define __sctp_structs_h__ +#include <crypto/sha2.h> #include <linux/ktime.h> #include <linux/generic-radix-tree.h> #include <linux/rhashtable-types.h> @@ -51,9 +52,9 @@ * We should wean ourselves off this. */ union sctp_addr { + struct sockaddr_inet sa; /* Large enough for both address families */ struct sockaddr_in v4; struct sockaddr_in6 v6; - struct sockaddr sa; }; /* Forward declarations for data structures. */ @@ -68,7 +69,6 @@ struct sctp_outq; struct sctp_bind_addr; struct sctp_ulpq; struct sctp_ep_common; -struct crypto_shash; struct sctp_stream; @@ -155,10 +155,6 @@ struct sctp_sock { /* PF_ family specific functions. */ struct sctp_pf *pf; - /* Access to HMAC transform. */ - struct crypto_shash *hmac; - char *sctp_hmac_alg; - /* What is our base endpointer? */ struct sctp_endpoint *ep; @@ -227,14 +223,11 @@ struct sctp_sock { frag_interleave:1, recvrcvinfo:1, recvnxtinfo:1, - data_ready_signalled:1; + data_ready_signalled:1, + cookie_auth_enable:1; atomic_t pd_mode; - /* Fields after this point will be skipped on copies, like on accept - * and peeloff operations - */ - /* Receive to here while partial delivery is in effect. */ struct sk_buff_head pd_lobby; @@ -242,10 +235,7 @@ struct sctp_sock { int do_auto_asconf; }; -static inline struct sctp_sock *sctp_sk(const struct sock *sk) -{ - return (struct sctp_sock *)sk; -} +#define sctp_sk(ptr) container_of_const(ptr, struct sctp_sock, inet.sk) static inline struct sock *sctp_opt2sk(const struct sctp_sock *sp) { @@ -338,7 +328,7 @@ struct sctp_cookie { /* The format of our cookie that we send to our peer. */ struct sctp_signed_cookie { - __u8 signature[SCTP_SECRET_SIZE]; + __u8 mac[SCTP_COOKIE_MAC_SIZE]; __u32 __pad; /* force sctp_cookie alignment to 64 bits */ struct sctp_cookie c; } __packed; @@ -503,9 +493,6 @@ struct sctp_pf { int (*bind_verify) (struct sctp_sock *, union sctp_addr *); int (*send_verify) (struct sctp_sock *, union sctp_addr *); int (*supported_addrs)(const struct sctp_sock *, __be16 *); - struct sock *(*create_accept_sk) (struct sock *sk, - struct sctp_association *asoc, - bool kern); int (*addr_to_user)(struct sctp_sock *sk, union sctp_addr *addr); void (*to_sk_saddr)(union sctp_addr *, struct sock *sk); void (*to_sk_daddr)(union sctp_addr *, struct sock *sk); @@ -524,7 +511,7 @@ struct sctp_datamsg { refcount_t refcnt; /* When is this message no longer interesting to the peer? */ unsigned long expires_at; - /* Did the messenge fail to send? */ + /* Did the message fail to send? */ int send_error; u8 send_failed:1, can_delay:1, /* should this message be Nagle delayed */ @@ -778,6 +765,7 @@ struct sctp_transport { /* Reference counting. */ refcount_t refcnt; + __u32 dead:1, /* RTO-Pending : A flag used to track if one of the DATA * chunks sent to this address is currently being * used to compute a RTT. If this flag is 0, @@ -787,7 +775,7 @@ struct sctp_transport { * calculation completes (i.e. the DATA chunk * is SACK'd) clear this flag. */ - __u32 rto_pending:1, + rto_pending:1, /* * hb_sent : a flag that signals that we have a pending @@ -795,7 +783,7 @@ struct sctp_transport { */ hb_sent:1, - /* Is the Path MTU update pending on this tranport */ + /* Is the Path MTU update pending on this transport */ pmtu_pending:1, dst_pending_confirm:1, /* need to confirm neighbour */ @@ -1081,7 +1069,7 @@ struct sctp_outq { struct list_head out_chunk_list; /* Stream scheduler being used */ - struct sctp_sched_ops *sched; + const struct sctp_sched_ops *sched; unsigned int out_qlen; /* Total length of queued data chunks. */ @@ -1226,7 +1214,7 @@ enum sctp_endpoint_type { }; /* - * A common base class to bridge the implmentation view of a + * A common base class to bridge the implementation view of a * socket (usually listening) endpoint versus an association's * local endpoint. * This common structure is useful for several purposes: @@ -1309,33 +1297,15 @@ struct sctp_endpoint { /* This is really a list of struct sctp_association entries. */ struct list_head asocs; - /* Secret Key: A secret key used by this endpoint to compute - * the MAC. This SHOULD be a cryptographic quality - * random number with a sufficient length. - * Discussion in [RFC1750] can be helpful in - * selection of the key. - */ - __u8 secret_key[SCTP_SECRET_SIZE]; - - /* digest: This is a digest of the sctp cookie. This field is - * only used on the receive path when we try to validate - * that the cookie has not been tampered with. We put - * this here so we pre-allocate this once and can re-use - * on every receive. - */ - __u8 *digest; - + /* Cookie authentication key used by this endpoint */ + struct hmac_sha256_key cookie_auth_key; + /* sendbuf acct. policy. */ __u32 sndbuf_policy; /* rcvbuf acct. policy. */ __u32 rcvbuf_policy; - /* SCTP AUTH: array of the HMACs that will be allocated - * we need this per association so that we don't serialize - */ - struct crypto_shash **auth_hmacs; - /* SCTP-AUTH: hmacs for the endpoint encoded into parameter */ struct sctp_hmac_algo_param *auth_hmacs_list; @@ -1356,7 +1326,7 @@ struct sctp_endpoint { struct rcu_head rcu; }; -/* Recover the outter endpoint structure. */ +/* Recover the outer endpoint structure. */ static inline struct sctp_endpoint *sctp_ep(struct sctp_ep_common *base) { struct sctp_endpoint *ep; @@ -1909,7 +1879,7 @@ struct sctp_association { __u32 rwnd_over; /* Keeps treack of rwnd pressure. This happens when we have - * a window, but not recevie buffer (i.e small packets). This one + * a window, but not receive buffer (i.e small packets). This one * is releases slowly (1 PMTU at a time ). */ __u32 rwnd_press; @@ -1997,7 +1967,7 @@ struct sctp_association { /* ADDIP Section 5.2 Upon reception of an ASCONF Chunk. * - * This is needed to implement itmes E1 - E4 of the updated + * This is needed to implement items E1 - E4 of the updated * spec. Here is the justification: * * Since the peer may bundle multiple ASCONF chunks toward us, @@ -2008,7 +1978,7 @@ struct sctp_association { /* These ASCONF chunks are waiting to be sent. * - * These chunaks can't be pushed to outqueue until receiving + * These chunks can't be pushed to outqueue until receiving * ASCONF_ACK for the previous ASCONF indicated by * addip_last_asconf, so as to guarantee that only one ASCONF * is in flight at any time. @@ -2062,13 +2032,13 @@ struct sctp_association { struct sctp_transport *new_transport; /* SCTP AUTH: list of the endpoint shared keys. These - * keys are provided out of band by the user applicaton + * keys are provided out of band by the user application * and can't change during the lifetime of the association */ struct list_head endpoint_shared_keys; /* SCTP AUTH: - * The current generated assocaition shared key (secret) + * The current generated association shared key (secret) */ struct sctp_auth_bytes *asoc_shared_key; struct sctp_shared_key *shkey; @@ -2124,7 +2094,7 @@ enum { SCTP_ASSOC_EYECATCHER = 0xa550c123, }; -/* Recover the outter association structure. */ +/* Recover the outer association structure. */ static inline struct sctp_association *sctp_assoc(struct sctp_ep_common *base) { struct sctp_association *asoc; @@ -2154,8 +2124,6 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *, const union sctp_addr *address, const gfp_t gfp, const int peer_state); -void sctp_assoc_del_peer(struct sctp_association *asoc, - const union sctp_addr *addr); void sctp_assoc_rm_peer(struct sctp_association *asoc, struct sctp_transport *peer); void sctp_assoc_control_transport(struct sctp_association *asoc, diff --git a/include/net/secure_seq.h b/include/net/secure_seq.h index 21e7fa2a1813..cddebafb9f77 100644 --- a/include/net/secure_seq.h +++ b/include/net/secure_seq.h @@ -16,9 +16,5 @@ u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr, __be16 sport, __be16 dport); u32 secure_tcpv6_ts_off(const struct net *net, const __be32 *saddr, const __be32 *daddr); -u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, - __be16 sport, __be16 dport); -u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, - __be16 sport, __be16 dport); #endif /* _NET_SECURE_SEQ */ diff --git a/include/net/seg6.h b/include/net/seg6.h index af668f17b398..82b3fbbcbb93 100644 --- a/include/net/seg6.h +++ b/include/net/seg6.h @@ -52,10 +52,17 @@ static inline struct seg6_pernet_data *seg6_pernet(struct net *net) extern int seg6_init(void); extern void seg6_exit(void); +#ifdef CONFIG_IPV6_SEG6_LWTUNNEL extern int seg6_iptunnel_init(void); extern void seg6_iptunnel_exit(void); extern int seg6_local_init(void); extern void seg6_local_exit(void); +#else +static inline int seg6_iptunnel_init(void) { return 0; } +static inline void seg6_iptunnel_exit(void) {} +static inline int seg6_local_init(void) { return 0; } +static inline void seg6_local_exit(void) {} +#endif extern bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len, bool reduced); extern struct ipv6_sr_hdr *seg6_get_srh(struct sk_buff *skb, int flags); diff --git a/include/net/seg6_hmac.h b/include/net/seg6_hmac.h index 2b5d2ee5613e..e9f41725933e 100644 --- a/include/net/seg6_hmac.h +++ b/include/net/seg6_hmac.h @@ -9,6 +9,8 @@ #ifndef _NET_SEG6_HMAC_H #define _NET_SEG6_HMAC_H +#include <crypto/sha1.h> +#include <crypto/sha2.h> #include <net/flow.h> #include <net/ip6_fib.h> #include <net/sock.h> @@ -19,7 +21,6 @@ #include <linux/seg6_hmac.h> #include <linux/rhashtable-types.h> -#define SEG6_HMAC_MAX_DIGESTSIZE 160 #define SEG6_HMAC_RING_SIZE 256 struct seg6_hmac_info { @@ -27,16 +28,15 @@ struct seg6_hmac_info { struct rcu_head rcu; u32 hmackeyid; + /* The raw key, kept only so it can be returned back to userspace */ char secret[SEG6_HMAC_SECRET_LEN]; u8 slen; u8 alg_id; -}; - -struct seg6_hmac_algo { - u8 alg_id; - char name[64]; - struct crypto_shash * __percpu *tfms; - struct shash_desc * __percpu *shashs; + /* The prepared key, which the calculations actually use */ + union { + struct hmac_sha1_key sha1; + struct hmac_sha256_key sha256; + } key; }; extern int seg6_hmac_compute(struct seg6_hmac_info *hinfo, @@ -49,9 +49,12 @@ extern int seg6_hmac_info_del(struct net *net, u32 key); extern int seg6_push_hmac(struct net *net, struct in6_addr *saddr, struct ipv6_sr_hdr *srh); extern bool seg6_hmac_validate_skb(struct sk_buff *skb); -extern int seg6_hmac_init(void); -extern void seg6_hmac_exit(void); +#ifdef CONFIG_IPV6_SEG6_HMAC extern int seg6_hmac_net_init(struct net *net); extern void seg6_hmac_net_exit(struct net *net); +#else +static inline int seg6_hmac_net_init(struct net *net) { return 0; } +static inline void seg6_hmac_net_exit(struct net *net) {} +#endif #endif diff --git a/include/net/seg6_local.h b/include/net/seg6_local.h index 3fab9dec2ec4..888c1ce6f527 100644 --- a/include/net/seg6_local.h +++ b/include/net/seg6_local.h @@ -19,6 +19,7 @@ extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, extern bool seg6_bpf_has_valid_srh(struct sk_buff *skb); struct seg6_bpf_srh_state { + local_lock_t bh_lock; struct ipv6_sr_hdr *srh; u16 hdrlen; bool valid; diff --git a/include/net/selftests.h b/include/net/selftests.h index e65e8d230d33..c36e07406ad4 100644 --- a/include/net/selftests.h +++ b/include/net/selftests.h @@ -3,9 +3,48 @@ #define _NET_SELFTESTS #include <linux/ethtool.h> +#include <linux/netdevice.h> + +struct net_packet_attrs { + const unsigned char *src; + const unsigned char *dst; + u32 ip_src; + u32 ip_dst; + bool tcp; + u16 sport; + u16 dport; + int timeout; + int size; + int max_size; + u8 id; + u16 queue_mapping; + bool bad_csum; +}; + +struct net_test_priv { + struct net_packet_attrs *packet; + struct packet_type pt; + struct completion comp; + int double_vlan; + int vlan_id; + int ok; +}; + +struct netsfhdr { + __be32 version; + __be64 magic; + u8 id; +} __packed; + +#define NET_TEST_PKT_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \ + sizeof(struct netsfhdr)) +#define NET_TEST_PKT_MAGIC 0xdeadcafecafedeadULL +#define NET_LB_TIMEOUT msecs_to_jiffies(200) #if IS_ENABLED(CONFIG_NET_SELFTESTS) +struct sk_buff *net_test_get_skb(struct net_device *ndev, u8 id, + struct net_packet_attrs *attr); void net_selftest(struct net_device *ndev, struct ethtool_test *etest, u64 *buf); int net_selftest_get_count(void); @@ -13,6 +52,12 @@ void net_selftest_get_strings(u8 *data); #else +static inline struct sk_buff *net_test_get_skb(struct net_device *ndev, u8 id, + struct net_packet_attrs *attr) +{ + return NULL; +} + static inline void net_selftest(struct net_device *ndev, struct ethtool_test *etest, u64 *buf) { diff --git a/include/net/smc.h b/include/net/smc.h index a002552be29c..bfdc4c41f019 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -15,8 +15,10 @@ #include <linux/spinlock.h> #include <linux/types.h> #include <linux/wait.h> -#include "linux/ism.h" +#include <linux/dibs.h> +struct tcp_sock; +struct inet_request_sock; struct sock; #define SMC_MAX_PNETID_LEN 16 /* Max. length of PNET id */ @@ -26,57 +28,16 @@ struct smc_hashinfo { struct hlist_head ht; }; -int smc_hash_sk(struct sock *sk); -void smc_unhash_sk(struct sock *sk); - /* SMCD/ISM device driver interface */ -struct smcd_dmb { - u64 dmb_tok; - u64 rgid; - u32 dmb_len; - u32 sba_idx; - u32 vlan_valid; - u32 vlan_id; - void *cpu_addr; - dma_addr_t dma_addr; -}; - -#define ISM_EVENT_DMB 0 -#define ISM_EVENT_GID 1 -#define ISM_EVENT_SWR 2 - #define ISM_RESERVED_VLANID 0x1FFF -#define ISM_ERROR 0xFFFF - -struct smcd_dev; -struct ism_client; - -struct smcd_ops { - int (*query_remote_gid)(struct smcd_dev *dev, u64 rgid, u32 vid_valid, - u32 vid); - int (*register_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb, - struct ism_client *client); - int (*unregister_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb); - int (*add_vlan_id)(struct smcd_dev *dev, u64 vlan_id); - int (*del_vlan_id)(struct smcd_dev *dev, u64 vlan_id); - int (*set_vlan_required)(struct smcd_dev *dev); - int (*reset_vlan_required)(struct smcd_dev *dev); - int (*signal_event)(struct smcd_dev *dev, u64 rgid, u32 trigger_irq, - u32 event_code, u64 info); - int (*move_data)(struct smcd_dev *dev, u64 dmb_tok, unsigned int idx, - bool sf, unsigned int offset, void *data, - unsigned int size); - int (*supports_v2)(void); - u8* (*get_system_eid)(void); - u64 (*get_local_gid)(struct smcd_dev *dev); - u16 (*get_chid)(struct smcd_dev *dev); - struct device* (*get_dev)(struct smcd_dev *dev); +struct smcd_gid { + u64 gid; + u64 gid_ext; }; struct smcd_dev { - const struct smcd_ops *ops; - void *priv; + struct dibs_dev *dibs; struct list_head list; spinlock_t lock; struct smc_connection **conn; @@ -91,4 +52,55 @@ struct smcd_dev { u8 going_away : 1; }; +#define SMC_HS_CTRL_NAME_MAX 16 + +enum { + /* ops can be inherit from init_net */ + SMC_HS_CTRL_FLAG_INHERITABLE = 0x1, + + SMC_HS_CTRL_ALL_FLAGS = SMC_HS_CTRL_FLAG_INHERITABLE, +}; + +struct smc_hs_ctrl { + /* private */ + + struct list_head list; + struct module *owner; + + /* public */ + + /* unique name */ + char name[SMC_HS_CTRL_NAME_MAX]; + int flags; + + /* Invoked before computing SMC option for SYN packets. + * We can control whether to set SMC options by returning various value. + * Return 0 to disable SMC, or return any other value to enable it. + */ + int (*syn_option)(struct tcp_sock *tp); + + /* Invoked before Set up SMC options for SYN-ACK packets + * We can control whether to respond SMC options by returning various + * value. Return 0 to disable SMC, or return any other value to enable + * it. + */ + int (*synack_option)(const struct tcp_sock *tp, + struct inet_request_sock *ireq); +}; + +#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) +#define smc_call_hsbpf(init_val, tp, func, ...) ({ \ + typeof(init_val) __ret = (init_val); \ + struct smc_hs_ctrl *ctrl; \ + rcu_read_lock(); \ + ctrl = rcu_dereference(sock_net((struct sock *)(tp))->smc.hs_ctrl); \ + if (ctrl && ctrl->func) \ + __ret = ctrl->func(tp, ##__VA_ARGS__); \ + rcu_read_unlock(); \ + __ret; \ +}) +#else +#define smc_call_hsbpf(init_val, tp, ...) ({ (void)(tp); (init_val); }) +#endif /* CONFIG_SMC_HS_CTRL_BPF */ + #endif /* _SMC_H */ diff --git a/include/net/snmp.h b/include/net/snmp.h index 468a67836e2f..584e70742e9b 100644 --- a/include/net/snmp.h +++ b/include/net/snmp.h @@ -36,11 +36,6 @@ struct snmp_mib { .entry = _entry, \ } -#define SNMP_MIB_SENTINEL { \ - .name = NULL, \ - .entry = 0, \ -} - /* * We use unsigned longs for most mibs but u64 for ipstats. */ @@ -159,7 +154,7 @@ struct linux_tls_mib { #define __SNMP_ADD_STATS64(mib, field, addend) \ do { \ - __typeof__(*mib) *ptr = raw_cpu_ptr(mib); \ + TYPEOF_UNQUAL(*mib) *ptr = raw_cpu_ptr(mib); \ u64_stats_update_begin(&ptr->syncp); \ ptr->mibs[field] += addend; \ u64_stats_update_end(&ptr->syncp); \ @@ -176,8 +171,7 @@ struct linux_tls_mib { #define SNMP_INC_STATS64(mib, field) SNMP_ADD_STATS64(mib, field, 1) #define __SNMP_UPD_PO_STATS64(mib, basefield, addend) \ do { \ - __typeof__(*mib) *ptr; \ - ptr = raw_cpu_ptr((mib)); \ + TYPEOF_UNQUAL(*mib) *ptr = raw_cpu_ptr(mib); \ u64_stats_update_begin(&ptr->syncp); \ ptr->mibs[basefield##PKTS]++; \ ptr->mibs[basefield##OCTETS] += addend; \ diff --git a/include/net/sock.h b/include/net/sock.h index 11d503417591..aafe8bdb2c0f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -76,19 +76,6 @@ * the other protocols. */ -/* Define this to get the SOCK_DBG debugging facility. */ -#define SOCK_DEBUGGING -#ifdef SOCK_DEBUGGING -#define SOCK_DEBUG(sk, msg...) do { if ((sk) && sock_flag((sk), SOCK_DBG)) \ - printk(KERN_DEBUG msg); } while (0) -#else -/* Validate arguments and do nothing */ -static inline __printf(2, 3) -void SOCK_DEBUG(const struct sock *sk, const char *msg, ...) -{ -} -#endif - /* This is the per-socket lock. The spinlock provides a synchronization * between user contexts and software interrupt processing, whereas the * mini-semaphore synchronizes multiple users amongst themselves. @@ -131,6 +118,7 @@ typedef __u64 __bitwise __addrpair; * @skc_reuseport: %SO_REUSEPORT setting * @skc_ipv6only: socket is IPV6 only * @skc_net_refcnt: socket is using net ref counting + * @skc_bypass_prot_mem: bypass the per-protocol memory accounting for skb * @skc_bound_dev_if: bound device index if != 0 * @skc_bind_node: bind hash linkage for various protocol lookup tables * @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol @@ -187,6 +175,7 @@ struct sock_common { unsigned char skc_reuseport:1; unsigned char skc_ipv6only:1; unsigned char skc_net_refcnt:1; + unsigned char skc_bypass_prot_mem:1; int skc_bound_dev_if; union { struct hlist_node skc_bind_node; @@ -262,6 +251,7 @@ struct sk_filter; * @sk_dst_cache: destination cache * @sk_dst_pending_confirm: need to confirm neighbour * @sk_policy: flow policy + * @psp_assoc: PSP association, if socket is PSP-secured * @sk_receive_queue: incoming packets * @sk_wmem_alloc: transmit queue bytes committed * @sk_tsq_flags: TCP Small Queues flags @@ -277,8 +267,6 @@ struct sk_filter; * @sk_pacing_status: Pacing status (requested, handled by sch_fq) * @sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE) * @sk_sndbuf: size of send buffer in bytes - * @__sk_flags_offset: empty field used to determine location of bitfield - * @sk_padding: unused element for alignment * @sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets * @sk_no_check_rx: allow zero checksum in RX packets * @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO) @@ -297,9 +285,11 @@ struct sk_filter; * @sk_err_soft: errors that don't cause failure but are the cause of a * persistent failure not just 'timed out' * @sk_drops: raw/udp drops counter + * @sk_drop_counters: optional pointer to numa_drop_counters * @sk_ack_backlog: current listen backlog * @sk_max_ack_backlog: listen backlog set in listen() * @sk_uid: user id of owner + * @sk_ino: inode number (zero if orphaned) * @sk_prefer_busy_poll: prefer busypolling over softirq processing * @sk_busy_poll_budget: napi processing budget when busypolling * @sk_priority: %SO_PRIORITY setting @@ -315,15 +305,19 @@ struct sk_filter; * @sk_txrehash: enable TX hash rethink * @sk_filter: socket filtering instructions * @sk_timer: sock cleanup timer + * @tcp_retransmit_timer: tcp retransmit timer + * @mptcp_retransmit_timer: mptcp retransmit timer * @sk_stamp: time stamp of last packet received * @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only * @sk_tsflags: SO_TIMESTAMPING flags + * @sk_bpf_cb_flags: used in bpf_setsockopt() * @sk_use_task_frag: allow sk_page_frag() to use current->task_frag. * Sockets that can be used under memory reclaim should * set this to false. * @sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock * for timestamping * @sk_tskey: counter to disambiguate concurrent tstamp requests + * @sk_tx_queue_mapping_jiffies: time in jiffies of last @sk_tx_queue_mapping refresh. * @sk_zckey: counter to order MSG_ZEROCOPY notifications * @sk_socket: Identd and reporting IO signals * @sk_user_data: RPC layer private data. Write-protected by @sk_callback_lock. @@ -336,7 +330,7 @@ struct sk_filter; * @sk_cgrp_data: cgroup data for this cgroup * @sk_memcg: this socket's memory cgroup association * @sk_write_pending: a write to stream socket waits to start - * @sk_wait_pending: number of threads blocked on this socket + * @sk_disconnects: number of disconnect operations performed on this sock * @sk_state_change: callback to indicate change in the state of the sock * @sk_data_ready: callback to indicate there is data to be processed * @sk_write_space: callback to indicate there is bf sending space available @@ -351,8 +345,16 @@ struct sk_filter; * @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME * @sk_txtime_report_errors: set report errors mode for SO_TXTIME * @sk_txtime_unused: unused txtime flags + * @sk_scm_recv_flags: all flags used by scm_recv() + * @sk_scm_credentials: flagged by SO_PASSCRED to recv SCM_CREDENTIALS + * @sk_scm_security: flagged by SO_PASSSEC to recv SCM_SECURITY + * @sk_scm_pidfd: flagged by SO_PASSPIDFD to recv SCM_PIDFD + * @sk_scm_rights: flagged by SO_PASSRIGHTS to recv SCM_RIGHTS + * @sk_scm_unused: unused flags for scm_recv() * @ns_tracker: tracker for netns reference - * @sk_bind2_node: bind node in the bhash2 table + * @sk_user_frags: xarray of pages the user is holding a reference on. + * @sk_owner: reference to the real owner of the socket that calls + * sock_lock_init_class_and_name(). */ struct sock { /* @@ -383,6 +385,7 @@ struct sock { #define sk_reuseport __sk_common.skc_reuseport #define sk_ipv6only __sk_common.skc_ipv6only #define sk_net_refcnt __sk_common.skc_net_refcnt +#define sk_bypass_prot_mem __sk_common.skc_bypass_prot_mem #define sk_bound_dev_if __sk_common.skc_bound_dev_if #define sk_bind_node __sk_common.skc_bind_node #define sk_prot __sk_common.skc_prot @@ -394,14 +397,10 @@ struct sock { #define sk_flags __sk_common.skc_flags #define sk_rxhash __sk_common.skc_rxhash - /* early demux fields */ - struct dst_entry __rcu *sk_rx_dst; - int sk_rx_dst_ifindex; - u32 sk_rx_dst_cookie; + __cacheline_group_begin(sock_write_rx); - socket_lock_t sk_lock; atomic_t sk_drops; - int sk_rcvlowat; + __s32 sk_peek_off; struct sk_buff_head sk_error_queue; struct sk_buff_head sk_receive_queue; /* @@ -418,18 +417,24 @@ struct sock { struct sk_buff *head; struct sk_buff *tail; } sk_backlog; - #define sk_rmem_alloc sk_backlog.rmem_alloc - int sk_forward_alloc; - u32 sk_reserved_mem; + __cacheline_group_end(sock_write_rx); + + __cacheline_group_begin(sock_read_rx); + /* early demux fields */ + struct dst_entry __rcu *sk_rx_dst; + int sk_rx_dst_ifindex; + u32 sk_rx_dst_cookie; + #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int sk_ll_usec; - /* ===== mostly read cache line ===== */ unsigned int sk_napi_id; + u16 sk_busy_poll_budget; + u8 sk_prefer_busy_poll; #endif + u8 sk_userlocks; int sk_rcvbuf; - int sk_wait_pending; struct sk_filter __rcu *sk_filter; union { @@ -438,15 +443,38 @@ struct sock { struct socket_wq *sk_wq_raw; /* public: */ }; + + void (*sk_data_ready)(struct sock *sk); + long sk_rcvtimeo; + int sk_rcvlowat; + __cacheline_group_end(sock_read_rx); + + __cacheline_group_begin(sock_read_rxtx); + int sk_err; + struct socket *sk_socket; +#ifdef CONFIG_MEMCG + struct mem_cgroup *sk_memcg; +#endif #ifdef CONFIG_XFRM struct xfrm_policy __rcu *sk_policy[2]; #endif +#if IS_ENABLED(CONFIG_INET_PSP) + struct psp_assoc __rcu *psp_assoc; +#endif + __cacheline_group_end(sock_read_rxtx); - struct dst_entry __rcu *sk_dst_cache; + __cacheline_group_begin(sock_write_rxtx); + socket_lock_t sk_lock; + u32 sk_reserved_mem; + int sk_forward_alloc; + u32 sk_tsflags; + __cacheline_group_end(sock_write_rxtx); + + __cacheline_group_begin(sock_write_tx); + int sk_write_pending; atomic_t sk_omem_alloc; - int sk_sndbuf; + int sk_err_soft; - /* ===== cache line for TX ===== */ int sk_wmem_queued; refcount_t sk_wmem_alloc; unsigned long sk_tsq_flags; @@ -455,22 +483,44 @@ struct sock { struct rb_root tcp_rtx_queue; }; struct sk_buff_head sk_write_queue; - __s32 sk_peek_off; - int sk_write_pending; - __u32 sk_dst_pending_confirm; - u32 sk_pacing_status; /* see enum sk_pacing */ - long sk_sndtimeo; - struct timer_list sk_timer; - __u32 sk_priority; - __u32 sk_mark; + struct page_frag sk_frag; + union { + struct timer_list sk_timer; + struct timer_list tcp_retransmit_timer; + struct timer_list mptcp_retransmit_timer; + }; unsigned long sk_pacing_rate; /* bytes per second */ + atomic_t sk_zckey; + atomic_t sk_tskey; + unsigned long sk_tx_queue_mapping_jiffies; + __cacheline_group_end(sock_write_tx); + + __cacheline_group_begin(sock_read_tx); + u32 sk_dst_pending_confirm; + u32 sk_pacing_status; /* see enum sk_pacing */ unsigned long sk_max_pacing_rate; - struct page_frag sk_frag; + long sk_sndtimeo; + u32 sk_priority; + u32 sk_mark; + kuid_t sk_uid; + u16 sk_protocol; + u16 sk_type; + struct dst_entry __rcu *sk_dst_cache; netdev_features_t sk_route_caps; - int sk_gso_type; +#ifdef CONFIG_SOCK_VALIDATE_XMIT + struct sk_buff* (*sk_validate_xmit_skb)(struct sock *sk, + struct net_device *dev, + struct sk_buff *skb); +#endif + u16 sk_gso_type; + u16 sk_gso_max_segs; unsigned int sk_gso_max_size; gfp_t sk_allocation; - __u32 sk_txhash; + u32 sk_txhash; + int sk_sndbuf; + u8 sk_pacing_shift; + bool sk_use_task_frag; + __cacheline_group_end(sock_read_tx); /* * Because of non atomicity rules, all @@ -479,72 +529,71 @@ struct sock { u8 sk_gso_disabled : 1, sk_kern_sock : 1, sk_no_check_tx : 1, - sk_no_check_rx : 1, - sk_userlocks : 4; - u8 sk_pacing_shift; - u16 sk_type; - u16 sk_protocol; - u16 sk_gso_max_segs; + sk_no_check_rx : 1; + u8 sk_shutdown; unsigned long sk_lingertime; struct proto *sk_prot_creator; rwlock_t sk_callback_lock; - int sk_err, - sk_err_soft; u32 sk_ack_backlog; u32 sk_max_ack_backlog; - kuid_t sk_uid; - u8 sk_txrehash; -#ifdef CONFIG_NET_RX_BUSY_POLL - u8 sk_prefer_busy_poll; - u16 sk_busy_poll_budget; -#endif + unsigned long sk_ino; spinlock_t sk_peer_lock; int sk_bind_phc; struct pid *sk_peer_pid; const struct cred *sk_peer_cred; - long sk_rcvtimeo; ktime_t sk_stamp; #if BITS_PER_LONG==32 seqlock_t sk_stamp_seq; #endif - atomic_t sk_tskey; - atomic_t sk_zckey; - u32 sk_tsflags; - u8 sk_shutdown; + int sk_disconnects; + union { + u8 sk_txrehash; + u8 sk_scm_recv_flags; + struct { + u8 sk_scm_credentials : 1, + sk_scm_security : 1, + sk_scm_pidfd : 1, + sk_scm_rights : 1, + sk_scm_unused : 4; + }; + }; u8 sk_clockid; u8 sk_txtime_deadline_mode : 1, sk_txtime_report_errors : 1, sk_txtime_unused : 6; - bool sk_use_task_frag; +#define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG)) + u8 sk_bpf_cb_flags; - struct socket *sk_socket; void *sk_user_data; #ifdef CONFIG_SECURITY void *sk_security; #endif struct sock_cgroup_data sk_cgrp_data; - struct mem_cgroup *sk_memcg; void (*sk_state_change)(struct sock *sk); - void (*sk_data_ready)(struct sock *sk); void (*sk_write_space)(struct sock *sk); void (*sk_error_report)(struct sock *sk); int (*sk_backlog_rcv)(struct sock *sk, struct sk_buff *skb); -#ifdef CONFIG_SOCK_VALIDATE_XMIT - struct sk_buff* (*sk_validate_xmit_skb)(struct sock *sk, - struct net_device *dev, - struct sk_buff *skb); -#endif void (*sk_destruct)(struct sock *sk); struct sock_reuseport __rcu *sk_reuseport_cb; #ifdef CONFIG_BPF_SYSCALL struct bpf_local_storage __rcu *sk_bpf_storage; #endif + struct numa_drop_counters *sk_drop_counters; struct rcu_head sk_rcu; netns_tracker ns_tracker; - struct hlist_node sk_bind2_node; + struct xarray sk_user_frags; + +#if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES) + struct module *sk_owner; +#endif +}; + +struct sock_bh_locked { + struct sock *sock; + local_lock_t bh_lock; }; enum sk_pacing { @@ -789,11 +838,9 @@ static inline bool sk_del_node_init(struct sock *sk) { bool rc = __sk_del_node_init(sk); - if (rc) { - /* paranoid for a while -acme */ - WARN_ON(refcount_read(&sk->sk_refcnt) == 1); + if (rc) __sock_put(sk); - } + return rc; } #define sk_del_node_init_rcu(sk) sk_del_node_init(sk) @@ -811,14 +858,25 @@ static inline bool sk_nulls_del_node_init_rcu(struct sock *sk) { bool rc = __sk_nulls_del_node_init_rcu(sk); - if (rc) { - /* paranoid for a while -acme */ - WARN_ON(refcount_read(&sk->sk_refcnt) == 1); + if (rc) __sock_put(sk); - } + return rc; } +static inline bool sk_nulls_replace_node_init_rcu(struct sock *old, + struct sock *new) +{ + if (sk_hashed(old)) { + hlist_nulls_replace_init_rcu(&old->sk_nulls_node, + &new->sk_nulls_node); + __sock_put(old); + return true; + } + + return false; +} + static inline void __sk_add_node(struct sock *sk, struct hlist_head *list) { hlist_add_head(&sk->sk_node, list); @@ -873,16 +931,6 @@ static inline void sk_add_bind_node(struct sock *sk, hlist_add_head(&sk->sk_bind_node, list); } -static inline void __sk_del_bind2_node(struct sock *sk) -{ - __hlist_del(&sk->sk_bind2_node); -} - -static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list) -{ - hlist_add_head(&sk->sk_bind2_node, list); -} - #define sk_for_each(__sk, list) \ hlist_for_each_entry(__sk, list, sk_node) #define sk_for_each_rcu(__sk, list) \ @@ -900,8 +948,8 @@ static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list) hlist_for_each_entry_safe(__sk, tmp, list, sk_node) #define sk_for_each_bound(__sk, list) \ hlist_for_each_entry(__sk, list, sk_bind_node) -#define sk_for_each_bound_bhash2(__sk, list) \ - hlist_for_each_entry(__sk, list, sk_bind2_node) +#define sk_for_each_bound_safe(__sk, tmp, list) \ + hlist_for_each_entry_safe(__sk, tmp, list, sk_bind_node) /** * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset @@ -959,9 +1007,17 @@ enum sock_flags { SOCK_XDP, /* XDP is attached */ SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */ SOCK_RCVMARK, /* Receive SO_MARK ancillary data with packet */ + SOCK_RCVPRIORITY, /* Receive SO_PRIORITY ancillary data with packet */ + SOCK_TIMESTAMPING_ANY, /* Copy of sk_tsflags & TSFLAGS_ANY */ }; #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) +/* + * The highest bit of sk_tsflags is reserved for kernel-internal + * SOCKCM_FLAG_TS_OPT_ID. There is a check in core/sock.c to control that + * SOF_TIMESTAMPING* values do not reach this reserved area + */ +#define SOCKCM_FLAG_TS_OPT_ID BIT(31) static inline void sock_copy_flags(struct sock *nsk, const struct sock *osk) { @@ -1053,6 +1109,12 @@ static inline void sk_wmem_queued_add(struct sock *sk, int val) WRITE_ONCE(sk->sk_wmem_queued, sk->sk_wmem_queued + val); } +static inline void sk_forward_alloc_add(struct sock *sk, int val) +{ + /* Paired with lockless reads of sk->sk_forward_alloc */ + WRITE_ONCE(sk->sk_forward_alloc, sk->sk_forward_alloc + val); +} + void sk_stream_write_space(struct sock *sk); /* OOB backlog add */ @@ -1126,41 +1188,6 @@ static inline void sk_incoming_cpu_update(struct sock *sk) WRITE_ONCE(sk->sk_incoming_cpu, cpu); } -static inline void sock_rps_record_flow_hash(__u32 hash) -{ -#ifdef CONFIG_RPS - struct rps_sock_flow_table *sock_flow_table; - - rcu_read_lock(); - sock_flow_table = rcu_dereference(rps_sock_flow_table); - rps_record_sock_flow(sock_flow_table, hash); - rcu_read_unlock(); -#endif -} - -static inline void sock_rps_record_flow(const struct sock *sk) -{ -#ifdef CONFIG_RPS - if (static_branch_unlikely(&rfs_needed)) { - /* Reading sk->sk_rxhash might incur an expensive cache line - * miss. - * - * TCP_ESTABLISHED does cover almost all states where RFS - * might be useful, and is cheaper [1] than testing : - * IPv4: inet_sk(sk)->inet_daddr - * IPv6: ipv6_addr_any(&sk->sk_v6_daddr) - * OR an additional socket flag - * [1] : sk_state and sk_prot are in the same cache line. - */ - if (sk->sk_state == TCP_ESTABLISHED) { - /* This READ_ONCE() is paired with the WRITE_ONCE() - * from sock_rps_save_rxhash() and sock_rps_reset_rxhash(). - */ - sock_rps_record_flow_hash(READ_ONCE(sk->sk_rxhash)); - } - } -#endif -} static inline void sock_rps_save_rxhash(struct sock *sk, const struct sk_buff *skb) @@ -1183,8 +1210,7 @@ static inline void sock_rps_reset_rxhash(struct sock *sk) } #define sk_wait_event(__sk, __timeo, __condition, __wait) \ - ({ int __rc; \ - __sk->sk_wait_pending++; \ + ({ int __rc, __dis = __sk->sk_disconnects; \ release_sock(__sk); \ __rc = __condition; \ if (!__rc) { \ @@ -1194,8 +1220,7 @@ static inline void sock_rps_reset_rxhash(struct sock *sk) } \ sched_annotate_sleep(); \ lock_sock(__sk); \ - __sk->sk_wait_pending--; \ - __rc = __condition; \ + __rc = __dis == __sk->sk_disconnects ? __condition : -EPIPE; \ __rc; \ }) @@ -1240,6 +1265,13 @@ static inline void sk_prot_clear_nulls(struct sock *sk, int size) size - offsetof(struct sock, sk_node.pprev)); } +struct proto_accept_arg { + int flags; + int err; + int is_empty; + bool kern; +}; + /* Networking protocol blocks we attach to sockets. * socket layer -> transport layer interface */ @@ -1247,15 +1279,15 @@ struct proto { void (*close)(struct sock *sk, long timeout); int (*pre_connect)(struct sock *sk, - struct sockaddr *uaddr, + struct sockaddr_unsized *uaddr, int addr_len); int (*connect)(struct sock *sk, - struct sockaddr *uaddr, + struct sockaddr_unsized *uaddr, int addr_len); int (*disconnect)(struct sock *sk, int flags); - struct sock * (*accept)(struct sock *sk, int flags, int *err, - bool kern); + struct sock * (*accept)(struct sock *sk, + struct proto_accept_arg *arg); int (*ioctl)(struct sock *sk, int cmd, int *karg); @@ -1279,9 +1311,9 @@ struct proto { size_t len, int flags, int *addr_len); void (*splice_eof)(struct socket *sock); int (*bind)(struct sock *sk, - struct sockaddr *addr, int addr_len); + struct sockaddr_unsized *addr, int addr_len); int (*bind_add)(struct sock *sk, - struct sockaddr *addr, int addr_len); + struct sockaddr_unsized *addr, int addr_len); int (*backlog_rcv) (struct sock *sk, struct sk_buff *skb); @@ -1307,10 +1339,6 @@ struct proto { unsigned int inuse_idx; #endif -#if IS_ENABLED(CONFIG_MPTCP) - int (*forward_alloc_get)(const struct sock *sk); -#endif - bool (*stream_memory_free)(const struct sock *sk, int wake); bool (*sock_is_readable)(struct sock *sk); /* Memory pressure */ @@ -1345,8 +1373,6 @@ struct proto { unsigned int useroffset; /* Usercopy region offset */ unsigned int usersize; /* Usercopy region size */ - unsigned int __percpu *orphan_count; - struct request_sock_ops *rsk_prot; struct timewait_sock_ops *twsk_prot; @@ -1371,15 +1397,6 @@ int sock_load_diag_module(int family, int protocol); INDIRECT_CALLABLE_DECLARE(bool tcp_stream_memory_free(const struct sock *sk, int wake)); -static inline int sk_forward_alloc_get(const struct sock *sk) -{ -#if IS_ENABLED(CONFIG_MPTCP) - if (sk->sk_prot->forward_alloc_get) - return sk->sk_prot->forward_alloc_get(sk); -#endif - return sk->sk_forward_alloc; -} - static inline bool __sk_stream_memory_free(const struct sock *sk, int wake) { if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf)) @@ -1417,72 +1434,6 @@ static inline int sk_under_cgroup_hierarchy(struct sock *sk, #endif } -static inline bool sk_has_memory_pressure(const struct sock *sk) -{ - return sk->sk_prot->memory_pressure != NULL; -} - -static inline bool sk_under_global_memory_pressure(const struct sock *sk) -{ - return sk->sk_prot->memory_pressure && - !!READ_ONCE(*sk->sk_prot->memory_pressure); -} - -static inline bool sk_under_memory_pressure(const struct sock *sk) -{ - if (!sk->sk_prot->memory_pressure) - return false; - - if (mem_cgroup_sockets_enabled && sk->sk_memcg && - mem_cgroup_under_socket_pressure(sk->sk_memcg)) - return true; - - return !!READ_ONCE(*sk->sk_prot->memory_pressure); -} - -static inline long -proto_memory_allocated(const struct proto *prot) -{ - return max(0L, atomic_long_read(prot->memory_allocated)); -} - -static inline long -sk_memory_allocated(const struct sock *sk) -{ - return proto_memory_allocated(sk->sk_prot); -} - -/* 1 MB per cpu, in page units */ -#define SK_MEMORY_PCPU_RESERVE (1 << (20 - PAGE_SHIFT)) - -static inline void -sk_memory_allocated_add(struct sock *sk, int amt) -{ - int local_reserve; - - preempt_disable(); - local_reserve = __this_cpu_add_return(*sk->sk_prot->per_cpu_fw_alloc, amt); - if (local_reserve >= SK_MEMORY_PCPU_RESERVE) { - __this_cpu_sub(*sk->sk_prot->per_cpu_fw_alloc, local_reserve); - atomic_long_add(local_reserve, sk->sk_prot->memory_allocated); - } - preempt_enable(); -} - -static inline void -sk_memory_allocated_sub(struct sock *sk, int amt) -{ - int local_reserve; - - preempt_disable(); - local_reserve = __this_cpu_sub_return(*sk->sk_prot->per_cpu_fw_alloc, amt); - if (local_reserve <= -SK_MEMORY_PCPU_RESERVE) { - __this_cpu_sub(*sk->sk_prot->per_cpu_fw_alloc, local_reserve); - atomic_long_add(local_reserve, sk->sk_prot->memory_allocated); - } - preempt_enable(); -} - #define SK_ALLOC_PERCPU_COUNTER_BATCH 16 static inline void sk_sockets_allocated_dec(struct sock *sk) @@ -1509,15 +1460,6 @@ proto_sockets_allocated_sum_positive(struct proto *prot) return percpu_counter_sum_positive(prot->sockets_allocated); } -static inline bool -proto_memory_pressure(struct proto *prot) -{ - if (!prot->memory_pressure) - return false; - return !!READ_ONCE(*prot->memory_pressure); -} - - #ifdef CONFIG_PROC_FS #define PROTO_INUSE_NR 64 /* should be enough for the first time */ struct prot_inuse { @@ -1571,6 +1513,10 @@ static inline int __sk_prot_rehash(struct sock *sk) #define SOCK_BINDADDR_LOCK 4 #define SOCK_BINDPORT_LOCK 8 +/** + * define SOCK_CONNECT_BIND - &sock->sk_userlocks flag for auto-bind at connect() time + */ +#define SOCK_CONNECT_BIND 16 struct socket_alloc { struct socket socket; @@ -1626,7 +1572,7 @@ static inline bool sk_wmem_schedule(struct sock *sk, int size) } static inline bool -sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size) +__sk_rmem_schedule(struct sock *sk, int size, bool pfmemalloc) { int delta; @@ -1634,7 +1580,13 @@ sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size) return true; delta = size - sk->sk_forward_alloc; return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_RECV) || - skb_pfmemalloc(skb); + pfmemalloc; +} + +static inline bool +sk_rmem_schedule(struct sock *sk, const struct sk_buff *skb, int size) +{ + return __sk_rmem_schedule(sk, size, skb_pfmemalloc(skb)); } static inline int sk_unused_reserved_mem(const struct sock *sk) @@ -1673,17 +1625,48 @@ static inline void sk_mem_charge(struct sock *sk, int size) { if (!sk_has_account(sk)) return; - sk->sk_forward_alloc -= size; + sk_forward_alloc_add(sk, -size); } static inline void sk_mem_uncharge(struct sock *sk, int size) { if (!sk_has_account(sk)) return; - sk->sk_forward_alloc += size; + sk_forward_alloc_add(sk, size); sk_mem_reclaim(sk); } +void __sk_charge(struct sock *sk, gfp_t gfp); + +#if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES) +static inline void sk_owner_set(struct sock *sk, struct module *owner) +{ + __module_get(owner); + sk->sk_owner = owner; +} + +static inline void sk_owner_clear(struct sock *sk) +{ + sk->sk_owner = NULL; +} + +static inline void sk_owner_put(struct sock *sk) +{ + module_put(sk->sk_owner); +} +#else +static inline void sk_owner_set(struct sock *sk, struct module *owner) +{ +} + +static inline void sk_owner_clear(struct sock *sk) +{ +} + +static inline void sk_owner_put(struct sock *sk) +{ +} +#endif /* * Macro so as to not evaluate some arguments when * lockdep is not enabled. @@ -1693,13 +1676,14 @@ static inline void sk_mem_uncharge(struct sock *sk, int size) */ #define sock_lock_init_class_and_name(sk, sname, skey, name, key) \ do { \ + sk_owner_set(sk, THIS_MODULE); \ sk->sk_lock.owned = 0; \ init_waitqueue_head(&sk->sk_lock.wq); \ spin_lock_init(&(sk)->sk_lock.slock); \ debug_check_no_locks_freed((void *)&(sk)->sk_lock, \ - sizeof((sk)->sk_lock)); \ + sizeof((sk)->sk_lock)); \ lockdep_set_class_and_name(&(sk)->sk_lock.slock, \ - (skey), (sname)); \ + (skey), (sname)); \ lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0); \ } while (0) @@ -1733,7 +1717,7 @@ bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock); * lock_sock_fast - fast version of lock_sock * @sk: socket * - * This version should be used for very small section, where process wont block + * This version should be used for very small section, where process won't block * return false if fast path is taken: * * sk_lock.slock locked, owned = 0, BH disabled @@ -1804,6 +1788,13 @@ static inline void sock_owned_by_me(const struct sock *sk) #endif } +static inline void sock_not_owned_by_me(const struct sock *sk) +{ +#ifdef CONFIG_LOCKDEP + WARN_ON_ONCE(lockdep_sock_is_held(sk) && debug_locks); +#endif +} + static inline bool sock_owned_by_user(const struct sock *sk) { sock_owned_by_me(sk); @@ -1817,12 +1808,11 @@ static inline bool sock_owned_by_user_nocheck(const struct sock *sk) static inline void sock_release_ownership(struct sock *sk) { - if (sock_owned_by_user_nocheck(sk)) { - sk->sk_lock.owned = 0; + DEBUG_NET_WARN_ON_ONCE(!sock_owned_by_user_nocheck(sk)); + sk->sk_lock.owned = 0; - /* The sk_lock has mutex_unlock() semantics: */ - mutex_release(&sk->sk_lock.dep_map, _RET_IP_); - } + /* The sk_lock has mutex_unlock() semantics: */ + mutex_release(&sk->sk_lock.dep_map, _RET_IP_); } /* no reclassification while locks are held */ @@ -1837,9 +1827,14 @@ static inline bool sock_allow_reclassification(const struct sock *csk) struct sock *sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern); void sk_free(struct sock *sk); +void sk_net_refcnt_upgrade(struct sock *sk); void sk_destruct(struct sock *sk); -struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority); -void sk_free_unlock_clone(struct sock *sk); +struct sock *sk_clone(const struct sock *sk, const gfp_t priority, bool lock); + +static inline struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) +{ + return sk_clone(sk, priority, true); +} struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority); @@ -1853,6 +1848,15 @@ void sock_efree(struct sk_buff *skb); #ifdef CONFIG_INET void sock_edemux(struct sk_buff *skb); void sock_pfree(struct sk_buff *skb); + +static inline void skb_set_owner_edemux(struct sk_buff *skb, struct sock *sk) +{ + skb_orphan(skb); + if (refcount_inc_not_zero(&sk->sk_refcnt)) { + skb->sk = sk; + skb->destructor = sock_edemux; + } +} #else #define sock_edemux sock_efree #endif @@ -1861,11 +1865,13 @@ int sk_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int sock_setsockopt(struct socket *sock, int level, int op, sockptr_t optval, unsigned int optlen); +int do_sock_setsockopt(struct socket *sock, bool compat, int level, + int optname, sockptr_t optval, int optlen); +int do_sock_getsockopt(struct socket *sock, bool compat, int level, + int optname, sockptr_t optval, sockptr_t optlen); int sk_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen); -int sock_getsockopt(struct socket *sock, int level, int op, - char __user *optval, int __user *optlen); int sock_gettstamp(struct socket *sock, void __user *userstamp, bool timeval, bool time32); struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, @@ -1880,6 +1886,8 @@ static inline struct sk_buff *sock_alloc_send_skb(struct sock *sk, } void *sock_kmalloc(struct sock *sk, int size, gfp_t priority); +void *sock_kmemdup(struct sock *sk, const void *src, + int size, gfp_t priority); void sock_kfree_s(struct sock *sk, void *mem, int size); void sock_kzfree_s(struct sock *sk, void *mem, int size); void sk_send_sigurg(struct sock *sk); @@ -1895,12 +1903,19 @@ struct sockcm_cookie { u64 transmit_time; u32 mark; u32 tsflags; + u32 ts_opt_id; + u32 priority; + u32 dmabuf_id; }; static inline void sockcm_init(struct sockcm_cookie *sockc, const struct sock *sk) { - *sockc = (struct sockcm_cookie) { .tsflags = sk->sk_tsflags }; + *sockc = (struct sockcm_cookie) { + .mark = READ_ONCE(sk->sk_mark), + .tsflags = READ_ONCE(sk->sk_tsflags), + .priority = READ_ONCE(sk->sk_priority), + }; } int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, @@ -1912,10 +1927,10 @@ int sock_cmsg_send(struct sock *sk, struct msghdr *msg, * Functions to fill in entries in struct proto_ops when a protocol * does not implement a particular function. */ -int sock_no_bind(struct socket *, struct sockaddr *, int); -int sock_no_connect(struct socket *, struct sockaddr *, int, int); +int sock_no_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len); +int sock_no_connect(struct socket *sock, struct sockaddr_unsized *saddr, int len, int flags); int sock_no_socketpair(struct socket *, struct socket *); -int sock_no_accept(struct socket *, struct socket *, int, bool); +int sock_no_accept(struct socket *, struct socket *, struct proto_accept_arg *); int sock_no_getname(struct socket *, struct sockaddr *, int); int sock_no_ioctl(struct socket *, unsigned int, unsigned long); int sock_no_listen(struct socket *, int); @@ -2000,23 +2015,31 @@ static inline void sk_tx_queue_set(struct sock *sk, int tx_queue) /* sk_tx_queue_mapping accept only upto a 16-bit value */ if (WARN_ON_ONCE((unsigned short)tx_queue >= USHRT_MAX)) return; - sk->sk_tx_queue_mapping = tx_queue; + /* Paired with READ_ONCE() in sk_tx_queue_get() and + * other WRITE_ONCE() because socket lock might be not held. + */ + if (READ_ONCE(sk->sk_tx_queue_mapping) != tx_queue) { + WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue); + WRITE_ONCE(sk->sk_tx_queue_mapping_jiffies, jiffies); + return; + } + + /* Refresh sk_tx_queue_mapping_jiffies if too old. */ + if (time_is_before_jiffies(READ_ONCE(sk->sk_tx_queue_mapping_jiffies) + HZ)) + WRITE_ONCE(sk->sk_tx_queue_mapping_jiffies, jiffies); } #define NO_QUEUE_MAPPING USHRT_MAX static inline void sk_tx_queue_clear(struct sock *sk) { - sk->sk_tx_queue_mapping = NO_QUEUE_MAPPING; + /* Paired with READ_ONCE() in sk_tx_queue_get() and + * other WRITE_ONCE() because socket lock might be not held. + */ + WRITE_ONCE(sk->sk_tx_queue_mapping, NO_QUEUE_MAPPING); } -static inline int sk_tx_queue_get(const struct sock *sk) -{ - if (sk && sk->sk_tx_queue_mapping != NO_QUEUE_MAPPING) - return sk->sk_tx_queue_mapping; - - return -1; -} +int sk_tx_queue_get(const struct sock *sk); static inline void __sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb, @@ -2067,6 +2090,13 @@ static inline int sk_rx_queue_get(const struct sock *sk) static inline void sk_set_socket(struct sock *sk, struct socket *sock) { sk->sk_socket = sock; + if (sock) { + WRITE_ONCE(sk->sk_uid, SOCK_INODE(sock)->i_uid); + WRITE_ONCE(sk->sk_ino, SOCK_INODE(sock)->i_ino); + } else { + /* Note: sk_uid is unchanged. */ + WRITE_ONCE(sk->sk_ino, 0); + } } static inline wait_queue_head_t *sk_sleep(struct sock *sk) @@ -2097,18 +2127,25 @@ static inline void sock_graft(struct sock *sk, struct socket *parent) rcu_assign_pointer(sk->sk_wq, &parent->wq); parent->sk = sk; sk_set_socket(sk, parent); - sk->sk_uid = SOCK_INODE(parent)->i_uid; security_sock_graft(sk, parent); write_unlock_bh(&sk->sk_callback_lock); } -kuid_t sock_i_uid(struct sock *sk); -unsigned long __sock_i_ino(struct sock *sk); -unsigned long sock_i_ino(struct sock *sk); +static inline unsigned long sock_i_ino(const struct sock *sk) +{ + /* Paired with WRITE_ONCE() in sock_graft() and sock_orphan() */ + return READ_ONCE(sk->sk_ino); +} + +static inline kuid_t sk_uid(const struct sock *sk) +{ + /* Paired with WRITE_ONCE() in sockfs_setattr() */ + return READ_ONCE(sk->sk_uid); +} static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk) { - return sk ? sk->sk_uid : make_kuid(net->user_ns, 0); + return sk ? sk_uid(sk) : make_kuid(net->user_ns, 0); } static inline u32 net_tx_rndhash(void) @@ -2134,14 +2171,14 @@ static inline bool sk_rethink_txhash(struct sock *sk) } static inline struct dst_entry * -__sk_dst_get(struct sock *sk) +__sk_dst_get(const struct sock *sk) { return rcu_dereference_check(sk->sk_dst_cache, lockdep_sock_is_held(sk)); } static inline struct dst_entry * -sk_dst_get(struct sock *sk) +sk_dst_get(const struct sock *sk) { struct dst_entry *dst; @@ -2155,17 +2192,10 @@ sk_dst_get(struct sock *sk) static inline void __dst_negative_advice(struct sock *sk) { - struct dst_entry *ndst, *dst = __sk_dst_get(sk); + struct dst_entry *dst = __sk_dst_get(sk); - if (dst && dst->ops->negative_advice) { - ndst = dst->ops->negative_advice(dst); - - if (ndst != dst) { - rcu_assign_pointer(sk->sk_dst_cache, ndst); - sk_tx_queue_clear(sk); - sk->sk_dst_pending_confirm = 0; - } - } + if (dst && dst->ops->negative_advice) + dst->ops->negative_advice(sk, dst); } static inline void dst_negative_advice(struct sock *sk) @@ -2180,7 +2210,7 @@ __sk_dst_set(struct sock *sk, struct dst_entry *dst) struct dst_entry *old_dst; sk_tx_queue_clear(sk); - sk->sk_dst_pending_confirm = 0; + WRITE_ONCE(sk->sk_dst_pending_confirm, 0); old_dst = rcu_dereference_protected(sk->sk_dst_cache, lockdep_sock_is_held(sk)); rcu_assign_pointer(sk->sk_dst_cache, dst); @@ -2193,8 +2223,8 @@ sk_dst_set(struct sock *sk, struct dst_entry *dst) struct dst_entry *old_dst; sk_tx_queue_clear(sk); - sk->sk_dst_pending_confirm = 0; - old_dst = xchg((__force struct dst_entry **)&sk->sk_dst_cache, dst); + WRITE_ONCE(sk->sk_dst_pending_confirm, 0); + old_dst = unrcu_pointer(xchg(&sk->sk_dst_cache, RCU_INITIALIZER(dst))); dst_release(old_dst); } @@ -2231,7 +2261,7 @@ static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n) } } -bool sk_mc_loop(struct sock *sk); +bool sk_mc_loop(const struct sock *sk); static inline bool sk_can_gso(const struct sock *sk) { @@ -2295,6 +2325,7 @@ static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *fro return 0; } +#define SK_WMEM_ALLOC_BIAS 1 /** * sk_wmem_alloc_get - returns write allocations * @sk: socket @@ -2303,7 +2334,7 @@ static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *fro */ static inline int sk_wmem_alloc_get(const struct sock *sk) { - return refcount_read(&sk->sk_wmem_alloc) - 1; + return refcount_read(&sk->sk_wmem_alloc) - SK_WMEM_ALLOC_BIAS; } /** @@ -2365,7 +2396,7 @@ static inline bool skwq_has_sleeper(struct socket_wq *wq) } /** - * sock_poll_wait - place memory barrier behind the poll_wait call. + * sock_poll_wait - wrapper for the poll_wait call. * @filp: file * @sock: socket to wait on * @p: poll_table @@ -2375,15 +2406,12 @@ static inline bool skwq_has_sleeper(struct socket_wq *wq) static inline void sock_poll_wait(struct file *filp, struct socket *sock, poll_table *p) { - if (!poll_does_not_wait(p)) { - poll_wait(filp, &sock->wq.wait, p); - /* We need to be sure we are in sync with the - * socket flags modification. - * - * This memory barrier is paired in the wq_has_sleeper. - */ - smp_mb(); - } + /* Provides a barrier we need to be sure we are in sync + * with the socket flags modification. + * + * This memory barrier is paired in the wq_has_sleeper. + */ + poll_wait(filp, &sock->wq.wait, p); } static inline void skb_set_hash_from_sk(struct sk_buff *skb, struct sock *sk) @@ -2536,6 +2564,12 @@ static inline void sk_wake_async(const struct sock *sk, int how, int band) } } +static inline void sk_wake_async_rcu(const struct sock *sk, int how, int band) +{ + if (unlikely(sock_flag(sk, SOCK_FASYNC))) + sock_wake_async(rcu_dereference(sk->sk_wq), how, band); +} + /* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might * need sizeof(sk_buff) + MTU + padding, unless net driver perform copybreak. * Note: for send buffers, TCP works better if we can build two skbs at @@ -2585,12 +2619,16 @@ static inline struct page_frag *sk_page_frag(struct sock *sk) bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag); +static inline bool __sock_writeable(const struct sock *sk, int wmem_alloc) +{ + return wmem_alloc < (READ_ONCE(sk->sk_sndbuf) >> 1); +} /* * Default write policy as shown to user space via poll/select/SIGIO */ static inline bool sock_writeable(const struct sock *sk) { - return refcount_read(&sk->sk_wmem_alloc) < (READ_ONCE(sk->sk_sndbuf) >> 1); + return __sock_writeable(sk, refcount_read(&sk->sk_wmem_alloc)); } static inline gfp_t gfp_any(void) @@ -2603,14 +2641,62 @@ static inline gfp_t gfp_memcg_charge(void) return in_softirq() ? GFP_ATOMIC : GFP_KERNEL; } +#ifdef CONFIG_MEMCG +static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk) +{ + return sk->sk_memcg; +} + +static inline bool mem_cgroup_sk_enabled(const struct sock *sk) +{ + return mem_cgroup_sockets_enabled && mem_cgroup_from_sk(sk); +} + +static inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk) +{ + struct mem_cgroup *memcg = mem_cgroup_from_sk(sk); + +#ifdef CONFIG_MEMCG_V1 + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return !!memcg->tcpmem_pressure; +#endif /* CONFIG_MEMCG_V1 */ + + do { + if (time_before64(get_jiffies_64(), + mem_cgroup_get_socket_pressure(memcg))) { + memcg_memory_event(mem_cgroup_from_sk(sk), + MEMCG_SOCK_THROTTLED); + return true; + } + } while ((memcg = parent_mem_cgroup(memcg))); + + return false; +} +#else +static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk) +{ + return NULL; +} + +static inline bool mem_cgroup_sk_enabled(const struct sock *sk) +{ + return false; +} + +static inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk) +{ + return false; +} +#endif + static inline long sock_rcvtimeo(const struct sock *sk, bool noblock) { - return noblock ? 0 : sk->sk_rcvtimeo; + return noblock ? 0 : READ_ONCE(sk->sk_rcvtimeo); } static inline long sock_sndtimeo(const struct sock *sk, bool noblock) { - return noblock ? 0 : sk->sk_sndtimeo; + return noblock ? 0 : READ_ONCE(sk->sk_sndtimeo); } static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len) @@ -2634,10 +2720,10 @@ struct sock_skb_cb { /* Store sock_skb_cb at the end of skb->cb[] so protocol families * using skb->cb[] would keep using it directly and utilize its - * alignement guarantee. + * alignment guarantee. */ -#define SOCK_SKB_CB_OFFSET ((sizeof_field(struct sk_buff, cb) - \ - sizeof(struct sock_skb_cb))) +#define SOCK_SKB_CB_OFFSET (sizeof_field(struct sk_buff, cb) - \ + sizeof(struct sock_skb_cb)) #define SOCK_SKB_CB(__skb) ((struct sock_skb_cb *)((__skb)->cb + \ SOCK_SKB_CB_OFFSET)) @@ -2645,18 +2731,53 @@ struct sock_skb_cb { #define sock_skb_cb_check_size(size) \ BUILD_BUG_ON((size) > SOCK_SKB_CB_OFFSET) +static inline void sk_drops_add(struct sock *sk, int segs) +{ + struct numa_drop_counters *ndc = sk->sk_drop_counters; + + if (ndc) + numa_drop_add(ndc, segs); + else + atomic_add(segs, &sk->sk_drops); +} + +static inline void sk_drops_inc(struct sock *sk) +{ + sk_drops_add(sk, 1); +} + +static inline int sk_drops_read(const struct sock *sk) +{ + const struct numa_drop_counters *ndc = sk->sk_drop_counters; + + if (ndc) { + DEBUG_NET_WARN_ON_ONCE(atomic_read(&sk->sk_drops)); + return numa_drop_read(ndc); + } + return atomic_read(&sk->sk_drops); +} + +static inline void sk_drops_reset(struct sock *sk) +{ + struct numa_drop_counters *ndc = sk->sk_drop_counters; + + if (ndc) + numa_drop_reset(ndc); + atomic_set(&sk->sk_drops, 0); +} + static inline void sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb) { SOCK_SKB_CB(skb)->dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ? - atomic_read(&sk->sk_drops) : 0; + sk_drops_read(sk) : 0; } -static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb) +static inline void sk_drops_skbadd(struct sock *sk, const struct sk_buff *skb) { int segs = max_t(u16, 1, skb_shinfo(skb)->gso_segs); - atomic_add(segs, &sk->sk_drops); + sk_drops_add(sk, segs); } static inline ktime_t sock_read_timestamp(struct sock *sk) @@ -2692,12 +2813,16 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); +bool skb_has_tx_timestamp(struct sk_buff *skb, const struct sock *sk); +int skb_get_tx_timestamp(struct sk_buff *skb, struct sock *sk, + struct timespec64 *ts); + static inline void sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { - ktime_t kt = skb->tstamp; struct skb_shared_hwtstamps *hwtstamps = skb_hwtstamps(skb); - + u32 tsflags = READ_ONCE(sk->sk_tsflags); + ktime_t kt = skb->tstamp; /* * generate control messages if * - receive time stamping in software requested @@ -2705,10 +2830,10 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) * - hardware time stamps available and wanted */ if (sock_flag(sk, SOCK_RCVTSTAMP) || - (sk->sk_tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) || - (kt && sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) || + (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) || + (kt && tsflags & SOF_TIMESTAMPING_SOFTWARE) || (hwtstamps->hwtstamp && - (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE))) + (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE))) __sock_recv_timestamp(msg, sk, skb); else sock_write_timestamp(sk, kt); @@ -2726,11 +2851,13 @@ static inline void sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, { #define FLAGS_RECV_CMSGS ((1UL << SOCK_RXQ_OVFL) | \ (1UL << SOCK_RCVTSTAMP) | \ - (1UL << SOCK_RCVMARK)) + (1UL << SOCK_RCVMARK) | \ + (1UL << SOCK_RCVPRIORITY) | \ + (1UL << SOCK_TIMESTAMPING_ANY)) #define TSFLAGS_ANY (SOF_TIMESTAMPING_SOFTWARE | \ SOF_TIMESTAMPING_RAW_HARDWARE) - if (sk->sk_flags & FLAGS_RECV_CMSGS || sk->sk_tsflags & TSFLAGS_ANY) + if (READ_ONCE(sk->sk_flags) & FLAGS_RECV_CMSGS) __sock_recv_cmsgs(msg, sk, skb); else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP))) sock_write_timestamp(sk, skb->tstamp); @@ -2738,45 +2865,90 @@ static inline void sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, sock_write_timestamp(sk, 0); } -void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags); +void __sock_tx_timestamp(__u32 tsflags, __u8 *tx_flags); /** * _sock_tx_timestamp - checks whether the outgoing packet is to be time stamped * @sk: socket sending this packet - * @tsflags: timestamping flags to use + * @sockc: pointer to socket cmsg cookie to get timestamping info * @tx_flags: completed with instructions for time stamping * @tskey: filled in with next sk_tskey (not for TCP, which uses seqno) * * Note: callers should take care of initial ``*tx_flags`` value (usually 0) */ -static inline void _sock_tx_timestamp(struct sock *sk, __u16 tsflags, +static inline void _sock_tx_timestamp(struct sock *sk, + const struct sockcm_cookie *sockc, __u8 *tx_flags, __u32 *tskey) { + __u32 tsflags = sockc->tsflags; + if (unlikely(tsflags)) { __sock_tx_timestamp(tsflags, tx_flags); if (tsflags & SOF_TIMESTAMPING_OPT_ID && tskey && - tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) - *tskey = atomic_inc_return(&sk->sk_tskey) - 1; + tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) { + if (tsflags & SOCKCM_FLAG_TS_OPT_ID) + *tskey = sockc->ts_opt_id; + else + *tskey = atomic_inc_return(&sk->sk_tskey) - 1; + } } - if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS))) - *tx_flags |= SKBTX_WIFI_STATUS; } -static inline void sock_tx_timestamp(struct sock *sk, __u16 tsflags, +static inline void sock_tx_timestamp(struct sock *sk, + const struct sockcm_cookie *sockc, __u8 *tx_flags) { - _sock_tx_timestamp(sk, tsflags, tx_flags, NULL); + _sock_tx_timestamp(sk, sockc, tx_flags, NULL); } -static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags) +static inline void skb_setup_tx_timestamp(struct sk_buff *skb, + const struct sockcm_cookie *sockc) { - _sock_tx_timestamp(skb->sk, tsflags, &skb_shinfo(skb)->tx_flags, + _sock_tx_timestamp(skb->sk, sockc, &skb_shinfo(skb)->tx_flags, &skb_shinfo(skb)->tskey); } +static inline bool sk_is_inet(const struct sock *sk) +{ + int family = READ_ONCE(sk->sk_family); + + return family == AF_INET || family == AF_INET6; +} + static inline bool sk_is_tcp(const struct sock *sk) { - return sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP; + return sk_is_inet(sk) && + sk->sk_type == SOCK_STREAM && + sk->sk_protocol == IPPROTO_TCP; +} + +static inline bool sk_is_udp(const struct sock *sk) +{ + return sk_is_inet(sk) && + sk->sk_type == SOCK_DGRAM && + sk->sk_protocol == IPPROTO_UDP; +} + +static inline bool sk_is_unix(const struct sock *sk) +{ + return sk->sk_family == AF_UNIX; +} + +static inline bool sk_is_stream_unix(const struct sock *sk) +{ + return sk_is_unix(sk) && sk->sk_type == SOCK_STREAM; +} + +static inline bool sk_is_vsock(const struct sock *sk) +{ + return sk->sk_family == AF_VSOCK; +} + +static inline bool sk_may_scm_recv(const struct sock *sk) +{ + return (IS_ENABLED(CONFIG_UNIX) && sk->sk_family == AF_UNIX) || + sk->sk_family == AF_NETLINK || + (IS_ENABLED(CONFIG_BT) && sk->sk_family == AF_BLUETOOTH); } /** @@ -2818,53 +2990,10 @@ sk_is_refcounted(struct sock *sk) return !sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE); } -/** - * skb_steal_sock - steal a socket from an sk_buff - * @skb: sk_buff to steal the socket from - * @refcounted: is set to true if the socket is reference-counted - * @prefetched: is set to true if the socket was assigned from bpf - */ -static inline struct sock * -skb_steal_sock(struct sk_buff *skb, bool *refcounted, bool *prefetched) -{ - if (skb->sk) { - struct sock *sk = skb->sk; - - *refcounted = true; - *prefetched = skb_sk_is_prefetched(skb); - if (*prefetched) - *refcounted = sk_is_refcounted(sk); - skb->destructor = NULL; - skb->sk = NULL; - return sk; - } - *prefetched = false; - *refcounted = false; - return NULL; -} - -/* Checks if this SKB belongs to an HW offloaded socket - * and whether any SW fallbacks are required based on dev. - * Check decrypted mark in case skb_orphan() cleared socket. - */ -static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, - struct net_device *dev) +static inline bool +sk_requests_wifi_status(struct sock *sk) { -#ifdef CONFIG_SOCK_VALIDATE_XMIT - struct sock *sk = skb->sk; - - if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) { - skb = sk->sk_validate_xmit_skb(sk, dev, skb); -#ifdef CONFIG_TLS_DEVICE - } else if (unlikely(skb->decrypted)) { - pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n"); - kfree_skb(skb); - skb = NULL; -#endif - } -#endif - - return skb; + return sk && sk_fullsock(sk) && sock_flag(sk, SOCK_WIFI_STATUS); } /* This helper checks if a socket is a LISTEN or NEW_SYN_RECV @@ -2875,6 +3004,16 @@ static inline bool sk_listener(const struct sock *sk) return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV); } +/* This helper checks if a socket is a LISTEN or NEW_SYN_RECV or TIME_WAIT + * TCP SYNACK messages can be attached to LISTEN or NEW_SYN_RECV (depending on SYNCOOKIE) + * TCP RST and ACK can be attached to TIME_WAIT. + */ +static inline bool sk_listener_or_tw(const struct sock *sk) +{ + return (1 << READ_ONCE(sk->sk_state)) & + (TCPF_LISTEN | TCPF_NEW_SYN_RECV | TCPF_TIME_WAIT); +} + void sock_enable_timestamp(struct sock *sk, enum sock_flags flag); int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level, int type); @@ -2893,15 +3032,12 @@ void sk_get_meminfo(const struct sock *sk, u32 *meminfo); */ #define _SK_MEM_PACKETS 256 #define _SK_MEM_OVERHEAD SKB_TRUESIZE(256) -#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) -#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) +#define SK_WMEM_DEFAULT (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) +#define SK_RMEM_DEFAULT (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) extern __u32 sysctl_wmem_max; extern __u32 sysctl_rmem_max; -extern int sysctl_tstamp_allow_data; -extern int sysctl_optmem_max; - extern __u32 sysctl_wmem_default; extern __u32 sysctl_rmem_default; @@ -2964,7 +3100,13 @@ void sock_set_timestamp(struct sock *sk, int optname, bool valbool); int sock_set_timestamping(struct sock *sk, int optname, struct so_timestamping timestamping); -void sock_enable_timestamps(struct sock *sk); +#if defined(CONFIG_CGROUP_BPF) +void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op); +#else +static inline void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op) +{ +} +#endif void sock_no_linger(struct sock *sk); void sock_set_keepalive(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); @@ -2974,7 +3116,7 @@ void sock_set_reuseaddr(struct sock *sk); void sock_set_reuseport(struct sock *sk); void sock_set_sndtimeo(struct sock *sk, s64 secs); -int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len); +int sock_bind_add(struct sock *sk, struct sockaddr_unsized *addr, int addr_len); int sock_get_timeout(long timeo, void *optval, bool old_timeval); int sock_copy_user_timeval(struct __kernel_sock_timeval *tv, @@ -2985,8 +3127,11 @@ int sock_ioctl_inout(struct sock *sk, unsigned int cmd, int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); static inline bool sk_is_readable(struct sock *sk) { - if (sk->sk_prot->sock_is_readable) - return sk->sk_prot->sock_is_readable(sk); + const struct proto *prot = READ_ONCE(sk->sk_prot); + + if (prot->sock_is_readable) + return prot->sock_is_readable(sk); + return false; } #endif /* _SOCK_H */ diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h index 6ec140b0a61b..6e4faf3ee76f 100644 --- a/include/net/sock_reuseport.h +++ b/include/net/sock_reuseport.h @@ -26,7 +26,7 @@ struct sock_reuseport { unsigned int bind_inany:1; unsigned int has_conns:1; struct bpf_prog __rcu *prog; /* optional BPF sock selector */ - struct sock *socks[]; /* array of sock pointers */ + struct sock *socks[] __counted_by(max_socks); }; extern int reuseport_alloc(struct sock *sk, bool bind_inany); diff --git a/include/net/strparser.h b/include/net/strparser.h index 41e2ce9e9e10..0ed73e364faa 100644 --- a/include/net/strparser.h +++ b/include/net/strparser.h @@ -43,6 +43,8 @@ struct strparser; struct strp_callbacks { int (*parse_msg)(struct strparser *strp, struct sk_buff *skb); void (*rcv_msg)(struct strparser *strp, struct sk_buff *skb); + int (*read_sock)(struct strparser *strp, read_descriptor_t *desc, + sk_read_actor_t recv_actor); int (*read_sock_done)(struct strparser *strp, int err); void (*abort_parser)(struct strparser *strp, int err); void (*lock)(struct strparser *strp); @@ -112,8 +114,6 @@ static inline void strp_pause(struct strparser *strp) /* May be called without holding lock for attached socket */ void strp_unpause(struct strparser *strp); -/* Must be called with process lock held (lock_sock) */ -void __strp_unpause(struct strparser *strp); static inline void save_strp_stats(struct strparser *strp, struct strp_aggr_stats *agg_stats) diff --git a/include/net/switchdev.h b/include/net/switchdev.h index a43062d4c734..8346b0d29542 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -308,6 +308,9 @@ void switchdev_deferred_process(void); int switchdev_port_attr_set(struct net_device *dev, const struct switchdev_attr *attr, struct netlink_ext_ack *extack); +bool switchdev_port_obj_act_is_deferred(struct net_device *dev, + enum switchdev_notifier_type nt, + const struct switchdev_obj *obj); int switchdev_port_obj_add(struct net_device *dev, const struct switchdev_obj *obj, struct netlink_ext_ack *extack); diff --git a/include/net/tc_act/tc_connmark.h b/include/net/tc_act/tc_connmark.h index e8dd77a96748..a5ce83f3eea4 100644 --- a/include/net/tc_act/tc_connmark.h +++ b/include/net/tc_act/tc_connmark.h @@ -7,6 +7,7 @@ struct tcf_connmark_parms { struct net *net; u16 zone; + int action; struct rcu_head rcu; }; diff --git a/include/net/tc_act/tc_csum.h b/include/net/tc_act/tc_csum.h index 68269e4581b7..8d0c7a9f9345 100644 --- a/include/net/tc_act/tc_csum.h +++ b/include/net/tc_act/tc_csum.h @@ -8,6 +8,7 @@ struct tcf_csum_params { u32 update_flags; + int action; struct rcu_head rcu; }; @@ -18,15 +19,6 @@ struct tcf_csum { }; #define to_tcf_csum(a) ((struct tcf_csum *)a) -static inline bool is_tcf_csum(const struct tc_action *a) -{ -#ifdef CONFIG_NET_CLS_ACT - if (a->ops && a->ops->id == TCA_ID_CSUM) - return true; -#endif - return false; -} - static inline u32 tcf_csum_update_flags(const struct tc_action *a) { u32 update_flags; diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h index b24ea2d9400b..8b90c86c0b0d 100644 --- a/include/net/tc_act/tc_ct.h +++ b/include/net/tc_act/tc_ct.h @@ -13,7 +13,7 @@ struct tcf_ct_params { struct nf_conntrack_helper *helper; struct nf_conn *tmpl; u16 zone; - + int action; u32 mark; u32 mark_mask; @@ -22,6 +22,7 @@ struct tcf_ct_params { struct nf_nat_range2 range; bool ipv4_range; + bool put_labels; u16 ct_action; @@ -57,6 +58,11 @@ static inline struct nf_flowtable *tcf_ct_ft(const struct tc_action *a) return to_ct_params(a)->nf_ft; } +static inline struct nf_conntrack_helper *tcf_ct_helper(const struct tc_action *a) +{ + return to_ct_params(a)->helper; +} + #else static inline uint16_t tcf_ct_zone(const struct tc_action *a) { return 0; } static inline int tcf_ct_action(const struct tc_action *a) { return 0; } @@ -64,6 +70,10 @@ static inline struct nf_flowtable *tcf_ct_ft(const struct tc_action *a) { return NULL; } +static inline struct nf_conntrack_helper *tcf_ct_helper(const struct tc_action *a) +{ + return NULL; +} #endif /* CONFIG_NF_CONNTRACK */ #if IS_ENABLED(CONFIG_NET_ACT_CT) @@ -82,13 +92,4 @@ static inline void tcf_ct_flow_table_restore_skb(struct sk_buff *skb, unsigned long cookie) { } #endif -static inline bool is_tcf_ct(const struct tc_action *a) -{ -#if defined(CONFIG_NET_CLS_ACT) && IS_ENABLED(CONFIG_NF_CONNTRACK) - if (a->ops && a->ops->id == TCA_ID_CT) - return true; -#endif - return false; -} - #endif /* __NET_TC_CT_H */ diff --git a/include/net/tc_act/tc_ctinfo.h b/include/net/tc_act/tc_ctinfo.h index f071c1d70a25..7fe01ab236da 100644 --- a/include/net/tc_act/tc_ctinfo.h +++ b/include/net/tc_act/tc_ctinfo.h @@ -7,6 +7,7 @@ struct tcf_ctinfo_params { struct rcu_head rcu; struct net *net; + int action; u32 dscpmask; u32 dscpstatemask; u32 cpmarkmask; @@ -18,9 +19,9 @@ struct tcf_ctinfo_params { struct tcf_ctinfo { struct tc_action common; struct tcf_ctinfo_params __rcu *params; - u64 stats_dscp_set; - u64 stats_dscp_error; - u64 stats_cpmark_set; + atomic64_t stats_dscp_set; + atomic64_t stats_dscp_error; + atomic64_t stats_cpmark_set; }; enum { diff --git a/include/net/tc_act/tc_gate.h b/include/net/tc_act/tc_gate.h index c8fa11ebb397..c1a67149c6b6 100644 --- a/include/net/tc_act/tc_gate.h +++ b/include/net/tc_act/tc_gate.h @@ -51,15 +51,6 @@ struct tcf_gate { #define to_gate(a) ((struct tcf_gate *)a) -static inline bool is_tcf_gate(const struct tc_action *a) -{ -#ifdef CONFIG_NET_CLS_ACT - if (a->ops && a->ops->id == TCA_ID_GATE) - return true; -#endif - return false; -} - static inline s32 tcf_gate_prio(const struct tc_action *a) { s32 tcfg_prio; diff --git a/include/net/tc_act/tc_ipt.h b/include/net/tc_act/tc_ipt.h deleted file mode 100644 index 4225fcb1c6ba..000000000000 --- a/include/net/tc_act/tc_ipt.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __NET_TC_IPT_H -#define __NET_TC_IPT_H - -#include <net/act_api.h> - -struct xt_entry_target; - -struct tcf_ipt { - struct tc_action common; - u32 tcfi_hook; - char *tcfi_tname; - struct xt_entry_target *tcfi_t; -}; -#define to_ipt(a) ((struct tcf_ipt *)a) - -#endif /* __NET_TC_IPT_H */ diff --git a/include/net/tc_act/tc_mirred.h b/include/net/tc_act/tc_mirred.h index 32ce8ea36950..75722d967bf2 100644 --- a/include/net/tc_act/tc_mirred.h +++ b/include/net/tc_act/tc_mirred.h @@ -8,6 +8,7 @@ struct tcf_mirred { struct tc_action common; int tcfm_eaction; + u32 tcfm_blockid; bool tcfm_mac_header_xmit; struct net_device __rcu *tcfm_dev; netdevice_tracker tcfm_dev_tracker; diff --git a/include/net/tc_act/tc_mpls.h b/include/net/tc_act/tc_mpls.h index 721de4f5733a..dd067bd4018d 100644 --- a/include/net/tc_act/tc_mpls.h +++ b/include/net/tc_act/tc_mpls.h @@ -10,6 +10,7 @@ struct tcf_mpls_params { int tcfm_action; u32 tcfm_label; + int action; /* tcf_action */ u8 tcfm_tc; u8 tcfm_ttl; u8 tcfm_bos; @@ -27,15 +28,6 @@ struct tcf_mpls { }; #define to_mpls(a) ((struct tcf_mpls *)a) -static inline bool is_tcf_mpls(const struct tc_action *a) -{ -#ifdef CONFIG_NET_CLS_ACT - if (a->ops && a->ops->id == TCA_ID_MPLS) - return true; -#endif - return false; -} - static inline u32 tcf_mpls_action(const struct tc_action *a) { u32 tcfm_action; diff --git a/include/net/tc_act/tc_nat.h b/include/net/tc_act/tc_nat.h index c869274ac529..ae35f4009445 100644 --- a/include/net/tc_act/tc_nat.h +++ b/include/net/tc_act/tc_nat.h @@ -6,6 +6,7 @@ #include <net/act_api.h> struct tcf_nat_parms { + int action; __be32 old_addr; __be32 new_addr; __be32 mask; diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h index 83fe39931781..f58ee15cd858 100644 --- a/include/net/tc_act/tc_pedit.h +++ b/include/net/tc_act/tc_pedit.h @@ -14,6 +14,7 @@ struct tcf_pedit_key_ex { struct tcf_pedit_parms { struct tc_pedit_key *tcfp_keys; struct tcf_pedit_key_ex *tcfp_keys_ex; + int action; u32 tcfp_off_max_hint; unsigned char tcfp_nkeys; unsigned char tcfp_flags; diff --git a/include/net/tc_act/tc_police.h b/include/net/tc_act/tc_police.h index 283bde711a42..a89fc8e68b1e 100644 --- a/include/net/tc_act/tc_police.h +++ b/include/net/tc_act/tc_police.h @@ -5,10 +5,11 @@ #include <net/act_api.h> struct tcf_police_params { + int action; int tcfp_result; u32 tcfp_ewma_rate; - s64 tcfp_burst; u32 tcfp_mtu; + s64 tcfp_burst; s64 tcfp_mtu_ptoks; s64 tcfp_pkt_burst; struct psched_ratecfg rate; @@ -44,15 +45,6 @@ struct tc_police_compat { struct tc_ratespec peakrate; }; -static inline bool is_tcf_police(const struct tc_action *act) -{ -#ifdef CONFIG_NET_CLS_ACT - if (act->ops && act->ops->id == TCA_ID_POLICE) - return true; -#endif - return false; -} - static inline u64 tcf_police_rate_bytes_ps(const struct tc_action *act) { struct tcf_police *police = to_police(act); diff --git a/include/net/tc_act/tc_sample.h b/include/net/tc_act/tc_sample.h index b5d76305e854..abd163ca1864 100644 --- a/include/net/tc_act/tc_sample.h +++ b/include/net/tc_act/tc_sample.h @@ -17,15 +17,6 @@ struct tcf_sample { }; #define to_sample(a) ((struct tcf_sample *)a) -static inline bool is_tcf_sample(const struct tc_action *a) -{ -#ifdef CONFIG_NET_CLS_ACT - return a->ops && a->ops->id == TCA_ID_SAMPLE; -#else - return false; -#endif -} - static inline __u32 tcf_sample_rate(const struct tc_action *a) { return to_sample(a)->rate; diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h index 9649600fb3dc..31b2cd0bebb5 100644 --- a/include/net/tc_act/tc_skbedit.h +++ b/include/net/tc_act/tc_skbedit.h @@ -12,6 +12,7 @@ #include <linux/tc_act/tc_skbedit.h> struct tcf_skbedit_params { + int action; u32 flags; u32 priority; u32 mark; diff --git a/include/net/tc_act/tc_skbmod.h b/include/net/tc_act/tc_skbmod.h index 7c240d2fed4e..626704cd6241 100644 --- a/include/net/tc_act/tc_skbmod.h +++ b/include/net/tc_act/tc_skbmod.h @@ -12,6 +12,7 @@ struct tcf_skbmod_params { struct rcu_head rcu; u64 flags; /*up to 64 types of operations; extend if needed */ + int action; u8 eth_dst[ETH_ALEN]; u16 eth_type; u8 eth_src[ETH_ALEN]; diff --git a/include/net/tc_act/tc_tunnel_key.h b/include/net/tc_act/tc_tunnel_key.h index 879fe8cff581..0f1925f97520 100644 --- a/include/net/tc_act/tc_tunnel_key.h +++ b/include/net/tc_act/tc_tunnel_key.h @@ -14,6 +14,7 @@ struct tcf_tunnel_key_params { struct rcu_head rcu; int tcft_action; + int action; struct metadata_dst *tcft_enc_metadata; }; diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h index 904eddfc1826..beadee41669a 100644 --- a/include/net/tc_act/tc_vlan.h +++ b/include/net/tc_act/tc_vlan.h @@ -10,6 +10,7 @@ #include <linux/tc_act/tc_vlan.h> struct tcf_vlan_params { + int action; int tcfv_action; unsigned char tcfv_push_dst[ETH_ALEN]; unsigned char tcfv_push_src[ETH_ALEN]; @@ -26,15 +27,6 @@ struct tcf_vlan { }; #define to_vlan(a) ((struct tcf_vlan *)a) -static inline bool is_tcf_vlan(const struct tc_action *a) -{ -#ifdef CONFIG_NET_CLS_ACT - if (a->ops && a->ops->id == TCA_ID_VLAN) - return true; -#endif - return false; -} - static inline u32 tcf_vlan_action(const struct tc_action *a) { u32 tcfv_action; diff --git a/include/net/tc_wrapper.h b/include/net/tc_wrapper.h index a6d481b5bcbc..ffe58a02537c 100644 --- a/include/net/tc_wrapper.h +++ b/include/net/tc_wrapper.h @@ -4,7 +4,7 @@ #include <net/pkt_cls.h> -#if IS_ENABLED(CONFIG_RETPOLINE) +#if IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) #include <linux/cpufeature.h> #include <linux/static_key.h> @@ -117,10 +117,6 @@ static inline int tc_act(struct sk_buff *skb, const struct tc_action *a, if (a->ops->act == tcf_ife_act) return tcf_ife_act(skb, a, res); #endif -#if IS_BUILTIN(CONFIG_NET_ACT_IPT) - if (a->ops->act == tcf_ipt_act) - return tcf_ipt_act(skb, a, res); -#endif #if IS_BUILTIN(CONFIG_NET_ACT_SIMP) if (a->ops->act == tcf_simp_act) return tcf_simp_act(skb, a, res); diff --git a/include/net/tcp.h b/include/net/tcp.h index 91688d0dadcd..0deb5e9dd911 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -26,6 +26,7 @@ #include <linux/kref.h> #include <linux/ktime.h> #include <linux/indirect_call_wrapper.h> +#include <linux/bits.h> #include <net/inet_connection_sock.h> #include <net/inet_timewait_sock.h> @@ -37,9 +38,11 @@ #include <net/snmp.h> #include <net/ip.h> #include <net/tcp_states.h> +#include <net/tcp_ao.h> #include <net/inet_ecn.h> #include <net/dst.h> #include <net/mptcp.h> +#include <net/xfrm.h> #include <linux/seq_file.h> #include <linux/memcontrol.h> @@ -51,6 +54,18 @@ extern struct inet_hashinfo tcp_hashinfo; DECLARE_PER_CPU(unsigned int, tcp_orphan_count); int tcp_orphan_count_sum(void); +static inline void tcp_orphan_count_inc(void) +{ + this_cpu_inc(tcp_orphan_count); +} + +static inline void tcp_orphan_count_dec(void) +{ + this_cpu_dec(tcp_orphan_count); +} + +DECLARE_PER_CPU(u32, tcp_tw_isn); + void tcp_time_wait(struct sock *sk, int state, int timeo); #define MAX_TCP_HEADER L1_CACHE_ALIGN(128 + MAX_HEADER) @@ -85,6 +100,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* Maximal number of window scale according to RFC1323 */ #define TCP_MAX_WSCALE 14U +/* Default sending frequency of accurate ECN option per RTT */ +#define TCP_ACCECN_OPTION_BEACON 3 + /* urg_data states */ #define TCP_URG_VALID 0x0100 #define TCP_URG_NOTYET 0x0200 @@ -131,6 +149,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCP_FIN_TIMEOUT_MAX (120 * HZ) /* max TCP_LINGER2 value (two minutes) */ #define TCP_DELACK_MAX ((unsigned)(HZ/5)) /* maximal time to delay before sending an ACK */ +static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); + #if HZ >= 100 #define TCP_DELACK_MIN ((unsigned)(HZ/25)) /* minimal time to delay before sending an ACK */ #define TCP_ATO_MIN ((unsigned)(HZ/25)) @@ -138,9 +158,13 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCP_DELACK_MIN 4U #define TCP_ATO_MIN 4U #endif -#define TCP_RTO_MAX ((unsigned)(120*HZ)) -#define TCP_RTO_MIN ((unsigned)(HZ/5)) +#define TCP_RTO_MAX_SEC 120 +#define TCP_RTO_MAX ((unsigned)(TCP_RTO_MAX_SEC * HZ)) +#define TCP_RTO_MIN ((unsigned)(HZ / 5)) #define TCP_TIMEOUT_MIN (2U) /* Min timeout for TCP timers in jiffies */ + +#define TCP_TIMEOUT_MIN_US (2*USEC_PER_MSEC) /* Min TCP timeout in microsecs */ + #define TCP_TIMEOUT_INIT ((unsigned)(1*HZ)) /* RFC6298 2.1 initial RTO value */ #define TCP_TIMEOUT_FALLBACK ((unsigned)(3*HZ)) /* RFC 1122 initial RTO value, now * used as a fallback RTO for the @@ -161,7 +185,12 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define MAX_TCP_KEEPCNT 127 #define MAX_TCP_SYNCNT 127 -#define TCP_PAWS_24DAYS (60 * 60 * 24 * 24) +/* Ensure that TCP PAWS checks are relaxed after ~2147 seconds + * to avoid overflows. This assumes a clock smaller than 1 Mhz. + * Default clock is 1 Khz, tcp_usec_ts uses 1 Mhz. + */ +#define TCP_PAWS_WRAP (INT_MAX / USEC_PER_SEC) + #define TCP_PAWS_MSL 60 /* Per-host timestamps are invalidated * after this time. It should be equal * (or greater than) TCP_TIMEWAIT_LEN @@ -184,8 +213,11 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCPOPT_SACK 5 /* SACK Block */ #define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */ #define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */ +#define TCPOPT_AO 29 /* Authentication Option (RFC5925) */ #define TCPOPT_MPTCP 30 /* Multipath TCP (RFC6824) */ #define TCPOPT_FASTOPEN 34 /* Fast open (RFC7413) */ +#define TCPOPT_ACCECN0 172 /* 0xAC: Accurate ECN Order 0 */ +#define TCPOPT_ACCECN1 174 /* 0xAE: Accurate ECN Order 1 */ #define TCPOPT_EXP 254 /* Experimental */ /* Magic number to be after the option value for sharing TCP * experimental options. See draft-ietf-tcpm-experimental-options-00.txt @@ -203,6 +235,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCPOLEN_TIMESTAMP 10 #define TCPOLEN_MD5SIG 18 #define TCPOLEN_FASTOPEN_BASE 2 +#define TCPOLEN_ACCECN_BASE 2 #define TCPOLEN_EXP_FASTOPEN_BASE 4 #define TCPOLEN_EXP_SMC_BASE 6 @@ -216,6 +249,14 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCPOLEN_MD5SIG_ALIGNED 20 #define TCPOLEN_MSS_ALIGNED 4 #define TCPOLEN_EXP_SMC_BASE_ALIGNED 8 +#define TCPOLEN_ACCECN_PERFIELD 3 + +/* Maximum number of byte counters in AccECN option + size */ +#define TCP_ACCECN_NUMFIELDS 3 +#define TCP_ACCECN_MAXSIZE (TCPOLEN_ACCECN_BASE + \ + TCPOLEN_ACCECN_PERFIELD * \ + TCP_ACCECN_NUMFIELDS) +#define TCP_ACCECN_SAFETY_SHIFT 1 /* SAFETY_FACTOR in accecn draft */ /* Flags in tp->nonagle */ #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */ @@ -250,7 +291,6 @@ extern long sysctl_tcp_mem[3]; #define TCP_RACK_STATIC_REO_WND 0x2 /* Use static RACK reo wnd */ #define TCP_RACK_NO_DUPTHRESH 0x4 /* Do not use DUPACK threshold in RACK */ -extern atomic_long_t tcp_memory_allocated; DECLARE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc); extern struct percpu_counter tcp_sockets_allocated; @@ -259,10 +299,13 @@ extern unsigned long tcp_memory_pressure; /* optimized version of sk_under_memory_pressure() for TCP sockets */ static inline bool tcp_under_memory_pressure(const struct sock *sk) { - if (mem_cgroup_sockets_enabled && sk->sk_memcg && - mem_cgroup_under_socket_pressure(sk->sk_memcg)) + if (mem_cgroup_sk_enabled(sk) && + mem_cgroup_sk_under_memory_pressure(sk)) return true; + if (sk->sk_bypass_prot_mem) + return false; + return READ_ONCE(tcp_memory_pressure); } /* @@ -282,14 +325,6 @@ static inline bool between(__u32 seq1, __u32 seq2, __u32 seq3) return seq3 - seq2 >= seq1 - seq2; } -static inline bool tcp_out_of_memory(struct sock *sk) -{ - if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && - sk_memory_allocated(sk) > sk_prot_mem_limits(sk, 2)) - return true; - return false; -} - static inline void tcp_wmem_free_skb(struct sock *sk, struct sk_buff *skb) { sk_wmem_queued_add(sk, -skb->truesize); @@ -302,7 +337,7 @@ static inline void tcp_wmem_free_skb(struct sock *sk, struct sk_buff *skb) void sk_forced_mem_schedule(struct sock *sk, int size); -bool tcp_check_oom(struct sock *sk, int shift); +bool tcp_check_oom(const struct sock *sk, int shift); extern struct proto tcp_prot; @@ -312,7 +347,7 @@ extern struct proto tcp_prot; #define TCP_DEC_STATS(net, field) SNMP_DEC_STATS((net)->mib.tcp_statistics, field) #define TCP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val) -void tcp_tasklet_init(void); +void tcp_tsq_work_init(void); int tcp_v4_err(struct sk_buff *skb, u32); @@ -336,24 +371,27 @@ void tcp_wfree(struct sk_buff *skb); void tcp_write_timer_handler(struct sock *sk); void tcp_delack_timer_handler(struct sock *sk); int tcp_ioctl(struct sock *sk, int cmd, int *karg); -int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb); +enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb); void tcp_rcv_established(struct sock *sk, struct sk_buff *skb); +void tcp_rcvbuf_grow(struct sock *sk, u32 newval); void tcp_rcv_space_adjust(struct sock *sk); int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); void tcp_twsk_destructor(struct sock *sk); -void tcp_twsk_purge(struct list_head *net_exit_list, int family); +void tcp_twsk_purge(struct list_head *net_exit_list); ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp, bool force_schedule); -static inline void tcp_dec_quickack_mode(struct sock *sk, - const unsigned int pkts) +static inline void tcp_dec_quickack_mode(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ack.quick) { + /* How many ACKs S/ACKing new data have we sent? */ + const unsigned int pkts = inet_csk_ack_scheduled(sk) ? 1 : 0; + if (pkts >= icsk->icsk_ack.quick) { icsk->icsk_ack.quick = 0; /* Leaving quickack mode we deflate ATO. */ @@ -363,30 +401,71 @@ static inline void tcp_dec_quickack_mode(struct sock *sk, } } -#define TCP_ECN_OK 1 -#define TCP_ECN_QUEUE_CWR 2 -#define TCP_ECN_DEMAND_CWR 4 -#define TCP_ECN_SEEN 8 +#define TCP_ECN_MODE_RFC3168 BIT(0) +#define TCP_ECN_QUEUE_CWR BIT(1) +#define TCP_ECN_DEMAND_CWR BIT(2) +#define TCP_ECN_SEEN BIT(3) +#define TCP_ECN_MODE_ACCECN BIT(4) + +#define TCP_ECN_DISABLED 0 +#define TCP_ECN_MODE_PENDING (TCP_ECN_MODE_RFC3168 | TCP_ECN_MODE_ACCECN) +#define TCP_ECN_MODE_ANY (TCP_ECN_MODE_RFC3168 | TCP_ECN_MODE_ACCECN) + +static inline bool tcp_ecn_mode_any(const struct tcp_sock *tp) +{ + return tp->ecn_flags & TCP_ECN_MODE_ANY; +} + +static inline bool tcp_ecn_mode_rfc3168(const struct tcp_sock *tp) +{ + return (tp->ecn_flags & TCP_ECN_MODE_ANY) == TCP_ECN_MODE_RFC3168; +} + +static inline bool tcp_ecn_mode_accecn(const struct tcp_sock *tp) +{ + return (tp->ecn_flags & TCP_ECN_MODE_ANY) == TCP_ECN_MODE_ACCECN; +} + +static inline bool tcp_ecn_disabled(const struct tcp_sock *tp) +{ + return !tcp_ecn_mode_any(tp); +} + +static inline bool tcp_ecn_mode_pending(const struct tcp_sock *tp) +{ + return (tp->ecn_flags & TCP_ECN_MODE_PENDING) == TCP_ECN_MODE_PENDING; +} + +static inline void tcp_ecn_mode_set(struct tcp_sock *tp, u8 mode) +{ + tp->ecn_flags &= ~TCP_ECN_MODE_ANY; + tp->ecn_flags |= mode; +} enum tcp_tw_status { TCP_TW_SUCCESS = 0, TCP_TW_RST = 1, TCP_TW_ACK = 2, - TCP_TW_SYN = 3 + TCP_TW_SYN = 3, + TCP_TW_ACK_OOW = 4 }; enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, - const struct tcphdr *th); + const struct tcphdr *th, + u32 *tw_isn, + enum skb_drop_reason *drop_reason); struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, bool fastopen, - bool *lost_race); -int tcp_child_process(struct sock *parent, struct sock *child, - struct sk_buff *skb); + bool *lost_race, enum skb_drop_reason *drop_reason); +enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child, + struct sk_buff *skb); void tcp_enter_loss(struct sock *sk); void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, int flag); void tcp_clear_retrans(struct tcp_sock *tp); +void tcp_update_pacing_rate(struct sock *sk); +void tcp_set_rto(struct sock *sk); void tcp_update_metrics(struct sock *sk); void tcp_init_metrics(struct sock *sk); void tcp_metrics_init(void); @@ -406,6 +485,7 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); +void tcp_reset_keepalive_timer(struct sock *sk, unsigned long timeout); void tcp_set_keepalive(struct sock *sk, int val); void tcp_syn_ack_timeout(const struct request_sock *req); int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, @@ -424,7 +504,6 @@ int tcp_mmap(struct file *file, struct socket *sock, void tcp_parse_options(const struct net *net, const struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab, struct tcp_fastopen_cookie *foc); -const u8 *tcp_parse_md5sig_option(const struct tcphdr *th); /* * BPF SKB-less helpers @@ -456,7 +535,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req_unhash, bool *own_req); int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb); -int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); +int tcp_v4_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len); int tcp_connect(struct sock *sk); enum tcp_synack_type { TCP_SYNACK_NORMAL, @@ -477,13 +556,30 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb); /* From syncookies.c */ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct dst_entry *dst, u32 tsoff); -int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, - u32 cookie); + struct dst_entry *dst); +int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th); struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb); struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, - const struct tcp_request_sock_ops *af_ops, - struct sock *sk, struct sk_buff *skb); + struct sock *sk, struct sk_buff *skb, + struct tcp_options_received *tcp_opt, + int mss, u32 tsoff); + +#if IS_ENABLED(CONFIG_BPF) +struct bpf_tcp_req_attrs { + u32 rcv_tsval; + u32 rcv_tsecr; + u16 mss; + u8 rcv_wscale; + u8 snd_wscale; + u8 ecn_ok; + u8 wscale_ok; + u8 sack_ok; + u8 tstamp_ok; + u8 usec_ts_ok; + u8 reserved[3]; +}; +#endif + #ifdef CONFIG_SYN_COOKIES /* Syncookies use a monotonic timer which increments every 60 seconds. @@ -563,18 +659,50 @@ static inline u32 tcp_cookie_time(void) return val; } +/* Convert one nsec 64bit timestamp to ts (ms or usec resolution) */ +static inline u64 tcp_ns_to_ts(bool usec_ts, u64 val) +{ + if (usec_ts) + return div_u64(val, NSEC_PER_USEC); + + return div_u64(val, NSEC_PER_MSEC); +} + u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, u16 *mssp); __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss); u64 cookie_init_timestamp(struct request_sock *req, u64 now); bool cookie_timestamp_decode(const struct net *net, struct tcp_options_received *opt); -bool cookie_ecn_ok(const struct tcp_options_received *opt, - const struct net *net, const struct dst_entry *dst); + +static inline bool cookie_ecn_ok(const struct net *net, const struct dst_entry *dst) +{ + return READ_ONCE(net->ipv4.sysctl_tcp_ecn) || + dst_feature(dst, RTAX_FEATURE_ECN); +} + +#if IS_ENABLED(CONFIG_BPF) +static inline bool cookie_bpf_ok(struct sk_buff *skb) +{ + return skb->sk; +} + +struct request_sock *cookie_bpf_check(struct sock *sk, struct sk_buff *skb); +#else +static inline bool cookie_bpf_ok(struct sk_buff *skb) +{ + return false; +} + +static inline struct request_sock *cookie_bpf_check(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + return NULL; +} +#endif /* From net/ipv6/syncookies.c */ -int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th, - u32 cookie); +int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th); struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb); u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph, @@ -605,10 +733,11 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, void tcp_send_probe0(struct sock *); int tcp_write_wakeup(struct sock *, int mib); void tcp_send_fin(struct sock *sk); -void tcp_send_active_reset(struct sock *sk, gfp_t priority); +void tcp_send_active_reset(struct sock *sk, gfp_t priority, + enum sk_rst_reason reason); int tcp_send_synack(struct sock *); void tcp_push_one(struct sock *, unsigned int mss_now); -void __tcp_send_ack(struct sock *sk, u32 rcv_nxt); +void __tcp_send_ack(struct sock *sk, u32 rcv_nxt, u16 flags); void tcp_send_ack(struct sock *sk); void tcp_send_delayed_ack(struct sock *sk); void tcp_send_loss_probe(struct sock *sk); @@ -619,11 +748,25 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb, /* tcp_input.c */ void tcp_rearm_rto(struct sock *sk); void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req); +void tcp_done_with_error(struct sock *sk, int err); void tcp_reset(struct sock *sk, struct sk_buff *skb); void tcp_fin(struct sock *sk); void tcp_check_space(struct sock *sk); void tcp_sack_compress_send_ack(struct sock *sk); +static inline void tcp_cleanup_skb(struct sk_buff *skb) +{ + skb_dst_drop(skb); + secpath_reset(skb); +} + +static inline void tcp_add_receive_queue(struct sock *sk, struct sk_buff *skb) +{ + DEBUG_NET_WARN_ON_ONCE(skb_dst(skb)); + DEBUG_NET_WARN_ON_ONCE(secpath_exists(skb)); + __skb_queue_tail(&sk->sk_receive_queue, skb); +} + /* tcp_timer.c */ void tcp_init_xmit_timers(struct sock *); static inline void tcp_clear_xmit_timers(struct sock *sk) @@ -670,6 +813,9 @@ void tcp_get_info(struct sock *, struct tcp_info *); /* Read 'sendfile()'-style from a TCP socket */ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); +int tcp_read_sock_noack(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor, bool noack, + u32 *copied_seq); int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor); struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off); void tcp_read_done(struct sock *sk, size_t len); @@ -680,56 +826,43 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu); int tcp_mss_to_mtu(struct sock *sk, int mss); void tcp_mtup_init(struct sock *sk); -static inline void tcp_bound_rto(const struct sock *sk) -{ - if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX) - inet_csk(sk)->icsk_rto = TCP_RTO_MAX; -} - -static inline u32 __tcp_set_rto(const struct tcp_sock *tp) +static inline unsigned int tcp_rto_max(const struct sock *sk) { - return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us); + return READ_ONCE(inet_csk(sk)->icsk_rto_max); } -static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) +static inline void tcp_bound_rto(struct sock *sk) { - /* mptcp hooks are only on the slow path */ - if (sk_is_mptcp((struct sock *)tp)) - return; - - tp->pred_flags = htonl((tp->tcp_header_len << 26) | - ntohl(TCP_FLAG_ACK) | - snd_wnd); + inet_csk(sk)->icsk_rto = min(inet_csk(sk)->icsk_rto, tcp_rto_max(sk)); } -static inline void tcp_fast_path_on(struct tcp_sock *tp) +static inline u32 __tcp_set_rto(const struct tcp_sock *tp) { - __tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale); + return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us); } -static inline void tcp_fast_path_check(struct sock *sk) +static inline unsigned long tcp_reqsk_timeout(struct request_sock *req) { - struct tcp_sock *tp = tcp_sk(sk); + u64 timeout = (u64)req->timeout << req->num_timeout; - if (RB_EMPTY_ROOT(&tp->out_of_order_queue) && - tp->rcv_wnd && - atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf && - !tp->urg_data) - tcp_fast_path_on(tp); + return (unsigned long)min_t(u64, timeout, + tcp_rto_max(req->rsk_listener)); } +u32 tcp_delack_max(const struct sock *sk); + /* Compute the actual rto_min value */ -static inline u32 tcp_rto_min(struct sock *sk) +static inline u32 tcp_rto_min(const struct sock *sk) { const struct dst_entry *dst = __sk_dst_get(sk); - u32 rto_min = inet_csk(sk)->icsk_rto_min; + u32 rto_min = READ_ONCE(inet_csk(sk)->icsk_rto_min); if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN); return rto_min; } -static inline u32 tcp_rto_min_us(struct sock *sk) +static inline u32 tcp_rto_min_us(const struct sock *sk) { return jiffies_to_usecs(tcp_rto_min(sk)); } @@ -789,22 +922,31 @@ static inline u64 tcp_clock_us(void) return div_u64(tcp_clock_ns(), NSEC_PER_USEC); } -/* This should only be used in contexts where tp->tcp_mstamp is up to date */ -static inline u32 tcp_time_stamp(const struct tcp_sock *tp) +static inline u64 tcp_clock_ms(void) +{ + return div_u64(tcp_clock_ns(), NSEC_PER_MSEC); +} + +/* TCP Timestamp included in TS option (RFC 1323) can either use ms + * or usec resolution. Each socket carries a flag to select one or other + * resolution, as the route attribute could change anytime. + * Each flow must stick to initial resolution. + */ +static inline u32 tcp_clock_ts(bool usec_ts) { - return div_u64(tp->tcp_mstamp, USEC_PER_SEC / TCP_TS_HZ); + return usec_ts ? tcp_clock_us() : tcp_clock_ms(); } -/* Convert a nsec timestamp into TCP TSval timestamp (ms based currently) */ -static inline u32 tcp_ns_to_ts(u64 ns) +static inline u32 tcp_time_stamp_ms(const struct tcp_sock *tp) { - return div_u64(ns, NSEC_PER_SEC / TCP_TS_HZ); + return div_u64(tp->tcp_mstamp, USEC_PER_MSEC); } -/* Could use tcp_clock_us() / 1000, but this version uses a single divide */ -static inline u32 tcp_time_stamp_raw(void) +static inline u32 tcp_time_stamp_ts(const struct tcp_sock *tp) { - return tcp_ns_to_ts(tcp_clock_ns()); + if (tp->tcp_usec_ts) + return tp->tcp_mstamp; + return tcp_time_stamp_ms(tp); } void tcp_mstamp_refresh(struct tcp_sock *tp); @@ -814,30 +956,75 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) return max_t(s64, t1 - t0, 0); } -static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) -{ - return tcp_ns_to_ts(skb->skb_mstamp_ns); -} - /* provide the departure time in us unit */ static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) { return div_u64(skb->skb_mstamp_ns, NSEC_PER_USEC); } +/* Provide skb TSval in usec or ms unit */ +static inline u32 tcp_skb_timestamp_ts(bool usec_ts, const struct sk_buff *skb) +{ + if (usec_ts) + return tcp_skb_timestamp_us(skb); -#define tcp_flag_byte(th) (((u_int8_t *)th)[13]) + return div_u64(skb->skb_mstamp_ns, NSEC_PER_MSEC); +} + +static inline u32 tcp_tw_tsval(const struct tcp_timewait_sock *tcptw) +{ + return tcp_clock_ts(tcptw->tw_sk.tw_usec_ts) + tcptw->tw_ts_offset; +} -#define TCPHDR_FIN 0x01 -#define TCPHDR_SYN 0x02 -#define TCPHDR_RST 0x04 -#define TCPHDR_PSH 0x08 -#define TCPHDR_ACK 0x10 -#define TCPHDR_URG 0x20 -#define TCPHDR_ECE 0x40 -#define TCPHDR_CWR 0x80 +static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq) +{ + return tcp_clock_ts(treq->req_usec_ts) + treq->ts_off; +} + +#define tcp_flag_byte(th) (((u_int8_t *)th)[13]) +#define TCPHDR_FIN BIT(0) +#define TCPHDR_SYN BIT(1) +#define TCPHDR_RST BIT(2) +#define TCPHDR_PSH BIT(3) +#define TCPHDR_ACK BIT(4) +#define TCPHDR_URG BIT(5) +#define TCPHDR_ECE BIT(6) +#define TCPHDR_CWR BIT(7) +#define TCPHDR_AE BIT(8) +#define TCPHDR_FLAGS_MASK (TCPHDR_FIN | TCPHDR_SYN | TCPHDR_RST | \ + TCPHDR_PSH | TCPHDR_ACK | TCPHDR_URG | \ + TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE) +#define tcp_flags_ntohs(th) (ntohs(*(__be16 *)&tcp_flag_word(th)) & \ + TCPHDR_FLAGS_MASK) + +#define TCPHDR_ACE (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE) #define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR) +#define TCPHDR_SYNACK_ACCECN (TCPHDR_SYN | TCPHDR_ACK | TCPHDR_CWR) + +#define TCP_ACCECN_CEP_ACE_MASK 0x7 +#define TCP_ACCECN_ACE_MAX_DELTA 6 + +/* To avoid/detect middlebox interference, not all counters start at 0. + * See draft-ietf-tcpm-accurate-ecn for the latest values. + */ +#define TCP_ACCECN_CEP_INIT_OFFSET 5 +#define TCP_ACCECN_E1B_INIT_OFFSET 1 +#define TCP_ACCECN_E0B_INIT_OFFSET 1 +#define TCP_ACCECN_CEB_INIT_OFFSET 0 + +/* State flags for sacked in struct tcp_skb_cb */ +enum tcp_skb_cb_sacked_flags { + TCPCB_SACKED_ACKED = (1 << 0), /* SKB ACK'd by a SACK block */ + TCPCB_SACKED_RETRANS = (1 << 1), /* SKB retransmitted */ + TCPCB_LOST = (1 << 2), /* SKB is lost */ + TCPCB_TAGBITS = (TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS | + TCPCB_LOST), /* All tag bits */ + TCPCB_REPAIRED = (1 << 4), /* SKB repaired (no skb_mstamp_ns) */ + TCPCB_EVER_RETRANS = (1 << 7), /* Ever retransmitted frame */ + TCPCB_RETRANS = (TCPCB_SACKED_RETRANS | TCPCB_EVER_RETRANS | + TCPCB_REPAIRED), +}; /* This is what the send packet queuing engine uses to pass * TCP per-packet control information to the transmission code. @@ -849,35 +1036,25 @@ struct tcp_skb_cb { __u32 seq; /* Starting sequence number */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */ union { - /* Note : tcp_tw_isn is used in input path only - * (isn chosen by tcp_timewait_state_process()) - * + /* Note : * tcp_gso_segs/size are used in write queue only, * cf tcp_skb_pcount()/tcp_skb_mss() */ - __u32 tcp_tw_isn; struct { u16 tcp_gso_segs; u16 tcp_gso_size; }; }; - __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ + __u16 tcp_flags; /* TCP header flags (tcp[12-13])*/ __u8 sacked; /* State flags for SACK. */ -#define TCPCB_SACKED_ACKED 0x01 /* SKB ACK'd by a SACK block */ -#define TCPCB_SACKED_RETRANS 0x02 /* SKB retransmitted */ -#define TCPCB_LOST 0x04 /* SKB is lost */ -#define TCPCB_TAGBITS 0x07 /* All tag bits */ -#define TCPCB_REPAIRED 0x10 /* SKB repaired (no skb_mstamp_ns) */ -#define TCPCB_EVER_RETRANS 0x80 /* Ever retransmitted frame */ -#define TCPCB_RETRANS (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS| \ - TCPCB_REPAIRED) - __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */ - __u8 txstamp_ack:1, /* Record TX timestamp for ack? */ +#define TSTAMP_ACK_SK 0x1 +#define TSTAMP_ACK_BPF 0x2 + __u8 txstamp_ack:2, /* Record TX timestamp for ack? */ eor:1, /* Is skb MSG_EOR marked? */ has_rxtstamp:1, /* SKB has a RX timestamp */ - unused:5; + unused:4; __u32 ack_seq; /* Sequence number ACK'd */ union { struct { @@ -982,9 +1159,18 @@ static inline bool tcp_skb_can_collapse_to(const struct sk_buff *skb) static inline bool tcp_skb_can_collapse(const struct sk_buff *to, const struct sk_buff *from) { + /* skb_cmp_decrypted() not needed, use tcp_write_collapse_fence() */ return likely(tcp_skb_can_collapse_to(to) && mptcp_skb_can_collapse(to, from) && - skb_pure_zcopy_same(to, from)); + skb_pure_zcopy_same(to, from) && + skb_frags_readable(to) == skb_frags_readable(from)); +} + +static inline bool tcp_skb_can_collapse_rx(const struct sk_buff *to, + const struct sk_buff *from) +{ + return likely(mptcp_skb_can_collapse(to, from) && + !skb_cmp_decrypted(to, from)); } /* Events passed to congestion control interface */ @@ -1014,9 +1200,9 @@ enum tcp_ca_ack_event_flags { #define TCP_CA_UNSPEC 0 /* Algorithm can be set on socket without CAP_NET_ADMIN privileges */ -#define TCP_CONG_NON_RESTRICTED 0x1 +#define TCP_CONG_NON_RESTRICTED BIT(0) /* Requires ECN/ECT set on all packets */ -#define TCP_CONG_NEEDS_ECN 0x2 +#define TCP_CONG_NEEDS_ECN BIT(1) #define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) union tcp_cc_info; @@ -1081,7 +1267,7 @@ struct tcp_congestion_ops { /* call when packets are delivered to update cwnd and pacing rate, * after all the ca_state processing. (optional) */ - void (*cong_control)(struct sock *sk, const struct rate_sample *rs); + void (*cong_control)(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs); /* new value of cwnd after loss (required) */ @@ -1132,7 +1318,7 @@ extern struct tcp_congestion_ops tcp_reno; struct tcp_congestion_ops *tcp_ca_find(const char *name); struct tcp_congestion_ops *tcp_ca_find_key(u32 key); -u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca); +u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca); #ifdef CONFIG_INET char *tcp_ca_get_name_by_key(u32 key, char *buffer); #else @@ -1330,10 +1516,12 @@ static inline unsigned long tcp_pacing_delay(const struct sock *sk) static inline void tcp_reset_xmit_timer(struct sock *sk, const int what, unsigned long when, - const unsigned long max_when) + bool pace_delay) { - inet_csk_reset_xmit_timer(sk, what, when + tcp_pacing_delay(sk), - max_when); + if (pace_delay) + when += tcp_pacing_delay(sk); + inet_csk_reset_xmit_timer(sk, what, when, + tcp_rto_max(sk)); } /* Something is really bad, we could not queue an additional packet, @@ -1362,7 +1550,7 @@ static inline void tcp_check_probe_timer(struct sock *sk) { if (!tcp_sk(sk)->packets_out && !inet_csk(sk)->icsk_pending) tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - tcp_probe0_base(sk), TCP_RTO_MAX); + tcp_probe0_base(sk), true); } static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq) @@ -1394,7 +1582,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason); -int tcp_filter(struct sock *sk, struct sk_buff *skb); +int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason); void tcp_set_state(struct sock *sk, int state); void tcp_done(struct sock *sk); int tcp_abort(struct sock *sk, int err); @@ -1453,13 +1641,14 @@ static inline int tcp_space_from_win(const struct sock *sk, int win) return __tcp_space_from_win(tcp_sk(sk)->scaling_ratio, win); } +/* Assume a 50% default for skb->len/skb->truesize ratio. + * This may be adjusted later in tcp_measure_rcv_mss(). + */ +#define TCP_DEFAULT_SCALING_RATIO (1 << (TCP_RMEM_TO_WIN_SCALE - 1)) + static inline void tcp_scaling_ratio_init(struct sock *sk) { - /* Assume a conservative default of 1200 bytes of payload per 4K page. - * This may be adjusted later in tcp_measure_rcv_mss(). - */ - tcp_sk(sk)->scaling_ratio = (1200 << TCP_RMEM_TO_WIN_SCALE) / - SKB_TRUESIZE(4096); + tcp_sk(sk)->scaling_ratio = TCP_DEFAULT_SCALING_RATIO; } /* Note: caller must be prepared to deal with negative returns */ @@ -1475,17 +1664,22 @@ static inline int tcp_full_space(const struct sock *sk) return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf)); } -static inline void tcp_adjust_rcv_ssthresh(struct sock *sk) +static inline void __tcp_adjust_rcv_ssthresh(struct sock *sk, u32 new_ssthresh) { int unused_mem = sk_unused_reserved_mem(sk); struct tcp_sock *tp = tcp_sk(sk); - tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); + tp->rcv_ssthresh = min(tp->rcv_ssthresh, new_ssthresh); if (unused_mem) tp->rcv_ssthresh = max_t(u32, tp->rcv_ssthresh, tcp_win_from_space(sk, unused_mem)); } +static inline void tcp_adjust_rcv_ssthresh(struct sock *sk) +{ + __tcp_adjust_rcv_ssthresh(sk, 4U * tcp_sk(sk)->advmss); +} + void tcp_cleanup_rbuf(struct sock *sk, int copied); void __tcp_cleanup_rbuf(struct sock *sk, int copied); @@ -1590,7 +1784,7 @@ static inline bool tcp_paws_check(const struct tcp_options_received *rx_opt, if ((s32)(rx_opt->ts_recent - rx_opt->rcv_tsval) <= paws_win) return true; if (unlikely(!time_before32(ktime_get_seconds(), - rx_opt->ts_recent_stamp + TCP_PAWS_24DAYS))) + rx_opt->ts_recent_stamp + TCP_PAWS_WRAP))) return true; /* * Some OSes send SYN and SYNACK messages with tsval=0 tsecr=0, @@ -1626,6 +1820,40 @@ static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt, return true; } +static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) +{ + u32 ace; + + /* mptcp hooks are only on the slow path */ + if (sk_is_mptcp((struct sock *)tp)) + return; + + ace = tcp_ecn_mode_accecn(tp) ? + ((tp->delivered_ce + TCP_ACCECN_CEP_INIT_OFFSET) & + TCP_ACCECN_CEP_ACE_MASK) : 0; + + tp->pred_flags = htonl((tp->tcp_header_len << 26) | + (ace << 22) | + ntohl(TCP_FLAG_ACK) | + snd_wnd); +} + +static inline void tcp_fast_path_on(struct tcp_sock *tp) +{ + __tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale); +} + +static inline void tcp_fast_path_check(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (RB_EMPTY_ROOT(&tp->out_of_order_queue) && + tp->rcv_wnd && + atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf && + !tp->urg_data) + tcp_fast_path_on(tp); +} + bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, int mib_idx, u32 *last_oow_ack_time); @@ -1639,23 +1867,12 @@ static inline void tcp_mib_init(struct net *net) } /* from STCP */ -static inline void tcp_clear_retrans_hints_partial(struct tcp_sock *tp) -{ - tp->lost_skb_hint = NULL; -} - static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp) { - tcp_clear_retrans_hints_partial(tp); tp->retransmit_skb_hint = NULL; } -union tcp_md5_addr { - struct in_addr a4; -#if IS_ENABLED(CONFIG_IPV6) - struct in6_addr a6; -#endif -}; +#define tcp_md5_addr tcp_ao_addr /* - key database */ struct tcp_md5sig_key { @@ -1692,22 +1909,42 @@ struct tcp6_pseudohdr { __be32 protocol; /* including padding */ }; -union tcp_md5sum_block { - struct tcp4_pseudohdr ip4; -#if IS_ENABLED(CONFIG_IPV6) - struct tcp6_pseudohdr ip6; -#endif +/* + * struct tcp_sigpool - per-CPU pool of ahash_requests + * @scratch: per-CPU temporary area, that can be used between + * tcp_sigpool_start() and tcp_sigpool_end() to perform + * crypto request + * @req: pre-allocated ahash request + */ +struct tcp_sigpool { + void *scratch; + struct ahash_request *req; }; -/* - pool: digest algorithm, hash description and scratch buffer */ -struct tcp_md5sig_pool { - struct ahash_request *md5_req; - void *scratch; -}; +int tcp_sigpool_alloc_ahash(const char *alg, size_t scratch_size); +void tcp_sigpool_get(unsigned int id); +void tcp_sigpool_release(unsigned int id); +int tcp_sigpool_hash_skb_data(struct tcp_sigpool *hp, + const struct sk_buff *skb, + unsigned int header_len); +/** + * tcp_sigpool_start - disable bh and start using tcp_sigpool_ahash + * @id: tcp_sigpool that was previously allocated by tcp_sigpool_alloc_ahash() + * @c: returned tcp_sigpool for usage (uninitialized on failure) + * + * Returns: 0 on success, error otherwise. + */ +int tcp_sigpool_start(unsigned int id, struct tcp_sigpool *c); +/** + * tcp_sigpool_end - enable bh and stop using tcp_sigpool + * @c: tcp_sigpool context that was returned by tcp_sigpool_start() + */ +void tcp_sigpool_end(struct tcp_sigpool *c); +size_t tcp_sigpool_algo(unsigned int id, char *buf, size_t buf_len); /* - functions */ -int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, - const struct sock *sk, const struct sk_buff *skb); +void tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, + const struct sock *sk, const struct sk_buff *skb); int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, u8 flags, const u8 *newkey, u8 newkeylen); @@ -1717,31 +1954,34 @@ int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, u8 flags); +void tcp_clear_md5_list(struct sock *sk); struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, const struct sock *addr_sk); #ifdef CONFIG_TCP_MD5SIG -#include <linux/jump_label.h> -extern struct static_key_false_deferred tcp_md5_needed; struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, - int family); + int family, bool any_l3index); static inline struct tcp_md5sig_key * tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, int family) { if (!static_branch_unlikely(&tcp_md5_needed.key)) return NULL; - return __tcp_md5_do_lookup(sk, l3index, addr, family); + return __tcp_md5_do_lookup(sk, l3index, addr, family, false); } -enum skb_drop_reason -tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, - const void *saddr, const void *daddr, - int family, int dif, int sdif); - +static inline struct tcp_md5sig_key * +tcp_md5_do_lookup_any_l3index(const struct sock *sk, + const union tcp_md5_addr *addr, int family) +{ + if (!static_branch_unlikely(&tcp_md5_needed.key)) + return NULL; + return __tcp_md5_do_lookup(sk, 0, addr, family, true); +} #define tcp_twsk_md5_key(twsk) ((twsk)->tw_md5_key) +void tcp_md5_destruct_sock(struct sock *sk); #else static inline struct tcp_md5sig_key * tcp_md5_do_lookup(const struct sock *sk, int l3index, @@ -1750,28 +1990,23 @@ tcp_md5_do_lookup(const struct sock *sk, int l3index, return NULL; } -static inline enum skb_drop_reason -tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, - const void *saddr, const void *daddr, - int family, int dif, int sdif) +static inline struct tcp_md5sig_key * +tcp_md5_do_lookup_any_l3index(const struct sock *sk, + const union tcp_md5_addr *addr, int family) { - return SKB_NOT_DROPPED_YET; + return NULL; } -#define tcp_twsk_md5_key(twsk) NULL -#endif - -bool tcp_alloc_md5sig_pool(void); -struct tcp_md5sig_pool *tcp_get_md5sig_pool(void); -static inline void tcp_put_md5sig_pool(void) +#define tcp_twsk_md5_key(twsk) NULL +static inline void tcp_md5_destruct_sock(struct sock *sk) { - local_bh_enable(); } +#endif -int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *, const struct sk_buff *, - unsigned int header_len); -int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, - const struct tcp_md5sig_key *key); +struct md5_ctx; +void tcp_md5_hash_skb_data(struct md5_ctx *ctx, const struct sk_buff *skb, + unsigned int header_len); +void tcp_md5_hash_key(struct md5_ctx *ctx, const struct tcp_md5sig_key *key); /* From tcp_fastopen.c */ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, @@ -1973,6 +2208,14 @@ static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct soc tcp_wmem_free_skb(sk, skb); } +static inline void tcp_write_collapse_fence(struct sock *sk) +{ + struct sk_buff *skb = tcp_write_queue_tail(sk); + + if (skb) + TCP_SKB_CB(skb)->eor = 1; +} + static inline void tcp_push_pending_frames(struct sock *sk) { if (tcp_send_head(sk)) { @@ -2070,12 +2313,18 @@ void tcp_v4_destroy_sock(struct sock *sk); struct sk_buff *tcp_gso_segment(struct sk_buff *skb, netdev_features_t features); -struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb); +struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th); +struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb, + struct tcphdr *th); INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *skb, int thoff)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)); INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *skb, int thoff)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb)); +#ifdef CONFIG_INET void tcp_gro_complete(struct sk_buff *skb); +#else +static inline void tcp_gro_complete(struct sk_buff *skb) { } +#endif void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr); @@ -2106,7 +2355,7 @@ struct tcp_sock_af_ops { #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *(*md5_lookup) (const struct sock *sk, const struct sock *addr_sk); - int (*calc_md5_hash)(char *location, + void (*calc_md5_hash)(char *location, const struct tcp_md5sig_key *md5, const struct sock *sk, const struct sk_buff *skb); @@ -2115,6 +2364,18 @@ struct tcp_sock_af_ops { sockptr_t optval, int optlen); #endif +#ifdef CONFIG_TCP_AO + int (*ao_parse)(struct sock *sk, int optname, sockptr_t optval, int optlen); + struct tcp_ao_key *(*ao_lookup)(const struct sock *sk, + struct sock *addr_sk, + int sndid, int rcvid); + int (*ao_calc_key_sk)(struct tcp_ao_key *mkt, u8 *key, + const struct sock *sk, + __be32 sisn, __be32 disn, bool send); + int (*calc_ao_hash)(char *location, struct tcp_ao_key *ao, + const struct sock *sk, const struct sk_buff *skb, + const u8 *tkey, int hash_offset, u32 sne); +#endif }; struct tcp_request_sock_ops { @@ -2122,11 +2383,20 @@ struct tcp_request_sock_ops { #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *(*req_md5_lookup)(const struct sock *sk, const struct sock *addr_sk); - int (*calc_md5_hash) (char *location, + void (*calc_md5_hash) (char *location, const struct tcp_md5sig_key *md5, const struct sock *sk, const struct sk_buff *skb); #endif +#ifdef CONFIG_TCP_AO + struct tcp_ao_key *(*ao_lookup)(const struct sock *sk, + struct request_sock *req, + int sndid, int rcvid); + int (*ao_calc_key)(struct tcp_ao_key *mkt, u8 *key, struct request_sock *sk); + int (*ao_synack_hash)(char *ao_hash, struct tcp_ao_key *mkt, + struct request_sock *req, const struct sk_buff *skb, + int hash_offset, u32 sne); +#endif #ifdef CONFIG_SYN_COOKIES __u32 (*cookie_init_seq)(const struct sk_buff *skb, __u16 *mss); @@ -2134,7 +2404,8 @@ struct tcp_request_sock_ops { struct dst_entry *(*route_req)(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, - struct request_sock *req); + struct request_sock *req, + u32 tw_isn); u32 (*init_seq)(const struct sk_buff *skb); u32 (*init_ts_off)(const struct net *net, const struct sk_buff *skb); int (*send_synack)(const struct sock *sk, struct dst_entry *dst, @@ -2167,6 +2438,70 @@ static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops, } #endif +struct tcp_key { + union { + struct { + struct tcp_ao_key *ao_key; + char *traffic_key; + u32 sne; + u8 rcv_next; + }; + struct tcp_md5sig_key *md5_key; + }; + enum { + TCP_KEY_NONE = 0, + TCP_KEY_MD5, + TCP_KEY_AO, + } type; +}; + +static inline void tcp_get_current_key(const struct sock *sk, + struct tcp_key *out) +{ +#if defined(CONFIG_TCP_AO) || defined(CONFIG_TCP_MD5SIG) + const struct tcp_sock *tp = tcp_sk(sk); +#endif + +#ifdef CONFIG_TCP_AO + if (static_branch_unlikely(&tcp_ao_needed.key)) { + struct tcp_ao_info *ao; + + ao = rcu_dereference_protected(tp->ao_info, + lockdep_sock_is_held(sk)); + if (ao) { + out->ao_key = READ_ONCE(ao->current_key); + out->type = TCP_KEY_AO; + return; + } + } +#endif +#ifdef CONFIG_TCP_MD5SIG + if (static_branch_unlikely(&tcp_md5_needed.key) && + rcu_access_pointer(tp->md5sig_info)) { + out->md5_key = tp->af_specific->md5_lookup(sk, sk); + if (out->md5_key) { + out->type = TCP_KEY_MD5; + return; + } + } +#endif + out->type = TCP_KEY_NONE; +} + +static inline bool tcp_key_is_md5(const struct tcp_key *key) +{ + if (static_branch_tcp_md5()) + return key->type == TCP_KEY_MD5; + return false; +} + +static inline bool tcp_key_is_ao(const struct tcp_key *key) +{ + if (static_branch_tcp_ao()) + return key->type == TCP_KEY_AO; + return false; +} + int tcpv4_offload_init(void); void tcp_v4_init(void); @@ -2211,14 +2546,35 @@ void tcp_plb_update_state(const struct sock *sk, struct tcp_plb_state *plb, void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb); void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb); +static inline void tcp_warn_once(const struct sock *sk, bool cond, const char *str) +{ + WARN_ONCE(cond, + "%scwn:%u out:%u sacked:%u lost:%u retrans:%u tlp_high_seq:%u sk_state:%u ca_state:%u advmss:%u mss_cache:%u pmtu:%u\n", + str, + tcp_snd_cwnd(tcp_sk(sk)), + tcp_sk(sk)->packets_out, tcp_sk(sk)->sacked_out, + tcp_sk(sk)->lost_out, tcp_sk(sk)->retrans_out, + tcp_sk(sk)->tlp_high_seq, sk->sk_state, + inet_csk(sk)->icsk_ca_state, + tcp_sk(sk)->advmss, tcp_sk(sk)->mss_cache, + inet_csk(sk)->icsk_pmtu_cookie); +} + /* At how many usecs into the future should the RTO fire? */ static inline s64 tcp_rto_delta_us(const struct sock *sk) { const struct sk_buff *skb = tcp_rtx_queue_head(sk); u32 rto = inet_csk(sk)->icsk_rto; - u64 rto_time_stamp_us = tcp_skb_timestamp_us(skb) + jiffies_to_usecs(rto); - return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp; + if (likely(skb)) { + u64 rto_time_stamp_us = tcp_skb_timestamp_us(skb) + jiffies_to_usecs(rto); + + return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp; + } else { + tcp_warn_once(sk, 1, "rtx queue empty: "); + return jiffies_to_usecs(rto); + } + } /* @@ -2306,7 +2662,7 @@ static inline void tcp_segs_in(struct tcp_sock *tp, const struct sk_buff *skb) */ static inline void tcp_listendrop(const struct sock *sk) { - atomic_inc(&((struct sock *)sk)->sk_drops); + sk_drops_inc((struct sock *)sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS); } @@ -2331,8 +2687,8 @@ struct tcp_ulp_ops { /* cleanup ulp */ void (*release)(struct sock *sk); /* diagnostic */ - int (*get_info)(const struct sock *sk, struct sk_buff *skb); - size_t (*get_info_size)(const struct sock *sk); + int (*get_info)(struct sock *sk, struct sk_buff *skb, bool net_admin); + size_t (*get_info_size)(const struct sock *sk, bool net_admin); /* clone ulp */ void (*clone)(const struct request_sock *req, struct sock *newsk, const gfp_t priority); @@ -2349,8 +2705,8 @@ void tcp_update_ulp(struct sock *sk, struct proto *p, void (*write_space)(struct sock *sk)); #define MODULE_ALIAS_TCP_ULP(name) \ - __MODULE_INFO(alias, alias_userspace, name); \ - __MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name) + MODULE_INFO(alias, name); \ + MODULE_INFO(alias, "tcp-ulp-" name) #ifdef CONFIG_NET_SOCK_MSG struct sk_msg; @@ -2359,6 +2715,11 @@ struct sk_psock; #ifdef CONFIG_BPF_SYSCALL int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); +#ifdef CONFIG_BPF_STREAM_PARSER +struct strparser; +int tcp_bpf_strp_read_sock(struct strparser *strp, read_descriptor_t *desc, + sk_read_actor_t recv_actor); +#endif /* CONFIG_BPF_STREAM_PARSER */ #endif /* CONFIG_BPF_SYSCALL */ #ifdef CONFIG_INET @@ -2409,6 +2770,7 @@ static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args) memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); if (sk_fullsock(sk)) { sock_ops.is_fullsock = 1; + sock_ops.is_locked_tcp_sock = 1; sock_owned_by_me(sk); } @@ -2486,10 +2848,10 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk) return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1); } -static inline void tcp_bpf_rtt(struct sock *sk) +static inline void tcp_bpf_rtt(struct sock *sk, long mrtt, u32 srtt) { if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_RTT_CB_FLAG)) - tcp_call_bpf(sk, BPF_SOCK_OPS_RTT_CB, 0, NULL); + tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_RTT_CB, mrtt, srtt); } #if IS_ENABLED(CONFIG_SMC) @@ -2497,9 +2859,9 @@ extern struct static_key_false tcp_have_smc; #endif #if IS_ENABLED(CONFIG_TLS_DEVICE) -void clean_acked_data_enable(struct inet_connection_sock *icsk, +void clean_acked_data_enable(struct tcp_sock *tp, void (*cad)(struct sock *sk, u32 ack_seq)); -void clean_acked_data_disable(struct inet_connection_sock *icsk); +void clean_acked_data_disable(struct tcp_sock *tp); void clean_acked_data_flush(void); #endif @@ -2525,4 +2887,59 @@ static inline u64 tcp_transmit_time(const struct sock *sk) return 0; } +static inline int tcp_parse_auth_options(const struct tcphdr *th, + const u8 **md5_hash, const struct tcp_ao_hdr **aoh) +{ + const u8 *md5_tmp, *ao_tmp; + int ret; + + ret = tcp_do_parse_auth_options(th, &md5_tmp, &ao_tmp); + if (ret) + return ret; + + if (md5_hash) + *md5_hash = md5_tmp; + + if (aoh) { + if (!ao_tmp) + *aoh = NULL; + else + *aoh = (struct tcp_ao_hdr *)(ao_tmp - 2); + } + + return 0; +} + +static inline bool tcp_ao_required(struct sock *sk, const void *saddr, + int family, int l3index, bool stat_inc) +{ +#ifdef CONFIG_TCP_AO + struct tcp_ao_info *ao_info; + struct tcp_ao_key *ao_key; + + if (!static_branch_unlikely(&tcp_ao_needed.key)) + return false; + + ao_info = rcu_dereference_check(tcp_sk(sk)->ao_info, + lockdep_sock_is_held(sk)); + if (!ao_info) + return false; + + ao_key = tcp_ao_do_lookup(sk, l3index, saddr, family, -1, -1); + if (ao_info->ao_required || ao_key) { + if (stat_inc) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOREQUIRED); + atomic64_inc(&ao_info->counters.ao_required); + } + return true; + } +#endif + return false; +} + +enum skb_drop_reason tcp_inbound_hash(struct sock *sk, + const struct request_sock *req, const struct sk_buff *skb, + const void *saddr, const void *daddr, + int family, int dif, int sdif); + #endif /* _TCP_H */ diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h new file mode 100644 index 000000000000..1e9e27d6e06b --- /dev/null +++ b/include/net/tcp_ao.h @@ -0,0 +1,356 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _TCP_AO_H +#define _TCP_AO_H + +#define TCP_AO_KEY_ALIGN 1 +#define __tcp_ao_key_align __aligned(TCP_AO_KEY_ALIGN) + +union tcp_ao_addr { + struct in_addr a4; +#if IS_ENABLED(CONFIG_IPV6) + struct in6_addr a6; +#endif +}; + +struct tcp_ao_hdr { + u8 kind; + u8 length; + u8 keyid; + u8 rnext_keyid; +}; + +static inline u8 tcp_ao_hdr_maclen(const struct tcp_ao_hdr *aoh) +{ + return aoh->length - sizeof(struct tcp_ao_hdr); +} + +struct tcp_ao_counters { + atomic64_t pkt_good; + atomic64_t pkt_bad; + atomic64_t key_not_found; + atomic64_t ao_required; + atomic64_t dropped_icmp; +}; + +struct tcp_ao_key { + struct hlist_node node; + union tcp_ao_addr addr; + u8 key[TCP_AO_MAXKEYLEN] __tcp_ao_key_align; + unsigned int tcp_sigpool_id; + unsigned int digest_size; + int l3index; + u8 prefixlen; + u8 family; + u8 keylen; + u8 keyflags; + u8 sndid; + u8 rcvid; + u8 maclen; + struct rcu_head rcu; + atomic64_t pkt_good; + atomic64_t pkt_bad; + u8 traffic_keys[]; +}; + +static inline u8 *rcv_other_key(struct tcp_ao_key *key) +{ + return key->traffic_keys; +} + +static inline u8 *snd_other_key(struct tcp_ao_key *key) +{ + return key->traffic_keys + key->digest_size; +} + +static inline int tcp_ao_maclen(const struct tcp_ao_key *key) +{ + return key->maclen; +} + +/* Use tcp_ao_len_aligned() for TCP header calculations */ +static inline int tcp_ao_len(const struct tcp_ao_key *key) +{ + return tcp_ao_maclen(key) + sizeof(struct tcp_ao_hdr); +} + +static inline int tcp_ao_len_aligned(const struct tcp_ao_key *key) +{ + return round_up(tcp_ao_len(key), 4); +} + +static inline unsigned int tcp_ao_digest_size(struct tcp_ao_key *key) +{ + return key->digest_size; +} + +static inline int tcp_ao_sizeof_key(const struct tcp_ao_key *key) +{ + return sizeof(struct tcp_ao_key) + (key->digest_size << 1); +} + +struct tcp_ao_info { + /* List of tcp_ao_key's */ + struct hlist_head head; + /* current_key and rnext_key are maintained on sockets + * in TCP_AO_ESTABLISHED states. + * Their purpose is to cache keys on established connections, + * saving needless lookups. Never dereference any of them from + * listen sockets. + * ::current_key may change in RX to the key that was requested by + * the peer, please use READ_ONCE()/WRITE_ONCE() in order to avoid + * load/store tearing. + * Do the same for ::rnext_key, if you don't hold socket lock + * (it's changed only by userspace request in setsockopt()). + */ + struct tcp_ao_key *current_key; + struct tcp_ao_key *rnext_key; + struct tcp_ao_counters counters; + u32 ao_required :1, + accept_icmps :1, + __unused :30; + __be32 lisn; + __be32 risn; + /* Sequence Number Extension (SNE) are upper 4 bytes for SEQ, + * that protect TCP-AO connection from replayed old TCP segments. + * See RFC5925 (6.2). + * In order to get correct SNE, there's a helper tcp_ao_compute_sne(). + * It needs SEQ basis to understand whereabouts are lower SEQ numbers. + * According to that basis vector, it can provide incremented SNE + * when SEQ rolls over or provide decremented SNE when there's + * a retransmitted segment from before-rolling over. + * - for request sockets such basis is rcv_isn/snt_isn, which seems + * good enough as it's unexpected to receive 4 Gbytes on reqsk. + * - for full sockets the basis is rcv_nxt/snd_una. snd_una is + * taken instead of snd_nxt as currently it's easier to track + * in tcp_snd_una_update(), rather than updating SNE in all + * WRITE_ONCE(tp->snd_nxt, ...) + * - for time-wait sockets the basis is tw_rcv_nxt/tw_snd_nxt. + * tw_snd_nxt is not expected to change, while tw_rcv_nxt may. + */ + u32 snd_sne; + u32 rcv_sne; + refcount_t refcnt; /* Protects twsk destruction */ +}; + +#ifdef CONFIG_TCP_MD5SIG +#include <linux/jump_label.h> +extern struct static_key_false_deferred tcp_md5_needed; +#define static_branch_tcp_md5() static_branch_unlikely(&tcp_md5_needed.key) +#else +#define static_branch_tcp_md5() false +#endif +#ifdef CONFIG_TCP_AO +/* TCP-AO structures and functions */ +#include <linux/jump_label.h> +extern struct static_key_false_deferred tcp_ao_needed; +#define static_branch_tcp_ao() static_branch_unlikely(&tcp_ao_needed.key) +#else +#define static_branch_tcp_ao() false +#endif + +#ifdef CONFIG_TCP_AO +/* TCP-AO structures and functions */ +struct tcp4_ao_context { + __be32 saddr; + __be32 daddr; + __be16 sport; + __be16 dport; + __be32 sisn; + __be32 disn; +}; + +struct tcp6_ao_context { + struct in6_addr saddr; + struct in6_addr daddr; + __be16 sport; + __be16 dport; + __be32 sisn; + __be32 disn; +}; + +struct tcp_sigpool; +/* Established states are fast-path and there always is current_key/rnext_key */ +#define TCP_AO_ESTABLISHED (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | \ + TCPF_CLOSE_WAIT | TCPF_LAST_ACK | TCPF_CLOSING) + +int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb, + struct tcp_ao_key *key, struct tcphdr *th, + __u8 *hash_location); +int tcp_ao_hash_skb(unsigned short int family, + char *ao_hash, struct tcp_ao_key *key, + const struct sock *sk, const struct sk_buff *skb, + const u8 *tkey, int hash_offset, u32 sne); +int tcp_parse_ao(struct sock *sk, int cmd, unsigned short int family, + sockptr_t optval, int optlen); +struct tcp_ao_key *tcp_ao_established_key(const struct sock *sk, + struct tcp_ao_info *ao, + int sndid, int rcvid); +int tcp_ao_copy_all_matching(const struct sock *sk, struct sock *newsk, + struct request_sock *req, struct sk_buff *skb, + int family); +int tcp_ao_calc_traffic_key(struct tcp_ao_key *mkt, u8 *key, void *ctx, + unsigned int len, struct tcp_sigpool *hp); +void tcp_ao_destroy_sock(struct sock *sk, bool twsk); +void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, struct tcp_sock *tp); +bool tcp_ao_ignore_icmp(const struct sock *sk, int family, int type, int code); +int tcp_ao_get_mkts(struct sock *sk, sockptr_t optval, sockptr_t optlen); +int tcp_ao_get_sock_info(struct sock *sk, sockptr_t optval, sockptr_t optlen); +int tcp_ao_get_repair(struct sock *sk, sockptr_t optval, sockptr_t optlen); +int tcp_ao_set_repair(struct sock *sk, sockptr_t optval, unsigned int optlen); +enum skb_drop_reason tcp_inbound_ao_hash(struct sock *sk, + const struct sk_buff *skb, unsigned short int family, + const struct request_sock *req, int l3index, + const struct tcp_ao_hdr *aoh); +u32 tcp_ao_compute_sne(u32 next_sne, u32 next_seq, u32 seq); +struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk, int l3index, + const union tcp_ao_addr *addr, + int family, int sndid, int rcvid); +int tcp_ao_hash_hdr(unsigned short family, char *ao_hash, + struct tcp_ao_key *key, const u8 *tkey, + const union tcp_ao_addr *daddr, + const union tcp_ao_addr *saddr, + const struct tcphdr *th, u32 sne); +int tcp_ao_prepare_reset(const struct sock *sk, struct sk_buff *skb, + const struct tcp_ao_hdr *aoh, int l3index, u32 seq, + struct tcp_ao_key **key, char **traffic_key, + bool *allocated_traffic_key, u8 *keyid, u32 *sne); + +/* ipv4 specific functions */ +int tcp_v4_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen); +struct tcp_ao_key *tcp_v4_ao_lookup(const struct sock *sk, struct sock *addr_sk, + int sndid, int rcvid); +int tcp_v4_ao_synack_hash(char *ao_hash, struct tcp_ao_key *mkt, + struct request_sock *req, const struct sk_buff *skb, + int hash_offset, u32 sne); +int tcp_v4_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key, + const struct sock *sk, + __be32 sisn, __be32 disn, bool send); +int tcp_v4_ao_calc_key_rsk(struct tcp_ao_key *mkt, u8 *key, + struct request_sock *req); +struct tcp_ao_key *tcp_v4_ao_lookup_rsk(const struct sock *sk, + struct request_sock *req, + int sndid, int rcvid); +int tcp_v4_ao_hash_skb(char *ao_hash, struct tcp_ao_key *key, + const struct sock *sk, const struct sk_buff *skb, + const u8 *tkey, int hash_offset, u32 sne); +/* ipv6 specific functions */ +int tcp_v6_ao_hash_pseudoheader(struct tcp_sigpool *hp, + const struct in6_addr *daddr, + const struct in6_addr *saddr, int nbytes); +int tcp_v6_ao_calc_key_skb(struct tcp_ao_key *mkt, u8 *key, + const struct sk_buff *skb, __be32 sisn, __be32 disn); +int tcp_v6_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key, + const struct sock *sk, __be32 sisn, + __be32 disn, bool send); +int tcp_v6_ao_calc_key_rsk(struct tcp_ao_key *mkt, u8 *key, + struct request_sock *req); +struct tcp_ao_key *tcp_v6_ao_lookup(const struct sock *sk, + struct sock *addr_sk, int sndid, int rcvid); +struct tcp_ao_key *tcp_v6_ao_lookup_rsk(const struct sock *sk, + struct request_sock *req, + int sndid, int rcvid); +int tcp_v6_ao_hash_skb(char *ao_hash, struct tcp_ao_key *key, + const struct sock *sk, const struct sk_buff *skb, + const u8 *tkey, int hash_offset, u32 sne); +int tcp_v6_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen); +int tcp_v6_ao_synack_hash(char *ao_hash, struct tcp_ao_key *ao_key, + struct request_sock *req, const struct sk_buff *skb, + int hash_offset, u32 sne); +void tcp_ao_established(struct sock *sk); +void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb); +void tcp_ao_connect_init(struct sock *sk); +void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb, + struct request_sock *req, unsigned short int family); +#else /* CONFIG_TCP_AO */ + +static inline int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb, + struct tcp_ao_key *key, struct tcphdr *th, + __u8 *hash_location) +{ + return 0; +} + +static inline void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb, + struct request_sock *req, unsigned short int family) +{ +} + +static inline bool tcp_ao_ignore_icmp(const struct sock *sk, int family, + int type, int code) +{ + return false; +} + +static inline enum skb_drop_reason tcp_inbound_ao_hash(struct sock *sk, + const struct sk_buff *skb, unsigned short int family, + const struct request_sock *req, int l3index, + const struct tcp_ao_hdr *aoh) +{ + return SKB_NOT_DROPPED_YET; +} + +static inline struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk, + int l3index, const union tcp_ao_addr *addr, + int family, int sndid, int rcvid) +{ + return NULL; +} + +static inline void tcp_ao_destroy_sock(struct sock *sk, bool twsk) +{ +} + +static inline void tcp_ao_established(struct sock *sk) +{ +} + +static inline void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb) +{ +} + +static inline void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, + struct tcp_sock *tp) +{ +} + +static inline void tcp_ao_connect_init(struct sock *sk) +{ +} + +static inline int tcp_ao_get_mkts(struct sock *sk, sockptr_t optval, sockptr_t optlen) +{ + return -ENOPROTOOPT; +} + +static inline int tcp_ao_get_sock_info(struct sock *sk, sockptr_t optval, sockptr_t optlen) +{ + return -ENOPROTOOPT; +} + +static inline int tcp_ao_get_repair(struct sock *sk, + sockptr_t optval, sockptr_t optlen) +{ + return -ENOPROTOOPT; +} + +static inline int tcp_ao_set_repair(struct sock *sk, + sockptr_t optval, unsigned int optlen) +{ + return -ENOPROTOOPT; +} +#endif + +#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) +int tcp_do_parse_auth_options(const struct tcphdr *th, + const u8 **md5_hash, const u8 **ao_hash); +#else +static inline int tcp_do_parse_auth_options(const struct tcphdr *th, + const u8 **md5_hash, const u8 **ao_hash) +{ + *md5_hash = NULL; + *ao_hash = NULL; + return 0; +} +#endif + +#endif /* _TCP_AO_H */ diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h new file mode 100644 index 000000000000..f13e5cd2b1ac --- /dev/null +++ b/include/net/tcp_ecn.h @@ -0,0 +1,642 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _TCP_ECN_H +#define _TCP_ECN_H + +#include <linux/tcp.h> +#include <linux/skbuff.h> +#include <linux/bitfield.h> + +#include <net/inet_connection_sock.h> +#include <net/sock.h> +#include <net/tcp.h> +#include <net/inet_ecn.h> + +/* The highest ECN variant (Accurate ECN, ECN, or no ECN) that is + * attemped to be negotiated and requested for incoming connection + * and outgoing connection, respectively. + */ +enum tcp_ecn_mode { + TCP_ECN_IN_NOECN_OUT_NOECN = 0, + TCP_ECN_IN_ECN_OUT_ECN = 1, + TCP_ECN_IN_ECN_OUT_NOECN = 2, + TCP_ECN_IN_ACCECN_OUT_ACCECN = 3, + TCP_ECN_IN_ACCECN_OUT_ECN = 4, + TCP_ECN_IN_ACCECN_OUT_NOECN = 5, +}; + +/* AccECN option sending when AccECN has been successfully negotiated */ +enum tcp_accecn_option { + TCP_ACCECN_OPTION_DISABLED = 0, + TCP_ACCECN_OPTION_MINIMUM = 1, + TCP_ACCECN_OPTION_FULL = 2, +}; + +static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp) +{ + /* Do not set CWR if in AccECN mode! */ + if (tcp_ecn_mode_rfc3168(tp)) + tp->ecn_flags |= TCP_ECN_QUEUE_CWR; +} + +static inline void tcp_ecn_accept_cwr(struct sock *sk, + const struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tcp_ecn_mode_rfc3168(tp) && tcp_hdr(skb)->cwr) { + tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; + + /* If the sender is telling us it has entered CWR, then its + * cwnd may be very low (even just 1 packet), so we should ACK + * immediately. + */ + if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; + } +} + +static inline void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) +{ + tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; +} + +/* tp->accecn_fail_mode */ +#define TCP_ACCECN_ACE_FAIL_SEND BIT(0) +#define TCP_ACCECN_ACE_FAIL_RECV BIT(1) +#define TCP_ACCECN_OPT_FAIL_SEND BIT(2) +#define TCP_ACCECN_OPT_FAIL_RECV BIT(3) + +static inline bool tcp_accecn_ace_fail_send(const struct tcp_sock *tp) +{ + return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_SEND; +} + +static inline bool tcp_accecn_ace_fail_recv(const struct tcp_sock *tp) +{ + return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_RECV; +} + +static inline bool tcp_accecn_opt_fail_send(const struct tcp_sock *tp) +{ + return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_SEND; +} + +static inline bool tcp_accecn_opt_fail_recv(const struct tcp_sock *tp) +{ + return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_RECV; +} + +static inline void tcp_accecn_fail_mode_set(struct tcp_sock *tp, u8 mode) +{ + tp->accecn_fail_mode |= mode; +} + +#define TCP_ACCECN_OPT_NOT_SEEN 0x0 +#define TCP_ACCECN_OPT_EMPTY_SEEN 0x1 +#define TCP_ACCECN_OPT_COUNTER_SEEN 0x2 +#define TCP_ACCECN_OPT_FAIL_SEEN 0x3 + +static inline u8 tcp_accecn_ace(const struct tcphdr *th) +{ + return (th->ae << 2) | (th->cwr << 1) | th->ece; +} + +/* Infer the ECT value our SYN arrived with from the echoed ACE field */ +static inline int tcp_accecn_extract_syn_ect(u8 ace) +{ + /* Below is an excerpt from the 1st block of Table 2 of AccECN spec */ + static const int ace_to_ecn[8] = { + INET_ECN_ECT_0, /* 0b000 (Undefined) */ + INET_ECN_ECT_1, /* 0b001 (Undefined) */ + INET_ECN_NOT_ECT, /* 0b010 (Not-ECT is received) */ + INET_ECN_ECT_1, /* 0b011 (ECT-1 is received) */ + INET_ECN_ECT_0, /* 0b100 (ECT-0 is received) */ + INET_ECN_ECT_1, /* 0b101 (Reserved) */ + INET_ECN_CE, /* 0b110 (CE is received) */ + INET_ECN_ECT_1 /* 0b111 (Undefined) */ + }; + + return ace_to_ecn[ace & 0x7]; +} + +/* Check ECN field transition to detect invalid transitions */ +static inline bool tcp_ect_transition_valid(u8 snt, u8 rcv) +{ + if (rcv == snt) + return true; + + /* Non-ECT altered to something or something became non-ECT */ + if (snt == INET_ECN_NOT_ECT || rcv == INET_ECN_NOT_ECT) + return false; + /* CE -> ECT(0/1)? */ + if (snt == INET_ECN_CE) + return false; + return true; +} + +static inline bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace, + u8 sent_ect) +{ + u8 ect = tcp_accecn_extract_syn_ect(ace); + struct tcp_sock *tp = tcp_sk(sk); + + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) + return true; + + if (!tcp_ect_transition_valid(sent_ect, ect)) { + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV); + return false; + } + + return true; +} + +static inline void tcp_accecn_saw_opt_fail_recv(struct tcp_sock *tp, + u8 saw_opt) +{ + tp->saw_accecn_opt = saw_opt; + if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV); +} + +/* Validate the 3rd ACK based on the ACE field, see Table 4 of AccECN spec */ +static inline void tcp_accecn_third_ack(struct sock *sk, + const struct sk_buff *skb, u8 sent_ect) +{ + u8 ace = tcp_accecn_ace(tcp_hdr(skb)); + struct tcp_sock *tp = tcp_sk(sk); + + switch (ace) { + case 0x0: + /* Invalid value */ + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV); + break; + case 0x7: + case 0x5: + case 0x1: + /* Unused but legal values */ + break; + default: + /* Validation only applies to first non-data packet */ + if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq && + !TCP_SKB_CB(skb)->sacked && + tcp_accecn_validate_syn_feedback(sk, ace, sent_ect)) { + if ((tcp_accecn_extract_syn_ect(ace) == INET_ECN_CE) && + !tp->delivered_ce) + tp->delivered_ce++; + } + break; + } +} + +/* Demand the minimum # to send AccECN optnio */ +static inline void tcp_accecn_opt_demand_min(struct sock *sk, + u8 opt_demand_min) +{ + struct tcp_sock *tp = tcp_sk(sk); + u8 opt_demand; + + opt_demand = max_t(u8, opt_demand_min, tp->accecn_opt_demand); + tp->accecn_opt_demand = opt_demand; +} + +/* Maps IP ECN field ECT/CE code point to AccECN option field number, given + * we are sending fields with Accurate ECN Order 1: ECT(1), CE, ECT(0). + */ +static inline u8 tcp_ecnfield_to_accecn_optfield(u8 ecnfield) +{ + switch (ecnfield & INET_ECN_MASK) { + case INET_ECN_NOT_ECT: + return 0; /* AccECN does not send counts of NOT_ECT */ + case INET_ECN_ECT_1: + return 1; + case INET_ECN_CE: + return 2; + case INET_ECN_ECT_0: + return 3; + } + return 0; +} + +/* Maps IP ECN field ECT/CE code point to AccECN option field value offset. + * Some fields do not start from zero, to detect zeroing by middleboxes. + */ +static inline u32 tcp_accecn_field_init_offset(u8 ecnfield) +{ + switch (ecnfield & INET_ECN_MASK) { + case INET_ECN_NOT_ECT: + return 0; /* AccECN does not send counts of NOT_ECT */ + case INET_ECN_ECT_1: + return TCP_ACCECN_E1B_INIT_OFFSET; + case INET_ECN_CE: + return TCP_ACCECN_CEB_INIT_OFFSET; + case INET_ECN_ECT_0: + return TCP_ACCECN_E0B_INIT_OFFSET; + } + return 0; +} + +/* Maps AccECN option field #nr to IP ECN field ECT/CE bits */ +static inline unsigned int tcp_accecn_optfield_to_ecnfield(unsigned int option, + bool order) +{ + /* Based on Table 5 of the AccECN spec to map (option, order) to + * the corresponding ECN conuters (ECT-1, ECT-0, or CE). + */ + static const u8 optfield_lookup[2][3] = { + /* order = 0: 1st field ECT-0, 2nd field CE, 3rd field ECT-1 */ + { INET_ECN_ECT_0, INET_ECN_CE, INET_ECN_ECT_1 }, + /* order = 1: 1st field ECT-1, 2nd field CE, 3rd field ECT-0 */ + { INET_ECN_ECT_1, INET_ECN_CE, INET_ECN_ECT_0 } + }; + + return optfield_lookup[order][option % 3]; +} + +/* Handles AccECN option ECT and CE 24-bit byte counters update into + * the u32 value in tcp_sock. As we're processing TCP options, it is + * safe to access from - 1. + */ +static inline s32 tcp_update_ecn_bytes(u32 *cnt, const char *from, + u32 init_offset) +{ + u32 truncated = (get_unaligned_be32(from - 1) - init_offset) & + 0xFFFFFFU; + u32 delta = (truncated - *cnt) & 0xFFFFFFU; + + /* If delta has the highest bit set (24th bit) indicating + * negative, sign extend to correct an estimation using + * sign_extend32(delta, 24 - 1) + */ + delta = sign_extend32(delta, 23); + *cnt += delta; + return (s32)delta; +} + +/* Updates Accurate ECN received counters from the received IP ECN field */ +static inline void tcp_ecn_received_counters(struct sock *sk, + const struct sk_buff *skb, u32 len) +{ + u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; + u8 is_ce = INET_ECN_is_ce(ecnfield); + struct tcp_sock *tp = tcp_sk(sk); + bool ecn_edge; + + if (!INET_ECN_is_not_ect(ecnfield)) { + u32 pcount = is_ce * max_t(u16, 1, skb_shinfo(skb)->gso_segs); + + /* As for accurate ECN, the TCP_ECN_SEEN flag is set by + * tcp_ecn_received_counters() when the ECN codepoint of + * received TCP data or ACK contains ECT(0), ECT(1), or CE. + */ + if (!tcp_ecn_mode_rfc3168(tp)) + tp->ecn_flags |= TCP_ECN_SEEN; + + /* ACE counter tracks *all* segments including pure ACKs */ + tp->received_ce += pcount; + tp->received_ce_pending = min(tp->received_ce_pending + pcount, + 0xfU); + + if (len > 0) { + u8 minlen = tcp_ecnfield_to_accecn_optfield(ecnfield); + u32 oldbytes = tp->received_ecn_bytes[ecnfield - 1]; + u32 bytes_mask = GENMASK_U32(31, 22); + + tp->received_ecn_bytes[ecnfield - 1] += len; + tp->accecn_minlen = max_t(u8, tp->accecn_minlen, + minlen); + + /* Send AccECN option at least once per 2^22-byte + * increase in any ECN byte counter. + */ + if ((tp->received_ecn_bytes[ecnfield - 1] ^ oldbytes) & + bytes_mask) { + tcp_accecn_opt_demand_min(sk, 1); + } + } + } + + ecn_edge = tp->prev_ecnfield != ecnfield; + if (ecn_edge || is_ce) { + tp->prev_ecnfield = ecnfield; + /* Demand Accurate ECN change-triggered ACKs. Two ACK are + * demanded to indicate unambiguously the ecnfield value + * in the latter ACK. + */ + if (tcp_ecn_mode_accecn(tp)) { + if (ecn_edge) + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; + tp->accecn_opt_demand = 2; + } + } +} + +/* AccECN specification, 2.2: [...] A Data Receiver maintains four counters + * initialized at the start of the half-connection. [...] These byte counters + * reflect only the TCP payload length, excluding TCP header and TCP options. + */ +static inline void tcp_ecn_received_counters_payload(struct sock *sk, + const struct sk_buff *skb) +{ + const struct tcphdr *th = (const struct tcphdr *)skb->data; + + tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4); +} + +/* AccECN specification, 5.1: [...] a server can determine that it + * negotiated AccECN as [...] if the ACK contains an ACE field with + * the value 0b010 to 0b111 (decimal 2 to 7). + */ +static inline bool cookie_accecn_ok(const struct tcphdr *th) +{ + return tcp_accecn_ace(th) > 0x1; +} + +/* Used to form the ACE flags for SYN/ACK */ +static inline u16 tcp_accecn_reflector_flags(u8 ect) +{ + /* TCP ACE flags of SYN/ACK are set based on IP-ECN received from SYN. + * Below is an excerpt from the 1st block of Table 2 of AccECN spec, + * in which TCP ACE flags are encoded as: (AE << 2) | (CWR << 1) | ECE + */ + static const u8 ecn_to_ace_flags[4] = { + 0b010, /* Not-ECT is received */ + 0b011, /* ECT(1) is received */ + 0b100, /* ECT(0) is received */ + 0b110 /* CE is received */ + }; + + return FIELD_PREP(TCPHDR_ACE, ecn_to_ace_flags[ect & 0x3]); +} + +/* AccECN specification, 3.1.2: If a TCP server that implements AccECN + * receives a SYN with the three TCP header flags (AE, CWR and ECE) set + * to any combination other than 000, 011 or 111, it MUST negotiate the + * use of AccECN as if they had been set to 111. + */ +static inline bool tcp_accecn_syn_requested(const struct tcphdr *th) +{ + u8 ace = tcp_accecn_ace(th); + + return ace && ace != 0x3; +} + +static inline void __tcp_accecn_init_bytes_counters(int *counter_array) +{ + BUILD_BUG_ON(INET_ECN_ECT_1 != 0x1); + BUILD_BUG_ON(INET_ECN_ECT_0 != 0x2); + BUILD_BUG_ON(INET_ECN_CE != 0x3); + + counter_array[INET_ECN_ECT_1 - 1] = 0; + counter_array[INET_ECN_ECT_0 - 1] = 0; + counter_array[INET_ECN_CE - 1] = 0; +} + +static inline void tcp_accecn_init_counters(struct tcp_sock *tp) +{ + tp->received_ce = 0; + tp->received_ce_pending = 0; + __tcp_accecn_init_bytes_counters(tp->received_ecn_bytes); + __tcp_accecn_init_bytes_counters(tp->delivered_ecn_bytes); + tp->accecn_minlen = 0; + tp->accecn_opt_demand = 0; + tp->est_ecnfield = 0; +} + +/* Used for make_synack to form the ACE flags */ +static inline void tcp_accecn_echo_syn_ect(struct tcphdr *th, u8 ect) +{ + /* TCP ACE flags of SYN/ACK are set based on IP-ECN codepoint received + * from SYN. Below is an excerpt from Table 2 of the AccECN spec: + * +====================+====================================+ + * | IP-ECN codepoint | Respective ACE falgs on SYN/ACK | + * | received on SYN | AE CWR ECE | + * +====================+====================================+ + * | Not-ECT | 0 1 0 | + * | ECT(1) | 0 1 1 | + * | ECT(0) | 1 0 0 | + * | CE | 1 1 0 | + * +====================+====================================+ + */ + th->ae = !!(ect & INET_ECN_ECT_0); + th->cwr = ect != INET_ECN_ECT_0; + th->ece = ect == INET_ECN_ECT_1; +} + +static inline void tcp_accecn_set_ace(struct tcp_sock *tp, struct sk_buff *skb, + struct tcphdr *th) +{ + u32 wire_ace; + + /* The final packet of the 3WHS or anything like it must reflect + * the SYN/ACK ECT instead of putting CEP into ACE field, such + * case show up in tcp_flags. + */ + if (likely(!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACE))) { + wire_ace = tp->received_ce + TCP_ACCECN_CEP_INIT_OFFSET; + th->ece = !!(wire_ace & 0x1); + th->cwr = !!(wire_ace & 0x2); + th->ae = !!(wire_ace & 0x4); + tp->received_ce_pending = 0; + } +} + +static inline u8 tcp_accecn_option_init(const struct sk_buff *skb, + u8 opt_offset) +{ + u8 *ptr = skb_transport_header(skb) + opt_offset; + unsigned int optlen = ptr[1] - 2; + + if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1)) + return TCP_ACCECN_OPT_FAIL_SEEN; + ptr += 2; + + /* Detect option zeroing: an AccECN connection "MAY check that the + * initial value of the EE0B field or the EE1B field is non-zero" + */ + if (optlen < TCPOLEN_ACCECN_PERFIELD) + return TCP_ACCECN_OPT_EMPTY_SEEN; + if (get_unaligned_be24(ptr) == 0) + return TCP_ACCECN_OPT_FAIL_SEEN; + if (optlen < TCPOLEN_ACCECN_PERFIELD * 3) + return TCP_ACCECN_OPT_COUNTER_SEEN; + ptr += TCPOLEN_ACCECN_PERFIELD * 2; + if (get_unaligned_be24(ptr) == 0) + return TCP_ACCECN_OPT_FAIL_SEEN; + + return TCP_ACCECN_OPT_COUNTER_SEEN; +} + +/* See Table 2 of the AccECN draft */ +static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct sk_buff *skb, + const struct tcphdr *th, u8 ip_dsfield) +{ + struct tcp_sock *tp = tcp_sk(sk); + u8 ace = tcp_accecn_ace(th); + + switch (ace) { + case 0x0: + case 0x7: + /* +========+========+============+=============+ + * | A | B | SYN/ACK | Feedback | + * | | | B->A | Mode of A | + * | | | AE CWR ECE | | + * +========+========+============+=============+ + * | AccECN | No ECN | 0 0 0 | Not ECN | + * | AccECN | Broken | 1 1 1 | Not ECN | + * +========+========+============+=============+ + */ + tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); + break; + case 0x1: + case 0x5: + /* +========+========+============+=============+ + * | A | B | SYN/ACK | Feedback | + * | | | B->A | Mode of A | + * | | | AE CWR ECE | | + * +========+========+============+=============+ + * | AccECN | Nonce | 1 0 1 | (Reserved) | + * | AccECN | ECN | 0 0 1 | Classic ECN | + * | Nonce | AccECN | 0 0 1 | Classic ECN | + * | ECN | AccECN | 0 0 1 | Classic ECN | + * +========+========+============+=============+ + */ + if (tcp_ecn_mode_pending(tp)) + /* Downgrade from AccECN, or requested initially */ + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + break; + default: + tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); + tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK; + if (tp->rx_opt.accecn && + tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn); + + tcp_accecn_saw_opt_fail_recv(tp, saw_opt); + tp->accecn_opt_demand = 2; + } + if (INET_ECN_is_ce(ip_dsfield) && + tcp_accecn_validate_syn_feedback(sk, ace, + tp->syn_ect_snt)) { + tp->received_ce++; + tp->received_ce_pending++; + } + break; + } +} + +static inline void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th, + const struct sk_buff *skb) +{ + if (tcp_ecn_mode_pending(tp)) { + if (!tcp_accecn_syn_requested(th)) { + /* Downgrade to classic ECN feedback */ + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + } else { + tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & + INET_ECN_MASK; + tp->prev_ecnfield = tp->syn_ect_rcv; + tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); + } + } + if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || !th->cwr)) + tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); +} + +static inline bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, + const struct tcphdr *th) +{ + if (th->ece && !th->syn && tcp_ecn_mode_rfc3168(tp)) + return true; + return false; +} + +/* Packet ECN state for a SYN-ACK */ +static inline void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; + if (tcp_ecn_disabled(tp)) + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; + else if (tcp_ca_needs_ecn(sk) || + tcp_bpf_ca_needs_ecn(sk)) + INET_ECN_xmit(sk); + + if (tp->ecn_flags & TCP_ECN_MODE_ACCECN) { + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE; + TCP_SKB_CB(skb)->tcp_flags |= + tcp_accecn_reflector_flags(tp->syn_ect_rcv); + tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; + } +} + +/* Packet ECN state for a SYN. */ +static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); + bool use_ecn, use_accecn; + u8 tcp_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn); + + use_accecn = tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ACCECN; + use_ecn = tcp_ecn == TCP_ECN_IN_ECN_OUT_ECN || + tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ECN || + tcp_ca_needs_ecn(sk) || bpf_needs_ecn || use_accecn; + + if (!use_ecn) { + const struct dst_entry *dst = __sk_dst_get(sk); + + if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) + use_ecn = true; + } + + tp->ecn_flags = 0; + + if (use_ecn) { + if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) + INET_ECN_xmit(sk); + + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; + if (use_accecn) { + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_AE; + tcp_ecn_mode_set(tp, TCP_ECN_MODE_PENDING); + tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; + } else { + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + } + } +} + +static inline void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb) +{ + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) { + /* tp->ecn_flags are cleared at a later point in time when + * SYN ACK is ultimatively being received. + */ + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE; + } +} + +static inline void +tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th) +{ + if (tcp_rsk(req)->accecn_ok) + tcp_accecn_echo_syn_ect(th, tcp_rsk(req)->syn_ect_rcv); + else if (inet_rsk(req)->ecn_ok) + th->ece = 1; +} + +static inline bool tcp_accecn_option_beacon_check(const struct sock *sk) +{ + u32 ecn_beacon = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option_beacon); + const struct tcp_sock *tp = tcp_sk(sk); + + if (!ecn_beacon) + return false; + + return tcp_stamp_us_delta(tp->tcp_mstamp, tp->accecn_opt_tstamp) * ecn_beacon >= + (tp->srtt_us >> 3); +} + +#endif /* _LINUX_TCP_ECN_H */ diff --git a/include/net/tcp_states.h b/include/net/tcp_states.h index cc00118acca1..d60e8148ff4c 100644 --- a/include/net/tcp_states.h +++ b/include/net/tcp_states.h @@ -22,6 +22,7 @@ enum { TCP_LISTEN, TCP_CLOSING, /* Now a valid state */ TCP_NEW_SYN_RECV, + TCP_BOUND_INACTIVE, /* Pseudo-state for inet_diag */ TCP_MAX_STATES /* Leave at the end! */ }; @@ -43,6 +44,7 @@ enum { TCPF_LISTEN = (1 << TCP_LISTEN), TCPF_CLOSING = (1 << TCP_CLOSING), TCPF_NEW_SYN_RECV = (1 << TCP_NEW_SYN_RECV), + TCPF_BOUND_INACTIVE = (1 << TCP_BOUND_INACTIVE), }; #endif /* _LINUX_TCP_STATES_H */ diff --git a/include/net/tcx.h b/include/net/tcx.h index 264f147953ba..23a61af13547 100644 --- a/include/net/tcx.h +++ b/include/net/tcx.h @@ -13,14 +13,13 @@ struct mini_Qdisc; struct tcx_entry { struct mini_Qdisc __rcu *miniq; struct bpf_mprog_bundle bundle; - bool miniq_active; + u32 miniq_active; struct rcu_head rcu; }; struct tcx_link { struct bpf_link link; struct net_device *dev; - u32 location; }; static inline void tcx_set_ingress(struct sk_buff *skb, bool ingress) @@ -38,16 +37,11 @@ static inline struct tcx_entry *tcx_entry(struct bpf_mprog_entry *entry) return container_of(bundle, struct tcx_entry, bundle); } -static inline struct tcx_link *tcx_link(struct bpf_link *link) +static inline struct tcx_link *tcx_link(const struct bpf_link *link) { return container_of(link, struct tcx_link, link); } -static inline const struct tcx_link *tcx_link_const(const struct bpf_link *link) -{ - return tcx_link((struct bpf_link *)link); -} - void tcx_inc(void); void tcx_dec(void); @@ -80,9 +74,9 @@ tcx_entry_fetch(struct net_device *dev, bool ingress) return rcu_dereference_rtnl(dev->tcx_egress); } -static inline struct bpf_mprog_entry *tcx_entry_create(void) +static inline struct bpf_mprog_entry *tcx_entry_create_noprof(void) { - struct tcx_entry *tcx = kzalloc(sizeof(*tcx), GFP_KERNEL); + struct tcx_entry *tcx = kzalloc_noprof(sizeof(*tcx), GFP_KERNEL); if (tcx) { bpf_mprog_bundle_init(&tcx->bundle); @@ -90,6 +84,7 @@ static inline struct bpf_mprog_entry *tcx_entry_create(void) } return NULL; } +#define tcx_entry_create(...) alloc_hooks(tcx_entry_create_noprof(__VA_ARGS__)) static inline void tcx_entry_free(struct bpf_mprog_entry *entry) { @@ -129,11 +124,16 @@ static inline void tcx_skeys_dec(bool ingress) tcx_dec(); } -static inline void tcx_miniq_set_active(struct bpf_mprog_entry *entry, - const bool active) +static inline void tcx_miniq_inc(struct bpf_mprog_entry *entry) +{ + ASSERT_RTNL(); + tcx_entry(entry)->miniq_active++; +} + +static inline void tcx_miniq_dec(struct bpf_mprog_entry *entry) { ASSERT_RTNL(); - tcx_entry(entry)->miniq_active = active; + tcx_entry(entry)->miniq_active--; } static inline bool tcx_entry_is_active(struct bpf_mprog_entry *entry) diff --git a/include/net/timewait_sock.h b/include/net/timewait_sock.h index 74d2b463cc95..0a85ac64a66d 100644 --- a/include/net/timewait_sock.h +++ b/include/net/timewait_sock.h @@ -15,22 +15,6 @@ struct timewait_sock_ops { struct kmem_cache *twsk_slab; char *twsk_slab_name; unsigned int twsk_obj_size; - int (*twsk_unique)(struct sock *sk, - struct sock *sktw, void *twp); - void (*twsk_destructor)(struct sock *sk); }; -static inline int twsk_unique(struct sock *sk, struct sock *sktw, void *twp) -{ - if (sk->sk_prot->twsk_prot->twsk_unique != NULL) - return sk->sk_prot->twsk_prot->twsk_unique(sk, sktw, twp); - return 0; -} - -static inline void twsk_destructor(struct sock *sk) -{ - if (sk->sk_prot->twsk_prot->twsk_destructor != NULL) - sk->sk_prot->twsk_prot->twsk_destructor(sk); -} - #endif /* _TIMEWAIT_SOCK_H */ diff --git a/include/net/tls.h b/include/net/tls.h index a2b44578dcb7..ebd2550280ae 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -53,15 +53,20 @@ struct tls_rec; /* Maximum data size carried in a TLS record */ #define TLS_MAX_PAYLOAD_SIZE ((size_t)1 << 14) +/* Minimum record size limit as per RFC8449 */ +#define TLS_MIN_RECORD_SIZE_LIM ((size_t)1 << 6) #define TLS_HEADER_SIZE 5 #define TLS_NONCE_OFFSET TLS_HEADER_SIZE #define TLS_CRYPTO_INFO_READY(info) ((info)->cipher_type) +#define TLS_HANDSHAKE_KEYUPDATE 24 /* rfc8446 B.3: Key update */ + #define TLS_AAD_SPACE_SIZE 13 -#define MAX_IV_SIZE 16 +#define TLS_MAX_IV_SIZE 16 +#define TLS_MAX_SALT_SIZE 4 #define TLS_TAG_SIZE 16 #define TLS_MAX_REC_SEQ_SIZE 8 #define TLS_MAX_AAD_SIZE TLS_AAD_SPACE_SIZE @@ -96,9 +101,6 @@ struct tls_sw_context_tx { struct tls_rec *open_rec; struct list_head tx_list; atomic_t encrypt_pending; - /* protect crypto_wait with encrypt_pending */ - spinlock_t encrypt_compl_lock; - int async_notify; u8 async_capable:1; #define BIT_TX_SCHEDULED 0 @@ -113,7 +115,8 @@ struct tls_strparser { u32 stopped : 1; u32 copy_mode : 1; u32 mixed_decrypted : 1; - u32 msg_ready : 1; + + bool msg_ready; struct strp_msg stm; @@ -131,12 +134,11 @@ struct tls_sw_context_rx { u8 async_capable:1; u8 zc_capable:1; u8 reader_contended:1; + bool key_update_pending; struct tls_strparser strp; atomic_t decrypt_pending; - /* protect crypto_wait with decrypt_pending*/ - spinlock_t decrypt_compl_lock; struct sk_buff_head async_hold; struct wait_queue_head wq; }; @@ -149,6 +151,7 @@ struct tls_record_info { skb_frag_t frags[MAX_SKB_FRAGS]; }; +#define TLS_DRIVER_STATE_SIZE_TX 16 struct tls_offload_context_tx { struct crypto_aead *aead_send; spinlock_t lock; /* protects records list */ @@ -162,17 +165,13 @@ struct tls_offload_context_tx { void (*sk_destruct)(struct sock *sk); struct work_struct destruct_work; struct tls_context *ctx; - u8 driver_state[] __aligned(8); /* The TLS layer reserves room for driver specific state * Currently the belief is that there is not enough * driver specific state to justify another layer of indirection */ -#define TLS_DRIVER_STATE_SIZE_TX 16 + u8 driver_state[TLS_DRIVER_STATE_SIZE_TX] __aligned(8); }; -#define TLS_OFFLOAD_CONTEXT_SIZE_TX \ - (sizeof(struct tls_offload_context_tx) + TLS_DRIVER_STATE_SIZE_TX) - enum tls_context_flags { /* tls_device_down was called after the netdev went down, device state * was released, and kTLS works in software, even though rx_conf is @@ -193,8 +192,8 @@ enum tls_context_flags { }; struct cipher_context { - char *iv; - char *rec_seq; + char iv[TLS_MAX_IV_SIZE + TLS_MAX_SALT_SIZE]; + char rec_seq[TLS_MAX_REC_SEQ_SIZE]; }; union tls_crypto_context { @@ -229,6 +228,7 @@ struct tls_context { u8 rx_conf:3; u8 zerocopy_sendfile:1; u8 rx_no_pad:1; + u16 tx_max_payload_len; int (*push_pending_record)(struct sock *sk, int flags); void (*sk_write_space)(struct sock *sk); @@ -302,6 +302,7 @@ struct tls_offload_resync_async { u32 log[TLS_DEVICE_RESYNC_ASYNC_LOGMAX]; }; +#define TLS_DRIVER_STATE_SIZE_RX 8 struct tls_offload_context_rx { /* sw must be the first member of tls_offload_context_rx */ struct tls_sw_context_rx sw; @@ -325,17 +326,13 @@ struct tls_offload_context_rx { struct tls_offload_resync_async *resync_async; }; }; - u8 driver_state[] __aligned(8); /* The TLS layer reserves room for driver specific state * Currently the belief is that there is not enough * driver specific state to justify another layer of indirection */ -#define TLS_DRIVER_STATE_SIZE_RX 8 + u8 driver_state[TLS_DRIVER_STATE_SIZE_RX] __aligned(8); }; -#define TLS_OFFLOAD_CONTEXT_SIZE_RX \ - (sizeof(struct tls_offload_context_rx) + TLS_DRIVER_STATE_SIZE_RX) - struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context, u32 seq, u64 *p_record_sn); @@ -371,7 +368,7 @@ static inline bool tls_is_skb_tx_device_offloaded(const struct sk_buff *skb) static inline struct tls_context *tls_get_ctx(const struct sock *sk) { - struct inet_connection_sock *icsk = inet_csk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); /* Use RCU on icsk_ulp_data only for sock diag code, * TLS data path doesn't need rcu_dereference(). @@ -399,8 +396,12 @@ tls_offload_ctx_tx(const struct tls_context *tls_ctx) static inline bool tls_sw_has_ctx_tx(const struct sock *sk) { - struct tls_context *ctx = tls_get_ctx(sk); + struct tls_context *ctx; + + if (!sk_is_inet(sk) || !inet_test_bit(IS_ICSK, sk)) + return false; + ctx = tls_get_ctx(sk); if (!ctx) return false; return !!tls_sw_ctx_tx(ctx); @@ -408,8 +409,12 @@ static inline bool tls_sw_has_ctx_tx(const struct sock *sk) static inline bool tls_sw_has_ctx_rx(const struct sock *sk) { - struct tls_context *ctx = tls_get_ctx(sk); + struct tls_context *ctx; + + if (!sk_is_inet(sk) || !inet_test_bit(IS_ICSK, sk)) + return false; + ctx = tls_get_ctx(sk); if (!ctx) return false; return !!tls_sw_ctx_rx(ctx); @@ -449,25 +454,26 @@ static inline void tls_offload_rx_resync_request(struct sock *sk, __be32 seq) /* Log all TLS record header TCP sequences in [seq, seq+len] */ static inline void -tls_offload_rx_resync_async_request_start(struct sock *sk, __be32 seq, u16 len) +tls_offload_rx_resync_async_request_start(struct tls_offload_resync_async *resync_async, + __be32 seq, u16 len) { - struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx); - - atomic64_set(&rx_ctx->resync_async->req, ((u64)ntohl(seq) << 32) | + atomic64_set(&resync_async->req, ((u64)ntohl(seq) << 32) | ((u64)len << 16) | RESYNC_REQ | RESYNC_REQ_ASYNC); - rx_ctx->resync_async->loglen = 0; - rx_ctx->resync_async->rcd_delta = 0; + resync_async->loglen = 0; + resync_async->rcd_delta = 0; } static inline void -tls_offload_rx_resync_async_request_end(struct sock *sk, __be32 seq) +tls_offload_rx_resync_async_request_end(struct tls_offload_resync_async *resync_async, + __be32 seq) { - struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx); + atomic64_set(&resync_async->req, ((u64)ntohl(seq) << 32) | RESYNC_REQ); +} - atomic64_set(&rx_ctx->resync_async->req, - ((u64)ntohl(seq) << 32) | RESYNC_REQ); +static inline void +tls_offload_rx_resync_async_request_cancel(struct tls_offload_resync_async *resync_async) +{ + atomic64_set(&resync_async->req, 0); } static inline void diff --git a/include/net/udp.h b/include/net/udp.h index 488a6d2babcc..a061d1b22ddc 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -50,39 +50,68 @@ struct udp_skb_cb { #define UDP_SKB_CB(__skb) ((struct udp_skb_cb *)((__skb)->cb)) /** - * struct udp_hslot - UDP hash slot + * struct udp_hslot - UDP hash slot used by udp_table.hash/hash4 * * @head: head of list of sockets + * @nulls_head: head of list of sockets, only used by hash4 * @count: number of sockets in 'head' list * @lock: spinlock protecting changes to head/count */ struct udp_hslot { - struct hlist_head head; + union { + struct hlist_head head; + /* hash4 uses hlist_nulls to avoid moving wrongly onto another + * hlist, because rehash() can happen with lookup(). + */ + struct hlist_nulls_head nulls_head; + }; int count; spinlock_t lock; -} __attribute__((aligned(2 * sizeof(long)))); +} __aligned(2 * sizeof(long)); + +/** + * struct udp_hslot_main - UDP hash slot used by udp_table.hash2 + * + * @hslot: basic hash slot + * @hash4_cnt: number of sockets in hslot4 of the same + * (local port, local address) + */ +struct udp_hslot_main { + struct udp_hslot hslot; /* must be the first member */ +#if !IS_ENABLED(CONFIG_BASE_SMALL) + u32 hash4_cnt; +#endif +} __aligned(2 * sizeof(long)); +#define UDP_HSLOT_MAIN(__hslot) ((struct udp_hslot_main *)(__hslot)) /** * struct udp_table - UDP table * * @hash: hash table, sockets are hashed on (local port) * @hash2: hash table, sockets are hashed on (local port, local address) + * @hash4: hash table, connected sockets are hashed on + * (local port, local address, remote port, remote address) * @mask: number of slots in hash tables, minus 1 * @log: log2(number of slots in hash table) */ struct udp_table { struct udp_hslot *hash; - struct udp_hslot *hash2; + struct udp_hslot_main *hash2; +#if !IS_ENABLED(CONFIG_BASE_SMALL) + struct udp_hslot *hash4; +#endif unsigned int mask; unsigned int log; }; extern struct udp_table udp_table; void udp_table_init(struct udp_table *, const char *); static inline struct udp_hslot *udp_hashslot(struct udp_table *table, - struct net *net, unsigned int num) + const struct net *net, + unsigned int num) { return &table->hash[udp_hashfn(net, num, table->mask)]; } + /* * For secondary hash, net_hash_mix() is performed before calling * udp_hashslot2(), this explains difference with udp_hashslot() @@ -90,12 +119,92 @@ static inline struct udp_hslot *udp_hashslot(struct udp_table *table, static inline struct udp_hslot *udp_hashslot2(struct udp_table *table, unsigned int hash) { - return &table->hash2[hash & table->mask]; + return &table->hash2[hash & table->mask].hslot; } +#if IS_ENABLED(CONFIG_BASE_SMALL) +static inline void udp_table_hash4_init(struct udp_table *table) +{ +} + +static inline struct udp_hslot *udp_hashslot4(struct udp_table *table, + unsigned int hash) +{ + BUILD_BUG(); + return NULL; +} + +static inline bool udp_hashed4(const struct sock *sk) +{ + return false; +} + +static inline unsigned int udp_hash4_slot_size(void) +{ + return 0; +} + +static inline bool udp_has_hash4(const struct udp_hslot *hslot2) +{ + return false; +} + +static inline void udp_hash4_inc(struct udp_hslot *hslot2) +{ +} + +static inline void udp_hash4_dec(struct udp_hslot *hslot2) +{ +} +#else /* !CONFIG_BASE_SMALL */ + +/* Must be called with table->hash2 initialized */ +static inline void udp_table_hash4_init(struct udp_table *table) +{ + table->hash4 = (void *)(table->hash2 + (table->mask + 1)); + for (int i = 0; i <= table->mask; i++) { + table->hash2[i].hash4_cnt = 0; + + INIT_HLIST_NULLS_HEAD(&table->hash4[i].nulls_head, i); + table->hash4[i].count = 0; + spin_lock_init(&table->hash4[i].lock); + } +} + +static inline struct udp_hslot *udp_hashslot4(struct udp_table *table, + unsigned int hash) +{ + return &table->hash4[hash & table->mask]; +} + +static inline bool udp_hashed4(const struct sock *sk) +{ + return !hlist_nulls_unhashed(&udp_sk(sk)->udp_lrpa_node); +} + +static inline unsigned int udp_hash4_slot_size(void) +{ + return sizeof(struct udp_hslot); +} + +static inline bool udp_has_hash4(const struct udp_hslot *hslot2) +{ + return UDP_HSLOT_MAIN(hslot2)->hash4_cnt; +} + +static inline void udp_hash4_inc(struct udp_hslot *hslot2) +{ + UDP_HSLOT_MAIN(hslot2)->hash4_cnt++; +} + +static inline void udp_hash4_dec(struct udp_hslot *hslot2) +{ + UDP_HSLOT_MAIN(hslot2)->hash4_cnt--; +} +#endif /* CONFIG_BASE_SMALL */ + extern struct proto udp_prot; -extern atomic_long_t udp_memory_allocated; DECLARE_PER_CPU(int, udp_memory_per_cpu_fw_alloc); /* sysctl variables for udp */ @@ -175,13 +284,28 @@ INDIRECT_CALLABLE_DECLARE(int udpv6_rcv(struct sk_buff *)); struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, netdev_features_t features, bool is_ipv6); -static inline void udp_lib_init_sock(struct sock *sk) +static inline int udp_lib_init_sock(struct sock *sk) { struct udp_sock *up = udp_sk(sk); + sk->sk_drop_counters = &up->drop_counters; skb_queue_head_init(&up->reader_queue); + INIT_HLIST_NODE(&up->tunnel_list); up->forward_threshold = sk->sk_rcvbuf >> 2; set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); + + up->udp_prod_queue = kcalloc(nr_node_ids, sizeof(*up->udp_prod_queue), + GFP_KERNEL); + if (!up->udp_prod_queue) + return -ENOMEM; + for (int i = 0; i < nr_node_ids; i++) + init_llist_head(&up->udp_prod_queue[i].ll_root); + return 0; +} + +static inline void udp_drops_inc(struct sock *sk) +{ + numa_drop_add(&udp_sk(sk)->drop_counters, 1); } /* hash routines shared between UDPv4/6 and UDP-Litev4/6 */ @@ -192,13 +316,29 @@ static inline int udp_lib_hash(struct sock *sk) } void udp_lib_unhash(struct sock *sk); -void udp_lib_rehash(struct sock *sk, u16 new_hash); +void udp_lib_rehash(struct sock *sk, u16 new_hash, u16 new_hash4); +u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, + const __be32 faddr, const __be16 fport); static inline void udp_lib_close(struct sock *sk, long timeout) { sk_common_release(sk); } +/* hash4 routines shared between UDPv4/6 */ +#if IS_ENABLED(CONFIG_BASE_SMALL) +static inline void udp_lib_hash4(struct sock *sk, u16 hash) +{ +} + +static inline void udp4_hash4(struct sock *sk) +{ +} +#else /* !CONFIG_BASE_SMALL */ +void udp_lib_hash4(struct sock *sk, u16 hash); +void udp4_hash4(struct sock *sk); +#endif /* CONFIG_BASE_SMALL */ + int udp_lib_get_port(struct sock *sk, unsigned short snum, unsigned int hash2_nulladdr); @@ -231,7 +371,7 @@ static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb, } /* Since this is being sent on the wire obfuscate hash a bit - * to minimize possbility that any useful information to an + * to minimize possibility that any useful information to an * attacker is leaked. Only upper 16 bits are relevant in the * computation for 16 bit port value. */ @@ -245,7 +385,7 @@ static inline int udp_rqueue_get(struct sock *sk) return sk_rmem_alloc_get(sk) - READ_ONCE(udp_sk(sk)->forward_deficit); } -static inline bool udp_sk_bound_dev_eq(struct net *net, int bound_dev_if, +static inline bool udp_sk_bound_dev_eq(const struct net *net, int bound_dev_if, int dif, int sdif) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) @@ -271,7 +411,7 @@ static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags, return __skb_recv_udp(sk, flags, &off, err); } -int udp_v4_early_demux(struct sk_buff *skb); +enum skb_drop_reason udp_v4_early_demux(struct sk_buff *skb); bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst); int udp_err(struct sk_buff *, u32); int udp_abort(struct sock *sk, int err); @@ -284,7 +424,7 @@ void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst); int udp_rcv(struct sk_buff *skb); int udp_ioctl(struct sock *sk, int cmd, int *karg); int udp_init_sock(struct sock *sk); -int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); +int udp_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len); int __udp_disconnect(struct sock *sk, int flags); int udp_disconnect(struct sock *sk, int flags); __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait); @@ -296,18 +436,19 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, int udp_lib_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen, int (*push_pending_frames)(struct sock *)); -struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, +struct sock *udp4_lib_lookup(const struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif); -struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, +struct sock *__udp4_lib_lookup(const struct net *net, __be32 saddr, + __be16 sport, __be32 daddr, __be16 dport, int dif, int sdif, struct udp_table *tbl, struct sk_buff *skb); struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb, __be16 sport, __be16 dport); -struct sock *udp6_lib_lookup(struct net *net, +struct sock *udp6_lib_lookup(const struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, int dif); -struct sock *__udp6_lib_lookup(struct net *net, +struct sock *__udp6_lib_lookup(const struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, int dif, int sdif, struct udp_table *tbl, @@ -379,14 +520,7 @@ static inline bool udp_skb_is_linear(struct sk_buff *skb) static inline int copy_linear_skb(struct sk_buff *skb, int len, int off, struct iov_iter *to) { - int n; - - n = copy_to_iter(skb->data + off, len, to); - if (n == len) - return 0; - - iov_iter_revert(to, n); - return -EFAULT; + return copy_to_iter_full(skb->data + off, len, to) ? 0 : -EFAULT; } /* @@ -466,6 +600,16 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk, { netdev_features_t features = NETIF_F_SG; struct sk_buff *segs; + int drop_count; + + /* + * Segmentation in UDP receive path is only for UDP GRO, drop udp + * fragmentation offload (UFO) packets. + */ + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP) { + drop_count = 1; + goto drop; + } /* Avoid csum recalculation by skb_segment unless userspace explicitly * asks for the final checksum values @@ -489,16 +633,18 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk, */ segs = __skb_gso_segment(skb, features, false); if (IS_ERR_OR_NULL(segs)) { - int segs_nr = skb_shinfo(skb)->gso_segs; - - atomic_add(segs_nr, &sk->sk_drops); - SNMP_ADD_STATS(__UDPX_MIB(sk, ipv4), UDP_MIB_INERRORS, segs_nr); - kfree_skb(skb); - return NULL; + drop_count = skb_shinfo(skb)->gso_segs; + goto drop; } consume_skb(skb); return segs; + +drop: + sk_drops_add(sk, drop_count); + SNMP_ADD_STATS(__UDPX_MIB(sk, ipv4), UDP_MIB_INERRORS, drop_count); + kfree_skb(skb); + return NULL; } static inline void udp_post_segment_fix_csum(struct sk_buff *skb) diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 0ca9b7a11baf..9acef2fbd2fd 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -130,40 +130,42 @@ void udp_tunnel_drop_rx_port(struct net_device *dev, struct socket *sock, void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type); void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type); -static inline void udp_tunnel_get_rx_info(struct net_device *dev) -{ - ASSERT_RTNL(); - if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) - return; - call_netdevice_notifiers(NETDEV_UDP_TUNNEL_PUSH_INFO, dev); -} - -static inline void udp_tunnel_drop_rx_info(struct net_device *dev) -{ - ASSERT_RTNL(); - if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) - return; - call_netdevice_notifiers(NETDEV_UDP_TUNNEL_DROP_INFO, dev); -} - /* Transmit the skb using UDP encapsulation. */ void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, - bool xnet, bool nocheck); + bool xnet, bool nocheck, u16 ipcb_flags); -int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, - struct net_device *dev, struct in6_addr *saddr, - struct in6_addr *daddr, - __u8 prio, __u8 ttl, __be32 label, - __be16 src_port, __be16 dst_port, bool nocheck); +void udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, + struct net_device *dev, + const struct in6_addr *saddr, + const struct in6_addr *daddr, + __u8 prio, __u8 ttl, __be32 label, + __be16 src_port, __be16 dst_port, bool nocheck, + u16 ip6cb_flags); void udp_tunnel_sock_release(struct socket *sock); +struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb, + struct net_device *dev, + struct net *net, int oif, + __be32 *saddr, + const struct ip_tunnel_key *key, + __be16 sport, __be16 dport, u8 tos, + struct dst_cache *dst_cache); +struct dst_entry *udp_tunnel6_dst_lookup(struct sk_buff *skb, + struct net_device *dev, + struct net *net, + struct socket *sock, int oif, + struct in6_addr *saddr, + const struct ip_tunnel_key *key, + __be16 sport, __be16 dport, u8 dsfield, + struct dst_cache *dst_cache); + struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family, - __be16 flags, __be64 tunnel_id, - int md_size); + const unsigned long *flags, + __be64 tunnel_id, int md_size); #ifdef CONFIG_INET static inline int udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum) @@ -174,16 +176,28 @@ static inline int udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum) } #endif -static inline void udp_tunnel_encap_enable(struct socket *sock) +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) +void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add); +void udp_tunnel_update_gro_rcv(struct sock *sk, bool add); +#else +static inline void udp_tunnel_update_gro_lookup(struct net *net, + struct sock *sk, bool add) {} +static inline void udp_tunnel_update_gro_rcv(struct sock *sk, bool add) {} +#endif + +static inline void udp_tunnel_cleanup_gro(struct sock *sk) { - struct udp_sock *up = udp_sk(sock->sk); + udp_tunnel_update_gro_rcv(sk, false); + udp_tunnel_update_gro_lookup(sock_net(sk), sk, false); +} - if (up->encap_enabled) +static inline void udp_tunnel_encap_enable(struct sock *sk) +{ + if (udp_test_and_set_bit(ENCAP_ENABLED, sk)) return; - up->encap_enabled = 1; #if IS_ENABLED(CONFIG_IPV6) - if (sock->sk->sk_family == PF_INET6) + if (READ_ONCE(sk->sk_family) == PF_INET6) ipv6_stub->udpv6_encap_enable(); #endif udp_encap_enable(); @@ -192,19 +206,17 @@ static inline void udp_tunnel_encap_enable(struct socket *sock) #define UDP_TUNNEL_NIC_MAX_TABLES 4 enum udp_tunnel_nic_info_flags { - /* Device callbacks may sleep */ - UDP_TUNNEL_NIC_INFO_MAY_SLEEP = BIT(0), /* Device only supports offloads when it's open, all ports * will be removed before close and re-added after open. */ - UDP_TUNNEL_NIC_INFO_OPEN_ONLY = BIT(1), + UDP_TUNNEL_NIC_INFO_OPEN_ONLY = BIT(0), /* Device supports only IPv4 tunnels */ - UDP_TUNNEL_NIC_INFO_IPV4_ONLY = BIT(2), + UDP_TUNNEL_NIC_INFO_IPV4_ONLY = BIT(1), /* Device has hard-coded the IANA VXLAN port (4789) as VXLAN. * This port must not be counted towards n_entries of any table. * Driver will not receive any callback associated with port 4789. */ - UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN = BIT(3), + UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN = BIT(2), }; struct udp_tunnel_nic; @@ -295,6 +307,9 @@ struct udp_tunnel_nic_ops { size_t (*dump_size)(struct net_device *dev, unsigned int table); int (*dump_write)(struct net_device *dev, unsigned int table, struct sk_buff *skb); + void (*assert_locked)(struct net_device *dev); + void (*lock)(struct net_device *dev); + void (*unlock)(struct net_device *dev); }; #ifdef CONFIG_INET @@ -323,8 +338,28 @@ static inline void udp_tunnel_nic_set_port_priv(struct net_device *dev, unsigned int table, unsigned int idx, u8 priv) { - if (udp_tunnel_nic_ops) + if (udp_tunnel_nic_ops) { + udp_tunnel_nic_ops->assert_locked(dev); udp_tunnel_nic_ops->set_port_priv(dev, table, idx, priv); + } +} + +static inline void udp_tunnel_nic_assert_locked(struct net_device *dev) +{ + if (udp_tunnel_nic_ops) + udp_tunnel_nic_ops->assert_locked(dev); +} + +static inline void udp_tunnel_nic_lock(struct net_device *dev) +{ + if (udp_tunnel_nic_ops) + udp_tunnel_nic_ops->lock(dev); +} + +static inline void udp_tunnel_nic_unlock(struct net_device *dev) +{ + if (udp_tunnel_nic_ops) + udp_tunnel_nic_ops->unlock(dev); } static inline void @@ -366,17 +401,50 @@ static inline void udp_tunnel_nic_reset_ntf(struct net_device *dev) static inline size_t udp_tunnel_nic_dump_size(struct net_device *dev, unsigned int table) { + size_t ret; + if (!udp_tunnel_nic_ops) return 0; - return udp_tunnel_nic_ops->dump_size(dev, table); + + udp_tunnel_nic_ops->lock(dev); + ret = udp_tunnel_nic_ops->dump_size(dev, table); + udp_tunnel_nic_ops->unlock(dev); + + return ret; } static inline int udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table, struct sk_buff *skb) { + int ret; + if (!udp_tunnel_nic_ops) return 0; - return udp_tunnel_nic_ops->dump_write(dev, table, skb); + + udp_tunnel_nic_ops->lock(dev); + ret = udp_tunnel_nic_ops->dump_write(dev, table, skb); + udp_tunnel_nic_ops->unlock(dev); + + return ret; } + +static inline void udp_tunnel_get_rx_info(struct net_device *dev) +{ + ASSERT_RTNL(); + if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) + return; + udp_tunnel_nic_assert_locked(dev); + call_netdevice_notifiers(NETDEV_UDP_TUNNEL_PUSH_INFO, dev); +} + +static inline void udp_tunnel_drop_rx_info(struct net_device *dev) +{ + ASSERT_RTNL(); + if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) + return; + udp_tunnel_nic_assert_locked(dev); + call_netdevice_notifiers(NETDEV_UDP_TUNNEL_DROP_INFO, dev); +} + #endif diff --git a/include/net/udplite.h b/include/net/udplite.h index bd33ff2b8f42..786919d29f8d 100644 --- a/include/net/udplite.h +++ b/include/net/udplite.h @@ -66,14 +66,18 @@ static inline int udplite_checksum_init(struct sk_buff *skb, struct udphdr *uh) /* Fast-path computation of checksum. Socket may not be locked. */ static inline __wsum udplite_csum(struct sk_buff *skb) { - const struct udp_sock *up = udp_sk(skb->sk); const int off = skb_transport_offset(skb); + const struct sock *sk = skb->sk; int len = skb->len - off; - if ((up->pcflag & UDPLITE_SEND_CC) && up->pcslen < len) { - if (0 < up->pcslen) - len = up->pcslen; - udp_hdr(skb)->len = htons(up->pcslen); + if (udp_test_bit(UDPLITE_SEND_CC, sk)) { + u16 pcslen = READ_ONCE(udp_sk(sk)->pcslen); + + if (pcslen < len) { + if (pcslen > 0) + len = pcslen; + udp_hdr(skb)->len = htons(pcslen); + } } skb->ip_summed = CHECKSUM_NONE; /* no HW support for checksumming */ diff --git a/include/net/vsock_addr.h b/include/net/vsock_addr.h index cf8cc140d68d..c3f4cc206198 100644 --- a/include/net/vsock_addr.h +++ b/include/net/vsock_addr.h @@ -16,7 +16,7 @@ bool vsock_addr_bound(const struct sockaddr_vm *addr); void vsock_addr_unbind(struct sockaddr_vm *addr); bool vsock_addr_equals_addr(const struct sockaddr_vm *addr, const struct sockaddr_vm *other); -int vsock_addr_cast(const struct sockaddr *addr, size_t len, +int vsock_addr_cast(const struct sockaddr_unsized *addr, size_t len, struct sockaddr_vm **out_addr); #endif diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 6a9f8a5f387c..0ee50785f4f1 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -210,22 +210,24 @@ struct vxlan_rdst { }; struct vxlan_config { - union vxlan_addr remote_ip; - union vxlan_addr saddr; - __be32 vni; - int remote_ifindex; - int mtu; - __be16 dst_port; - u16 port_min; - u16 port_max; - u8 tos; - u8 ttl; - __be32 label; - u32 flags; - unsigned long age_interval; - unsigned int addrmax; - bool no_share; - enum ifla_vxlan_df df; + union vxlan_addr remote_ip; + union vxlan_addr saddr; + __be32 vni; + int remote_ifindex; + int mtu; + __be16 dst_port; + u16 port_min; + u16 port_max; + u8 tos; + u8 ttl; + __be32 label; + enum ifla_vxlan_label_policy label_policy; + u32 flags; + unsigned long age_interval; + unsigned int addrmax; + bool no_share; + enum ifla_vxlan_df df; + struct vxlanhdr reserved_bits; }; enum { @@ -294,7 +296,7 @@ struct vxlan_dev { struct vxlan_rdst default_dst; /* default destination */ struct timer_list age_timer; - spinlock_t hash_lock[FDB_HASH_SIZE]; + spinlock_t hash_lock; unsigned int addrcnt; struct gro_cells gro_cells; @@ -302,9 +304,10 @@ struct vxlan_dev { struct vxlan_vni_group __rcu *vnigrp; - struct hlist_head fdb_head[FDB_HASH_SIZE]; + struct rhashtable fdb_hash_tbl; struct rhashtable mdb_tbl; + struct hlist_head fdb_list; struct hlist_head mdb_list; unsigned int mdb_seq; }; @@ -329,6 +332,7 @@ struct vxlan_dev { #define VXLAN_F_VNIFILTER 0x20000 #define VXLAN_F_MDB 0x40000 #define VXLAN_F_LOCALBYPASS 0x80000 +#define VXLAN_F_MC_ROUTE 0x100000 /* Flags that are used in the receive path. These flags must match in * order for a socket to be shareable @@ -350,7 +354,9 @@ struct vxlan_dev { VXLAN_F_UDP_ZERO_CSUM6_RX | \ VXLAN_F_COLLECT_METADATA | \ VXLAN_F_VNIFILTER | \ - VXLAN_F_LOCALBYPASS) + VXLAN_F_LOCALBYPASS | \ + VXLAN_F_MC_ROUTE | \ + 0) struct net_device *vxlan_dev_create(struct net *net, const char *name, u8 name_assign_type, struct vxlan_config *conf); diff --git a/include/net/x25.h b/include/net/x25.h index 597eb53c471e..414f3fd99345 100644 --- a/include/net/x25.h +++ b/include/net/x25.h @@ -81,7 +81,7 @@ enum { #define X25_DEFAULT_WINDOW_SIZE 2 /* Default Window Size */ #define X25_DEFAULT_PACKET_SIZE X25_PS128 /* Default Packet Size */ -#define X25_DEFAULT_THROUGHPUT 0x0A /* Deafult Throughput */ +#define X25_DEFAULT_THROUGHPUT 0x0A /* Default Throughput */ #define X25_DEFAULT_REVERSE 0x00 /* Default Reverse Charging */ #define X25_SMODULUS 8 @@ -203,7 +203,6 @@ void x25_send_frame(struct sk_buff *, struct x25_neigh *); int x25_lapb_receive_frame(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); void x25_establish_link(struct x25_neigh *); -void x25_terminate_link(struct x25_neigh *); /* x25_facilities.c */ int x25_parse_facilities(struct sk_buff *, struct x25_facilities *, diff --git a/include/net/xdp.h b/include/net/xdp.h index de08c8e0d134..aa742f413c35 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -11,12 +11,14 @@ #include <linux/netdevice.h> #include <linux/skbuff.h> /* skb_shared_info */ +#include <net/page_pool/types.h> + /** * DOC: XDP RX-queue information * * The XDP RX-queue info (xdp_rxq_info) is associated with the driver * level RX-ring queues. It is information that is specific to how - * the driver have configured a given RX-ring queue. + * the driver has configured a given RX-ring queue. * * Each xdp_buff frame received in the driver carries a (pointer) * reference to this xdp_rxq_info structure. This provides the XDP @@ -32,7 +34,7 @@ * The struct is not directly tied to the XDP prog. A new XDP prog * can be attached as long as it doesn't change the underlying * RX-ring. If the RX-ring does change significantly, the NIC driver - * naturally need to stop the RX-ring before purging and reallocating + * naturally needs to stop the RX-ring before purging and reallocating * memory. In that process the driver MUST call unregister (which * also applies for driver shutdown and unload). The register API is * also mandatory during RX-ring setup. @@ -62,7 +64,6 @@ struct xdp_rxq_info { u32 queue_index; u32 reg_state; struct xdp_mem_info mem; - unsigned int napi_id; u32 frag_size; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ @@ -75,6 +76,11 @@ enum xdp_buff_flags { XDP_FLAGS_FRAGS_PF_MEMALLOC = BIT(1), /* xdp paged memory is under * pressure */ + /* frags have unreadable mem, this can't be true for real XDP packets, + * but drivers may use XDP helpers to construct Rx pkt state even when + * XDP program is not attached. + */ + XDP_FLAGS_FRAGS_UNREADABLE = BIT(2), }; struct xdp_buff { @@ -84,11 +90,23 @@ struct xdp_buff { void *data_hard_start; struct xdp_rxq_info *rxq; struct xdp_txq_info *txq; - u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ - u32 flags; /* supported values defined in xdp_buff_flags */ + + union { + struct { + /* frame size to deduce data_hard_end/tailroom */ + u32 frame_sz; + /* supported values defined in xdp_buff_flags */ + u32 flags; + }; + +#ifdef __LITTLE_ENDIAN + /* Used to micro-optimize xdp_init_buff(), don't use directly */ + u64 frame_sz_flags_init; +#endif + }; }; -static __always_inline bool xdp_buff_has_frags(struct xdp_buff *xdp) +static __always_inline bool xdp_buff_has_frags(const struct xdp_buff *xdp) { return !!(xdp->flags & XDP_FLAGS_HAS_FRAGS); } @@ -103,22 +121,42 @@ static __always_inline void xdp_buff_clear_frags_flag(struct xdp_buff *xdp) xdp->flags &= ~XDP_FLAGS_HAS_FRAGS; } -static __always_inline bool xdp_buff_is_frag_pfmemalloc(struct xdp_buff *xdp) +static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp) { - return !!(xdp->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); + xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; } -static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp) +static __always_inline void xdp_buff_set_frag_unreadable(struct xdp_buff *xdp) { - xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; + xdp->flags |= XDP_FLAGS_FRAGS_UNREADABLE; +} + +static __always_inline u32 xdp_buff_get_skb_flags(const struct xdp_buff *xdp) +{ + return xdp->flags; +} + +static __always_inline void xdp_buff_clear_frag_pfmemalloc(struct xdp_buff *xdp) +{ + xdp->flags &= ~XDP_FLAGS_FRAGS_PF_MEMALLOC; } static __always_inline void xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) { - xdp->frame_sz = frame_sz; xdp->rxq = rxq; + +#ifdef __LITTLE_ENDIAN + /* + * Force the compilers to initialize ::flags and assign ::frame_sz with + * one write on 64-bit LE architectures as they're often unable to do + * it themselves. + */ + xdp->frame_sz_flags_init = frame_sz; +#else + xdp->frame_sz = frame_sz; xdp->flags = 0; +#endif } static __always_inline void @@ -144,15 +182,16 @@ xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start, SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) static inline struct skb_shared_info * -xdp_get_shared_info_from_buff(struct xdp_buff *xdp) +xdp_get_shared_info_from_buff(const struct xdp_buff *xdp) { return (struct skb_shared_info *)xdp_data_hard_end(xdp); } -static __always_inline unsigned int xdp_get_buff_len(struct xdp_buff *xdp) +static __always_inline unsigned int +xdp_get_buff_len(const struct xdp_buff *xdp) { unsigned int len = xdp->data_end - xdp->data; - struct skb_shared_info *sinfo; + const struct skb_shared_info *sinfo; if (likely(!xdp_buff_has_frags(xdp))) goto out; @@ -163,45 +202,133 @@ out: return len; } +void xdp_return_frag(netmem_ref netmem, const struct xdp_buff *xdp); + +/** + * __xdp_buff_add_frag - attach frag to &xdp_buff + * @xdp: XDP buffer to attach the frag to + * @netmem: network memory containing the frag + * @offset: offset at which the frag starts + * @size: size of the frag + * @truesize: total memory size occupied by the frag + * @try_coalesce: whether to try coalescing the frags (not valid for XSk) + * + * Attach frag to the XDP buffer. If it currently has no frags attached, + * initialize the related fields, otherwise check that the frag number + * didn't reach the limit of ``MAX_SKB_FRAGS``. If possible, try coalescing + * the frag with the previous one. + * The function doesn't check/update the pfmemalloc bit. Please use the + * non-underscored wrapper in drivers. + * + * Return: true on success, false if there's no space for the frag in + * the shared info struct. + */ +static inline bool __xdp_buff_add_frag(struct xdp_buff *xdp, netmem_ref netmem, + u32 offset, u32 size, u32 truesize, + bool try_coalesce) +{ + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); + skb_frag_t *prev; + u32 nr_frags; + + if (!xdp_buff_has_frags(xdp)) { + xdp_buff_set_frags_flag(xdp); + + nr_frags = 0; + sinfo->xdp_frags_size = 0; + sinfo->xdp_frags_truesize = 0; + + goto fill; + } + + nr_frags = sinfo->nr_frags; + prev = &sinfo->frags[nr_frags - 1]; + + if (try_coalesce && netmem == skb_frag_netmem(prev) && + offset == skb_frag_off(prev) + skb_frag_size(prev)) { + skb_frag_size_add(prev, size); + /* Guaranteed to only decrement the refcount */ + xdp_return_frag(netmem, xdp); + } else if (unlikely(nr_frags == MAX_SKB_FRAGS)) { + return false; + } else { +fill: + __skb_fill_netmem_desc_noacc(sinfo, nr_frags++, netmem, + offset, size); + } + + sinfo->nr_frags = nr_frags; + sinfo->xdp_frags_size += size; + sinfo->xdp_frags_truesize += truesize; + + return true; +} + +/** + * xdp_buff_add_frag - attach frag to &xdp_buff + * @xdp: XDP buffer to attach the frag to + * @netmem: network memory containing the frag + * @offset: offset at which the frag starts + * @size: size of the frag + * @truesize: total memory size occupied by the frag + * + * Version of __xdp_buff_add_frag() which takes care of the pfmemalloc bit. + * + * Return: true on success, false if there's no space for the frag in + * the shared info struct. + */ +static inline bool xdp_buff_add_frag(struct xdp_buff *xdp, netmem_ref netmem, + u32 offset, u32 size, u32 truesize) +{ + if (!__xdp_buff_add_frag(xdp, netmem, offset, size, truesize, true)) + return false; + + if (unlikely(netmem_is_pfmemalloc(netmem))) + xdp_buff_set_frag_pfmemalloc(xdp); + if (unlikely(netmem_is_net_iov(netmem))) + xdp_buff_set_frag_unreadable(xdp); + + return true; +} + struct xdp_frame { void *data; - u16 len; - u16 headroom; + u32 len; + u32 headroom; u32 metasize; /* uses lower 8-bits */ /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, - * while mem info is valid on remote CPU. + * while mem_type is valid on remote CPU. */ - struct xdp_mem_info mem; + enum xdp_mem_type mem_type:32; struct net_device *dev_rx; /* used by cpumap */ u32 frame_sz; u32 flags; /* supported values defined in xdp_buff_flags */ }; -static __always_inline bool xdp_frame_has_frags(struct xdp_frame *frame) +static __always_inline bool xdp_frame_has_frags(const struct xdp_frame *frame) { return !!(frame->flags & XDP_FLAGS_HAS_FRAGS); } -static __always_inline bool xdp_frame_is_frag_pfmemalloc(struct xdp_frame *frame) +static __always_inline u32 +xdp_frame_get_skb_flags(const struct xdp_frame *frame) { - return !!(frame->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); + return frame->flags; } #define XDP_BULK_QUEUE_SIZE 16 struct xdp_frame_bulk { int count; - void *xa; - void *q[XDP_BULK_QUEUE_SIZE]; + netmem_ref q[XDP_BULK_QUEUE_SIZE]; }; static __always_inline void xdp_frame_bulk_init(struct xdp_frame_bulk *bq) { - /* bq->count will be zero'ed when bq->xa gets updated */ - bq->xa = NULL; + bq->count = 0; } static inline struct skb_shared_info * -xdp_get_shared_info_from_frame(struct xdp_frame *frame) +xdp_get_shared_info_from_frame(const struct xdp_frame *frame) { void *data_hard_start = frame->data - frame->headroom - sizeof(*frame); @@ -223,33 +350,43 @@ static inline void xdp_scrub_frame(struct xdp_frame *frame) } static inline void -xdp_update_skb_shared_info(struct sk_buff *skb, u8 nr_frags, - unsigned int size, unsigned int truesize, - bool pfmemalloc) +xdp_update_skb_frags_info(struct sk_buff *skb, u8 nr_frags, + unsigned int size, unsigned int truesize, + u32 xdp_flags) { - skb_shinfo(skb)->nr_frags = nr_frags; + struct skb_shared_info *sinfo = skb_shinfo(skb); + + sinfo->nr_frags = nr_frags; + /* + * ``destructor_arg`` is unionized with ``xdp_frags_{,true}size``, + * reset it after that these fields aren't used anymore. + */ + sinfo->destructor_arg = NULL; skb->len += size; skb->data_len += size; skb->truesize += truesize; - skb->pfmemalloc |= pfmemalloc; + skb->pfmemalloc |= !!(xdp_flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); + skb->unreadable |= !!(xdp_flags & XDP_FLAGS_FRAGS_UNREADABLE); } /* Avoids inlining WARN macro in fast-path */ void xdp_warn(const char *msg, const char *func, const int line); #define XDP_WARN(msg) xdp_warn(msg, __func__, __LINE__) +struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp); +struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp); struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp); struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct sk_buff *skb, struct net_device *dev); struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct net_device *dev); -int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp); struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf); static inline -void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp) +void xdp_convert_frame_to_buff(const struct xdp_frame *frame, + struct xdp_buff *xdp) { xdp->data_hard_start = frame->data - frame->headroom - sizeof(*frame); xdp->data = frame->data; @@ -260,7 +397,7 @@ void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp) } static inline -int xdp_update_frame_from_buff(struct xdp_buff *xdp, +int xdp_update_frame_from_buff(const struct xdp_buff *xdp, struct xdp_frame *xdp_frame) { int metasize, headroom; @@ -302,24 +439,33 @@ struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp) if (unlikely(xdp_update_frame_from_buff(xdp, xdp_frame) < 0)) return NULL; - /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */ - xdp_frame->mem = xdp->rxq->mem; + /* rxq only valid until napi_schedule ends, convert to xdp_mem_type */ + xdp_frame->mem_type = xdp->rxq->mem.type; return xdp_frame; } -void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, - struct xdp_buff *xdp); +void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type, + bool napi_direct, struct xdp_buff *xdp); void xdp_return_frame(struct xdp_frame *xdpf); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf); void xdp_return_buff(struct xdp_buff *xdp); -void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq); void xdp_return_frame_bulk(struct xdp_frame *xdpf, struct xdp_frame_bulk *bq); -static __always_inline unsigned int xdp_get_frame_len(struct xdp_frame *xdpf) +static inline void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq) +{ + if (unlikely(!bq->count)) + return; + + page_pool_put_netmem_bulk(bq->q, bq->count); + bq->count = 0; +} + +static __always_inline unsigned int +xdp_get_frame_len(const struct xdp_frame *xdpf) { - struct skb_shared_info *sinfo; + const struct skb_shared_info *sinfo; unsigned int len = xdpf->len; if (likely(!xdp_frame_has_frags(xdpf))) @@ -351,6 +497,38 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq); int xdp_reg_mem_model(struct xdp_mem_info *mem, enum xdp_mem_type type, void *allocator); void xdp_unreg_mem_model(struct xdp_mem_info *mem); +int xdp_reg_page_pool(struct page_pool *pool); +void xdp_unreg_page_pool(const struct page_pool *pool); +void xdp_rxq_info_attach_page_pool(struct xdp_rxq_info *xdp_rxq, + const struct page_pool *pool); + +/** + * xdp_rxq_info_attach_mem_model - attach registered mem info to RxQ info + * @xdp_rxq: XDP RxQ info to attach the memory info to + * @mem: already registered memory info + * + * If the driver registers its memory providers manually, it must use this + * function instead of xdp_rxq_info_reg_mem_model(). + */ +static inline void +xdp_rxq_info_attach_mem_model(struct xdp_rxq_info *xdp_rxq, + const struct xdp_mem_info *mem) +{ + xdp_rxq->mem = *mem; +} + +/** + * xdp_rxq_info_detach_mem_model - detach registered mem info from RxQ info + * @xdp_rxq: XDP RxQ info to detach the memory info from + * + * If the driver registers its memory providers manually and then attaches it + * via xdp_rxq_info_attach_mem_model(), it must call this function before + * xdp_rxq_info_unreg(). + */ +static inline void xdp_rxq_info_detach_mem_model(struct xdp_rxq_info *xdp_rxq) +{ + xdp_rxq->mem = (struct xdp_mem_info){ }; +} /* Drivers not supporting XDP metadata can use this helper, which * rejects any room expansion for metadata as a result. @@ -369,7 +547,12 @@ xdp_data_meta_unsupported(const struct xdp_buff *xdp) static inline bool xdp_metalen_invalid(unsigned long metalen) { - return (metalen & (sizeof(__u32) - 1)) || (metalen > 32); + unsigned long meta_max; + + meta_max = type_max(typeof_member(struct skb_shared_info, meta_len)); + BUILD_BUG_ON(!__builtin_constant_p(meta_max)); + + return !IS_ALIGNED(metalen, sizeof(u32)) || metalen > meta_max; } struct xdp_attachment_info { @@ -383,14 +566,29 @@ void xdp_attachment_setup(struct xdp_attachment_info *info, #define DEV_MAP_BULK_SIZE XDP_BULK_QUEUE_SIZE +/* Define the relationship between xdp-rx-metadata kfunc and + * various other entities: + * - xdp_rx_metadata enum + * - netdev netlink enum (Documentation/netlink/specs/netdev.yaml) + * - kfunc name + * - xdp_metadata_ops field + */ #define XDP_METADATA_KFUNC_xxx \ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_TIMESTAMP, \ - bpf_xdp_metadata_rx_timestamp) \ + NETDEV_XDP_RX_METADATA_TIMESTAMP, \ + bpf_xdp_metadata_rx_timestamp, \ + xmo_rx_timestamp) \ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_HASH, \ - bpf_xdp_metadata_rx_hash) \ - -enum { -#define XDP_METADATA_KFUNC(name, _) name, + NETDEV_XDP_RX_METADATA_HASH, \ + bpf_xdp_metadata_rx_hash, \ + xmo_rx_hash) \ + XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_VLAN_TAG, \ + NETDEV_XDP_RX_METADATA_VLAN_TAG, \ + bpf_xdp_metadata_rx_vlan_tag, \ + xmo_rx_vlan_tag) \ + +enum xdp_rx_metadata { +#define XDP_METADATA_KFUNC(name, _, __, ___) name, XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC MAX_XDP_METADATA_KFUNC, @@ -416,6 +614,7 @@ enum xdp_rss_hash_type { XDP_RSS_L4_UDP = BIT(5), XDP_RSS_L4_SCTP = BIT(6), XDP_RSS_L4_IPSEC = BIT(7), /* L4 based hash include IPSEC SPI */ + XDP_RSS_L4_ICMP = BIT(8), /* Second part: RSS hash type combinations used for driver HW mapping */ XDP_RSS_TYPE_NONE = 0, @@ -431,11 +630,13 @@ enum xdp_rss_hash_type { XDP_RSS_TYPE_L4_IPV4_UDP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_UDP, XDP_RSS_TYPE_L4_IPV4_SCTP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_SCTP, XDP_RSS_TYPE_L4_IPV4_IPSEC = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC, + XDP_RSS_TYPE_L4_IPV4_ICMP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_ICMP, XDP_RSS_TYPE_L4_IPV6_TCP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_TCP, XDP_RSS_TYPE_L4_IPV6_UDP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_UDP, XDP_RSS_TYPE_L4_IPV6_SCTP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_SCTP, XDP_RSS_TYPE_L4_IPV6_IPSEC = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC, + XDP_RSS_TYPE_L4_IPV6_ICMP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_ICMP, XDP_RSS_TYPE_L4_IPV6_TCP_EX = XDP_RSS_TYPE_L4_IPV6_TCP | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L4_IPV6_UDP_EX = XDP_RSS_TYPE_L4_IPV6_UDP | XDP_RSS_L3_DYNHDR, @@ -446,14 +647,20 @@ struct xdp_metadata_ops { int (*xmo_rx_timestamp)(const struct xdp_md *ctx, u64 *timestamp); int (*xmo_rx_hash)(const struct xdp_md *ctx, u32 *hash, enum xdp_rss_hash_type *rss_type); + int (*xmo_rx_vlan_tag)(const struct xdp_md *ctx, __be16 *vlan_proto, + u16 *vlan_tci); }; #ifdef CONFIG_NET u32 bpf_xdp_metadata_kfunc_id(int id); bool bpf_dev_bound_kfunc_id(u32 btf_id); void xdp_set_features_flag(struct net_device *dev, xdp_features_t val); +void xdp_set_features_flag_locked(struct net_device *dev, xdp_features_t val); void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg); +void xdp_features_set_redirect_target_locked(struct net_device *dev, + bool support_sg); void xdp_features_clear_redirect_target(struct net_device *dev); +void xdp_features_clear_redirect_target_locked(struct net_device *dev); #else static inline u32 bpf_xdp_metadata_kfunc_id(int id) { return 0; } static inline bool bpf_dev_bound_kfunc_id(u32 btf_id) { return false; } diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 1617af380162..23e8861e8b25 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -14,6 +14,8 @@ #include <linux/mm.h> #include <net/sock.h> +#define XDP_UMEM_SG_FLAG (1 << 1) + struct net_device; struct xsk_queue; struct xdp_buff; @@ -28,6 +30,7 @@ struct xdp_umem { struct user_struct *user; refcount_t users; u8 flags; + u8 tx_metadata_len; bool zc; struct page **pgs; int id; @@ -61,8 +64,12 @@ struct xdp_sock { struct xsk_queue *tx ____cacheline_aligned_in_smp; struct list_head tx_list; - /* Protects generic receive. */ - spinlock_t rx_lock; + /* record the number of tx descriptors sent by this xsk and + * when it exceeds MAX_PER_SOCKET_BUDGET, an opportunity needs + * to be given to other xsks for sending tx descriptors, thereby + * preventing other XSKs from being starved. + */ + u32 tx_budget_spent; /* Statistics */ u64 rx_dropped; @@ -77,17 +84,124 @@ struct xdp_sock { struct list_head map_list; /* Protects map_list */ spinlock_t map_list_lock; + u32 max_tx_budget; /* Protects multiple processes in the control path */ struct mutex mutex; struct xsk_queue *fq_tmp; /* Only as tmp storage before bind */ struct xsk_queue *cq_tmp; /* Only as tmp storage before bind */ }; +/* + * AF_XDP TX metadata hooks for network devices. + * The following hooks can be defined; unless noted otherwise, they are + * optional and can be filled with a null pointer. + * + * void (*tmo_request_timestamp)(void *priv) + * Called when AF_XDP frame requested egress timestamp. + * + * u64 (*tmo_fill_timestamp)(void *priv) + * Called when AF_XDP frame, that had requested egress timestamp, + * received a completion. The hook needs to return the actual HW timestamp. + * + * void (*tmo_request_checksum)(u16 csum_start, u16 csum_offset, void *priv) + * Called when AF_XDP frame requested HW checksum offload. csum_start + * indicates position where checksumming should start. + * csum_offset indicates position where checksum should be stored. + * + * void (*tmo_request_launch_time)(u64 launch_time, void *priv) + * Called when AF_XDP frame requested launch time HW offload support. + * launch_time indicates the PTP time at which the device can schedule the + * packet for transmission. + */ +struct xsk_tx_metadata_ops { + void (*tmo_request_timestamp)(void *priv); + u64 (*tmo_fill_timestamp)(void *priv); + void (*tmo_request_checksum)(u16 csum_start, u16 csum_offset, void *priv); + void (*tmo_request_launch_time)(u64 launch_time, void *priv); +}; + #ifdef CONFIG_XDP_SOCKETS int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp); -void __xsk_map_flush(void); +void __xsk_map_flush(struct list_head *flush_list); +INDIRECT_CALLABLE_DECLARE(void xsk_destruct_skb(struct sk_buff *)); + +/** + * xsk_tx_metadata_to_compl - Save enough relevant metadata information + * to perform tx completion in the future. + * @meta: pointer to AF_XDP metadata area + * @compl: pointer to output struct xsk_tx_metadata_to_compl + * + * This function should be called by the networking device when + * it prepares AF_XDP egress packet. The value of @compl should be stored + * and passed to xsk_tx_metadata_complete upon TX completion. + */ +static inline void xsk_tx_metadata_to_compl(struct xsk_tx_metadata *meta, + struct xsk_tx_metadata_compl *compl) +{ + if (!meta) + return; + + if (meta->flags & XDP_TXMD_FLAGS_TIMESTAMP) + compl->tx_timestamp = &meta->completion.tx_timestamp; + else + compl->tx_timestamp = NULL; +} + +/** + * xsk_tx_metadata_request - Evaluate AF_XDP TX metadata at submission + * and call appropriate xsk_tx_metadata_ops operation. + * @meta: pointer to AF_XDP metadata area + * @ops: pointer to struct xsk_tx_metadata_ops + * @priv: pointer to driver-private aread + * + * This function should be called by the networking device when + * it prepares AF_XDP egress packet. + */ +static inline void xsk_tx_metadata_request(const struct xsk_tx_metadata *meta, + const struct xsk_tx_metadata_ops *ops, + void *priv) +{ + if (!meta) + return; + + if (ops->tmo_request_launch_time) + if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME) + ops->tmo_request_launch_time(meta->request.launch_time, + priv); + + if (ops->tmo_request_timestamp) + if (meta->flags & XDP_TXMD_FLAGS_TIMESTAMP) + ops->tmo_request_timestamp(priv); + + if (ops->tmo_request_checksum) + if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) + ops->tmo_request_checksum(meta->request.csum_start, + meta->request.csum_offset, priv); +} + +/** + * xsk_tx_metadata_complete - Evaluate AF_XDP TX metadata at completion + * and call appropriate xsk_tx_metadata_ops operation. + * @compl: pointer to completion metadata produced from xsk_tx_metadata_to_compl + * @ops: pointer to struct xsk_tx_metadata_ops + * @priv: pointer to driver-private aread + * + * This function should be called by the networking device upon + * AF_XDP egress completion. + */ +static inline void xsk_tx_metadata_complete(struct xsk_tx_metadata_compl *compl, + const struct xsk_tx_metadata_ops *ops, + void *priv) +{ + if (!compl) + return; + if (!compl->tx_timestamp) + return; + + *compl->tx_timestamp = ops->tmo_fill_timestamp(priv); +} #else @@ -101,10 +215,32 @@ static inline int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) return -EOPNOTSUPP; } -static inline void __xsk_map_flush(void) +static inline void __xsk_map_flush(struct list_head *flush_list) { } -#endif /* CONFIG_XDP_SOCKETS */ +#ifdef CONFIG_MITIGATION_RETPOLINE +static inline void xsk_destruct_skb(struct sk_buff *skb) +{ +} +#endif + +static inline void xsk_tx_metadata_to_compl(struct xsk_tx_metadata *meta, + struct xsk_tx_metadata_compl *compl) +{ +} +static inline void xsk_tx_metadata_request(struct xsk_tx_metadata *meta, + const struct xsk_tx_metadata_ops *ops, + void *priv) +{ +} + +static inline void xsk_tx_metadata_complete(struct xsk_tx_metadata_compl *compl, + const struct xsk_tx_metadata_ops *ops, + void *priv) +{ +} + +#endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_H */ diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 1f6fc8c7a84c..242e34f771cc 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -12,6 +12,16 @@ #define XDP_UMEM_MIN_CHUNK_SHIFT 11 #define XDP_UMEM_MIN_CHUNK_SIZE (1 << XDP_UMEM_MIN_CHUNK_SHIFT) +#define NETDEV_XDP_ACT_XSK (NETDEV_XDP_ACT_BASIC | \ + NETDEV_XDP_ACT_REDIRECT | \ + NETDEV_XDP_ACT_XSK_ZEROCOPY) + +struct xsk_cb_desc { + void *src; + u8 off; + u8 bytes; +}; + #ifdef CONFIG_XDP_SOCKETS void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries); @@ -47,13 +57,10 @@ static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool, xp_set_rxq_info(pool, rxq); } -static inline unsigned int xsk_pool_get_napi_id(struct xsk_buff_pool *pool) +static inline void xsk_pool_fill_cb(struct xsk_buff_pool *pool, + struct xsk_cb_desc *desc) { -#ifdef CONFIG_NET_RX_BUSY_POLL - return pool->heads[0].xdp.rxq->napi_id; -#else - return 0; -#endif + xp_fill_cb(pool, desc); } static inline void xsk_pool_dma_unmap(struct xsk_buff_pool *pool, @@ -89,7 +96,7 @@ static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) return xp_alloc(pool); } -static inline bool xsk_is_eop_desc(struct xdp_desc *desc) +static inline bool xsk_is_eop_desc(const struct xdp_desc *desc) { return !xp_mb_desc(desc); } @@ -114,8 +121,8 @@ static inline void xsk_buff_free(struct xdp_buff *xdp) if (likely(!xdp_buff_has_frags(xdp))) goto out; - list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) { - list_del(&pos->xskb_list_node); + list_for_each_entry_safe(pos, tmp, xskb_list, list_node) { + list_del(&pos->list_node); xp_free(pos); } @@ -124,34 +131,72 @@ out: xp_free(xskb); } -static inline void xsk_buff_add_frag(struct xdp_buff *xdp) +static inline bool xsk_buff_add_frag(struct xdp_buff *head, + struct xdp_buff *xdp) { - struct xdp_buff_xsk *frag = container_of(xdp, struct xdp_buff_xsk, xdp); + const void *data = xdp->data; + struct xdp_buff_xsk *frag; - list_add_tail(&frag->xskb_list_node, &frag->pool->xskb_list); + if (!__xdp_buff_add_frag(head, virt_to_netmem(data), + offset_in_page(data), xdp->data_end - data, + xdp->frame_sz, false)) + return false; + + frag = container_of(xdp, struct xdp_buff_xsk, xdp); + list_add_tail(&frag->list_node, &frag->pool->xskb_list); + + return true; } -static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) +static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first) { struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); struct xdp_buff *ret = NULL; struct xdp_buff_xsk *frag; frag = list_first_entry_or_null(&xskb->pool->xskb_list, - struct xdp_buff_xsk, xskb_list_node); + struct xdp_buff_xsk, list_node); if (frag) { - list_del(&frag->xskb_list_node); + list_del(&frag->list_node); ret = &frag->xdp; } return ret; } +static inline void xsk_buff_del_frag(struct xdp_buff *xdp) +{ + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + + list_del(&xskb->list_node); +} + +static inline struct xdp_buff *xsk_buff_get_head(struct xdp_buff *first) +{ + struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); + struct xdp_buff_xsk *frag; + + frag = list_first_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk, + list_node); + return &frag->xdp; +} + +static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) +{ + struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); + struct xdp_buff_xsk *frag; + + frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk, + list_node); + return &frag->xdp; +} + static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) { xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM; xdp->data_meta = xdp->data; xdp->data_end = xdp->data + size; + xdp->flags = 0; } static inline dma_addr_t xsk_buff_raw_get_dma(struct xsk_buff_pool *pool, @@ -165,12 +210,59 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) return xp_raw_get_data(pool, addr); } -static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool) +/** + * xsk_buff_raw_get_ctx - get &xdp_desc context + * @pool: XSk buff pool desc address belongs to + * @addr: desc address (from userspace) + * + * Wrapper for xp_raw_get_ctx() to be used in drivers, see its kdoc for + * details. + * + * Return: new &xdp_desc_ctx struct containing desc's DMA address and metadata + * pointer, if it is present and valid (initialized to %NULL otherwise). + */ +static inline struct xdp_desc_ctx +xsk_buff_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr) { - struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + return xp_raw_get_ctx(pool, addr); +} + +#define XDP_TXMD_FLAGS_VALID ( \ + XDP_TXMD_FLAGS_TIMESTAMP | \ + XDP_TXMD_FLAGS_CHECKSUM | \ + XDP_TXMD_FLAGS_LAUNCH_TIME | \ + 0) + +static inline bool +xsk_buff_valid_tx_metadata(const struct xsk_tx_metadata *meta) +{ + return !(meta->flags & ~XDP_TXMD_FLAGS_VALID); +} + +static inline struct xsk_tx_metadata * +__xsk_buff_get_metadata(const struct xsk_buff_pool *pool, void *data) +{ + struct xsk_tx_metadata *meta; + + if (!pool->tx_metadata_len) + return NULL; + + meta = data - pool->tx_metadata_len; + if (unlikely(!xsk_buff_valid_tx_metadata(meta))) + return NULL; /* no way to signal the error to the user */ - if (!pool->dma_need_sync) - return; + return meta; +} + +static inline struct xsk_tx_metadata * +xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) +{ + return __xsk_buff_get_metadata(pool, xp_raw_get_data(pool, addr)); +} + +static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) +{ + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); xp_dma_sync_for_cpu(xskb); } @@ -250,9 +342,9 @@ static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool, { } -static inline unsigned int xsk_pool_get_napi_id(struct xsk_buff_pool *pool) +static inline void xsk_pool_fill_cb(struct xsk_buff_pool *pool, + struct xsk_cb_desc *desc) { - return 0; } static inline void xsk_pool_dma_unmap(struct xsk_buff_pool *pool, @@ -281,7 +373,7 @@ static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) return NULL; } -static inline bool xsk_is_eop_desc(struct xdp_desc *desc) +static inline bool xsk_is_eop_desc(const struct xdp_desc *desc) { return false; } @@ -300,11 +392,27 @@ static inline void xsk_buff_free(struct xdp_buff *xdp) { } -static inline void xsk_buff_add_frag(struct xdp_buff *xdp) +static inline bool xsk_buff_add_frag(struct xdp_buff *head, + struct xdp_buff *xdp) +{ + return false; +} + +static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first) { + return NULL; } -static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) +static inline void xsk_buff_del_frag(struct xdp_buff *xdp) +{ +} + +static inline struct xdp_buff *xsk_buff_get_head(struct xdp_buff *first) +{ + return NULL; +} + +static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) { return NULL; } @@ -324,7 +432,30 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) return NULL; } -static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool) +static inline struct xdp_desc_ctx +xsk_buff_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr) +{ + return (struct xdp_desc_ctx){ }; +} + +static inline bool xsk_buff_valid_tx_metadata(struct xsk_tx_metadata *meta) +{ + return false; +} + +static inline struct xsk_tx_metadata * +__xsk_buff_get_metadata(const struct xsk_buff_pool *pool, void *data) +{ + return NULL; +} + +static inline struct xsk_tx_metadata * +xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) +{ + return NULL; +} + +static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) { } diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 363c7d510554..0a14daaa5dd4 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -19,6 +19,7 @@ #include <net/sock.h> #include <net/dst.h> +#include <net/inet_dscp.h> #include <net/ip.h> #include <net/route.h> #include <net/ipv6.h> @@ -37,6 +38,7 @@ #define XFRM_PROTO_COMP 108 #define XFRM_PROTO_IPIP 4 #define XFRM_PROTO_IPV6 41 +#define XFRM_PROTO_IPTFS IPPROTO_AGGFRAG #define XFRM_PROTO_ROUTING IPPROTO_ROUTING #define XFRM_PROTO_DSTOPTS IPPROTO_DSTOPTS @@ -51,8 +53,10 @@ #ifdef CONFIG_XFRM_STATISTICS #define XFRM_INC_STATS(net, field) SNMP_INC_STATS((net)->mib.xfrm_statistics, field) +#define XFRM_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.xfrm_statistics, field, val) #else #define XFRM_INC_STATS(net, field) ((void)(net)) +#define XFRM_ADD_STATS(net, field, val) ((void)(net)) #endif @@ -65,27 +69,27 @@ - instance of a transformer, struct xfrm_state (=SA) - template to clone xfrm_state, struct xfrm_tmpl - SPD is plain linear list of xfrm_policy rules, ordered by priority. + SPD is organized as hash table (for policies that meet minimum address prefix + length setting, net->xfrm.policy_hthresh). Other policies are stored in + lists, sorted into rbtree ordered by destination and source address networks. + See net/xfrm/xfrm_policy.c for details. + (To be compatible with existing pfkeyv2 implementations, many rules with priority of 0x7fffffff are allowed to exist and such rules are ordered in an unpredictable way, thanks to bsd folks.) - Lookup is plain linear search until the first match with selector. - If "action" is "block", then we prohibit the flow, otherwise: if "xfrms_nr" is zero, the flow passes untransformed. Otherwise, policy entry has list of up to XFRM_MAX_DEPTH transformations, described by templates xfrm_tmpl. Each template is resolved to a complete xfrm_state (see below) and we pack bundle of transformations - to a dst_entry returned to requestor. + to a dst_entry returned to requester. dst -. xfrm .-> xfrm_state #1 |---. child .-> dst -. xfrm .-> xfrm_state #2 |---. child .-> dst -. xfrm .-> xfrm_state #3 |---. child .-> NULL - Bundles are cached at xrfm_policy struct (field ->bundles). - Resolution of xrfm_tmpl ----------------------- @@ -143,8 +147,19 @@ enum { }; struct xfrm_dev_offload { + /* The device for this offload. + * Device drivers should not use this directly, as that will prevent + * them from working with bonding device. Instead, the device passed + * to the add/delete callbacks should be used. + */ struct net_device *dev; netdevice_tracker dev_tracker; + /* This is a private pointer used by the bonding driver (and eventually + * should be moved there). Device drivers should not use it. + * Protected by xfrm_state.lock AND bond.ipsec_lock in most cases, + * except in the .xdo_dev_state_del() flow, where only xfrm_state.lock + * is held. + */ struct net_device *real_dev; unsigned long offload_handle; u8 dir : 2; @@ -176,13 +191,19 @@ struct xfrm_state { struct hlist_node gclist; struct hlist_node bydst; }; - struct hlist_node bysrc; + union { + struct hlist_node dev_gclist; + struct hlist_node bysrc; + }; struct hlist_node byspi; struct hlist_node byseq; + struct hlist_node state_cache; + struct hlist_node state_cache_input; refcount_t refcnt; spinlock_t lock; + u32 pcpu_num; struct xfrm_id id; struct xfrm_selector sel; struct xfrm_mark mark; @@ -204,6 +225,7 @@ struct xfrm_state { u16 family; xfrm_address_t saddr; int header_len; + int enc_hdr_len; int trailer_len; u32 extra_flags; struct xfrm_mark smark; @@ -225,7 +247,10 @@ struct xfrm_state { /* Data for encapsulator */ struct xfrm_encap_tmpl *encap; - struct sock __rcu *encap_sk; + + /* NAT keepalive */ + u32 nat_keepalive_interval; /* seconds */ + time64_t nat_keepalive_expiration; /* Data for care-of address */ xfrm_address_t *coaddr; @@ -289,6 +314,10 @@ struct xfrm_state { /* Private data of this transformer, format is opaque, * interpreted by xfrm_type methods. */ void *data; + u8 dir; + + const struct xfrm_mode_cbs *mode_cbs; + void *mode_data; }; static inline struct net *xs_net(struct xfrm_state *x) @@ -339,20 +368,25 @@ struct xfrm_if_cb { void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb); void xfrm_if_unregister_cb(void); +struct xfrm_dst_lookup_params { + struct net *net; + dscp_t dscp; + int oif; + xfrm_address_t *saddr; + xfrm_address_t *daddr; + u32 mark; + __u8 ipproto; + union flowi_uli uli; +}; + struct net_device; struct xfrm_type; struct xfrm_dst; struct xfrm_policy_afinfo { struct dst_ops *dst_ops; - struct dst_entry *(*dst_lookup)(struct net *net, - int tos, int oif, - const xfrm_address_t *saddr, - const xfrm_address_t *daddr, - u32 mark); - int (*get_saddr)(struct net *net, int oif, - xfrm_address_t *saddr, - xfrm_address_t *daddr, - u32 mark); + struct dst_entry *(*dst_lookup)(const struct xfrm_dst_lookup_params *params); + int (*get_saddr)(xfrm_address_t *saddr, + const struct xfrm_dst_lookup_params *params); int (*fill_dst)(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl); @@ -407,7 +441,6 @@ int xfrm_input_register_afinfo(const struct xfrm_input_afinfo *afinfo); int xfrm_input_unregister_afinfo(const struct xfrm_input_afinfo *afinfo); void xfrm_flush_gc(void); -void xfrm_state_delete_tunnel(struct xfrm_state *x); struct xfrm_type { struct module *owner; @@ -440,6 +473,54 @@ struct xfrm_type_offload { int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned short family); void xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family); +void xfrm_set_type_offload(struct xfrm_state *x, bool try_load); +static inline void xfrm_unset_type_offload(struct xfrm_state *x) +{ + if (!x->type_offload) + return; + + module_put(x->type_offload->owner); + x->type_offload = NULL; +} + +/** + * struct xfrm_mode_cbs - XFRM mode callbacks + * @owner: module owner or NULL + * @init_state: Add/init mode specific state in `xfrm_state *x` + * @clone_state: Copy mode specific values from `orig` to new state `x` + * @destroy_state: Cleanup mode specific state from `xfrm_state *x` + * @user_init: Process mode specific netlink attributes from user + * @copy_to_user: Add netlink attributes to `attrs` based on state in `x` + * @sa_len: Return space required to store mode specific netlink attributes + * @get_inner_mtu: Return avail payload space after removing encap overhead + * @input: Process received packet from SA using mode + * @output: Output given packet using mode + * @prepare_output: Add mode specific encapsulation to packet in skb. On return + * `transport_header` should point at ESP header, `network_header` should + * point at outer IP header and `mac_header` should opint at the + * protocol/nexthdr field of the outer IP. + * + * One should examine and understand the specific uses of these callbacks in + * xfrm for further detail on how and when these functions are called. RTSL. + */ +struct xfrm_mode_cbs { + struct module *owner; + int (*init_state)(struct xfrm_state *x); + int (*clone_state)(struct xfrm_state *x, struct xfrm_state *orig); + void (*destroy_state)(struct xfrm_state *x); + int (*user_init)(struct net *net, struct xfrm_state *x, + struct nlattr **attrs, + struct netlink_ext_ack *extack); + int (*copy_to_user)(struct xfrm_state *x, struct sk_buff *skb); + unsigned int (*sa_len)(const struct xfrm_state *x); + u32 (*get_inner_mtu)(struct xfrm_state *x, int outer_mtu); + int (*input)(struct xfrm_state *x, struct sk_buff *skb); + int (*output)(struct net *net, struct sock *sk, struct sk_buff *skb); + int (*prepare_output)(struct xfrm_state *x, struct sk_buff *skb); +}; + +int xfrm_register_mode_cbs(u8 mode, const struct xfrm_mode_cbs *mode_cbs); +void xfrm_unregister_mode_cbs(u8 mode); static inline int xfrm_af2proto(unsigned int family) { @@ -455,7 +536,8 @@ static inline int xfrm_af2proto(unsigned int family) static inline const struct xfrm_mode *xfrm_ip2inner_mode(struct xfrm_state *x, int ipproto) { - if ((ipproto == IPPROTO_IPIP && x->props.family == AF_INET) || + if ((x->sel.family != AF_UNSPEC) || + (ipproto == IPPROTO_IPIP && x->props.family == AF_INET) || (ipproto == IPPROTO_IPV6 && x->props.family == AF_INET6)) return &x->inner_mode; else @@ -516,11 +598,44 @@ struct xfrm_policy_queue { unsigned long timeout; }; +/** + * struct xfrm_policy - xfrm policy + * @xp_net: network namespace the policy lives in + * @bydst: hlist node for SPD hash table or rbtree list + * @byidx: hlist node for index hash table + * @state_cache_list: hlist head for policy cached xfrm states + * @lock: serialize changes to policy structure members + * @refcnt: reference count, freed once it reaches 0 + * @pos: kernel internal tie-breaker to determine age of policy + * @timer: timer + * @genid: generation, used to invalidate old policies + * @priority: priority, set by userspace + * @index: policy index (autogenerated) + * @if_id: virtual xfrm interface id + * @mark: packet mark + * @selector: selector + * @lft: liftime configuration data + * @curlft: liftime state + * @walk: list head on pernet policy list + * @polq: queue to hold packets while aqcuire operaion in progress + * @bydst_reinsert: policy tree node needs to be merged + * @type: XFRM_POLICY_TYPE_MAIN or _SUB + * @action: XFRM_POLICY_ALLOW or _BLOCK + * @flags: XFRM_POLICY_LOCALOK, XFRM_POLICY_ICMP + * @xfrm_nr: number of used templates in @xfrm_vec + * @family: protocol family + * @security: SELinux security label + * @xfrm_vec: array of templates to resolve state + * @rcu: rcu head, used to defer memory release + * @xdo: hardware offload state + */ struct xfrm_policy { possible_net_t xp_net; struct hlist_node bydst; struct hlist_node byidx; + struct hlist_head state_cache_list; + /* This lock only affects elements except for entry. */ rwlock_t lock; refcount_t refcnt; @@ -545,7 +660,6 @@ struct xfrm_policy { u16 family; struct xfrm_sec_ctx *security; struct xfrm_tmpl xfrm_vec[XFRM_MAX_DEPTH]; - struct hlist_node bydst_inexact_list; struct rcu_head rcu; struct xfrm_dev_offload xdo; @@ -802,7 +916,7 @@ static inline void xfrm_pols_put(struct xfrm_policy **pols, int npols) xfrm_pol_put(pols[i]); } -void __xfrm_state_destroy(struct xfrm_state *, bool); +void __xfrm_state_destroy(struct xfrm_state *); static inline void __xfrm_state_put(struct xfrm_state *x) { @@ -812,13 +926,7 @@ static inline void __xfrm_state_put(struct xfrm_state *x) static inline void xfrm_state_put(struct xfrm_state *x) { if (refcount_dec_and_test(&x->refcnt)) - __xfrm_state_destroy(x, false); -} - -static inline void xfrm_state_put_sync(struct xfrm_state *x) -{ - if (refcount_dec_and_test(&x->refcnt)) - __xfrm_state_destroy(x, true); + __xfrm_state_destroy(x); } static inline void xfrm_state_hold(struct xfrm_state *x) @@ -1006,7 +1114,7 @@ void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev); struct xfrm_if_parms { int link; /* ifindex of underlying L2 interface */ - u32 if_id; /* interface identifyer */ + u32 if_id; /* interface identifier */ bool collect_md; }; @@ -1047,6 +1155,9 @@ struct xfrm_offload { #define CRYPTO_INVALID_PACKET_SYNTAX 64 #define CRYPTO_INVALID_PROTOCOL 128 + /* Used to keep whole l2 header for transport mode GRO */ + __u32 orig_mac_len; + __u8 proto; __u8 inner_ipproto; }; @@ -1170,9 +1281,19 @@ static inline int __xfrm_policy_check2(struct sock *sk, int dir, if (xo) { x = xfrm_input_state(skb); - if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) - return (xo->flags & CRYPTO_DONE) && - (xo->status & CRYPTO_SUCCESS); + if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) { + bool check = (xo->flags & CRYPTO_DONE) && + (xo->status & CRYPTO_SUCCESS); + + /* The packets here are plain ones and secpath was + * needed to indicate that hardware already handled + * them and there is no need to do nothing in addition. + * + * Consume secpath which was set by drivers. + */ + secpath_reset(skb); + return check; + } } return __xfrm_check_nopolicy(net, skb, dir) || @@ -1207,20 +1328,20 @@ static inline int xfrm6_policy_check_reverse(struct sock *sk, int dir, return __xfrm_policy_check2(sk, dir, skb, AF_INET6, 1); } -int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, +int __xfrm_decode_session(struct net *net, struct sk_buff *skb, struct flowi *fl, unsigned int family, int reverse); -static inline int xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, +static inline int xfrm_decode_session(struct net *net, struct sk_buff *skb, struct flowi *fl, unsigned int family) { - return __xfrm_decode_session(skb, fl, family, 0); + return __xfrm_decode_session(net, skb, fl, family, 0); } -static inline int xfrm_decode_session_reverse(struct sk_buff *skb, +static inline int xfrm_decode_session_reverse(struct net *net, struct sk_buff *skb, struct flowi *fl, unsigned int family) { - return __xfrm_decode_session(skb, fl, family, 1); + return __xfrm_decode_session(net, skb, fl, family, 1); } int __xfrm_route_forward(struct sk_buff *skb, unsigned short family); @@ -1296,7 +1417,7 @@ static inline int xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *sk { return 1; } -static inline int xfrm_decode_session_reverse(struct sk_buff *skb, +static inline int xfrm_decode_session_reverse(struct net *net, struct sk_buff *skb, struct flowi *fl, unsigned int family) { @@ -1577,22 +1698,20 @@ struct xfrm_state *xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id, struct xfrm_state *xfrm_state_lookup_byspi(struct net *net, __be32 spi, unsigned short family); int xfrm_state_check_expire(struct xfrm_state *x); +void xfrm_state_update_stats(struct net *net); #ifdef CONFIG_XFRM_OFFLOAD -static inline void xfrm_dev_state_update_curlft(struct xfrm_state *x) +static inline void xfrm_dev_state_update_stats(struct xfrm_state *x) { struct xfrm_dev_offload *xdo = &x->xso; - struct net_device *dev = xdo->dev; - - if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) - return; + struct net_device *dev = READ_ONCE(xdo->dev); if (dev && dev->xfrmdev_ops && - dev->xfrmdev_ops->xdo_dev_state_update_curlft) - dev->xfrmdev_ops->xdo_dev_state_update_curlft(x); + dev->xfrmdev_ops->xdo_dev_state_update_stats) + dev->xfrmdev_ops->xdo_dev_state_update_stats(x); } #else -static inline void xfrm_dev_state_update_curlft(struct xfrm_state *x) {} +static inline void xfrm_dev_state_update_stats(struct xfrm_state *x) {} #endif void xfrm_state_insert(struct xfrm_state *x); int xfrm_state_add(struct xfrm_state *x); @@ -1600,6 +1719,10 @@ int xfrm_state_update(struct xfrm_state *x); struct xfrm_state *xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family); +struct xfrm_state *xfrm_input_state_lookup(struct net *net, u32 mark, + const xfrm_address_t *daddr, + __be32 spi, u8 proto, + unsigned short family); struct xfrm_state *xfrm_state_lookup_byaddr(struct net *net, u32 mark, const xfrm_address_t *daddr, const xfrm_address_t *saddr, @@ -1639,9 +1762,9 @@ struct xfrmk_spdinfo { u32 spdhmcnt; }; -struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq); +struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num); int xfrm_state_delete(struct xfrm_state *x); -int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync); +int xfrm_state_flush(struct net *net, u8 proto, bool task_valid); int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid); int xfrm_dev_policy_flush(struct net *net, struct net_device *dev, bool task_valid); @@ -1650,8 +1773,7 @@ void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si); u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq); int xfrm_init_replay(struct xfrm_state *x, struct netlink_ext_ack *extack); u32 xfrm_state_mtu(struct xfrm_state *x, int mtu); -int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload, - struct netlink_ext_ack *extack); +int __xfrm_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack); int xfrm_init_state(struct xfrm_state *x); int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); int xfrm_input_resume(struct sk_buff *skb, int nexthdr); @@ -1663,13 +1785,21 @@ int xfrm_trans_queue(struct sk_buff *skb, struct sk_buff *)); int xfrm_output_resume(struct sock *sk, struct sk_buff *skb, int err); int xfrm_output(struct sock *sk, struct sk_buff *skb); +int xfrm4_tunnel_check_size(struct sk_buff *skb); +#if IS_ENABLED(CONFIG_IPV6) +int xfrm6_tunnel_check_size(struct sk_buff *skb); +#else +static inline int xfrm6_tunnel_check_size(struct sk_buff *skb) +{ + return -EMSGSIZE; +} +#endif #if IS_ENABLED(CONFIG_NET_PKTGEN) int pktgen_xfrm_outer_mode_output(struct xfrm_state *x, struct sk_buff *skb); #endif void xfrm_local_error(struct sk_buff *skb, int mtu); -int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb); int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); int xfrm4_transport_finish(struct sk_buff *skb, int async); @@ -1689,7 +1819,6 @@ int xfrm4_protocol_deregister(struct xfrm4_protocol *handler, unsigned char prot int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family); int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family); void xfrm4_local_error(struct sk_buff *skb, u32 mtu); -int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb); int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, struct ip6_tnl *t); int xfrm6_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, @@ -1712,6 +1841,10 @@ int xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb); void xfrm6_local_rxpmtu(struct sk_buff *skb, u32 mtu); int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb); int xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb); +struct sk_buff *xfrm4_gro_udp_encap_rcv(struct sock *sk, struct list_head *head, + struct sk_buff *skb); +struct sk_buff *xfrm6_gro_udp_encap_rcv(struct sock *sk, struct list_head *head, + struct sk_buff *skb); int xfrm_user_policy(struct sock *sk, int optname, sockptr_t optval, int optlen); #else @@ -1722,10 +1855,7 @@ static inline int xfrm_user_policy(struct sock *sk, int optname, } #endif -struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, - const xfrm_address_t *saddr, - const xfrm_address_t *daddr, - int family, u32 mark); +struct dst_entry *__xfrm_dst_lookup(int family, const struct xfrm_dst_lookup_params *params); struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp); @@ -1752,7 +1882,7 @@ int verify_spi_info(u8 proto, u32 min, u32 max, struct netlink_ext_ack *extack); int xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi, struct netlink_ext_ack *extack); struct xfrm_state *xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, - u8 mode, u32 reqid, u32 if_id, u8 proto, + u8 mode, u32 reqid, u32 if_id, u32 pcpu_num, u8 proto, const xfrm_address_t *daddr, const xfrm_address_t *saddr, int create, unsigned short family); @@ -1767,12 +1897,16 @@ struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *n u32 if_id); struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, struct xfrm_migrate *m, - struct xfrm_encap_tmpl *encap); + struct xfrm_encap_tmpl *encap, + struct net *net, + struct xfrm_user_offload *xuo, + struct netlink_ext_ack *extack); int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_bundles, struct xfrm_kmaddress *k, struct net *net, struct xfrm_encap_tmpl *encap, u32 if_id, - struct netlink_ext_ack *extack); + struct netlink_ext_ack *extack, + struct xfrm_user_offload *xuo); #endif int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport); @@ -1940,13 +2074,16 @@ int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp, struct xfrm_user_offload *xuo, u8 dir, struct netlink_ext_ack *extack); bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x); +void xfrm_dev_state_delete(struct xfrm_state *x); +void xfrm_dev_state_free(struct xfrm_state *x); static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x) { struct xfrm_dev_offload *xso = &x->xso; + struct net_device *dev = READ_ONCE(xso->dev); - if (xso->dev && xso->dev->xfrmdev_ops->xdo_dev_state_advance_esn) - xso->dev->xfrmdev_ops->xdo_dev_state_advance_esn(x); + if (dev && dev->xfrmdev_ops->xdo_dev_state_advance_esn) + dev->xfrmdev_ops->xdo_dev_state_advance_esn(x); } static inline bool xfrm_dst_offload_ok(struct dst_entry *dst) @@ -1967,28 +2104,6 @@ static inline bool xfrm_dst_offload_ok(struct dst_entry *dst) return false; } -static inline void xfrm_dev_state_delete(struct xfrm_state *x) -{ - struct xfrm_dev_offload *xso = &x->xso; - - if (xso->dev) - xso->dev->xfrmdev_ops->xdo_dev_state_delete(x); -} - -static inline void xfrm_dev_state_free(struct xfrm_state *x) -{ - struct xfrm_dev_offload *xso = &x->xso; - struct net_device *dev = xso->dev; - - if (dev && dev->xfrmdev_ops) { - if (dev->xfrmdev_ops->xdo_dev_state_free) - dev->xfrmdev_ops->xdo_dev_state_free(x); - xso->dev = NULL; - xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; - netdev_put(dev, &xso->dev_tracker); - } -} - static inline void xfrm_dev_policy_delete(struct xfrm_policy *x) { struct xfrm_dev_offload *xdo = &x->xdo; @@ -2166,7 +2281,7 @@ static inline bool xfrm6_local_dontfrag(const struct sock *sk) proto = sk->sk_protocol; if (proto == IPPROTO_UDP || proto == IPPROTO_RAW) - return inet6_sk(sk)->dontfrag; + return inet6_test_bit(DONTFRAG, sk); return false; } @@ -2188,4 +2303,19 @@ static inline int register_xfrm_interface_bpf(void) #endif +#if IS_ENABLED(CONFIG_DEBUG_INFO_BTF) +int register_xfrm_state_bpf(void); +#else +static inline int register_xfrm_state_bpf(void) +{ + return 0; +} +#endif + +int xfrm_nat_keepalive_init(unsigned short family); +void xfrm_nat_keepalive_fini(unsigned short family); +int xfrm_nat_keepalive_net_init(struct net *net); +int xfrm_nat_keepalive_net_fini(struct net *net); +void xfrm_nat_keepalive_state_updated(struct xfrm_state *x); + #endif /* _NET_XFRM_H */ diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index b0bdff26fc88..92a2358c6ce3 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -12,6 +12,7 @@ struct xsk_buff_pool; struct xdp_rxq_info; +struct xsk_cb_desc; struct xsk_queue; struct xdp_desc; struct xdp_umem; @@ -27,12 +28,11 @@ struct xdp_buff_xsk { dma_addr_t dma; dma_addr_t frame_dma; struct xsk_buff_pool *pool; - u64 orig_addr; - struct list_head free_list_node; - struct list_head xskb_list_node; -}; + struct list_head list_node; +} __aligned_largest; #define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb)) +#define XSK_TX_COMPL_FITS(t) BUILD_BUG_ON(sizeof(struct xsk_tx_metadata_compl) > sizeof(t)) struct xsk_dma_map { dma_addr_t *dma_pages; @@ -41,7 +41,6 @@ struct xsk_dma_map { refcount_t users; struct list_head list; /* Protected by the RTNL_LOCK */ u32 dma_pages_cnt; - bool dma_need_sync; }; struct xsk_buff_pool { @@ -54,6 +53,8 @@ struct xsk_buff_pool { refcount_t users; struct xdp_umem *umem; struct work_struct work; + /* Protects generic receive in shared and non-shared umem mode. */ + spinlock_t rx_lock; struct list_head free_list; struct list_head xskb_list; u32 heads_cnt; @@ -77,16 +78,23 @@ struct xsk_buff_pool { u32 chunk_size; u32 chunk_shift; u32 frame_len; + u32 xdp_zc_max_segs; + u8 tx_metadata_len; /* inherited from umem */ u8 cached_need_wakeup; bool uses_need_wakeup; - bool dma_need_sync; bool unaligned; + bool tx_sw_csum; void *addrs; - /* Mutual exclusion of the completion ring in the SKB mode. Two cases to protect: - * NAPI TX thread and sendmsg error paths in the SKB destructor callback and when - * sockets share a single cq when the same netdev and queue id is shared. + /* Mutual exclusion of the completion ring in the SKB mode. + * Protect: NAPI TX thread and sendmsg error paths in the SKB + * destructor callback. + */ + spinlock_t cq_prod_lock; + /* Mutual exclusion of the completion ring in the SKB mode. + * Protect: when sockets share a single cq when the same netdev + * and queue id is shared. */ - spinlock_t cq_lock; + spinlock_t cq_cached_prod_lock; struct xdp_buff_xsk *free_heads[]; }; @@ -118,7 +126,6 @@ void xp_free(struct xdp_buff_xsk *xskb); static inline void xp_init_xskb_addr(struct xdp_buff_xsk *xskb, struct xsk_buff_pool *pool, u64 addr) { - xskb->orig_addr = addr; xskb->xdp.data_hard_start = pool->addrs + addr + pool->headroom; } @@ -132,6 +139,7 @@ static inline void xp_init_xskb_dma(struct xdp_buff_xsk *xskb, struct xsk_buff_p /* AF_XDP ZC drivers, via xdp_sock_buff.h */ void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq); +void xp_fill_cb(struct xsk_buff_pool *pool, struct xsk_cb_desc *desc); int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, unsigned long attrs, struct page **pages, u32 nr_pages); void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs); @@ -140,6 +148,14 @@ u32 xp_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max); bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count); void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr); dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr); + +struct xdp_desc_ctx { + dma_addr_t dma; + struct xsk_tx_metadata *meta; +}; + +struct xdp_desc_ctx xp_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr); + static inline dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb) { return xskb->dma; @@ -150,21 +166,17 @@ static inline dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb) return xskb->frame_dma; } -void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb); static inline void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb) { - xp_dma_sync_for_cpu_slow(xskb); + dma_sync_single_for_cpu(xskb->pool->dev, xskb->dma, + xskb->pool->frame_len, + DMA_BIDIRECTIONAL); } -void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma, - size_t size); static inline void xp_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, size_t size) { - if (!pool->dma_need_sync) - return; - - xp_dma_sync_for_device_slow(pool, dma, size); + dma_sync_single_for_device(pool->dev, dma, size, DMA_BIDIRECTIONAL); } /* Masks for xdp_umem_page flags. @@ -186,7 +198,7 @@ static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool, !(pool->dma_pages[addr >> PAGE_SHIFT] & XSK_NEXT_PG_CONTIG_MASK); } -static inline bool xp_mb_desc(struct xdp_desc *desc) +static inline bool xp_mb_desc(const struct xdp_desc *desc) { return desc->options & XDP_PKT_CONTD; } @@ -223,14 +235,24 @@ static inline void xp_release(struct xdp_buff_xsk *xskb) xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb; } -static inline u64 xp_get_handle(struct xdp_buff_xsk *xskb) +static inline u64 xp_get_handle(struct xdp_buff_xsk *xskb, + struct xsk_buff_pool *pool) { - u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start; + u64 orig_addr = xskb->xdp.data - pool->addrs; + u64 offset; + + if (!pool->unaligned) + return orig_addr; - offset += xskb->pool->headroom; - if (!xskb->pool->unaligned) - return xskb->orig_addr + offset; - return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); + offset = xskb->xdp.data - xskb->xdp.data_hard_start; + offset += pool->headroom; + orig_addr -= offset; + return orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); +} + +static inline bool xp_tx_metadata_enabled(const struct xsk_buff_pool *pool) +{ + return pool->tx_metadata_len > 0; } #endif /* XSK_BUFF_POOL_H_ */ |
