summaryrefslogtreecommitdiff
path: root/net/core
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-06-30 15:51:09 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2021-06-30 15:51:09 -0700
commitdbe69e43372212527abf48609aba7fc39a6daa27 (patch)
tree96cfafdf70f5325ceeac1054daf7deca339c9730 /net/core
parenta6eaf3850cb171c328a8b0db6d3c79286a1eba9d (diff)
parentb6df00789e2831fff7a2c65aa7164b2a4dcbe599 (diff)
Merge tag 'net-next-5.14' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Jakub Kicinski: "Core: - BPF: - add syscall program type and libbpf support for generating instructions and bindings for in-kernel BPF loaders (BPF loaders for BPF), this is a stepping stone for signed BPF programs - infrastructure to migrate TCP child sockets from one listener to another in the same reuseport group/map to improve flexibility of service hand-off/restart - add broadcast support to XDP redirect - allow bypass of the lockless qdisc to improving performance (for pktgen: +23% with one thread, +44% with 2 threads) - add a simpler version of "DO_ONCE()" which does not require jump labels, intended for slow-path usage - virtio/vsock: introduce SOCK_SEQPACKET support - add getsocketopt to retrieve netns cookie - ip: treat lowest address of a IPv4 subnet as ordinary unicast address allowing reclaiming of precious IPv4 addresses - ipv6: use prandom_u32() for ID generation - ip: add support for more flexible field selection for hashing across multi-path routes (w/ offload to mlxsw) - icmp: add support for extended RFC 8335 PROBE (ping) - seg6: add support for SRv6 End.DT46 behavior - mptcp: - DSS checksum support (RFC 8684) to detect middlebox meddling - support Connection-time 'C' flag - time stamping support - sctp: packetization Layer Path MTU Discovery (RFC 8899) - xfrm: speed up state addition with seq set - WiFi: - hidden AP discovery on 6 GHz and other HE 6 GHz improvements - aggregation handling improvements for some drivers - minstrel improvements for no-ack frames - deferred rate control for TXQs to improve reaction times - switch from round robin to virtual time-based airtime scheduler - add trace points: - tcp checksum errors - openvswitch - action execution, upcalls - socket errors via sk_error_report Device APIs: - devlink: add rate API for hierarchical control of max egress rate of virtual devices (VFs, SFs etc.) - don't require RCU read lock to be held around BPF hooks in NAPI context - page_pool: generic buffer recycling New hardware/drivers: - mobile: - iosm: PCIe Driver for Intel M.2 Modem - support for Qualcomm MSM8998 (ipa) - WiFi: Qualcomm QCN9074 and WCN6855 PCI devices - sparx5: Microchip SparX-5 family of Enterprise Ethernet switches - Mellanox BlueField Gigabit Ethernet (control NIC of the DPU) - NXP SJA1110 Automotive Ethernet 10-port switch - Qualcomm QCA8327 switch support (qca8k) - Mikrotik 10/25G NIC (atl1c) Driver changes: - ACPI support for some MDIO, MAC and PHY devices from Marvell and NXP (our first foray into MAC/PHY description via ACPI) - HW timestamping (PTP) support: bnxt_en, ice, sja1105, hns3, tja11xx - Mellanox/Nvidia NIC (mlx5) - NIC VF offload of L2 bridging - support IRQ distribution to Sub-functions - Marvell (prestera): - add flower and match all - devlink trap - link aggregation - Netronome (nfp): connection tracking offload - Intel 1GE (igc): add AF_XDP support - Marvell DPU (octeontx2): ingress ratelimit offload - Google vNIC (gve): new ring/descriptor format support - Qualcomm mobile (rmnet & ipa): inline checksum offload support - MediaTek WiFi (mt76) - mt7915 MSI support - mt7915 Tx status reporting - mt7915 thermal sensors support - mt7921 decapsulation offload - mt7921 enable runtime pm and deep sleep - Realtek WiFi (rtw88) - beacon filter support - Tx antenna path diversity support - firmware crash information via devcoredump - Qualcomm WiFi (wcn36xx) - Wake-on-WLAN support with magic packets and GTK rekeying - Micrel PHY (ksz886x/ksz8081): add cable test support" * tag 'net-next-5.14' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (2168 commits) tcp: change ICSK_CA_PRIV_SIZE definition tcp_yeah: check struct yeah size at compile time gve: DQO: Fix off by one in gve_rx_dqo() stmmac: intel: set PCI_D3hot in suspend stmmac: intel: Enable PHY WOL option in EHL net: stmmac: option to enable PHY WOL with PMT enabled net: say "local" instead of "static" addresses in ndo_dflt_fdb_{add,del} net: use netdev_info in ndo_dflt_fdb_{add,del} ptp: Set lookup cookie when creating a PTP PPS source. net: sock: add trace for socket errors net: sock: introduce sk_error_report net: dsa: replay the local bridge FDB entries pointing to the bridge dev too net: dsa: ensure during dsa_fdb_offload_notify that dev_hold and dev_put are on the same dev net: dsa: include fdb entries pointing to bridge in the host fdb list net: dsa: include bridge addresses which are local in the host fdb list net: dsa: sync static FDB entries on foreign interfaces to hardware net: dsa: install the host MDB and FDB entries in the master's RX filter net: dsa: reference count the FDB addresses at the cross-chip notifier level net: dsa: introduce a separate cross-chip notifier type for host FDBs net: dsa: reference count the MDB entries at the cross-chip notifier level ...
Diffstat (limited to 'net/core')
-rw-r--r--net/core/bpf_sk_storage.c3
-rw-r--r--net/core/dev.c56
-rw-r--r--net/core/devlink.c716
-rw-r--r--net/core/filter.c114
-rw-r--r--net/core/flow_dissector.c4
-rw-r--r--net/core/neighbour.c2
-rw-r--r--net/core/net-traces.c1
-rw-r--r--net/core/netpoll.c4
-rw-r--r--net/core/page_pool.c28
-rw-r--r--net/core/pktgen.c38
-rw-r--r--net/core/rtnetlink.c70
-rw-r--r--net/core/skbuff.c26
-rw-r--r--net/core/skmsg.c82
-rw-r--r--net/core/sock.c120
-rw-r--r--net/core/sock_map.c2
-rw-r--r--net/core/sock_reuseport.c366
-rw-r--r--net/core/xdp.c39
17 files changed, 1427 insertions, 244 deletions
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index cc3712ad8716..f564f82e91d9 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -524,8 +524,7 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs)
nr_maps++;
}
- diag = kzalloc(sizeof(*diag) + sizeof(diag->maps[0]) * nr_maps,
- GFP_KERNEL);
+ diag = kzalloc(struct_size(diag, maps, nr_maps), GFP_KERNEL);
if (!diag)
return ERR_PTR(-ENOMEM);
diff --git a/net/core/dev.c b/net/core/dev.c
index 2512f672bf8a..c253c2aafe97 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -148,6 +148,7 @@
#include <net/devlink.h>
#include <linux/pm_runtime.h>
#include <linux/prandom.h>
+#include <linux/once_lite.h>
#include "net-sysfs.h"
@@ -3487,13 +3488,16 @@ EXPORT_SYMBOL(__skb_gso_segment);
/* Take action when hardware reception checksum errors are detected. */
#ifdef CONFIG_BUG
+static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
+{
+ pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
+ skb_dump(KERN_ERR, skb, true);
+ dump_stack();
+}
+
void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
{
- if (net_ratelimit()) {
- pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
- skb_dump(KERN_ERR, skb, true);
- dump_stack();
- }
+ DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb);
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
@@ -3852,10 +3856,33 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
qdisc_calculate_pkt_len(skb, q);
if (q->flags & TCQ_F_NOLOCK) {
+ if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) &&
+ qdisc_run_begin(q)) {
+ /* Retest nolock_qdisc_is_empty() within the protection
+ * of q->seqlock to protect from racing with requeuing.
+ */
+ if (unlikely(!nolock_qdisc_is_empty(q))) {
+ rc = q->enqueue(skb, q, &to_free) &
+ NET_XMIT_MASK;
+ __qdisc_run(q);
+ qdisc_run_end(q);
+
+ goto no_lock_out;
+ }
+
+ qdisc_bstats_cpu_update(q, skb);
+ if (sch_direct_xmit(skb, q, dev, txq, NULL, true) &&
+ !nolock_qdisc_is_empty(q))
+ __qdisc_run(q);
+
+ qdisc_run_end(q);
+ return NET_XMIT_SUCCESS;
+ }
+
rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
- if (likely(!netif_xmit_frozen_or_stopped(txq)))
- qdisc_run(q);
+ qdisc_run(q);
+no_lock_out:
if (unlikely(to_free))
kfree_skb_list(to_free);
return rc;
@@ -5277,9 +5304,9 @@ another_round:
if (static_branch_unlikely(&generic_xdp_needed_key)) {
int ret2;
- preempt_disable();
+ migrate_disable();
ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
- preempt_enable();
+ migrate_enable();
if (ret2 != XDP_PASS) {
ret = NET_RX_DROP;
@@ -6520,11 +6547,18 @@ EXPORT_SYMBOL(napi_schedule_prep);
* __napi_schedule_irqoff - schedule for receive
* @n: entry to schedule
*
- * Variant of __napi_schedule() assuming hard irqs are masked
+ * Variant of __napi_schedule() assuming hard irqs are masked.
+ *
+ * On PREEMPT_RT enabled kernels this maps to __napi_schedule()
+ * because the interrupt disabled assumption might not be true
+ * due to force-threaded interrupts and spinlock substitution.
*/
void __napi_schedule_irqoff(struct napi_struct *n)
{
- ____napi_schedule(this_cpu_ptr(&softnet_data), n);
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ ____napi_schedule(this_cpu_ptr(&softnet_data), n);
+ else
+ __napi_schedule(n);
}
EXPORT_SYMBOL(__napi_schedule_irqoff);
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 051432ea4f69..8fdd04f00fd7 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -190,6 +190,80 @@ static struct devlink_port *devlink_port_get_from_info(struct devlink *devlink,
return devlink_port_get_from_attrs(devlink, info->attrs);
}
+static inline bool
+devlink_rate_is_leaf(struct devlink_rate *devlink_rate)
+{
+ return devlink_rate->type == DEVLINK_RATE_TYPE_LEAF;
+}
+
+static inline bool
+devlink_rate_is_node(struct devlink_rate *devlink_rate)
+{
+ return devlink_rate->type == DEVLINK_RATE_TYPE_NODE;
+}
+
+static struct devlink_rate *
+devlink_rate_leaf_get_from_info(struct devlink *devlink, struct genl_info *info)
+{
+ struct devlink_rate *devlink_rate;
+ struct devlink_port *devlink_port;
+
+ devlink_port = devlink_port_get_from_attrs(devlink, info->attrs);
+ if (IS_ERR(devlink_port))
+ return ERR_CAST(devlink_port);
+ devlink_rate = devlink_port->devlink_rate;
+ return devlink_rate ?: ERR_PTR(-ENODEV);
+}
+
+static struct devlink_rate *
+devlink_rate_node_get_by_name(struct devlink *devlink, const char *node_name)
+{
+ static struct devlink_rate *devlink_rate;
+
+ list_for_each_entry(devlink_rate, &devlink->rate_list, list) {
+ if (devlink_rate_is_node(devlink_rate) &&
+ !strcmp(node_name, devlink_rate->name))
+ return devlink_rate;
+ }
+ return ERR_PTR(-ENODEV);
+}
+
+static struct devlink_rate *
+devlink_rate_node_get_from_attrs(struct devlink *devlink, struct nlattr **attrs)
+{
+ const char *rate_node_name;
+ size_t len;
+
+ if (!attrs[DEVLINK_ATTR_RATE_NODE_NAME])
+ return ERR_PTR(-EINVAL);
+ rate_node_name = nla_data(attrs[DEVLINK_ATTR_RATE_NODE_NAME]);
+ len = strlen(rate_node_name);
+ /* Name cannot be empty or decimal number */
+ if (!len || strspn(rate_node_name, "0123456789") == len)
+ return ERR_PTR(-EINVAL);
+
+ return devlink_rate_node_get_by_name(devlink, rate_node_name);
+}
+
+static struct devlink_rate *
+devlink_rate_node_get_from_info(struct devlink *devlink, struct genl_info *info)
+{
+ return devlink_rate_node_get_from_attrs(devlink, info->attrs);
+}
+
+static struct devlink_rate *
+devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info)
+{
+ struct nlattr **attrs = info->attrs;
+
+ if (attrs[DEVLINK_ATTR_PORT_INDEX])
+ return devlink_rate_leaf_get_from_info(devlink, info);
+ else if (attrs[DEVLINK_ATTR_RATE_NODE_NAME])
+ return devlink_rate_node_get_from_info(devlink, info);
+ else
+ return ERR_PTR(-EINVAL);
+}
+
struct devlink_sb {
struct list_head list;
unsigned int index;
@@ -408,12 +482,14 @@ devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id)
#define DEVLINK_NL_FLAG_NEED_PORT BIT(0)
#define DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT BIT(1)
+#define DEVLINK_NL_FLAG_NEED_RATE BIT(2)
+#define DEVLINK_NL_FLAG_NEED_RATE_NODE BIT(3)
/* The per devlink instance lock is taken by default in the pre-doit
* operation, yet several commands do not require this. The global
* devlink lock is taken and protects from disruption by user-calls.
*/
-#define DEVLINK_NL_FLAG_NO_LOCK BIT(2)
+#define DEVLINK_NL_FLAG_NO_LOCK BIT(4)
static int devlink_nl_pre_doit(const struct genl_ops *ops,
struct sk_buff *skb, struct genl_info *info)
@@ -442,6 +518,24 @@ static int devlink_nl_pre_doit(const struct genl_ops *ops,
devlink_port = devlink_port_get_from_info(devlink, info);
if (!IS_ERR(devlink_port))
info->user_ptr[1] = devlink_port;
+ } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_RATE) {
+ struct devlink_rate *devlink_rate;
+
+ devlink_rate = devlink_rate_get_from_info(devlink, info);
+ if (IS_ERR(devlink_rate)) {
+ err = PTR_ERR(devlink_rate);
+ goto unlock;
+ }
+ info->user_ptr[1] = devlink_rate;
+ } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_RATE_NODE) {
+ struct devlink_rate *rate_node;
+
+ rate_node = devlink_rate_node_get_from_info(devlink, info);
+ if (IS_ERR(rate_node)) {
+ err = PTR_ERR(rate_node);
+ goto unlock;
+ }
+ info->user_ptr[1] = rate_node;
}
return 0;
@@ -748,6 +842,56 @@ devlink_port_fn_hw_addr_fill(struct devlink *devlink, const struct devlink_ops *
return 0;
}
+static int devlink_nl_rate_fill(struct sk_buff *msg,
+ struct devlink *devlink,
+ struct devlink_rate *devlink_rate,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags,
+ struct netlink_ext_ack *extack)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+
+ if (nla_put_u16(msg, DEVLINK_ATTR_RATE_TYPE, devlink_rate->type))
+ goto nla_put_failure;
+
+ if (devlink_rate_is_leaf(devlink_rate)) {
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX,
+ devlink_rate->devlink_port->index))
+ goto nla_put_failure;
+ } else if (devlink_rate_is_node(devlink_rate)) {
+ if (nla_put_string(msg, DEVLINK_ATTR_RATE_NODE_NAME,
+ devlink_rate->name))
+ goto nla_put_failure;
+ }
+
+ if (nla_put_u64_64bit(msg, DEVLINK_ATTR_RATE_TX_SHARE,
+ devlink_rate->tx_share, DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(msg, DEVLINK_ATTR_RATE_TX_MAX,
+ devlink_rate->tx_max, DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+
+ if (devlink_rate->parent)
+ if (nla_put_string(msg, DEVLINK_ATTR_RATE_PARENT_NODE_NAME,
+ devlink_rate->parent->name))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
static bool
devlink_port_fn_state_valid(enum devlink_port_fn_state state)
{
@@ -919,6 +1063,111 @@ static void devlink_port_notify(struct devlink_port *devlink_port,
msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
}
+static void devlink_rate_notify(struct devlink_rate *devlink_rate,
+ enum devlink_command cmd)
+{
+ struct devlink *devlink = devlink_rate->devlink;
+ struct sk_buff *msg;
+ int err;
+
+ WARN_ON(cmd != DEVLINK_CMD_RATE_NEW &&
+ cmd != DEVLINK_CMD_RATE_DEL);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ err = devlink_nl_rate_fill(msg, devlink, devlink_rate,
+ cmd, 0, 0, 0, NULL);
+ if (err) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+ msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+}
+
+static int devlink_nl_cmd_rate_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct devlink_rate *devlink_rate;
+ struct devlink *devlink;
+ int start = cb->args[0];
+ int idx = 0;
+ int err = 0;
+
+ mutex_lock(&devlink_mutex);
+ list_for_each_entry(devlink, &devlink_list, list) {
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ continue;
+ mutex_lock(&devlink->lock);
+ list_for_each_entry(devlink_rate, &devlink->rate_list, list) {
+ enum devlink_command cmd = DEVLINK_CMD_RATE_NEW;
+ u32 id = NETLINK_CB(cb->skb).portid;
+
+ if (idx < start) {
+ idx++;
+ continue;
+ }
+ err = devlink_nl_rate_fill(msg, devlink,
+ devlink_rate,
+ cmd, id,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI, NULL);
+ if (err) {
+ mutex_unlock(&devlink->lock);
+ goto out;
+ }
+ idx++;
+ }
+ mutex_unlock(&devlink->lock);
+ }
+out:
+ mutex_unlock(&devlink_mutex);
+ if (err != -EMSGSIZE)
+ return err;
+
+ cb->args[0] = idx;
+ return msg->len;
+}
+
+static int devlink_nl_cmd_rate_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_rate *devlink_rate = info->user_ptr[1];
+ struct devlink *devlink = devlink_rate->devlink;
+ struct sk_buff *msg;
+ int err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_rate_fill(msg, devlink, devlink_rate,
+ DEVLINK_CMD_RATE_NEW,
+ info->snd_portid, info->snd_seq, 0,
+ info->extack);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static bool
+devlink_rate_is_parent_node(struct devlink_rate *devlink_rate,
+ struct devlink_rate *parent)
+{
+ while (parent) {
+ if (parent == devlink_rate)
+ return true;
+ parent = parent->parent;
+ }
+ return false;
+}
+
static int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
@@ -1339,6 +1588,255 @@ static int devlink_nl_cmd_port_del_doit(struct sk_buff *skb,
return devlink->ops->port_del(devlink, port_index, extack);
}
+static int
+devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate,
+ struct genl_info *info,
+ struct nlattr *nla_parent)
+{
+ struct devlink *devlink = devlink_rate->devlink;
+ const char *parent_name = nla_data(nla_parent);
+ const struct devlink_ops *ops = devlink->ops;
+ size_t len = strlen(parent_name);
+ struct devlink_rate *parent;
+ int err = -EOPNOTSUPP;
+
+ parent = devlink_rate->parent;
+ if (parent && len) {
+ NL_SET_ERR_MSG_MOD(info->extack, "Rate object already has parent.");
+ return -EBUSY;
+ } else if (parent && !len) {
+ if (devlink_rate_is_leaf(devlink_rate))
+ err = ops->rate_leaf_parent_set(devlink_rate, NULL,
+ devlink_rate->priv, NULL,
+ info->extack);
+ else if (devlink_rate_is_node(devlink_rate))
+ err = ops->rate_node_parent_set(devlink_rate, NULL,
+ devlink_rate->priv, NULL,
+ info->extack);
+ if (err)
+ return err;
+
+ refcount_dec(&parent->refcnt);
+ devlink_rate->parent = NULL;
+ } else if (!parent && len) {
+ parent = devlink_rate_node_get_by_name(devlink, parent_name);
+ if (IS_ERR(parent))
+ return -ENODEV;
+
+ if (parent == devlink_rate) {
+ NL_SET_ERR_MSG_MOD(info->extack, "Parent to self is not allowed");
+ return -EINVAL;
+ }
+
+ if (devlink_rate_is_node(devlink_rate) &&
+ devlink_rate_is_parent_node(devlink_rate, parent->parent)) {
+ NL_SET_ERR_MSG_MOD(info->extack, "Node is already a parent of parent node.");
+ return -EEXIST;
+ }
+
+ if (devlink_rate_is_leaf(devlink_rate))
+ err = ops->rate_leaf_parent_set(devlink_rate, parent,
+ devlink_rate->priv, parent->priv,
+ info->extack);
+ else if (devlink_rate_is_node(devlink_rate))
+ err = ops->rate_node_parent_set(devlink_rate, parent,
+ devlink_rate->priv, parent->priv,
+ info->extack);
+ if (err)
+ return err;
+
+ refcount_inc(&parent->refcnt);
+ devlink_rate->parent = parent;
+ }
+
+ return 0;
+}
+
+static int devlink_nl_rate_set(struct devlink_rate *devlink_rate,
+ const struct devlink_ops *ops,
+ struct genl_info *info)
+{
+ struct nlattr *nla_parent, **attrs = info->attrs;
+ int err = -EOPNOTSUPP;
+ u64 rate;
+
+ if (attrs[DEVLINK_ATTR_RATE_TX_SHARE]) {
+ rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_SHARE]);
+ if (devlink_rate_is_leaf(devlink_rate))
+ err = ops->rate_leaf_tx_share_set(devlink_rate, devlink_rate->priv,
+ rate, info->extack);
+ else if (devlink_rate_is_node(devlink_rate))
+ err = ops->rate_node_tx_share_set(devlink_rate, devlink_rate->priv,
+ rate, info->extack);
+ if (err)
+ return err;
+ devlink_rate->tx_share = rate;
+ }
+
+ if (attrs[DEVLINK_ATTR_RATE_TX_MAX]) {
+ rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_MAX]);
+ if (devlink_rate_is_leaf(devlink_rate))
+ err = ops->rate_leaf_tx_max_set(devlink_rate, devlink_rate->priv,
+ rate, info->extack);
+ else if (devlink_rate_is_node(devlink_rate))
+ err = ops->rate_node_tx_max_set(devlink_rate, devlink_rate->priv,
+ rate, info->extack);
+ if (err)
+ return err;
+ devlink_rate->tx_max = rate;
+ }
+
+ nla_parent = attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME];
+ if (nla_parent) {
+ err = devlink_nl_rate_parent_node_set(devlink_rate, info,
+ nla_parent);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops,
+ struct genl_info *info,
+ enum devlink_rate_type type)
+{
+ struct nlattr **attrs = info->attrs;
+
+ if (type == DEVLINK_RATE_TYPE_LEAF) {
+ if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_leaf_tx_share_set) {
+ NL_SET_ERR_MSG_MOD(info->extack, "TX share set isn't supported for the leafs");
+ return false;
+ }
+ if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_leaf_tx_max_set) {
+ NL_SET_ERR_MSG_MOD(info->extack, "TX max set isn't supported for the leafs");
+ return false;
+ }
+ if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] &&
+ !ops->rate_leaf_parent_set) {
+ NL_SET_ERR_MSG_MOD(info->extack, "Parent set isn't supported for the leafs");
+ return false;
+ }
+ } else if (type == DEVLINK_RATE_TYPE_NODE) {
+ if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) {
+ NL_SET_ERR_MSG_MOD(info->extack, "TX share set isn't supported for the nodes");
+ return false;
+ }
+ if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_node_tx_max_set) {
+ NL_SET_ERR_MSG_MOD(info->extack, "TX max set isn't supported for the nodes");
+ return false;
+ }
+ if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] &&
+ !ops->rate_node_parent_set) {
+ NL_SET_ERR_MSG_MOD(info->extack, "Parent set isn't supported for the nodes");
+ return false;
+ }
+ } else {
+ WARN(1, "Unknown type of rate object");
+ return false;
+ }
+
+ return true;
+}
+
+static int devlink_nl_cmd_rate_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_rate *devlink_rate = info->user_ptr[1];
+ struct devlink *devlink = devlink_rate->devlink;
+ const struct devlink_ops *ops = devlink->ops;
+ int err;
+
+ if (!ops || !devlink_rate_set_ops_supported(ops, info, devlink_rate->type))
+ return -EOPNOTSUPP;
+
+ err = devlink_nl_rate_set(devlink_rate, ops, info);
+
+ if (!err)
+ devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW);
+ return err;
+}
+
+static int devlink_nl_cmd_rate_new_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_rate *rate_node;
+ const struct devlink_ops *ops;
+ int err;
+
+ ops = devlink->ops;
+ if (!ops || !ops->rate_node_new || !ops->rate_node_del) {
+ NL_SET_ERR_MSG_MOD(info->extack, "Rate nodes aren't supported");
+ return -EOPNOTSUPP;
+ }
+
+ if (!devlink_rate_set_ops_supported(ops, info, DEVLINK_RATE_TYPE_NODE))
+ return -EOPNOTSUPP;
+
+ rate_node = devlink_rate_node_get_from_attrs(devlink, info->attrs);
+ if (!IS_ERR(rate_node))
+ return -EEXIST;
+ else if (rate_node == ERR_PTR(-EINVAL))
+ return -EINVAL;
+
+ rate_node = kzalloc(sizeof(*rate_node), GFP_KERNEL);
+ if (!rate_node)
+ return -ENOMEM;
+
+ rate_node->devlink = devlink;
+ rate_node->type = DEVLINK_RATE_TYPE_NODE;
+ rate_node->name = nla_strdup(info->attrs[DEVLINK_ATTR_RATE_NODE_NAME], GFP_KERNEL);
+ if (!rate_node->name) {
+ err = -ENOMEM;
+ goto err_strdup;
+ }
+
+ err = ops->rate_node_new(rate_node, &rate_node->priv, info->extack);
+ if (err)
+ goto err_node_new;
+
+ err = devlink_nl_rate_set(rate_node, ops, info);
+ if (err)
+ goto err_rate_set;
+
+ refcount_set(&rate_node->refcnt, 1);
+ list_add(&rate_node->list, &devlink->rate_list);
+ devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW);
+ return 0;
+
+err_rate_set:
+ ops->rate_node_del(rate_node, rate_node->priv, info->extack);
+err_node_new:
+ kfree(rate_node->name);
+err_strdup:
+ kfree(rate_node);
+ return err;
+}
+
+static int devlink_nl_cmd_rate_del_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_rate *rate_node = info->user_ptr[1];
+ struct devlink *devlink = rate_node->devlink;
+ const struct devlink_ops *ops = devlink->ops;
+ int err;
+
+ if (refcount_read(&rate_node->refcnt) > 1) {
+ NL_SET_ERR_MSG_MOD(info->extack, "Node has children. Cannot delete node.");
+ return -EBUSY;
+ }
+
+ devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL);
+ err = ops->rate_node_del(rate_node, rate_node->priv, info->extack);
+ if (rate_node->parent)
+ refcount_dec(&rate_node->parent->refcnt);
+ list_del(&rate_node->list);
+ kfree(rate_node->name);
+ kfree(rate_node);
+ return err;
+}
+
static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink,
struct devlink_sb *devlink_sb,
enum devlink_command cmd, u32 portid,
@@ -2207,6 +2705,23 @@ static int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb,
return genlmsg_reply(msg, info);
}
+static int devlink_rate_nodes_check(struct devlink *devlink, u16 mode,
+ struct netlink_ext_ack *extack)
+{
+ struct devlink_rate *devlink_rate;
+
+ /* Take the lock to sync with devlink_rate_nodes_destroy() */
+ mutex_lock(&devlink->lock);
+ list_for_each_entry(devlink_rate, &devlink->rate_list, list)
+ if (devlink_rate_is_node(devlink_rate)) {
+ mutex_unlock(&devlink->lock);
+ NL_SET_ERR_MSG_MOD(extack, "Rate node(s) exists.");
+ return -EBUSY;
+ }
+ mutex_unlock(&devlink->lock);
+ return 0;
+}
+
static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb,
struct genl_info *info)
{
@@ -2221,6 +2736,9 @@ static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb,
if (!ops->eswitch_mode_set)
return -EOPNOTSUPP;
mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]);
+ err = devlink_rate_nodes_check(devlink, mode, info->extack);
+ if (err)
+ return err;
err = ops->eswitch_mode_set(devlink, mode, info->extack);
if (err)
return err;
@@ -6994,8 +7512,9 @@ static void devlink_trap_stats_read(struct devlink_stats __percpu *trap_stats,
}
}
-static int devlink_trap_stats_put(struct sk_buff *msg,
- struct devlink_stats __percpu *trap_stats)
+static int
+devlink_trap_group_stats_put(struct sk_buff *msg,
+ struct devlink_stats __percpu *trap_stats)
{
struct devlink_stats stats;
struct nlattr *attr;
@@ -7023,6 +7542,50 @@ nla_put_failure:
return -EMSGSIZE;
}
+static int devlink_trap_stats_put(struct sk_buff *msg, struct devlink *devlink,
+ const struct devlink_trap_item *trap_item)
+{
+ struct devlink_stats stats;
+ struct nlattr *attr;
+ u64 drops = 0;
+ int err;
+
+ if (devlink->ops->trap_drop_counter_get) {
+ err = devlink->ops->trap_drop_counter_get(devlink,
+ trap_item->trap,
+ &drops);
+ if (err)
+ return err;
+ }
+
+ devlink_trap_stats_read(trap_item->stats, &stats);
+
+ attr = nla_nest_start(msg, DEVLINK_ATTR_STATS);
+ if (!attr)
+ return -EMSGSIZE;
+
+ if (devlink->ops->trap_drop_counter_get &&
+ nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_DROPPED, drops,
+ DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_PACKETS,
+ stats.rx_packets, DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_BYTES,
+ stats.rx_bytes, DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, attr);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, attr);
+ return -EMSGSIZE;
+}
+
static int devlink_nl_trap_fill(struct sk_buff *msg, struct devlink *devlink,
const struct devlink_trap_item *trap_item,
enum devlink_command cmd, u32 portid, u32 seq,
@@ -7060,7 +7623,7 @@ static int devlink_nl_trap_fill(struct sk_buff *msg, struct devlink *devlink,
if (err)
goto nla_put_failure;
- err = devlink_trap_stats_put(msg, trap_item->stats);
+ err = devlink_trap_stats_put(msg, devlink, trap_item);
if (err)
goto nla_put_failure;
@@ -7277,7 +7840,7 @@ devlink_nl_trap_group_fill(struct sk_buff *msg, struct devlink *devlink,
group_item->policer_item->policer->id))
goto nla_put_failure;
- err = devlink_trap_stats_put(msg, group_item->stats);
+ err = devlink_trap_group_stats_put(msg, group_item->stats);
if (err)
goto nla_put_failure;
@@ -7801,6 +8364,11 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_PORT_PCI_PF_NUMBER] = { .type = NLA_U16 },
[DEVLINK_ATTR_PORT_PCI_SF_NUMBER] = { .type = NLA_U32 },
[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER] = { .type = NLA_U32 },
+ [DEVLINK_ATTR_RATE_TYPE] = { .type = NLA_U16 },
+ [DEVLINK_ATTR_RATE_TX_SHARE] = { .type = NLA_U64 },
+ [DEVLINK_ATTR_RATE_TX_MAX] = { .type = NLA_U64 },
+ [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING },
+ [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING },
};
static const struct genl_small_ops devlink_nl_ops[] = {
@@ -7827,6 +8395,30 @@ static const struct genl_small_ops devlink_nl_ops[] = {
.internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
},
{
+ .cmd = DEVLINK_CMD_RATE_GET,
+ .doit = devlink_nl_cmd_rate_get_doit,
+ .dumpit = devlink_nl_cmd_rate_get_dumpit,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_RATE,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_RATE_SET,
+ .doit = devlink_nl_cmd_rate_set_doit,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_RATE,
+ },
+ {
+ .cmd = DEVLINK_CMD_RATE_NEW,
+ .doit = devlink_nl_cmd_rate_new_doit,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = DEVLINK_CMD_RATE_DEL,
+ .doit = devlink_nl_cmd_rate_del_doit,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_RATE_NODE,
+ },
+ {
.cmd = DEVLINK_CMD_PORT_SPLIT,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_port_split_doit,
@@ -8201,6 +8793,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC);
__devlink_net_set(devlink, &init_net);
INIT_LIST_HEAD(&devlink->port_list);
+ INIT_LIST_HEAD(&devlink->rate_list);
INIT_LIST_HEAD(&devlink->sb_list);
INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list);
INIT_LIST_HEAD(&devlink->resource_list);
@@ -8303,6 +8896,7 @@ void devlink_free(struct devlink *devlink)
WARN_ON(!list_empty(&devlink->resource_list));
WARN_ON(!list_empty(&devlink->dpipe_table_list));
WARN_ON(!list_empty(&devlink->sb_list));
+ WARN_ON(!list_empty(&devlink->rate_list));
WARN_ON(!list_empty(&devlink->port_list));
xa_destroy(&devlink->snapshot_ids);
@@ -8619,6 +9213,110 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro
}
EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set);
+/**
+ * devlink_rate_leaf_create - create devlink rate leaf
+ *
+ * @devlink_port: devlink port object to create rate object on
+ * @priv: driver private data
+ *
+ * Create devlink rate object of type leaf on provided @devlink_port.
+ * Throws call trace if @devlink_port already has a devlink rate object.
+ *
+ * Context: Takes and release devlink->lock <mutex>.
+ *
+ * Return: -ENOMEM if failed to allocate rate object, 0 otherwise.
+ */
+int
+devlink_rate_leaf_create(struct devlink_port *devlink_port, void *priv)
+{
+ struct devlink *devlink = devlink_port->devlink;
+ struct devlink_rate *devlink_rate;
+
+ devlink_rate = kzalloc(sizeof(*devlink_rate), GFP_KERNEL);
+ if (!devlink_rate)
+ return -ENOMEM;
+
+ mutex_lock(&devlink->lock);
+ WARN_ON(devlink_port->devlink_rate);
+ devlink_rate->type = DEVLINK_RATE_TYPE_LEAF;
+ devlink_rate->devlink = devlink;
+ devlink_rate->devlink_port = devlink_port;
+ devlink_rate->priv = priv;
+ list_add_tail(&devlink_rate->list, &devlink->rate_list);
+ devlink_port->devlink_rate = devlink_rate;
+ devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW);
+ mutex_unlock(&devlink->lock);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_rate_leaf_create);
+
+/**
+ * devlink_rate_leaf_destroy - destroy devlink rate leaf
+ *
+ * @devlink_port: devlink port linked to the rate object
+ *
+ * Context: Takes and release devlink->lock <mutex>.
+ */
+void devlink_rate_leaf_destroy(struct devlink_port *devlink_port)
+{
+ struct devlink_rate *devlink_rate = devlink_port->devlink_rate;
+ struct devlink *devlink = devlink_port->devlink;
+
+ if (!devlink_rate)
+ return;
+
+ mutex_lock(&devlink->lock);
+ devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_DEL);
+ if (devlink_rate->parent)
+ refcount_dec(&devlink_rate->parent->refcnt);
+ list_del(&devlink_rate->list);
+ devlink_port->devlink_rate = NULL;
+ mutex_unlock(&devlink->lock);
+ kfree(devlink_rate);
+}
+EXPORT_SYMBOL_GPL(devlink_rate_leaf_destroy);
+
+/**
+ * devlink_rate_nodes_destroy - destroy all devlink rate nodes on device
+ *
+ * @devlink: devlink instance
+ *
+ * Unset parent for all rate objects and destroy all rate nodes
+ * on specified device.
+ *
+ * Context: Takes and release devlink->lock <mutex>.
+ */
+void devlink_rate_nodes_destroy(struct devlink *devlink)
+{
+ static struct devlink_rate *devlink_rate, *tmp;
+ const struct devlink_ops *ops = devlink->ops;
+
+ mutex_lock(&devlink->lock);
+ list_for_each_entry(devlink_rate, &devlink->rate_list, list) {
+ if (!devlink_rate->parent)
+ continue;
+
+ refcount_dec(&devlink_rate->parent->refcnt);
+ if (devlink_rate_is_leaf(devlink_rate))
+ ops->rate_leaf_parent_set(devlink_rate, NULL, devlink_rate->priv,
+ NULL, NULL);
+ else if (devlink_rate_is_node(devlink_rate))
+ ops->rate_node_parent_set(devlink_rate, NULL, devlink_rate->priv,
+ NULL, NULL);
+ }
+ list_for_each_entry_safe(devlink_rate, tmp, &devlink->rate_list, list) {
+ if (devlink_rate_is_node(devlink_rate)) {
+ ops->rate_node_del(devlink_rate, devlink_rate->priv, NULL);
+ list_del(&devlink_rate->list);
+ kfree(devlink_rate->name);
+ kfree(devlink_rate);
+ }
+ }
+ mutex_unlock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devlink_rate_nodes_destroy);
+
static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port,
char *name, size_t len)
{
@@ -8630,12 +9328,18 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port,
switch (attrs->flavour) {
case DEVLINK_PORT_FLAVOUR_PHYSICAL:
+ case DEVLINK_PORT_FLAVOUR_VIRTUAL:
+ n = snprintf(name, len, "p%u", attrs->phys.port_number);
+ if (n < len && attrs->split)
+ n += snprintf(name + n, len - n, "s%u",
+ attrs->phys.split_subport_number);
if (!attrs->split)
n = snprintf(name, len, "p%u", attrs->phys.port_number);
else
n = snprintf(name, len, "p%us%u",
attrs->phys.port_number,
attrs->phys.split_subport_number);
+
break;
case DEVLINK_PORT_FLAVOUR_CPU:
case DEVLINK_PORT_FLAVOUR_DSA:
@@ -8677,8 +9381,6 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port,
n = snprintf(name, len, "pf%usf%u", attrs->pci_sf.pf,
attrs->pci_sf.sf);
break;
- case DEVLINK_PORT_FLAVOUR_VIRTUAL:
- return -EOPNOTSUPP;
}
if (n >= len)
diff --git a/net/core/filter.c b/net/core/filter.c
index d81352ca1b5c..d70187ce851b 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3241,9 +3241,6 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
u32 off = skb_mac_header_len(skb);
int ret;
- if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
- return -ENOTSUPP;
-
ret = skb_cow(skb, len_diff);
if (unlikely(ret < 0))
return ret;
@@ -3255,19 +3252,11 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
if (skb_is_gso(skb)) {
struct skb_shared_info *shinfo = skb_shinfo(skb);
- /* SKB_GSO_TCPV4 needs to be changed into
- * SKB_GSO_TCPV6.
- */
+ /* SKB_GSO_TCPV4 needs to be changed into SKB_GSO_TCPV6. */
if (shinfo->gso_type & SKB_GSO_TCPV4) {
shinfo->gso_type &= ~SKB_GSO_TCPV4;
shinfo->gso_type |= SKB_GSO_TCPV6;
}
-
- /* Due to IPv6 header, MSS needs to be downgraded. */
- skb_decrease_gso_size(shinfo, len_diff);
- /* Header must be checked, and gso_segs recomputed. */
- shinfo->gso_type |= SKB_GSO_DODGY;
- shinfo->gso_segs = 0;
}
skb->protocol = htons(ETH_P_IPV6);
@@ -3282,9 +3271,6 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
u32 off = skb_mac_header_len(skb);
int ret;
- if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
- return -ENOTSUPP;
-
ret = skb_unclone(skb, GFP_ATOMIC);
if (unlikely(ret < 0))
return ret;
@@ -3296,19 +3282,11 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
if (skb_is_gso(skb)) {
struct skb_shared_info *shinfo = skb_shinfo(skb);
- /* SKB_GSO_TCPV6 needs to be changed into
- * SKB_GSO_TCPV4.
- */
+ /* SKB_GSO_TCPV6 needs to be changed into SKB_GSO_TCPV4. */
if (shinfo->gso_type & SKB_GSO_TCPV6) {
shinfo->gso_type &= ~SKB_GSO_TCPV6;
shinfo->gso_type |= SKB_GSO_TCPV4;
}
-
- /* Due to IPv4 header, MSS can be upgraded. */
- skb_increase_gso_size(shinfo, len_diff);
- /* Header must be checked, and gso_segs recomputed. */
- shinfo->gso_type |= SKB_GSO_DODGY;
- shinfo->gso_segs = 0;
}
skb->protocol = htons(ETH_P_IP);
@@ -3919,6 +3897,34 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
.arg2_type = ARG_ANYTHING,
};
+/* XDP_REDIRECT works by a three-step process, implemented in the functions
+ * below:
+ *
+ * 1. The bpf_redirect() and bpf_redirect_map() helpers will lookup the target
+ * of the redirect and store it (along with some other metadata) in a per-CPU
+ * struct bpf_redirect_info.
+ *
+ * 2. When the program returns the XDP_REDIRECT return code, the driver will
+ * call xdp_do_redirect() which will use the information in struct
+ * bpf_redirect_info to actually enqueue the frame into a map type-specific
+ * bulk queue structure.
+ *
+ * 3. Before exiting its NAPI poll loop, the driver will call xdp_do_flush(),
+ * which will flush all the different bulk queues, thus completing the
+ * redirect.
+ *
+ * Pointers to the map entries will be kept around for this whole sequence of
+ * steps, protected by RCU. However, there is no top-level rcu_read_lock() in
+ * the core code; instead, the RCU protection relies on everything happening
+ * inside a single NAPI poll sequence, which means it's between a pair of calls
+ * to local_bh_disable()/local_bh_enable().
+ *
+ * The map entries are marked as __rcu and the map code makes sure to
+ * dereference those pointers with rcu_dereference_check() in a way that works
+ * for both sections that to hold an rcu_read_lock() and sections that are
+ * called from NAPI without a separate rcu_read_lock(). The code below does not
+ * use RCU annotations, but relies on those in the map code.
+ */
void xdp_do_flush(void)
{
__dev_flush();
@@ -3927,6 +3933,23 @@ void xdp_do_flush(void)
}
EXPORT_SYMBOL_GPL(xdp_do_flush);
+void bpf_clear_redirect_map(struct bpf_map *map)
+{
+ struct bpf_redirect_info *ri;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ ri = per_cpu_ptr(&bpf_redirect_info, cpu);
+ /* Avoid polluting remote cacheline due to writes if
+ * not needed. Once we pass this test, we need the
+ * cmpxchg() to make sure it hasn't been changed in
+ * the meantime by remote CPU.
+ */
+ if (unlikely(READ_ONCE(ri->map) == map))
+ cmpxchg(&ri->map, map, NULL);
+ }
+}
+
int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
struct bpf_prog *xdp_prog)
{
@@ -3934,6 +3957,7 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
enum bpf_map_type map_type = ri->map_type;
void *fwd = ri->tgt_value;
u32 map_id = ri->map_id;
+ struct bpf_map *map;
int err;
ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
@@ -3943,7 +3967,14 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
case BPF_MAP_TYPE_DEVMAP:
fallthrough;
case BPF_MAP_TYPE_DEVMAP_HASH:
- err = dev_map_enqueue(fwd, xdp, dev);
+ map = READ_ONCE(ri->map);
+ if (unlikely(map)) {
+ WRITE_ONCE(ri->map, NULL);
+ err = dev_map_enqueue_multi(xdp, dev, map,
+ ri->flags & BPF_F_EXCLUDE_INGRESS);
+ } else {
+ err = dev_map_enqueue(fwd, xdp, dev);
+ }
break;
case BPF_MAP_TYPE_CPUMAP:
err = cpu_map_enqueue(fwd, xdp, dev);
@@ -3985,13 +4016,21 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
enum bpf_map_type map_type, u32 map_id)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_map *map;
int err;
switch (map_type) {
case BPF_MAP_TYPE_DEVMAP:
fallthrough;
case BPF_MAP_TYPE_DEVMAP_HASH:
- err = dev_map_generic_redirect(fwd, skb, xdp_prog);
+ map = READ_ONCE(ri->map);
+ if (unlikely(map)) {
+ WRITE_ONCE(ri->map, NULL);
+ err = dev_map_redirect_multi(dev, skb, xdp_prog, map,
+ ri->flags & BPF_F_EXCLUDE_INGRESS);
+ } else {
+ err = dev_map_generic_redirect(fwd, skb, xdp_prog);
+ }
if (unlikely(err))
goto err;
break;
@@ -10008,11 +10047,13 @@ out:
static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
struct sock_reuseport *reuse,
struct sock *sk, struct sk_buff *skb,
+ struct sock *migrating_sk,
u32 hash)
{
reuse_kern->skb = skb;
reuse_kern->sk = sk;
reuse_kern->selected_sk = NULL;
+ reuse_kern->migrating_sk = migrating_sk;
reuse_kern->data_end = skb->data + skb_headlen(skb);
reuse_kern->hash = hash;
reuse_kern->reuseport_id = reuse->reuseport_id;
@@ -10021,12 +10062,13 @@ static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
struct bpf_prog *prog, struct sk_buff *skb,
+ struct sock *migrating_sk,
u32 hash)
{
struct sk_reuseport_kern reuse_kern;
enum sk_action action;
- bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash);
+ bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, migrating_sk, hash);
action = BPF_PROG_RUN(prog, &reuse_kern);
if (action == SK_PASS)
@@ -10136,6 +10178,8 @@ sk_reuseport_func_proto(enum bpf_func_id func_id,
return &sk_reuseport_load_bytes_proto;
case BPF_FUNC_skb_load_bytes_relative:
return &sk_reuseport_load_bytes_relative_proto;
+ case BPF_FUNC_get_socket_cookie:
+ return &bpf_get_socket_ptr_cookie_proto;
default:
return bpf_base_func_proto(func_id);
}
@@ -10165,6 +10209,14 @@ sk_reuseport_is_valid_access(int off, int size,
case offsetof(struct sk_reuseport_md, hash):
return size == size_default;
+ case offsetof(struct sk_reuseport_md, sk):
+ info->reg_type = PTR_TO_SOCKET;
+ return size == sizeof(__u64);
+
+ case offsetof(struct sk_reuseport_md, migrating_sk):
+ info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL;
+ return size == sizeof(__u64);
+
/* Fields that allow narrowing */
case bpf_ctx_range(struct sk_reuseport_md, eth_protocol):
if (size < sizeof_field(struct sk_buff, protocol))
@@ -10237,6 +10289,14 @@ static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct sk_reuseport_md, bind_inany):
SK_REUSEPORT_LOAD_FIELD(bind_inany);
break;
+
+ case offsetof(struct sk_reuseport_md, sk):
+ SK_REUSEPORT_LOAD_FIELD(sk);
+ break;
+
+ case offsetof(struct sk_reuseport_md, migrating_sk):
+ SK_REUSEPORT_LOAD_FIELD(migrating_sk);
+ break;
}
return insn - insn_buf;
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 3ed7c98a98e1..2aadbfc5193b 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -943,8 +943,8 @@ bool __skb_flow_dissect(const struct net *net,
int offset = 0;
ops = skb->dev->dsa_ptr->tag_ops;
- /* Tail taggers don't break flow dissection */
- if (!ops->tail_tag) {
+ /* Only DSA header taggers break flow dissection */
+ if (ops->needed_headroom) {
if (ops->flow_dissect)
ops->flow_dissect(skb, &proto, &offset);
else
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index bf774575ad71..53e85c70c6e5 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -3142,7 +3142,7 @@ static struct pneigh_entry *pneigh_get_first(struct seq_file *seq)
struct net *net = seq_file_net(seq);
struct neigh_table *tbl = state->tbl;
struct pneigh_entry *pn = NULL;
- int bucket = state->bucket;
+ int bucket;
state->flags |= NEIGH_SEQ_IS_PNEIGH;
for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) {
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index 283ddb2dbc7d..c40cd8dd75c7 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -60,3 +60,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll);
EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_send_reset);
+EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_bad_csum);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index c310c7c1cef7..0a6b04714558 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -36,6 +36,7 @@
#include <net/ip6_checksum.h>
#include <asm/unaligned.h>
#include <trace/events/napi.h>
+#include <linux/kconfig.h>
/*
* We maintain a small pool of fully-sized skbs, to make sure the
@@ -389,7 +390,8 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
static atomic_t ip_ident;
struct ipv6hdr *ip6h;
- WARN_ON_ONCE(!irqs_disabled());
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ WARN_ON_ONCE(!irqs_disabled());
udp_len = len + sizeof(*udph);
if (np->ipv6)
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 3c4c4c7a0402..5e4eb45b139c 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -17,6 +17,7 @@
#include <linux/dma-mapping.h>
#include <linux/page-flags.h>
#include <linux/mm.h> /* for __put_page() */
+#include <linux/poison.h>
#include <trace/events/page_pool.h>
@@ -221,6 +222,8 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
return NULL;
}
+ page->pp_magic |= PP_SIGNATURE;
+
/* Track how many pages are held 'in-flight' */
pool->pages_state_hold_cnt++;
trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
@@ -263,6 +266,7 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
put_page(page);
continue;
}
+ page->pp_magic |= PP_SIGNATURE;
pool->alloc.cache[pool->alloc.count++] = page;
/* Track how many pages are held 'in-flight' */
pool->pages_state_hold_cnt++;
@@ -341,6 +345,8 @@ void page_pool_release_page(struct page_pool *pool, struct page *page)
DMA_ATTR_SKIP_CPU_SYNC);
page_pool_set_dma_addr(page, 0);
skip_dma_unmap:
+ page->pp_magic = 0;
+
/* This may be the last page returned, releasing the pool, so
* it is not safe to reference pool afterwards.
*/
@@ -622,3 +628,25 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid)
}
}
EXPORT_SYMBOL(page_pool_update_nid);
+
+bool page_pool_return_skb_page(struct page *page)
+{
+ struct page_pool *pp;
+
+ page = compound_head(page);
+ if (unlikely(page->pp_magic != PP_SIGNATURE))
+ return false;
+
+ pp = page->pp;
+
+ /* Driver set this to memory recycling info. Reset it on recycle.
+ * This will *not* work for NIC using a split-page memory model.
+ * The page will be returned to the pool here regardless of the
+ * 'flipped' fragment being in use or not.
+ */
+ page->pp = NULL;
+ page_pool_put_full_page(pp, page, false);
+
+ return true;
+}
+EXPORT_SYMBOL(page_pool_return_skb_page);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 3fba429f1f57..7e258d255e90 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -467,7 +467,7 @@ static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t,
static int pktgen_device_event(struct notifier_block *, unsigned long, void *);
static void pktgen_run_all_threads(struct pktgen_net *pn);
static void pktgen_reset_all_threads(struct pktgen_net *pn);
-static void pktgen_stop_all_threads_ifs(struct pktgen_net *pn);
+static void pktgen_stop_all_threads(struct pktgen_net *pn);
static void pktgen_stop(struct pktgen_thread *t);
static void pktgen_clear_counters(struct pktgen_dev *pkt_dev);
@@ -516,14 +516,11 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf,
data[count - 1] = 0; /* Strip trailing '\n' and terminate string */
if (!strcmp(data, "stop"))
- pktgen_stop_all_threads_ifs(pn);
-
+ pktgen_stop_all_threads(pn);
else if (!strcmp(data, "start"))
pktgen_run_all_threads(pn);
-
else if (!strcmp(data, "reset"))
pktgen_reset_all_threads(pn);
-
else
return -EINVAL;
@@ -3027,20 +3024,25 @@ static void pktgen_run(struct pktgen_thread *t)
t->control &= ~(T_STOP);
}
-static void pktgen_stop_all_threads_ifs(struct pktgen_net *pn)
+static void pktgen_handle_all_threads(struct pktgen_net *pn, u32 flags)
{
struct pktgen_thread *t;
- func_enter();
-
mutex_lock(&pktgen_thread_lock);
list_for_each_entry(t, &pn->pktgen_threads, th_list)
- t->control |= T_STOP;
+ t->control |= (flags);
mutex_unlock(&pktgen_thread_lock);
}
+static void pktgen_stop_all_threads(struct pktgen_net *pn)
+{
+ func_enter();
+
+ pktgen_handle_all_threads(pn, T_STOP);
+}
+
static int thread_is_running(const struct pktgen_thread *t)
{
const struct pktgen_dev *pkt_dev;
@@ -3103,16 +3105,9 @@ static int pktgen_wait_all_threads_run(struct pktgen_net *pn)
static void pktgen_run_all_threads(struct pktgen_net *pn)
{
- struct pktgen_thread *t;
-
func_enter();
- mutex_lock(&pktgen_thread_lock);
-
- list_for_each_entry(t, &pn->pktgen_threads, th_list)
- t->control |= (T_RUN);
-
- mutex_unlock(&pktgen_thread_lock);
+ pktgen_handle_all_threads(pn, T_RUN);
/* Propagate thread->control */
schedule_timeout_interruptible(msecs_to_jiffies(125));
@@ -3122,16 +3117,9 @@ static void pktgen_run_all_threads(struct pktgen_net *pn)
static void pktgen_reset_all_threads(struct pktgen_net *pn)
{
- struct pktgen_thread *t;
-
func_enter();
- mutex_lock(&pktgen_thread_lock);
-
- list_for_each_entry(t, &pn->pktgen_threads, th_list)
- t->control |= (T_REMDEVALL);
-
- mutex_unlock(&pktgen_thread_lock);
+ pktgen_handle_all_threads(pn, T_REMDEVALL);
/* Propagate thread->control */
schedule_timeout_interruptible(msecs_to_jiffies(125));
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index ec931b080156..f6af3e74fc44 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -9,7 +9,7 @@
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* Fixes:
- * Vitaly E. Lavrov RTA_OK arithmetics was wrong.
+ * Vitaly E. Lavrov RTA_OK arithmetic was wrong.
*/
#include <linux/bitops.h>
@@ -234,7 +234,7 @@ unlock:
* @msgtype: rtnetlink message type
* @doit: Function pointer called for each request message
* @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
- * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions
+ * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions
*
* Like rtnl_register, but for use by removable modules.
*/
@@ -254,7 +254,7 @@ EXPORT_SYMBOL_GPL(rtnl_register_module);
* @msgtype: rtnetlink message type
* @doit: Function pointer called for each request message
* @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
- * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions
+ * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions
*
* Registers the specified function pointers (at least one of them has
* to be non-NULL) to be called whenever a request message for the
@@ -376,12 +376,12 @@ int __rtnl_link_register(struct rtnl_link_ops *ops)
if (rtnl_link_ops_get(ops->kind))
return -EEXIST;
- /* The check for setup is here because if ops
+ /* The check for alloc/setup is here because if ops
* does not have that filled up, it is not possible
* to use the ops for creating device. So do not
* fill up dellink as well. That disables rtnl_dellink.
*/
- if (ops->setup && !ops->dellink)
+ if ((ops->alloc || ops->setup) && !ops->dellink)
ops->dellink = unregister_netdevice_queue;
list_add_tail(&ops->list, &link_ops);
@@ -543,7 +543,9 @@ static const struct rtnl_af_ops *rtnl_af_lookup(const int family)
{
const struct rtnl_af_ops *ops;
- list_for_each_entry_rcu(ops, &rtnl_af_ops, list) {
+ ASSERT_RTNL();
+
+ list_for_each_entry(ops, &rtnl_af_ops, list) {
if (ops->family == family)
return ops;
}
@@ -1819,6 +1821,16 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
if (rtnl_fill_prop_list(skb, dev))
goto nla_put_failure;
+ if (dev->dev.parent &&
+ nla_put_string(skb, IFLA_PARENT_DEV_NAME,
+ dev_name(dev->dev.parent)))
+ goto nla_put_failure;
+
+ if (dev->dev.parent && dev->dev.parent->bus &&
+ nla_put_string(skb, IFLA_PARENT_DEV_BUS_NAME,
+ dev->dev.parent->bus->name))
+ goto nla_put_failure;
+
nlmsg_end(skb, nlh);
return 0;
@@ -1878,6 +1890,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_PERM_ADDRESS] = { .type = NLA_REJECT },
[IFLA_PROTO_DOWN_REASON] = { .type = NLA_NESTED },
[IFLA_NEW_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1),
+ [IFLA_PARENT_DEV_NAME] = { .type = NLA_NUL_STRING },
};
static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
@@ -2274,27 +2287,18 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) {
const struct rtnl_af_ops *af_ops;
- rcu_read_lock();
af_ops = rtnl_af_lookup(nla_type(af));
- if (!af_ops) {
- rcu_read_unlock();
+ if (!af_ops)
return -EAFNOSUPPORT;
- }
- if (!af_ops->set_link_af) {
- rcu_read_unlock();
+ if (!af_ops->set_link_af)
return -EOPNOTSUPP;
- }
if (af_ops->validate_link_af) {
err = af_ops->validate_link_af(dev, af);
- if (err < 0) {
- rcu_read_unlock();
+ if (err < 0)
return err;
- }
}
-
- rcu_read_unlock();
}
}
@@ -2574,7 +2578,7 @@ static int do_set_proto_down(struct net_device *dev,
if (nl_proto_down) {
proto_down = nla_get_u8(nl_proto_down);
- /* Dont turn off protodown if there are active reasons */
+ /* Don't turn off protodown if there are active reasons */
if (!proto_down && dev->proto_down_reason) {
NL_SET_ERR_MSG(extack, "Cannot clear protodown, active reasons");
return -EBUSY;
@@ -2868,17 +2872,12 @@ static int do_setlink(const struct sk_buff *skb,
nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) {
const struct rtnl_af_ops *af_ops;
- rcu_read_lock();
-
BUG_ON(!(af_ops = rtnl_af_lookup(nla_type(af))));
err = af_ops->set_link_af(dev, af, extack);
- if (err < 0) {
- rcu_read_unlock();
+ if (err < 0)
goto errout;
- }
- rcu_read_unlock();
status |= DO_SETLINK_NOTIFY;
}
}
@@ -3177,8 +3176,17 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname,
return ERR_PTR(-EINVAL);
}
- dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type,
- ops->setup, num_tx_queues, num_rx_queues);
+ if (ops->alloc) {
+ dev = ops->alloc(tb, ifname, name_assign_type,
+ num_tx_queues, num_rx_queues);
+ if (IS_ERR(dev))
+ return dev;
+ } else {
+ dev = alloc_netdev_mqs(ops->priv_size, ifname,
+ name_assign_type, ops->setup,
+ num_tx_queues, num_rx_queues);
+ }
+
if (!dev)
return ERR_PTR(-ENOMEM);
@@ -3411,7 +3419,7 @@ replay:
return -EOPNOTSUPP;
}
- if (!ops->setup)
+ if (!ops->alloc && !ops->setup)
return -EOPNOTSUPP;
if (!ifname[0]) {
@@ -3939,12 +3947,12 @@ int ndo_dflt_fdb_add(struct ndmsg *ndm,
* implement its own handler for this.
*/
if (ndm->ndm_state && !(ndm->ndm_state & NUD_PERMANENT)) {
- pr_info("%s: FDB only supports static addresses\n", dev->name);
+ netdev_info(dev, "default FDB implementation only supports local addresses\n");
return err;
}
if (vid) {
- pr_info("%s: vlans aren't supported yet for dev_uc|mc_add()\n", dev->name);
+ netdev_info(dev, "vlans aren't supported yet for dev_uc|mc_add()\n");
return err;
}
@@ -4078,7 +4086,7 @@ int ndo_dflt_fdb_del(struct ndmsg *ndm,
* implement its own handler for this.
*/
if (!(ndm->ndm_state & NUD_PERMANENT)) {
- pr_info("%s: FDB only supports static addresses\n", dev->name);
+ netdev_info(dev, "default FDB implementation only supports local addresses\n");
return err;
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index bbc3b4b62032..12aabcda6db2 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -70,6 +70,7 @@
#include <net/xfrm.h>
#include <net/mpls.h>
#include <net/mptcp.h>
+#include <net/page_pool.h>
#include <linux/uaccess.h>
#include <trace/events/skb.h>
@@ -645,10 +646,13 @@ static void skb_free_head(struct sk_buff *skb)
{
unsigned char *head = skb->head;
- if (skb->head_frag)
+ if (skb->head_frag) {
+ if (skb_pp_recycle(skb, head))
+ return;
skb_free_frag(head);
- else
+ } else {
kfree(head);
+ }
}
static void skb_release_data(struct sk_buff *skb)
@@ -664,7 +668,7 @@ static void skb_release_data(struct sk_buff *skb)
skb_zcopy_clear(skb, true);
for (i = 0; i < shinfo->nr_frags; i++)
- __skb_frag_unref(&shinfo->frags[i]);
+ __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
if (shinfo->frag_list)
kfree_skb_list(shinfo->frag_list);
@@ -1046,6 +1050,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
n->nohdr = 0;
n->peeked = 0;
C(pfmemalloc);
+ C(pp_recycle);
n->destructor = NULL;
C(tail);
C(end);
@@ -1289,7 +1294,7 @@ static void __msg_zerocopy_callback(struct ubuf_info *uarg)
}
spin_unlock_irqrestore(&q->lock, flags);
- sk->sk_error_report(sk);
+ sk_error_report(sk);
release:
consume_skb(skb);
@@ -3497,7 +3502,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
fragto = &skb_shinfo(tgt)->frags[merge];
skb_frag_size_add(fragto, skb_frag_size(fragfrom));
- __skb_frag_unref(fragfrom);
+ __skb_frag_unref(fragfrom, skb->pp_recycle);
}
/* Reposition in the original skb */
@@ -4680,7 +4685,7 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
skb_queue_tail(&sk->sk_error_queue, skb);
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_error_report(sk);
+ sk_error_report(sk);
return 0;
}
EXPORT_SYMBOL(sock_queue_err_skb);
@@ -4711,7 +4716,7 @@ struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
sk->sk_err = 0;
if (skb_next)
- sk->sk_error_report(sk);
+ sk_error_report(sk);
return skb;
}
@@ -5287,6 +5292,13 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
if (skb_cloned(to))
return false;
+ /* The page pool signature of struct page will eventually figure out
+ * which pages can be recycled or not but for now let's prohibit slab
+ * allocated and page_pool allocated SKBs from being coalesced.
+ */
+ if (to->pp_recycle != from->pp_recycle)
+ return false;
+
if (len <= skb_tailroom(to)) {
if (len)
BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 43ce17a6a585..9b6160a191f8 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -399,29 +399,6 @@ out:
}
EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);
-int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags,
- long timeo, int *err)
-{
- DEFINE_WAIT_FUNC(wait, woken_wake_function);
- int ret = 0;
-
- if (sk->sk_shutdown & RCV_SHUTDOWN)
- return 1;
-
- if (!timeo)
- return ret;
-
- add_wait_queue(sk_sleep(sk), &wait);
- sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- ret = sk_wait_event(sk, &timeo,
- !list_empty(&psock->ingress_msg) ||
- !skb_queue_empty(&sk->sk_receive_queue), &wait);
- sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- remove_wait_queue(sk_sleep(sk), &wait);
- return ret;
-}
-EXPORT_SYMBOL_GPL(sk_msg_wait_data);
-
/* Receive sk_msg from psock->ingress_msg to @msg. */
int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
int len, int flags)
@@ -601,6 +578,12 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
return sk_psock_skb_ingress(psock, skb);
}
+static void sock_drop(struct sock *sk, struct sk_buff *skb)
+{
+ sk_drops_add(sk, skb);
+ kfree_skb(skb);
+}
+
static void sk_psock_backlog(struct work_struct *work)
{
struct sk_psock *psock = container_of(work, struct sk_psock, work);
@@ -640,7 +623,7 @@ start:
/* Hard errors break pipe and stop xmit. */
sk_psock_report_error(psock, ret ? -ret : EPIPE);
sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
- kfree_skb(skb);
+ sock_drop(psock->sk, skb);
goto end;
}
off += ret;
@@ -731,7 +714,7 @@ static void __sk_psock_zap_ingress(struct sk_psock *psock)
while ((skb = skb_dequeue(&psock->ingress_skb)) != NULL) {
skb_bpf_redirect_clear(skb);
- kfree_skb(skb);
+ sock_drop(psock->sk, skb);
}
__sk_psock_purge_ingress_msg(psock);
}
@@ -847,7 +830,7 @@ out:
}
EXPORT_SYMBOL_GPL(sk_psock_msg_verdict);
-static void sk_psock_skb_redirect(struct sk_buff *skb)
+static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb)
{
struct sk_psock *psock_other;
struct sock *sk_other;
@@ -857,8 +840,8 @@ static void sk_psock_skb_redirect(struct sk_buff *skb)
* return code, but then didn't set a redirect interface.
*/
if (unlikely(!sk_other)) {
- kfree_skb(skb);
- return;
+ sock_drop(from->sk, skb);
+ return -EIO;
}
psock_other = sk_psock(sk_other);
/* This error indicates the socket is being torn down or had another
@@ -866,26 +849,30 @@ static void sk_psock_skb_redirect(struct sk_buff *skb)
* a socket that is in this state so we drop the skb.
*/
if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) {
- kfree_skb(skb);
- return;
+ skb_bpf_redirect_clear(skb);
+ sock_drop(from->sk, skb);
+ return -EIO;
}
spin_lock_bh(&psock_other->ingress_lock);
if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
spin_unlock_bh(&psock_other->ingress_lock);
- kfree_skb(skb);
- return;
+ skb_bpf_redirect_clear(skb);
+ sock_drop(from->sk, skb);
+ return -EIO;
}
skb_queue_tail(&psock_other->ingress_skb, skb);
schedule_work(&psock_other->work);
spin_unlock_bh(&psock_other->ingress_lock);
+ return 0;
}
-static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sock *sk, int verdict)
+static void sk_psock_tls_verdict_apply(struct sk_buff *skb,
+ struct sk_psock *from, int verdict)
{
switch (verdict) {
case __SK_REDIRECT:
- sk_psock_skb_redirect(skb);
+ sk_psock_skb_redirect(from, skb);
break;
case __SK_PASS:
case __SK_DROP:
@@ -909,20 +896,21 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb)
ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
skb->sk = NULL;
}
- sk_psock_tls_verdict_apply(skb, psock->sk, ret);
+ sk_psock_tls_verdict_apply(skb, psock, ret);
rcu_read_unlock();
return ret;
}
EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read);
-static void sk_psock_verdict_apply(struct sk_psock *psock,
- struct sk_buff *skb, int verdict)
+static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
+ int verdict)
{
struct sock *sk_other;
- int err = -EIO;
+ int err = 0;
switch (verdict) {
case __SK_PASS:
+ err = -EIO;
sk_other = psock->sk;
if (sock_flag(sk_other, SOCK_DEAD) ||
!sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
@@ -945,18 +933,25 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
skb_queue_tail(&psock->ingress_skb, skb);
schedule_work(&psock->work);
+ err = 0;
}
spin_unlock_bh(&psock->ingress_lock);
+ if (err < 0) {
+ skb_bpf_redirect_clear(skb);
+ goto out_free;
+ }
}
break;
case __SK_REDIRECT:
- sk_psock_skb_redirect(skb);
+ err = sk_psock_skb_redirect(psock, skb);
break;
case __SK_DROP:
default:
out_free:
- kfree_skb(skb);
+ sock_drop(psock->sk, skb);
}
+
+ return err;
}
static void sk_psock_write_space(struct sock *sk)
@@ -988,7 +983,7 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
sk = strp->sk;
psock = sk_psock(sk);
if (unlikely(!psock)) {
- kfree_skb(skb);
+ sock_drop(sk, skb);
goto out;
}
prog = READ_ONCE(psock->progs.stream_verdict);
@@ -1109,7 +1104,7 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
psock = sk_psock(sk);
if (unlikely(!psock)) {
len = 0;
- kfree_skb(skb);
+ sock_drop(sk, skb);
goto out;
}
prog = READ_ONCE(psock->progs.stream_verdict);
@@ -1123,7 +1118,8 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
skb->sk = NULL;
}
- sk_psock_verdict_apply(psock, skb, ret);
+ if (sk_psock_verdict_apply(psock, skb, ret) < 0)
+ len = 0;
out:
rcu_read_unlock();
return len;
diff --git a/net/core/sock.c b/net/core/sock.c
index 946888afef88..ba1c0f75cd45 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -331,6 +331,22 @@ int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(__sk_backlog_rcv);
+void sk_error_report(struct sock *sk)
+{
+ sk->sk_error_report(sk);
+
+ switch (sk->sk_family) {
+ case AF_INET:
+ fallthrough;
+ case AF_INET6:
+ trace_inet_sk_error_report(sk);
+ break;
+ default:
+ break;
+ }
+}
+EXPORT_SYMBOL(sk_error_report);
+
static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
{
struct __kernel_sock_timeval tv;
@@ -776,6 +792,58 @@ void sock_enable_timestamps(struct sock *sk)
}
EXPORT_SYMBOL(sock_enable_timestamps);
+void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
+{
+ switch (optname) {
+ case SO_TIMESTAMP_OLD:
+ __sock_set_timestamps(sk, valbool, false, false);
+ break;
+ case SO_TIMESTAMP_NEW:
+ __sock_set_timestamps(sk, valbool, true, false);
+ break;
+ case SO_TIMESTAMPNS_OLD:
+ __sock_set_timestamps(sk, valbool, false, true);
+ break;
+ case SO_TIMESTAMPNS_NEW:
+ __sock_set_timestamps(sk, valbool, true, true);
+ break;
+ }
+}
+
+int sock_set_timestamping(struct sock *sk, int optname, int val)
+{
+ if (val & ~SOF_TIMESTAMPING_MASK)
+ return -EINVAL;
+
+ if (val & SOF_TIMESTAMPING_OPT_ID &&
+ !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
+ if (sk->sk_protocol == IPPROTO_TCP &&
+ sk->sk_type == SOCK_STREAM) {
+ if ((1 << sk->sk_state) &
+ (TCPF_CLOSE | TCPF_LISTEN))
+ return -EINVAL;
+ sk->sk_tskey = tcp_sk(sk)->snd_una;
+ } else {
+ sk->sk_tskey = 0;
+ }
+ }
+
+ if (val & SOF_TIMESTAMPING_OPT_STATS &&
+ !(val & SOF_TIMESTAMPING_OPT_TSONLY))
+ return -EINVAL;
+
+ sk->sk_tsflags = val;
+ sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
+
+ if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
+ sock_enable_timestamp(sk,
+ SOCK_TIMESTAMPING_RX_SOFTWARE);
+ else
+ sock_disable_timestamp(sk,
+ (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
+ return 0;
+}
+
void sock_set_keepalive(struct sock *sk)
{
lock_sock(sk);
@@ -997,54 +1065,15 @@ set_sndbuf:
break;
case SO_TIMESTAMP_OLD:
- __sock_set_timestamps(sk, valbool, false, false);
- break;
case SO_TIMESTAMP_NEW:
- __sock_set_timestamps(sk, valbool, true, false);
- break;
case SO_TIMESTAMPNS_OLD:
- __sock_set_timestamps(sk, valbool, false, true);
- break;
case SO_TIMESTAMPNS_NEW:
- __sock_set_timestamps(sk, valbool, true, true);
+ sock_set_timestamp(sk, valbool, optname);
break;
+
case SO_TIMESTAMPING_NEW:
case SO_TIMESTAMPING_OLD:
- if (val & ~SOF_TIMESTAMPING_MASK) {
- ret = -EINVAL;
- break;
- }
-
- if (val & SOF_TIMESTAMPING_OPT_ID &&
- !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
- if (sk->sk_protocol == IPPROTO_TCP &&
- sk->sk_type == SOCK_STREAM) {
- if ((1 << sk->sk_state) &
- (TCPF_CLOSE | TCPF_LISTEN)) {
- ret = -EINVAL;
- break;
- }
- sk->sk_tskey = tcp_sk(sk)->snd_una;
- } else {
- sk->sk_tskey = 0;
- }
- }
-
- if (val & SOF_TIMESTAMPING_OPT_STATS &&
- !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
- ret = -EINVAL;
- break;
- }
-
- sk->sk_tsflags = val;
- sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
-
- if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
- sock_enable_timestamp(sk,
- SOCK_TIMESTAMPING_RX_SOFTWARE);
- else
- sock_disable_timestamp(sk,
- (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
+ ret = sock_set_timestamping(sk, optname, val);
break;
case SO_RCVLOWAT:
@@ -1622,6 +1651,13 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
v.val = sk->sk_bound_dev_if;
break;
+ case SO_NETNS_COOKIE:
+ lv = sizeof(u64);
+ if (len != lv)
+ return -EINVAL;
+ v.val64 = sock_net(sk)->net_cookie;
+ break;
+
default:
/* We implement the SO_SNDLOWAT etc to not be settable
* (1003.1g 7).
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 6f1b82b8ad49..60decd6420ca 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -48,7 +48,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
bpf_map_init_from_attr(&stab->map, attr);
raw_spin_lock_init(&stab->lock);
- stab->sks = bpf_map_area_alloc(stab->map.max_entries *
+ stab->sks = bpf_map_area_alloc((u64) stab->map.max_entries *
sizeof(struct sock *),
stab->map.numa_node);
if (!stab->sks) {
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index b065f0a103ed..3f00a28fe762 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -6,6 +6,7 @@
* selecting the socket index from the array of available sockets.
*/
+#include <net/ip.h>
#include <net/sock_reuseport.h>
#include <linux/bpf.h>
#include <linux/idr.h>
@@ -17,6 +18,74 @@
DEFINE_SPINLOCK(reuseport_lock);
static DEFINE_IDA(reuseport_ida);
+static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse,
+ struct sock_reuseport *reuse, bool bind_inany);
+
+static int reuseport_sock_index(struct sock *sk,
+ const struct sock_reuseport *reuse,
+ bool closed)
+{
+ int left, right;
+
+ if (!closed) {
+ left = 0;
+ right = reuse->num_socks;
+ } else {
+ left = reuse->max_socks - reuse->num_closed_socks;
+ right = reuse->max_socks;
+ }
+
+ for (; left < right; left++)
+ if (reuse->socks[left] == sk)
+ return left;
+ return -1;
+}
+
+static void __reuseport_add_sock(struct sock *sk,
+ struct sock_reuseport *reuse)
+{
+ reuse->socks[reuse->num_socks] = sk;
+ /* paired with smp_rmb() in reuseport_(select|migrate)_sock() */
+ smp_wmb();
+ reuse->num_socks++;
+}
+
+static bool __reuseport_detach_sock(struct sock *sk,
+ struct sock_reuseport *reuse)
+{
+ int i = reuseport_sock_index(sk, reuse, false);
+
+ if (i == -1)
+ return false;
+
+ reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
+ reuse->num_socks--;
+
+ return true;
+}
+
+static void __reuseport_add_closed_sock(struct sock *sk,
+ struct sock_reuseport *reuse)
+{
+ reuse->socks[reuse->max_socks - reuse->num_closed_socks - 1] = sk;
+ /* paired with READ_ONCE() in inet_csk_bind_conflict() */
+ WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks + 1);
+}
+
+static bool __reuseport_detach_closed_sock(struct sock *sk,
+ struct sock_reuseport *reuse)
+{
+ int i = reuseport_sock_index(sk, reuse, true);
+
+ if (i == -1)
+ return false;
+
+ reuse->socks[i] = reuse->socks[reuse->max_socks - reuse->num_closed_socks];
+ /* paired with READ_ONCE() in inet_csk_bind_conflict() */
+ WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks - 1);
+
+ return true;
+}
static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
{
@@ -49,6 +118,12 @@ int reuseport_alloc(struct sock *sk, bool bind_inany)
reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
if (reuse) {
+ if (reuse->num_closed_socks) {
+ /* sk was shutdown()ed before */
+ ret = reuseport_resurrect(sk, reuse, NULL, bind_inany);
+ goto out;
+ }
+
/* Only set reuse->bind_inany if the bind_inany is true.
* Otherwise, it will overwrite the reuse->bind_inany
* which was set by the bind/hash path.
@@ -72,9 +147,9 @@ int reuseport_alloc(struct sock *sk, bool bind_inany)
}
reuse->reuseport_id = id;
+ reuse->bind_inany = bind_inany;
reuse->socks[0] = sk;
reuse->num_socks = 1;
- reuse->bind_inany = bind_inany;
rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
out:
@@ -90,14 +165,30 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
u32 more_socks_size, i;
more_socks_size = reuse->max_socks * 2U;
- if (more_socks_size > U16_MAX)
+ if (more_socks_size > U16_MAX) {
+ if (reuse->num_closed_socks) {
+ /* Make room by removing a closed sk.
+ * The child has already been migrated.
+ * Only reqsk left at this point.
+ */
+ struct sock *sk;
+
+ sk = reuse->socks[reuse->max_socks - reuse->num_closed_socks];
+ RCU_INIT_POINTER(sk->sk_reuseport_cb, NULL);
+ __reuseport_detach_closed_sock(sk, reuse);
+
+ return reuse;
+ }
+
return NULL;
+ }
more_reuse = __reuseport_alloc(more_socks_size);
if (!more_reuse)
return NULL;
more_reuse->num_socks = reuse->num_socks;
+ more_reuse->num_closed_socks = reuse->num_closed_socks;
more_reuse->prog = reuse->prog;
more_reuse->reuseport_id = reuse->reuseport_id;
more_reuse->bind_inany = reuse->bind_inany;
@@ -105,9 +196,13 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
memcpy(more_reuse->socks, reuse->socks,
reuse->num_socks * sizeof(struct sock *));
+ memcpy(more_reuse->socks +
+ (more_reuse->max_socks - more_reuse->num_closed_socks),
+ reuse->socks + (reuse->max_socks - reuse->num_closed_socks),
+ reuse->num_closed_socks * sizeof(struct sock *));
more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts);
- for (i = 0; i < reuse->num_socks; ++i)
+ for (i = 0; i < reuse->max_socks; ++i)
rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
more_reuse);
@@ -152,13 +247,21 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
reuse = rcu_dereference_protected(sk2->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
- lockdep_is_held(&reuseport_lock));
+ lockdep_is_held(&reuseport_lock));
+ if (old_reuse && old_reuse->num_closed_socks) {
+ /* sk was shutdown()ed before */
+ int err = reuseport_resurrect(sk, old_reuse, reuse, reuse->bind_inany);
+
+ spin_unlock_bh(&reuseport_lock);
+ return err;
+ }
+
if (old_reuse && old_reuse->num_socks != 1) {
spin_unlock_bh(&reuseport_lock);
return -EBUSY;
}
- if (reuse->num_socks == reuse->max_socks) {
+ if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) {
reuse = reuseport_grow(reuse);
if (!reuse) {
spin_unlock_bh(&reuseport_lock);
@@ -166,10 +269,7 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
}
}
- reuse->socks[reuse->num_socks] = sk;
- /* paired with smp_rmb() in reuseport_select_sock() */
- smp_wmb();
- reuse->num_socks++;
+ __reuseport_add_sock(sk, reuse);
rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
spin_unlock_bh(&reuseport_lock);
@@ -180,15 +280,77 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
}
EXPORT_SYMBOL(reuseport_add_sock);
+static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse,
+ struct sock_reuseport *reuse, bool bind_inany)
+{
+ if (old_reuse == reuse) {
+ /* If sk was in the same reuseport group, just pop sk out of
+ * the closed section and push sk into the listening section.
+ */
+ __reuseport_detach_closed_sock(sk, old_reuse);
+ __reuseport_add_sock(sk, old_reuse);
+ return 0;
+ }
+
+ if (!reuse) {
+ /* In bind()/listen() path, we cannot carry over the eBPF prog
+ * for the shutdown()ed socket. In setsockopt() path, we should
+ * not change the eBPF prog of listening sockets by attaching a
+ * prog to the shutdown()ed socket. Thus, we will allocate a new
+ * reuseport group and detach sk from the old group.
+ */
+ int id;
+
+ reuse = __reuseport_alloc(INIT_SOCKS);
+ if (!reuse)
+ return -ENOMEM;
+
+ id = ida_alloc(&reuseport_ida, GFP_ATOMIC);
+ if (id < 0) {
+ kfree(reuse);
+ return id;
+ }
+
+ reuse->reuseport_id = id;
+ reuse->bind_inany = bind_inany;
+ } else {
+ /* Move sk from the old group to the new one if
+ * - all the other listeners in the old group were close()d or
+ * shutdown()ed, and then sk2 has listen()ed on the same port
+ * OR
+ * - sk listen()ed without bind() (or with autobind), was
+ * shutdown()ed, and then listen()s on another port which
+ * sk2 listen()s on.
+ */
+ if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) {
+ reuse = reuseport_grow(reuse);
+ if (!reuse)
+ return -ENOMEM;
+ }
+ }
+
+ __reuseport_detach_closed_sock(sk, old_reuse);
+ __reuseport_add_sock(sk, reuse);
+ rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
+
+ if (old_reuse->num_socks + old_reuse->num_closed_socks == 0)
+ call_rcu(&old_reuse->rcu, reuseport_free_rcu);
+
+ return 0;
+}
+
void reuseport_detach_sock(struct sock *sk)
{
struct sock_reuseport *reuse;
- int i;
spin_lock_bh(&reuseport_lock);
reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
+ /* reuseport_grow() has detached a closed sk */
+ if (!reuse)
+ goto out;
+
/* Notify the bpf side. The sk may be added to a sockarray
* map. If so, sockarray logic will remove it from the map.
*
@@ -201,19 +363,52 @@ void reuseport_detach_sock(struct sock *sk)
rcu_assign_pointer(sk->sk_reuseport_cb, NULL);
- for (i = 0; i < reuse->num_socks; i++) {
- if (reuse->socks[i] == sk) {
- reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
- reuse->num_socks--;
- if (reuse->num_socks == 0)
- call_rcu(&reuse->rcu, reuseport_free_rcu);
- break;
- }
- }
+ if (!__reuseport_detach_closed_sock(sk, reuse))
+ __reuseport_detach_sock(sk, reuse);
+
+ if (reuse->num_socks + reuse->num_closed_socks == 0)
+ call_rcu(&reuse->rcu, reuseport_free_rcu);
+
+out:
spin_unlock_bh(&reuseport_lock);
}
EXPORT_SYMBOL(reuseport_detach_sock);
+void reuseport_stop_listen_sock(struct sock *sk)
+{
+ if (sk->sk_protocol == IPPROTO_TCP) {
+ struct sock_reuseport *reuse;
+ struct bpf_prog *prog;
+
+ spin_lock_bh(&reuseport_lock);
+
+ reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+ prog = rcu_dereference_protected(reuse->prog,
+ lockdep_is_held(&reuseport_lock));
+
+ if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req ||
+ (prog && prog->expected_attach_type == BPF_SK_REUSEPORT_SELECT_OR_MIGRATE)) {
+ /* Migration capable, move sk from the listening section
+ * to the closed section.
+ */
+ bpf_sk_reuseport_detach(sk);
+
+ __reuseport_detach_sock(sk, reuse);
+ __reuseport_add_closed_sock(sk, reuse);
+
+ spin_unlock_bh(&reuseport_lock);
+ return;
+ }
+
+ spin_unlock_bh(&reuseport_lock);
+ }
+
+ /* Not capable to do migration, detach immediately */
+ reuseport_detach_sock(sk);
+}
+EXPORT_SYMBOL(reuseport_stop_listen_sock);
+
static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks,
struct bpf_prog *prog, struct sk_buff *skb,
int hdr_len)
@@ -244,6 +439,23 @@ static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks,
return reuse->socks[index];
}
+static struct sock *reuseport_select_sock_by_hash(struct sock_reuseport *reuse,
+ u32 hash, u16 num_socks)
+{
+ int i, j;
+
+ i = j = reciprocal_scale(hash, num_socks);
+ while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) {
+ i++;
+ if (i >= num_socks)
+ i = 0;
+ if (i == j)
+ return NULL;
+ }
+
+ return reuse->socks[i];
+}
+
/**
* reuseport_select_sock - Select a socket from an SO_REUSEPORT group.
* @sk: First socket in the group.
@@ -274,32 +486,21 @@ struct sock *reuseport_select_sock(struct sock *sk,
prog = rcu_dereference(reuse->prog);
socks = READ_ONCE(reuse->num_socks);
if (likely(socks)) {
- /* paired with smp_wmb() in reuseport_add_sock() */
+ /* paired with smp_wmb() in __reuseport_add_sock() */
smp_rmb();
if (!prog || !skb)
goto select_by_hash;
if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
- sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash);
+ sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, NULL, hash);
else
sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len);
select_by_hash:
/* no bpf or invalid bpf result: fall back to hash usage */
- if (!sk2) {
- int i, j;
-
- i = j = reciprocal_scale(hash, socks);
- while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) {
- i++;
- if (i >= socks)
- i = 0;
- if (i == j)
- goto out;
- }
- sk2 = reuse->socks[i];
- }
+ if (!sk2)
+ sk2 = reuseport_select_sock_by_hash(reuse, hash, socks);
}
out:
@@ -308,14 +509,90 @@ out:
}
EXPORT_SYMBOL(reuseport_select_sock);
+/**
+ * reuseport_migrate_sock - Select a socket from an SO_REUSEPORT group.
+ * @sk: close()ed or shutdown()ed socket in the group.
+ * @migrating_sk: ESTABLISHED/SYN_RECV full socket in the accept queue or
+ * NEW_SYN_RECV request socket during 3WHS.
+ * @skb: skb to run through BPF filter.
+ * Returns a socket (with sk_refcnt +1) that should accept the child socket
+ * (or NULL on error).
+ */
+struct sock *reuseport_migrate_sock(struct sock *sk,
+ struct sock *migrating_sk,
+ struct sk_buff *skb)
+{
+ struct sock_reuseport *reuse;
+ struct sock *nsk = NULL;
+ bool allocated = false;
+ struct bpf_prog *prog;
+ u16 socks;
+ u32 hash;
+
+ rcu_read_lock();
+
+ reuse = rcu_dereference(sk->sk_reuseport_cb);
+ if (!reuse)
+ goto out;
+
+ socks = READ_ONCE(reuse->num_socks);
+ if (unlikely(!socks))
+ goto failure;
+
+ /* paired with smp_wmb() in __reuseport_add_sock() */
+ smp_rmb();
+
+ hash = migrating_sk->sk_hash;
+ prog = rcu_dereference(reuse->prog);
+ if (!prog || prog->expected_attach_type != BPF_SK_REUSEPORT_SELECT_OR_MIGRATE) {
+ if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req)
+ goto select_by_hash;
+ goto failure;
+ }
+
+ if (!skb) {
+ skb = alloc_skb(0, GFP_ATOMIC);
+ if (!skb)
+ goto failure;
+ allocated = true;
+ }
+
+ nsk = bpf_run_sk_reuseport(reuse, sk, prog, skb, migrating_sk, hash);
+
+ if (allocated)
+ kfree_skb(skb);
+
+select_by_hash:
+ if (!nsk)
+ nsk = reuseport_select_sock_by_hash(reuse, hash, socks);
+
+ if (IS_ERR_OR_NULL(nsk) || unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt))) {
+ nsk = NULL;
+ goto failure;
+ }
+
+out:
+ rcu_read_unlock();
+ return nsk;
+
+failure:
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE);
+ goto out;
+}
+EXPORT_SYMBOL(reuseport_migrate_sock);
+
int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
{
struct sock_reuseport *reuse;
struct bpf_prog *old_prog;
- if (sk_unhashed(sk) && sk->sk_reuseport) {
- int err = reuseport_alloc(sk, false);
+ if (sk_unhashed(sk)) {
+ int err;
+ if (!sk->sk_reuseport)
+ return -EINVAL;
+
+ err = reuseport_alloc(sk, false);
if (err)
return err;
} else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
@@ -341,13 +618,24 @@ int reuseport_detach_prog(struct sock *sk)
struct sock_reuseport *reuse;
struct bpf_prog *old_prog;
- if (!rcu_access_pointer(sk->sk_reuseport_cb))
- return sk->sk_reuseport ? -ENOENT : -EINVAL;
-
old_prog = NULL;
spin_lock_bh(&reuseport_lock);
reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
+
+ /* reuse must be checked after acquiring the reuseport_lock
+ * because reuseport_grow() can detach a closed sk.
+ */
+ if (!reuse) {
+ spin_unlock_bh(&reuseport_lock);
+ return sk->sk_reuseport ? -ENOENT : -EINVAL;
+ }
+
+ if (sk_unhashed(sk) && reuse->num_closed_socks) {
+ spin_unlock_bh(&reuseport_lock);
+ return -ENOENT;
+ }
+
old_prog = rcu_replace_pointer(reuse->prog, old_prog,
lockdep_is_held(&reuseport_lock));
spin_unlock_bh(&reuseport_lock);
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 858276e72c68..cc92ccb38432 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -113,8 +113,13 @@ static void mem_allocator_disconnect(void *allocator)
void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
{
struct xdp_mem_allocator *xa;
+ int type = xdp_rxq->mem.type;
int id = xdp_rxq->mem.id;
+ /* Reset mem info to defaults */
+ xdp_rxq->mem.id = 0;
+ xdp_rxq->mem.type = 0;
+
if (xdp_rxq->reg_state != REG_STATE_REGISTERED) {
WARN(1, "Missing register, driver bug");
return;
@@ -123,7 +128,7 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
if (id == 0)
return;
- if (xdp_rxq->mem.type == MEM_TYPE_PAGE_POOL) {
+ if (type == MEM_TYPE_PAGE_POOL) {
rcu_read_lock();
xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params);
page_pool_destroy(xa->page_pool);
@@ -144,10 +149,6 @@ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
xdp_rxq->reg_state = REG_STATE_UNREGISTERED;
xdp_rxq->dev = NULL;
-
- /* Reset mem info to defaults */
- xdp_rxq->mem.id = 0;
- xdp_rxq->mem.type = 0;
}
EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg);
@@ -584,3 +585,31 @@ struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf,
return __xdp_build_skb_from_frame(xdpf, skb, dev);
}
EXPORT_SYMBOL_GPL(xdp_build_skb_from_frame);
+
+struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf)
+{
+ unsigned int headroom, totalsize;
+ struct xdp_frame *nxdpf;
+ struct page *page;
+ void *addr;
+
+ headroom = xdpf->headroom + sizeof(*xdpf);
+ totalsize = headroom + xdpf->len;
+
+ if (unlikely(totalsize > PAGE_SIZE))
+ return NULL;
+ page = dev_alloc_page();
+ if (!page)
+ return NULL;
+ addr = page_to_virt(page);
+
+ memcpy(addr, xdpf, totalsize);
+
+ nxdpf = addr;
+ nxdpf->data = addr + headroom;
+ nxdpf->frame_sz = PAGE_SIZE;
+ nxdpf->mem.type = MEM_TYPE_PAGE_ORDER0;
+ nxdpf->mem.id = 0;
+
+ return nxdpf;
+}