summaryrefslogtreecommitdiff
path: root/net/sched
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-03-26 21:48:21 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2025-03-26 21:48:21 -0700
commit1a9239bb4253f9076b5b4b2a1a4e8d7defd77a95 (patch)
tree286dda5e84757594218e684b94b01b3a3cac15a2 /net/sched
parente61f33273ca755b3e2ebee4520a76097199dc7a8 (diff)
parent023b1e9d265ca0662111a9df23d22b4632717a8a (diff)
Merge tag 'net-next-6.15' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Jakub Kicinski: "Core & protocols: - Continue Netlink conversions to per-namespace RTNL lock (IPv4 routing, routing rules, routing next hops, ARP ioctls) - Continue extending the use of netdev instance locks. As a driver opt-in protect queue operations and (in due course) ethtool operations with the instance lock and not RTNL lock. - Support collecting TCP timestamps (data submitted, sent, acked) in BPF, allowing for transparent (to the application) and lower overhead tracking of TCP RPC performance. - Tweak existing networking Rx zero-copy infra to support zero-copy Rx via io_uring. - Optimize MPTCP performance in single subflow mode by 29%. - Enable GRO on packets which went thru XDP CPU redirect (were queued for processing on a different CPU). Improving TCP stream performance up to 2x. - Improve performance of contended connect() by 200% by searching for an available 4-tuple under RCU rather than a spin lock. Bring an additional 229% improvement by tweaking hash distribution. - Avoid unconditionally touching sk_tsflags on RX, improving performance under UDP flood by as much as 10%. - Avoid skb_clone() dance in ping_rcv() to improve performance under ping flood. - Avoid FIB lookup in netfilter if socket is available, 20% perf win. - Rework network device creation (in-kernel) API to more clearly identify network namespaces and their roles. There are up to 4 namespace roles but we used to have just 2 netns pointer arguments, interpreted differently based on context. - Use sysfs_break_active_protection() instead of trylock to avoid deadlocks between unregistering objects and sysfs access. - Add a new sysctl and sockopt for capping max retransmit timeout in TCP. - Support masking port and DSCP in routing rule matches. - Support dumping IPv4 multicast addresses with RTM_GETMULTICAST. - Support specifying at what time packet should be sent on AF_XDP sockets. - Expose TCP ULP diagnostic info (for TLS and MPTCP) to non-admin users. - Add Netlink YAML spec for WiFi (nl80211) and conntrack. - Introduce EXPORT_IPV6_MOD() and EXPORT_IPV6_MOD_GPL() for symbols which only need to be exported when IPv6 support is built as a module. - Age FDB entries based on Rx not Tx traffic in VxLAN, similar to normal bridging. - Allow users to specify source port range for GENEVE tunnels. - netconsole: allow attaching kernel release, CPU ID and task name to messages as metadata Driver API: - Continue rework / fixing of Energy Efficient Ethernet (EEE) across the SW layers. Delegate the responsibilities to phylink where possible. Improve its handling in phylib. - Support symmetric OR-XOR RSS hashing algorithm. - Support tracking and preserving IRQ affinity by NAPI itself. - Support loopback mode speed selection for interface selftests. Device drivers: - Remove the IBM LCS driver for s390 - Remove the sb1000 cable modem driver - Add support for SFP module access over SMBus - Add MCTP transport driver for MCTP-over-USB - Enable XDP metadata support in multiple drivers - Ethernet high-speed NICs: - Broadcom (bnxt): - add PCIe TLP Processing Hints (TPH) support for new AMD platforms - support dumping RoCE queue state for debug - opt into instance locking - Intel (100G, ice, idpf): - ice: rework MSI-X IRQ management and distribution - ice: support for E830 devices - iavf: add support for Rx timestamping - iavf: opt into instance locking - nVidia/Mellanox: - mlx4: use page pool memory allocator for Rx - mlx5: support for one PTP device per hardware clock - mlx5: support for 200Gbps per-lane link modes - mlx5: move IPSec policy check after decryption - AMD/Solarflare: - support FW flashing via devlink - Cisco (enic): - use page pool memory allocator for Rx - enable 32, 64 byte CQEs - get max rx/tx ring size from the device - Meta (fbnic): - support flow steering and RSS configuration - report queue stats - support TCP segmentation - support IRQ coalescing - support ring size configuration - Marvell/Cavium: - support AF_XDP - Wangxun: - support for PTP clock and timestamping - Huawei (hibmcge): - checksum offload - add more statistics - Ethernet virtual: - VirtIO net: - aggressively suppress Tx completions, improve perf by 96% with 1 CPU and 55% with 2 CPUs - expose NAPI to IRQ mapping and persist NAPI settings - Google (gve): - support XDP in DQO RDA Queue Format - opt into instance locking - Microsoft vNIC: - support BIG TCP - Ethernet NICs consumer, and embedded: - Synopsys (stmmac): - cleanup Tx and Tx clock setting and other link-focused cleanups - enable SGMII and 2500BASEX mode switching for Intel platforms - support Sophgo SG2044 - Broadcom switches (b53): - support for BCM53101 - TI: - iep: add perout configuration support - icssg: support XDP - Cadence (macb): - implement BQL - Xilinx (axinet): - support dynamic IRQ moderation and changing coalescing at runtime - implement BQL - report standard stats - MediaTek: - support phylink managed EEE - Intel: - igc: don't restart the interface on every XDP program change - RealTek (r8169): - support reading registers of internal PHYs directly - increase max jumbo packet size on RTL8125/RTL8126 - Airoha: - support for RISC-V NPU packet processing unit - enable scatter-gather and support MTU up to 9kB - Tehuti (tn40xx): - support cards with TN4010 MAC and an Aquantia AQR105 PHY - Ethernet PHYs: - support for TJA1102S, TJA1121 - dp83tg720: add randomized polling intervals for link detection - dp83822: support changing the transmit amplitude voltage - support for LEDs on 88q2xxx - CAN: - canxl: support Remote Request Substitution bit access - flexcan: add S32G2/S32G3 SoC - WiFi: - remove cooked monitor support - strict mode for better AP testing - basic EPCS support - OMI RX bandwidth reduction support - batman-adv: add support for jumbo frames - WiFi drivers: - RealTek (rtw88): - support RTL8814AE and RTL8814AU - RealTek (rtw89): - switch using wiphy_lock and wiphy_work - add BB context to manipulate two PHY as preparation of MLO - improve BT-coexistence mechanism to play A2DP smoothly - Intel (iwlwifi): - add new iwlmld sub-driver for latest HW/FW combinations - MediaTek (mt76): - preparation for mt7996 Multi-Link Operation (MLO) support - Qualcomm/Atheros (ath12k): - continued work on MLO - Silabs (wfx): - Wake-on-WLAN support - Bluetooth: - add support for skb TX SND/COMPLETION timestamping - hci_core: enable buffer flow control for SCO/eSCO - coredump: log devcd dumps into the monitor - Bluetooth drivers: - intel: add support to configure TX power - nxp: handle bootloader error during cmd5 and cmd7" * tag 'net-next-6.15' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1681 commits) unix: fix up for "apparmor: add fine grained af_unix mediation" mctp: Fix incorrect tx flow invalidation condition in mctp-i2c net: usb: asix: ax88772: Increase phy_name size net: phy: Introduce PHY_ID_SIZE — minimum size for PHY ID string net: libwx: fix Tx L4 checksum net: libwx: fix Tx descriptor content for some tunnel packets atm: Fix NULL pointer dereference net: tn40xx: add pci-id of the aqr105-based Tehuti TN4010 cards net: tn40xx: prepare tn40xx driver to find phy of the TN9510 card net: tn40xx: create swnode for mdio and aqr105 phy and add to mdiobus net: phy: aquantia: add essential functions to aqr105 driver net: phy: aquantia: search for firmware-name in fwnode net: phy: aquantia: add probe function to aqr105 for firmware loading net: phy: Add swnode support to mdiobus_scan gve: add XDP DROP and PASS support for DQ gve: update XDP allocation path support RX buffer posting gve: merge packet buffer size fields gve: update GQ RX to use buf_size gve: introduce config-based allocation for XDP gve: remove xdp_xsk_done and xdp_xsk_wakeup statistics ...
Diffstat (limited to 'net/sched')
-rw-r--r--net/sched/act_tunnel_key.c8
-rw-r--r--net/sched/em_meta.c2
-rw-r--r--net/sched/sch_api.c216
-rw-r--r--net/sched/sch_qfq.c2
4 files changed, 152 insertions, 76 deletions
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index af7c99845948..ae5dea7c48a8 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -571,8 +571,8 @@ static void tunnel_key_release(struct tc_action *a)
static int tunnel_key_geneve_opts_dump(struct sk_buff *skb,
const struct ip_tunnel_info *info)
{
+ const u8 *src = ip_tunnel_info_opts(info);
int len = info->options_len;
- u8 *src = (u8 *)(info + 1);
struct nlattr *start;
start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_GENEVE);
@@ -580,7 +580,7 @@ static int tunnel_key_geneve_opts_dump(struct sk_buff *skb,
return -EMSGSIZE;
while (len > 0) {
- struct geneve_opt *opt = (struct geneve_opt *)src;
+ const struct geneve_opt *opt = (const struct geneve_opt *)src;
if (nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS,
opt->opt_class) ||
@@ -603,7 +603,7 @@ static int tunnel_key_geneve_opts_dump(struct sk_buff *skb,
static int tunnel_key_vxlan_opts_dump(struct sk_buff *skb,
const struct ip_tunnel_info *info)
{
- struct vxlan_metadata *md = (struct vxlan_metadata *)(info + 1);
+ const struct vxlan_metadata *md = ip_tunnel_info_opts(info);
struct nlattr *start;
start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_VXLAN);
@@ -622,7 +622,7 @@ static int tunnel_key_vxlan_opts_dump(struct sk_buff *skb,
static int tunnel_key_erspan_opts_dump(struct sk_buff *skb,
const struct ip_tunnel_info *info)
{
- struct erspan_metadata *md = (struct erspan_metadata *)(info + 1);
+ const struct erspan_metadata *md = ip_tunnel_info_opts(info);
struct nlattr *start;
start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN);
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index 8996c73c9779..3f2e707a11d1 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -460,7 +460,7 @@ META_COLLECTOR(int_sk_fwd_alloc)
*err = -1;
return;
}
- dst->value = sk_forward_alloc_get(sk);
+ dst->value = READ_ONCE(sk->sk_forward_alloc);
}
META_COLLECTOR(int_sk_sndbuf)
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 0b102a0ca83d..defb05c1fba4 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -26,6 +26,7 @@
#include <linux/slab.h>
#include <linux/hashtable.h>
+#include <net/netdev_lock.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
@@ -1278,9 +1279,11 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
* We replay the request because the device may
* go away in the mean time.
*/
+ netdev_unlock_ops(dev);
rtnl_unlock();
request_module(NET_SCH_ALIAS_PREFIX "%s", name);
rtnl_lock();
+ netdev_lock_ops(dev);
ops = qdisc_lookup_ops(kind);
if (ops != NULL) {
/* We will try again qdisc_lookup_ops,
@@ -1504,27 +1507,18 @@ const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
* Delete/get qdisc.
*/
-static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
- struct netlink_ext_ack *extack)
+static int __tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
+ struct netlink_ext_ack *extack,
+ struct net_device *dev,
+ struct nlattr *tca[TCA_MAX + 1],
+ struct tcmsg *tcm)
{
struct net *net = sock_net(skb->sk);
- struct tcmsg *tcm = nlmsg_data(n);
- struct nlattr *tca[TCA_MAX + 1];
- struct net_device *dev;
- u32 clid;
struct Qdisc *q = NULL;
struct Qdisc *p = NULL;
+ u32 clid;
int err;
- err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
- rtm_tca_policy, extack);
- if (err < 0)
- return err;
-
- dev = __dev_get_by_index(net, tcm->tcm_ifindex);
- if (!dev)
- return -ENODEV;
-
clid = tcm->tcm_parent;
if (clid) {
if (clid != TC_H_ROOT) {
@@ -1581,6 +1575,31 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
return 0;
}
+static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct tcmsg *tcm = nlmsg_data(n);
+ struct nlattr *tca[TCA_MAX + 1];
+ struct net_device *dev;
+ int err;
+
+ err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
+ rtm_tca_policy, extack);
+ if (err < 0)
+ return err;
+
+ dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
+ return -ENODEV;
+
+ netdev_lock_ops(dev);
+ err = __tc_get_qdisc(skb, n, extack, dev, tca, tcm);
+ netdev_unlock_ops(dev);
+
+ return err;
+}
+
static bool req_create_or_replace(struct nlmsghdr *n)
{
return (n->nlmsg_flags & NLM_F_CREATE &&
@@ -1600,35 +1619,19 @@ static bool req_change(struct nlmsghdr *n)
!(n->nlmsg_flags & NLM_F_EXCL));
}
-/*
- * Create/change qdisc.
- */
-static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
- struct netlink_ext_ack *extack)
+static int __tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
+ struct netlink_ext_ack *extack,
+ struct net_device *dev,
+ struct nlattr *tca[TCA_MAX + 1],
+ struct tcmsg *tcm,
+ bool *replay)
{
- struct net *net = sock_net(skb->sk);
- struct tcmsg *tcm;
- struct nlattr *tca[TCA_MAX + 1];
- struct net_device *dev;
+ struct Qdisc *q = NULL;
+ struct Qdisc *p = NULL;
u32 clid;
- struct Qdisc *q, *p;
int err;
-replay:
- /* Reinit, just in case something touches this. */
- err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
- rtm_tca_policy, extack);
- if (err < 0)
- return err;
-
- tcm = nlmsg_data(n);
clid = tcm->tcm_parent;
- q = p = NULL;
-
- dev = __dev_get_by_index(net, tcm->tcm_ifindex);
- if (!dev)
- return -ENODEV;
-
if (clid) {
if (clid != TC_H_ROOT) {
@@ -1754,7 +1757,7 @@ replay:
}
err = qdisc_change(q, tca, extack);
if (err == 0)
- qdisc_notify(net, skb, n, clid, NULL, q, extack);
+ qdisc_notify(sock_net(skb->sk), skb, n, clid, NULL, q, extack);
return err;
create_n_graft:
@@ -1787,8 +1790,10 @@ create_n_graft2:
tca, &err, extack);
}
if (q == NULL) {
- if (err == -EAGAIN)
- goto replay;
+ if (err == -EAGAIN) {
+ *replay = true;
+ return 0;
+ }
return err;
}
@@ -1803,6 +1808,41 @@ graft:
return 0;
}
+/*
+ * Create/change qdisc.
+ */
+static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *tca[TCA_MAX + 1];
+ struct net_device *dev;
+ struct tcmsg *tcm;
+ bool replay;
+ int err;
+
+replay:
+ /* Reinit, just in case something touches this. */
+ err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
+ rtm_tca_policy, extack);
+ if (err < 0)
+ return err;
+
+ tcm = nlmsg_data(n);
+ dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
+ return -ENODEV;
+
+ replay = false;
+ netdev_lock_ops(dev);
+ err = __tc_modify_qdisc(skb, n, extack, dev, tca, tcm, &replay);
+ netdev_unlock_ops(dev);
+ if (replay)
+ goto replay;
+
+ return err;
+}
+
static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
struct netlink_callback *cb,
int *q_idx_p, int s_q_idx, bool recur,
@@ -1887,17 +1927,23 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
s_q_idx = 0;
q_idx = 0;
+ netdev_lock_ops(dev);
if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc),
skb, cb, &q_idx, s_q_idx,
- true, tca[TCA_DUMP_INVISIBLE]) < 0)
+ true, tca[TCA_DUMP_INVISIBLE]) < 0) {
+ netdev_unlock_ops(dev);
goto done;
+ }
dev_queue = dev_ingress_queue(dev);
if (dev_queue &&
tc_dump_qdisc_root(rtnl_dereference(dev_queue->qdisc_sleeping),
skb, cb, &q_idx, s_q_idx, false,
- tca[TCA_DUMP_INVISIBLE]) < 0)
+ tca[TCA_DUMP_INVISIBLE]) < 0) {
+ netdev_unlock_ops(dev);
goto done;
+ }
+ netdev_unlock_ops(dev);
cont:
idx++;
@@ -2134,15 +2180,15 @@ static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
#endif
-static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
- struct netlink_ext_ack *extack)
+static int __tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
+ struct netlink_ext_ack *extack,
+ struct net_device *dev,
+ struct nlattr *tca[TCA_MAX + 1],
+ struct tcmsg *tcm)
{
struct net *net = sock_net(skb->sk);
- struct tcmsg *tcm = nlmsg_data(n);
- struct nlattr *tca[TCA_MAX + 1];
- struct net_device *dev;
- struct Qdisc *q = NULL;
const struct Qdisc_class_ops *cops;
+ struct Qdisc *q = NULL;
unsigned long cl = 0;
unsigned long new_cl;
u32 portid;
@@ -2150,15 +2196,6 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
u32 qid;
int err;
- err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
- rtm_tca_policy, extack);
- if (err < 0)
- return err;
-
- dev = __dev_get_by_index(net, tcm->tcm_ifindex);
- if (!dev)
- return -ENODEV;
-
/*
parent == TC_H_UNSPEC - unspecified parent.
parent == TC_H_ROOT - class is root, which has no parent.
@@ -2273,6 +2310,31 @@ out:
return err;
}
+static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct tcmsg *tcm = nlmsg_data(n);
+ struct nlattr *tca[TCA_MAX + 1];
+ struct net_device *dev;
+ int err;
+
+ err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
+ rtm_tca_policy, extack);
+ if (err < 0)
+ return err;
+
+ dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
+ return -ENODEV;
+
+ netdev_lock_ops(dev);
+ err = __tc_ctl_tclass(skb, n, extack, dev, tca, tcm);
+ netdev_unlock_ops(dev);
+
+ return err;
+}
+
struct qdisc_dump_args {
struct qdisc_walker w;
struct sk_buff *skb;
@@ -2349,20 +2411,12 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
return 0;
}
-static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
+static int __tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb,
+ struct tcmsg *tcm, struct net_device *dev)
{
- struct tcmsg *tcm = nlmsg_data(cb->nlh);
- struct net *net = sock_net(skb->sk);
struct netdev_queue *dev_queue;
- struct net_device *dev;
int t, s_t;
- if (nlmsg_len(cb->nlh) < sizeof(*tcm))
- return 0;
- dev = dev_get_by_index(net, tcm->tcm_ifindex);
- if (!dev)
- return 0;
-
s_t = cb->args[0];
t = 0;
@@ -2379,10 +2433,32 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
done:
cb->args[0] = t;
- dev_put(dev);
return skb->len;
}
+static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct tcmsg *tcm = nlmsg_data(cb->nlh);
+ struct net *net = sock_net(skb->sk);
+ struct net_device *dev;
+ int err;
+
+ if (nlmsg_len(cb->nlh) < sizeof(*tcm))
+ return 0;
+
+ dev = dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
+ return 0;
+
+ netdev_lock_ops(dev);
+ err = __tc_dump_tclass(skb, cb, tcm, dev);
+ netdev_unlock_ops(dev);
+
+ dev_put(dev);
+
+ return err;
+}
+
#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 6a07cdbdb9e1..2cfbc977fe6d 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -447,7 +447,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
if (q->wsum + delta_w > QFQ_MAX_WSUM) {
NL_SET_ERR_MSG_FMT_MOD(extack,
- "total weight out of range (%d + %u)\n",
+ "total weight out of range (%d + %u)",
delta_w, q->wsum);
return -EINVAL;
}