From 011dd3b3f83f9c89605c640424e05845b84f2dad Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Tue, 14 Nov 2023 12:28:33 +0100 Subject: net: Make dev_set_hwtstamp_phylib accessible Make the dev_set_hwtstamp_phylib function accessible in prevision to use it from ethtool to reset the tstamp current configuration. Reviewed-by: Florian Fainelli Signed-off-by: Kory Maincent Signed-off-by: David S. Miller --- net/core/dev_ioctl.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index feeddf95f450..9a66cf5015f2 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -322,9 +322,9 @@ static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr) * frames and not forward them), it must set IFF_SEE_ALL_HWTSTAMP_REQUESTS in * dev->priv_flags. */ -static int dev_set_hwtstamp_phylib(struct net_device *dev, - struct kernel_hwtstamp_config *cfg, - struct netlink_ext_ack *extack) +int dev_set_hwtstamp_phylib(struct net_device *dev, + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; bool phy_ts = phy_has_hwtstamp(dev->phydev); @@ -363,6 +363,7 @@ static int dev_set_hwtstamp_phylib(struct net_device *dev, return 0; } +EXPORT_SYMBOL_GPL(dev_set_hwtstamp_phylib); static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr) { -- cgit From 51bdf3165f012827644c474a6d905baa3de3f1ea Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Tue, 14 Nov 2023 12:28:40 +0100 Subject: net: Replace hwtstamp_source by timestamping layer Replace hwtstamp_source which is only used by the kernel_hwtstamp_config structure by the more widely use timestamp_layer structure. This is done to prepare the support of selectable timestamping source. Signed-off-by: Kory Maincent Signed-off-by: David S. Miller --- net/core/dev_ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 9a66cf5015f2..267cd00269d0 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -332,7 +332,7 @@ int dev_set_hwtstamp_phylib(struct net_device *dev, bool changed = false; int err; - cfg->source = phy_ts ? HWTSTAMP_SOURCE_PHYLIB : HWTSTAMP_SOURCE_NETDEV; + cfg->source = phy_ts ? PHY_TIMESTAMPING : MAC_TIMESTAMPING; if (phy_ts && (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) { err = ops->ndo_hwtstamp_get(dev, &old_cfg); -- cgit From 0f7f463d4821a4f52fa5c0a961389e651d50c384 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Tue, 14 Nov 2023 12:28:41 +0100 Subject: net: Change the API of PHY default timestamp to MAC Change the API to select MAC default time stamping instead of the PHY. Indeed the PHY is closer to the wire therefore theoretically it has less delay than the MAC timestamping but the reality is different. Due to lower time stamping clock frequency, latency in the MDIO bus and no PHC hardware synchronization between different PHY, the PHY PTP is often less precise than the MAC. The exception is for PHY designed specially for PTP case but these devices are not very widespread. For not breaking the compatibility I introduce a default_timestamp flag in phy_device that is set by the phy driver to know we are using the old API behavior. The phy_set_timestamp function is called at each call of phy_attach_direct. In case of MAC driver using phylink this function is called when the interface is turned up. Then if the interface goes down and up again the last choice of timestamp will be overwritten by the default choice. A solution could be to cache the timestamp status but it can bring other issues. In case of SFP, if we change the module, it doesn't make sense to blindly re-set the timestamp back to PHY, if the new module has a PHY with mediocre timestamping capabilities. Signed-off-by: Kory Maincent Signed-off-by: David S. Miller --- net/core/dev.c | 3 +++ net/core/dev_ioctl.c | 36 ++++++++++++++++++++++-------------- net/core/timestamping.c | 10 ++++++++++ 3 files changed, 35 insertions(+), 14 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index af53f6d838ce..05ce00632892 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10212,6 +10212,9 @@ int register_netdevice(struct net_device *dev) dev->rtnl_link_state == RTNL_LINK_INITIALIZED) rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL); + if (dev->ethtool_ops->get_ts_info) + dev->ts_layer = MAC_TIMESTAMPING; + out: return ret; diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 267cd00269d0..bc8be9749376 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -259,9 +259,7 @@ static int dev_eth_ioctl(struct net_device *dev, * @dev: Network device * @cfg: Timestamping configuration structure * - * Helper for enforcing a common policy that phylib timestamping, if available, - * should take precedence in front of hardware timestamping provided by the - * netdev. + * Helper for calling the selected hardware provider timestamping. * * Note: phy_mii_ioctl() only handles SIOCSHWTSTAMP (not SIOCGHWTSTAMP), and * there only exists a phydev->mii_ts->hwtstamp() method. So this will return @@ -271,10 +269,14 @@ static int dev_eth_ioctl(struct net_device *dev, static int dev_get_hwtstamp_phylib(struct net_device *dev, struct kernel_hwtstamp_config *cfg) { - if (phy_has_hwtstamp(dev->phydev)) + enum timestamping_layer ts_layer = dev->ts_layer; + + if (ts_layer == PHY_TIMESTAMPING) return phy_hwtstamp_get(dev->phydev, cfg); + else if (ts_layer == MAC_TIMESTAMPING) + return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg); - return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg); + return -EOPNOTSUPP; } static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr) @@ -315,9 +317,8 @@ static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr) * @cfg: Timestamping configuration structure * @extack: Netlink extended ack message structure, for error reporting * - * Helper for enforcing a common policy that phylib timestamping, if available, - * should take precedence in front of hardware timestamping provided by the - * netdev. If the netdev driver needs to perform specific actions even for PHY + * Helper for calling the selected hardware provider timestamping. + * If the netdev driver needs to perform specific actions even for PHY * timestamping to work properly (a switch port must trap the timestamped * frames and not forward them), it must set IFF_SEE_ALL_HWTSTAMP_REQUESTS in * dev->priv_flags. @@ -327,20 +328,26 @@ int dev_set_hwtstamp_phylib(struct net_device *dev, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; - bool phy_ts = phy_has_hwtstamp(dev->phydev); + enum timestamping_layer ts_layer = dev->ts_layer; struct kernel_hwtstamp_config old_cfg = {}; bool changed = false; int err; - cfg->source = phy_ts ? PHY_TIMESTAMPING : MAC_TIMESTAMPING; + cfg->source = ts_layer; + + if (ts_layer != PHY_TIMESTAMPING && + ts_layer != MAC_TIMESTAMPING) + return -EOPNOTSUPP; - if (phy_ts && (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) { + if (ts_layer == PHY_TIMESTAMPING && + dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS) { err = ops->ndo_hwtstamp_get(dev, &old_cfg); if (err) return err; } - if (!phy_ts || (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) { + if (ts_layer == MAC_TIMESTAMPING || + dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS) { err = ops->ndo_hwtstamp_set(dev, cfg, extack); if (err) { if (extack->_msg) @@ -349,10 +356,11 @@ int dev_set_hwtstamp_phylib(struct net_device *dev, } } - if (phy_ts && (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) + if (ts_layer == PHY_TIMESTAMPING && + dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS) changed = kernel_hwtstamp_config_changed(&old_cfg, cfg); - if (phy_ts) { + if (ts_layer == PHY_TIMESTAMPING) { err = phy_hwtstamp_set(dev->phydev, cfg, extack); if (err) { if (changed) diff --git a/net/core/timestamping.c b/net/core/timestamping.c index 04840697fe79..5cf51a523fb3 100644 --- a/net/core/timestamping.c +++ b/net/core/timestamping.c @@ -21,6 +21,7 @@ static unsigned int classify(const struct sk_buff *skb) void skb_clone_tx_timestamp(struct sk_buff *skb) { + enum timestamping_layer ts_layer; struct mii_timestamper *mii_ts; struct sk_buff *clone; unsigned int type; @@ -28,6 +29,10 @@ void skb_clone_tx_timestamp(struct sk_buff *skb) if (!skb->sk) return; + ts_layer = skb->dev->ts_layer; + if (ts_layer != PHY_TIMESTAMPING) + return; + type = classify(skb); if (type == PTP_CLASS_NONE) return; @@ -44,12 +49,17 @@ EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp); bool skb_defer_rx_timestamp(struct sk_buff *skb) { + enum timestamping_layer ts_layer; struct mii_timestamper *mii_ts; unsigned int type; if (!skb->dev || !skb->dev->phydev || !skb->dev->phydev->mii_ts) return false; + ts_layer = skb->dev->ts_layer; + if (ts_layer != PHY_TIMESTAMPING) + return false; + if (skb_headroom(skb) < ETH_HLEN) return false; -- cgit From 289354f21b2c3fac93e956efd45f256a88a4d997 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 18 Nov 2023 18:38:05 -0800 Subject: net: partial revert of the "Make timestamping selectable: series Revert following commits: commit acec05fb78ab ("net_tstamp: Add TIMESTAMPING SOFTWARE and HARDWARE mask") commit 11d55be06df0 ("net: ethtool: Add a command to expose current time stamping layer") commit bb8645b00ced ("netlink: specs: Introduce new netlink command to get current timestamp") commit d905f9c75329 ("net: ethtool: Add a command to list available time stamping layers") commit aed5004ee7a0 ("netlink: specs: Introduce new netlink command to list available time stamping layers") commit 51bdf3165f01 ("net: Replace hwtstamp_source by timestamping layer") commit 0f7f463d4821 ("net: Change the API of PHY default timestamp to MAC") commit 091fab122869 ("net: ethtool: ts: Update GET_TS to reply the current selected timestamp") commit 152c75e1d002 ("net: ethtool: ts: Let the active time stamping layer be selectable") commit ee60ea6be0d3 ("netlink: specs: Introduce time stamping set command") They need more time for reviews. Link: https://lore.kernel.org/all/20231118183529.6e67100c@kernel.org/ Signed-off-by: Jakub Kicinski --- net/core/dev.c | 3 --- net/core/dev_ioctl.c | 36 ++++++++++++++---------------------- net/core/timestamping.c | 10 ---------- 3 files changed, 14 insertions(+), 35 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 05ce00632892..af53f6d838ce 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10212,9 +10212,6 @@ int register_netdevice(struct net_device *dev) dev->rtnl_link_state == RTNL_LINK_INITIALIZED) rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL); - if (dev->ethtool_ops->get_ts_info) - dev->ts_layer = MAC_TIMESTAMPING; - out: return ret; diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index bc8be9749376..9a66cf5015f2 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -259,7 +259,9 @@ static int dev_eth_ioctl(struct net_device *dev, * @dev: Network device * @cfg: Timestamping configuration structure * - * Helper for calling the selected hardware provider timestamping. + * Helper for enforcing a common policy that phylib timestamping, if available, + * should take precedence in front of hardware timestamping provided by the + * netdev. * * Note: phy_mii_ioctl() only handles SIOCSHWTSTAMP (not SIOCGHWTSTAMP), and * there only exists a phydev->mii_ts->hwtstamp() method. So this will return @@ -269,14 +271,10 @@ static int dev_eth_ioctl(struct net_device *dev, static int dev_get_hwtstamp_phylib(struct net_device *dev, struct kernel_hwtstamp_config *cfg) { - enum timestamping_layer ts_layer = dev->ts_layer; - - if (ts_layer == PHY_TIMESTAMPING) + if (phy_has_hwtstamp(dev->phydev)) return phy_hwtstamp_get(dev->phydev, cfg); - else if (ts_layer == MAC_TIMESTAMPING) - return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg); - return -EOPNOTSUPP; + return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg); } static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr) @@ -317,8 +315,9 @@ static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr) * @cfg: Timestamping configuration structure * @extack: Netlink extended ack message structure, for error reporting * - * Helper for calling the selected hardware provider timestamping. - * If the netdev driver needs to perform specific actions even for PHY + * Helper for enforcing a common policy that phylib timestamping, if available, + * should take precedence in front of hardware timestamping provided by the + * netdev. If the netdev driver needs to perform specific actions even for PHY * timestamping to work properly (a switch port must trap the timestamped * frames and not forward them), it must set IFF_SEE_ALL_HWTSTAMP_REQUESTS in * dev->priv_flags. @@ -328,26 +327,20 @@ int dev_set_hwtstamp_phylib(struct net_device *dev, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; - enum timestamping_layer ts_layer = dev->ts_layer; + bool phy_ts = phy_has_hwtstamp(dev->phydev); struct kernel_hwtstamp_config old_cfg = {}; bool changed = false; int err; - cfg->source = ts_layer; - - if (ts_layer != PHY_TIMESTAMPING && - ts_layer != MAC_TIMESTAMPING) - return -EOPNOTSUPP; + cfg->source = phy_ts ? HWTSTAMP_SOURCE_PHYLIB : HWTSTAMP_SOURCE_NETDEV; - if (ts_layer == PHY_TIMESTAMPING && - dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS) { + if (phy_ts && (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) { err = ops->ndo_hwtstamp_get(dev, &old_cfg); if (err) return err; } - if (ts_layer == MAC_TIMESTAMPING || - dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS) { + if (!phy_ts || (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) { err = ops->ndo_hwtstamp_set(dev, cfg, extack); if (err) { if (extack->_msg) @@ -356,11 +349,10 @@ int dev_set_hwtstamp_phylib(struct net_device *dev, } } - if (ts_layer == PHY_TIMESTAMPING && - dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS) + if (phy_ts && (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) changed = kernel_hwtstamp_config_changed(&old_cfg, cfg); - if (ts_layer == PHY_TIMESTAMPING) { + if (phy_ts) { err = phy_hwtstamp_set(dev->phydev, cfg, extack); if (err) { if (changed) diff --git a/net/core/timestamping.c b/net/core/timestamping.c index 5cf51a523fb3..04840697fe79 100644 --- a/net/core/timestamping.c +++ b/net/core/timestamping.c @@ -21,7 +21,6 @@ static unsigned int classify(const struct sk_buff *skb) void skb_clone_tx_timestamp(struct sk_buff *skb) { - enum timestamping_layer ts_layer; struct mii_timestamper *mii_ts; struct sk_buff *clone; unsigned int type; @@ -29,10 +28,6 @@ void skb_clone_tx_timestamp(struct sk_buff *skb) if (!skb->sk) return; - ts_layer = skb->dev->ts_layer; - if (ts_layer != PHY_TIMESTAMPING) - return; - type = classify(skb); if (type == PTP_CLASS_NONE) return; @@ -49,17 +44,12 @@ EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp); bool skb_defer_rx_timestamp(struct sk_buff *skb) { - enum timestamping_layer ts_layer; struct mii_timestamper *mii_ts; unsigned int type; if (!skb->dev || !skb->dev->phydev || !skb->dev->phydev->mii_ts) return false; - ts_layer = skb->dev->ts_layer; - if (ts_layer != PHY_TIMESTAMPING) - return false; - if (skb_headroom(skb) < ETH_HLEN) return false; -- cgit From ac40916a3f7243efbe6e129ebf495b5c33a3adfe Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Wed, 15 Nov 2023 20:01:08 +0800 Subject: rtnetlink: introduce nlmsg_new_large and use it in rtnl_getlink if a PF has 256 or more VFs, ip link command will allocate an order 3 memory or more, and maybe trigger OOM due to memory fragment, the VFs needed memory size is computed in rtnl_vfinfo_size. so introduce nlmsg_new_large which calls netlink_alloc_large_skb in which vmalloc is used for large memory, to avoid the failure of allocating memory ip invoked oom-killer: gfp_mask=0xc2cc0(GFP_KERNEL|__GFP_NOWARN|\ __GFP_COMP|__GFP_NOMEMALLOC), order=3, oom_score_adj=0 CPU: 74 PID: 204414 Comm: ip Kdump: loaded Tainted: P OE Call Trace: dump_stack+0x57/0x6a dump_header+0x4a/0x210 oom_kill_process+0xe4/0x140 out_of_memory+0x3e8/0x790 __alloc_pages_slowpath.constprop.116+0x953/0xc50 __alloc_pages_nodemask+0x2af/0x310 kmalloc_large_node+0x38/0xf0 __kmalloc_node_track_caller+0x417/0x4d0 __kmalloc_reserve.isra.61+0x2e/0x80 __alloc_skb+0x82/0x1c0 rtnl_getlink+0x24f/0x370 rtnetlink_rcv_msg+0x12c/0x350 netlink_rcv_skb+0x50/0x100 netlink_unicast+0x1b2/0x280 netlink_sendmsg+0x355/0x4a0 sock_sendmsg+0x5b/0x60 ____sys_sendmsg+0x1ea/0x250 ___sys_sendmsg+0x88/0xd0 __sys_sendmsg+0x5e/0xa0 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f95a65a5b70 Cc: Yunsheng Lin Signed-off-by: Li RongQing Link: https://lore.kernel.org/r/20231115120108.3711-1-lirongqing@baidu.com Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index e8431c6c8490..592164c2a540 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3849,7 +3849,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; err = -ENOBUFS; - nskb = nlmsg_new(if_nlmsg_size(dev, ext_filter_mask), GFP_KERNEL); + nskb = nlmsg_new_large(if_nlmsg_size(dev, ext_filter_mask)); if (nskb == NULL) goto out; -- cgit From dd891b5b106fa7346d75b3ee3448fa0071422f3d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Nov 2023 10:41:40 -0800 Subject: net: do not send a MOVE event when netdev changes netns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Networking supports changing netdevice's netns and name at the same time. This allows avoiding name conflicts and having to rename the interface in multiple steps. E.g. netns1={eth0, eth1}, netns2={eth1} - we want to move netns1:eth1 to netns2 and call it eth0 there. If we can't rename "in flight" we'd need to (1) rename eth1 -> $tmp, (2) change netns, (3) rename $tmp -> eth0. To rename the underlying struct device we have to call device_rename(). The rename()'s MOVE event, however, doesn't "belong" to either the old or the new namespace. If there are conflicts on both sides it's actually impossible to issue a real MOVE (old name -> new name) without confusing user space. And Daniel reports that such confusions do in fact happen for systemd, in real life. Since we already issue explicit REMOVE and ADD events manually - suppress the MOVE event completely. Move the ADD after the rename, so that the REMOVE uses the old name, and the ADD the new one. If there is no rename this changes the picture as follows: Before: old ns | KERNEL[213.399289] remove /devices/virtual/net/eth0 (net) new ns | KERNEL[213.401302] add /devices/virtual/net/eth0 (net) new ns | KERNEL[213.401397] move /devices/virtual/net/eth0 (net) After: old ns | KERNEL[266.774257] remove /devices/virtual/net/eth0 (net) new ns | KERNEL[266.774509] add /devices/virtual/net/eth0 (net) If there is a rename and a conflict (using the exact eth0/eth1 example explained above) we get this: Before: old ns | KERNEL[224.316833] remove /devices/virtual/net/eth1 (net) new ns | KERNEL[224.318551] add /devices/virtual/net/eth1 (net) new ns | KERNEL[224.319662] move /devices/virtual/net/eth0 (net) After: old ns | KERNEL[333.033166] remove /devices/virtual/net/eth1 (net) new ns | KERNEL[333.035098] add /devices/virtual/net/eth0 (net) Note that "in flight" rename is only performed when needed. If there is no conflict for old name in the target netns - the rename will be performed separately by dev_change_name(), as if the rename was a different command, and there will still be a MOVE event for the rename: Before: old ns | KERNEL[194.416429] remove /devices/virtual/net/eth0 (net) new ns | KERNEL[194.418809] add /devices/virtual/net/eth0 (net) new ns | KERNEL[194.418869] move /devices/virtual/net/eth0 (net) new ns | KERNEL[194.420866] move /devices/virtual/net/eth1 (net) After: old ns | KERNEL[71.917520] remove /devices/virtual/net/eth0 (net) new ns | KERNEL[71.919155] add /devices/virtual/net/eth0 (net) new ns | KERNEL[71.920729] move /devices/virtual/net/eth1 (net) If deleting the MOVE event breaks some user space we should insert an explicit kobject_uevent(MOVE) after the ADD, like this: @@ -11192,6 +11192,12 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, kobject_uevent(&dev->dev.kobj, KOBJ_ADD); netdev_adjacent_add_links(dev); + /* User space wants an explicit MOVE event, issue one unless + * dev_change_name() will get called later and issue one. + */ + if (!pat || new_name[0]) + kobject_uevent(&dev->dev.kobj, KOBJ_MOVE); + /* Adapt owner in case owning user namespace of target network * namespace is different from the original one. */ Reported-by: Daniel Gröber Link: https://lore.kernel.org/all/20231010121003.x3yi6fihecewjy4e@House.clients.dxld.at/ Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/all/20231120184140.578375-1-kuba@kernel.org/ Signed-off-by: Jakub Kicinski --- net/core/dev.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index af53f6d838ce..0bdc6bf758f3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11181,17 +11181,19 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, dev_net_set(dev, net); dev->ifindex = new_ifindex; - /* Send a netdev-add uevent to the new namespace */ - kobject_uevent(&dev->dev.kobj, KOBJ_ADD); - netdev_adjacent_add_links(dev); - if (new_name[0]) /* Rename the netdev to prepared name */ strscpy(dev->name, new_name, IFNAMSIZ); /* Fixup kobjects */ + dev_set_uevent_suppress(&dev->dev, 1); err = device_rename(&dev->dev, dev->name); + dev_set_uevent_suppress(&dev->dev, 0); WARN_ON(err); + /* Send a netdev-add uevent to the new namespace */ + kobject_uevent(&dev->dev.kobj, KOBJ_ADD); + netdev_adjacent_add_links(dev); + /* Adapt owner in case owning user namespace of target network * namespace is different from the original one. */ -- cgit From 5027ec19f1049a07df5b0a37b1f462514cf2724b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Nov 2023 16:00:34 -0800 Subject: net: page_pool: split the page_pool_params into fast and slow struct page_pool is rather performance critical and we use 16B of the first cache line to store 2 pointers used only by test code. Future patches will add more informational (non-fast path) attributes. It's convenient for the user of the API to not have to worry which fields are fast and which are slow path. Use struct groups to split the params into the two categories internally. Acked-by: Jesper Dangaard Brouer Reviewed-by: Mina Almasry Reviewed-by: Ilias Apalodimas Link: https://lore.kernel.org/r/20231121000048.789613-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/page_pool.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/page_pool.c b/net/core/page_pool.c index dec544337236..ab22a2fdae57 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -173,7 +173,8 @@ static int page_pool_init(struct page_pool *pool, { unsigned int ring_qsize = 1024; /* Default */ - memcpy(&pool->p, params, sizeof(pool->p)); + memcpy(&pool->p, ¶ms->fast, sizeof(pool->p)); + memcpy(&pool->slow, ¶ms->slow, sizeof(pool->slow)); /* Validate only known flags were used */ if (pool->p.flags & ~(PP_FLAG_ALL)) @@ -388,8 +389,8 @@ static void page_pool_set_pp_info(struct page_pool *pool, * the overhead is negligible. */ page_pool_fragment_page(page, 1); - if (pool->p.init_callback) - pool->p.init_callback(page, pool->p.init_arg); + if (pool->slow.init_callback) + pool->slow.init_callback(page, pool->slow.init_arg); } static void page_pool_clear_pp_info(struct page *page) -- cgit From 2da0cac1e9494f34c5a3438e5c4c7e662e1b7445 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Nov 2023 16:00:35 -0800 Subject: net: page_pool: avoid touching slow on the fastpath To fully benefit from previous commit add one byte of state in the first cache line recording if we need to look at the slow part. The packing isn't all that impressive right now, we create a 7B hole. I'm expecting Olek's rework will reshuffle this, anyway. Acked-by: Jesper Dangaard Brouer Reviewed-by: Ilias Apalodimas Reviewed-by: Mina Almasry Link: https://lore.kernel.org/r/20231121000048.789613-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/page_pool.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/page_pool.c b/net/core/page_pool.c index ab22a2fdae57..df2a06d7da52 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -212,6 +212,8 @@ static int page_pool_init(struct page_pool *pool, */ } + pool->has_init_callback = !!pool->slow.init_callback; + #ifdef CONFIG_PAGE_POOL_STATS pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats); if (!pool->recycle_stats) @@ -389,7 +391,7 @@ static void page_pool_set_pp_info(struct page_pool *pool, * the overhead is negligible. */ page_pool_fragment_page(page, 1); - if (pool->slow.init_callback) + if (pool->has_init_callback) pool->slow.init_callback(page, pool->slow.init_arg); } -- cgit From 23cfaf67ba5d2f013d2576b8a9173c45a4a7f895 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 26 Nov 2023 15:07:28 -0800 Subject: net: page_pool: factor out uninit We'll soon (next change in the series) need a fuller unwind path in page_pool_create() so create the inverse of page_pool_init(). Reviewed-by: Ilias Apalodimas Reviewed-by: Eric Dumazet Acked-by: Jesper Dangaard Brouer Signed-off-by: Jakub Kicinski Reviewed-by: Shakeel Butt Signed-off-by: Paolo Abeni --- net/core/page_pool.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/net/core/page_pool.c b/net/core/page_pool.c index df2a06d7da52..2e4575477e71 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -238,6 +238,18 @@ static int page_pool_init(struct page_pool *pool, return 0; } +static void page_pool_uninit(struct page_pool *pool) +{ + ptr_ring_cleanup(&pool->ring, NULL); + + if (pool->p.flags & PP_FLAG_DMA_MAP) + put_device(pool->p.dev); + +#ifdef CONFIG_PAGE_POOL_STATS + free_percpu(pool->recycle_stats); +#endif +} + /** * page_pool_create() - create a page pool. * @params: parameters, see struct page_pool_params @@ -821,14 +833,7 @@ static void __page_pool_destroy(struct page_pool *pool) if (pool->disconnect) pool->disconnect(pool); - ptr_ring_cleanup(&pool->ring, NULL); - - if (pool->p.flags & PP_FLAG_DMA_MAP) - put_device(pool->p.dev); - -#ifdef CONFIG_PAGE_POOL_STATS - free_percpu(pool->recycle_stats); -#endif + page_pool_uninit(pool); kfree(pool); } -- cgit From f17c69649c698e4df3cfe0010b7bbf142dec3e40 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 26 Nov 2023 15:07:29 -0800 Subject: net: page_pool: id the page pools To give ourselves the flexibility of creating netlink commands and ability to refer to page pool instances in uAPIs create IDs for page pools. Reviewed-by: Ilias Apalodimas Reviewed-by: Eric Dumazet Acked-by: Jesper Dangaard Brouer Signed-off-by: Jakub Kicinski Reviewed-by: Shakeel Butt Signed-off-by: Paolo Abeni --- net/core/Makefile | 2 +- net/core/page_pool.c | 21 ++++++++++++++++----- net/core/page_pool_priv.h | 9 +++++++++ net/core/page_pool_user.c | 36 ++++++++++++++++++++++++++++++++++++ 4 files changed, 62 insertions(+), 6 deletions(-) create mode 100644 net/core/page_pool_priv.h create mode 100644 net/core/page_pool_user.c (limited to 'net/core') diff --git a/net/core/Makefile b/net/core/Makefile index 0cb734cbc24b..821aec06abf1 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -18,7 +18,7 @@ obj-y += dev.o dev_addr_lists.o dst.o netevent.o \ obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o obj-y += net-sysfs.o -obj-$(CONFIG_PAGE_POOL) += page_pool.o +obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NETPOLL) += netpoll.o diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 2e4575477e71..a8d96ea38d18 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -23,6 +23,8 @@ #include +#include "page_pool_priv.h" + #define DEFER_TIME (msecs_to_jiffies(1000)) #define DEFER_WARN_INTERVAL (60 * HZ) @@ -264,13 +266,21 @@ struct page_pool *page_pool_create(const struct page_pool_params *params) return ERR_PTR(-ENOMEM); err = page_pool_init(pool, params); - if (err < 0) { - pr_warn("%s() gave up with errno %d\n", __func__, err); - kfree(pool); - return ERR_PTR(err); - } + if (err < 0) + goto err_free; + + err = page_pool_list(pool); + if (err) + goto err_uninit; return pool; + +err_uninit: + page_pool_uninit(pool); +err_free: + pr_warn("%s() gave up with errno %d\n", __func__, err); + kfree(pool); + return ERR_PTR(err); } EXPORT_SYMBOL(page_pool_create); @@ -833,6 +843,7 @@ static void __page_pool_destroy(struct page_pool *pool) if (pool->disconnect) pool->disconnect(pool); + page_pool_unlist(pool); page_pool_uninit(pool); kfree(pool); } diff --git a/net/core/page_pool_priv.h b/net/core/page_pool_priv.h new file mode 100644 index 000000000000..c17ea092b4ab --- /dev/null +++ b/net/core/page_pool_priv.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __PAGE_POOL_PRIV_H +#define __PAGE_POOL_PRIV_H + +int page_pool_list(struct page_pool *pool); +void page_pool_unlist(struct page_pool *pool); + +#endif diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c new file mode 100644 index 000000000000..630d1eeecf2a --- /dev/null +++ b/net/core/page_pool_user.c @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include + +#include "page_pool_priv.h" + +static DEFINE_XARRAY_FLAGS(page_pools, XA_FLAGS_ALLOC1); +static DEFINE_MUTEX(page_pools_lock); + +int page_pool_list(struct page_pool *pool) +{ + static u32 id_alloc_next; + int err; + + mutex_lock(&page_pools_lock); + err = xa_alloc_cyclic(&page_pools, &pool->user.id, pool, xa_limit_32b, + &id_alloc_next, GFP_KERNEL); + if (err < 0) + goto err_unlock; + + mutex_unlock(&page_pools_lock); + return 0; + +err_unlock: + mutex_unlock(&page_pools_lock); + return err; +} + +void page_pool_unlist(struct page_pool *pool) +{ + mutex_lock(&page_pools_lock); + xa_erase(&page_pools, pool->user.id); + mutex_unlock(&page_pools_lock); +} -- cgit From 083772c9f972dcc248913b52a0dec1025baa1e16 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 26 Nov 2023 15:07:30 -0800 Subject: net: page_pool: record pools per netdev Link the page pools with netdevs. This needs to be netns compatible so we have two options. Either we record the pools per netns and have to worry about moving them as the netdev gets moved. Or we record them directly on the netdev so they move with the netdev without any extra work. Implement the latter option. Since pools may outlast netdev we need a place to store orphans. In time honored tradition use loopback for this purpose. Reviewed-by: Mina Almasry Reviewed-by: Eric Dumazet Acked-by: Jesper Dangaard Brouer Signed-off-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- net/core/page_pool_user.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) (limited to 'net/core') diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c index 630d1eeecf2a..e5c7f078fbd4 100644 --- a/net/core/page_pool_user.c +++ b/net/core/page_pool_user.c @@ -1,14 +1,31 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include +#include #include #include "page_pool_priv.h" static DEFINE_XARRAY_FLAGS(page_pools, XA_FLAGS_ALLOC1); +/* Protects: page_pools, netdevice->page_pools, pool->slow.netdev, pool->user. + * Ordering: inside rtnl_lock + */ static DEFINE_MUTEX(page_pools_lock); +/* Page pools are only reachable from user space (via netlink) if they are + * linked to a netdev at creation time. Following page pool "visibility" + * states are possible: + * - normal + * - user.list: linked to real netdev, netdev: real netdev + * - orphaned - real netdev has disappeared + * - user.list: linked to lo, netdev: lo + * - invisible - either (a) created without netdev linking, (b) unlisted due + * to error, or (c) the entire namespace which owned this pool disappeared + * - user.list: unhashed, netdev: unknown + */ + int page_pool_list(struct page_pool *pool) { static u32 id_alloc_next; @@ -20,6 +37,10 @@ int page_pool_list(struct page_pool *pool) if (err < 0) goto err_unlock; + if (pool->slow.netdev) + hlist_add_head(&pool->user.list, + &pool->slow.netdev->page_pools); + mutex_unlock(&page_pools_lock); return 0; @@ -32,5 +53,68 @@ void page_pool_unlist(struct page_pool *pool) { mutex_lock(&page_pools_lock); xa_erase(&page_pools, pool->user.id); + hlist_del(&pool->user.list); + mutex_unlock(&page_pools_lock); +} + +static void page_pool_unreg_netdev_wipe(struct net_device *netdev) +{ + struct page_pool *pool; + struct hlist_node *n; + + mutex_lock(&page_pools_lock); + hlist_for_each_entry_safe(pool, n, &netdev->page_pools, user.list) { + hlist_del_init(&pool->user.list); + pool->slow.netdev = NET_PTR_POISON; + } + mutex_unlock(&page_pools_lock); +} + +static void page_pool_unreg_netdev(struct net_device *netdev) +{ + struct page_pool *pool, *last; + struct net_device *lo; + + lo = dev_net(netdev)->loopback_dev; + + mutex_lock(&page_pools_lock); + last = NULL; + hlist_for_each_entry(pool, &netdev->page_pools, user.list) { + pool->slow.netdev = lo; + last = pool; + } + if (last) + hlist_splice_init(&netdev->page_pools, &last->user.list, + &lo->page_pools); mutex_unlock(&page_pools_lock); } + +static int +page_pool_netdevice_event(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + + if (event != NETDEV_UNREGISTER) + return NOTIFY_DONE; + + if (hlist_empty(&netdev->page_pools)) + return NOTIFY_OK; + + if (netdev->ifindex != LOOPBACK_IFINDEX) + page_pool_unreg_netdev(netdev); + else + page_pool_unreg_netdev_wipe(netdev); + return NOTIFY_OK; +} + +static struct notifier_block page_pool_netdevice_nb = { + .notifier_call = page_pool_netdevice_event, +}; + +static int __init page_pool_user_init(void) +{ + return register_netdevice_notifier(&page_pool_netdevice_nb); +} + +subsys_initcall(page_pool_user_init); -- cgit From 02b3de80c5f879f92e5f4bb3f535d172e0fc0ea0 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 26 Nov 2023 15:07:31 -0800 Subject: net: page_pool: stash the NAPI ID for easier access To avoid any issues with race conditions on accessing napi and having to think about the lifetime of NAPI objects in netlink GET - stash the napi_id to which page pool was linked at creation time. Reviewed-by: Eric Dumazet Acked-by: Jesper Dangaard Brouer Signed-off-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- net/core/page_pool_user.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c index e5c7f078fbd4..2888aa8dd3e4 100644 --- a/net/core/page_pool_user.c +++ b/net/core/page_pool_user.c @@ -37,9 +37,11 @@ int page_pool_list(struct page_pool *pool) if (err < 0) goto err_unlock; - if (pool->slow.netdev) + if (pool->slow.netdev) { hlist_add_head(&pool->user.list, &pool->slow.netdev->page_pools); + pool->user.napi_id = pool->p.napi ? pool->p.napi->napi_id : 0; + } mutex_unlock(&page_pools_lock); return 0; -- cgit From 950ab53b77ab829defeb22bc98d40a5e926ae018 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 26 Nov 2023 15:07:34 -0800 Subject: net: page_pool: implement GET in the netlink API Expose the very basic page pool information via netlink. Example using ynl-py for a system with 9 queues: $ ./cli.py --no-schema --spec netlink/specs/netdev.yaml \ --dump page-pool-get [{'id': 19, 'ifindex': 2, 'napi-id': 147}, {'id': 18, 'ifindex': 2, 'napi-id': 146}, {'id': 17, 'ifindex': 2, 'napi-id': 145}, {'id': 16, 'ifindex': 2, 'napi-id': 144}, {'id': 15, 'ifindex': 2, 'napi-id': 143}, {'id': 14, 'ifindex': 2, 'napi-id': 142}, {'id': 13, 'ifindex': 2, 'napi-id': 141}, {'id': 12, 'ifindex': 2, 'napi-id': 140}, {'id': 11, 'ifindex': 2, 'napi-id': 139}, {'id': 10, 'ifindex': 2, 'napi-id': 138}] Reviewed-by: Eric Dumazet Acked-by: Jesper Dangaard Brouer Signed-off-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- net/core/netdev-genl-gen.c | 27 ++++++++++ net/core/netdev-genl-gen.h | 3 ++ net/core/page_pool_user.c | 127 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 157 insertions(+) (limited to 'net/core') diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index ea9231378aa6..bfde13981c77 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -10,11 +10,24 @@ #include +/* Integer value ranges */ +static const struct netlink_range_validation netdev_a_page_pool_id_range = { + .min = 1ULL, + .max = 4294967295ULL, +}; + /* NETDEV_CMD_DEV_GET - do */ static const struct nla_policy netdev_dev_get_nl_policy[NETDEV_A_DEV_IFINDEX + 1] = { [NETDEV_A_DEV_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), }; +/* NETDEV_CMD_PAGE_POOL_GET - do */ +#ifdef CONFIG_PAGE_POOL +static const struct nla_policy netdev_page_pool_get_nl_policy[NETDEV_A_PAGE_POOL_ID + 1] = { + [NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range), +}; +#endif /* CONFIG_PAGE_POOL */ + /* Ops table for netdev */ static const struct genl_split_ops netdev_nl_ops[] = { { @@ -29,6 +42,20 @@ static const struct genl_split_ops netdev_nl_ops[] = { .dumpit = netdev_nl_dev_get_dumpit, .flags = GENL_CMD_CAP_DUMP, }, +#ifdef CONFIG_PAGE_POOL + { + .cmd = NETDEV_CMD_PAGE_POOL_GET, + .doit = netdev_nl_page_pool_get_doit, + .policy = netdev_page_pool_get_nl_policy, + .maxattr = NETDEV_A_PAGE_POOL_ID, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = NETDEV_CMD_PAGE_POOL_GET, + .dumpit = netdev_nl_page_pool_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, +#endif /* CONFIG_PAGE_POOL */ }; static const struct genl_multicast_group netdev_nl_mcgrps[] = { diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index 7b370c073e7d..a011d12abff4 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -13,6 +13,9 @@ int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info); int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int netdev_nl_page_pool_get_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_page_pool_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); enum { NETDEV_NLGRP_MGMT, diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c index 2888aa8dd3e4..7eb37c31fce9 100644 --- a/net/core/page_pool_user.c +++ b/net/core/page_pool_user.c @@ -5,8 +5,10 @@ #include #include #include +#include #include "page_pool_priv.h" +#include "netdev-genl-gen.h" static DEFINE_XARRAY_FLAGS(page_pools, XA_FLAGS_ALLOC1); /* Protects: page_pools, netdevice->page_pools, pool->slow.netdev, pool->user. @@ -26,6 +28,131 @@ static DEFINE_MUTEX(page_pools_lock); * - user.list: unhashed, netdev: unknown */ +typedef int (*pp_nl_fill_cb)(struct sk_buff *rsp, const struct page_pool *pool, + const struct genl_info *info); + +static int +netdev_nl_page_pool_get_do(struct genl_info *info, u32 id, pp_nl_fill_cb fill) +{ + struct page_pool *pool; + struct sk_buff *rsp; + int err; + + mutex_lock(&page_pools_lock); + pool = xa_load(&page_pools, id); + if (!pool || hlist_unhashed(&pool->user.list) || + !net_eq(dev_net(pool->slow.netdev), genl_info_net(info))) { + err = -ENOENT; + goto err_unlock; + } + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) { + err = -ENOMEM; + goto err_unlock; + } + + err = fill(rsp, pool, info); + if (err) + goto err_free_msg; + + mutex_unlock(&page_pools_lock); + + return genlmsg_reply(rsp, info); + +err_free_msg: + nlmsg_free(rsp); +err_unlock: + mutex_unlock(&page_pools_lock); + return err; +} + +struct page_pool_dump_cb { + unsigned long ifindex; + u32 pp_id; +}; + +static int +netdev_nl_page_pool_get_dump(struct sk_buff *skb, struct netlink_callback *cb, + pp_nl_fill_cb fill) +{ + struct page_pool_dump_cb *state = (void *)cb->ctx; + const struct genl_info *info = genl_info_dump(cb); + struct net *net = sock_net(skb->sk); + struct net_device *netdev; + struct page_pool *pool; + int err = 0; + + rtnl_lock(); + mutex_lock(&page_pools_lock); + for_each_netdev_dump(net, netdev, state->ifindex) { + hlist_for_each_entry(pool, &netdev->page_pools, user.list) { + if (state->pp_id && state->pp_id < pool->user.id) + continue; + + state->pp_id = pool->user.id; + err = fill(skb, pool, info); + if (err) + break; + } + + state->pp_id = 0; + } + mutex_unlock(&page_pools_lock); + rtnl_unlock(); + + if (skb->len && err == -EMSGSIZE) + return skb->len; + return err; +} + +static int +page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, + const struct genl_info *info) +{ + void *hdr; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_ID, pool->user.id)) + goto err_cancel; + + if (pool->slow.netdev->ifindex != LOOPBACK_IFINDEX && + nla_put_u32(rsp, NETDEV_A_PAGE_POOL_IFINDEX, + pool->slow.netdev->ifindex)) + goto err_cancel; + if (pool->user.napi_id && + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_NAPI_ID, pool->user.napi_id)) + goto err_cancel; + + genlmsg_end(rsp, hdr); + + return 0; +err_cancel: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +int netdev_nl_page_pool_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + u32 id; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_PAGE_POOL_ID)) + return -EINVAL; + + id = nla_get_uint(info->attrs[NETDEV_A_PAGE_POOL_ID]); + + return netdev_nl_page_pool_get_do(info, id, page_pool_nl_fill); +} + +int netdev_nl_page_pool_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return netdev_nl_page_pool_get_dump(skb, cb, page_pool_nl_fill); +} + int page_pool_list(struct page_pool *pool) { static u32 id_alloc_next; -- cgit From d2ef6aa077bdd0b3495dba5dcae6d3f19579b20b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 26 Nov 2023 15:07:35 -0800 Subject: net: page_pool: add netlink notifications for state changes Generate netlink notifications about page pool state changes. Reviewed-by: Eric Dumazet Acked-by: Jesper Dangaard Brouer Signed-off-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- net/core/netdev-genl-gen.c | 1 + net/core/netdev-genl-gen.h | 1 + net/core/page_pool_user.c | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+) (limited to 'net/core') diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index bfde13981c77..47fb5e1b6369 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -60,6 +60,7 @@ static const struct genl_split_ops netdev_nl_ops[] = { static const struct genl_multicast_group netdev_nl_mcgrps[] = { [NETDEV_NLGRP_MGMT] = { "mgmt", }, + [NETDEV_NLGRP_PAGE_POOL] = { "page-pool", }, }; struct genl_family netdev_nl_family __ro_after_init = { diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index a011d12abff4..738097847100 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -19,6 +19,7 @@ int netdev_nl_page_pool_get_dumpit(struct sk_buff *skb, enum { NETDEV_NLGRP_MGMT, + NETDEV_NLGRP_PAGE_POOL, }; extern struct genl_family netdev_nl_family; diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c index 7eb37c31fce9..1577fef880c9 100644 --- a/net/core/page_pool_user.c +++ b/net/core/page_pool_user.c @@ -135,6 +135,37 @@ err_cancel: return -EMSGSIZE; } +static void netdev_nl_page_pool_event(const struct page_pool *pool, u32 cmd) +{ + struct genl_info info; + struct sk_buff *ntf; + struct net *net; + + lockdep_assert_held(&page_pools_lock); + + /* 'invisible' page pools don't matter */ + if (hlist_unhashed(&pool->user.list)) + return; + net = dev_net(pool->slow.netdev); + + if (!genl_has_listeners(&netdev_nl_family, net, NETDEV_NLGRP_PAGE_POOL)) + return; + + genl_info_init_ntf(&info, &netdev_nl_family, cmd); + + ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!ntf) + return; + + if (page_pool_nl_fill(ntf, pool, &info)) { + nlmsg_free(ntf); + return; + } + + genlmsg_multicast_netns(&netdev_nl_family, net, ntf, + 0, NETDEV_NLGRP_PAGE_POOL, GFP_KERNEL); +} + int netdev_nl_page_pool_get_doit(struct sk_buff *skb, struct genl_info *info) { u32 id; @@ -168,6 +199,8 @@ int page_pool_list(struct page_pool *pool) hlist_add_head(&pool->user.list, &pool->slow.netdev->page_pools); pool->user.napi_id = pool->p.napi ? pool->p.napi->napi_id : 0; + + netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_ADD_NTF); } mutex_unlock(&page_pools_lock); @@ -181,6 +214,7 @@ err_unlock: void page_pool_unlist(struct page_pool *pool) { mutex_lock(&page_pools_lock); + netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_DEL_NTF); xa_erase(&page_pools, pool->user.id); hlist_del(&pool->user.list); mutex_unlock(&page_pools_lock); @@ -210,6 +244,8 @@ static void page_pool_unreg_netdev(struct net_device *netdev) last = NULL; hlist_for_each_entry(pool, &netdev->page_pools, user.list) { pool->slow.netdev = lo; + netdev_nl_page_pool_event(pool, + NETDEV_CMD_PAGE_POOL_CHANGE_NTF); last = pool; } if (last) -- cgit From 7aee8429eedd0970d8add2fb5b856bfc5f5f1fc1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 26 Nov 2023 15:07:36 -0800 Subject: net: page_pool: report amount of memory held by page pools Advanced deployments need the ability to check memory use of various system components. It makes it possible to make informed decisions about memory allocation and to find regressions and leaks. Report memory use of page pools. Report both number of references and bytes held. Acked-by: Jesper Dangaard Brouer Signed-off-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- net/core/page_pool.c | 13 +++++++++---- net/core/page_pool_priv.h | 2 ++ net/core/page_pool_user.c | 8 ++++++++ 3 files changed, 19 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/page_pool.c b/net/core/page_pool.c index a8d96ea38d18..566390759294 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -529,7 +529,7 @@ EXPORT_SYMBOL(page_pool_alloc_pages); */ #define _distance(a, b) (s32)((a) - (b)) -static s32 page_pool_inflight(struct page_pool *pool) +s32 page_pool_inflight(const struct page_pool *pool, bool strict) { u32 release_cnt = atomic_read(&pool->pages_state_release_cnt); u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt); @@ -537,8 +537,13 @@ static s32 page_pool_inflight(struct page_pool *pool) inflight = _distance(hold_cnt, release_cnt); - trace_page_pool_release(pool, inflight, hold_cnt, release_cnt); - WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight); + if (strict) { + trace_page_pool_release(pool, inflight, hold_cnt, release_cnt); + WARN(inflight < 0, "Negative(%d) inflight packet-pages", + inflight); + } else { + inflight = max(0, inflight); + } return inflight; } @@ -881,7 +886,7 @@ static int page_pool_release(struct page_pool *pool) int inflight; page_pool_scrub(pool); - inflight = page_pool_inflight(pool); + inflight = page_pool_inflight(pool, true); if (!inflight) __page_pool_destroy(pool); diff --git a/net/core/page_pool_priv.h b/net/core/page_pool_priv.h index c17ea092b4ab..72fb21ea1ddc 100644 --- a/net/core/page_pool_priv.h +++ b/net/core/page_pool_priv.h @@ -3,6 +3,8 @@ #ifndef __PAGE_POOL_PRIV_H #define __PAGE_POOL_PRIV_H +s32 page_pool_inflight(const struct page_pool *pool, bool strict); + int page_pool_list(struct page_pool *pool); void page_pool_unlist(struct page_pool *pool); diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c index 1577fef880c9..2db71e718485 100644 --- a/net/core/page_pool_user.c +++ b/net/core/page_pool_user.c @@ -110,6 +110,7 @@ static int page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, const struct genl_info *info) { + size_t inflight, refsz; void *hdr; hdr = genlmsg_iput(rsp, info); @@ -127,6 +128,13 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, nla_put_uint(rsp, NETDEV_A_PAGE_POOL_NAPI_ID, pool->user.napi_id)) goto err_cancel; + inflight = page_pool_inflight(pool, false); + refsz = PAGE_SIZE << pool->p.order; + if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_INFLIGHT, inflight) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_INFLIGHT_MEM, + inflight * refsz)) + goto err_cancel; + genlmsg_end(rsp, hdr); return 0; -- cgit From 69cb4952b6f6a226c1c0a7ca400398aaa8f75cf2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 26 Nov 2023 15:07:37 -0800 Subject: net: page_pool: report when page pool was destroyed Report when page pool was destroyed. Together with the inflight / memory use reporting this can serve as a replacement for the warning about leaked page pools we currently print to dmesg. Example output for a fake leaked page pool using some hacks in netdevsim (one "live" pool, and one "leaked" on the same dev): $ ./cli.py --no-schema --spec netlink/specs/netdev.yaml \ --dump page-pool-get [{'id': 2, 'ifindex': 3}, {'id': 1, 'ifindex': 3, 'destroyed': 133, 'inflight': 1}] Tested-by: Dragos Tatulea Reviewed-by: Eric Dumazet Acked-by: Jesper Dangaard Brouer Signed-off-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- net/core/page_pool.c | 1 + net/core/page_pool_priv.h | 1 + net/core/page_pool_user.c | 12 ++++++++++++ 3 files changed, 14 insertions(+) (limited to 'net/core') diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 566390759294..a821fb5fe054 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -953,6 +953,7 @@ void page_pool_destroy(struct page_pool *pool) if (!page_pool_release(pool)) return; + page_pool_detached(pool); pool->defer_start = jiffies; pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; diff --git a/net/core/page_pool_priv.h b/net/core/page_pool_priv.h index 72fb21ea1ddc..90665d40f1eb 100644 --- a/net/core/page_pool_priv.h +++ b/net/core/page_pool_priv.h @@ -6,6 +6,7 @@ s32 page_pool_inflight(const struct page_pool *pool, bool strict); int page_pool_list(struct page_pool *pool); +void page_pool_detached(struct page_pool *pool); void page_pool_unlist(struct page_pool *pool); #endif diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c index 2db71e718485..bd5ca94f683f 100644 --- a/net/core/page_pool_user.c +++ b/net/core/page_pool_user.c @@ -134,6 +134,10 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, nla_put_uint(rsp, NETDEV_A_PAGE_POOL_INFLIGHT_MEM, inflight * refsz)) goto err_cancel; + if (pool->user.detach_time && + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_DETACH_TIME, + pool->user.detach_time)) + goto err_cancel; genlmsg_end(rsp, hdr); @@ -219,6 +223,14 @@ err_unlock: return err; } +void page_pool_detached(struct page_pool *pool) +{ + mutex_lock(&page_pools_lock); + pool->user.detach_time = ktime_get_boottime_seconds(); + netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_CHANGE_NTF); + mutex_unlock(&page_pools_lock); +} + void page_pool_unlist(struct page_pool *pool) { mutex_lock(&page_pools_lock); -- cgit From d49010adae737638447369a4eff8f1aab736b076 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 26 Nov 2023 15:07:38 -0800 Subject: net: page_pool: expose page pool stats via netlink Dump the stats into netlink. More clever approaches like dumping the stats per-CPU for each CPU individually to see where the packets get consumed can be implemented in the future. A trimmed example from a real (but recently booted system): $ ./cli.py --no-schema --spec netlink/specs/netdev.yaml \ --dump page-pool-stats-get [{'info': {'id': 19, 'ifindex': 2}, 'alloc-empty': 48, 'alloc-fast': 3024, 'alloc-refill': 0, 'alloc-slow': 48, 'alloc-slow-high-order': 0, 'alloc-waive': 0, 'recycle-cache-full': 0, 'recycle-cached': 0, 'recycle-released-refcnt': 0, 'recycle-ring': 0, 'recycle-ring-full': 0}, {'info': {'id': 18, 'ifindex': 2}, 'alloc-empty': 66, 'alloc-fast': 11811, 'alloc-refill': 35, 'alloc-slow': 66, 'alloc-slow-high-order': 0, 'alloc-waive': 0, 'recycle-cache-full': 1145, 'recycle-cached': 6541, 'recycle-released-refcnt': 0, 'recycle-ring': 1275, 'recycle-ring-full': 0}, {'info': {'id': 17, 'ifindex': 2}, 'alloc-empty': 73, 'alloc-fast': 62099, 'alloc-refill': 413, ... Acked-by: Jesper Dangaard Brouer Signed-off-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- net/core/netdev-genl-gen.c | 32 ++++++++++++++ net/core/netdev-genl-gen.h | 7 +++ net/core/page_pool.c | 2 +- net/core/page_pool_user.c | 103 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 143 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index 47fb5e1b6369..dccd8c3a141e 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -16,6 +16,17 @@ static const struct netlink_range_validation netdev_a_page_pool_id_range = { .max = 4294967295ULL, }; +static const struct netlink_range_validation netdev_a_page_pool_ifindex_range = { + .min = 1ULL, + .max = 2147483647ULL, +}; + +/* Common nested types */ +const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1] = { + [NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range), + [NETDEV_A_PAGE_POOL_IFINDEX] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_page_pool_ifindex_range), +}; + /* NETDEV_CMD_DEV_GET - do */ static const struct nla_policy netdev_dev_get_nl_policy[NETDEV_A_DEV_IFINDEX + 1] = { [NETDEV_A_DEV_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), @@ -28,6 +39,13 @@ static const struct nla_policy netdev_page_pool_get_nl_policy[NETDEV_A_PAGE_POOL }; #endif /* CONFIG_PAGE_POOL */ +/* NETDEV_CMD_PAGE_POOL_STATS_GET - do */ +#ifdef CONFIG_PAGE_POOL_STATS +static const struct nla_policy netdev_page_pool_stats_get_nl_policy[NETDEV_A_PAGE_POOL_STATS_INFO + 1] = { + [NETDEV_A_PAGE_POOL_STATS_INFO] = NLA_POLICY_NESTED(netdev_page_pool_info_nl_policy), +}; +#endif /* CONFIG_PAGE_POOL_STATS */ + /* Ops table for netdev */ static const struct genl_split_ops netdev_nl_ops[] = { { @@ -56,6 +74,20 @@ static const struct genl_split_ops netdev_nl_ops[] = { .flags = GENL_CMD_CAP_DUMP, }, #endif /* CONFIG_PAGE_POOL */ +#ifdef CONFIG_PAGE_POOL_STATS + { + .cmd = NETDEV_CMD_PAGE_POOL_STATS_GET, + .doit = netdev_nl_page_pool_stats_get_doit, + .policy = netdev_page_pool_stats_get_nl_policy, + .maxattr = NETDEV_A_PAGE_POOL_STATS_INFO, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = NETDEV_CMD_PAGE_POOL_STATS_GET, + .dumpit = netdev_nl_page_pool_stats_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, +#endif /* CONFIG_PAGE_POOL_STATS */ }; static const struct genl_multicast_group netdev_nl_mcgrps[] = { diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index 738097847100..649e4b46eccf 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -11,11 +11,18 @@ #include +/* Common nested types */ +extern const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1]; + int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info); int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int netdev_nl_page_pool_get_doit(struct sk_buff *skb, struct genl_info *info); int netdev_nl_page_pool_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int netdev_nl_page_pool_stats_get_doit(struct sk_buff *skb, + struct genl_info *info); +int netdev_nl_page_pool_stats_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); enum { NETDEV_NLGRP_MGMT, diff --git a/net/core/page_pool.c b/net/core/page_pool.c index a821fb5fe054..3d0938a60646 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -71,7 +71,7 @@ static const char pp_stats[][ETH_GSTRING_LEN] = { * is passed to this API which is filled in. The caller can then report * those stats to the user (perhaps via ethtool, debugfs, etc.). */ -bool page_pool_get_stats(struct page_pool *pool, +bool page_pool_get_stats(const struct page_pool *pool, struct page_pool_stats *stats) { int cpu = 0; diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c index bd5ca94f683f..1426434a7e15 100644 --- a/net/core/page_pool_user.c +++ b/net/core/page_pool_user.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include "page_pool_priv.h" @@ -106,6 +107,108 @@ netdev_nl_page_pool_get_dump(struct sk_buff *skb, struct netlink_callback *cb, return err; } +static int +page_pool_nl_stats_fill(struct sk_buff *rsp, const struct page_pool *pool, + const struct genl_info *info) +{ +#ifdef CONFIG_PAGE_POOL_STATS + struct page_pool_stats stats = {}; + struct nlattr *nest; + void *hdr; + + if (!page_pool_get_stats(pool, &stats)) + return 0; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + + nest = nla_nest_start(rsp, NETDEV_A_PAGE_POOL_STATS_INFO); + + if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_ID, pool->user.id) || + (pool->slow.netdev->ifindex != LOOPBACK_IFINDEX && + nla_put_u32(rsp, NETDEV_A_PAGE_POOL_IFINDEX, + pool->slow.netdev->ifindex))) + goto err_cancel_nest; + + nla_nest_end(rsp, nest); + + if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_FAST, + stats.alloc_stats.fast) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_SLOW, + stats.alloc_stats.slow) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_SLOW_HIGH_ORDER, + stats.alloc_stats.slow_high_order) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_EMPTY, + stats.alloc_stats.empty) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_REFILL, + stats.alloc_stats.refill) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_WAIVE, + stats.alloc_stats.waive) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_CACHED, + stats.recycle_stats.cached) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_CACHE_FULL, + stats.recycle_stats.cache_full) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_RING, + stats.recycle_stats.ring) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_RING_FULL, + stats.recycle_stats.ring_full) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_RELEASED_REFCNT, + stats.recycle_stats.released_refcnt)) + goto err_cancel_msg; + + genlmsg_end(rsp, hdr); + + return 0; +err_cancel_nest: + nla_nest_cancel(rsp, nest); +err_cancel_msg: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +#else + GENL_SET_ERR_MSG(info, "kernel built without CONFIG_PAGE_POOL_STATS"); + return -EOPNOTSUPP; +#endif +} + +int netdev_nl_page_pool_stats_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct nlattr *tb[ARRAY_SIZE(netdev_page_pool_info_nl_policy)]; + struct nlattr *nest; + int err; + u32 id; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_PAGE_POOL_STATS_INFO)) + return -EINVAL; + + nest = info->attrs[NETDEV_A_PAGE_POOL_STATS_INFO]; + err = nla_parse_nested(tb, ARRAY_SIZE(tb) - 1, nest, + netdev_page_pool_info_nl_policy, + info->extack); + if (err) + return err; + + if (NL_REQ_ATTR_CHECK(info->extack, nest, tb, NETDEV_A_PAGE_POOL_ID)) + return -EINVAL; + if (tb[NETDEV_A_PAGE_POOL_IFINDEX]) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[NETDEV_A_PAGE_POOL_IFINDEX], + "selecting by ifindex not supported"); + return -EINVAL; + } + + id = nla_get_uint(tb[NETDEV_A_PAGE_POOL_ID]); + + return netdev_nl_page_pool_get_do(info, id, page_pool_nl_stats_fill); +} + +int netdev_nl_page_pool_stats_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return netdev_nl_page_pool_get_dump(skb, cb, page_pool_nl_stats_fill); +} + static int page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, const struct genl_info *info) -- cgit From be0096676e230b43730b8936ac393d155b4e3262 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 26 Nov 2023 15:07:39 -0800 Subject: net: page_pool: mute the periodic warning for visible page pools Mute the periodic "stalled pool shutdown" warning if the page pool is visible to user space. Rolling out a driver using page pools to just a few hundred hosts at Meta surfaces applications which fail to reap their broken sockets. Obviously it's best if the applications are fixed, but we don't generally print warnings for application resource leaks. Admins can now depend on the netlink interface for getting page pool info to detect buggy apps. While at it throw in the ID of the pool into the message, in rare cases (pools from destroyed netns) this will make finding the pool with a debugger easier. Reviewed-by: Eric Dumazet Acked-by: Jesper Dangaard Brouer Signed-off-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- net/core/page_pool.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 3d0938a60646..c2e7c9a6efbe 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -897,18 +897,21 @@ static void page_pool_release_retry(struct work_struct *wq) { struct delayed_work *dwq = to_delayed_work(wq); struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw); + void *netdev; int inflight; inflight = page_pool_release(pool); if (!inflight) return; - /* Periodic warning */ - if (time_after_eq(jiffies, pool->defer_warn)) { + /* Periodic warning for page pools the user can't see */ + netdev = READ_ONCE(pool->slow.netdev); + if (time_after_eq(jiffies, pool->defer_warn) && + (!netdev || netdev == NET_PTR_POISON)) { int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ; - pr_warn("%s() stalled pool shutdown %d inflight %d sec\n", - __func__, inflight, sec); + pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n", + __func__, pool->user.id, inflight, sec); pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; } -- cgit From 48eb03dd26304c24f03bdbb9382e89c8564e71df Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 27 Nov 2023 11:03:08 -0800 Subject: xsk: Add TX timestamp and TX checksum offload support This change actually defines the (initial) metadata layout that should be used by AF_XDP userspace (xsk_tx_metadata). The first field is flags which requests appropriate offloads, followed by the offload-specific fields. The supported per-device offloads are exported via netlink (new xsk-flags). The offloads themselves are still implemented in a bit of a framework-y fashion that's left from my initial kfunc attempt. I'm introducing new xsk_tx_metadata_ops which drivers are supposed to implement. The drivers are also supposed to call xsk_tx_metadata_request/xsk_tx_metadata_complete in the right places. Since xsk_tx_metadata_{request,_complete} are static inline, we don't incur any extra overhead doing indirect calls. The benefit of this scheme is as follows: - keeps all metadata layout parsing away from driver code - makes it easy to grep and see which drivers implement what - don't need any extra flags to maintain to keep track of what offloads are implemented; if the callback is implemented - the offload is supported (used by netlink reporting code) Two offloads are defined right now: 1. XDP_TXMD_FLAGS_CHECKSUM: skb-style csum_start+csum_offset 2. XDP_TXMD_FLAGS_TIMESTAMP: writes TX timestamp back into metadata area upon completion (tx_timestamp field) XDP_TXMD_FLAGS_TIMESTAMP is also implemented for XDP_COPY mode: it writes SW timestamp from the skb destructor (note I'm reusing hwtstamps to pass metadata pointer). The struct is forward-compatible and can be extended in the future by appending more fields. Reviewed-by: Song Yoong Siang Signed-off-by: Stanislav Fomichev Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20231127190319.1190813-3-sdf@google.com Signed-off-by: Alexei Starovoitov --- net/core/netdev-genl.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index fe61f85bcf33..10f2124e9e23 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "netdev-genl-gen.h" @@ -13,6 +14,7 @@ static int netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info) { + u64 xsk_features = 0; u64 xdp_rx_meta = 0; void *hdr; @@ -26,11 +28,20 @@ netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC + if (netdev->xsk_tx_metadata_ops) { + if (netdev->xsk_tx_metadata_ops->tmo_fill_timestamp) + xsk_features |= NETDEV_XSK_FLAGS_TX_TIMESTAMP; + if (netdev->xsk_tx_metadata_ops->tmo_request_checksum) + xsk_features |= NETDEV_XSK_FLAGS_TX_CHECKSUM; + } + if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_FEATURES, netdev->xdp_features, NETDEV_A_DEV_PAD) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES, - xdp_rx_meta, NETDEV_A_DEV_PAD)) { + xdp_rx_meta, NETDEV_A_DEV_PAD) || + nla_put_u64_64bit(rsp, NETDEV_A_DEV_XSK_FEATURES, + xsk_features, NETDEV_A_DEV_PAD)) { genlmsg_cancel(rsp, hdr); return -EINVAL; } -- cgit From 7577bc8249c3fc86096ef1b1c9a8f4b6232231e7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 28 Nov 2023 18:29:20 -0800 Subject: tcp: Don't pass cookie to __cookie_v[46]_check(). tcp_hdr(skb) and SYN Cookie are passed to __cookie_v[46]_check(), but none of the callers passes cookie other than ntohl(th->ack_seq) - 1. Let's fetch it in __cookie_v[46]_check() instead of passing the cookie over and over. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Simon Horman Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20231129022924.96156-5-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/core/filter.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 7e4d7c3bcc84..0adaa4afa35f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7238,7 +7238,6 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len struct tcphdr *, th, u32, th_len) { #ifdef CONFIG_SYN_COOKIES - u32 cookie; int ret; if (unlikely(!sk || th_len < sizeof(*th))) @@ -7260,8 +7259,6 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len if (tcp_synq_no_recent_overflow(sk)) return -ENOENT; - cookie = ntohl(th->ack_seq) - 1; - /* Both struct iphdr and struct ipv6hdr have the version field at the * same offset so we can cast to the shorter header (struct iphdr). */ @@ -7270,7 +7267,7 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk)) return -EINVAL; - ret = __cookie_v4_check((struct iphdr *)iph, th, cookie); + ret = __cookie_v4_check((struct iphdr *)iph, th); break; #if IS_BUILTIN(CONFIG_IPV6) @@ -7281,7 +7278,7 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len if (sk->sk_family != AF_INET6) return -EINVAL; - ret = __cookie_v6_check((struct ipv6hdr *)iph, th, cookie); + ret = __cookie_v6_check((struct ipv6hdr *)iph, th); break; #endif /* CONFIG_IPV6 */ @@ -7734,9 +7731,7 @@ static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv6_proto = { BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv4, struct iphdr *, iph, struct tcphdr *, th) { - u32 cookie = ntohl(th->ack_seq) - 1; - - if (__cookie_v4_check(iph, th, cookie) > 0) + if (__cookie_v4_check(iph, th) > 0) return 0; return -EACCES; @@ -7757,9 +7752,7 @@ BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv6, struct ipv6hdr *, iph, struct tcphdr *, th) { #if IS_BUILTIN(CONFIG_IPV6) - u32 cookie = ntohl(th->ack_seq) - 1; - - if (__cookie_v6_check(iph, th, cookie) > 0) + if (__cookie_v6_check(iph, th) > 0) return 0; return -EACCES; -- cgit From f9893fdac319bb2817e5e7818870264d7fb2eb02 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Nov 2023 09:22:59 +0000 Subject: net: page_pool: fix general protection fault in page_pool_unlist syzbot was able to trigger a crash [1] in page_pool_unlist() page_pool_list() only inserts a page pool into a netdev page pool list if a netdev was set in params. Even if the kzalloc() call in page_pool_create happens to initialize pool->user.list, I chose to be more explicit in page_pool_list() adding one INIT_HLIST_NODE(). We could test in page_pool_unlist() if netdev was set, but since netdev can be changed to lo, it seems more robust to check if pool->user.list is hashed before calling hlist_del(). [1] Illegal XDP return value 4294946546 on prog (id 2) dev N/A, expect packet loss! general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] CPU: 0 PID: 5064 Comm: syz-executor391 Not tainted 6.7.0-rc2-syzkaller-00533-ga379972973a8 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/10/2023 RIP: 0010:__hlist_del include/linux/list.h:988 [inline] RIP: 0010:hlist_del include/linux/list.h:1002 [inline] RIP: 0010:page_pool_unlist+0xd1/0x170 net/core/page_pool_user.c:342 Code: df 48 89 fa 48 c1 ea 03 80 3c 02 00 0f 85 90 00 00 00 4c 8b a3 f0 06 00 00 48 b8 00 00 00 00 00 fc ff df 4c 89 e2 48 c1 ea 03 <80> 3c 02 00 75 68 48 85 ed 49 89 2c 24 74 24 e8 1b ca 07 f9 48 8d RSP: 0018:ffffc900039ff768 EFLAGS: 00010246 RAX: dffffc0000000000 RBX: ffff88814ae02000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000004 RDI: ffff88814ae026f0 RBP: 0000000000000000 R08: 0000000000000000 R09: fffffbfff1d57fdc R10: ffffffff8eabfee3 R11: ffffffff8aa0008b R12: 0000000000000000 R13: ffff88814ae02000 R14: dffffc0000000000 R15: 0000000000000001 FS: 000055555717a380(0000) GS:ffff8880b9800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000002555398 CR3: 0000000025044000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __page_pool_destroy net/core/page_pool.c:851 [inline] page_pool_release+0x507/0x6b0 net/core/page_pool.c:891 page_pool_destroy+0x1ac/0x4c0 net/core/page_pool.c:956 xdp_test_run_teardown net/bpf/test_run.c:216 [inline] bpf_test_run_xdp_live+0x1578/0x1af0 net/bpf/test_run.c:388 bpf_prog_test_run_xdp+0x827/0x1530 net/bpf/test_run.c:1254 bpf_prog_test_run kernel/bpf/syscall.c:4041 [inline] __sys_bpf+0x11bf/0x4920 kernel/bpf/syscall.c:5402 __do_sys_bpf kernel/bpf/syscall.c:5488 [inline] __se_sys_bpf kernel/bpf/syscall.c:5486 [inline] __x64_sys_bpf+0x78/0xc0 kernel/bpf/syscall.c:5486 Fixes: 083772c9f972 ("net: page_pool: record pools per netdev") Reported-and-tested-by: syzbot+f9f8efb58a4db2ca98d0@syzkaller.appspotmail.com Signed-off-by: Eric Dumazet Tested-by: Andrew Lunn Link: https://lore.kernel.org/r/20231130092259.3797753-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/page_pool_user.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c index 1426434a7e15..ffe5244e5597 100644 --- a/net/core/page_pool_user.c +++ b/net/core/page_pool_user.c @@ -310,6 +310,7 @@ int page_pool_list(struct page_pool *pool) if (err < 0) goto err_unlock; + INIT_HLIST_NODE(&pool->user.list); if (pool->slow.netdev) { hlist_add_head(&pool->user.list, &pool->slow.netdev->page_pools); @@ -339,7 +340,8 @@ void page_pool_unlist(struct page_pool *pool) mutex_lock(&page_pools_lock); netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_DEL_NTF); xa_erase(&page_pools, pool->user.id); - hlist_del(&pool->user.list); + if (!hlist_unhashed(&pool->user.list)) + hlist_del(&pool->user.list); mutex_unlock(&page_pools_lock); } -- cgit From 18fd64d2542292713b0322e6815be059bdee440c Mon Sep 17 00:00:00 2001 From: Coco Li Date: Wed, 29 Nov 2023 07:27:54 +0000 Subject: netns-ipv4: reorganize netns_ipv4 fast path variables Reorganize fast path variables on tx-txrx-rx order. Fastpath cacheline ends after sysctl_tcp_rmem. There are only read-only variables here. (write is on the control path and not considered in this case) Below data generated with pahole on x86 architecture. Fast path variables span cache lines before change: 4 Fast path variables span cache lines after change: 2 Suggested-by: Eric Dumazet Reviewed-by: Wei Wang Reviewed-by: David Ahern Signed-off-by: Coco Li Reviewed-by: Eric Dumazet Reviewed-by: Shakeel Butt Signed-off-by: David S. Miller --- net/core/net_namespace.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index f4183c4c1ec8..cb8bcbff9e83 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -1099,11 +1099,56 @@ out: rtnl_set_sk_err(net, RTNLGRP_NSID, err); } +#ifdef CONFIG_NET_NS +static void __init netns_ipv4_struct_check(void) +{ + /* TX readonly hotpath cache lines */ + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_early_retrans); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_tso_win_divisor); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_tso_rtt_log); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_autocorking); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_min_snd_mss); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_notsent_lowat); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_limit_output_bytes); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_min_rtt_wlen); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_wmem); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_ip_fwd_use_pmtu); + CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_tx, 33); + + /* TXRX readonly hotpath cache lines */ + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_txrx, + sysctl_tcp_moderate_rcvbuf); + CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_txrx, 1); + + /* RX readonly hotpath cache line */ + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_ip_early_demux); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_tcp_early_demux); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_tcp_reordering); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_tcp_rmem); + CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 18); +} +#endif + void __init net_ns_init(void) { struct net_generic *ng; #ifdef CONFIG_NET_NS + netns_ipv4_struct_check(); net_cachep = kmem_cache_create("net_namespace", sizeof(struct net), SMP_CACHE_BYTES, SLAB_PANIC|SLAB_ACCOUNT, NULL); -- cgit From bc877956272f0521fef107838555817112a450dc Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Fri, 1 Dec 2023 15:28:29 -0800 Subject: netdev-genl: spec: Extend netdev netlink spec in YAML for queue Add support in netlink spec(netdev.yaml) for queue information. Add code generated from the spec. Note: The "queue-type" attribute takes values 0 and 1 for rx and tx queue type respectively. Signed-off-by: Amritha Nambiar Reviewed-by: Sridhar Samudrala Link: https://lore.kernel.org/r/170147330963.5260.2576294626647300472.stgit@anambiarhost.jf.intel.com Signed-off-by: Jakub Kicinski --- net/core/netdev-genl-gen.c | 26 ++++++++++++++++++++++++++ net/core/netdev-genl-gen.h | 3 +++ net/core/netdev-genl.c | 10 ++++++++++ 3 files changed, 39 insertions(+) (limited to 'net/core') diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index dccd8c3a141e..b1dcf88c82cf 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -46,6 +46,18 @@ static const struct nla_policy netdev_page_pool_stats_get_nl_policy[NETDEV_A_PAG }; #endif /* CONFIG_PAGE_POOL_STATS */ +/* NETDEV_CMD_QUEUE_GET - do */ +static const struct nla_policy netdev_queue_get_do_nl_policy[NETDEV_A_QUEUE_TYPE + 1] = { + [NETDEV_A_QUEUE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), + [NETDEV_A_QUEUE_TYPE] = NLA_POLICY_MAX(NLA_U32, 1), + [NETDEV_A_QUEUE_ID] = { .type = NLA_U32, }, +}; + +/* NETDEV_CMD_QUEUE_GET - dump */ +static const struct nla_policy netdev_queue_get_dump_nl_policy[NETDEV_A_QUEUE_IFINDEX + 1] = { + [NETDEV_A_QUEUE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), +}; + /* Ops table for netdev */ static const struct genl_split_ops netdev_nl_ops[] = { { @@ -88,6 +100,20 @@ static const struct genl_split_ops netdev_nl_ops[] = { .flags = GENL_CMD_CAP_DUMP, }, #endif /* CONFIG_PAGE_POOL_STATS */ + { + .cmd = NETDEV_CMD_QUEUE_GET, + .doit = netdev_nl_queue_get_doit, + .policy = netdev_queue_get_do_nl_policy, + .maxattr = NETDEV_A_QUEUE_TYPE, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = NETDEV_CMD_QUEUE_GET, + .dumpit = netdev_nl_queue_get_dumpit, + .policy = netdev_queue_get_dump_nl_policy, + .maxattr = NETDEV_A_QUEUE_IFINDEX, + .flags = GENL_CMD_CAP_DUMP, + }, }; static const struct genl_multicast_group netdev_nl_mcgrps[] = { diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index 649e4b46eccf..086623c1797a 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -23,6 +23,9 @@ int netdev_nl_page_pool_stats_get_doit(struct sk_buff *skb, struct genl_info *info); int netdev_nl_page_pool_stats_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_queue_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); enum { NETDEV_NLGRP_MGMT, diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 10f2124e9e23..35e2d692f651 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -140,6 +140,16 @@ int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } +int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + return -EOPNOTSUPP; +} + +int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + return -EOPNOTSUPP; +} + static int netdev_genl_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { -- cgit From 2a502ff0c4e42a739b5aa550c901bf3852795532 Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Fri, 1 Dec 2023 15:28:34 -0800 Subject: net: Add queue and napi association Add the napi pointer in netdev queue for tracking the napi instance for each queue. This achieves the queue<->napi mapping. Signed-off-by: Amritha Nambiar Reviewed-by: Sridhar Samudrala Link: https://lore.kernel.org/r/170147331483.5260.15723438819994285695.stgit@anambiarhost.jf.intel.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 3950ced396b5..0a2eecbeef38 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6400,6 +6400,43 @@ int dev_set_threaded(struct net_device *dev, bool threaded) } EXPORT_SYMBOL(dev_set_threaded); +/** + * netif_queue_set_napi - Associate queue with the napi + * @dev: device to which NAPI and queue belong + * @queue_index: Index of queue + * @type: queue type as RX or TX + * @napi: NAPI context, pass NULL to clear previously set NAPI + * + * Set queue with its corresponding napi context. This should be done after + * registering the NAPI handler for the queue-vector and the queues have been + * mapped to the corresponding interrupt vector. + */ +void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, + enum netdev_queue_type type, struct napi_struct *napi) +{ + struct netdev_rx_queue *rxq; + struct netdev_queue *txq; + + if (WARN_ON_ONCE(napi && !napi->dev)) + return; + if (dev->reg_state >= NETREG_REGISTERED) + ASSERT_RTNL(); + + switch (type) { + case NETDEV_QUEUE_TYPE_RX: + rxq = __netif_get_rx_queue(dev, queue_index); + rxq->napi = napi; + return; + case NETDEV_QUEUE_TYPE_TX: + txq = netdev_get_tx_queue(dev, queue_index); + txq->napi = napi; + return; + default: + return; + } +} +EXPORT_SYMBOL(netif_queue_set_napi); + void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { -- cgit From 6b6171db7fc8f7a6d0f0f0acb7aff16cecf14f42 Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Fri, 1 Dec 2023 15:28:46 -0800 Subject: netdev-genl: Add netlink framework functions for queue Implement the netdev netlink framework functions for exposing queue information. Signed-off-by: Amritha Nambiar Reviewed-by: Sridhar Samudrala Link: https://lore.kernel.org/r/170147332603.5260.7982559672617639065.stgit@anambiarhost.jf.intel.com Signed-off-by: Jakub Kicinski --- net/core/netdev-genl.c | 187 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 184 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 35e2d692f651..3bc1661e6ebf 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -7,9 +7,23 @@ #include #include #include +#include #include "netdev-genl-gen.h" +struct netdev_nl_dump_ctx { + unsigned long ifindex; + unsigned int rxq_idx; + unsigned int txq_idx; +}; + +static struct netdev_nl_dump_ctx *netdev_dump_ctx(struct netlink_callback *cb) +{ + NL_ASSERT_DUMP_CTX_FITS(struct netdev_nl_dump_ctx); + + return (struct netdev_nl_dump_ctx *)cb->ctx; +} + static int netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info) @@ -122,12 +136,13 @@ err_free_msg: int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { + struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); struct net *net = sock_net(skb->sk); struct net_device *netdev; int err = 0; rtnl_lock(); - for_each_netdev_dump(net, netdev, cb->args[0]) { + for_each_netdev_dump(net, netdev, ctx->ifindex) { err = netdev_nl_dev_fill(netdev, skb, genl_info_dump(cb)); if (err < 0) break; @@ -140,14 +155,180 @@ int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } +static int +netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, + u32 q_idx, u32 q_type, const struct genl_info *info) +{ + struct netdev_rx_queue *rxq; + struct netdev_queue *txq; + void *hdr; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u32(rsp, NETDEV_A_QUEUE_ID, q_idx) || + nla_put_u32(rsp, NETDEV_A_QUEUE_TYPE, q_type) || + nla_put_u32(rsp, NETDEV_A_QUEUE_IFINDEX, netdev->ifindex)) + goto nla_put_failure; + + switch (q_type) { + case NETDEV_QUEUE_TYPE_RX: + rxq = __netif_get_rx_queue(netdev, q_idx); + if (rxq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID, + rxq->napi->napi_id)) + goto nla_put_failure; + break; + case NETDEV_QUEUE_TYPE_TX: + txq = netdev_get_tx_queue(netdev, q_idx); + if (txq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID, + txq->napi->napi_id)) + goto nla_put_failure; + } + + genlmsg_end(rsp, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +static int netdev_nl_queue_validate(struct net_device *netdev, u32 q_id, + u32 q_type) +{ + switch (q_type) { + case NETDEV_QUEUE_TYPE_RX: + if (q_id >= netdev->real_num_rx_queues) + return -EINVAL; + return 0; + case NETDEV_QUEUE_TYPE_TX: + if (q_id >= netdev->real_num_tx_queues) + return -EINVAL; + } + return 0; +} + +static int +netdev_nl_queue_fill(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, + u32 q_type, const struct genl_info *info) +{ + int err = 0; + + if (!(netdev->flags & IFF_UP)) + return err; + + err = netdev_nl_queue_validate(netdev, q_idx, q_type); + if (err) + return err; + + return netdev_nl_queue_fill_one(rsp, netdev, q_idx, q_type, info); +} + int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info) { - return -EOPNOTSUPP; + u32 q_id, q_type, ifindex; + struct net_device *netdev; + struct sk_buff *rsp; + int err; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_ID) || + GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_TYPE) || + GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_IFINDEX)) + return -EINVAL; + + q_id = nla_get_u32(info->attrs[NETDEV_A_QUEUE_ID]); + q_type = nla_get_u32(info->attrs[NETDEV_A_QUEUE_TYPE]); + ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + rtnl_lock(); + + netdev = __dev_get_by_index(genl_info_net(info), ifindex); + if (netdev) + err = netdev_nl_queue_fill(rsp, netdev, q_id, q_type, info); + else + err = -ENODEV; + + rtnl_unlock(); + + if (err) + goto err_free_msg; + + return genlmsg_reply(rsp, info); + +err_free_msg: + nlmsg_free(rsp); + return err; +} + +static int +netdev_nl_queue_dump_one(struct net_device *netdev, struct sk_buff *rsp, + const struct genl_info *info, + struct netdev_nl_dump_ctx *ctx) +{ + int err = 0; + int i; + + if (!(netdev->flags & IFF_UP)) + return err; + + for (i = ctx->rxq_idx; i < netdev->real_num_rx_queues;) { + err = netdev_nl_queue_fill_one(rsp, netdev, i, + NETDEV_QUEUE_TYPE_RX, info); + if (err) + return err; + ctx->rxq_idx = i++; + } + for (i = ctx->txq_idx; i < netdev->real_num_tx_queues;) { + err = netdev_nl_queue_fill_one(rsp, netdev, i, + NETDEV_QUEUE_TYPE_TX, info); + if (err) + return err; + ctx->txq_idx = i++; + } + + return err; } int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { - return -EOPNOTSUPP; + struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); + const struct genl_info *info = genl_info_dump(cb); + struct net *net = sock_net(skb->sk); + struct net_device *netdev; + u32 ifindex = 0; + int err = 0; + + if (info->attrs[NETDEV_A_QUEUE_IFINDEX]) + ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); + + rtnl_lock(); + if (ifindex) { + netdev = __dev_get_by_index(net, ifindex); + if (netdev) + err = netdev_nl_queue_dump_one(netdev, skb, info, ctx); + else + err = -ENODEV; + } else { + for_each_netdev_dump(net, netdev, ctx->ifindex) { + err = netdev_nl_queue_dump_one(netdev, skb, info, ctx); + if (err < 0) + break; + ctx->rxq_idx = 0; + ctx->txq_idx = 0; + } + } + rtnl_unlock(); + + if (err != -EMSGSIZE) + return err; + + return skb->len; } static int netdev_genl_netdevice_event(struct notifier_block *nb, -- cgit From ff9991499fb53575c45eb92cd064bcd7141bb572 Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Fri, 1 Dec 2023 15:28:51 -0800 Subject: netdev-genl: spec: Extend netdev netlink spec in YAML for NAPI Add support in netlink spec(netdev.yaml) for napi related information. Add code generated from the spec. Signed-off-by: Amritha Nambiar Reviewed-by: Sridhar Samudrala Link: https://lore.kernel.org/r/170147333119.5260.7050639053080529108.stgit@anambiarhost.jf.intel.com Signed-off-by: Jakub Kicinski --- net/core/netdev-genl-gen.c | 24 ++++++++++++++++++++++++ net/core/netdev-genl-gen.h | 2 ++ net/core/netdev-genl.c | 10 ++++++++++ 3 files changed, 36 insertions(+) (limited to 'net/core') diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index b1dcf88c82cf..be7f2ebd61b2 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -58,6 +58,16 @@ static const struct nla_policy netdev_queue_get_dump_nl_policy[NETDEV_A_QUEUE_IF [NETDEV_A_QUEUE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), }; +/* NETDEV_CMD_NAPI_GET - do */ +static const struct nla_policy netdev_napi_get_do_nl_policy[NETDEV_A_NAPI_ID + 1] = { + [NETDEV_A_NAPI_ID] = { .type = NLA_U32, }, +}; + +/* NETDEV_CMD_NAPI_GET - dump */ +static const struct nla_policy netdev_napi_get_dump_nl_policy[NETDEV_A_NAPI_IFINDEX + 1] = { + [NETDEV_A_NAPI_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), +}; + /* Ops table for netdev */ static const struct genl_split_ops netdev_nl_ops[] = { { @@ -114,6 +124,20 @@ static const struct genl_split_ops netdev_nl_ops[] = { .maxattr = NETDEV_A_QUEUE_IFINDEX, .flags = GENL_CMD_CAP_DUMP, }, + { + .cmd = NETDEV_CMD_NAPI_GET, + .doit = netdev_nl_napi_get_doit, + .policy = netdev_napi_get_do_nl_policy, + .maxattr = NETDEV_A_NAPI_ID, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = NETDEV_CMD_NAPI_GET, + .dumpit = netdev_nl_napi_get_dumpit, + .policy = netdev_napi_get_dump_nl_policy, + .maxattr = NETDEV_A_NAPI_IFINDEX, + .flags = GENL_CMD_CAP_DUMP, + }, }; static const struct genl_multicast_group netdev_nl_mcgrps[] = { diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index 086623c1797a..a47f2bcbe4fa 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -26,6 +26,8 @@ int netdev_nl_page_pool_stats_get_dumpit(struct sk_buff *skb, int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info); int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); enum { NETDEV_NLGRP_MGMT, diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 3bc1661e6ebf..4c8ea6a4f015 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -155,6 +155,16 @@ int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } +int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + return -EOPNOTSUPP; +} + +int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + return -EOPNOTSUPP; +} + static int netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) -- cgit From 27f91aaf49b3a50e5a02ad5fa27b7c453d029a72 Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Fri, 1 Dec 2023 15:28:56 -0800 Subject: netdev-genl: Add netlink framework functions for napi Implement the netdev netlink framework functions for napi support. The netdev structure tracks all the napi instances and napi fields. The napi instances and associated parameters can be retrieved this way. Signed-off-by: Amritha Nambiar Reviewed-by: Sridhar Samudrala Link: https://lore.kernel.org/r/170147333637.5260.14807433239805550815.stgit@anambiarhost.jf.intel.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 3 +- net/core/dev.h | 2 + net/core/netdev-genl.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 122 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 0a2eecbeef38..13c9d1580e62 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -165,7 +165,6 @@ static int netif_rx_internal(struct sk_buff *skb); static int call_netdevice_notifiers_extack(unsigned long val, struct net_device *dev, struct netlink_ext_ack *extack); -static struct napi_struct *napi_by_id(unsigned int napi_id); /* * The @dev_base_head list is protected by @dev_base_lock and the rtnl @@ -6139,7 +6138,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done) EXPORT_SYMBOL(napi_complete_done); /* must be called under rcu_read_lock(), as we dont take a reference */ -static struct napi_struct *napi_by_id(unsigned int napi_id) +struct napi_struct *napi_by_id(unsigned int napi_id) { unsigned int hash = napi_id % HASH_SIZE(napi_hash); struct napi_struct *napi; diff --git a/net/core/dev.h b/net/core/dev.h index 5aa45f0fd4ae..7795b8ad841d 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -145,4 +145,6 @@ void xdp_do_check_flushed(struct napi_struct *napi); #else static inline void xdp_do_check_flushed(struct napi_struct *napi) { } #endif + +struct napi_struct *napi_by_id(unsigned int napi_id); #endif diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 4c8ea6a4f015..d55c92f9f26c 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -8,13 +8,16 @@ #include #include #include +#include #include "netdev-genl-gen.h" +#include "dev.h" struct netdev_nl_dump_ctx { unsigned long ifindex; unsigned int rxq_idx; unsigned int txq_idx; + unsigned int napi_id; }; static struct netdev_nl_dump_ctx *netdev_dump_ctx(struct netlink_callback *cb) @@ -155,14 +158,128 @@ int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } +static int +netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, + const struct genl_info *info) +{ + void *hdr; + + if (WARN_ON_ONCE(!napi->dev)) + return -EINVAL; + if (!(napi->dev->flags & IFF_UP)) + return 0; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + + if (napi->napi_id >= MIN_NAPI_ID && + nla_put_u32(rsp, NETDEV_A_NAPI_ID, napi->napi_id)) + goto nla_put_failure; + + if (nla_put_u32(rsp, NETDEV_A_NAPI_IFINDEX, napi->dev->ifindex)) + goto nla_put_failure; + + genlmsg_end(rsp, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info) { - return -EOPNOTSUPP; + struct napi_struct *napi; + struct sk_buff *rsp; + u32 napi_id; + int err; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID)) + return -EINVAL; + + napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]); + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + rtnl_lock(); + + napi = napi_by_id(napi_id); + if (napi) + err = netdev_nl_napi_fill_one(rsp, napi, info); + else + err = -EINVAL; + + rtnl_unlock(); + + if (err) + goto err_free_msg; + + return genlmsg_reply(rsp, info); + +err_free_msg: + nlmsg_free(rsp); + return err; +} + +static int +netdev_nl_napi_dump_one(struct net_device *netdev, struct sk_buff *rsp, + const struct genl_info *info, + struct netdev_nl_dump_ctx *ctx) +{ + struct napi_struct *napi; + int err = 0; + + if (!(netdev->flags & IFF_UP)) + return err; + + list_for_each_entry(napi, &netdev->napi_list, dev_list) { + if (ctx->napi_id && napi->napi_id >= ctx->napi_id) + continue; + + err = netdev_nl_napi_fill_one(rsp, napi, info); + if (err) + return err; + ctx->napi_id = napi->napi_id; + } + return err; } int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { - return -EOPNOTSUPP; + struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); + const struct genl_info *info = genl_info_dump(cb); + struct net *net = sock_net(skb->sk); + struct net_device *netdev; + u32 ifindex = 0; + int err = 0; + + if (info->attrs[NETDEV_A_NAPI_IFINDEX]) + ifindex = nla_get_u32(info->attrs[NETDEV_A_NAPI_IFINDEX]); + + rtnl_lock(); + if (ifindex) { + netdev = __dev_get_by_index(net, ifindex); + if (netdev) + err = netdev_nl_napi_dump_one(netdev, skb, info, ctx); + else + err = -ENODEV; + } else { + for_each_netdev_dump(net, netdev, ctx->ifindex) { + err = netdev_nl_napi_dump_one(netdev, skb, info, ctx); + if (err < 0) + break; + ctx->napi_id = 0; + } + } + rtnl_unlock(); + + if (err != -EMSGSIZE) + return err; + + return skb->len; } static int -- cgit From 26793bfb5d6072326d1465343e7cbf6156abca4f Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Fri, 1 Dec 2023 15:29:07 -0800 Subject: net: Add NAPI IRQ support Add support to associate the interrupt vector number for a NAPI instance. Signed-off-by: Amritha Nambiar Reviewed-by: Sridhar Samudrala Link: https://lore.kernel.org/r/170147334728.5260.13221803396905901904.stgit@anambiarhost.jf.intel.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 1 + net/core/netdev-genl.c | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 13c9d1580e62..1d720fc59161 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6471,6 +6471,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, */ if (dev->threaded && napi_kthread_create(napi)) dev->threaded = 0; + netif_napi_set_irq(napi, -1); } EXPORT_SYMBOL(netif_napi_add_weight); diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index d55c92f9f26c..9753c19e36de 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -180,7 +180,11 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, if (nla_put_u32(rsp, NETDEV_A_NAPI_IFINDEX, napi->dev->ifindex)) goto nla_put_failure; + if (napi->irq >= 0 && nla_put_u32(rsp, NETDEV_A_NAPI_IRQ, napi->irq)) + goto nla_put_failure; + genlmsg_end(rsp, hdr); + return 0; nla_put_failure: -- cgit From db4704f4e4dfce835e934609fca735a648ce26e8 Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Fri, 1 Dec 2023 15:29:18 -0800 Subject: netdev-genl: Add PID for the NAPI thread In the threaded NAPI mode, expose the PID of the NAPI thread. Signed-off-by: Amritha Nambiar Reviewed-by: Sridhar Samudrala Link: https://lore.kernel.org/r/170147335818.5260.10253384006102593087.stgit@anambiarhost.jf.intel.com Signed-off-by: Jakub Kicinski --- net/core/netdev-genl.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net/core') diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 9753c19e36de..fd98936da3ae 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -163,6 +163,7 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, const struct genl_info *info) { void *hdr; + pid_t pid; if (WARN_ON_ONCE(!napi->dev)) return -EINVAL; @@ -183,6 +184,12 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, if (napi->irq >= 0 && nla_put_u32(rsp, NETDEV_A_NAPI_IRQ, napi->irq)) goto nla_put_failure; + if (napi->thread) { + pid = task_pid_nr(napi->thread); + if (nla_put_u32(rsp, NETDEV_A_NAPI_PID, pid)) + goto nla_put_failure; + } + genlmsg_end(rsp, hdr); return 0; -- cgit From 43a71cd66b9c0a4af3d15d8644359fde35bdbed0 Mon Sep 17 00:00:00 2001 From: Coco Li Date: Mon, 4 Dec 2023 20:12:30 +0000 Subject: net-device: reorganize net_device fast path variables Reorganize fast path variables on tx-txrx-rx order Fastpath variables end after npinfo. Below data generated with pahole on x86 architecture. Fast path variables span cache lines before change: 12 Fast path variables span cache lines after change: 4 Suggested-by: Eric Dumazet Signed-off-by: Coco Li Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20231204201232.520025-2-lixiaoyan@google.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 1d720fc59161..c5679dfbaa70 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11609,6 +11609,60 @@ static struct pernet_operations __net_initdata default_device_ops = { .exit_batch = default_device_exit_batch, }; +static void __init net_dev_struct_check(void) +{ + /* TX read-mostly hotpath */ + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, real_num_tx_queues); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_size); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_ipv4_max_size); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_segs); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, num_tc); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, mtu); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, needed_headroom); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tc_to_txq); +#ifdef CONFIG_XPS + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, xps_maps); +#endif +#ifdef CONFIG_NETFILTER_EGRESS + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, nf_hooks_egress); +#endif +#ifdef CONFIG_NET_XGRESS + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tcx_egress); +#endif + CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_tx, 152); + + /* TXRX read-mostly hotpath */ + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, features); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr); + CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 30); + + /* RX read-mostly hotpath */ + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_flush_timeout); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, napi_defer_hard_irqs); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler_data); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, nd_net); +#ifdef CONFIG_NETPOLL + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, npinfo); +#endif +#ifdef CONFIG_NET_XGRESS + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress); +#endif + CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 96); +} + /* * Initialize the DEV module. At boot time this walks the device list and * unhooks any devices that fail to initialise (normally hardware not @@ -11626,6 +11680,8 @@ static int __init net_dev_init(void) BUG_ON(!dev_boot_phase); + net_dev_struct_check(); + if (dev_proc_init()) goto out; -- cgit From facd15dfd69122042502d99ab8c9f888b48ee994 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 4 Dec 2023 21:47:07 +0100 Subject: net: core: synchronize link-watch when carrier is queried There are multiple ways to query for the carrier state: through rtnetlink, sysfs, and (possibly) ethtool. Synchronize linkwatch work before these operations so that we don't have a situation where userspace queries the carrier state between the driver's carrier off->on transition and linkwatch running and expects it to work, when really (at least) TX cannot work until linkwatch has run. I previously posted a longer explanation of how this applies to wireless [1] but with this wireless can simply query the state before sending data, to ensure the kernel is ready for it. [1] https://lore.kernel.org/all/346b21d87c69f817ea3c37caceb34f1f56255884.camel@sipsolutions.net/ Signed-off-by: Johannes Berg Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20231204214706.303c62768415.I1caedccae72ee5a45c9085c5eb49c145ce1c0dd5@changeid Signed-off-by: Jakub Kicinski --- net/core/dev.c | 2 +- net/core/dev.h | 1 - net/core/link_watch.c | 2 +- net/core/net-sysfs.c | 8 +++++++- net/core/rtnetlink.c | 8 ++++++++ 5 files changed, 17 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index c5679dfbaa70..0432b04cf9b0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10548,7 +10548,7 @@ void netdev_run_todo(void) write_lock(&dev_base_lock); dev->reg_state = NETREG_UNREGISTERED; write_unlock(&dev_base_lock); - linkwatch_forget_dev(dev); + linkwatch_sync_dev(dev); } while (!list_empty(&list)) { diff --git a/net/core/dev.h b/net/core/dev.h index 7795b8ad841d..cf93e188785b 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -30,7 +30,6 @@ int __init dev_proc_init(void); #endif void linkwatch_init_dev(struct net_device *dev); -void linkwatch_forget_dev(struct net_device *dev); void linkwatch_run_queue(void); void dev_addr_flush(struct net_device *dev); diff --git a/net/core/link_watch.c b/net/core/link_watch.c index c469d1c4db5d..a19f21403339 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -245,7 +245,7 @@ static void __linkwatch_run_queue(int urgent_only) spin_unlock_irq(&lweventlist_lock); } -void linkwatch_forget_dev(struct net_device *dev) +void linkwatch_sync_dev(struct net_device *dev) { unsigned long flags; int clean = 0; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index fccaa5bac0ed..d9b33e923b18 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -194,8 +194,14 @@ static ssize_t carrier_show(struct device *dev, { struct net_device *netdev = to_net_dev(dev); - if (netif_running(netdev)) + if (netif_running(netdev)) { + /* Synchronize carrier state with link watch, + * see also rtnl_getlink(). + */ + linkwatch_sync_dev(netdev); + return sysfs_emit(buf, fmt_dec, !!netif_carrier_ok(netdev)); + } return -EINVAL; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 592164c2a540..5e0ab4c08f72 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3853,6 +3853,14 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, if (nskb == NULL) goto out; + /* Synchronize the carrier state so we don't report a state + * that we're not actually going to honour immediately; if + * the driver just did a carrier off->on transition, we can + * only TX if link watch work has run, but without this we'd + * already report carrier on, even if it doesn't work yet. + */ + linkwatch_sync_dev(dev); + err = rtnl_fill_ifinfo(nskb, dev, net, RTM_NEWLINK, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, 0, ext_filter_mask, -- cgit From 4cbb270e115bc197ff2046aeb54cc951666b16ec Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:19 -0800 Subject: bpf: take into account BPF token when fetching helper protos Instead of performing unconditional system-wide bpf_capable() and perfmon_capable() calls inside bpf_base_func_proto() function (and other similar ones) to determine eligibility of a given BPF helper for a given program, use previously recorded BPF token during BPF_PROG_LOAD command handling to inform the decision. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-8-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 0adaa4afa35f..0bf2a03d8203 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -87,7 +87,7 @@ #include "dev.h" static const struct bpf_func_proto * -bpf_sk_base_func_proto(enum bpf_func_id func_id); +bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len) { @@ -7841,7 +7841,7 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } } @@ -7934,7 +7934,7 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return NULL; } default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -7953,7 +7953,7 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8140,7 +8140,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #endif #endif default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8199,7 +8199,7 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #endif #endif default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } #if IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES) @@ -8260,7 +8260,7 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_tcp_sock_proto; #endif /* CONFIG_INET */ default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8302,7 +8302,7 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_cgroup_classid_curr_proto; #endif default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8346,7 +8346,7 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skc_lookup_tcp_proto; #endif default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8357,7 +8357,7 @@ flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_load_bytes: return &bpf_flow_dissector_load_bytes_proto; default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8384,7 +8384,7 @@ lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_under_cgroup: return &bpf_skb_under_cgroup_proto; default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -11215,7 +11215,7 @@ sk_reuseport_func_proto(enum bpf_func_id func_id, case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } } @@ -11397,7 +11397,7 @@ sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_sk_release: return &bpf_sk_release_proto; default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -11731,7 +11731,7 @@ const struct bpf_func_proto bpf_sock_from_file_proto = { }; static const struct bpf_func_proto * -bpf_sk_base_func_proto(enum bpf_func_id func_id) +bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func; @@ -11760,10 +11760,10 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } - if (!perfmon_capable()) + if (!bpf_token_capable(prog->aux->token, CAP_PERFMON)) return NULL; return func; -- cgit From 8062fb12de99b2da33754c6a3be1bfc30d9a35f4 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:20 -0800 Subject: bpf: consistently use BPF token throughout BPF verifier logic Remove remaining direct queries to perfmon_capable() and bpf_capable() in BPF verifier logic and instead use BPF token (if available) to make decisions about privileges. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-9-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 0bf2a03d8203..adcfc2c25754 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8559,7 +8559,7 @@ static bool cg_skb_is_valid_access(int off, int size, return false; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): - if (!bpf_capable()) + if (!bpf_token_capable(prog->aux->token, CAP_BPF)) return false; break; } @@ -8571,7 +8571,7 @@ static bool cg_skb_is_valid_access(int off, int size, case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; case bpf_ctx_range(struct __sk_buff, tstamp): - if (!bpf_capable()) + if (!bpf_token_capable(prog->aux->token, CAP_BPF)) return false; break; default: -- cgit From b8dbbbc535a95acd66035cf75872cd7524c0b12f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 5 Dec 2023 17:00:11 +0100 Subject: net: rtnetlink: remove local list in __linkwatch_run_queue() Due to linkwatch_forget_dev() (and perhaps others?) checking for list_empty(&dev->link_watch_list), we must have all manipulations of even the local on-stack list 'wrk' here under spinlock, since even that list can be reached otherwise via dev->link_watch_list. This is already the case, but makes this a bit counter-intuitive, often local lists are used to _not_ have to use locking for their local use. Remove the local list as it doesn't seem to serve any purpose. While at it, move a variable declaration into the loop using it. Reviewed-by: Jiri Pirko Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20231205170011.56576dcc1727.I698b72219d9f6ce789bd209b8f6dffd0ca32a8f2@changeid Signed-off-by: Jakub Kicinski --- net/core/link_watch.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'net/core') diff --git a/net/core/link_watch.c b/net/core/link_watch.c index a19f21403339..7be5b3ab32bd 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -192,8 +192,6 @@ static void __linkwatch_run_queue(int urgent_only) #define MAX_DO_DEV_PER_LOOP 100 int do_dev = MAX_DO_DEV_PER_LOOP; - struct net_device *dev; - LIST_HEAD(wrk); /* Give urgent case more budget */ if (urgent_only) @@ -215,11 +213,11 @@ static void __linkwatch_run_queue(int urgent_only) clear_bit(LW_URGENT, &linkwatch_flags); spin_lock_irq(&lweventlist_lock); - list_splice_init(&lweventlist, &wrk); + while (!list_empty(&lweventlist) && do_dev > 0) { + struct net_device *dev; - while (!list_empty(&wrk) && do_dev > 0) { - - dev = list_first_entry(&wrk, struct net_device, link_watch_list); + dev = list_first_entry(&lweventlist, struct net_device, + link_watch_list); list_del_init(&dev->link_watch_list); if (!netif_device_present(dev) || @@ -237,9 +235,6 @@ static void __linkwatch_run_queue(int urgent_only) spin_lock_irq(&lweventlist_lock); } - /* Add the remaining work back to lweventlist */ - list_splice_init(&wrk, &lweventlist); - if (!list_empty(&lweventlist)) linkwatch_schedule_work(0); spin_unlock_irq(&lweventlist_lock); -- cgit From bf17b36ccdd5b7b9dd482d7753bcb9aff2d21d39 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 6 Dec 2023 17:21:23 +0100 Subject: net: sysfs: fix locking in carrier read My previous patch added a call to linkwatch_sync_dev(), but that of course needs to be called under RTNL, which I missed earlier, but now saw RCU warnings from. Fix that by acquiring the RTNL in a similar fashion to how other files do it here. Fixes: facd15dfd691 ("net: core: synchronize link-watch when carrier is queried") Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20231206172122.859df6ba937f.I9c80608bcfbab171943ff4942b52dbd5e97fe06e@changeid Signed-off-by: Jakub Kicinski --- net/core/net-sysfs.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index d9b33e923b18..a09d507c5b03 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -193,6 +193,10 @@ static ssize_t carrier_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); + int ret = -EINVAL; + + if (!rtnl_trylock()) + return restart_syscall(); if (netif_running(netdev)) { /* Synchronize carrier state with link watch, @@ -200,10 +204,11 @@ static ssize_t carrier_show(struct device *dev, */ linkwatch_sync_dev(netdev); - return sysfs_emit(buf, fmt_dec, !!netif_carrier_ok(netdev)); + ret = sysfs_emit(buf, fmt_dec, !!netif_carrier_ok(netdev)); } + rtnl_unlock(); - return -EINVAL; + return ret; } static DEVICE_ATTR_RW(carrier); -- cgit From 9a64d4c93eee6b2efb7a02ec98d9480946424509 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 8 Dec 2023 10:52:15 +0100 Subject: Revert "net: rtnetlink: remove local list in __linkwatch_run_queue()" This reverts commit b8dbbbc535a9 ("net: rtnetlink: remove local list in __linkwatch_run_queue()"). It's evidently broken when there's a non-urgent work that gets added back, and then the loop can never finish. While reverting, add a note about that. Reported-by: Marek Szyprowski Fixes: b8dbbbc535a9 ("net: rtnetlink: remove local list in __linkwatch_run_queue()") Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- net/core/link_watch.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 7be5b3ab32bd..429571c258da 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -192,6 +192,11 @@ static void __linkwatch_run_queue(int urgent_only) #define MAX_DO_DEV_PER_LOOP 100 int do_dev = MAX_DO_DEV_PER_LOOP; + /* Use a local list here since we add non-urgent + * events back to the global one when called with + * urgent_only=1. + */ + LIST_HEAD(wrk); /* Give urgent case more budget */ if (urgent_only) @@ -213,11 +218,12 @@ static void __linkwatch_run_queue(int urgent_only) clear_bit(LW_URGENT, &linkwatch_flags); spin_lock_irq(&lweventlist_lock); - while (!list_empty(&lweventlist) && do_dev > 0) { + list_splice_init(&lweventlist, &wrk); + + while (!list_empty(&wrk) && do_dev > 0) { struct net_device *dev; - dev = list_first_entry(&lweventlist, struct net_device, - link_watch_list); + dev = list_first_entry(&wrk, struct net_device, link_watch_list); list_del_init(&dev->link_watch_list); if (!netif_device_present(dev) || @@ -235,6 +241,9 @@ static void __linkwatch_run_queue(int urgent_only) spin_lock_irq(&lweventlist_lock); } + /* Add the remaining work back to lweventlist */ + list_splice_init(&wrk, &lweventlist); + if (!list_empty(&lweventlist)) linkwatch_schedule_work(0); spin_unlock_irq(&lweventlist_lock); -- cgit From e6795330f88b4f643c649a02662d47b779340535 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Tue, 5 Dec 2023 22:08:38 +0100 Subject: xdp: Add VLAN tag hint Implement functionality that enables drivers to expose VLAN tag to XDP code. VLAN tag is represented by 2 variables: - protocol ID, which is passed to bpf code in BE - VLAN TCI, in host byte order Acked-by: Stanislav Fomichev Signed-off-by: Larysa Zaremba Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/r/20231205210847.28460-10-larysa.zaremba@intel.com Signed-off-by: Alexei Starovoitov --- net/core/xdp.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'net/core') diff --git a/net/core/xdp.c b/net/core/xdp.c index b6f1d6dab3f2..4869c1c2d8f3 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -736,6 +736,39 @@ __bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash, return -EOPNOTSUPP; } +/** + * bpf_xdp_metadata_rx_vlan_tag - Get XDP packet outermost VLAN tag + * @ctx: XDP context pointer. + * @vlan_proto: Destination pointer for VLAN Tag protocol identifier (TPID). + * @vlan_tci: Destination pointer for VLAN TCI (VID + DEI + PCP) + * + * In case of success, ``vlan_proto`` contains *Tag protocol identifier (TPID)*, + * usually ``ETH_P_8021Q`` or ``ETH_P_8021AD``, but some networks can use + * custom TPIDs. ``vlan_proto`` is stored in **network byte order (BE)** + * and should be used as follows: + * ``if (vlan_proto == bpf_htons(ETH_P_8021Q)) do_something();`` + * + * ``vlan_tci`` contains the remaining 16 bits of a VLAN tag. + * Driver is expected to provide those in **host byte order (usually LE)**, + * so the bpf program should not perform byte conversion. + * According to 802.1Q standard, *VLAN TCI (Tag control information)* + * is a bit field that contains: + * *VLAN identifier (VID)* that can be read with ``vlan_tci & 0xfff``, + * *Drop eligible indicator (DEI)* - 1 bit, + * *Priority code point (PCP)* - 3 bits. + * For detailed meaning of DEI and PCP, please refer to other sources. + * + * Return: + * * Returns 0 on success or ``-errno`` on error. + * * ``-EOPNOTSUPP`` : device driver doesn't implement kfunc + * * ``-ENODATA`` : VLAN tag was not stripped or is not available + */ +__bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx, + __be16 *vlan_proto, u16 *vlan_tci) +{ + return -EOPNOTSUPP; +} + __bpf_kfunc_end_defs(); BTF_SET8_START(xdp_metadata_kfunc_ids) -- cgit From 0a149ab78ee220c75eef797abea7a29f4490e226 Mon Sep 17 00:00:00 2001 From: Liang Chen Date: Tue, 12 Dec 2023 12:46:11 +0800 Subject: page_pool: transition to reference count management after page draining To support multiple users referencing the same fragment, 'pp_frag_count' is renamed to 'pp_ref_count', transitioning pp pages from fragment management to reference count management after draining based on the suggestion from [1]. The idea is that the concept of fragmenting exists before the page is drained, and all related functions retain their current names. However, once the page is drained, its management shifts to being governed by 'pp_ref_count'. Therefore, all functions associated with that lifecycle stage of a pp page are renamed. [1] http://lore.kernel.org/netdev/f71d9448-70c8-8793-dc9a-0eb48a570300@huawei.com Signed-off-by: Liang Chen Reviewed-by: Yunsheng Lin Reviewed-by: Ilias Apalodimas Reviewed-by: Mina Almasry Link: https://lore.kernel.org/r/20231212044614.42733-2-liangchen.linux@gmail.com Signed-off-by: Jakub Kicinski --- net/core/page_pool.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/page_pool.c b/net/core/page_pool.c index c2e7c9a6efbe..c92b757369f2 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -677,8 +677,8 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, return NULL; } -void page_pool_put_defragged_page(struct page_pool *pool, struct page *page, - unsigned int dma_sync_size, bool allow_direct) +void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) { page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct); if (page && !page_pool_recycle_in_ring(pool, page)) { @@ -687,7 +687,7 @@ void page_pool_put_defragged_page(struct page_pool *pool, struct page *page, page_pool_return_page(pool, page); } } -EXPORT_SYMBOL(page_pool_put_defragged_page); +EXPORT_SYMBOL(page_pool_put_unrefed_page); /** * page_pool_put_page_bulk() - release references on multiple pages @@ -714,7 +714,7 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data, struct page *page = virt_to_head_page(data[i]); /* It is not the last user for the page frag case */ - if (!page_pool_is_last_frag(page)) + if (!page_pool_is_last_ref(page)) continue; page = __page_pool_put_page(pool, page, -1, false); @@ -756,7 +756,7 @@ static struct page *page_pool_drain_frag(struct page_pool *pool, long drain_count = BIAS_MAX - pool->frag_users; /* Some user is still using the page frag */ - if (likely(page_pool_defrag_page(page, drain_count))) + if (likely(page_pool_unref_page(page, drain_count))) return NULL; if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) { @@ -777,7 +777,7 @@ static void page_pool_free_frag(struct page_pool *pool) pool->frag_page = NULL; - if (!page || page_pool_defrag_page(page, drain_count)) + if (!page || page_pool_unref_page(page, drain_count)) return; page_pool_return_page(pool, page); -- cgit From c3f687d8dfeb33cffbb8f47c30002babfc4895d2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 7 Dec 2023 16:52:32 -0800 Subject: net: page_pool: factor out releasing DMA from releasing the page Releasing the DMA mapping will be useful for other types of pages, so factor it out. Make sure compiler inlines it, to avoid any regressions. Signed-off-by: Mina Almasry Reviewed-by: Shakeel Butt Reviewed-by: Ilias Apalodimas Signed-off-by: Jakub Kicinski --- net/core/page_pool.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'net/core') diff --git a/net/core/page_pool.c b/net/core/page_pool.c index c92b757369f2..dd5a72533f2b 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -548,21 +548,16 @@ s32 page_pool_inflight(const struct page_pool *pool, bool strict) return inflight; } -/* Disconnects a page (from a page_pool). API users can have a need - * to disconnect a page (from a page_pool), to allow it to be used as - * a regular page (that will eventually be returned to the normal - * page-allocator via put_page). - */ -static void page_pool_return_page(struct page_pool *pool, struct page *page) +static __always_inline +void __page_pool_release_page_dma(struct page_pool *pool, struct page *page) { dma_addr_t dma; - int count; if (!(pool->p.flags & PP_FLAG_DMA_MAP)) /* Always account for inflight pages, even if we didn't * map them */ - goto skip_dma_unmap; + return; dma = page_pool_get_dma_addr(page); @@ -571,7 +566,19 @@ static void page_pool_return_page(struct page_pool *pool, struct page *page) PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); page_pool_set_dma_addr(page, 0); -skip_dma_unmap: +} + +/* Disconnects a page (from a page_pool). API users can have a need + * to disconnect a page (from a page_pool), to allow it to be used as + * a regular page (that will eventually be returned to the normal + * page-allocator via put_page). + */ +void page_pool_return_page(struct page_pool *pool, struct page *page) +{ + int count; + + __page_pool_release_page_dma(pool, page); + page_pool_clear_pp_info(page); /* This may be the last page returned, releasing the pool, so -- cgit From 4944566706b27918ca15eda913889db296792415 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 14 Dec 2023 10:48:59 +0000 Subject: net: increase optmem_max default value For many years, /proc/sys/net/core/optmem_max default value on a 64bit kernel has been 20 KB. Regular usage of TCP tx zerocopy needs a bit more. Google has used 128KB as the default value for 7 years without any problem. Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Reviewed-by: Willem de Bruijn Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/core/sock.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index fef349dd72fa..08ecdc68d2df 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -283,8 +283,10 @@ EXPORT_SYMBOL(sysctl_rmem_max); __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; -/* Maximal space eaten by iovec or ancillary data plus some space */ -int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); +/* Limits per socket sk_omem_alloc usage. + * TCP zerocopy regular usage needs 128 KB. + */ +int sysctl_optmem_max __read_mostly = 128 * 1024; EXPORT_SYMBOL(sysctl_optmem_max); int sysctl_tstamp_allow_data __read_mostly = 1; -- cgit From f5769faeec36b9d5b9df2c3e4f05a76d04ffd9c9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 14 Dec 2023 10:49:00 +0000 Subject: net: Namespace-ify sysctl_optmem_max optmem_max being used in tx zerocopy, we want to be able to control it on a netns basis. Following patch changes two tests. Tested: oqq130:~# cat /proc/sys/net/core/optmem_max 131072 oqq130:~# echo 1000000 >/proc/sys/net/core/optmem_max oqq130:~# cat /proc/sys/net/core/optmem_max 1000000 oqq130:~# unshare -n oqq130:~# cat /proc/sys/net/core/optmem_max 131072 oqq130:~# exit logout oqq130:~# cat /proc/sys/net/core/optmem_max 1000000 Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/core/bpf_sk_storage.c | 3 ++- net/core/filter.c | 12 +++++++----- net/core/net_namespace.c | 4 ++++ net/core/sock.c | 10 ++-------- net/core/sysctl_net_core.c | 15 ++++++++------- 5 files changed, 23 insertions(+), 21 deletions(-) (limited to 'net/core') diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index cca7594be92e..6c4d90b24d46 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -275,9 +275,10 @@ BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) static int bpf_sk_storage_charge(struct bpf_local_storage_map *smap, void *owner, u32 size) { - int optmem_max = READ_ONCE(sysctl_optmem_max); struct sock *sk = (struct sock *)owner; + int optmem_max; + optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); /* same check as in sock_kmalloc() */ if (size <= optmem_max && atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { diff --git a/net/core/filter.c b/net/core/filter.c index eedb33f3e998..6d89a9cf33c9 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1219,8 +1219,8 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) */ static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp) { + int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); u32 filter_size = bpf_prog_size(fp->prog->len); - int optmem_max = READ_ONCE(sysctl_optmem_max); /* same check as in sock_kmalloc() */ if (filter_size <= optmem_max && @@ -1550,12 +1550,13 @@ EXPORT_SYMBOL_GPL(sk_attach_filter); int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) { struct bpf_prog *prog = __get_filter(fprog, sk); - int err; + int err, optmem_max; if (IS_ERR(prog)) return PTR_ERR(prog); - if (bpf_prog_size(prog->len) > READ_ONCE(sysctl_optmem_max)) + optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); + if (bpf_prog_size(prog->len) > optmem_max) err = -ENOMEM; else err = reuseport_attach_prog(sk, prog); @@ -1594,7 +1595,7 @@ int sk_attach_bpf(u32 ufd, struct sock *sk) int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) { struct bpf_prog *prog; - int err; + int err, optmem_max; if (sock_flag(sk, SOCK_FILTER_LOCKED)) return -EPERM; @@ -1622,7 +1623,8 @@ int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) } } else { /* BPF_PROG_TYPE_SOCKET_FILTER */ - if (bpf_prog_size(prog->len) > READ_ONCE(sysctl_optmem_max)) { + optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); + if (bpf_prog_size(prog->len) > optmem_max) { err = -ENOMEM; goto err_prog_put; } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index cb8bcbff9e83..72799533426b 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -372,6 +372,10 @@ out_undo: static int __net_init net_defaults_init_net(struct net *net) { net->core.sysctl_somaxconn = SOMAXCONN; + /* Limits per socket sk_omem_alloc usage. + * TCP zerocopy regular usage needs 128 KB. + */ + net->core.sysctl_optmem_max = 128 * 1024; net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED; return 0; diff --git a/net/core/sock.c b/net/core/sock.c index 08ecdc68d2df..446e945f736b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -283,12 +283,6 @@ EXPORT_SYMBOL(sysctl_rmem_max); __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; -/* Limits per socket sk_omem_alloc usage. - * TCP zerocopy regular usage needs 128 KB. - */ -int sysctl_optmem_max __read_mostly = 128 * 1024; -EXPORT_SYMBOL(sysctl_optmem_max); - int sysctl_tstamp_allow_data __read_mostly = 1; DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); @@ -2653,7 +2647,7 @@ struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > - READ_ONCE(sysctl_optmem_max)) + READ_ONCE(sock_net(sk)->core.sysctl_optmem_max)) return NULL; skb = alloc_skb(size, priority); @@ -2671,7 +2665,7 @@ struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, */ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) { - int optmem_max = READ_ONCE(sysctl_optmem_max); + int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); if ((unsigned int)size <= optmem_max && atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 03f1edb948d7..0f0cb1465e08 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -508,13 +508,6 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "optmem_max", - .data = &sysctl_optmem_max, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tstamp_allow_data", .data = &sysctl_tstamp_allow_data, @@ -673,6 +666,14 @@ static struct ctl_table netns_core_table[] = { .extra1 = SYSCTL_ZERO, .proc_handler = proc_dointvec_minmax }, + { + .procname = "optmem_max", + .data = &init_net.core.sysctl_optmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .extra1 = SYSCTL_ZERO, + .proc_handler = proc_dointvec_minmax + }, { .procname = "txrehash", .data = &init_net.core.sysctl_txrehash, -- cgit From aaf153aecef1d1831d0d6d371d5c11cf02f0337e Mon Sep 17 00:00:00 2001 From: Liang Chen Date: Fri, 15 Dec 2023 11:30:09 +0800 Subject: page_pool: halve BIAS_MAX for multiple user references of a fragment Up to now, we were only subtracting from the number of used page fragments to figure out when a page could be freed or recycled. A following patch introduces support for multiple users referencing the same fragment. So reduce the initial page fragments value to half to avoid overflowing. Signed-off-by: Liang Chen Reviewed-by: Yunsheng Lin Reviewed-by: Mina Almasry Reviewed-by: Ilias Apalodimas Signed-off-by: David S. Miller --- net/core/page_pool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/page_pool.c b/net/core/page_pool.c index dd5a72533f2b..4933762e5a6b 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -28,7 +28,7 @@ #define DEFER_TIME (msecs_to_jiffies(1000)) #define DEFER_WARN_INTERVAL (60 * HZ) -#define BIAS_MAX LONG_MAX +#define BIAS_MAX (LONG_MAX >> 1) #ifdef CONFIG_PAGE_POOL_STATS /* alloc_stat_inc is intended to be used in softirq context */ -- cgit From 8cfa2dee325f72f286f8f3210f867cbb981f2302 Mon Sep 17 00:00:00 2001 From: Liang Chen Date: Fri, 15 Dec 2023 11:30:10 +0800 Subject: skbuff: Add a function to check if a page belongs to page_pool Wrap code for checking if a page is a page_pool page into a function for better readability and ease of reuse. Signed-off-by: Liang Chen Reviewed-by: Yunsheng Lin Reviewed-by: Ilias Apalodimas Reviewed-by: Mina Almasry Signed-off-by: David S. Miller --- net/core/skbuff.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 83af8aaeb893..2f977fffd3ec 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -890,6 +890,11 @@ static void skb_clone_fraglist(struct sk_buff *skb) skb_get(list); } +static bool is_pp_page(struct page *page) +{ + return (page->pp_magic & ~0x3UL) == PP_SIGNATURE; +} + #if IS_ENABLED(CONFIG_PAGE_POOL) bool napi_pp_put_page(struct page *page, bool napi_safe) { @@ -905,7 +910,7 @@ bool napi_pp_put_page(struct page *page, bool napi_safe) * and page_is_pfmemalloc() is checked in __page_pool_put_page() * to avoid recycling the pfmemalloc page. */ - if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE)) + if (unlikely(!is_pp_page(page))) return false; pp = page->pp; -- cgit From f7dc3248dcfbdd81b5be64272f38b87a8e8085e7 Mon Sep 17 00:00:00 2001 From: Liang Chen Date: Fri, 15 Dec 2023 11:30:11 +0800 Subject: skbuff: Optimization of SKB coalescing for page pool In order to address the issues encountered with commit 1effe8ca4e34 ("skbuff: fix coalescing for page_pool fragment recycling"), the combination of the following condition was excluded from skb coalescing: from->pp_recycle = 1 from->cloned = 1 to->pp_recycle = 1 However, with page pool environments, the aforementioned combination can be quite common(ex. NetworkMananger may lead to the additional packet_type being registered, thus the cloning). In scenarios with a higher number of small packets, it can significantly affect the success rate of coalescing. For example, considering packets of 256 bytes size, our comparison of coalescing success rate is as follows: Without page pool: 70% With page pool: 13% Consequently, this has an impact on performance: Without page pool: 2.57 Gbits/sec With page pool: 2.26 Gbits/sec Therefore, it seems worthwhile to optimize this scenario and enable coalescing of this particular combination. To achieve this, we need to ensure the correct increment of the "from" SKB page's page pool reference count (pp_ref_count). Following this optimization, the success rate of coalescing measured in our environment has improved as follows: With page pool: 60% This success rate is approaching the rate achieved without using page pool, and the performance has also been improved: With page pool: 2.52 Gbits/sec Below is the performance comparison for small packets before and after this optimization. We observe no impact to packets larger than 4K. packet size before after improved (bytes) (Gbits/sec) (Gbits/sec) 128 1.19 1.27 7.13% 256 2.26 2.52 11.75% 512 4.13 4.81 16.50% 1024 6.17 6.73 9.05% 2048 14.54 15.47 6.45% 4096 25.44 27.87 9.52% Signed-off-by: Liang Chen Reviewed-by: Yunsheng Lin Suggested-by: Jason Wang Reviewed-by: Mina Almasry Signed-off-by: David S. Miller --- net/core/skbuff.c | 52 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 12 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 2f977fffd3ec..4d4b11b0a83d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -947,6 +947,37 @@ static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe) return napi_pp_put_page(virt_to_page(data), napi_safe); } +/** + * skb_pp_frag_ref() - Increase fragment references of a page pool aware skb + * @skb: page pool aware skb + * + * Increase the fragment reference count (pp_ref_count) of a skb. This is + * intended to gain fragment references only for page pool aware skbs, + * i.e. when skb->pp_recycle is true, and not for fragments in a + * non-pp-recycling skb. It has a fallback to increase references on normal + * pages, as page pool aware skbs may also have normal page fragments. + */ +static int skb_pp_frag_ref(struct sk_buff *skb) +{ + struct skb_shared_info *shinfo; + struct page *head_page; + int i; + + if (!skb->pp_recycle) + return -EINVAL; + + shinfo = skb_shinfo(skb); + + for (i = 0; i < shinfo->nr_frags; i++) { + head_page = compound_head(skb_frag_page(&shinfo->frags[i])); + if (likely(is_pp_page(head_page))) + page_pool_ref_page(head_page); + else + page_ref_inc(head_page); + } + return 0; +} + static void skb_kfree_head(void *head, unsigned int end_offset) { if (end_offset == SKB_SMALL_HEAD_HEADROOM) @@ -5770,17 +5801,12 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, return false; /* In general, avoid mixing page_pool and non-page_pool allocated - * pages within the same SKB. Additionally avoid dealing with clones - * with page_pool pages, in case the SKB is using page_pool fragment - * references (page_pool_alloc_frag()). Since we only take full page - * references for cloned SKBs at the moment that would result in - * inconsistent reference counts. - * In theory we could take full references if @from is cloned and - * !@to->pp_recycle but its tricky (due to potential race with - * the clone disappearing) and rare, so not worth dealing with. + * pages within the same SKB. In theory we could take full + * references if @from is cloned and !@to->pp_recycle but its + * tricky (due to potential race with the clone disappearing) and + * rare, so not worth dealing with. */ - if (to->pp_recycle != from->pp_recycle || - (from->pp_recycle && skb_cloned(from))) + if (to->pp_recycle != from->pp_recycle) return false; if (len <= skb_tailroom(to)) { @@ -5837,8 +5863,10 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, /* if the skb is not cloned this does nothing * since we set nr_frags to 0. */ - for (i = 0; i < from_shinfo->nr_frags; i++) - __skb_frag_ref(&from_shinfo->frags[i]); + if (skb_pp_frag_ref(from)) { + for (i = 0; i < from_shinfo->nr_frags; i++) + __skb_frag_ref(&from_shinfo->frags[i]); + } to->truesize += delta; to->len += len; -- cgit From 174523479aae31b17c043de127c87ff2aef3d54e Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Fri, 15 Dec 2023 14:57:11 -0300 Subject: net: rtnl: use rcu_replace_pointer_rtnl in rtnl_unregister_* With the introduction of the rcu_replace_pointer_rtnl helper, cleanup the rtnl_unregister_* functions to use the helper instead of open coding it. Signed-off-by: Pedro Tammela Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 5e0ab4c08f72..94c4572512b8 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -342,8 +342,7 @@ int rtnl_unregister(int protocol, int msgtype) return -ENOENT; } - link = rtnl_dereference(tab[msgindex]); - RCU_INIT_POINTER(tab[msgindex], NULL); + link = rcu_replace_pointer_rtnl(tab[msgindex], NULL); rtnl_unlock(); kfree_rcu(link, rcu); @@ -368,18 +367,13 @@ void rtnl_unregister_all(int protocol) BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); rtnl_lock(); - tab = rtnl_dereference(rtnl_msg_handlers[protocol]); + tab = rcu_replace_pointer_rtnl(rtnl_msg_handlers[protocol], NULL); if (!tab) { rtnl_unlock(); return; } - RCU_INIT_POINTER(rtnl_msg_handlers[protocol], NULL); for (msgindex = 0; msgindex < RTM_NR_MSGTYPES; msgindex++) { - link = rtnl_dereference(tab[msgindex]); - if (!link) - continue; - - RCU_INIT_POINTER(tab[msgindex], NULL); + link = rcu_replace_pointer_rtnl(tab[msgindex], NULL); kfree_rcu(link, rcu); } rtnl_unlock(); -- cgit From 2130c519a401e576647040043cb46d6fdc361dcc Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 18 Dec 2023 15:19:04 -0800 Subject: bpf: Use nla_ok() instead of checking nla_len directly nla_len may also be too short to be sane, in which case after recent changes nla_len() will return a wrapped value. Fixes: 172db56d90d2 ("netlink: Return unsigned value for nla_len()") Reported-by: syzbot+f43a23b6e622797c7a28@syzkaller.appspotmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann Reviewed-by: Simon Horman Link: https://lore.kernel.org/bpf/20231218231904.260440-1-kuba@kernel.org --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 4ff6100c6a27..3cc52b82bab8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -203,7 +203,7 @@ BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) return 0; nla = (struct nlattr *) &skb->data[a]; - if (nla->nla_len > skb->len - a) + if (!nla_ok(nla, skb->len - a)) return 0; nla = nla_find_nested(nla, x); -- cgit From d17aff807f845cf93926c28705216639c7279110 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 19 Dec 2023 07:37:35 -0800 Subject: Revert BPF token-related functionality This patch includes the following revert (one conflicting BPF FS patch and three token patch sets, represented by merge commits): - revert 0f5d5454c723 "Merge branch 'bpf-fs-mount-options-parsing-follow-ups'"; - revert 750e785796bb "bpf: Support uid and gid when mounting bpffs"; - revert 733763285acf "Merge branch 'bpf-token-support-in-libbpf-s-bpf-object'"; - revert c35919dcce28 "Merge branch 'bpf-token-and-bpf-fs-based-delegation'". Link: https://lore.kernel.org/bpf/CAHk-=wg7JuFYwGy=GOMbRCtOL+jwSQsdUaBsRWkDVYbxipbM5A@mail.gmail.com Signed-off-by: Andrii Nakryiko --- net/core/filter.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 3cc52b82bab8..24061f29c9dd 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -87,7 +87,7 @@ #include "dev.h" static const struct bpf_func_proto * -bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); +bpf_sk_base_func_proto(enum bpf_func_id func_id); int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len) { @@ -7862,7 +7862,7 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: - return bpf_base_func_proto(func_id, prog); + return bpf_base_func_proto(func_id); } } @@ -7955,7 +7955,7 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return NULL; } default: - return bpf_sk_base_func_proto(func_id, prog); + return bpf_sk_base_func_proto(func_id); } } @@ -7974,7 +7974,7 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; default: - return bpf_sk_base_func_proto(func_id, prog); + return bpf_sk_base_func_proto(func_id); } } @@ -8161,7 +8161,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #endif #endif default: - return bpf_sk_base_func_proto(func_id, prog); + return bpf_sk_base_func_proto(func_id); } } @@ -8220,7 +8220,7 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #endif #endif default: - return bpf_sk_base_func_proto(func_id, prog); + return bpf_sk_base_func_proto(func_id); } #if IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES) @@ -8281,7 +8281,7 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_tcp_sock_proto; #endif /* CONFIG_INET */ default: - return bpf_sk_base_func_proto(func_id, prog); + return bpf_sk_base_func_proto(func_id); } } @@ -8323,7 +8323,7 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_cgroup_classid_curr_proto; #endif default: - return bpf_sk_base_func_proto(func_id, prog); + return bpf_sk_base_func_proto(func_id); } } @@ -8367,7 +8367,7 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skc_lookup_tcp_proto; #endif default: - return bpf_sk_base_func_proto(func_id, prog); + return bpf_sk_base_func_proto(func_id); } } @@ -8378,7 +8378,7 @@ flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_load_bytes: return &bpf_flow_dissector_load_bytes_proto; default: - return bpf_sk_base_func_proto(func_id, prog); + return bpf_sk_base_func_proto(func_id); } } @@ -8405,7 +8405,7 @@ lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_under_cgroup: return &bpf_skb_under_cgroup_proto; default: - return bpf_sk_base_func_proto(func_id, prog); + return bpf_sk_base_func_proto(func_id); } } @@ -8580,7 +8580,7 @@ static bool cg_skb_is_valid_access(int off, int size, return false; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): - if (!bpf_token_capable(prog->aux->token, CAP_BPF)) + if (!bpf_capable()) return false; break; } @@ -8592,7 +8592,7 @@ static bool cg_skb_is_valid_access(int off, int size, case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; case bpf_ctx_range(struct __sk_buff, tstamp): - if (!bpf_token_capable(prog->aux->token, CAP_BPF)) + if (!bpf_capable()) return false; break; default: @@ -11236,7 +11236,7 @@ sk_reuseport_func_proto(enum bpf_func_id func_id, case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: - return bpf_base_func_proto(func_id, prog); + return bpf_base_func_proto(func_id); } } @@ -11418,7 +11418,7 @@ sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_sk_release: return &bpf_sk_release_proto; default: - return bpf_sk_base_func_proto(func_id, prog); + return bpf_sk_base_func_proto(func_id); } } @@ -11752,7 +11752,7 @@ const struct bpf_func_proto bpf_sock_from_file_proto = { }; static const struct bpf_func_proto * -bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +bpf_sk_base_func_proto(enum bpf_func_id func_id) { const struct bpf_func_proto *func; @@ -11781,10 +11781,10 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: - return bpf_base_func_proto(func_id, prog); + return bpf_base_func_proto(func_id); } - if (!bpf_token_capable(prog->aux->token, CAP_PERFMON)) + if (!perfmon_capable()) return NULL; return func; -- cgit From e0cd06f7fcb51b8acd6e68e64cc805be1283de9d Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 17 Dec 2023 10:32:37 +0200 Subject: rtnetlink: bridge: Use a different policy for MDB bulk delete For MDB bulk delete we will need to validate 'MDBA_SET_ENTRY' differently compared to regular delete. Specifically, allow the ifindex to be zero (in case not filtering on bridge port) and force the address to be zero as bulk delete based on address is not supported. Do that by introducing a new policy and choosing the correct policy based on the presence of the 'NLM_F_BULK' flag in the netlink message header. Use nlmsg_parse() for strict validation. Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 94c4572512b8..a0fad4d8856c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -6410,17 +6410,64 @@ static int rtnl_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, return dev->netdev_ops->ndo_mdb_add(dev, tb, nlh->nlmsg_flags, extack); } +static int rtnl_validate_mdb_entry_del_bulk(const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct br_mdb_entry *entry = nla_data(attr); + struct br_mdb_entry zero_entry = {}; + + if (nla_len(attr) != sizeof(struct br_mdb_entry)) { + NL_SET_ERR_MSG_ATTR(extack, attr, "Invalid attribute length"); + return -EINVAL; + } + + if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY) { + NL_SET_ERR_MSG(extack, "Unknown entry state"); + return -EINVAL; + } + + if (entry->flags) { + NL_SET_ERR_MSG(extack, "Entry flags cannot be set"); + return -EINVAL; + } + + if (entry->vid >= VLAN_N_VID - 1) { + NL_SET_ERR_MSG(extack, "Invalid entry VLAN id"); + return -EINVAL; + } + + if (memcmp(&entry->addr, &zero_entry.addr, sizeof(entry->addr))) { + NL_SET_ERR_MSG(extack, "Entry address cannot be set"); + return -EINVAL; + } + + return 0; +} + +static const struct nla_policy mdba_del_bulk_policy[MDBA_SET_ENTRY_MAX + 1] = { + [MDBA_SET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, + rtnl_validate_mdb_entry_del_bulk, + sizeof(struct br_mdb_entry)), + [MDBA_SET_ENTRY_ATTRS] = { .type = NLA_NESTED }, +}; + static int rtnl_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + bool del_bulk = !!(nlh->nlmsg_flags & NLM_F_BULK); struct nlattr *tb[MDBA_SET_ENTRY_MAX + 1]; struct net *net = sock_net(skb->sk); struct br_port_msg *bpm; struct net_device *dev; int err; - err = nlmsg_parse_deprecated(nlh, sizeof(*bpm), tb, - MDBA_SET_ENTRY_MAX, mdba_policy, extack); + if (!del_bulk) + err = nlmsg_parse_deprecated(nlh, sizeof(*bpm), tb, + MDBA_SET_ENTRY_MAX, mdba_policy, + extack); + else + err = nlmsg_parse(nlh, sizeof(*bpm), tb, MDBA_SET_ENTRY_MAX, + mdba_del_bulk_policy, extack); if (err) return err; -- cgit From d8e81f131178dad603c6817421056030ed2f4ac2 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 17 Dec 2023 10:32:39 +0200 Subject: rtnetlink: bridge: Invoke MDB bulk deletion when needed Invoke the new MDB bulk deletion device operation when the 'NLM_F_BULK' flag is set in the netlink message header. Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a0fad4d8856c..ec185e131766 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -6488,6 +6488,14 @@ static int rtnl_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, return -EINVAL; } + if (del_bulk) { + if (!dev->netdev_ops->ndo_mdb_del_bulk) { + NL_SET_ERR_MSG(extack, "Device does not support MDB bulk deletion"); + return -EOPNOTSUPP; + } + return dev->netdev_ops->ndo_mdb_del_bulk(dev, tb, extack); + } + if (!dev->netdev_ops->ndo_mdb_del) { NL_SET_ERR_MSG(extack, "Device does not support MDB operations"); return -EOPNOTSUPP; -- cgit From 2601e9c4b1176253e33025ca24e56ed67c8d434f Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 17 Dec 2023 10:32:42 +0200 Subject: rtnetlink: bridge: Enable MDB bulk deletion Now that both the common code as well as individual drivers support MDB bulk deletion, allow user space to make such requests. Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index ec185e131766..5f6ed6da3cfc 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -6741,5 +6741,6 @@ void __init rtnetlink_init(void) rtnl_register(PF_BRIDGE, RTM_GETMDB, rtnl_mdb_get, rtnl_mdb_dump, 0); rtnl_register(PF_BRIDGE, RTM_NEWMDB, rtnl_mdb_add, NULL, 0); - rtnl_register(PF_BRIDGE, RTM_DELMDB, rtnl_mdb_del, NULL, 0); + rtnl_register(PF_BRIDGE, RTM_DELMDB, rtnl_mdb_del, NULL, + RTNL_FLAG_BULK_DEL_SUPPORTED); } -- cgit From fb2780721ca5e9f78bbe4544b819b929a982df9c Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Sat, 16 Dec 2023 17:44:34 -0300 Subject: net: sched: Move drop_reason to struct tc_skb_cb Move drop_reason from struct tcf_result to skb cb - more specifically to struct tc_skb_cb. With that, we'll be able to also set the drop reason for the remaining qdiscs (aside from clsact) that do not have access to tcf_result when time comes to set the skb drop reason. Signed-off-by: Victor Nogueira Acked-by: Daniel Borkmann Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 0432b04cf9b0..16af89a733e4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3923,14 +3923,14 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb, tc_skb_cb(skb)->mru = 0; tc_skb_cb(skb)->post_ct = false; - res.drop_reason = *drop_reason; + tcf_set_drop_reason(skb, *drop_reason); mini_qdisc_bstats_cpu_update(miniq, skb); ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false); /* Only tcf related quirks below. */ switch (ret) { case TC_ACT_SHOT: - *drop_reason = res.drop_reason; + *drop_reason = tcf_get_drop_reason(skb); mini_qdisc_qstats_cpu_drop(miniq); break; case TC_ACT_OK: -- cgit From b6a3c6066afc2cb7b92f45c67ab0b12ded81cb11 Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Sat, 16 Dec 2023 17:44:35 -0300 Subject: net: sched: Make tc-related drop reason more flexible for remaining qdiscs Incrementing on Daniel's patch[1], make tc-related drop reason more flexible for remaining qdiscs - that is, all qdiscs aside from clsact. In essence, the drop reason will be set by cls_api and act_api in case any error occurred in the data path. With that, we can give the user more detailed information so that they can distinguish between a policy drop or an error drop. [1] https://lore.kernel.org/all/20231009092655.22025-1-daniel@iogearbox.net Signed-off-by: Victor Nogueira Acked-by: Daniel Borkmann Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/core/dev.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 16af89a733e4..b87504078320 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3753,6 +3753,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, qdisc_calculate_pkt_len(skb, q); + tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_DROP); + if (q->flags & TCQ_F_NOLOCK) { if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) && qdisc_run_begin(q)) { @@ -3782,7 +3784,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, no_lock_out: if (unlikely(to_free)) kfree_skb_list_reason(to_free, - SKB_DROP_REASON_QDISC_DROP); + tcf_get_drop_reason(to_free)); return rc; } @@ -3837,7 +3839,8 @@ no_lock_out: } spin_unlock(root_lock); if (unlikely(to_free)) - kfree_skb_list_reason(to_free, SKB_DROP_REASON_QDISC_DROP); + kfree_skb_list_reason(to_free, + tcf_get_drop_reason(to_free)); if (unlikely(contended)) spin_unlock(&q->busylock); return rc; -- cgit From 90abde49ea85a8af9a56bbab8c419aefc77f919a Mon Sep 17 00:00:00 2001 From: "Radu Pirea (NXP OSS)" Date: Tue, 19 Dec 2023 16:53:25 +0200 Subject: net: rename dsa_realloc_skb to skb_ensure_writable_head_tail Rename dsa_realloc_skb to skb_ensure_writable_head_tail and move it to skbuff.c to use it as helper. Signed-off-by: Radu Pirea (NXP OSS) Signed-off-by: David S. Miller --- net/core/skbuff.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index ce5687ddb768..12d22c0b8551 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5995,6 +5995,31 @@ int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len) } EXPORT_SYMBOL(skb_ensure_writable); +int skb_ensure_writable_head_tail(struct sk_buff *skb, struct net_device *dev) +{ + int needed_headroom = dev->needed_headroom; + int needed_tailroom = dev->needed_tailroom; + + /* For tail taggers, we need to pad short frames ourselves, to ensure + * that the tail tag does not fail at its role of being at the end of + * the packet, once the conduit interface pads the frame. Account for + * that pad length here, and pad later. + */ + if (unlikely(needed_tailroom && skb->len < ETH_ZLEN)) + needed_tailroom += ETH_ZLEN - skb->len; + /* skb_headroom() returns unsigned int... */ + needed_headroom = max_t(int, needed_headroom - skb_headroom(skb), 0); + needed_tailroom = max_t(int, needed_tailroom - skb_tailroom(skb), 0); + + if (likely(!needed_headroom && !needed_tailroom && !skb_cloned(skb))) + /* No reallocation needed, yay! */ + return 0; + + return pskb_expand_head(skb, needed_headroom, needed_tailroom, + GFP_ATOMIC); +} +EXPORT_SYMBOL(skb_ensure_writable_head_tail); + /* remove VLAN header from packet and update csum accordingly. * expects a non skb_vlan_tag_present skb with a vlan tag payload */ -- cgit From 3fb65f6bc7dc19ce32efd4ea26cd0f59ac328ad5 Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Wed, 20 Dec 2023 07:37:57 +0800 Subject: net: pktgen: Use wait_event_freezable_timeout() for freezable kthread A freezable kernel thread can enter frozen state during freezing by either calling try_to_freeze() or using wait_event_freezable() and its variants. So for the following snippet of code in a kernel thread loop: wait_event_interruptible_timeout(); try_to_freeze(); We can change it to a simple wait_event_freezable_timeout() and then eliminate a function call. Signed-off-by: Kevin Hao Signed-off-by: David S. Miller --- net/core/pktgen.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 57cea67b7562..ea55a758a475 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3669,10 +3669,8 @@ static int pktgen_thread_worker(void *arg) if (unlikely(!pkt_dev && t->control == 0)) { if (t->net->pktgen_exiting) break; - wait_event_interruptible_timeout(t->queue, - t->control != 0, - HZ/10); - try_to_freeze(); + wait_event_freezable_timeout(t->queue, + t->control != 0, HZ / 10); continue; } -- cgit From cd4d7263d58ab98fd4dee876776e4da6c328faa3 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 20 Dec 2023 17:43:58 +0200 Subject: genetlink: Use internal flags for multicast groups As explained in commit e03781879a0d ("drop_monitor: Require 'CAP_SYS_ADMIN' when joining "events" group"), the "flags" field in the multicast group structure reuses uAPI flags despite the field not being exposed to user space. This makes it impossible to extend its use without adding new uAPI flags, which is inappropriate for internal kernel checks. Solve this by adding internal flags (i.e., "GENL_MCAST_*") and convert the existing users to use them instead of the uAPI flags. Tested using the reproducers in commit 44ec98ea5ea9 ("psample: Require 'CAP_NET_ADMIN' when joining "packets" group") and commit e03781879a0d ("drop_monitor: Require 'CAP_SYS_ADMIN' when joining "events" group"). No functional changes intended. Signed-off-by: Ido Schimmel Reviewed-by: Mat Martineau Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- net/core/drop_monitor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index b240d9aae4a6..b0f221d658be 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -183,7 +183,7 @@ out: } static const struct genl_multicast_group dropmon_mcgrps[] = { - { .name = "events", .cap_sys_admin = 1 }, + { .name = "events", .flags = GENL_MCAST_CAP_SYS_ADMIN, }, }; static void send_dm_alert(struct work_struct *work) -- cgit From 02018c544ef113e980a2349eba89003d6f399d22 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Thu, 21 Dec 2023 19:00:34 +0100 Subject: net: phy: Introduce ethernet link topology representation Link topologies containing multiple network PHYs attached to the same net_device can be found when using a PHY as a media converter for use with an SFP connector, on which an SFP transceiver containing a PHY can be used. With the current model, the transceiver's PHY can't be used for operations such as cable testing, timestamping, macsec offload, etc. The reason being that most of the logic for these configuration, coming from either ethtool netlink or ioctls tend to use netdev->phydev, which in multi-phy systems will reference the PHY closest to the MAC. Introduce a numbering scheme allowing to enumerate PHY devices that belong to any netdev, which can in turn allow userspace to take more precise decisions with regard to each PHY's configuration. The numbering is maintained per-netdev, in a phy_device_list. The numbering works similarly to a netdevice's ifindex, with identifiers that are only recycled once INT_MAX has been reached. This prevents races that could occur between PHY listing and SFP transceiver removal/insertion. The identifiers are assigned at phy_attach time, as the numbering depends on the netdevice the phy is attached to. Signed-off-by: Maxime Chevallier Signed-off-by: David S. Miller --- net/core/dev.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index f9d4b550ef4b..df04cbf77551 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -153,6 +153,7 @@ #include #include #include +#include #include "dev.h" #include "net-sysfs.h" @@ -10875,6 +10876,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, #ifdef CONFIG_NET_SCHED hash_init(dev->qdisc_hash); #endif + phy_link_topo_init(&dev->link_topo); + dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); -- cgit From 993498e537af9260e697219ce41b41b22b6199cc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 Dec 2023 14:07:47 +0000 Subject: net-device: move gso_partial_features to net_device_read_tx dev->gso_partial_features is read from tx fast path for GSO packets. Move it to appropriate section to avoid a cache line miss. Fixes: 43a71cd66b9c ("net-device: reorganize net_device fast path variables") Signed-off-by: Eric Dumazet Cc: Coco Li Cc: David Ahern Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/core/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index df04cbf77551..31588a50b757 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11629,6 +11629,7 @@ static void __init net_dev_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_ipv4_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_segs); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_partial_features); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, num_tc); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, mtu); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, needed_headroom); @@ -11642,7 +11643,7 @@ static void __init net_dev_struct_check(void) #ifdef CONFIG_NET_XGRESS CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tcx_egress); #endif - CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_tx, 152); + CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_tx, 160); /* TXRX read-mostly hotpath */ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags); -- cgit From b4c1d4d9734cda4394da5b59ebf7d9ca3579561a Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Tue, 2 Jan 2024 15:15:19 +0800 Subject: fib: remove unnecessary input parameters in fib_default_rule_add When fib_default_rule_add is invoked, the value of the input parameter 'flags' is always 0. Rules uses kzalloc to allocate memory, so 'flags' has been initialized to 0. Therefore, remove the input parameter 'flags' in fib_default_rule_add. Signed-off-by: Zhengchao Shao Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240102071519.3781384-1-shaozhengchao@huawei.com Signed-off-by: Jakub Kicinski --- net/core/fib_rules.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 75282222e0b4..96622bfb838a 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -53,7 +53,7 @@ bool fib_rule_matchall(const struct fib_rule *rule) EXPORT_SYMBOL_GPL(fib_rule_matchall); int fib_default_rule_add(struct fib_rules_ops *ops, - u32 pref, u32 table, u32 flags) + u32 pref, u32 table) { struct fib_rule *r; @@ -65,7 +65,6 @@ int fib_default_rule_add(struct fib_rules_ops *ops, r->action = FR_ACT_TO_TBL; r->pref = pref; r->table = table; - r->flags = flags; r->proto = RTPROT_KERNEL; r->fr_net = ops->fro_net; r->uid_range = fib_kuid_range_unset; -- cgit From d3d344a1ca69d8fb2413e29e6400f3ad58a05c06 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 2 Jan 2024 16:22:20 +0000 Subject: net-device: move xdp_prog to net_device_read_rx xdp_prog is used in receive path, both from XDP enabled drivers and from netif_elide_gro(). This patch also removes two 4-bytes holes. Fixes: 43a71cd66b9c ("net-device: reorganize net_device fast path variables") Signed-off-by: Eric Dumazet Cc: Coco Li Cc: Simon Horman Link: https://lore.kernel.org/r/20240102162220.750823-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 31588a50b757..bc4ac49d4643 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11670,7 +11670,7 @@ static void __init net_dev_struct_check(void) #ifdef CONFIG_NET_XGRESS CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress); #endif - CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 96); + CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 104); } /* -- cgit From fe1eb24bd5ade085914248c527044e942f75e06a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 4 Jan 2024 16:04:35 -0800 Subject: Revert "Introduce PHY listing and link_topology tracking" This reverts commit 32bb4515e34469975abc936deb0a116c4a445817. This reverts commit d078d480639a4f3b5fc2d56247afa38e0956483a. This reverts commit fcc4b105caa4b844bf043375bf799c20a9c99db1. This reverts commit 345237dbc1bdbb274c9fb9ec38976261ff4a40b8. This reverts commit 7db69ec9cfb8b4ab50420262631fb2d1908b25bf. This reverts commit 95132a018f00f5dad38bdcfd4180d1af955d46f6. This reverts commit 63d5eaf35ac36cad00cfb3809d794ef0078c822b. This reverts commit c29451aefcb42359905d18678de38e52eccb3bb5. This reverts commit 2ab0edb505faa9ac90dee1732571390f074e8113. This reverts commit dedd702a35793ab462fce4c737eeba0badf9718e. This reverts commit 034fcc210349b873ece7356905be5c6ca11eef2a. This reverts commit 9c5625f559ad6fe9f6f733c11475bf470e637d34. This reverts commit 02018c544ef113e980a2349eba89003d6f399d22. Looks like we need more time for reviews, and incremental changes will be hard to make sense of. So revert. Link: https://lore.kernel.org/all/ZZP6FV5sXEf+xd58@shell.armlinux.org.uk/ Signed-off-by: Jakub Kicinski --- net/core/dev.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index bc4ac49d4643..f01a9b858347 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -153,7 +153,6 @@ #include #include #include -#include #include "dev.h" #include "net-sysfs.h" @@ -10876,8 +10875,6 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, #ifdef CONFIG_NET_SCHED hash_init(dev->qdisc_hash); #endif - phy_link_topo_init(&dev->link_topo); - dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); -- cgit From c4a5ee9c09aa11f394b57674f525085034f5f68b Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Fri, 5 Jan 2024 14:56:34 +0800 Subject: fib: rules: remove repeated assignment in fib_nl2rule In fib_nl2rule(), 'err' variable has been set to -EINVAL during declaration, and no need to set the 'err' variable to -EINVAL again. So, remove it. Signed-off-by: Zhengchao Shao Reviewed-by: Simon Horman Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/core/fib_rules.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/core') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 96622bfb838a..3f933ffcefc3 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -593,7 +593,6 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[FRA_TUN_ID]) nlrule->tun_id = nla_get_be64(tb[FRA_TUN_ID]); - err = -EINVAL; if (tb[FRA_L3MDEV] && fib_nl2rule_l3mdev(tb[FRA_L3MDEV], nlrule, extack) < 0) goto errout_free; -- cgit