Diffstat (limited to 'net/core/dev.c')
-rw-r--r--  net/core/dev.c  2613
1 file changed, 1886 insertions, 727 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index 6ea1d20676fb..9094c0fb8c68 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -92,6 +92,7 @@
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
+#include <linux/ethtool_netlink.h>
#include <linux/skbuff.h>
#include <linux/kthread.h>
#include <linux/bpf.h>
@@ -105,6 +106,7 @@
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/gro.h>
+#include <net/netdev_queues.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/checksum.h>
@@ -154,12 +156,16 @@
#include <linux/pm_runtime.h>
#include <linux/prandom.h>
#include <linux/once_lite.h>
+#include <net/netdev_lock.h>
#include <net/netdev_rx_queue.h>
#include <net/page_pool/types.h>
#include <net/page_pool/helpers.h>
+#include <net/page_pool/memory_provider.h>
#include <net/rps.h>
+#include <linux/phy_link_topology.h>
#include "dev.h"
+#include "devmem.h"
#include "net-sysfs.h"
static DEFINE_SPINLOCK(ptype_lock);
@@ -178,8 +184,6 @@ static DEFINE_SPINLOCK(napi_hash_lock);
static unsigned int napi_gen_id = NR_CPUS;
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
-static DECLARE_RWSEM(devnet_rename_sem);
-
static inline void dev_base_seq_inc(struct net *net)
{
unsigned int val = net->dev_base_seq + 1;
@@ -458,7 +462,9 @@ EXPORT_PER_CPU_SYMBOL(softnet_data);
* PP consumers must pay attention to run APIs in the appropriate context
* (e.g. NAPI context).
*/
-static DEFINE_PER_CPU(struct page_pool *, system_page_pool);
+DEFINE_PER_CPU(struct page_pool_bh, system_page_pool) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
#ifdef CONFIG_LOCKDEP
/*
@@ -568,10 +574,18 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
static inline struct list_head *ptype_head(const struct packet_type *pt)
{
- if (pt->type == htons(ETH_P_ALL))
- return pt->dev ? &pt->dev->ptype_all : &net_hotdata.ptype_all;
- else
- return pt->dev ? &pt->dev->ptype_specific :
+ if (pt->type == htons(ETH_P_ALL)) {
+ if (!pt->af_packet_net && !pt->dev)
+ return NULL;
+
+ return pt->dev ? &pt->dev->ptype_all :
+ &pt->af_packet_net->ptype_all;
+ }
+
+ if (pt->dev)
+ return &pt->dev->ptype_specific;
+
+ return pt->af_packet_net ? &pt->af_packet_net->ptype_specific :
&ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}
@@ -592,6 +606,9 @@ void dev_add_pack(struct packet_type *pt)
{
struct list_head *head = ptype_head(pt);
+ if (WARN_ON_ONCE(!head))
+ return;
+
spin_lock(&ptype_lock);
list_add_rcu(&pt->list, head);
spin_unlock(&ptype_lock);
@@ -616,6 +633,9 @@ void __dev_remove_pack(struct packet_type *pt)
struct list_head *head = ptype_head(pt);
struct packet_type *pt1;
+ if (!head)
+ return;
+
spin_lock(&ptype_lock);
list_for_each_entry(pt1, head, list) {
@@ -751,6 +771,80 @@ int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
}
EXPORT_SYMBOL_GPL(dev_fill_forward_path);
+/* must be called under rcu_read_lock(), as we don't take a reference */
+static struct napi_struct *napi_by_id(unsigned int napi_id)
+{
+ unsigned int hash = napi_id % HASH_SIZE(napi_hash);
+ struct napi_struct *napi;
+
+ hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
+ if (napi->napi_id == napi_id)
+ return napi;
+
+ return NULL;
+}
+
+/* must be called under rcu_read_lock(), as we don't take a reference */
+static struct napi_struct *
+netdev_napi_by_id(struct net *net, unsigned int napi_id)
+{
+ struct napi_struct *napi;
+
+ napi = napi_by_id(napi_id);
+ if (!napi)
+ return NULL;
+
+ if (WARN_ON_ONCE(!napi->dev))
+ return NULL;
+ if (!net_eq(net, dev_net(napi->dev)))
+ return NULL;
+
+ return napi;
+}
+
+/**
+ * netdev_napi_by_id_lock() - find a device by NAPI ID and lock it
+ * @net: the applicable net namespace
+ * @napi_id: ID of a NAPI of a target device
+ *
+ * Find a NAPI instance with @napi_id. Lock its device.
+ * The device must be in %NETREG_REGISTERED state for lookup to succeed.
+ * netdev_unlock() must be called to release it.
+ *
+ * Return: pointer to NAPI, its device with lock held, NULL if not found.
+ */
+struct napi_struct *
+netdev_napi_by_id_lock(struct net *net, unsigned int napi_id)
+{
+ struct napi_struct *napi;
+ struct net_device *dev;
+
+ rcu_read_lock();
+ napi = netdev_napi_by_id(net, napi_id);
+ if (!napi || READ_ONCE(napi->dev->reg_state) != NETREG_REGISTERED) {
+ rcu_read_unlock();
+ return NULL;
+ }
+
+ dev = napi->dev;
+ dev_hold(dev);
+ rcu_read_unlock();
+
+ dev = __netdev_put_lock(dev, net);
+ if (!dev)
+ return NULL;
+
+ rcu_read_lock();
+ napi = netdev_napi_by_id(net, napi_id);
+ if (napi && napi->dev != dev)
+ napi = NULL;
+ rcu_read_unlock();
+
+ if (!napi)
+ netdev_unlock(dev);
+ return napi;
+}
+
/**
* __dev_get_by_name - find a device by its name
* @net: the applicable net namespace
@@ -925,21 +1019,138 @@ EXPORT_SYMBOL(netdev_get_by_index);
* its reference counter increased so the caller must be careful
* about locking. The caller must hold RCU lock.
*/
-
struct net_device *dev_get_by_napi_id(unsigned int napi_id)
{
struct napi_struct *napi;
WARN_ON_ONCE(!rcu_read_lock_held());
- if (napi_id < MIN_NAPI_ID)
+ if (!napi_id_valid(napi_id))
return NULL;
napi = napi_by_id(napi_id);
return napi ? napi->dev : NULL;
}
-EXPORT_SYMBOL(dev_get_by_napi_id);
+
+/* Release the held reference on the net_device, and if the net_device
+ * is still registered try to lock the instance lock. If device is being
+ * unregistered NULL will be returned (but the reference has been released,
+ * either way!)
+ *
+ * This helper is intended for locking net_device after it has been looked up
+ * using a lockless lookup helper. Lock prevents the instance from going away.
+ */
+struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net)
+{
+ netdev_lock(dev);
+ if (dev->reg_state > NETREG_REGISTERED ||
+ dev->moving_ns || !net_eq(dev_net(dev), net)) {
+ netdev_unlock(dev);
+ dev_put(dev);
+ return NULL;
+ }
+ dev_put(dev);
+ return dev;
+}
+
+static struct net_device *
+__netdev_put_lock_ops_compat(struct net_device *dev, struct net *net)
+{
+ netdev_lock_ops_compat(dev);
+ if (dev->reg_state > NETREG_REGISTERED ||
+ dev->moving_ns || !net_eq(dev_net(dev), net)) {
+ netdev_unlock_ops_compat(dev);
+ dev_put(dev);
+ return NULL;
+ }
+ dev_put(dev);
+ return dev;
+}
+
+/**
+ * netdev_get_by_index_lock() - find a device by its ifindex
+ * @net: the applicable net namespace
+ * @ifindex: index of device
+ *
+ * Search for an interface by index. If a valid device
+ * with @ifindex is found it will be returned with netdev->lock held.
+ * netdev_unlock() must be called to release it.
+ *
+ * Return: pointer to a device with lock held, NULL if not found.
+ */
+struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex)
+{
+ struct net_device *dev;
+
+ dev = dev_get_by_index(net, ifindex);
+ if (!dev)
+ return NULL;
+
+ return __netdev_put_lock(dev, net);
+}
+
+struct net_device *
+netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex)
+{
+ struct net_device *dev;
+
+ dev = dev_get_by_index(net, ifindex);
+ if (!dev)
+ return NULL;
+
+ return __netdev_put_lock_ops_compat(dev, net);
+}
+
+struct net_device *
+netdev_xa_find_lock(struct net *net, struct net_device *dev,
+ unsigned long *index)
+{
+ if (dev)
+ netdev_unlock(dev);
+
+ do {
+ rcu_read_lock();
+ dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT);
+ if (!dev) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ dev_hold(dev);
+ rcu_read_unlock();
+
+ dev = __netdev_put_lock(dev, net);
+ if (dev)
+ return dev;
+
+ (*index)++;
+ } while (true);
+}
+
+struct net_device *
+netdev_xa_find_lock_ops_compat(struct net *net, struct net_device *dev,
+ unsigned long *index)
+{
+ if (dev)
+ netdev_unlock_ops_compat(dev);
+
+ do {
+ rcu_read_lock();
+ dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT);
+ if (!dev) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ dev_hold(dev);
+ rcu_read_unlock();
+
+ dev = __netdev_put_lock_ops_compat(dev, net);
+ if (dev)
+ return dev;
+
+ (*index)++;
+ } while (true);
+}
static DEFINE_SEQLOCK(netdev_rename_lock);
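The hunk above adds lookup-then-lock helpers (netdev_napi_by_id_lock(), __netdev_put_lock(), netdev_get_by_index_lock(), netdev_xa_find_lock()) that hand the caller a device with its instance lock held. A minimal caller sketch, illustrative only and not part of this patch, assuming the helpers as declared above:

/* Sketch: resolve an ifindex and work on the device with its instance
 * lock held. example_configure() is a placeholder, not a kernel API.
 */
static int example_configure(struct net *net, int ifindex)
{
        struct net_device *dev;
        int err = 0;

        dev = netdev_get_by_index_lock(net, ifindex);
        if (!dev)
                return -ENODEV;

        /* netdev->lock is held: the device stays in NETREG_REGISTERED
         * and cannot change netns until netdev_unlock() below.
         */
        /* ... queue/NAPI configuration would go here ... */

        netdev_unlock(dev);
        return err;
}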
@@ -952,6 +1163,7 @@ void netdev_copy_name(struct net_device *dev, char *name)
strscpy(name, dev->name, IFNAMSIZ);
} while (read_seqretry(&netdev_rename_lock, seq));
}
+EXPORT_IPV6_MOD_GPL(netdev_copy_name);
/**
* netdev_get_name - get a netdevice name, knowing its ifindex.
@@ -980,6 +1192,12 @@ out:
return ret;
}
+static bool dev_addr_cmp(struct net_device *dev, unsigned short type,
+ const char *ha)
+{
+ return dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len);
+}
+
/**
* dev_getbyhwaddr_rcu - find a device by its hardware address
* @net: the applicable net namespace
@@ -988,7 +1206,7 @@ out:
*
* Search for an interface by MAC address. Returns NULL if the device
* is not found or a pointer to the device.
- * The caller must hold RCU or RTNL.
+ * The caller must hold RCU.
* The returned device has not had its ref count increased
* and the caller must therefore be careful about locking
*
@@ -1000,14 +1218,39 @@ struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
struct net_device *dev;
for_each_netdev_rcu(net, dev)
- if (dev->type == type &&
- !memcmp(dev->dev_addr, ha, dev->addr_len))
+ if (dev_addr_cmp(dev, type, ha))
return dev;
return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
+/**
+ * dev_getbyhwaddr() - find a device by its hardware address
+ * @net: the applicable net namespace
+ * @type: media type of device
+ * @ha: hardware address
+ *
+ * Similar to dev_getbyhwaddr_rcu(), but the owner needs to hold
+ * rtnl_lock.
+ *
+ * Context: rtnl_lock() must be held.
+ * Return: pointer to the net_device, or NULL if not found
+ */
+struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type,
+ const char *ha)
+{
+ struct net_device *dev;
+
+ ASSERT_RTNL();
+ for_each_netdev(net, dev)
+ if (dev_addr_cmp(dev, type, ha))
+ return dev;
+
+ return NULL;
+}
+EXPORT_SYMBOL(dev_getbyhwaddr);
+
struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
struct net_device *dev, *ret = NULL;
@@ -1025,33 +1268,32 @@ struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
EXPORT_SYMBOL(dev_getfirstbyhwtype);
/**
- * __dev_get_by_flags - find any device with given flags
- * @net: the applicable net namespace
- * @if_flags: IFF_* values
- * @mask: bitmask of bits in if_flags to check
+ * netdev_get_by_flags_rcu - find any device with given flags
+ * @net: the applicable net namespace
+ * @tracker: tracking object for the acquired reference
+ * @if_flags: IFF_* values
+ * @mask: bitmask of bits in if_flags to check
+ *
+ * Search for any interface with the given flags.
*
- * Search for any interface with the given flags. Returns NULL if a device
- * is not found or a pointer to the device. Must be called inside
- * rtnl_lock(), and result refcount is unchanged.
+ * Context: rcu_read_lock() must be held.
+ * Returns: NULL if a device is not found or a pointer to the device.
*/
-
-struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
- unsigned short mask)
+struct net_device *netdev_get_by_flags_rcu(struct net *net, netdevice_tracker *tracker,
+ unsigned short if_flags, unsigned short mask)
{
- struct net_device *dev, *ret;
-
- ASSERT_RTNL();
+ struct net_device *dev;
- ret = NULL;
- for_each_netdev(net, dev) {
- if (((dev->flags ^ if_flags) & mask) == 0) {
- ret = dev;
- break;
+ for_each_netdev_rcu(net, dev) {
+ if (((READ_ONCE(dev->flags) ^ if_flags) & mask) == 0) {
+ netdev_hold(dev, tracker, GFP_ATOMIC);
+ return dev;
}
}
- return ret;
+
+ return NULL;
}
-EXPORT_SYMBOL(__dev_get_by_flags);
+EXPORT_IPV6_MOD(netdev_get_by_flags_rcu);
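
Unlike the RTNL-only helper it replaces, the new lookup runs under RCU and pins the result through a netdevice tracker. A hedged usage sketch follows; only netdev_get_by_flags_rcu() comes from this patch, the refcount calls are the existing tracker API:

/* Sketch: find the first device that is up and not loopback. */
static struct net_device *example_find_up_dev(struct net *net,
                                              netdevice_tracker *tracker)
{
        struct net_device *dev;

        rcu_read_lock();
        dev = netdev_get_by_flags_rcu(net, tracker, IFF_UP,
                                      IFF_UP | IFF_LOOPBACK);
        rcu_read_unlock();

        /* caller releases with netdev_put(dev, tracker) when done */
        return dev;
}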
/**
* dev_valid_name - check if name is okay for network device
@@ -1198,33 +1440,18 @@ static int dev_get_valid_name(struct net *net, struct net_device *dev,
return ret < 0 ? ret : 0;
}
-/**
- * dev_change_name - change name of a device
- * @dev: device
- * @newname: name (or format string) must be at least IFNAMSIZ
- *
- * Change name of a device, can pass format strings "eth%d".
- * for wildcarding.
- */
-int dev_change_name(struct net_device *dev, const char *newname)
+int netif_change_name(struct net_device *dev, const char *newname)
{
+ struct net *net = dev_net(dev);
unsigned char old_assign_type;
char oldname[IFNAMSIZ];
int err = 0;
int ret;
- struct net *net;
-
- ASSERT_RTNL();
- BUG_ON(!dev_net(dev));
-
- net = dev_net(dev);
- down_write(&devnet_rename_sem);
+ ASSERT_RTNL_NET(net);
- if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
- up_write(&devnet_rename_sem);
+ if (!strncmp(newname, dev->name, IFNAMSIZ))
return 0;
- }
memcpy(oldname, dev->name, IFNAMSIZ);
@@ -1232,10 +1459,8 @@ int dev_change_name(struct net_device *dev, const char *newname)
err = dev_get_valid_name(net, dev, newname);
write_sequnlock_bh(&netdev_rename_lock);
- if (err < 0) {
- up_write(&devnet_rename_sem);
+ if (err < 0)
return err;
- }
if (oldname[0] && !strchr(oldname, '%'))
netdev_info(dev, "renamed from %s%s\n", oldname,
@@ -1247,14 +1472,13 @@ int dev_change_name(struct net_device *dev, const char *newname)
rollback:
ret = device_rename(&dev->dev, dev->name);
if (ret) {
+ write_seqlock_bh(&netdev_rename_lock);
memcpy(dev->name, oldname, IFNAMSIZ);
+ write_sequnlock_bh(&netdev_rename_lock);
WRITE_ONCE(dev->name_assign_type, old_assign_type);
- up_write(&devnet_rename_sem);
return ret;
}
- up_write(&devnet_rename_sem);
-
netdev_adjacent_rename_links(dev, oldname);
netdev_name_node_del(dev->name_node);
@@ -1270,7 +1494,6 @@ rollback:
/* err >= 0 after dev_alloc_name() or stores the first errno */
if (err >= 0) {
err = ret;
- down_write(&devnet_rename_sem);
write_seqlock_bh(&netdev_rename_lock);
memcpy(dev->name, oldname, IFNAMSIZ);
write_sequnlock_bh(&netdev_rename_lock);
@@ -1287,15 +1510,7 @@ rollback:
return err;
}
-/**
- * dev_set_alias - change ifalias of a device
- * @dev: device
- * @alias: name up to IFALIASZ
- * @len: limit of bytes to copy from info
- *
- * Set ifalias for a device,
- */
-int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
+int netif_set_alias(struct net_device *dev, const char *alias, size_t len)
{
struct dev_ifalias *new_alias = NULL;
@@ -1321,7 +1536,6 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
return len;
}
-EXPORT_SYMBOL(dev_set_alias);
/**
* dev_get_alias - get ifalias of a device
@@ -1358,16 +1572,10 @@ void netdev_features_change(struct net_device *dev)
}
EXPORT_SYMBOL(netdev_features_change);
-/**
- * netdev_state_change - device changes state
- * @dev: device to cause notification
- *
- * Called to indicate a device has changed state. This function calls
- * the notifier chains for netdev_chain and sends a NEWLINK message
- * to the routing socket.
- */
-void netdev_state_change(struct net_device *dev)
+void netif_state_change(struct net_device *dev)
{
+ netdev_ops_assert_locked_or_invisible(dev);
+
if (dev->flags & IFF_UP) {
struct netdev_notifier_change_info change_info = {
.info.dev = dev,
@@ -1378,7 +1586,6 @@ void netdev_state_change(struct net_device *dev)
rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL);
}
}
-EXPORT_SYMBOL(netdev_state_change);
/**
* __netdev_notify_peers - notify network peers about existence of @dev,
@@ -1467,6 +1674,8 @@ static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
set_bit(__LINK_STATE_START, &dev->state);
+ netdev_ops_assert_locked(dev);
+
if (ops->ndo_validate_addr)
ret = ops->ndo_validate_addr(dev);
@@ -1478,7 +1687,7 @@ static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
if (ret)
clear_bit(__LINK_STATE_START, &dev->state);
else {
- dev->flags |= IFF_UP;
+ netif_set_up(dev, true);
dev_set_rx_mode(dev);
dev_activate(dev);
add_device_randomness(dev->dev_addr, dev->addr_len);
@@ -1487,20 +1696,7 @@ static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
return ret;
}
-/**
- * dev_open - prepare an interface for use.
- * @dev: device to open
- * @extack: netlink extended ack
- *
- * Takes a device from down to up state. The device's private open
- * function is invoked and then the multicast lists are loaded. Finally
- * the device is moved into the up state and a %NETDEV_UP message is
- * sent to the netdev notifier chain.
- *
- * Calling this function on an active interface is a nop. On a failure
- * a negative errno code is returned.
- */
-int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
+int netif_open(struct net_device *dev, struct netlink_ext_ack *extack)
{
int ret;
@@ -1516,7 +1712,6 @@ int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
return ret;
}
-EXPORT_SYMBOL(dev_open);
static void __dev_close_many(struct list_head *head)
{
@@ -1554,10 +1749,13 @@ static void __dev_close_many(struct list_head *head)
* We allow it to be called even after a DETACH hot-plug
* event.
*/
+
+ netdev_ops_assert_locked(dev);
+
if (ops->ndo_stop)
ops->ndo_stop(dev);
- dev->flags &= ~IFF_UP;
+ netif_set_up(dev, false);
netpoll_poll_enable(dev);
}
}
@@ -1571,7 +1769,7 @@ static void __dev_close(struct net_device *dev)
list_del(&single);
}
-void dev_close_many(struct list_head *head, bool unlink)
+void netif_close_many(struct list_head *head, bool unlink)
{
struct net_device *dev, *tmp;
@@ -1589,39 +1787,21 @@ void dev_close_many(struct list_head *head, bool unlink)
list_del_init(&dev->close_list);
}
}
-EXPORT_SYMBOL(dev_close_many);
+EXPORT_SYMBOL_NS_GPL(netif_close_many, "NETDEV_INTERNAL");
-/**
- * dev_close - shutdown an interface.
- * @dev: device to shutdown
- *
- * This function moves an active device into down state. A
- * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
- * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
- * chain.
- */
-void dev_close(struct net_device *dev)
+void netif_close(struct net_device *dev)
{
if (dev->flags & IFF_UP) {
LIST_HEAD(single);
list_add(&dev->close_list, &single);
- dev_close_many(&single, true);
+ netif_close_many(&single, true);
list_del(&single);
}
}
-EXPORT_SYMBOL(dev_close);
-
+EXPORT_SYMBOL(netif_close);
-/**
- * dev_disable_lro - disable Large Receive Offload on a device
- * @dev: device
- *
- * Disable Large Receive Offload (LRO) on a net device. Must be
- * called under RTNL. This is needed if received packets may be
- * forwarded to another interface.
- */
-void dev_disable_lro(struct net_device *dev)
+void netif_disable_lro(struct net_device *dev)
{
struct net_device *lower_dev;
struct list_head *iter;
@@ -1632,10 +1812,13 @@ void dev_disable_lro(struct net_device *dev)
if (unlikely(dev->features & NETIF_F_LRO))
netdev_WARN(dev, "failed to disable LRO!\n");
- netdev_for_each_lower_dev(dev, lower_dev, iter)
- dev_disable_lro(lower_dev);
+ netdev_for_each_lower_dev(dev, lower_dev, iter) {
+ netdev_lock_ops(lower_dev);
+ netif_disable_lro(lower_dev);
+ netdev_unlock_ops(lower_dev);
+ }
}
-EXPORT_SYMBOL(dev_disable_lro);
+EXPORT_IPV6_MOD(netif_disable_lro);
/**
* dev_disable_gro_hw - disable HW Generic Receive Offload on a device
@@ -1723,7 +1906,9 @@ static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
int err;
for_each_netdev(net, dev) {
+ netdev_lock_ops(dev);
err = call_netdevice_register_notifiers(nb, dev);
+ netdev_unlock_ops(dev);
if (err)
goto rollback;
}
@@ -1767,14 +1952,19 @@ int register_netdevice_notifier(struct notifier_block *nb)
/* Close race with setup_net() and cleanup_net() */
down_write(&pernet_ops_rwsem);
+
+ /* When RTNL is removed, we need protection for netdev_chain. */
rtnl_lock();
+
err = raw_notifier_chain_register(&netdev_chain, nb);
if (err)
goto unlock;
if (dev_boot_phase)
goto unlock;
for_each_net(net) {
+ __rtnl_net_lock(net);
err = call_netdevice_register_net_notifiers(nb, net);
+ __rtnl_net_unlock(net);
if (err)
goto rollback;
}
@@ -1785,8 +1975,11 @@ unlock:
return err;
rollback:
- for_each_net_continue_reverse(net)
+ for_each_net_continue_reverse(net) {
+ __rtnl_net_lock(net);
call_netdevice_unregister_net_notifiers(nb, net);
+ __rtnl_net_unlock(net);
+ }
raw_notifier_chain_unregister(&netdev_chain, nb);
goto unlock;
@@ -1819,8 +2012,11 @@ int unregister_netdevice_notifier(struct notifier_block *nb)
if (err)
goto unlock;
- for_each_net(net)
+ for_each_net(net) {
+ __rtnl_net_lock(net);
call_netdevice_unregister_net_notifiers(nb, net);
+ __rtnl_net_unlock(net);
+ }
unlock:
rtnl_unlock();
@@ -1884,9 +2080,10 @@ int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
{
int err;
- rtnl_lock();
+ rtnl_net_lock(net);
err = __register_netdevice_notifier_net(net, nb, false);
- rtnl_unlock();
+ rtnl_net_unlock(net);
+
return err;
}
EXPORT_SYMBOL(register_netdevice_notifier_net);
@@ -1912,9 +2109,10 @@ int unregister_netdevice_notifier_net(struct net *net,
{
int err;
- rtnl_lock();
+ rtnl_net_lock(net);
err = __unregister_netdevice_notifier_net(net, nb);
- rtnl_unlock();
+ rtnl_net_unlock(net);
+
return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier_net);
@@ -1927,19 +2125,56 @@ static void __move_netdevice_notifier_net(struct net *src_net,
__register_netdevice_notifier_net(dst_net, nb, true);
}
+static void rtnl_net_dev_lock(struct net_device *dev)
+{
+ bool again;
+
+ do {
+ struct net *net;
+
+ again = false;
+
+ /* netns might be being dismantled. */
+ rcu_read_lock();
+ net = dev_net_rcu(dev);
+ net_passive_inc(net);
+ rcu_read_unlock();
+
+ rtnl_net_lock(net);
+
+#ifdef CONFIG_NET_NS
+ /* dev might have been moved to another netns. */
+ if (!net_eq(net, rcu_access_pointer(dev->nd_net.net))) {
+ rtnl_net_unlock(net);
+ net_passive_dec(net);
+ again = true;
+ }
+#endif
+ } while (again);
+}
+
+static void rtnl_net_dev_unlock(struct net_device *dev)
+{
+ struct net *net = dev_net(dev);
+
+ rtnl_net_unlock(net);
+ net_passive_dec(net);
+}
+
int register_netdevice_notifier_dev_net(struct net_device *dev,
struct notifier_block *nb,
struct netdev_net_notifier *nn)
{
int err;
- rtnl_lock();
+ rtnl_net_dev_lock(dev);
err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
if (!err) {
nn->nb = nb;
list_add(&nn->list, &dev->net_notifier_list);
}
- rtnl_unlock();
+ rtnl_net_dev_unlock(dev);
+
return err;
}
EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
@@ -1950,10 +2185,11 @@ int unregister_netdevice_notifier_dev_net(struct net_device *dev,
{
int err;
- rtnl_lock();
+ rtnl_net_dev_lock(dev);
list_del(&nn->list);
err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
- rtnl_unlock();
+ rtnl_net_dev_unlock(dev);
+
return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
@@ -2102,8 +2338,8 @@ EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
#ifdef CONFIG_NET_CLS_ACT
-DEFINE_STATIC_KEY_FALSE(tcf_bypass_check_needed_key);
-EXPORT_SYMBOL(tcf_bypass_check_needed_key);
+DEFINE_STATIC_KEY_FALSE(tcf_sw_enabled_key);
+EXPORT_SYMBOL(tcf_sw_enabled_key);
#endif
DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
@@ -2227,9 +2463,9 @@ int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb)
return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb);
}
-static inline int deliver_skb(struct sk_buff *skb,
- struct packet_type *pt_prev,
- struct net_device *orig_dev)
+static int deliver_skb(struct sk_buff *skb,
+ struct packet_type *pt_prev,
+ struct net_device *orig_dev)
{
if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
return -ENOMEM;
@@ -2248,7 +2484,7 @@ static inline void deliver_ptype_list_skb(struct sk_buff *skb,
list_for_each_entry_rcu(ptype, ptype_list, list) {
if (ptype->type != type)
continue;
- if (pt_prev)
+ if (unlikely(pt_prev))
deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
@@ -2269,16 +2505,21 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
}
/**
- * dev_nit_active - return true if any network interface taps are in use
+ * dev_nit_active_rcu - return true if any network interface taps are in use
+ *
+ * The caller must hold the RCU lock
*
* @dev: network device to check for the presence of taps
*/
-bool dev_nit_active(struct net_device *dev)
+bool dev_nit_active_rcu(const struct net_device *dev)
{
- return !list_empty(&net_hotdata.ptype_all) ||
+ /* Callers may hold either RCU or RCU BH lock */
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+
+ return !list_empty(&dev_net(dev)->ptype_all) ||
!list_empty(&dev->ptype_all);
}
-EXPORT_SYMBOL_GPL(dev_nit_active);
+EXPORT_SYMBOL_GPL(dev_nit_active_rcu);
/*
* Support routine. Sends outgoing frames to any network
@@ -2287,11 +2528,12 @@ EXPORT_SYMBOL_GPL(dev_nit_active);
void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
- struct list_head *ptype_list = &net_hotdata.ptype_all;
struct packet_type *ptype, *pt_prev = NULL;
+ struct list_head *ptype_list;
struct sk_buff *skb2 = NULL;
rcu_read_lock();
+ ptype_list = &dev_net_rcu(dev)->ptype_all;
again:
list_for_each_entry_rcu(ptype, ptype_list, list) {
if (READ_ONCE(ptype->ignore_outgoing))
@@ -2303,7 +2545,7 @@ again:
if (skb_loop_sk(ptype, skb))
continue;
- if (pt_prev) {
+ if (unlikely(pt_prev)) {
deliver_skb(skb2, pt_prev, skb->dev);
pt_prev = ptype;
continue;
@@ -2335,7 +2577,7 @@ again:
pt_prev = ptype;
}
- if (ptype_list == &net_hotdata.ptype_all) {
+ if (ptype_list != &dev->ptype_all) {
ptype_list = &dev->ptype_all;
goto again;
}
@@ -2937,7 +3179,7 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
if (dev->reg_state == NETREG_REGISTERED ||
dev->reg_state == NETREG_UNREGISTERING) {
- ASSERT_RTNL();
+ netdev_ops_assert_locked(dev);
rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
txq);
@@ -2947,6 +3189,8 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
if (dev->num_tc)
netif_setup_tc(dev, txq);
+ net_shaper_set_real_num_tx_queues(dev, txq);
+
dev_qdisc_change_real_num_tx(dev, txq);
dev->real_num_tx_queues = txq;
@@ -2966,7 +3210,6 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
}
EXPORT_SYMBOL(netif_set_real_num_tx_queues);
-#ifdef CONFIG_SYSFS
/**
* netif_set_real_num_rx_queues - set actual number of RX queues used
* @dev: Network device
@@ -2985,7 +3228,7 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
return -EINVAL;
if (dev->reg_state == NETREG_REGISTERED) {
- ASSERT_RTNL();
+ netdev_ops_assert_locked(dev);
rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
rxq);
@@ -2997,7 +3240,6 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
-#endif
/**
* netif_set_real_num_queues - set actual number of RX and TX queues used
@@ -3132,6 +3374,13 @@ static void __netif_reschedule(struct Qdisc *q)
void __netif_schedule(struct Qdisc *q)
{
+ /* If q->defer_list is not empty, at least one thread is
+ * in __dev_xmit_skb() before llist_del_all(&q->defer_list).
+ * This thread will attempt to run the queue.
+ */
+ if (!llist_empty(&q->defer_list))
+ return;
+
if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
__netif_reschedule(q);
}
@@ -3229,7 +3478,7 @@ void netif_device_attach(struct net_device *dev)
if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
netif_running(dev)) {
netif_tx_wake_all_queues(dev);
- __netdev_watchdog_up(dev);
+ netdev_watchdog_up(dev);
}
}
EXPORT_SYMBOL(netif_device_attach);
@@ -3310,6 +3559,10 @@ int skb_checksum_help(struct sk_buff *skb)
return -EINVAL;
}
+ if (!skb_frags_readable(skb)) {
+ return -EFAULT;
+ }
+
/* Before computing a checksum, we should make sure no frag could
* be modified by an external entity : checksum could be wrong.
*/
@@ -3348,9 +3601,10 @@ out:
}
EXPORT_SYMBOL(skb_checksum_help);
+#ifdef CONFIG_NET_CRC32C
int skb_crc32c_csum_help(struct sk_buff *skb)
{
- __le32 crc32c_csum;
+ u32 crc;
int ret = 0, offset, start;
if (skb->ip_summed != CHECKSUM_PARTIAL)
@@ -3378,14 +3632,14 @@ int skb_crc32c_csum_help(struct sk_buff *skb)
if (ret)
goto out;
- crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
- skb->len - start, ~(__u32)0,
- crc32c_csum_stub));
- *(__le32 *)(skb->data + offset) = crc32c_csum;
+ crc = ~skb_crc32c(skb, start, skb->len - start, ~0);
+ *(__le32 *)(skb->data + offset) = cpu_to_le32(crc);
skb_reset_csum_not_inet(skb);
out:
return ret;
}
+EXPORT_SYMBOL(skb_crc32c_csum_help);
+#endif /* CONFIG_NET_CRC32C */
__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
{
@@ -3431,8 +3685,9 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
if (!(dev->features & NETIF_F_HIGHDMA)) {
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ struct page *page = skb_frag_page(frag);
- if (PageHighMem(skb_frag_page(frag)))
+ if (page && PageHighMem(page))
return 1;
}
}
@@ -3504,7 +3759,7 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb,
if (gso_segs > READ_ONCE(dev->gso_max_segs))
return features & ~NETIF_F_GSO_MASK;
- if (unlikely(skb->len >= READ_ONCE(dev->gso_max_size)))
+ if (unlikely(skb->len >= netif_get_gso_max_size(dev, skb)))
return features & ~NETIF_F_GSO_MASK;
if (!skb_shinfo(skb)->gso_type) {
@@ -3521,8 +3776,14 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb,
if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
features &= ~dev->gso_partial_features;
- /* Make sure to clear the IPv4 ID mangling feature if the
- * IPv4 header has the potential to be fragmented.
+ /* Make sure to clear the IPv4 ID mangling feature if the IPv4 header
+ * has the potential to be fragmented so that TSO does not generate
+ * segments with the same ID. For encapsulated packets, the ID mangling
+ * feature is guaranteed not to use the same ID for the outer IPv4
+ * headers of the generated segments if the headers have the potential
+ * to be fragmented, so there is no need to clear the IPv4 ID mangling
+ * feature (see the section about NETIF_F_TSO_MANGLEID in
+ * segmentation-offloads.rst).
*/
if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
struct iphdr *iph = skb->encapsulation ?
@@ -3532,6 +3793,18 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb,
features &= ~NETIF_F_TSO_MANGLEID;
}
+ /* NETIF_F_IPV6_CSUM does not support IPv6 extension headers,
+ * so neither does TSO that depends on it.
+ */
+ if (features & NETIF_F_IPV6_CSUM &&
+ (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6 ||
+ (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 &&
+ vlan_get_protocol(skb) == htons(ETH_P_IPV6))) &&
+ skb_transport_header_was_set(skb) &&
+ skb_network_header_len(skb) != sizeof(struct ipv6hdr) &&
+ !ipv6_has_hopopt_jumbo(skb))
+ features &= ~(NETIF_F_IPV6_CSUM | NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4);
+
return features;
}
@@ -3572,7 +3845,7 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev,
unsigned int len;
int rc;
- if (dev_nit_active(dev))
+ if (dev_nit_active_rcu(dev))
dev_queue_xmit_nit(skb, dev);
len = skb->len;
@@ -3631,6 +3904,11 @@ int skb_csum_hwoffload_help(struct sk_buff *skb,
return 0;
if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) {
+ if (vlan_get_protocol(skb) == htons(ETH_P_IPV6) &&
+ skb_network_header_len(skb) != sizeof(struct ipv6hdr) &&
+ !ipv6_has_hopopt_jumbo(skb))
+ goto sw_checksum;
+
switch (skb->csum_offset) {
case offsetof(struct tcphdr, check):
case offsetof(struct udphdr, check):
@@ -3638,14 +3916,80 @@ int skb_csum_hwoffload_help(struct sk_buff *skb,
}
}
+sw_checksum:
return skb_checksum_help(skb);
}
EXPORT_SYMBOL(skb_csum_hwoffload_help);
+/* Checks if this SKB belongs to an HW offloaded socket
+ * and whether any SW fallbacks are required based on dev.
+ * Check decrypted mark in case skb_orphan() cleared socket.
+ */
+static struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb,
+ struct net_device *dev)
+{
+#ifdef CONFIG_SOCK_VALIDATE_XMIT
+ struct sk_buff *(*sk_validate)(struct sock *sk, struct net_device *dev,
+ struct sk_buff *skb);
+ struct sock *sk = skb->sk;
+
+ sk_validate = NULL;
+ if (sk) {
+ if (sk_fullsock(sk))
+ sk_validate = sk->sk_validate_xmit_skb;
+ else if (sk_is_inet(sk) && sk->sk_state == TCP_TIME_WAIT)
+ sk_validate = inet_twsk(sk)->tw_validate_xmit_skb;
+ }
+
+ if (sk_validate) {
+ skb = sk_validate(sk, dev, skb);
+ } else if (unlikely(skb_is_decrypted(skb))) {
+ pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n");
+ kfree_skb(skb);
+ skb = NULL;
+ }
+#endif
+
+ return skb;
+}
+
+static struct sk_buff *validate_xmit_unreadable_skb(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct skb_shared_info *shinfo;
+ struct net_iov *niov;
+
+ if (likely(skb_frags_readable(skb)))
+ goto out;
+
+ if (!dev->netmem_tx)
+ goto out_free;
+
+ shinfo = skb_shinfo(skb);
+
+ if (shinfo->nr_frags > 0) {
+ niov = netmem_to_net_iov(skb_frag_netmem(&shinfo->frags[0]));
+ if (net_is_devmem_iov(niov) &&
+ net_devmem_iov_binding(niov)->dev != dev)
+ goto out_free;
+ }
+
+out:
+ return skb;
+
+out_free:
+ kfree_skb(skb);
+ return NULL;
+}
+
static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
{
netdev_features_t features;
+ skb = validate_xmit_unreadable_skb(skb, dev);
+ if (unlikely(!skb))
+ goto out_null;
+
features = netif_skb_features(skb);
skb = validate_xmit_vlan(skb, features);
if (unlikely(!skb))
@@ -3705,7 +4049,7 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d
next = skb->next;
skb_mark_not_on_list(skb);
- /* in case skb wont be segmented, point to itself */
+ /* in case skb won't be segmented, point to itself */
skb->prev = skb;
skb = validate_xmit_skb(skb, dev, again);
@@ -3725,21 +4069,30 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d
}
EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
-static void qdisc_pkt_len_init(struct sk_buff *skb)
+static void qdisc_pkt_len_segs_init(struct sk_buff *skb)
{
- const struct skb_shared_info *shinfo = skb_shinfo(skb);
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+ u16 gso_segs;
qdisc_skb_cb(skb)->pkt_len = skb->len;
+ if (!shinfo->gso_size) {
+ qdisc_skb_cb(skb)->pkt_segs = 1;
+ return;
+ }
+
+ qdisc_skb_cb(skb)->pkt_segs = gso_segs = shinfo->gso_segs;
/* To get more precise estimation of bytes sent on wire,
* we add to pkt_len the headers size of all segments
*/
- if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
- u16 gso_segs = shinfo->gso_segs;
+ if (skb_transport_header_was_set(skb)) {
unsigned int hdr_len;
/* mac layer + network layer */
- hdr_len = skb_transport_offset(skb);
+ if (!skb->encapsulation)
+ hdr_len = skb_transport_offset(skb);
+ else
+ hdr_len = skb_inner_transport_offset(skb);
/* + transport layer */
if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
@@ -3750,7 +4103,7 @@ static void qdisc_pkt_len_init(struct sk_buff *skb)
sizeof(_tcphdr), &_tcphdr);
if (likely(th))
hdr_len += __tcp_hdrlen(th);
- } else {
+ } else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
struct udphdr _udphdr;
if (skb_header_pointer(skb, hdr_len,
@@ -3758,10 +4111,16 @@ static void qdisc_pkt_len_init(struct sk_buff *skb)
hdr_len += sizeof(struct udphdr);
}
- if (shinfo->gso_type & SKB_GSO_DODGY)
- gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
- shinfo->gso_size);
+ if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) {
+ int payload = skb->len - hdr_len;
+ /* Malicious packet. */
+ if (payload <= 0)
+ return;
+ gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size);
+ shinfo->gso_segs = gso_segs;
+ qdisc_skb_cb(skb)->pkt_segs = gso_segs;
+ }
qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
}
}
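
To make the header accounting above concrete (numbers chosen for illustration, not taken from the patch): a TSO skb with gso_size = 1448, gso_segs = 10 and hdr_len = 54 (Ethernet + IPv4 + TCP) has skb->len = 54 + 10 * 1448 = 14534, while the ten generated segments actually put 10 * (54 + 1448) = 15020 bytes on the wire; adding (10 - 1) * 54 = 486 to pkt_len closes exactly that gap.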
@@ -3782,9 +4141,10 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
struct net_device *dev,
struct netdev_queue *txq)
{
+ struct sk_buff *next, *to_free = NULL, *to_free2 = NULL;
spinlock_t *root_lock = qdisc_lock(q);
- struct sk_buff *to_free = NULL;
- bool contended;
+ struct llist_node *ll_list, *first_n;
+ unsigned long defer_count = 0;
int rc;
qdisc_calculate_pkt_len(skb, q);
@@ -3800,9 +4160,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
if (unlikely(!nolock_qdisc_is_empty(q))) {
rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
__qdisc_run(q);
- qdisc_run_end(q);
+ to_free2 = qdisc_run_end(q);
- goto no_lock_out;
+ goto free_skbs;
}
qdisc_bstats_cpu_update(q, skb);
@@ -3810,81 +4170,93 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
!nolock_qdisc_is_empty(q))
__qdisc_run(q);
- qdisc_run_end(q);
- return NET_XMIT_SUCCESS;
+ to_free2 = qdisc_run_end(q);
+ rc = NET_XMIT_SUCCESS;
+ goto free_skbs;
}
rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
- qdisc_run(q);
-
-no_lock_out:
- if (unlikely(to_free))
- kfree_skb_list_reason(to_free,
- tcf_get_drop_reason(to_free));
- return rc;
+ to_free2 = qdisc_run(q);
+ goto free_skbs;
}
- if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) {
- kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP);
- return NET_XMIT_DROP;
- }
- /*
- * Heuristic to force contended enqueues to serialize on a
- * separate lock before trying to get qdisc main lock.
- * This permits qdisc->running owner to get the lock more
- * often and dequeue packets faster.
- * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit
- * and then other tasks will only enqueue packets. The packets will be
- * sent after the qdisc owner is scheduled again. To prevent this
- * scenario the task always serialize on the lock.
+ /* Open code llist_add(&skb->ll_node, &q->defer_list) + queue limit.
+ * In the try_cmpxchg() loop, we want to increment q->defer_count
+ * at most once to limit the number of skbs in defer_list.
+ * We perform the defer_count increment only if the list is not empty,
+ * because some arches have slow atomic_long_inc_return().
*/
- contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT);
- if (unlikely(contended))
- spin_lock(&q->busylock);
+ first_n = READ_ONCE(q->defer_list.first);
+ do {
+ if (first_n && !defer_count) {
+ defer_count = atomic_long_inc_return(&q->defer_count);
+ if (unlikely(defer_count > READ_ONCE(q->limit))) {
+ kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_DROP);
+ return NET_XMIT_DROP;
+ }
+ }
+ skb->ll_node.next = first_n;
+ } while (!try_cmpxchg(&q->defer_list.first, &first_n, &skb->ll_node));
+
+ /* If defer_list was not empty, we know the cpu which queued
+ * the first skb will process the whole list for us.
+ */
+ if (first_n)
+ return NET_XMIT_SUCCESS;
spin_lock(root_lock);
+
+ ll_list = llist_del_all(&q->defer_list);
+ /* There is a small race because we clear defer_count not atomically
+ * with the prior llist_del_all(). This means defer_list could grow
+ * over q->limit.
+ */
+ atomic_long_set(&q->defer_count, 0);
+
+ ll_list = llist_reverse_order(ll_list);
+
if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
- __qdisc_drop(skb, &to_free);
+ llist_for_each_entry_safe(skb, next, ll_list, ll_node)
+ __qdisc_drop(skb, &to_free);
rc = NET_XMIT_DROP;
- } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
- qdisc_run_begin(q)) {
+ goto unlock;
+ }
+ if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
+ !llist_next(ll_list) && qdisc_run_begin(q)) {
/*
* This is a work-conserving queue; there are no old skbs
* waiting to be sent out; and the qdisc is not running -
* xmit the skb directly.
*/
+ DEBUG_NET_WARN_ON_ONCE(skb != llist_entry(ll_list,
+ struct sk_buff,
+ ll_node));
qdisc_bstats_update(q, skb);
-
- if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
- if (unlikely(contended)) {
- spin_unlock(&q->busylock);
- contended = false;
- }
+ if (sch_direct_xmit(skb, q, dev, txq, root_lock, true))
__qdisc_run(q);
- }
-
- qdisc_run_end(q);
+ to_free2 = qdisc_run_end(q);
rc = NET_XMIT_SUCCESS;
} else {
- WRITE_ONCE(q->owner, smp_processor_id());
- rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
- WRITE_ONCE(q->owner, -1);
- if (qdisc_run_begin(q)) {
- if (unlikely(contended)) {
- spin_unlock(&q->busylock);
- contended = false;
- }
- __qdisc_run(q);
- qdisc_run_end(q);
+ int count = 0;
+
+ llist_for_each_entry_safe(skb, next, ll_list, ll_node) {
+ prefetch(next);
+ prefetch(&next->priority);
+ skb_mark_not_on_list(skb);
+ rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
+ count++;
}
+ to_free2 = qdisc_run(q);
+ if (count != 1)
+ rc = NET_XMIT_SUCCESS;
}
+unlock:
spin_unlock(root_lock);
- if (unlikely(to_free))
- kfree_skb_list_reason(to_free,
- tcf_get_drop_reason(to_free));
- if (unlikely(contended))
- spin_unlock(&q->busylock);
+
+free_skbs:
+ tcf_kfree_skb_list(to_free);
+ tcf_kfree_skb_list(to_free2);
return rc;
}
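
The defer_list logic above boils down to a lock-free hand-off: whichever CPU pushes onto an empty llist becomes responsible for draining the whole batch under the root lock. A distilled, generic sketch of that pattern (not the kernel code itself, and without the defer_count limiting shown above):

/* Generic hand-off: the producer that finds the list empty drains it.
 * Needs <linux/llist.h> and <linux/spinlock.h>.
 */
static void enqueue_or_drain(struct llist_head *list, struct llist_node *node,
                             spinlock_t *lock)
{
        struct llist_node *batch, *pos, *next;

        /* llist_add() returns true when the list was empty before the
         * add, i.e. this caller queued the first node and must flush.
         */
        if (!llist_add(node, list))
                return;

        spin_lock(lock);
        batch = llist_reverse_order(llist_del_all(list));   /* FIFO order */
        llist_for_each_safe(pos, next, batch) {
                /* each deferred node would be handled here */
        }
        spin_unlock(lock);
}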
@@ -3980,13 +4352,16 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
if (!miniq)
return ret;
- if (static_branch_unlikely(&tcf_bypass_check_needed_key)) {
- if (tcf_block_bypass_sw(miniq->block))
- return ret;
- }
+ /* Global bypass */
+ if (!static_branch_likely(&tcf_sw_enabled_key))
+ return ret;
+
+ /* Block-wise bypass */
+ if (tcf_block_bypass_sw(miniq->block))
+ return ret;
tc_skb_cb(skb)->mru = 0;
- tc_skb_cb(skb)->post_ct = false;
+ qdisc_skb_cb(skb)->post_ct = false;
tcf_set_drop_reason(skb, *drop_reason);
mini_qdisc_bstats_cpu_update(miniq, skb);
@@ -4052,12 +4427,12 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
return skb;
bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
- if (*pt_prev) {
+ if (unlikely(*pt_prev)) {
*ret = deliver_skb(skb, *pt_prev, orig_dev);
*pt_prev = NULL;
}
- qdisc_skb_cb(skb)->pkt_len = skb->len;
+ qdisc_pkt_len_segs_init(skb);
tcx_set_ingress(skb, true);
if (static_branch_unlikely(&tcx_needed_key)) {
@@ -4245,12 +4620,31 @@ u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
}
EXPORT_SYMBOL(dev_pick_tx_zero);
-u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
- struct net_device *sb_dev)
+int sk_tx_queue_get(const struct sock *sk)
{
- return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
+ int resel, val;
+
+ if (!sk)
+ return -1;
+ /* Paired with WRITE_ONCE() in sk_tx_queue_clear()
+ * and sk_tx_queue_set().
+ */
+ val = READ_ONCE(sk->sk_tx_queue_mapping);
+
+ if (val == NO_QUEUE_MAPPING)
+ return -1;
+
+ if (!sk_fullsock(sk))
+ return val;
+
+ resel = READ_ONCE(sock_net(sk)->core.sysctl_txq_reselection);
+ if (resel && time_is_before_jiffies(
+ READ_ONCE(sk->sk_tx_queue_mapping_jiffies) + resel))
+ return -1;
+
+ return val;
}
-EXPORT_SYMBOL(dev_pick_tx_cpu_id);
+EXPORT_SYMBOL(sk_tx_queue_get);
u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
struct net_device *sb_dev)
@@ -4267,8 +4661,7 @@ u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
if (new_index < 0)
new_index = skb_tx_hash(dev, sb_dev, skb);
- if (queue_index != new_index && sk &&
- sk_fullsock(sk) &&
+ if (sk && sk_fullsock(sk) &&
rcu_access_pointer(sk->sk_dst_cache))
sk_tx_queue_set(sk, new_index);
@@ -4339,7 +4732,8 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
skb_reset_mac_header(skb);
skb_assert_len(skb);
- if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
+ if (unlikely(skb_shinfo(skb)->tx_flags &
+ (SKBTX_SCHED_TSTAMP | SKBTX_BPF)))
__skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
/* Disable soft irqs for various locks below. Also
@@ -4349,7 +4743,7 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
skb_update_prio(skb);
- qdisc_pkt_len_init(skb);
+ qdisc_pkt_len_segs_init(skb);
tcx_set_ingress(skb, false);
#ifdef CONFIG_NET_EGRESS
if (static_branch_unlikely(&egress_needed_key)) {
@@ -4508,7 +4902,7 @@ static inline void ____napi_schedule(struct softnet_data *sd,
if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
/* Paired with smp_mb__before_atomic() in
- * napi_enable()/dev_set_threaded().
+ * napi_enable()/netif_set_threaded().
* Use READ_ONCE() to guarantee a complete
* read on napi->thread. Only call
* wake_up_process() when it's not NULL.
@@ -4525,13 +4919,14 @@ static inline void ____napi_schedule(struct softnet_data *sd,
}
use_local_napi:
+ DEBUG_NET_WARN_ON_ONCE(!list_empty(&napi->poll_list));
list_add_tail(&napi->poll_list, &sd->poll_list);
WRITE_ONCE(napi->list_owner, smp_processor_id());
/* If not called from net_rx_action()
* we have to raise NET_RX_SOFTIRQ.
*/
if (!sd->in_net_rx_action)
- __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
#ifdef CONFIG_RPS
@@ -4541,9 +4936,45 @@ EXPORT_SYMBOL(rps_needed);
struct static_key_false rfs_needed __read_mostly;
EXPORT_SYMBOL(rfs_needed);
+static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table)
+{
+ return hash_32(hash, flow_table->log);
+}
+
+#ifdef CONFIG_RFS_ACCEL
+/**
+ * rps_flow_is_active - check whether the flow is recently active.
+ * @rflow: Specific flow to check activity.
+ * @flow_table: per-queue flowtable that @rflow belongs to.
+ * @cpu: CPU saved in @rflow.
+ *
+ * If the CPU has processed many packets since the flow's last activity
+ * (beyond 10 times the table size), the flow is considered stale.
+ *
+ * Return: true if flow was recently active.
+ */
+static bool rps_flow_is_active(struct rps_dev_flow *rflow,
+ struct rps_dev_flow_table *flow_table,
+ unsigned int cpu)
+{
+ unsigned int flow_last_active;
+ unsigned int sd_input_head;
+
+ if (cpu >= nr_cpu_ids)
+ return false;
+
+ sd_input_head = READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head);
+ flow_last_active = READ_ONCE(rflow->last_qtail);
+
+ return (int)(sd_input_head - flow_last_active) <
+ (int)(10 << flow_table->log);
+}
+#endif
+
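For scale (an illustrative value, not from the patch): with flow_table->log = 8, i.e. a 256-entry table, rps_flow_is_active() above treats a flow as stale once the remembered CPU's input_queue_head has advanced by at least 10 << 8 = 2560 packets past the flow's last_qtail, at which point set_rps_cpu() may reclaim the slot's hardware filter for a new flow.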
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
- struct rps_dev_flow *rflow, u16 next_cpu)
+ struct rps_dev_flow *rflow, u16 next_cpu, u32 hash,
+ u32 flow_id)
{
if (next_cpu < nr_cpu_ids) {
u32 head;
@@ -4551,8 +4982,9 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct netdev_rx_queue *rxqueue;
struct rps_dev_flow_table *flow_table;
struct rps_dev_flow *old_rflow;
+ struct rps_dev_flow *tmp_rflow;
+ unsigned int tmp_cpu;
u16 rxq_index;
- u32 flow_id;
int rc;
/* Should we steer this flow to a different hardware queue? */
@@ -4567,14 +4999,29 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
flow_table = rcu_dereference(rxqueue->rps_flow_table);
if (!flow_table)
goto out;
- flow_id = skb_get_hash(skb) & flow_table->mask;
+
+ tmp_rflow = &flow_table->flows[flow_id];
+ tmp_cpu = READ_ONCE(tmp_rflow->cpu);
+
+ if (READ_ONCE(tmp_rflow->filter) != RPS_NO_FILTER) {
+ if (rps_flow_is_active(tmp_rflow, flow_table,
+ tmp_cpu)) {
+ if (hash != READ_ONCE(tmp_rflow->hash) ||
+ next_cpu == tmp_cpu)
+ goto out;
+ }
+ }
+
rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
rxq_index, flow_id);
if (rc < 0)
goto out;
+
old_rflow = rflow;
- rflow = &flow_table->flows[flow_id];
+ rflow = tmp_rflow;
WRITE_ONCE(rflow->filter, rc);
+ WRITE_ONCE(rflow->hash, hash);
+
if (old_rflow->filter == rc)
WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER);
out:
@@ -4600,6 +5047,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct rps_dev_flow_table *flow_table;
struct rps_map *map;
int cpu = -1;
+ u32 flow_id;
u32 tcpu;
u32 hash;
@@ -4646,7 +5094,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
/* OK, now we know there is a match,
* we can look at the local (per receive queue) flow table
*/
- rflow = &flow_table->flows[hash & flow_table->mask];
+ flow_id = rfs_slot(hash, flow_table);
+ rflow = &flow_table->flows[flow_id];
tcpu = rflow->cpu;
/*
@@ -4665,7 +5114,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) -
rflow->last_qtail)) >= 0)) {
tcpu = next_cpu;
- rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
+ rflow = set_rps_cpu(dev, skb, rflow, next_cpu, hash,
+ flow_id);
}
if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
@@ -4709,17 +5159,16 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
struct rps_dev_flow_table *flow_table;
struct rps_dev_flow *rflow;
bool expire = true;
- unsigned int cpu;
rcu_read_lock();
flow_table = rcu_dereference(rxqueue->rps_flow_table);
- if (flow_table && flow_id <= flow_table->mask) {
+ if (flow_table && flow_id < (1UL << flow_table->log)) {
+ unsigned int cpu;
+
rflow = &flow_table->flows[flow_id];
cpu = READ_ONCE(rflow->cpu);
- if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids &&
- ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) -
- READ_ONCE(rflow->last_qtail)) <
- (int)(10 * flow_table->mask)))
+ if (READ_ONCE(rflow->filter) == filter_id &&
+ rps_flow_is_active(rflow, flow_table, cpu))
expire = false;
}
rcu_read_unlock();
@@ -4735,7 +5184,8 @@ static void rps_trigger_softirq(void *data)
struct softnet_data *sd = data;
____napi_schedule(sd, &sd->backlog);
- sd->received_rps++;
+ /* Pairs with READ_ONCE() in softnet_seq_show() */
+ WRITE_ONCE(sd->received_rps, sd->received_rps + 1);
}
#endif /* CONFIG_RPS */
@@ -4784,8 +5234,9 @@ static void napi_schedule_rps(struct softnet_data *sd)
__napi_schedule_irqoff(&mysd->backlog);
}
-void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu)
+void kick_defer_list_purge(unsigned int cpu)
{
+ struct softnet_data *sd = &per_cpu(softnet_data, cpu);
unsigned long flags;
if (use_backlog_threads()) {
@@ -4805,14 +5256,15 @@ void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu)
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif
-static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
+static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen,
+ int max_backlog)
{
#ifdef CONFIG_NET_FLOW_LIMIT
- struct sd_flow_limit *fl;
- struct softnet_data *sd;
unsigned int old_flow, new_flow;
+ const struct softnet_data *sd;
+ struct sd_flow_limit *fl;
- if (qlen < (READ_ONCE(net_hotdata.max_backlog) >> 1))
+ if (likely(qlen < (max_backlog >> 1)))
return false;
sd = this_cpu_ptr(&softnet_data);
@@ -4820,7 +5272,7 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
rcu_read_lock();
fl = rcu_dereference(sd->flow_limit);
if (fl) {
- new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
+ new_flow = hash_32(skb_get_hash(skb), fl->log_buckets);
old_flow = fl->history[fl->history_head];
fl->history[fl->history_head] = new_flow;
@@ -4831,7 +5283,8 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
fl->buckets[old_flow]--;
if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
- fl->count++;
+ /* Pairs with READ_ONCE() in softnet_seq_show() */
+ WRITE_ONCE(fl->count, fl->count + 1);
rcu_read_unlock();
return true;
}
@@ -4856,19 +5309,19 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
u32 tail;
reason = SKB_DROP_REASON_DEV_READY;
- if (!netif_running(skb->dev))
+ if (unlikely(!netif_running(skb->dev)))
goto bad_dev;
- reason = SKB_DROP_REASON_CPU_BACKLOG;
sd = &per_cpu(softnet_data, cpu);
qlen = skb_queue_len_lockless(&sd->input_pkt_queue);
max_backlog = READ_ONCE(net_hotdata.max_backlog);
- if (unlikely(qlen > max_backlog))
+ if (unlikely(qlen > max_backlog) ||
+ skb_flow_limit(skb, qlen, max_backlog))
goto cpu_backlog_drop;
backlog_lock_irq_save(sd, &flags);
qlen = skb_queue_len(&sd->input_pkt_queue);
- if (qlen <= max_backlog && !skb_flow_limit(skb, qlen)) {
+ if (likely(qlen <= max_backlog)) {
if (!qlen) {
/* Schedule NAPI for backlog device. We can use
* non atomic operation as we own the queue lock.
@@ -4889,7 +5342,8 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
backlog_unlock_irq_restore(sd, &flags);
cpu_backlog_drop:
- atomic_inc(&sd->dropped);
+ reason = SKB_DROP_REASON_CPU_BACKLOG;
+ numa_drop_add(&sd->drop_counters, 1);
bad_dev:
dev_core_stats_rx_dropped_inc(skb->dev);
kfree_skb_reason(skb, reason);
@@ -4920,7 +5374,7 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
}
u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog)
+ const struct bpf_prog *xdp_prog)
{
void *orig_data, *orig_data_end, *hard_start;
struct netdev_rx_queue *rxqueue;
@@ -5022,12 +5476,15 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
}
static int
-netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog)
+netif_skb_check_for_xdp(struct sk_buff **pskb, const struct bpf_prog *prog)
{
struct sk_buff *skb = *pskb;
int err, hroom, troom;
- if (!skb_cow_data_for_xdp(this_cpu_read(system_page_pool), pskb, prog))
+ local_lock_nested_bh(&system_page_pool.bh_lock);
+ err = skb_cow_data_for_xdp(this_cpu_read(system_page_pool.pool), pskb, prog);
+ local_unlock_nested_bh(&system_page_pool.bh_lock);
+ if (!err)
return 0;
/* In case we have to go down the path and also linearize,
@@ -5046,7 +5503,7 @@ netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog)
static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog)
+ const struct bpf_prog *xdp_prog)
{
struct sk_buff *skb = *pskb;
u32 mac_len, act = XDP_DROP;
@@ -5099,7 +5556,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
* and DDOS attacks will be more effective. In-driver-XDP use dedicated TX
* queues, so they do not have this starvation issue.
*/
-void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
+void generic_xdp_tx(struct sk_buff *skb, const struct bpf_prog *xdp_prog)
{
struct net_device *dev = skb->dev;
struct netdev_queue *txq;
@@ -5124,7 +5581,7 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
-int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb)
+int do_xdp_generic(const struct bpf_prog *xdp_prog, struct sk_buff **pskb)
{
struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
@@ -5150,6 +5607,7 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb)
bpf_net_ctx_clear(bpf_net_ctx);
return XDP_DROP;
}
+ bpf_net_ctx_clear(bpf_net_ctx);
}
return XDP_PASS;
out_redir:
@@ -5247,7 +5705,7 @@ int netif_rx(struct sk_buff *skb)
}
EXPORT_SYMBOL(netif_rx);
-static __latent_entropy void net_tx_action(struct softirq_action *h)
+static __latent_entropy void net_tx_action(void)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
@@ -5291,8 +5749,9 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
rcu_read_lock();
while (head) {
- struct Qdisc *q = head;
spinlock_t *root_lock = NULL;
+ struct sk_buff *to_free;
+ struct Qdisc *q = head;
head = head->next_sched;
@@ -5319,9 +5778,10 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
}
clear_bit(__QDISC_STATE_SCHED, &q->state);
- qdisc_run(q);
+ to_free = qdisc_run(q);
if (root_lock)
spin_unlock(root_lock);
+ tcf_kfree_skb_list(to_free);
}
rcu_read_unlock();
@@ -5431,7 +5891,7 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
if (nf_hook_ingress_active(skb)) {
int ingress_retval;
- if (*pt_prev) {
+ if (unlikely(*pt_prev)) {
*ret = deliver_skb(skb, *pt_prev, orig_dev);
*pt_prev = NULL;
}
@@ -5447,6 +5907,7 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
struct packet_type **ppt_prev)
{
+ enum skb_drop_reason drop_reason = SKB_DROP_REASON_UNHANDLED_PROTO;
struct packet_type *ptype, *pt_prev;
rx_handler_func_t *rx_handler;
struct sk_buff *skb = *pskb;
@@ -5462,8 +5923,14 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
orig_dev = skb->dev;
skb_reset_network_header(skb);
+#if !defined(CONFIG_DEBUG_NET)
+ /* We plan to no longer reset the transport header here.
+ * Give some time to fuzzers and dev build to catch bugs
+ * in network stacks.
+ */
if (!skb_transport_header_was_set(skb))
skb_reset_transport_header(skb);
+#endif
skb_reset_mac_len(skb);
pt_prev = NULL;
@@ -5499,14 +5966,15 @@ another_round:
if (pfmemalloc)
goto skip_taps;
- list_for_each_entry_rcu(ptype, &net_hotdata.ptype_all, list) {
- if (pt_prev)
+ list_for_each_entry_rcu(ptype, &dev_net_rcu(skb->dev)->ptype_all,
+ list) {
+ if (unlikely(pt_prev))
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
- if (pt_prev)
+ if (unlikely(pt_prev))
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
@@ -5531,11 +5999,13 @@ skip_taps:
#endif
skb_reset_redirect(skb);
skip_classify:
- if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
+ if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) {
+ drop_reason = SKB_DROP_REASON_PFMEMALLOC;
goto drop;
+ }
if (skb_vlan_tag_present(skb)) {
- if (pt_prev) {
+ if (unlikely(pt_prev)) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
}
@@ -5547,7 +6017,7 @@ skip_classify:
rx_handler = rcu_dereference(skb->dev->rx_handler);
if (rx_handler) {
- if (pt_prev) {
+ if (unlikely(pt_prev)) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
}
@@ -5611,6 +6081,14 @@ check_vlan_id:
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
&ptype_base[ntohs(type) &
PTYPE_HASH_MASK]);
+
+ /* orig_dev and skb->dev could belong to different netns;
+	 * Even in such a case, we need to traverse only the list
+ * coming from skb->dev, as the ptype owner (packet socket)
+ * will use dev_net(skb->dev) to do namespace filtering.
+ */
+ deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+ &dev_net_rcu(skb->dev)->ptype_specific);
}
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
@@ -5622,8 +6100,6 @@ check_vlan_id:
}
if (pt_prev) {
- if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
- goto drop;
*ppt_prev = pt_prev;
} else {
drop:
@@ -5631,7 +6107,8 @@ drop:
dev_core_stats_rx_dropped_inc(skb->dev);
else
dev_core_stats_rx_nohandler_inc(skb->dev);
- kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO);
+
+ kfree_skb_reason(skb, drop_reason);
/* Jamal, now you will not be able to escape explaining
* to me how you were going to use this. :-)
*/
@@ -5724,10 +6201,9 @@ static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemallo
struct packet_type *pt_curr = NULL;
/* Current (common) orig_dev of sublist */
struct net_device *od_curr = NULL;
- struct list_head sublist;
struct sk_buff *skb, *next;
+ LIST_HEAD(sublist);
- INIT_LIST_HEAD(&sublist);
list_for_each_entry_safe(skb, next, head, list) {
struct net_device *orig_dev = skb->dev;
struct packet_type *pt_prev = NULL;
@@ -5822,7 +6298,7 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
static_branch_dec(&generic_xdp_needed_key);
} else if (new && !old) {
static_branch_inc(&generic_xdp_needed_key);
- dev_disable_lro(dev);
+ netif_disable_lro(dev);
dev_disable_gro_hw(dev);
}
break;
@@ -5865,9 +6341,8 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
void netif_receive_skb_list_internal(struct list_head *head)
{
struct sk_buff *skb, *next;
- struct list_head sublist;
+ LIST_HEAD(sublist);
- INIT_LIST_HEAD(&sublist);
list_for_each_entry_safe(skb, next, head, list) {
net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue),
skb);
@@ -5949,22 +6424,22 @@ void netif_receive_skb_list(struct list_head *head)
}
EXPORT_SYMBOL(netif_receive_skb_list);
-static DEFINE_PER_CPU(struct work_struct, flush_works);
-
/* Network device is going away, flush any packets still pending */
static void flush_backlog(struct work_struct *work)
{
struct sk_buff *skb, *tmp;
+ struct sk_buff_head list;
struct softnet_data *sd;
+ __skb_queue_head_init(&list);
local_bh_disable();
sd = this_cpu_ptr(&softnet_data);
backlog_lock_irq_disable(sd);
skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
- if (skb->dev->reg_state == NETREG_UNREGISTERING) {
+ if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) {
__skb_unlink(skb, &sd->input_pkt_queue);
- dev_kfree_skb_irq(skb);
+ __skb_queue_tail(&list, skb);
rps_input_queue_head_incr(sd);
}
}
@@ -5972,14 +6447,16 @@ static void flush_backlog(struct work_struct *work)
local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
- if (skb->dev->reg_state == NETREG_UNREGISTERING) {
+ if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) {
__skb_unlink(skb, &sd->process_queue);
- kfree_skb(skb);
+ __skb_queue_tail(&list, skb);
rps_input_queue_head_incr(sd);
}
}
local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
local_bh_enable();
+
+ __skb_queue_purge_reason(&list, SKB_DROP_REASON_DEV_READY);
}
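Not part of this diff: a minimal sketch of the collect-then-purge pattern the reworked flush_backlog() above relies on; the names collect_stale and pending are illustrative only.

#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Illustrative sketch: unlink matching skbs onto a local list while the
 * backlog lock is held, so the actual frees can happen after unlock.
 */
static void collect_stale(struct sk_buff_head *queue,
			  struct sk_buff_head *pending)
{
	struct sk_buff *skb, *tmp;

	skb_queue_walk_safe(queue, skb, tmp) {
		if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) {
			__skb_unlink(skb, queue);	/* cheap work under the lock */
			__skb_queue_tail(pending, skb);	/* real free happens later */
		}
	}
}

The caller then drops the lock and purges the local list in one go, as flush_backlog() does with __skb_queue_purge_reason(), keeping kfree_skb() work out of the IRQ-disabled section.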
static bool flush_required(int cpu)
@@ -6007,36 +6484,54 @@ static bool flush_required(int cpu)
return true;
}
+struct flush_backlogs {
+ cpumask_t flush_cpus;
+ struct work_struct w[];
+};
+
+static struct flush_backlogs *flush_backlogs_alloc(void)
+{
+ return kmalloc(struct_size_t(struct flush_backlogs, w, nr_cpu_ids),
+ GFP_KERNEL);
+}
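Not part of this diff: a hedged sketch of the flexible-array sizing pattern flush_backlogs_alloc() uses, with illustrative names (demo_batch, demo_batch_alloc).

#include <linux/cpumask.h>
#include <linux/overflow.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct demo_batch {			/* illustrative stand-in */
	cpumask_t cpus;
	struct work_struct w[];		/* one slot per possible CPU */
};

static struct demo_batch *demo_batch_alloc(void)
{
	/* struct_size_t() expands to sizeof(struct demo_batch) plus
	 * nr_cpu_ids * sizeof(struct work_struct), with overflow checking.
	 */
	return kmalloc(struct_size_t(struct demo_batch, w, nr_cpu_ids),
		       GFP_KERNEL);
}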
+
+static struct flush_backlogs *flush_backlogs_fallback;
+static DEFINE_MUTEX(flush_backlogs_mutex);
+
static void flush_all_backlogs(void)
{
- static cpumask_t flush_cpus;
+ struct flush_backlogs *ptr = flush_backlogs_alloc();
unsigned int cpu;
- /* since we are under rtnl lock protection we can use static data
- * for the cpumask and avoid allocating on stack the possibly
- * large mask
- */
- ASSERT_RTNL();
+ if (!ptr) {
+ mutex_lock(&flush_backlogs_mutex);
+ ptr = flush_backlogs_fallback;
+ }
+ cpumask_clear(&ptr->flush_cpus);
cpus_read_lock();
- cpumask_clear(&flush_cpus);
for_each_online_cpu(cpu) {
if (flush_required(cpu)) {
- queue_work_on(cpu, system_highpri_wq,
- per_cpu_ptr(&flush_works, cpu));
- cpumask_set_cpu(cpu, &flush_cpus);
+ INIT_WORK(&ptr->w[cpu], flush_backlog);
+ queue_work_on(cpu, system_highpri_wq, &ptr->w[cpu]);
+ __cpumask_set_cpu(cpu, &ptr->flush_cpus);
}
}
/* we can have in flight packet[s] on the cpus we are not flushing,
* synchronize_net() in unregister_netdevice_many() will take care of
- * them
+ * them.
*/
- for_each_cpu(cpu, &flush_cpus)
- flush_work(per_cpu_ptr(&flush_works, cpu));
+ for_each_cpu(cpu, &ptr->flush_cpus)
+ flush_work(&ptr->w[cpu]);
cpus_read_unlock();
+
+ if (ptr != flush_backlogs_fallback)
+ kfree(ptr);
+ else
+ mutex_unlock(&flush_backlogs_mutex);
}
static void net_rps_send_ipi(struct softnet_data *remsd)
@@ -6225,25 +6720,23 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
return false;
if (work_done) {
- if (n->gro_bitmask)
- timeout = READ_ONCE(n->dev->gro_flush_timeout);
- n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
+ if (n->gro.bitmask)
+ timeout = napi_get_gro_flush_timeout(n);
+ n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n);
}
if (n->defer_hard_irqs_count > 0) {
n->defer_hard_irqs_count--;
- timeout = READ_ONCE(n->dev->gro_flush_timeout);
+ timeout = napi_get_gro_flush_timeout(n);
if (timeout)
ret = false;
}
- if (n->gro_bitmask) {
- /* When the NAPI instance uses a timeout and keeps postponing
- * it, we need to bound somehow the time packets are kept in
- * the GRO layer
- */
- napi_gro_flush(n, !!timeout);
- }
- gro_normal_list(n);
+ /*
+ * When the NAPI instance uses a timeout and keeps postponing
+	 * it, we need to somehow bound the time packets are kept in
+ * the GRO layer.
+ */
+ gro_flush_normal(&n->gro, !!timeout);
if (unlikely(!list_empty(&n->poll_list))) {
/* If n->poll_list is not empty, we need to mask irqs */
@@ -6281,37 +6774,25 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
}
EXPORT_SYMBOL(napi_complete_done);
-/* must be called under rcu_read_lock(), as we dont take a reference */
-struct napi_struct *napi_by_id(unsigned int napi_id)
-{
- unsigned int hash = napi_id % HASH_SIZE(napi_hash);
- struct napi_struct *napi;
-
- hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
- if (napi->napi_id == napi_id)
- return napi;
-
- return NULL;
-}
-
-static void skb_defer_free_flush(struct softnet_data *sd)
+static void skb_defer_free_flush(void)
{
+ struct llist_node *free_list;
struct sk_buff *skb, *next;
+ struct skb_defer_node *sdn;
+ int node;
- /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */
- if (!READ_ONCE(sd->defer_list))
- return;
+ for_each_node(node) {
+ sdn = this_cpu_ptr(net_hotdata.skb_defer_nodes) + node;
- spin_lock(&sd->defer_lock);
- skb = sd->defer_list;
- sd->defer_list = NULL;
- sd->defer_count = 0;
- spin_unlock(&sd->defer_lock);
+ if (llist_empty(&sdn->defer_list))
+ continue;
+ atomic_long_set(&sdn->defer_count, 0);
+ free_list = llist_del_all(&sdn->defer_list);
- while (skb != NULL) {
- next = skb->next;
- napi_consume_skb(skb, 1);
- skb = next;
+ llist_for_each_entry_safe(skb, next, free_list, ll_node) {
+ prefetch(next);
+ napi_consume_skb(skb, 1);
+ }
}
}
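Not part of this diff: the reworked skb_defer_free_flush() drains lock-free llist batches per NUMA node. A minimal sketch of that llist producer/consumer pattern; demo_item, demo_defer and demo_flush are illustrative names.

#include <linux/llist.h>
#include <linux/slab.h>

struct demo_item {			/* illustrative */
	struct llist_node ll_node;
	int payload;
};

static LLIST_HEAD(demo_list);

/* Producer: lock-free, safe from any context. */
static void demo_defer(struct demo_item *item)
{
	llist_add(&item->ll_node, &demo_list);
}

/* Consumer: detach the whole list in one atomic exchange, then walk it. */
static void demo_flush(void)
{
	struct llist_node *batch = llist_del_all(&demo_list);
	struct demo_item *item, *next;

	llist_for_each_entry_safe(item, next, batch, ll_node)
		kfree(item);
}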
@@ -6320,19 +6801,14 @@ static void skb_defer_free_flush(struct softnet_data *sd)
static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
{
if (!skip_schedule) {
- gro_normal_list(napi);
+ gro_normal_list(&napi->gro);
__napi_schedule(napi);
return;
}
- if (napi->gro_bitmask) {
- /* flush too old packets
- * If HZ < 1000, flush all packets.
- */
- napi_gro_flush(napi, HZ >= 1000);
- }
+ /* Flush too old packets. If HZ < 1000, flush all packets */
+ gro_flush_normal(&napi->gro, HZ >= 1000);
- gro_normal_list(napi);
clear_bit(NAPI_STATE_SCHED, &napi->state);
}
@@ -6365,8 +6841,8 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
if (flags & NAPI_F_PREFER_BUSY_POLL) {
- napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
- timeout = READ_ONCE(napi->dev->gro_flush_timeout);
+ napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi);
+ timeout = napi_get_gro_flush_timeout(napi);
if (napi->defer_hard_irqs_count && timeout) {
hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
skip_schedule = true;
@@ -6439,12 +6915,12 @@ restart:
}
work = napi_poll(napi, budget);
trace_napi_poll(napi, work, budget);
- gro_normal_list(napi);
+ gro_normal_list(&napi->gro);
count:
if (work > 0)
__NET_ADD_STATS(dev_net(napi->dev),
LINUX_MIB_BUSYPOLLRXPACKETS, work);
- skb_defer_free_flush(this_cpu_ptr(&softnet_data));
+ skb_defer_free_flush();
bpf_net_ctx_clear(bpf_net_ctx);
local_bh_enable();
@@ -6497,26 +6973,84 @@ void napi_busy_loop(unsigned int napi_id,
}
EXPORT_SYMBOL(napi_busy_loop);
+void napi_suspend_irqs(unsigned int napi_id)
+{
+ struct napi_struct *napi;
+
+ rcu_read_lock();
+ napi = napi_by_id(napi_id);
+ if (napi) {
+ unsigned long timeout = napi_get_irq_suspend_timeout(napi);
+
+ if (timeout)
+ hrtimer_start(&napi->timer, ns_to_ktime(timeout),
+ HRTIMER_MODE_REL_PINNED);
+ }
+ rcu_read_unlock();
+}
+
+void napi_resume_irqs(unsigned int napi_id)
+{
+ struct napi_struct *napi;
+
+ rcu_read_lock();
+ napi = napi_by_id(napi_id);
+ if (napi) {
+ /* If irq_suspend_timeout is set to 0 between the call to
+ * napi_suspend_irqs and now, the original value still
+ * determines the safety timeout as intended and napi_watchdog
+ * will resume irq processing.
+ */
+ if (napi_get_irq_suspend_timeout(napi)) {
+ local_bh_disable();
+ napi_schedule(napi);
+ local_bh_enable();
+ }
+ }
+ rcu_read_unlock();
+}
+
#endif /* CONFIG_NET_RX_BUSY_POLL */
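Not part of this diff: napi_suspend_irqs()/napi_resume_irqs() only arm or cancel the per-NAPI deferral timer; pairing them correctly is the caller's job. A hedged sketch of a hypothetical caller bracketing a busy-poll window, assuming the napi_id came from SO_INCOMING_NAPI_ID and irq_suspend_timeout is non-zero for that NAPI.

/* Hypothetical caller; demo_busy_poll_window() is illustrative only. */
static void demo_busy_poll_window(unsigned int napi_id)
{
	napi_suspend_irqs(napi_id);	/* arm the safety timer, keep IRQs deferred */

	/* ... application-driven polling of the queue happens here ... */

	napi_resume_irqs(napi_id);	/* reschedule NAPI so IRQ-driven mode resumes */
}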
+static void __napi_hash_add_with_id(struct napi_struct *napi,
+ unsigned int napi_id)
+{
+ napi->gro.cached_napi_id = napi_id;
+
+ WRITE_ONCE(napi->napi_id, napi_id);
+ hlist_add_head_rcu(&napi->napi_hash_node,
+ &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
+}
+
+static void napi_hash_add_with_id(struct napi_struct *napi,
+ unsigned int napi_id)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&napi_hash_lock, flags);
+ WARN_ON_ONCE(napi_by_id(napi_id));
+ __napi_hash_add_with_id(napi, napi_id);
+ spin_unlock_irqrestore(&napi_hash_lock, flags);
+}
+
static void napi_hash_add(struct napi_struct *napi)
{
+ unsigned long flags;
+
if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
return;
- spin_lock(&napi_hash_lock);
+ spin_lock_irqsave(&napi_hash_lock, flags);
/* 0..NR_CPUS range is reserved for sender_cpu use */
do {
- if (unlikely(++napi_gen_id < MIN_NAPI_ID))
+ if (unlikely(!napi_id_valid(++napi_gen_id)))
napi_gen_id = MIN_NAPI_ID;
} while (napi_by_id(napi_gen_id));
- napi->napi_id = napi_gen_id;
- hlist_add_head_rcu(&napi->napi_hash_node,
- &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
+ __napi_hash_add_with_id(napi, napi_gen_id);
- spin_unlock(&napi_hash_lock);
+ spin_unlock_irqrestore(&napi_hash_lock, flags);
}
/* Warning : caller is responsible to make sure rcu grace period
@@ -6524,11 +7058,13 @@ static void napi_hash_add(struct napi_struct *napi)
*/
static void napi_hash_del(struct napi_struct *napi)
{
- spin_lock(&napi_hash_lock);
+ unsigned long flags;
+
+ spin_lock_irqsave(&napi_hash_lock, flags);
hlist_del_init_rcu(&napi->napi_hash_node);
- spin_unlock(&napi_hash_lock);
+ spin_unlock_irqrestore(&napi_hash_lock, flags);
}
static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
@@ -6549,31 +7085,100 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-static void init_gro_hash(struct napi_struct *napi)
+static void napi_stop_kthread(struct napi_struct *napi)
{
- int i;
+ unsigned long val, new;
- for (i = 0; i < GRO_HASH_BUCKETS; i++) {
- INIT_LIST_HEAD(&napi->gro_hash[i].list);
- napi->gro_hash[i].count = 0;
+ /* Wait until the napi STATE_THREADED is unset. */
+ while (true) {
+ val = READ_ONCE(napi->state);
+
+	/* If the napi kthread owns this napi or the napi is idle,
+ * STATE_THREADED can be unset here.
+ */
+ if ((val & NAPIF_STATE_SCHED_THREADED) ||
+ !(val & NAPIF_STATE_SCHED)) {
+ new = val & (~(NAPIF_STATE_THREADED |
+ NAPIF_STATE_THREADED_BUSY_POLL));
+ } else {
+ msleep(20);
+ continue;
+ }
+
+ if (try_cmpxchg(&napi->state, &val, new))
+ break;
+ }
+
+ /* Once STATE_THREADED is unset, wait for SCHED_THREADED to be unset by
+ * the kthread.
+ */
+ while (true) {
+ if (!test_bit(NAPI_STATE_SCHED_THREADED, &napi->state))
+ break;
+
+ msleep(20);
+ }
+
+ kthread_stop(napi->thread);
+ napi->thread = NULL;
+}
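Not part of this diff: napi_stop_kthread() uses the usual lockless state-transition idiom: sample the state word, compute the new value, and retry with try_cmpxchg() until it lands. A generic sketch of that idiom with illustrative names (demo_clear_flag_when_idle, flag, busy).

#include <linux/atomic.h>

/* Clear @flag in @state, but only once @busy is no longer set. */
static void demo_clear_flag_when_idle(unsigned long *state,
				      unsigned long flag,
				      unsigned long busy)
{
	unsigned long val, new;

	for (;;) {
		val = READ_ONCE(*state);
		if (val & busy) {
			cpu_relax();	/* napi_stop_kthread() sleeps instead */
			continue;
		}
		new = val & ~flag;
		/* On failure try_cmpxchg() refreshes 'val'; just retry. */
		if (try_cmpxchg(state, &val, new))
			break;
	}
}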
+
+static void napi_set_threaded_state(struct napi_struct *napi,
+ enum netdev_napi_threaded threaded_mode)
+{
+ bool threaded = threaded_mode != NETDEV_NAPI_THREADED_DISABLED;
+ bool busy_poll = threaded_mode == NETDEV_NAPI_THREADED_BUSY_POLL;
+
+ assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);
+ assign_bit(NAPI_STATE_THREADED_BUSY_POLL, &napi->state, busy_poll);
+}
+
+int napi_set_threaded(struct napi_struct *napi,
+ enum netdev_napi_threaded threaded)
+{
+ if (threaded) {
+ if (!napi->thread) {
+ int err = napi_kthread_create(napi);
+
+ if (err)
+ return err;
+ }
+ }
+
+ if (napi->config)
+ napi->config->threaded = threaded;
+
+ /* Setting/unsetting threaded mode on a napi might not immediately
+	 * take effect if the current napi instance is actively being
+ * polled. In this case, the switch between threaded mode and
+ * softirq mode will happen in the next round of napi_schedule().
+ * This should not cause hiccups/stalls to the live traffic.
+ */
+ if (!threaded && napi->thread) {
+ napi_stop_kthread(napi);
+ } else {
+ /* Make sure kthread is created before THREADED bit is set. */
+ smp_mb__before_atomic();
+ napi_set_threaded_state(napi, threaded);
}
- napi->gro_bitmask = 0;
+
+ return 0;
}
-int dev_set_threaded(struct net_device *dev, bool threaded)
+int netif_set_threaded(struct net_device *dev,
+ enum netdev_napi_threaded threaded)
{
struct napi_struct *napi;
- int err = 0;
+ int i, err = 0;
- if (dev->threaded == threaded)
- return 0;
+ netdev_assert_locked_or_invisible(dev);
if (threaded) {
list_for_each_entry(napi, &dev->napi_list, dev_list) {
if (!napi->thread) {
err = napi_kthread_create(napi);
if (err) {
- threaded = false;
+ threaded = NETDEV_NAPI_THREADED_DISABLED;
break;
}
}
@@ -6582,23 +7187,33 @@ int dev_set_threaded(struct net_device *dev, bool threaded)
WRITE_ONCE(dev->threaded, threaded);
- /* Make sure kthread is created before THREADED bit
- * is set.
- */
- smp_mb__before_atomic();
-
- /* Setting/unsetting threaded mode on a napi might not immediately
- * take effect, if the current napi instance is actively being
- * polled. In this case, the switch between threaded mode and
- * softirq mode will happen in the next round of napi_schedule().
- * This should not cause hiccups/stalls to the live traffic.
- */
+ /* The error should not occur as the kthreads are already created. */
list_for_each_entry(napi, &dev->napi_list, dev_list)
- assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);
+ WARN_ON_ONCE(napi_set_threaded(napi, threaded));
+
+ /* Override the config for all NAPIs even if currently not listed */
+ for (i = 0; i < dev->num_napi_configs; i++)
+ dev->napi_config[i].threaded = threaded;
return err;
}
-EXPORT_SYMBOL(dev_set_threaded);
+
+/**
+ * netif_threaded_enable() - enable threaded NAPIs
+ * @dev: net_device instance
+ *
+ * Enable threaded mode for the NAPI instances of the device. This may be useful
+ * for devices where multiple NAPI instances get scheduled by a single
+ * interrupt. Threaded NAPI allows moving the NAPI processing to cores other
+ * than the core where the IRQ is mapped.
+ *
+ * This function should be called before @dev is registered.
+ */
+void netif_threaded_enable(struct net_device *dev)
+{
+ WARN_ON_ONCE(netif_set_threaded(dev, NETDEV_NAPI_THREADED_ENABLED));
+}
+EXPORT_SYMBOL(netif_threaded_enable);
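Not part of this diff: per the kernel-doc above, netif_threaded_enable() is meant to run before registration. A hedged sketch of a hypothetical driver probe doing so; demo_priv and demo_setup() are illustrative.

#include <linux/etherdevice.h>
#include <linux/platform_device.h>

static int demo_probe(struct platform_device *pdev)
{
	struct net_device *netdev;
	int err;

	netdev = alloc_etherdev(sizeof(struct demo_priv));
	if (!netdev)
		return -ENOMEM;

	demo_setup(netdev);		/* queues, NAPIs, netdev_ops, ... */

	/* Opt every NAPI of this device into threaded polling, pre-registration. */
	netif_threaded_enable(netdev);

	err = register_netdev(netdev);
	if (err)
		free_netdev(netdev);
	return err;
}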
/**
* netif_queue_set_napi - Associate queue with the napi
@@ -6619,8 +7234,7 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
if (WARN_ON_ONCE(napi && !napi->dev))
return;
- if (dev->reg_state >= NETREG_REGISTERED)
- ASSERT_RTNL();
+ netdev_ops_assert_locked_or_invisible(dev);
switch (type) {
case NETDEV_QUEUE_TYPE_RX:
@@ -6637,20 +7251,257 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
}
EXPORT_SYMBOL(netif_queue_set_napi);
-void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
- int (*poll)(struct napi_struct *, int), int weight)
+static void
+netif_napi_irq_notify(struct irq_affinity_notify *notify,
+ const cpumask_t *mask)
{
+ struct napi_struct *napi =
+ container_of(notify, struct napi_struct, notify);
+#ifdef CONFIG_RFS_ACCEL
+ struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap;
+ int err;
+#endif
+
+ if (napi->config && napi->dev->irq_affinity_auto)
+ cpumask_copy(&napi->config->affinity_mask, mask);
+
+#ifdef CONFIG_RFS_ACCEL
+ if (napi->dev->rx_cpu_rmap_auto) {
+ err = cpu_rmap_update(rmap, napi->napi_rmap_idx, mask);
+ if (err)
+ netdev_warn(napi->dev, "RMAP update failed (%d)\n",
+ err);
+ }
+#endif
+}
+
+#ifdef CONFIG_RFS_ACCEL
+static void netif_napi_affinity_release(struct kref *ref)
+{
+ struct napi_struct *napi =
+ container_of(ref, struct napi_struct, notify.kref);
+ struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap;
+
+ netdev_assert_locked(napi->dev);
+ WARN_ON(test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER,
+ &napi->state));
+
+ if (!napi->dev->rx_cpu_rmap_auto)
+ return;
+ rmap->obj[napi->napi_rmap_idx] = NULL;
+ napi->napi_rmap_idx = -1;
+ cpu_rmap_put(rmap);
+}
+
+int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs)
+{
+ if (dev->rx_cpu_rmap_auto)
+ return 0;
+
+ dev->rx_cpu_rmap = alloc_irq_cpu_rmap(num_irqs);
+ if (!dev->rx_cpu_rmap)
+ return -ENOMEM;
+
+ dev->rx_cpu_rmap_auto = true;
+ return 0;
+}
+EXPORT_SYMBOL(netif_enable_cpu_rmap);
+
+static void netif_del_cpu_rmap(struct net_device *dev)
+{
+ struct cpu_rmap *rmap = dev->rx_cpu_rmap;
+
+ if (!dev->rx_cpu_rmap_auto)
+ return;
+
+ /* Free the rmap */
+ cpu_rmap_put(rmap);
+ dev->rx_cpu_rmap = NULL;
+ dev->rx_cpu_rmap_auto = false;
+}
+
+#else
+static void netif_napi_affinity_release(struct kref *ref)
+{
+}
+
+int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs)
+{
+ return 0;
+}
+EXPORT_SYMBOL(netif_enable_cpu_rmap);
+
+static void netif_del_cpu_rmap(struct net_device *dev)
+{
+}
+#endif
+
+void netif_set_affinity_auto(struct net_device *dev)
+{
+ unsigned int i, maxqs, numa;
+
+ maxqs = max(dev->num_tx_queues, dev->num_rx_queues);
+ numa = dev_to_node(&dev->dev);
+
+ for (i = 0; i < maxqs; i++)
+ cpumask_set_cpu(cpumask_local_spread(i, numa),
+ &dev->napi_config[i].affinity_mask);
+
+ dev->irq_affinity_auto = true;
+}
+EXPORT_SYMBOL(netif_set_affinity_auto);
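Not part of this diff: the two exports above let the core manage aRFS rmap entries and IRQ affinity on a driver's behalf. A hedged sketch of how a hypothetical driver might opt in and then attach its IRQs to NAPIs; demo_priv and demo_irq_for_queue() are illustrative, and netif_napi_set_irq() is assumed to be the lock-taking wrapper around the _locked variant above.

static int demo_setup_irqs(struct net_device *dev, struct demo_priv *priv,
			   unsigned int nqueues)
{
	unsigned int i;
	int err;

	err = netif_enable_cpu_rmap(dev, nqueues);	/* core-managed aRFS rmap */
	if (err)
		return err;

	netif_set_affinity_auto(dev);			/* core-managed IRQ affinity */

	for (i = 0; i < nqueues; i++)
		netif_napi_set_irq(&priv->napi[i],
				   demo_irq_for_queue(priv, i));
	return 0;
}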
+
+void netif_napi_set_irq_locked(struct napi_struct *napi, int irq)
+{
+ int rc;
+
+ netdev_assert_locked_or_invisible(napi->dev);
+
+ if (napi->irq == irq)
+ return;
+
+ /* Remove existing resources */
+ if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state))
+ irq_set_affinity_notifier(napi->irq, NULL);
+
+ napi->irq = irq;
+ if (irq < 0 ||
+ (!napi->dev->rx_cpu_rmap_auto && !napi->dev->irq_affinity_auto))
+ return;
+
+ /* Abort for buggy drivers */
+ if (napi->dev->irq_affinity_auto && WARN_ON_ONCE(!napi->config))
+ return;
+
+#ifdef CONFIG_RFS_ACCEL
+ if (napi->dev->rx_cpu_rmap_auto) {
+ rc = cpu_rmap_add(napi->dev->rx_cpu_rmap, napi);
+ if (rc < 0)
+ return;
+
+ cpu_rmap_get(napi->dev->rx_cpu_rmap);
+ napi->napi_rmap_idx = rc;
+ }
+#endif
+
+ /* Use core IRQ notifier */
+ napi->notify.notify = netif_napi_irq_notify;
+ napi->notify.release = netif_napi_affinity_release;
+ rc = irq_set_affinity_notifier(irq, &napi->notify);
+ if (rc) {
+ netdev_warn(napi->dev, "Unable to set IRQ notifier (%d)\n",
+ rc);
+ goto put_rmap;
+ }
+
+ set_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state);
+ return;
+
+put_rmap:
+#ifdef CONFIG_RFS_ACCEL
+ if (napi->dev->rx_cpu_rmap_auto) {
+ napi->dev->rx_cpu_rmap->obj[napi->napi_rmap_idx] = NULL;
+ cpu_rmap_put(napi->dev->rx_cpu_rmap);
+ napi->napi_rmap_idx = -1;
+ }
+#endif
+ napi->notify.notify = NULL;
+ napi->notify.release = NULL;
+}
+EXPORT_SYMBOL(netif_napi_set_irq_locked);
+
+static void napi_restore_config(struct napi_struct *n)
+{
+ n->defer_hard_irqs = n->config->defer_hard_irqs;
+ n->gro_flush_timeout = n->config->gro_flush_timeout;
+ n->irq_suspend_timeout = n->config->irq_suspend_timeout;
+
+ if (n->dev->irq_affinity_auto &&
+ test_bit(NAPI_STATE_HAS_NOTIFIER, &n->state))
+ irq_set_affinity(n->irq, &n->config->affinity_mask);
+
+	/* A NAPI ID might be stored in the config; if so, use it. If not, use
+	 * napi_hash_add() to generate one for us.
+ */
+ if (n->config->napi_id) {
+ napi_hash_add_with_id(n, n->config->napi_id);
+ } else {
+ napi_hash_add(n);
+ n->config->napi_id = n->napi_id;
+ }
+
+ WARN_ON_ONCE(napi_set_threaded(n, n->config->threaded));
+}
+
+static void napi_save_config(struct napi_struct *n)
+{
+ n->config->defer_hard_irqs = n->defer_hard_irqs;
+ n->config->gro_flush_timeout = n->gro_flush_timeout;
+ n->config->irq_suspend_timeout = n->irq_suspend_timeout;
+ napi_hash_del(n);
+}
+
+/* Netlink wants the NAPI list to be sorted by ID. If adding a NAPI which will
+ * inherit an existing ID, try to insert it at the right position.
+ */
+static void
+netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi)
+{
+ unsigned int new_id, pos_id;
+ struct list_head *higher;
+ struct napi_struct *pos;
+
+ new_id = UINT_MAX;
+ if (napi->config && napi->config->napi_id)
+ new_id = napi->config->napi_id;
+
+ higher = &dev->napi_list;
+ list_for_each_entry(pos, &dev->napi_list, dev_list) {
+ if (napi_id_valid(pos->napi_id))
+ pos_id = pos->napi_id;
+ else if (pos->config)
+ pos_id = pos->config->napi_id;
+ else
+ pos_id = UINT_MAX;
+
+ if (pos_id <= new_id)
+ break;
+ higher = &pos->dev_list;
+ }
+ list_add_rcu(&napi->dev_list, higher); /* adds after higher */
+}
+
+/* Double check that napi_get_frags() allocates skbs with
+ * skb->head being backed by slab, not a page fragment.
+ * This is to make sure the bug fixed in commit 3226b158e67c
+ * ("net: avoid 32 x truesize under-estimation for tiny skbs")
+ * does not accidentally come back.
+ */
+static void napi_get_frags_check(struct napi_struct *napi)
+{
+ struct sk_buff *skb;
+
+ local_bh_disable();
+ skb = napi_get_frags(napi);
+ WARN_ON_ONCE(skb && skb->head_frag);
+ napi_free_frags(napi);
+ local_bh_enable();
+}
+
+void netif_napi_add_weight_locked(struct net_device *dev,
+ struct napi_struct *napi,
+ int (*poll)(struct napi_struct *, int),
+ int weight)
+{
+ netdev_assert_locked(dev);
if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
return;
INIT_LIST_HEAD(&napi->poll_list);
INIT_HLIST_NODE(&napi->napi_hash_node);
- hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
- napi->timer.function = napi_watchdog;
- init_gro_hash(napi);
+ hrtimer_setup(&napi->timer, napi_watchdog, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+ gro_init(&napi->gro);
napi->skb = NULL;
- INIT_LIST_HEAD(&napi->rx_list);
- napi->rx_count = 0;
napi->poll = poll;
if (weight > NAPI_POLL_WEIGHT)
netdev_err_once(dev, "%s() called with weight %d\n", __func__,
@@ -6663,24 +7514,33 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
napi->list_owner = -1;
set_bit(NAPI_STATE_SCHED, &napi->state);
set_bit(NAPI_STATE_NPSVC, &napi->state);
- list_add_rcu(&napi->dev_list, &dev->napi_list);
- napi_hash_add(napi);
+ netif_napi_dev_list_add(dev, napi);
+
+	/* Default settings from sysfs are applied to all NAPIs; any per-NAPI
+	 * configuration will be loaded in napi_enable()
+ */
+ napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs));
+ napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout));
+
napi_get_frags_check(napi);
/* Create kthread for this napi if dev->threaded is set.
* Clear dev->threaded if kthread creation failed so that
* threaded mode will not be enabled in napi_enable().
*/
- if (dev->threaded && napi_kthread_create(napi))
- dev->threaded = false;
- netif_napi_set_irq(napi, -1);
+ if (napi_get_threaded_config(dev, napi))
+ if (napi_kthread_create(napi))
+ dev->threaded = NETDEV_NAPI_THREADED_DISABLED;
+ netif_napi_set_irq_locked(napi, -1);
}
-EXPORT_SYMBOL(netif_napi_add_weight);
+EXPORT_SYMBOL(netif_napi_add_weight_locked);
-void napi_disable(struct napi_struct *n)
+void napi_disable_locked(struct napi_struct *n)
{
unsigned long val, new;
might_sleep();
+ netdev_assert_locked(n->dev);
+
set_bit(NAPI_STATE_DISABLE, &n->state);
val = READ_ONCE(n->state);
@@ -6691,26 +7551,47 @@ void napi_disable(struct napi_struct *n)
}
new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC;
- new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL);
+ new &= ~(NAPIF_STATE_THREADED |
+ NAPIF_STATE_THREADED_BUSY_POLL |
+ NAPIF_STATE_PREFER_BUSY_POLL);
} while (!try_cmpxchg(&n->state, &val, new));
hrtimer_cancel(&n->timer);
+ if (n->config)
+ napi_save_config(n);
+ else
+ napi_hash_del(n);
+
clear_bit(NAPI_STATE_DISABLE, &n->state);
}
-EXPORT_SYMBOL(napi_disable);
+EXPORT_SYMBOL(napi_disable_locked);
/**
- * napi_enable - enable NAPI scheduling
- * @n: NAPI context
+ * napi_disable() - prevent NAPI from scheduling
+ * @n: NAPI context
*
- * Resume NAPI from being scheduled on this context.
- * Must be paired with napi_disable.
+ * Stop NAPI from being scheduled on this context.
+ * Waits till any outstanding processing completes.
+ * Takes netdev_lock() for associated net_device.
*/
-void napi_enable(struct napi_struct *n)
+void napi_disable(struct napi_struct *n)
+{
+ netdev_lock(n->dev);
+ napi_disable_locked(n);
+ netdev_unlock(n->dev);
+}
+EXPORT_SYMBOL(napi_disable);
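Not part of this diff: the NAPI entry points are consistently split here into a *_locked() variant (caller already holds the instance lock via netdev_lock()) and a plain wrapper that takes the lock itself. A hedged sketch of the convention from a caller's point of view; demo_requeue() and priv are illustrative.

static void demo_requeue(struct net_device *dev, struct demo_priv *priv)
{
	/* In plain context, napi_disable(&priv->napi) takes the per-netdev
	 * lock on its own. Here the caller takes it once and uses the
	 * _locked variants to avoid self-deadlock.
	 */
	netdev_lock(dev);
	napi_disable_locked(&priv->napi);
	/* ... resize rings, swap buffers, etc. ... */
	napi_enable_locked(&priv->napi);
	netdev_unlock(dev);
}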
+
+void napi_enable_locked(struct napi_struct *n)
{
unsigned long new, val = READ_ONCE(n->state);
+ if (n->config)
+ napi_restore_config(n);
+ else
+ napi_hash_add(n);
+
do {
BUG_ON(!test_bit(NAPI_STATE_SCHED, &val));
@@ -6719,40 +7600,54 @@ void napi_enable(struct napi_struct *n)
new |= NAPIF_STATE_THREADED;
} while (!try_cmpxchg(&n->state, &val, new));
}
-EXPORT_SYMBOL(napi_enable);
+EXPORT_SYMBOL(napi_enable_locked);
-static void flush_gro_hash(struct napi_struct *napi)
+/**
+ * napi_enable() - enable NAPI scheduling
+ * @n: NAPI context
+ *
+ * Enable scheduling of a NAPI instance.
+ * Must be paired with napi_disable().
+ * Takes netdev_lock() for associated net_device.
+ */
+void napi_enable(struct napi_struct *n)
{
- int i;
-
- for (i = 0; i < GRO_HASH_BUCKETS; i++) {
- struct sk_buff *skb, *n;
-
- list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
- kfree_skb(skb);
- napi->gro_hash[i].count = 0;
- }
+ netdev_lock(n->dev);
+ napi_enable_locked(n);
+ netdev_unlock(n->dev);
}
+EXPORT_SYMBOL(napi_enable);
/* Must be called in process context */
-void __netif_napi_del(struct napi_struct *napi)
+void __netif_napi_del_locked(struct napi_struct *napi)
{
+ netdev_assert_locked(napi->dev);
+
if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
return;
- napi_hash_del(napi);
+ /* Make sure NAPI is disabled (or was never enabled). */
+ WARN_ON(!test_bit(NAPI_STATE_SCHED, &napi->state));
+
+ if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state))
+ irq_set_affinity_notifier(napi->irq, NULL);
+
+ if (napi->config) {
+ napi->index = -1;
+ napi->config = NULL;
+ }
+
list_del_rcu(&napi->dev_list);
napi_free_frags(napi);
- flush_gro_hash(napi);
- napi->gro_bitmask = 0;
+ gro_cleanup(&napi->gro);
if (napi->thread) {
kthread_stop(napi->thread);
napi->thread = NULL;
}
}
-EXPORT_SYMBOL(__netif_napi_del);
+EXPORT_SYMBOL(__netif_napi_del_locked);
static int __napi_poll(struct napi_struct *n, bool *repoll)
{
@@ -6804,14 +7699,8 @@ static int __napi_poll(struct napi_struct *n, bool *repoll)
return work;
}
- if (n->gro_bitmask) {
- /* flush too old packets
- * If HZ < 1000, flush all packets.
- */
- napi_gro_flush(n, HZ >= 1000);
- }
-
- gro_normal_list(n);
+ /* Flush too old packets. If HZ < 1000, flush all packets */
+ gro_flush_normal(&n->gro, HZ >= 1000);
/* Some drivers may have called napi_schedule
* prior to exhausting their budget.
@@ -6839,9 +7728,14 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
work = __napi_poll(n, &do_repoll);
- if (do_repoll)
+ if (do_repoll) {
+#if defined(CONFIG_DEBUG_NET)
+ if (unlikely(!napi_is_scheduled(n)))
+ pr_crit("repoll requested for device %s %ps but napi is not scheduled.\n",
+ n->dev->name, n->poll);
+#endif
list_add_tail(&n->poll_list, repoll);
-
+ }
netpoll_poll_unlock(have);
return work;
@@ -6871,7 +7765,7 @@ static int napi_thread_wait(struct napi_struct *napi)
return -1;
}
-static void napi_threaded_poll_loop(struct napi_struct *napi)
+static void napi_threaded_poll_loop(struct napi_struct *napi, bool busy_poll)
{
struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
struct softnet_data *sd;
@@ -6898,29 +7792,54 @@ static void napi_threaded_poll_loop(struct napi_struct *napi)
local_irq_disable();
net_rps_action_and_irq_enable(sd);
}
- skb_defer_free_flush(sd);
+ skb_defer_free_flush();
bpf_net_ctx_clear(bpf_net_ctx);
+
+ /* When busy poll is enabled, the old packets are not flushed in
+ * napi_complete_done. So flush them here.
+ */
+ if (busy_poll)
+ gro_flush_normal(&napi->gro, HZ >= 1000);
local_bh_enable();
+ /* Call cond_resched here to avoid watchdog warnings. */
+ if (repoll || busy_poll) {
+ rcu_softirq_qs_periodic(last_qs);
+ cond_resched();
+ }
+
if (!repoll)
break;
-
- rcu_softirq_qs_periodic(last_qs);
- cond_resched();
}
}
static int napi_threaded_poll(void *data)
{
struct napi_struct *napi = data;
+ bool want_busy_poll;
+ bool in_busy_poll;
+ unsigned long val;
+
+ while (!napi_thread_wait(napi)) {
+ val = READ_ONCE(napi->state);
+
+ want_busy_poll = val & NAPIF_STATE_THREADED_BUSY_POLL;
+ in_busy_poll = val & NAPIF_STATE_IN_BUSY_POLL;
+
+ if (unlikely(val & NAPIF_STATE_DISABLE))
+ want_busy_poll = false;
+
+ if (want_busy_poll != in_busy_poll)
+ assign_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state,
+ want_busy_poll);
- while (!napi_thread_wait(napi))
- napi_threaded_poll_loop(napi);
+ napi_threaded_poll_loop(napi, want_busy_poll);
+ }
return 0;
}
-static __latent_entropy void net_rx_action(struct softirq_action *h)
+static __latent_entropy void net_rx_action(void)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
unsigned long time_limit = jiffies +
@@ -6940,7 +7859,7 @@ start:
for (;;) {
struct napi_struct *n;
- skb_defer_free_flush(sd);
+ skb_defer_free_flush();
if (list_empty(&list)) {
if (list_empty(&repoll)) {
@@ -6967,7 +7886,8 @@ start:
*/
if (unlikely(budget <= 0 ||
time_after_eq(jiffies, time_limit))) {
- sd->time_squeeze++;
+ /* Pairs with READ_ONCE() in softnet_seq_show() */
+ WRITE_ONCE(sd->time_squeeze, sd->time_squeeze + 1);
break;
}
}
@@ -8640,23 +9560,20 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
dev_change_rx_flags(dev, IFF_PROMISC);
}
- if (notify)
+ if (notify) {
+ /* The ops lock is only required to ensure consistent locking
+ * for `NETDEV_CHANGE` notifiers. This function is sometimes
+ * called without the lock, even for devices that are ops
+ * locked, such as in `dev_uc_sync_multiple` when using
+ * bonding or teaming.
+ */
+ netdev_ops_assert_locked(dev);
__dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL);
+ }
return 0;
}
-/**
- * dev_set_promiscuity - update promiscuity count on a device
- * @dev: device
- * @inc: modifier
- *
- * Add or remove promiscuity from a device. While the count in the device
- * remains above zero the interface remains promiscuous. Once it hits zero
- * the device reverts back to normal filtering operation. A negative inc
- * value is used to drop promiscuity on the device.
- * Return 0 if successful or a negative errno code on error.
- */
-int dev_set_promiscuity(struct net_device *dev, int inc)
+int netif_set_promiscuity(struct net_device *dev, int inc)
{
unsigned int old_flags = dev->flags;
int err;
@@ -8668,9 +9585,8 @@ int dev_set_promiscuity(struct net_device *dev, int inc)
dev_set_rx_mode(dev);
return err;
}
-EXPORT_SYMBOL(dev_set_promiscuity);
-static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
+int netif_set_allmulti(struct net_device *dev, int inc, bool notify)
{
unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
unsigned int allmulti, flags;
@@ -8705,25 +9621,6 @@ static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
return 0;
}
-/**
- * dev_set_allmulti - update allmulti count on a device
- * @dev: device
- * @inc: modifier
- *
- * Add or remove reception of all multicast frames to a device. While the
- * count in the device remains above zero the interface remains listening
- * to all interfaces. Once it hits zero the device reverts back to normal
- * filtering operation. A negative @inc value is used to drop the counter
- * when releasing a resource needing all multicasts.
- * Return 0 if successful or a negative errno code on error.
- */
-
-int dev_set_allmulti(struct net_device *dev, int inc)
-{
- return __dev_set_allmulti(dev, inc, true);
-}
-EXPORT_SYMBOL(dev_set_allmulti);
-
/*
* Upload unicast and multicast address lists to device and
* configure RX filtering. When the device doesn't support unicast
@@ -8766,12 +9663,12 @@ void dev_set_rx_mode(struct net_device *dev)
}
/**
- * dev_get_flags - get flags reported to userspace
- * @dev: device
+ * netif_get_flags() - get flags reported to userspace
+ * @dev: device
*
- * Get the combination of flag bits exported through APIs to userspace.
+ * Get the combination of flag bits exported through APIs to userspace.
*/
-unsigned int dev_get_flags(const struct net_device *dev)
+unsigned int netif_get_flags(const struct net_device *dev)
{
unsigned int flags;
@@ -8794,7 +9691,7 @@ unsigned int dev_get_flags(const struct net_device *dev)
return flags;
}
-EXPORT_SYMBOL(dev_get_flags);
+EXPORT_SYMBOL(netif_get_flags);
int __dev_change_flags(struct net_device *dev, unsigned int flags,
struct netlink_ext_ack *extack)
@@ -8839,7 +9736,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags,
if ((flags ^ dev->gflags) & IFF_PROMISC) {
int inc = (flags & IFF_PROMISC) ? 1 : -1;
- unsigned int old_flags = dev->flags;
+ old_flags = dev->flags;
dev->gflags ^= IFF_PROMISC;
@@ -8856,7 +9753,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags,
int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
dev->gflags ^= IFF_ALLMULTI;
- __dev_set_allmulti(dev, inc, false);
+ netif_set_allmulti(dev, inc, false);
}
return ret;
@@ -8891,17 +9788,8 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
}
}
-/**
- * dev_change_flags - change device settings
- * @dev: device
- * @flags: device state flags
- * @extack: netlink extended ack
- *
- * Change settings on device based state flags. The flags are
- * in the userspace exported format.
- */
-int dev_change_flags(struct net_device *dev, unsigned int flags,
- struct netlink_ext_ack *extack)
+int netif_change_flags(struct net_device *dev, unsigned int flags,
+ struct netlink_ext_ack *extack)
{
int ret;
unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
@@ -8914,9 +9802,8 @@ int dev_change_flags(struct net_device *dev, unsigned int flags,
__dev_notify_flags(dev, old_flags, changes, 0, NULL);
return ret;
}
-EXPORT_SYMBOL(dev_change_flags);
-int __dev_set_mtu(struct net_device *dev, int new_mtu)
+int __netif_set_mtu(struct net_device *dev, int new_mtu)
{
const struct net_device_ops *ops = dev->netdev_ops;
@@ -8927,7 +9814,7 @@ int __dev_set_mtu(struct net_device *dev, int new_mtu)
WRITE_ONCE(dev->mtu, new_mtu);
return 0;
}
-EXPORT_SYMBOL(__dev_set_mtu);
+EXPORT_SYMBOL_NS_GPL(__netif_set_mtu, "NETDEV_INTERNAL");
int dev_validate_mtu(struct net_device *dev, int new_mtu,
struct netlink_ext_ack *extack)
@@ -8946,18 +9833,22 @@ int dev_validate_mtu(struct net_device *dev, int new_mtu,
}
/**
- * dev_set_mtu_ext - Change maximum transfer unit
- * @dev: device
- * @new_mtu: new transfer unit
- * @extack: netlink extended ack
+ * netif_set_mtu_ext() - Change maximum transfer unit
+ * @dev: device
+ * @new_mtu: new transfer unit
+ * @extack: netlink extended ack
+ *
+ * Change the maximum transfer size of the network device.
*
- * Change the maximum transfer size of the network device.
+ * Return: 0 on success, -errno on failure.
*/
-int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
- struct netlink_ext_ack *extack)
+int netif_set_mtu_ext(struct net_device *dev, int new_mtu,
+ struct netlink_ext_ack *extack)
{
int err, orig_mtu;
+ netdev_ops_assert_locked(dev);
+
if (new_mtu == dev->mtu)
return 0;
@@ -8974,7 +9865,7 @@ int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
return err;
orig_mtu = dev->mtu;
- err = __dev_set_mtu(dev, new_mtu);
+ err = __netif_set_mtu(dev, new_mtu);
if (!err) {
err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
@@ -8984,7 +9875,7 @@ int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
/* setting mtu back and notifying everyone again,
* so that they have a chance to revert changes.
*/
- __dev_set_mtu(dev, orig_mtu);
+ __netif_set_mtu(dev, orig_mtu);
call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
new_mtu);
}
@@ -8992,25 +9883,20 @@ int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
return err;
}
-int dev_set_mtu(struct net_device *dev, int new_mtu)
+int netif_set_mtu(struct net_device *dev, int new_mtu)
{
struct netlink_ext_ack extack;
int err;
memset(&extack, 0, sizeof(extack));
- err = dev_set_mtu_ext(dev, new_mtu, &extack);
+ err = netif_set_mtu_ext(dev, new_mtu, &extack);
if (err && extack._msg)
net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
return err;
}
-EXPORT_SYMBOL(dev_set_mtu);
+EXPORT_SYMBOL(netif_set_mtu);
-/**
- * dev_change_tx_queue_len - Change TX queue length of a netdevice
- * @dev: device
- * @new_len: new tx queue length
- */
-int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
+int netif_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
{
unsigned int orig_len = dev->tx_queue_len;
int res;
@@ -9037,24 +9923,21 @@ err_rollback:
return res;
}
-/**
- * dev_set_group - Change group this device belongs to
- * @dev: device
- * @new_group: group this device should belong to
- */
-void dev_set_group(struct net_device *dev, int new_group)
+void netif_set_group(struct net_device *dev, int new_group)
{
dev->group = new_group;
}
/**
- * dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
- * @dev: device
- * @addr: new address
- * @extack: netlink extended ack
+ * netif_pre_changeaddr_notify() - Call NETDEV_PRE_CHANGEADDR.
+ * @dev: device
+ * @addr: new address
+ * @extack: netlink extended ack
+ *
+ * Return: 0 on success, -errno on failure.
*/
-int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
- struct netlink_ext_ack *extack)
+int netif_pre_changeaddr_notify(struct net_device *dev, const char *addr,
+ struct netlink_ext_ack *extack)
{
struct netdev_notifier_pre_changeaddr_info info = {
.info.dev = dev,
@@ -9066,33 +9949,25 @@ int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
return notifier_to_errno(rc);
}
-EXPORT_SYMBOL(dev_pre_changeaddr_notify);
+EXPORT_SYMBOL_NS_GPL(netif_pre_changeaddr_notify, "NETDEV_INTERNAL");
-/**
- * dev_set_mac_address - Change Media Access Control Address
- * @dev: device
- * @sa: new address
- * @extack: netlink extended ack
- *
- * Change the hardware (MAC) address of the device
- */
-int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
- struct netlink_ext_ack *extack)
+int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss,
+ struct netlink_ext_ack *extack)
{
const struct net_device_ops *ops = dev->netdev_ops;
int err;
if (!ops->ndo_set_mac_address)
return -EOPNOTSUPP;
- if (sa->sa_family != dev->type)
+ if (ss->ss_family != dev->type)
return -EINVAL;
if (!netif_device_present(dev))
return -ENODEV;
- err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
+ err = netif_pre_changeaddr_notify(dev, ss->__data, extack);
if (err)
return err;
- if (memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) {
- err = ops->ndo_set_mac_address(dev, sa);
+ if (memcmp(dev->dev_addr, ss->__data, dev->addr_len)) {
+ err = ops->ndo_set_mac_address(dev, ss);
if (err)
return err;
}
@@ -9101,25 +9976,13 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
add_device_randomness(dev->dev_addr, dev->addr_len);
return 0;
}
-EXPORT_SYMBOL(dev_set_mac_address);
DECLARE_RWSEM(dev_addr_sem);
-int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
- struct netlink_ext_ack *extack)
+/* "sa" is a true struct sockaddr with limited "sa_data" member. */
+int netif_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
{
- int ret;
-
- down_write(&dev_addr_sem);
- ret = dev_set_mac_address(dev, sa, extack);
- up_write(&dev_addr_sem);
- return ret;
-}
-EXPORT_SYMBOL(dev_set_mac_address_user);
-
-int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
-{
- size_t size = sizeof(sa->sa_data_min);
+ size_t size = sizeof(sa->sa_data);
struct net_device *dev;
int ret = 0;
@@ -9143,16 +10006,9 @@ unlock:
up_read(&dev_addr_sem);
return ret;
}
-EXPORT_SYMBOL(dev_get_mac_address);
+EXPORT_SYMBOL_NS_GPL(netif_get_mac_address, "NETDEV_INTERNAL");
-/**
- * dev_change_carrier - Change device carrier
- * @dev: device
- * @new_carrier: new value
- *
- * Change device carrier
- */
-int dev_change_carrier(struct net_device *dev, bool new_carrier)
+int netif_change_carrier(struct net_device *dev, bool new_carrier)
{
const struct net_device_ops *ops = dev->netdev_ops;
@@ -9203,16 +10059,17 @@ int dev_get_phys_port_name(struct net_device *dev,
}
/**
- * dev_get_port_parent_id - Get the device's port parent identifier
- * @dev: network device
- * @ppid: pointer to a storage for the port's parent identifier
- * @recurse: allow/disallow recursion to lower devices
+ * netif_get_port_parent_id() - Get the device's port parent identifier
+ * @dev: network device
+ * @ppid: pointer to a storage for the port's parent identifier
+ * @recurse: allow/disallow recursion to lower devices
*
- * Get the devices's port parent identifier
+ * Get the device's port parent identifier.
+ *
+ * Return: 0 on success, -errno on failure.
*/
-int dev_get_port_parent_id(struct net_device *dev,
- struct netdev_phys_item_id *ppid,
- bool recurse)
+int netif_get_port_parent_id(struct net_device *dev,
+ struct netdev_phys_item_id *ppid, bool recurse)
{
const struct net_device_ops *ops = dev->netdev_ops;
struct netdev_phys_item_id first = { };
@@ -9231,7 +10088,7 @@ int dev_get_port_parent_id(struct net_device *dev,
return err;
netdev_for_each_lower_dev(dev, lower_dev, iter) {
- err = dev_get_port_parent_id(lower_dev, ppid, true);
+ err = netif_get_port_parent_id(lower_dev, ppid, true);
if (err)
break;
if (!first.id_len)
@@ -9242,7 +10099,7 @@ int dev_get_port_parent_id(struct net_device *dev,
return err;
}
-EXPORT_SYMBOL(dev_get_port_parent_id);
+EXPORT_SYMBOL(netif_get_port_parent_id);
/**
* netdev_port_same_parent_id - Indicate if two network devices have
@@ -9255,23 +10112,17 @@ bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
struct netdev_phys_item_id a_id = { };
struct netdev_phys_item_id b_id = { };
- if (dev_get_port_parent_id(a, &a_id, true) ||
- dev_get_port_parent_id(b, &b_id, true))
+ if (netif_get_port_parent_id(a, &a_id, true) ||
+ netif_get_port_parent_id(b, &b_id, true))
return false;
return netdev_phys_item_id_same(&a_id, &b_id);
}
EXPORT_SYMBOL(netdev_port_same_parent_id);
-/**
- * dev_change_proto_down - set carrier according to proto_down.
- *
- * @dev: device
- * @proto_down: new value
- */
-int dev_change_proto_down(struct net_device *dev, bool proto_down)
+int netif_change_proto_down(struct net_device *dev, bool proto_down)
{
- if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN))
+ if (!dev->change_proto_down)
return -EOPNOTSUPP;
if (!netif_device_present(dev))
return -ENODEV;
@@ -9284,14 +10135,14 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)
}
/**
- * dev_change_proto_down_reason - proto down reason
+ * netdev_change_proto_down_reason_locked - proto down reason
*
* @dev: device
* @mask: proto down mask
* @value: proto down value
*/
-void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
- u32 value)
+void netdev_change_proto_down_reason_locked(struct net_device *dev,
+ unsigned long mask, u32 value)
{
u32 proto_down_reason;
int b;
@@ -9368,6 +10219,40 @@ u8 dev_xdp_prog_count(struct net_device *dev)
}
EXPORT_SYMBOL_GPL(dev_xdp_prog_count);
+u8 dev_xdp_sb_prog_count(struct net_device *dev)
+{
+ u8 count = 0;
+ int i;
+
+ for (i = 0; i < __MAX_XDP_MODE; i++)
+ if (dev->xdp_state[i].prog &&
+ !dev->xdp_state[i].prog->aux->xdp_has_frags)
+ count++;
+ return count;
+}
+
+int netif_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf)
+{
+ if (!dev->netdev_ops->ndo_bpf)
+ return -EOPNOTSUPP;
+
+ if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED &&
+ bpf->command == XDP_SETUP_PROG &&
+ bpf->prog && !bpf->prog->aux->xdp_has_frags) {
+ NL_SET_ERR_MSG(bpf->extack,
+ "unable to propagate XDP to device using tcp-data-split");
+ return -EBUSY;
+ }
+
+ if (dev_get_min_mp_channel_count(dev)) {
+ NL_SET_ERR_MSG(bpf->extack, "unable to propagate XDP to device using memory provider");
+ return -EBUSY;
+ }
+
+ return dev->netdev_ops->ndo_bpf(dev, bpf);
+}
+EXPORT_SYMBOL_GPL(netif_xdp_propagate);
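Not part of this diff: netif_xdp_propagate() gives stacked devices a single entry point that re-checks the tcp-data-split and memory-provider constraints before calling the lower device's ndo_bpf(). A hedged sketch of an upper driver forwarding a program to one lower device; demo_propagate_xdp() is an illustrative name.

static int demo_propagate_xdp(struct net_device *lower_dev,
			      struct bpf_prog *prog,
			      struct netlink_ext_ack *extack)
{
	struct netdev_bpf xdp = {
		.command = XDP_SETUP_PROG,
		.extack  = extack,
		.flags   = 0,
		.prog    = prog,
	};

	return netif_xdp_propagate(lower_dev, &xdp);
}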
+
u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
{
struct bpf_prog *prog = dev_xdp_prog(dev, mode);
@@ -9396,6 +10281,19 @@ static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
struct netdev_bpf xdp;
int err;
+ netdev_ops_assert_locked(dev);
+
+ if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED &&
+ prog && !prog->aux->xdp_has_frags) {
+ NL_SET_ERR_MSG(extack, "unable to install XDP to device using tcp-data-split");
+ return -EBUSY;
+ }
+
+ if (dev_get_min_mp_channel_count(dev)) {
+ NL_SET_ERR_MSG(extack, "unable to install XDP to device using memory provider");
+ return -EBUSY;
+ }
+
memset(&xdp, 0, sizeof(xdp));
xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
xdp.extack = extack;
@@ -9544,6 +10442,10 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack
NL_SET_ERR_MSG(extack, "Program bound to different device");
return -EINVAL;
}
+ if (bpf_prog_is_dev_bound(new_prog->aux) && mode == XDP_MODE_SKB) {
+ NL_SET_ERR_MSG(extack, "Can't attach device-bound programs in generic mode");
+ return -EINVAL;
+ }
if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
return -EINVAL;
@@ -9613,7 +10515,9 @@ static void bpf_xdp_link_release(struct bpf_link *link)
* already NULL, in which case link was already auto-detached
*/
if (xdp_link->dev) {
+ netdev_lock_ops(xdp_link->dev);
WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
+ netdev_unlock_ops(xdp_link->dev);
xdp_link->dev = NULL;
}
@@ -9695,10 +10599,12 @@ static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
goto out_unlock;
}
+ netdev_lock_ops(xdp_link->dev);
mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
xdp_link->flags, new_prog);
+ netdev_unlock_ops(xdp_link->dev);
if (err)
goto out_unlock;
@@ -9741,7 +10647,8 @@ int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
goto unlock;
}
- bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog);
+ bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog,
+ attr->link_create.attach_type);
link->dev = dev;
link->flags = attr->link_create.flags;
@@ -9751,7 +10658,9 @@ int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
goto unlock;
}
+ netdev_lock_ops(dev);
err = dev_xdp_attach_link(dev, &extack, link);
+ netdev_unlock_ops(dev);
rtnl_unlock();
if (err) {
@@ -9820,6 +10729,20 @@ err_out:
return err;
}
+u32 dev_get_min_mp_channel_count(const struct net_device *dev)
+{
+ int i;
+
+ netdev_ops_assert_locked(dev);
+
+ for (i = dev->real_num_rx_queues - 1; i >= 0; i--)
+ if (dev->_rx[i].mp_params.mp_priv)
+ /* The channel count is the idx plus 1. */
+ return i + 1;
+
+ return 0;
+}
+
/**
* dev_index_reserve() - allocate an ifindex in a namespace
* @net: the applicable net namespace
@@ -9857,6 +10780,15 @@ static void dev_index_release(struct net *net, int ifindex)
WARN_ON(xa_erase(&net->dev_by_index, ifindex));
}
+static bool from_cleanup_net(void)
+{
+#ifdef CONFIG_NET_NS
+ return current == READ_ONCE(cleanup_net_task);
+#else
+ return false;
+#endif
+}
+
/* Delayed registration/unregisteration */
LIST_HEAD(net_todo_list);
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
@@ -9899,6 +10831,7 @@ static void netdev_sync_lower_features(struct net_device *upper,
if (!(features & feature) && (lower->features & feature)) {
netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
&feature, lower->name);
+ netdev_lock_ops(lower);
lower->wanted_features &= ~feature;
__netdev_update_features(lower);
@@ -9907,10 +10840,20 @@ static void netdev_sync_lower_features(struct net_device *upper,
&feature, lower->name);
else
netdev_features_change(lower);
+ netdev_unlock_ops(lower);
}
}
}
+static bool netdev_has_ip_or_hw_csum(netdev_features_t features)
+{
+ netdev_features_t ip_csum_mask = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
+ bool ip_csum = (features & ip_csum_mask) == ip_csum_mask;
+ bool hw_csum = features & NETIF_F_HW_CSUM;
+
+ return ip_csum || hw_csum;
+}
+
static netdev_features_t netdev_fix_features(struct net_device *dev,
netdev_features_t features)
{
@@ -9992,15 +10935,9 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
features &= ~NETIF_F_LRO;
}
- if (features & NETIF_F_HW_TLS_TX) {
- bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) ==
- (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
- bool hw_csum = features & NETIF_F_HW_CSUM;
-
- if (!ip_csum && !hw_csum) {
- netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n");
- features &= ~NETIF_F_HW_TLS_TX;
- }
+ if ((features & NETIF_F_HW_TLS_TX) && !netdev_has_ip_or_hw_csum(features)) {
+ netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n");
+ features &= ~NETIF_F_HW_TLS_TX;
}
if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) {
@@ -10008,6 +10945,11 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
features &= ~NETIF_F_HW_TLS_RX;
}
+ if ((features & NETIF_F_GSO_UDP_L4) && !netdev_has_ip_or_hw_csum(features)) {
+ netdev_dbg(dev, "Dropping USO feature since no CSUM feature.\n");
+ features &= ~NETIF_F_GSO_UDP_L4;
+ }
+
return features;
}
@@ -10019,6 +10961,7 @@ int __netdev_update_features(struct net_device *dev)
int err = -1;
ASSERT_RTNL();
+ netdev_ops_assert_locked(dev);
features = netdev_get_wanted_features(dev);
@@ -10071,12 +11014,14 @@ sync_lower:
* *before* calling udp_tunnel_get_rx_info,
* but *after* calling udp_tunnel_drop_rx_info.
*/
+ udp_tunnel_nic_lock(dev);
if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
dev->features = features;
udp_tunnel_get_rx_info(dev);
} else {
udp_tunnel_drop_rx_info(dev);
}
+ udp_tunnel_nic_unlock(dev);
}
if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
@@ -10312,6 +11257,17 @@ static void netdev_do_free_pcpu_stats(struct net_device *dev)
}
}
+static void netdev_free_phy_link_topology(struct net_device *dev)
+{
+ struct phy_link_topology *topo = dev->link_topo;
+
+ if (IS_ENABLED(CONFIG_PHYLIB) && topo) {
+ xa_destroy(&topo->phys);
+ kfree(topo);
+ dev->link_topo = NULL;
+ }
+}
+
/**
* register_netdevice() - register a network device
* @dev: device to register
@@ -10434,12 +11390,16 @@ int register_netdevice(struct net_device *dev)
ret = netdev_register_kobject(dev);
+ netdev_lock(dev);
WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED);
+ netdev_unlock(dev);
if (ret)
goto err_uninit_notify;
+ netdev_lock_ops(dev);
__netdev_update_features(dev);
+ netdev_unlock_ops(dev);
/*
* Default initial state at registry is that the
@@ -10465,7 +11425,9 @@ int register_netdevice(struct net_device *dev)
memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
/* Notify protocols, that a new device appeared. */
+ netdev_lock_ops(dev);
ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
+ netdev_unlock_ops(dev);
ret = notifier_to_errno(ret);
if (ret) {
/* Expect explicit free_netdev() on failure */
@@ -10477,8 +11439,7 @@ int register_netdevice(struct net_device *dev)
* Prevent userspace races by waiting until the network
* device is fully setup before sending notifications.
*/
- if (!dev->rtnl_link_ops ||
- dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
+ if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing))
rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
out:
@@ -10502,26 +11463,20 @@ err_free_name:
EXPORT_SYMBOL(register_netdevice);
/* Initialize the core of a dummy net device.
- * This is useful if you are calling this function after alloc_netdev(),
- * since it does not memset the net_device fields.
+ * These are the setup steps dummy netdevs need, which normal netdevs get
+ * by going through register_netdevice().
*/
-static void init_dummy_netdev_core(struct net_device *dev)
+static void init_dummy_netdev(struct net_device *dev)
{
/* make sure we BUG if trying to hit standard
* register/unregister code path
*/
dev->reg_state = NETREG_DUMMY;
- /* NAPI wants this */
- INIT_LIST_HEAD(&dev->napi_list);
-
/* a dummy interface is started by default */
set_bit(__LINK_STATE_PRESENT, &dev->state);
set_bit(__LINK_STATE_START, &dev->state);
- /* napi_busy_loop stats accounting wants this */
- dev_net_set(dev, &init_net);
-
/* Note : We dont allocate pcpu_refcnt for dummy devices,
* because users of this 'device' dont need to change
* its refcount.
@@ -10529,28 +11484,6 @@ static void init_dummy_netdev_core(struct net_device *dev)
}
/**
- * init_dummy_netdev - init a dummy network device for NAPI
- * @dev: device to init
- *
- * This takes a network device structure and initializes the minimum
- * amount of fields so it can be used to schedule NAPI polls without
- * registering a full blown interface. This is to be used by drivers
- * that need to tie several hardware interfaces to a single NAPI
- * poll scheduler due to HW limitations.
- */
-void init_dummy_netdev(struct net_device *dev)
-{
- /* Clear everything. Note we don't initialize spinlocks
- * as they aren't supposed to be taken by any of the
- * NAPI code and this dummy netdev is supposed to be
- * only ever used for NAPI polls
- */
- memset(dev, 0, sizeof(struct net_device));
- init_dummy_netdev_core(dev);
-}
-EXPORT_SYMBOL_GPL(init_dummy_netdev);
-
-/**
* register_netdev - register a network device
* @dev: device to register
*
@@ -10565,12 +11498,16 @@ EXPORT_SYMBOL_GPL(init_dummy_netdev);
*/
int register_netdev(struct net_device *dev)
{
+ struct net *net = dev_net(dev);
int err;
- if (rtnl_lock_killable())
+ if (rtnl_net_lock_killable(net))
return -EINTR;
+
err = register_netdevice(dev);
- rtnl_unlock();
+
+ rtnl_net_unlock(net);
+
return err;
}
EXPORT_SYMBOL(register_netdev);
@@ -10708,9 +11645,8 @@ void netdev_run_todo(void)
list_replace_init(&net_unlink_list, &unlink_list);
while (!list_empty(&unlink_list)) {
- struct net_device *dev = list_first_entry(&unlink_list,
- struct net_device,
- unlink_list);
+ dev = list_first_entry(&unlink_list, struct net_device,
+ unlink_list);
list_del_init(&dev->unlink_list);
dev->nested_level = dev->lower_level - 1;
}
@@ -10732,7 +11668,9 @@ void netdev_run_todo(void)
continue;
}
+ netdev_lock(dev);
WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED);
+ netdev_unlock(dev);
linkwatch_sync_dev(dev);
}
@@ -10859,7 +11797,7 @@ noinline void netdev_core_stats_inc(struct net_device *dev, u32 offset)
return;
}
- field = (__force unsigned long __percpu *)((__force void *)p + offset);
+ field = (unsigned long __percpu *)((void __percpu *)p + offset);
this_cpu_inc(*field);
}
EXPORT_SYMBOL_GPL(netdev_core_stats_inc);
@@ -10880,6 +11818,20 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
const struct net_device_ops *ops = dev->netdev_ops;
const struct net_device_core_stats __percpu *p;
+ /*
+ * IPv{4,6} and UDP tunnels share common stat helpers but use
+ * different stat types (NETDEV_PCPU_STAT_TSTATS vs
+ * NETDEV_PCPU_STAT_DSTATS). Ensure the accounting stays consistent.
+ */
+ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_bytes) !=
+ offsetof(struct pcpu_dstats, rx_bytes));
+ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_packets) !=
+ offsetof(struct pcpu_dstats, rx_packets));
+ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_bytes) !=
+ offsetof(struct pcpu_dstats, tx_bytes));
+ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_packets) !=
+ offsetof(struct pcpu_dstats, tx_packets));
+
if (ops->ndo_get_stats64) {
memset(storage, 0, sizeof(*storage));
ops->ndo_get_stats64(dev, storage);
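
The BUILD_BUG_ON()s above are what allow one helper to write either
layout through a single pointer. A minimal sketch of the idea, with an
illustrative name; the in-tree helpers additionally handle the
u64_stats syncp, which is omitted here.

	static inline void tunnel_stats_rx_add(void *stats, unsigned int len)
	{
		/* offsets asserted identical for pcpu_sw_netstats/pcpu_dstats */
		struct pcpu_sw_netstats *s = stats;

		u64_stats_add(&s->rx_bytes, len);
		u64_stats_inc(&s->rx_packets);
	}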
@@ -11000,8 +11952,8 @@ void netdev_sw_irq_coalesce_default_on(struct net_device *dev)
WARN_ON(dev->reg_state == NETREG_REGISTERED);
if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
- dev->gro_flush_timeout = 20000;
- dev->napi_defer_hard_irqs = 1;
+ netdev_set_gro_flush_timeout(dev, 20000);
+ netdev_set_defer_hard_irqs(dev, 1);
}
}
EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on);
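
A hedged usage sketch: the WARN_ON() above means drivers must opt in
before registration, typically immediately before register_netdev();
foo_init_netdev() is illustrative.

	static int foo_init_netdev(struct net_device *dev)
	{
		/* must run while dev->reg_state != NETREG_REGISTERED */
		netdev_sw_irq_coalesce_default_on(dev);
		return register_netdev(dev);
	}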
@@ -11025,6 +11977,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
unsigned int txqs, unsigned int rxqs)
{
struct net_device *dev;
+ size_t napi_config_sz;
+ unsigned int maxqs;
BUG_ON(strlen(name) >= sizeof(dev->name));
@@ -11038,6 +11992,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
return NULL;
}
+ maxqs = max(txqs, rxqs);
+
dev = kvzalloc(struct_size(dev, priv, sizeof_priv),
GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
if (!dev)
@@ -11045,7 +12001,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
dev->priv_len = sizeof_priv;
- ref_tracker_dir_init(&dev->refcnt_tracker, 128, name);
+ ref_tracker_dir_init(&dev->refcnt_tracker, 128, "netdev");
#ifdef CONFIG_PCPU_DEV_REFCNT
dev->pcpu_refcnt = alloc_percpu(int);
if (!dev->pcpu_refcnt)
@@ -11090,6 +12046,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
#ifdef CONFIG_NET_SCHED
hash_init(dev->qdisc_hash);
#endif
+
+ mutex_init(&dev->lock);
+
dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
setup(dev);
@@ -11111,7 +12070,18 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
if (!dev->ethtool)
goto free_all;
- strcpy(dev->name, name);
+ dev->cfg = kzalloc(sizeof(*dev->cfg), GFP_KERNEL_ACCOUNT);
+ if (!dev->cfg)
+ goto free_all;
+ dev->cfg_pending = dev->cfg;
+
+ dev->num_napi_configs = maxqs;
+ napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config));
+ dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT);
+ if (!dev->napi_config)
+ goto free_all;
+
+ strscpy(dev->name, name);
dev->name_assign_type = name_assign_type;
dev->group = INIT_NETDEV_GROUP;
if (!dev->ethtool_ops)
@@ -11135,6 +12105,22 @@ free_dev:
}
EXPORT_SYMBOL(alloc_netdev_mqs);
+static void netdev_napi_exit(struct net_device *dev)
+{
+ if (!list_empty(&dev->napi_list)) {
+ struct napi_struct *p, *n;
+
+ netdev_lock(dev);
+ list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
+ __netif_napi_del_locked(p);
+ netdev_unlock(dev);
+
+ synchronize_net();
+ }
+
+ kvfree(dev->napi_config);
+}
+
/**
* free_netdev - free network device
* @dev: device
@@ -11146,8 +12132,6 @@ EXPORT_SYMBOL(alloc_netdev_mqs);
*/
void free_netdev(struct net_device *dev)
{
- struct napi_struct *p, *n;
-
might_sleep();
/* When called immediately after register_netdevice() failed the unwind
@@ -11160,6 +12144,8 @@ void free_netdev(struct net_device *dev)
return;
}
+ WARN_ON(dev->cfg != dev->cfg_pending);
+ kfree(dev->cfg);
kfree(dev->ethtool);
netif_free_tx_queues(dev);
netif_free_rx_queues(dev);
@@ -11169,8 +12155,9 @@ void free_netdev(struct net_device *dev)
/* Flush device addresses */
dev_addr_flush(dev);
- list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
- netif_napi_del(p);
+ netdev_napi_exit(dev);
+
+ netif_del_cpu_rmap(dev);
ref_tracker_dir_exit(&dev->refcnt_tracker);
#ifdef CONFIG_PCPU_DEV_REFCNT
@@ -11182,6 +12169,10 @@ void free_netdev(struct net_device *dev)
free_percpu(dev->xdp_bulkq);
dev->xdp_bulkq = NULL;
+ netdev_free_phy_link_topology(dev);
+
+ mutex_destroy(&dev->lock);
+
/* Compatibility with error handling in drivers */
if (dev->reg_state == NETREG_UNINITIALIZED ||
dev->reg_state == NETREG_DUMMY) {
@@ -11206,7 +12197,7 @@ EXPORT_SYMBOL(free_netdev);
struct net_device *alloc_netdev_dummy(int sizeof_priv)
{
return alloc_netdev(sizeof_priv, "dummy#", NET_NAME_UNKNOWN,
- init_dummy_netdev_core);
+ init_dummy_netdev);
}
EXPORT_SYMBOL_GPL(alloc_netdev_dummy);
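
A hedged usage sketch of the dummy-netdev path: a driver that has to
multiplex several hardware channels onto one NAPI instance can host it
on a dummy netdev, and free_netdev() handles NETREG_DUMMY devices
directly. All foo_* names are illustrative.

	static int foo_poll(struct napi_struct *napi, int budget);

	struct foo_adapter {
		struct net_device *napi_dev;
		struct napi_struct napi;
	};

	static int foo_napi_setup(struct foo_adapter *adap)
	{
		adap->napi_dev = alloc_netdev_dummy(0);
		if (!adap->napi_dev)
			return -ENOMEM;

		netif_napi_add(adap->napi_dev, &adap->napi, foo_poll);
		return 0;
	}

	static void foo_napi_teardown(struct foo_adapter *adap)
	{
		netif_napi_del(&adap->napi);
		free_netdev(adap->napi_dev);
	}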
@@ -11219,7 +12210,7 @@ EXPORT_SYMBOL_GPL(alloc_netdev_dummy);
void synchronize_net(void)
{
might_sleep();
- if (rtnl_is_locked())
+ if (from_cleanup_net() || rtnl_is_locked())
synchronize_rcu_expedited();
else
synchronize_rcu();
@@ -11233,21 +12224,8 @@ static void netdev_rss_contexts_free(struct net_device *dev)
mutex_lock(&dev->ethtool->rss_lock);
xa_for_each(&dev->ethtool->rss_ctx, context, ctx) {
- struct ethtool_rxfh_param rxfh;
-
- rxfh.indir = ethtool_rxfh_context_indir(ctx);
- rxfh.key = ethtool_rxfh_context_key(ctx);
- rxfh.hfunc = ctx->hfunc;
- rxfh.input_xfrm = ctx->input_xfrm;
- rxfh.rss_context = context;
- rxfh.rss_delete = true;
-
xa_erase(&dev->ethtool->rss_ctx, context);
- if (dev->ethtool_ops->create_rxfh_context)
- dev->ethtool_ops->remove_rxfh_context(dev, ctx,
- context, NULL);
- else
- dev->ethtool_ops->set_rxfh(dev, &rxfh, NULL);
+ dev->ethtool_ops->remove_rxfh_context(dev, ctx, context, NULL);
kfree(ctx);
}
xa_destroy(&dev->ethtool->rss_ctx);
@@ -11282,6 +12260,48 @@ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
}
EXPORT_SYMBOL(unregister_netdevice_queue);
+static void dev_memory_provider_uninstall(struct net_device *dev)
+{
+ unsigned int i;
+
+ for (i = 0; i < dev->real_num_rx_queues; i++) {
+ struct netdev_rx_queue *rxq = &dev->_rx[i];
+ struct pp_memory_provider_params *p = &rxq->mp_params;
+
+ if (p->mp_ops && p->mp_ops->uninstall)
+ p->mp_ops->uninstall(rxq->mp_params.mp_priv, rxq);
+ }
+}
+
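
For reference, a hedged sketch of the provider-side hook this calls
into; the real devmem hook also releases its dma-buf binding state, and
all foo_* names are illustrative.

	static void foo_mp_uninstall(void *mp_priv, struct netdev_rx_queue *rxq)
	{
		struct foo_binding *binding = mp_priv;

		/* detach the provider from the queue ... */
		rxq->mp_params.mp_ops = NULL;
		rxq->mp_params.mp_priv = NULL;

		/* ... and drop the reference taken at install time */
		foo_binding_put(binding);
	}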
+/* devices must be UP and netdev_lock()'d */
+static void netif_close_many_and_unlock(struct list_head *close_head)
+{
+ struct net_device *dev, *tmp;
+
+ netif_close_many(close_head, false);
+
+ /* ... now unlock them */
+ list_for_each_entry_safe(dev, tmp, close_head, close_list) {
+ netdev_unlock(dev);
+ list_del_init(&dev->close_list);
+ }
+}
+
+static void netif_close_many_and_unlock_cond(struct list_head *close_head)
+{
+#ifdef CONFIG_LOCKDEP
+ /* We can only track up to MAX_LOCK_DEPTH locks per task.
+ *
+ * Reserve half the available slots for additional locks possibly
+ * taken by notifiers and (soft)irqs.
+ */
+ unsigned int limit = MAX_LOCK_DEPTH / 2;
+
+ if (lockdep_depth(current) > limit)
+ netif_close_many_and_unlock(close_head);
+#endif
+}
+
void unregister_netdevice_many_notify(struct list_head *head,
u32 portid, const struct nlmsghdr *nlh)
{
@@ -11312,15 +12332,30 @@ void unregister_netdevice_many_notify(struct list_head *head,
BUG_ON(dev->reg_state != NETREG_REGISTERED);
}
- /* If device is running, close it first. */
- list_for_each_entry(dev, head, unreg_list)
- list_add_tail(&dev->close_list, &close_head);
- dev_close_many(&close_head, true);
+ /* If device is running, close it first. Start with ops locked... */
+ list_for_each_entry(dev, head, unreg_list) {
+ if (!(dev->flags & IFF_UP))
+ continue;
+ if (netdev_need_ops_lock(dev)) {
+ list_add_tail(&dev->close_list, &close_head);
+ netdev_lock(dev);
+ }
+ netif_close_many_and_unlock_cond(&close_head);
+ }
+ netif_close_many_and_unlock(&close_head);
+ /* ... now go over the rest. */
+ list_for_each_entry(dev, head, unreg_list) {
+ if (!netdev_need_ops_lock(dev))
+ list_add_tail(&dev->close_list, &close_head);
+ }
+ netif_close_many(&close_head, true);
list_for_each_entry(dev, head, unreg_list) {
/* And unlink it from device chain. */
unlist_netdevice(dev);
+ netdev_lock(dev);
WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING);
+ netdev_unlock(dev);
}
flush_all_backlogs();
@@ -11330,9 +12365,12 @@ void unregister_netdevice_many_notify(struct list_head *head,
struct sk_buff *skb = NULL;
/* Shutdown queueing discipline. */
+ netdev_lock_ops(dev);
dev_shutdown(dev);
dev_tcx_uninstall(dev);
dev_xdp_uninstall(dev);
+ dev_memory_provider_uninstall(dev);
+ netdev_unlock_ops(dev);
bpf_dev_bound_netdev_unregister(dev);
netdev_offload_xstats_disable_all(dev);
@@ -11342,8 +12380,7 @@ void unregister_netdevice_many_notify(struct list_head *head,
*/
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
- if (!dev->rtnl_link_ops ||
- dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
+ if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing))
skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
GFP_KERNEL, NULL, 0,
portid, nlh);
@@ -11366,6 +12403,8 @@ void unregister_netdevice_many_notify(struct list_head *head,
mutex_destroy(&dev->ethtool->rss_lock);
+ net_shaper_flush_netdev(dev);
+
if (skb)
rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh);
@@ -11398,7 +12437,7 @@ void unregister_netdevice_many_notify(struct list_head *head,
* @head: list of devices
*
* Note: As most callers use a stack allocated list_head,
- * we force a list_del() to make sure stack wont be corrupted later.
+ * we force a list_del() to make sure stack won't be corrupted later.
*/
void unregister_netdevice_many(struct list_head *head)
{
@@ -11419,30 +12458,15 @@ EXPORT_SYMBOL(unregister_netdevice_many);
*/
void unregister_netdev(struct net_device *dev)
{
- rtnl_lock();
+ rtnl_net_dev_lock(dev);
unregister_netdevice(dev);
- rtnl_unlock();
+ rtnl_net_dev_unlock(dev);
}
EXPORT_SYMBOL(unregister_netdev);
-/**
- * __dev_change_net_namespace - move device to different nethost namespace
- * @dev: device
- * @net: network namespace
- * @pat: If not NULL name pattern to try if the current device name
- * is already taken in the destination network namespace.
- * @new_ifindex: If not zero, specifies device index in the target
- * namespace.
- *
- * This function shuts down a device interface and moves it
- * to a new network namespace. On success 0 is returned, on
- * a failure a netagive errno code is returned.
- *
- * Callers must hold the rtnl semaphore.
- */
-
int __dev_change_net_namespace(struct net_device *dev, struct net *net,
- const char *pat, int new_ifindex)
+ const char *pat, int new_ifindex,
+ struct netlink_ext_ack *extack)
{
struct netdev_name_node *name_node;
struct net *net_old = dev_net(dev);
@@ -11453,12 +12477,16 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
/* Don't allow namespace local devices to be moved. */
err = -EINVAL;
- if (dev->features & NETIF_F_NETNS_LOCAL)
+ if (dev->netns_immutable) {
+ NL_SET_ERR_MSG(extack, "The interface netns is immutable");
goto out;
+ }
- /* Ensure the device has been registrered */
- if (dev->reg_state != NETREG_REGISTERED)
+ /* Ensure the device has been registered */
+ if (dev->reg_state != NETREG_REGISTERED) {
+ NL_SET_ERR_MSG(extack, "The interface isn't registered");
goto out;
+ }
/* Get out if there is nothing to do */
err = 0;
@@ -11471,30 +12499,49 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
err = -EEXIST;
if (netdev_name_in_use(net, dev->name)) {
/* We get here if we can't use the current device name */
- if (!pat)
+ if (!pat) {
+ NL_SET_ERR_MSG(extack,
+ "An interface with the same name exists in the target netns");
goto out;
+ }
err = dev_prep_valid_name(net, dev, pat, new_name, EEXIST);
- if (err < 0)
+ if (err < 0) {
+ NL_SET_ERR_MSG_FMT(extack,
+ "Unable to use '%s' for the new interface name in the target netns",
+ pat);
goto out;
+ }
}
/* Check that none of the altnames conflicts. */
err = -EEXIST;
- netdev_for_each_altname(dev, name_node)
- if (netdev_name_in_use(net, name_node->name))
+ netdev_for_each_altname(dev, name_node) {
+ if (netdev_name_in_use(net, name_node->name)) {
+ NL_SET_ERR_MSG_FMT(extack,
+ "An interface with the altname %s exists in the target netns",
+ name_node->name);
goto out;
+ }
+ }
/* Check that new_ifindex isn't used yet. */
if (new_ifindex) {
err = dev_index_reserve(net, new_ifindex);
- if (err < 0)
+ if (err < 0) {
+ NL_SET_ERR_MSG_FMT(extack,
+ "The ifindex %d is not available in the target netns",
+ new_ifindex);
goto out;
+ }
} else {
/* If there is an ifindex conflict assign a new one */
err = dev_index_reserve(net, dev->ifindex);
if (err == -EBUSY)
err = dev_index_reserve(net, 0);
- if (err < 0)
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack,
+ "Unable to allocate a new ifindex in the target netns");
goto out;
+ }
new_ifindex = err;
}
@@ -11502,16 +12549,23 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
* And now a mini version of register_netdevice/unregister_netdevice.
*/
+ netdev_lock_ops(dev);
/* If device is running close it first. */
- dev_close(dev);
-
+ netif_close(dev);
/* And unlink it from device chain */
unlist_netdevice(dev);
+ if (!netdev_need_ops_lock(dev))
+ netdev_lock(dev);
+ dev->moving_ns = true;
+ netdev_unlock(dev);
+
synchronize_net();
/* Shutdown queueing discipline. */
+ netdev_lock_ops(dev);
dev_shutdown(dev);
+ netdev_unlock_ops(dev);
/* Notify protocols, that we are about to destroy
* this device. They should clean all the things.
@@ -11542,7 +12596,9 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
move_netdevice_notifiers_dev_net(dev, net);
/* Actually switch the network namespace */
+ netdev_lock(dev);
dev_net_set(dev, net);
+ netdev_unlock(dev);
dev->ifindex = new_ifindex;
if (new_name[0]) {
@@ -11568,11 +12624,16 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
err = netdev_change_owner(dev, net_old, net);
WARN_ON(err);
+ netdev_lock(dev);
+ dev->moving_ns = false;
+ if (!netdev_need_ops_lock(dev))
+ netdev_unlock(dev);
+
/* Add the device back in the hashes */
list_netdevice(dev);
-
/* Notify protocols, that a new device appeared. */
call_netdevice_notifiers(NETDEV_REGISTER, dev);
+ netdev_unlock_ops(dev);
/*
* Prevent userspace races by waiting until the network
@@ -11585,7 +12646,6 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
out:
return err;
}
-EXPORT_SYMBOL_GPL(__dev_change_net_namespace);
static int dev_cpu_dead(unsigned int oldcpu)
{
@@ -11683,6 +12743,94 @@ netdev_features_t netdev_increment_features(netdev_features_t all,
}
EXPORT_SYMBOL(netdev_increment_features);
+/**
+ * netdev_compute_master_upper_features - compute features from lower devices
+ * @dev: the upper device
+ * @update_header: whether to update the upper device's header_len/headroom/tailroom
+ *
+ * Recompute the upper device's features based on all of its lower devices.
+ */
+void netdev_compute_master_upper_features(struct net_device *dev, bool update_header)
+{
+ unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
+ netdev_features_t gso_partial_features = MASTER_UPPER_DEV_GSO_PARTIAL_FEATURES;
+ netdev_features_t xfrm_features = MASTER_UPPER_DEV_XFRM_FEATURES;
+ netdev_features_t mpls_features = MASTER_UPPER_DEV_MPLS_FEATURES;
+ netdev_features_t vlan_features = MASTER_UPPER_DEV_VLAN_FEATURES;
+ netdev_features_t enc_features = MASTER_UPPER_DEV_ENC_FEATURES;
+ unsigned short max_header_len = ETH_HLEN;
+ unsigned int tso_max_size = TSO_MAX_SIZE;
+ unsigned short max_headroom = 0;
+ unsigned short max_tailroom = 0;
+ u16 tso_max_segs = TSO_MAX_SEGS;
+ struct net_device *lower_dev;
+ struct list_head *iter;
+
+ mpls_features = netdev_base_features(mpls_features);
+ vlan_features = netdev_base_features(vlan_features);
+ enc_features = netdev_base_features(enc_features);
+
+ netdev_for_each_lower_dev(dev, lower_dev, iter) {
+ gso_partial_features = netdev_increment_features(gso_partial_features,
+ lower_dev->gso_partial_features,
+ MASTER_UPPER_DEV_GSO_PARTIAL_FEATURES);
+
+ vlan_features = netdev_increment_features(vlan_features,
+ lower_dev->vlan_features,
+ MASTER_UPPER_DEV_VLAN_FEATURES);
+
+ enc_features = netdev_increment_features(enc_features,
+ lower_dev->hw_enc_features,
+ MASTER_UPPER_DEV_ENC_FEATURES);
+
+ if (IS_ENABLED(CONFIG_XFRM_OFFLOAD))
+ xfrm_features = netdev_increment_features(xfrm_features,
+ lower_dev->hw_enc_features,
+ MASTER_UPPER_DEV_XFRM_FEATURES);
+
+ mpls_features = netdev_increment_features(mpls_features,
+ lower_dev->mpls_features,
+ MASTER_UPPER_DEV_MPLS_FEATURES);
+
+ dst_release_flag &= lower_dev->priv_flags;
+
+ if (update_header) {
+ max_header_len = max(max_header_len, lower_dev->hard_header_len);
+ max_headroom = max(max_headroom, lower_dev->needed_headroom);
+ max_tailroom = max(max_tailroom, lower_dev->needed_tailroom);
+ }
+
+ tso_max_size = min(tso_max_size, lower_dev->tso_max_size);
+ tso_max_segs = min(tso_max_segs, lower_dev->tso_max_segs);
+ }
+
+ dev->gso_partial_features = gso_partial_features;
+ dev->vlan_features = vlan_features;
+ dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL |
+ NETIF_F_HW_VLAN_CTAG_TX |
+ NETIF_F_HW_VLAN_STAG_TX;
+ if (IS_ENABLED(CONFIG_XFRM_OFFLOAD))
+ dev->hw_enc_features |= xfrm_features;
+ dev->mpls_features = mpls_features;
+
+ dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+ if ((dev->priv_flags & IFF_XMIT_DST_RELEASE_PERM) &&
+ dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM))
+ dev->priv_flags |= IFF_XMIT_DST_RELEASE;
+
+ if (update_header) {
+ dev->hard_header_len = max_header_len;
+ dev->needed_headroom = max_headroom;
+ dev->needed_tailroom = max_tailroom;
+ }
+
+ netif_set_tso_max_segs(dev, tso_max_segs);
+ netif_set_tso_max_size(dev, tso_max_size);
+
+ netdev_change_features(dev);
+}
+EXPORT_SYMBOL(netdev_compute_master_upper_features);
+
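
A hedged sketch of the intended caller: an aggregating (bond/team-style)
driver recomputing its advertised features whenever a lower device is
linked or unlinked; foo_master_refresh() is illustrative.

	static void foo_master_refresh(struct net_device *master)
	{
		ASSERT_RTNL();

		/* true: also recompute header_len, headroom and tailroom */
		netdev_compute_master_upper_features(master, true);
	}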
static struct hlist_head * __net_init netdev_create_hash(void)
{
int i;
@@ -11700,7 +12848,7 @@ static struct hlist_head * __net_init netdev_create_hash(void)
static int __net_init netdev_init(struct net *net)
{
BUILD_BUG_ON(GRO_HASH_BUCKETS >
- 8 * sizeof_field(struct napi_struct, gro_bitmask));
+ BITS_PER_BYTE * sizeof_field(struct gro_node, bitmask));
INIT_LIST_HEAD(&net->dev_base_head);
@@ -11835,7 +12983,7 @@ static void __net_exit default_device_exit_net(struct net *net)
char fb_name[IFNAMSIZ];
/* Ignore unmoveable devices (i.e. loopback) */
- if (dev->features & NETIF_F_NETNS_LOCAL)
+ if (dev->netns_immutable)
continue;
/* Leave virtual devices for the generic cleanup */
@@ -11896,7 +13044,7 @@ static struct pernet_operations __net_initdata default_device_ops = {
static void __init net_dev_struct_check(void)
{
/* TX read-mostly hotpath */
- CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags_fast);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx);
@@ -11934,8 +13082,6 @@ static void __init net_dev_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx);
- CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_flush_timeout);
- CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, napi_defer_hard_irqs);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler);
@@ -11947,7 +13093,7 @@ static void __init net_dev_struct_check(void)
#ifdef CONFIG_NET_XGRESS
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress);
#endif
- CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 104);
+ CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 92);
}
/*
@@ -11969,12 +13115,19 @@ static int net_page_pool_create(int cpuid)
.nid = cpu_to_mem(cpuid),
};
struct page_pool *pp_ptr;
+ int err;
pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid);
if (IS_ERR(pp_ptr))
return -ENOMEM;
- per_cpu(system_page_pool, cpuid) = pp_ptr;
+ err = xdp_reg_page_pool(pp_ptr);
+ if (err) {
+ page_pool_destroy(pp_ptr);
+ return err;
+ }
+
+ per_cpu(system_page_pool.pool, cpuid) = pp_ptr;
#endif
return 0;
}
@@ -11991,7 +13144,7 @@ static void run_backlog_napi(unsigned int cpu)
{
struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
- napi_threaded_poll_loop(&sd->backlog);
+ napi_threaded_poll_loop(&sd->backlog, false);
}
static void backlog_napi_setup(unsigned int cpu)
@@ -12039,12 +13192,13 @@ static int __init net_dev_init(void)
* Initialise the packet receive queues.
*/
+ flush_backlogs_fallback = flush_backlogs_alloc();
+ if (!flush_backlogs_fallback)
+ goto out;
+
for_each_possible_cpu(i) {
- struct work_struct *flush = per_cpu_ptr(&flush_works, i);
struct softnet_data *sd = &per_cpu(softnet_data, i);
- INIT_WORK(flush, flush_backlog);
-
skb_queue_head_init(&sd->input_pkt_queue);
skb_queue_head_init(&sd->process_queue);
#ifdef CONFIG_XFRM_OFFLOAD
@@ -12057,9 +13211,8 @@ static int __init net_dev_init(void)
sd->cpu = i;
#endif
INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
- spin_lock_init(&sd->defer_lock);
- init_gro_hash(&sd->backlog);
+ gro_init(&sd->backlog.gro);
sd->backlog.poll = process_backlog;
sd->backlog.weight = weight_p;
INIT_LIST_HEAD(&sd->backlog.poll_list);
@@ -12067,6 +13220,11 @@ static int __init net_dev_init(void)
if (net_page_pool_create(i))
goto out;
}
+ net_hotdata.skb_defer_nodes =
+ __alloc_percpu(sizeof(struct skb_defer_node) * nr_node_ids,
+ __alignof__(struct skb_defer_node));
+ if (!net_hotdata.skb_defer_nodes)
+ goto out;
if (use_backlog_threads())
smpboot_register_percpu_thread(&backlog_threads);
@@ -12103,12 +13261,13 @@ out:
for_each_possible_cpu(i) {
struct page_pool *pp_ptr;
- pp_ptr = per_cpu(system_page_pool, i);
+ pp_ptr = per_cpu(system_page_pool.pool, i);
if (!pp_ptr)
continue;
+ xdp_unreg_page_pool(pp_ptr);
page_pool_destroy(pp_ptr);
- per_cpu(system_page_pool, i) = NULL;
+ per_cpu(system_page_pool.pool, i) = NULL;
}
}